From ad019b600d3f8600d2ae5c0d8ab548b30c7d1882 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@ais.uni-bonn.de>
Date: Sun, 18 Dec 2011 00:13:03 +0100
Subject: [PATCH] DOC: Corrections and additions to the dataset docs. Also more
 detailed docstrings for the functions loading the data.

---
 doc/datasets/index.rst   | 10 ++++++----
 sklearn/datasets/base.py | 41 ++++++++++++++++++++++++++++++++--------
 2 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst
index f44ce2de9c..8f2a973305 100644
--- a/doc/datasets/index.rst
+++ b/doc/datasets/index.rst
@@ -28,7 +28,8 @@ that comes from the 'real world'.
 
 General dataset API
 ===================
-There are three distinct kinds of dataset interfaces used at the moment.
+There are three distinct kinds of dataset interfaces for different types
+of datasets.
 The simplest one is the interface for sample images, which is described
 below in the :ref: _Sample_images section.
 
@@ -41,9 +42,10 @@ fetched from mldata.org have more sophisticated structure.
 These functions return a ``bunch`` (which is a dictionary that is
 accessible with the 'dict.key' syntax).
 All datasets have at least two keys, ``data``, containg an array of shape
-``n_samples x n_features`` and ``target``, a numpy array of length ``n_features``,
-containing the targets.
-The datasets also contain a description in ``DESC`` and some contain
+``n_samples x n_features`` (except for 20newsgroups) and ``target``, a numpy
+array of length ``n_samples``, containing the targets.
+
+The datasets also contain a description in ``DESCR`` and some contain
 ``feature_names`` and ``target_names``.
 See the dataset descriptions below for details.
 
diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index bfb3674cba..2e485027c8 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -180,13 +180,21 @@ def load_files(container_path, description=None, categories=None,
 
 def load_iris():
     """Load and return the iris dataset (classification).
+    Classes: 3
+    Samples per class: 50
+    Samples total: 150
+    Dimensionality: 4
+    Features: real, positive
+
+    The iris dataset is a classic and very easy multi-class classification dataset.
 
     Return
     ------
     data : Bunch
         Dictionary-like object, the interesting attributes are:
         'data', the data to learn, 'target', the classification labels,
-        'target_names', the meaning of the labels, and 'DESCR', the
+        'target_names', the meaning of the labels, 'feature_names', the
+        meaning of the features, and 'DESCR', the
         full description of the dataset.
 
     Examples
@@ -224,6 +232,13 @@ def load_iris():
 
 def load_digits(n_class=10):
     """Load and return the digits dataset (classification).
+    Classes: 10
+    Samples per class: ~180
+    Samples total: 1797
+    Dimensionality: 64
+    Features: integers 0-16
+
+    Each datapoint is an 8x8 image of a digit.
 
     Parameters
     ----------
@@ -234,7 +249,7 @@ def load_digits(n_class=10):
     ------
     data : Bunch
         Dictionary-like object, the interesting attributes are:
-        'data', the data to learn, `images`, the images corresponding
+        'data', the data to learn, 'images', the images corresponding
         to each sample, 'target', the classification labels for each
         sample, 'target_names', the meaning of the labels, and 'DESCR',
         the full description of the dataset.
@@ -276,11 +291,16 @@ def load_digits(n_class=10):
 def load_diabetes():
     """Load and return the diabetes dataset (regression).
 
+    Samples total: 442
+    Dimensionality: 10
+    Features: real, -.2 < x < .2
+    Targets: integer 25 - 346
+
     Return
     ------
     data : Bunch
         Dictionary-like object, the interesting attributes are:
-        'data', the data to learn and 'target', the labels for each
+        'data', the data to learn and 'target', the regression target for each
         sample.
     """
     base_dir = join(dirname(__file__), 'data')
@@ -322,11 +342,16 @@ def load_linnerud():
 def load_boston():
     """Load and return the boston house-prices dataset (regression).
 
+    Samples total: 506
+    Dimensionality: 13
+    Features: real, positive
+    Targets: real 5. - 50.
+
     Return
     ------
     data : Bunch
         Dictionary-like object, the interesting attributes are:
-        'data', the data to learn, 'target', the classification labels,
+        'data', the data to learn, 'target', the regression targets,
         'target_names', the meaning of the labels, and 'DESCR', the
         full description of the dataset.
 
@@ -361,14 +386,14 @@ def load_boston():
 
 def load_sample_images():
     """Load sample images for image manipulation.
+    Loads both ``china`` and ``flower``.
 
     Return
     ------
     data : Bunch
-        Dictionary-like object, the interesting attributes are:
-        'data', the data to learn, `images`, the images corresponding
-        to each sample, 'target', the classification labels for each
-        sample, 'target_names', the meaning of the labels, and 'DESCR',
+        Dictionary-like object with the following attributes:
+        'images', the two sample images, 'filenames', the file
+        names for the images, and 'DESCR',
         the full description of the dataset.
 
     Examples
-- 
GitLab