From ad019b600d3f8600d2ae5c0d8ab548b30c7d1882 Mon Sep 17 00:00:00 2001 From: Andreas Mueller <amueller@ais.uni-bonn.de> Date: Sun, 18 Dec 2011 00:13:03 +0100 Subject: [PATCH] DOC: Corrections and additions to the dataset docs. Also more detailed docstrings for the functions loading the data. --- doc/datasets/index.rst | 10 ++++++---- sklearn/datasets/base.py | 41 ++++++++++++++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index f44ce2de9c..8f2a973305 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -28,7 +28,8 @@ that comes from the 'real world'. General dataset API =================== -There are three distinct kinds of dataset interfaces used at the moment. +There are three distinct kinds of dataset interfaces for different types +of datasets. The simplest one is the interface for sample images, which is described below in the :ref: _Sample_images section. @@ -41,9 +42,10 @@ fetched from mldata.org have more sophisticated structure. These functions return a ``bunch`` (which is a dictionary that is accessible with the 'dict.key' syntax). All datasets have at least two keys, ``data``, containg an array of shape -``n_samples x n_features`` and ``target``, a numpy array of length ``n_features``, -containing the targets. -The datasets also contain a description in ``DESC`` and some contain +``n_samples x n_features`` (except for 20newsgroups) and ``target``, a numpy +array of length ``n_samples``, containing the targets. + +The datasets also contain a description in ``DESCR`` and some contain ``feature_names`` and ``target_names``. See the dataset descriptions below for details. 
diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index bfb3674cba..2e485027c8 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -180,13 +180,21 @@ def load_files(container_path, description=None, categories=None, def load_iris(): """Load and return the iris dataset (classification). + Classes: 3 + Samples per class: 50 + Samples total: 150 + Dimensionality: 4 + Features: real, positive + + The iris dataset is a classic and very easy multi-class classification dataset. Return ------ data : Bunch Dictionary-like object, the interesting attributes are: 'data', the data to learn, 'target', the classification labels, - 'target_names', the meaning of the labels, and 'DESCR', the + 'target_names', the meaning of the labels, 'feature_names', the + meaning of the features, and 'DESCR', the full description of the dataset. Examples @@ -224,6 +232,13 @@ def load_iris(): def load_digits(n_class=10): """Load and return the digits dataset (classification). + Classes: 10 + Samples per class: ~180 + Samples total: 1797 + Dimensionality: 64 + Features: integers 0-16 + + Each datapoint is an 8x8 image of a digit. Parameters ---------- @@ -234,7 +249,7 @@ def load_digits(n_class=10): ------ data : Bunch Dictionary-like object, the interesting attributes are: - 'data', the data to learn, `images`, the images corresponding + 'data', the data to learn, 'images', the images corresponding to each sample, 'target', the classification labels for each sample, 'target_names', the meaning of the labels, and 'DESCR', the full description of the dataset. @@ -276,11 +291,16 @@ def load_digits(n_class=10): def load_diabetes(): """Load and return the diabetes dataset (regression). 
+ Samples total: 442 + Dimensionality: 10 + Features: real, -.2 < x < .2 + Targets: integer 25 - 346 + Return ------ data : Bunch Dictionary-like object, the interesting attributes are: - 'data', the data to learn and 'target', the labels for each + 'data', the data to learn and 'target', the regression target for each sample. """ base_dir = join(dirname(__file__), 'data') @@ -322,11 +342,16 @@ def load_linnerud(): def load_boston(): """Load and return the boston house-prices dataset (regression). + Samples total: 506 + Dimensionality: 13 + Features: real, positive + Targets: real 5. - 50. + Return ------ data : Bunch Dictionary-like object, the interesting attributes are: - 'data', the data to learn, 'target', the classification labels, + 'data', the data to learn, 'target', the regression targets, 'target_names', the meaning of the labels, and 'DESCR', the full description of the dataset. @@ -361,14 +386,14 @@ def load_boston(): def load_sample_images(): """Load sample images for image manipulation. + Loads both, ``china`` and ``flower``. Return ------ data : Bunch - Dictionary-like object, the interesting attributes are: - 'data', the data to learn, `images`, the images corresponding - to each sample, 'target', the classification labels for each - sample, 'target_names', the meaning of the labels, and 'DESCR', + Dictionary-like object with the following attributes : + 'images', the two sample images, 'filenames', the file + names for the images, and 'DESCR' the full description of the dataset. Examples -- GitLab