From f1e5c48691b87f8c36acab079f07efeaaa37ac70 Mon Sep 17 00:00:00 2001
From: Fabian Pedregosa <fabian.pedregosa@inria.fr>
Date: Mon, 1 Mar 2010 14:30:48 +0000
Subject: [PATCH] Datasets refactoring.

Stop returning dictionaries from the dataset loaders; return plain
NumPy arrays instead. We still have to define a common API for all
datasets, but in the meantime this allows us to write some clean
examples that will hopefully lead to a cleaner dataset implementation.

From: Fabian Pedregosa <fabian.pedregosa@inria.fr>

git-svn-id: https://scikit-learn.svn.sourceforge.net/svnroot/scikit-learn/trunk@412 22fbfee3-77ab-4535-9bad-27d1bd3bc7d8
---
 scikits/learn/datasets/faithful/__init__.py |  1 -
 scikits/learn/datasets/faithful/data.py     | 17 +++++----
 scikits/learn/datasets/iris/data.py         | 40 +++++++--------------
 scikits/learn/datasets/iris/iris.py         |  2 +-
 4 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/scikits/learn/datasets/faithful/__init__.py b/scikits/learn/datasets/faithful/__init__.py
index 429a66aac7..58177e96c0 100644
--- a/scikits/learn/datasets/faithful/__init__.py
+++ b/scikits/learn/datasets/faithful/__init__.py
@@ -6,4 +6,3 @@ copyright   = _faith.COPYRIGHT
 source      = _faith.SOURCE
 
 load        = _faith.load
-load_data   = _faith.load_data
diff --git a/scikits/learn/datasets/faithful/data.py b/scikits/learn/datasets/faithful/data.py
index 77aed611eb..4da277cf23 100644
--- a/scikits/learn/datasets/faithful/data.py
+++ b/scikits/learn/datasets/faithful/data.py
@@ -27,24 +27,27 @@ DESCRLONG   = """According to Azzalini and Bowman's article, those data
 were recorded continuously from 1th August to 15th August 1985.
 
 Some of the durations times are labelled as L, M or S (Large, Small, Medium).
-According to Azzalini and Bowman's paper: "because the unbroken sequence
+According to Azzalini and Bowman's paper: 'because the unbroken sequence
 required measurements to be taken at night, some duration times are recorded as
 L (long), S (short) and M (medium). Other data sets do not contain a con-
-tinuous stream of data, making it difficult to deal with time series features."
+tinuous stream of data, making it difficult to deal with time series features'
 """
 
-NOTE        = """Eruptions time in minutes, waiting time to next eruption in
+NOTE = """Eruptions time in minutes, waiting time to next eruption in
 minutes"""
 
 import numpy as np
 
-
 def load():
     """load the actual data and returns them.
     
     :returns:
-        data: recordarray
-            a record array of the data.
+        waiting: array
+            waiting time until next eruption, in minutes
+        duration: array
+            duration of eruption, in minutes
     """
     from faithful import waiting, duration
-    return {'data': np.array(zip(waiting, duration))}
+    waiting = np.array(waiting, np.float)
+    duration = np.array(duration, np.float)
+    return waiting, duration
diff --git a/scikits/learn/datasets/iris/data.py b/scikits/learn/datasets/iris/data.py
index ce470280ed..444a9ecbc7 100644
--- a/scikits/learn/datasets/iris/data.py
+++ b/scikits/learn/datasets/iris/data.py
@@ -94,6 +94,9 @@ Missing Attribute Values: None
 Class Distribution: 33.3% for each of 3 classes.
 """
 
+import numpy as np
+
+
 def load():
     """load the iris data and returns them.
     
@@ -111,32 +114,15 @@ def load():
     know their class name.
 
     >>>> d = load()
-    >>>> ind = [10, 25, 50]
-    >>>> lind = d['label'][ind] # returns the label index of each sample
-    >>>> d['class'][lind] # returns the class name of each sample
+    # >>>> ind = [10, 25, 50]
+    # >>>> lind = d['label'][ind] # returns the label index of each sample
+    # >>>> d['class'][lind] # returns the class name of each sample
 
     """
-    import numpy
-    from iris import SL, SW, PL, PW, LABELS, LI2LN
-    PW = numpy.array(PW).astype(numpy.float)
-    PL = numpy.array(PL).astype(numpy.float)
-    SW = numpy.array(SW).astype(numpy.float)
-    SL = numpy.array(SL).astype(numpy.float)
-    data    = {}
-    data['data'] = numpy.empty(len(PW), 
-                               [('petal width', numpy.int),
-                                ('petal length', numpy.int),
-                                ('sepal width', numpy.int),
-                                ('sepal length', numpy.int)])
-
-    data['data']['petal width'] = numpy.round(PW * 10)
-    data['data']['petal length'] = numpy.round(PL * 10)
-    data['data']['sepal width'] = numpy.round(SW * 10)
-    data['data']['sepal length'] = numpy.round(SL * 10)
-    data['label'] = numpy.array(LABELS).astype(numpy.int)
-    data['class'] = numpy.empty(len(LI2LN), 
-                                'S%d' % numpy.max([len(i) for i in LI2LN.values()]))
-    for i,c in LI2LN.items():
-        data['class'][i] = c
-    
-    return data
+    from iris import SL, SW, PL, PW, labels, LI2LN
+    PW     = np.array(PW, dtype=np.float)
+    PL     = np.array(PL, dtype=np.float)
+    SW     = np.array(SW, dtype=np.float)
+    SL     = np.array(SL, dtype=np.float)
+    labels = np.array(labels, dtype=np.int)
+    return PW, PL, SW, SL, labels
diff --git a/scikits/learn/datasets/iris/iris.py b/scikits/learn/datasets/iris/iris.py
index 94da7409f7..13ff411eac 100644
--- a/scikits/learn/datasets/iris/iris.py
+++ b/scikits/learn/datasets/iris/iris.py
@@ -60,7 +60,7 @@ PW = ['0.2', '0.2', '0.2', '0.2', '0.2', '0.4', '0.3', '0.2', '0.2', '0.1',
 '2.0', '2.2', '1.5', '1.4', '2.3', '2.4', '1.8', '1.8', '2.1', '2.4', '2.3',
 '1.9', '2.3', '2.5', '2.3', '1.9', '2.0', '2.3', '1.8']
 
-LABELS = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+labels = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
-- 
GitLab