"""
Base object for all datasets
"""

# Copyright (c) 2007 David Cournapeau <cournape@gmail.com>
#               2010 Fabian Pedregosa <fabian.pedregosa@inria.fr>
#

import csv
import os

import numpy as np


class Bunch(dict):
    """
    Container for a dataset: a dict whose keys are also attributes.

    Every keyword argument given to the constructor is stored as a
    dictionary item and, because the instance is used as its own
    ``__dict__``, is readable and writable as an attribute as well
    (``bunch.data`` and ``bunch['data']`` are the same object).

    Members set by :func:`load`
    ---------------------------
    - data : float array of shape (nsamples, nfeat) with the actual data
    - label : label[i] = label index of data[i]
    - labelnames : labelnames[k] is the string corresponding to label index k
    - DESCR : free-text description of the dataset
    """

    def __init__(self, **kwargs):
        dict.__init__(self, kwargs)
        # Share storage between item access and attribute access.
        self.__dict__ = self


def load(dataset):
    """Load a dataset shipped with the package and return it.

    The data are read from ``data/<dataset>.csv`` and the description from
    ``descr/<dataset>.rst``, both located next to this module.  The first
    CSV row is a header ``nsamples, nfeat, labelname_0, labelname_1, ...``;
    every following row holds ``nfeat`` feature values followed by the
    integer label index of the sample.

    Parameters
    ----------
    dataset : str
        Base name of the dataset files, e.g. ``'iris'``.

    Returns
    -------
    data : Bunch
        See docstring of Bunch for a complete description of its members.

    Raises
    ------
    IOError
        If one of the two dataset files cannot be opened.
    ValueError
        If the header is malformed, the number of rows does not match the
        header, or a row does not hold ``nfeat`` features plus one label.

    Available datasets
      - iris

    Example
    -------
    >>> data = load('iris')
    >>> print(data.label) #doctest: +ELLIPSIS
    [ 0.  0. ...]
    """
    # abspath guards against an empty dirname, which previously turned the
    # file names into paths under the filesystem root.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(module_dir, 'data', '%s.csv' % dataset)
    descr_path = os.path.join(module_dir, 'descr', '%s.rst' % dataset)

    # Context managers make sure both files are closed, even on error.
    with open(data_path) as data_file:
        rows = [row for row in csv.reader(data_file) if row]
    with open(descr_path) as descr_file:
        descr = descr_file.read()

    if not rows or len(rows[0]) < 2:
        raise ValueError("%s: missing 'nsamples, nfeat, ...' header row"
                         % data_path)
    header, samples = rows[0], rows[1:]
    nsamples = int(header[0])
    nfeat = int(header[1])
    # The header is written as "150, 4, setosa, ..."; drop the blanks that
    # follow the commas so that names come back as 'setosa', not ' setosa'.
    labelnames = [name.strip() for name in header[2:]]

    if len(samples) != nsamples:
        raise ValueError("%s: header announces %d samples but %d rows found"
                         % (data_path, nsamples, len(samples)))

    data = np.empty((nsamples, nfeat))
    # Labels are kept in a float array, as before, for backward compatibility.
    label = np.empty((nsamples,))
    for i, row in enumerate(samples):
        if len(row) != nfeat + 1:
            raise ValueError("%s: sample %d has %d fields, expected %d"
                             % (data_path, i, len(row), nfeat + 1))
        data[i] = [float(value) for value in row[:-1]]
        label[i] = int(row[-1])
    return Bunch(data=data, label=label, labelnames=labelnames, DESCR=descr)
+6.6,2.9,4.6,1.3,1 +5.2,2.7,3.9,1.4,1 +5.0,2.0,3.5,1.0,1 +5.9,3.0,4.2,1.5,1 +6.0,2.2,4.0,1.0,1 +6.1,2.9,4.7,1.4,1 +5.6,2.9,3.6,1.3,1 +6.7,3.1,4.4,1.4,1 +5.6,3.0,4.5,1.5,1 +5.8,2.7,4.1,1.0,1 +6.2,2.2,4.5,1.5,1 +5.6,2.5,3.9,1.1,1 +5.9,3.2,4.8,1.8,1 +6.1,2.8,4.0,1.3,1 +6.3,2.5,4.9,1.5,1 +6.1,2.8,4.7,1.2,1 +6.4,2.9,4.3,1.3,1 +6.6,3.0,4.4,1.4,1 +6.8,2.8,4.8,1.4,1 +6.7,3.0,5.0,1.7,1 +6.0,2.9,4.5,1.5,1 +5.7,2.6,3.5,1.0,1 +5.5,2.4,3.8,1.1,1 +5.5,2.4,3.7,1.0,1 +5.8,2.7,3.9,1.2,1 +6.0,2.7,5.1,1.6,1 +5.4,3.0,4.5,1.5,1 +6.0,3.4,4.5,1.6,1 +6.7,3.1,4.7,1.5,1 +6.3,2.3,4.4,1.3,1 +5.6,3.0,4.1,1.3,1 +5.5,2.5,4.0,1.3,1 +5.5,2.6,4.4,1.2,1 +6.1,3.0,4.6,1.4,1 +5.8,2.6,4.0,1.2,1 +5.0,2.3,3.3,1.0,1 +5.6,2.7,4.2,1.3,1 +5.7,3.0,4.2,1.2,1 +5.7,2.9,4.2,1.3,1 +6.2,2.9,4.3,1.3,1 +5.1,2.5,3.0,1.1,1 +5.7,2.8,4.1,1.3,1 +6.3,3.3,6.0,2.5,2 +5.8,2.7,5.1,1.9,2 +7.1,3.0,5.9,2.1,2 +6.3,2.9,5.6,1.8,2 +6.5,3.0,5.8,2.2,2 +7.6,3.0,6.6,2.1,2 +4.9,2.5,4.5,1.7,2 +7.3,2.9,6.3,1.8,2 +6.7,2.5,5.8,1.8,2 +7.2,3.6,6.1,2.5,2 +6.5,3.2,5.1,2.0,2 +6.4,2.7,5.3,1.9,2 +6.8,3.0,5.5,2.1,2 +5.7,2.5,5.0,2.0,2 +5.8,2.8,5.1,2.4,2 +6.4,3.2,5.3,2.3,2 +6.5,3.0,5.5,1.8,2 +7.7,3.8,6.7,2.2,2 +7.7,2.6,6.9,2.3,2 +6.0,2.2,5.0,1.5,2 +6.9,3.2,5.7,2.3,2 +5.6,2.8,4.9,2.0,2 +7.7,2.8,6.7,2.0,2 +6.3,2.7,4.9,1.8,2 +6.7,3.3,5.7,2.1,2 +7.2,3.2,6.0,1.8,2 +6.2,2.8,4.8,1.8,2 +6.1,3.0,4.9,1.8,2 +6.4,2.8,5.6,2.1,2 +7.2,3.0,5.8,1.6,2 +7.4,2.8,6.1,1.9,2 +7.9,3.8,6.4,2.0,2 +6.4,2.8,5.6,2.2,2 +6.3,2.8,5.1,1.5,2 +6.1,2.6,5.6,1.4,2 +7.7,3.0,6.1,2.3,2 +6.3,3.4,5.6,2.4,2 +6.4,3.1,5.5,1.8,2 +6.0,3.0,4.8,1.8,2 +6.9,3.1,5.4,2.1,2 +6.7,3.1,5.6,2.4,2 +6.9,3.1,5.1,2.3,2 +5.8,2.7,5.1,1.9,2 +6.8,3.2,5.9,2.3,2 +6.7,3.3,5.7,2.5,2 +6.7,3.0,5.2,2.3,2 +6.3,2.5,5.0,1.9,2 +6.5,3.0,5.2,2.0,2 +6.2,3.4,5.4,2.3,2 +5.9,3.0,5.1,1.8,2 diff --git a/scikits/learn/datasets/descr/iris.rst b/scikits/learn/datasets/descr/iris.rst new file mode 100644 index 0000000000000000000000000000000000000000..62a2d8e48404beb4da452a71bd95a173d69dd6ee --- /dev/null +++ 
b/scikits/learn/datasets/descr/iris.rst @@ -0,0 +1,69 @@ +Iris Plants Database + +Source +------ +Creator: R.A. Fisher +Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) +Date: July, 1988 + +This is a copy of the UCI ML iris dataset. + +The famous Iris database, first used by Sir R.A. Fisher + +This is perhaps the best known database to be found in the +pattern recognition literature. Fisher's paper is a classic in the field and +is referenced frequently to this day. (See Duda & Hart, for example.) The +data set contains 3 classes of 50 instances each, where each class refers to a +type of iris plant. One class is linearly separable from the other 2; the +latter are NOT linearly separable from each other. + + +References +---------- + + - Fisher,R.A. "The use of multiple measurements in taxonomic problems" + Annals of Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to + Mathematical Statistics" (John Wiley, NY, 1950). + - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis. + (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218. + - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System + Structure and Classification Rule for Recognition in Partially Exposed + Environments". IEEE Transactions on Pattern Analysis and Machine + Intelligence, Vol. PAMI-2, No. 1, 67-71. + - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions + on Information Theory, May 1972, 431-433. + - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al.'s AUTOCLASS II + conceptual clustering system finds 3 classes in the data. + - Many, many more ... 
#! /usr/bin/env python
# -*- coding: utf-8 -*-

# The code and descriptive text is copyrighted and offered under the terms of
# the BSD License from the authors; see below. However, the actual dataset may
# have a different origin and intellectual property status. See the SOURCE and
# COPYRIGHT variables for this information.

# Copyright (c) 2007 David Cournapeau <cournape@gmail.com>
#               2010 Fabian Pedregosa <fabian.pedregosa@inria.fr>
#
"""
Iris Plants Database

Creator: R.A. Fisher
Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
Date: July, 1988

This is a copy of UCI ML iris datasets.

References:
   - Fisher,R.A. 'The use of multiple measurements in taxonomic problems'
     Annals of Eugenics, 7, Part II, 179-188 (1936); also in 'Contributions to
     Mathematical Statistics' (John Wiley, NY, 1950).
   - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
   - Dasarathy, B.V. (1980) 'Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments'. IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) 'The Reduced Nearest Neighbor Rule'. IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more
"""

DESCR = """
The famous Iris database, first used by Sir R.A Fisher

This is perhaps the best known database to be found in the
pattern recognition literature. Fisher's paper is a classic in the field and
is referenced frequently to this day. (See Duda & Hart, for example.) The
data set contains 3 classes of 50 instances each, where each class refers to a
type of iris plant. One class is linearly separable from the other 2; the
latter are NOT linearly separable from each other.

Number of Instances: 150 (50 in each of three classes)

Number of Attributes: 4 numeric, predictive attributes and the class

Attribute Information:
   - sepal length in cm
   - sepal width in cm
   - petal length in cm
   - petal width in cm
   - class:
        - Iris-Setosa
        - Iris-Versicolour
        - Iris-Virginica

Summary Statistics:
                   Min  Max   Mean    SD   Class Correlation
   sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:   2.0  4.4   3.05   0.43   -0.4194
   petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:   0.1  2.5   1.20   0.76    0.9565  (high!)

Missing Attribute Values: None

Class Distribution: 33.3% for each of 3 classes.
"""

import numpy as np
from .base import Bunch


def load():
    """Load the iris data and return them.

    Thin convenience wrapper that asks the generic loader of the sibling
    ``base`` module for the dataset named ``'iris'``.

    Returns
    -------
    iris : Bunch
        See docstring of Bunch for a complete description of its members.

    Example
    -------
    >>> data = load()
    >>> print(data.label) #doctest: +ELLIPSIS
    [ 0.  0. ...]
    """
    # Imported here under another name: the generic loader is also called
    # ``load`` and would otherwise be shadowed by this function.
    from .base import load as _load_dataset
    return _load_dataset('iris')
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/scikits/learn/datasets/iris/__init__.py b/scikits/learn/datasets/iris/__init__.py deleted file mode 100644 index 11f79492c6bf831645979d0881e96f5666710413..0000000000000000000000000000000000000000 --- a/scikits/learn/datasets/iris/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -#! /usr/bin/env python -# Last Change: Mon Jul 02 02:00 PM 2007 J -import data as _iris -__doc__ = _iris.DESCRSHORT -copyright = _iris.COPYRIGHT -source = _iris.SOURCE - -load = _iris.load - -all = ['load', 'copyright', 'source'] diff --git a/scikits/learn/datasets/iris/data.py b/scikits/learn/datasets/iris/data.py deleted file mode 100644 index 444a9ecbc7b7c6a434bf961c6362b0875049b382..0000000000000000000000000000000000000000 --- a/scikits/learn/datasets/iris/data.py +++ /dev/null @@ -1,128 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- -# Last Change: Tue Jul 17 04:00 PM 2007 J - -# The code and descriptive text is copyrighted and offered under the terms of -# the BSD License from the authors; see below. However, the actual dataset may -# have a different origin and intellectual property status. See the SOURCE and -# COPYRIGHT variables for this information. - -# Copyright (c) 2007 David Cournapeau <cournape@gmail.com> -# -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the author nor the names of any contributors may be used -# to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -"""Iris dataset.""" - -__docformat__ = 'restructuredtext' - -COPYRIGHT = """See SOURCE. """ -TITLE = "Iris Plants Database" -SOURCE = """Creator: R.A. Fisher -Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) -Date: July, 1988 - -This is a copy of UCI ML iris datasets, except that the data are in mm instead -of cm, so that exact values as int can be given. - -References: - - Fisher,R.A. 
"The use of multiple measurements in taxonomic problems" - Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to - Mathematical Statistics" (John Wiley, NY, 1950). - - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis. - (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218. - - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System - Structure and Classification Rule for Recognition in Partially Exposed - Environments". IEEE Transactions on Pattern Analysis and Machine - Intelligence, Vol. PAMI-2, No. 1, 67-71. - - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions - on Information Theory, May 1972, 431-433. - - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II - conceptual clustering system finds 3 classes in the data.""" - -DESCRSHORT = """The famous Iris database, first used by Sir R.A Fisher""" - -DESCRLONG = """This is perhaps the best known database to be found in the -pattern recognition literature. Fisher's paper is a classic in the field and -is referenced frequently to this day. (See Duda & Hart, for example.) The -data set contains 3 classes of 50 instances each, where each class refers to a -type of iris plant. One class is linearly separable from the other 2; the -latter are NOT linearly separable from each other. """ - -NOTE = """ -Number of Instances: 150 (50 in each of three classes) - -Number of Attributes: 4 numeric, predictive attributes and the class - -Attribute Information: - - sepal length in mm - - sepal width in mm - - petal length in mm - - petal width in mm - - class: - - Iris-Setosa - - Iris-Versicolour - - Iris-Virginica - -Missing Attribute Values: None - -Class Distribution: 33.3% for each of 3 classes. -""" - -import numpy as np - - -def load(): - """load the iris data and returns them. 
- - :returns: - d : dict - contains the following values: - - 'data' : a record array with the actual data - - 'label' : label[i] = label index of data[i] - - 'class' : class[i] is the string corresponding to label index i. - - Example - ------- - - Let's say you are interested in the samples 10, 25, and 50, and want to - know their class name. - - >>>> d = load() - # >>>> ind = [10, 25, 50] - # >>>> lind = d['label'][ind] # returns the label index of each sample - # >>>> d['class'][lind] # returns the class name of each sample - - """ - from iris import SL, SW, PL, PW, labels, LI2LN - PW = np.array(PW, dtype=np.float) - PL = np.array(PL, dtype=np.float) - SW = np.array(SW, dtype=np.float) - SL = np.array(SL, dtype=np.float) - labels = np.array(labels, dtype=np.int) - return PW, PL, SW, SL, labels diff --git a/scikits/learn/datasets/iris/iris.py b/scikits/learn/datasets/iris/iris.py deleted file mode 100644 index 13ff411eac62740b94583fdc3c0ec517295ec6e3..0000000000000000000000000000000000000000 --- a/scikits/learn/datasets/iris/iris.py +++ /dev/null @@ -1,71 +0,0 @@ -# Autogenerated by convert.py at Tue, 17 Jul 2007 06:47:17 +0000 - -SL = ['5.1', '4.9', '4.7', '4.6', '5.0', '5.4', '4.6', '5.0', '4.4', '4.9', -'5.4', '4.8', '4.8', '4.3', '5.8', '5.7', '5.4', '5.1', '5.7', '5.1', '5.4', -'5.1', '4.6', '5.1', '4.8', '5.0', '5.0', '5.2', '5.2', '4.7', '4.8', '5.4', -'5.2', '5.5', '4.9', '5.0', '5.5', '4.9', '4.4', '5.1', '5.0', '4.5', '4.4', -'5.0', '5.1', '4.8', '5.1', '4.6', '5.3', '5.0', '7.0', '6.4', '6.9', '5.5', -'6.5', '5.7', '6.3', '4.9', '6.6', '5.2', '5.0', '5.9', '6.0', '6.1', '5.6', -'6.7', '5.6', '5.8', '6.2', '5.6', '5.9', '6.1', '6.3', '6.1', '6.4', '6.6', -'6.8', '6.7', '6.0', '5.7', '5.5', '5.5', '5.8', '6.0', '5.4', '6.0', '6.7', -'6.3', '5.6', '5.5', '5.5', '6.1', '5.8', '5.0', '5.6', '5.7', '5.7', '6.2', -'5.1', '5.7', '6.3', '5.8', '7.1', '6.3', '6.5', '7.6', '4.9', '7.3', '6.7', -'7.2', '6.5', '6.4', '6.8', '5.7', '5.8', '6.4', '6.5', '7.7', 
'7.7', '6.0', -'6.9', '5.6', '7.7', '6.3', '6.7', '7.2', '6.2', '6.1', '6.4', '7.2', '7.4', -'7.9', '6.4', '6.3', '6.1', '7.7', '6.3', '6.4', '6.0', '6.9', '6.7', '6.9', -'5.8', '6.8', '6.7', '6.7', '6.3', '6.5', '6.2', '5.9'] - -SW = ['3.5', '3.0', '3.2', '3.1', '3.6', '3.9', '3.4', '3.4', '2.9', '3.1', -'3.7', '3.4', '3.0', '3.0', '4.0', '4.4', '3.9', '3.5', '3.8', '3.8', '3.4', -'3.7', '3.6', '3.3', '3.4', '3.0', '3.4', '3.5', '3.4', '3.2', '3.1', '3.4', -'4.1', '4.2', '3.1', '3.2', '3.5', '3.1', '3.0', '3.4', '3.5', '2.3', '3.2', -'3.5', '3.8', '3.0', '3.8', '3.2', '3.7', '3.3', '3.2', '3.2', '3.1', '2.3', -'2.8', '2.8', '3.3', '2.4', '2.9', '2.7', '2.0', '3.0', '2.2', '2.9', '2.9', -'3.1', '3.0', '2.7', '2.2', '2.5', '3.2', '2.8', '2.5', '2.8', '2.9', '3.0', -'2.8', '3.0', '2.9', '2.6', '2.4', '2.4', '2.7', '2.7', '3.0', '3.4', '3.1', -'2.3', '3.0', '2.5', '2.6', '3.0', '2.6', '2.3', '2.7', '3.0', '2.9', '2.9', -'2.5', '2.8', '3.3', '2.7', '3.0', '2.9', '3.0', '3.0', '2.5', '2.9', '2.5', -'3.6', '3.2', '2.7', '3.0', '2.5', '2.8', '3.2', '3.0', '3.8', '2.6', '2.2', -'3.2', '2.8', '2.8', '2.7', '3.3', '3.2', '2.8', '3.0', '2.8', '3.0', '2.8', -'3.8', '2.8', '2.8', '2.6', '3.0', '3.4', '3.1', '3.0', '3.1', '3.1', '3.1', -'2.7', '3.2', '3.3', '3.0', '2.5', '3.0', '3.4', '3.0'] - -PL = ['1.4', '1.4', '1.3', '1.5', '1.4', '1.7', '1.4', '1.5', '1.4', '1.5', -'1.5', '1.6', '1.4', '1.1', '1.2', '1.5', '1.3', '1.4', '1.7', '1.5', '1.7', -'1.5', '1.0', '1.7', '1.9', '1.6', '1.6', '1.5', '1.4', '1.6', '1.6', '1.5', -'1.5', '1.4', '1.5', '1.2', '1.3', '1.5', '1.3', '1.5', '1.3', '1.3', '1.3', -'1.6', '1.9', '1.4', '1.6', '1.4', '1.5', '1.4', '4.7', '4.5', '4.9', '4.0', -'4.6', '4.5', '4.7', '3.3', '4.6', '3.9', '3.5', '4.2', '4.0', '4.7', '3.6', -'4.4', '4.5', '4.1', '4.5', '3.9', '4.8', '4.0', '4.9', '4.7', '4.3', '4.4', -'4.8', '5.0', '4.5', '3.5', '3.8', '3.7', '3.9', '5.1', '4.5', '4.5', '4.7', -'4.4', '4.1', '4.0', '4.4', '4.6', '4.0', '3.3', '4.2', '4.2', '4.2', 
'4.3', -'3.0', '4.1', '6.0', '5.1', '5.9', '5.6', '5.8', '6.6', '4.5', '6.3', '5.8', -'6.1', '5.1', '5.3', '5.5', '5.0', '5.1', '5.3', '5.5', '6.7', '6.9', '5.0', -'5.7', '4.9', '6.7', '4.9', '5.7', '6.0', '4.8', '4.9', '5.6', '5.8', '6.1', -'6.4', '5.6', '5.1', '5.6', '6.1', '5.6', '5.5', '4.8', '5.4', '5.6', '5.1', -'5.1', '5.9', '5.7', '5.2', '5.0', '5.2', '5.4', '5.1'] - -PW = ['0.2', '0.2', '0.2', '0.2', '0.2', '0.4', '0.3', '0.2', '0.2', '0.1', -'0.2', '0.2', '0.1', '0.1', '0.2', '0.4', '0.4', '0.3', '0.3', '0.3', '0.2', -'0.4', '0.2', '0.5', '0.2', '0.2', '0.4', '0.2', '0.2', '0.2', '0.2', '0.4', -'0.1', '0.2', '0.1', '0.2', '0.2', '0.1', '0.2', '0.2', '0.3', '0.3', '0.2', -'0.6', '0.4', '0.3', '0.2', '0.2', '0.2', '0.2', '1.4', '1.5', '1.5', '1.3', -'1.5', '1.3', '1.6', '1.0', '1.3', '1.4', '1.0', '1.5', '1.0', '1.4', '1.3', -'1.4', '1.5', '1.0', '1.5', '1.1', '1.8', '1.3', '1.5', '1.2', '1.3', '1.4', -'1.4', '1.7', '1.5', '1.0', '1.1', '1.0', '1.2', '1.6', '1.5', '1.6', '1.5', -'1.3', '1.3', '1.3', '1.2', '1.4', '1.2', '1.0', '1.3', '1.2', '1.3', '1.3', -'1.1', '1.3', '2.5', '1.9', '2.1', '1.8', '2.2', '2.1', '1.7', '1.8', '1.8', -'2.5', '2.0', '1.9', '2.1', '2.0', '2.4', '2.3', '1.8', '2.2', '2.3', '1.5', -'2.3', '2.0', '2.0', '1.8', '2.1', '1.8', '1.8', '1.8', '2.1', '1.6', '1.9', -'2.0', '2.2', '1.5', '1.4', '2.3', '2.4', '1.8', '1.8', '2.1', '2.4', '2.3', -'1.9', '2.3', '2.5', '2.3', '1.9', '2.0', '2.3', '1.8'] - -labels = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, -2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] - -LI2LN = {0: 'Iris-setosa', 1: 'Iris-versicolor', 2: 'Iris-virginica'} - diff 
--git a/scikits/learn/datasets/iris/src/convert.py b/scikits/learn/datasets/iris/src/convert.py deleted file mode 100755 index 86111fd77ce3b4cd9779ddf666045dc2fc287ebd..0000000000000000000000000000000000000000 --- a/scikits/learn/datasets/iris/src/convert.py +++ /dev/null @@ -1,44 +0,0 @@ -#! /usr/bin/env python -# Last Change: Tue Jul 17 03:00 PM 2007 J - -# This script generates a python file from the txt data -import time -import csv - -from scikits.learn.datasets.misc import dumpvar - -# array for equivalence label index <> label name -ln2li = {'Iris-setosa' : 0, 'Iris-versicolor': 1, 'Iris-virginica' :2} -li2ln = {} -for c,i in ln2li.items(): - li2ln[i] = c - -# Load the data -dataname = 'iris.data' -f = open(dataname, 'r') -a = csv.reader(f) -el = [i for i in a] -# Remove last value corresponding to empty line in data file -el.remove(el[-1]) -assert len(el) == 150 - -sl = [i[0] for i in el] -sw = [i[1] for i in el] -pl = [i[2] for i in el] -pw = [i[3] for i in el] -cl = [i[4] for i in el] - -# dcl[i] = label index of data[i] -dcl = [ln2li[i] for i in cl] - -# Write the data in oldfaitful.py -a = open("../iris.py", "w") -a.write('# Autogenerated by convert.py at %s\n\n' % - time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())) - -a.writelines(dumpvar(sl, 'SL')) -a.writelines(dumpvar(sw, 'SW')) -a.writelines(dumpvar(pl, 'PL')) -a.writelines(dumpvar(pw, 'PW')) -a.writelines(dumpvar(dcl, 'LABELS')) -a.writelines(dumpvar(li2ln, 'LI2LN')) diff --git a/scikits/learn/datasets/iris/src/iris.data b/scikits/learn/datasets/iris/src/iris.data deleted file mode 100644 index 5c4316cd695e7c72f1db7ef496ffd2d2ef705b25..0000000000000000000000000000000000000000 --- a/scikits/learn/datasets/iris/src/iris.data +++ /dev/null @@ -1,151 +0,0 @@ -5.1,3.5,1.4,0.2,Iris-setosa -4.9,3.0,1.4,0.2,Iris-setosa -4.7,3.2,1.3,0.2,Iris-setosa -4.6,3.1,1.5,0.2,Iris-setosa -5.0,3.6,1.4,0.2,Iris-setosa -5.4,3.9,1.7,0.4,Iris-setosa -4.6,3.4,1.4,0.3,Iris-setosa -5.0,3.4,1.5,0.2,Iris-setosa 
-4.4,2.9,1.4,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -5.4,3.7,1.5,0.2,Iris-setosa -4.8,3.4,1.6,0.2,Iris-setosa -4.8,3.0,1.4,0.1,Iris-setosa -4.3,3.0,1.1,0.1,Iris-setosa -5.8,4.0,1.2,0.2,Iris-setosa -5.7,4.4,1.5,0.4,Iris-setosa -5.4,3.9,1.3,0.4,Iris-setosa -5.1,3.5,1.4,0.3,Iris-setosa -5.7,3.8,1.7,0.3,Iris-setosa -5.1,3.8,1.5,0.3,Iris-setosa -5.4,3.4,1.7,0.2,Iris-setosa -5.1,3.7,1.5,0.4,Iris-setosa -4.6,3.6,1.0,0.2,Iris-setosa -5.1,3.3,1.7,0.5,Iris-setosa -4.8,3.4,1.9,0.2,Iris-setosa -5.0,3.0,1.6,0.2,Iris-setosa -5.0,3.4,1.6,0.4,Iris-setosa -5.2,3.5,1.5,0.2,Iris-setosa -5.2,3.4,1.4,0.2,Iris-setosa -4.7,3.2,1.6,0.2,Iris-setosa -4.8,3.1,1.6,0.2,Iris-setosa -5.4,3.4,1.5,0.4,Iris-setosa -5.2,4.1,1.5,0.1,Iris-setosa -5.5,4.2,1.4,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -5.0,3.2,1.2,0.2,Iris-setosa -5.5,3.5,1.3,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -4.4,3.0,1.3,0.2,Iris-setosa -5.1,3.4,1.5,0.2,Iris-setosa -5.0,3.5,1.3,0.3,Iris-setosa -4.5,2.3,1.3,0.3,Iris-setosa -4.4,3.2,1.3,0.2,Iris-setosa -5.0,3.5,1.6,0.6,Iris-setosa -5.1,3.8,1.9,0.4,Iris-setosa -4.8,3.0,1.4,0.3,Iris-setosa -5.1,3.8,1.6,0.2,Iris-setosa -4.6,3.2,1.4,0.2,Iris-setosa -5.3,3.7,1.5,0.2,Iris-setosa -5.0,3.3,1.4,0.2,Iris-setosa -7.0,3.2,4.7,1.4,Iris-versicolor -6.4,3.2,4.5,1.5,Iris-versicolor -6.9,3.1,4.9,1.5,Iris-versicolor -5.5,2.3,4.0,1.3,Iris-versicolor -6.5,2.8,4.6,1.5,Iris-versicolor -5.7,2.8,4.5,1.3,Iris-versicolor -6.3,3.3,4.7,1.6,Iris-versicolor -4.9,2.4,3.3,1.0,Iris-versicolor -6.6,2.9,4.6,1.3,Iris-versicolor -5.2,2.7,3.9,1.4,Iris-versicolor -5.0,2.0,3.5,1.0,Iris-versicolor -5.9,3.0,4.2,1.5,Iris-versicolor -6.0,2.2,4.0,1.0,Iris-versicolor -6.1,2.9,4.7,1.4,Iris-versicolor -5.6,2.9,3.6,1.3,Iris-versicolor -6.7,3.1,4.4,1.4,Iris-versicolor -5.6,3.0,4.5,1.5,Iris-versicolor -5.8,2.7,4.1,1.0,Iris-versicolor -6.2,2.2,4.5,1.5,Iris-versicolor -5.6,2.5,3.9,1.1,Iris-versicolor -5.9,3.2,4.8,1.8,Iris-versicolor -6.1,2.8,4.0,1.3,Iris-versicolor -6.3,2.5,4.9,1.5,Iris-versicolor 
-6.1,2.8,4.7,1.2,Iris-versicolor -6.4,2.9,4.3,1.3,Iris-versicolor -6.6,3.0,4.4,1.4,Iris-versicolor -6.8,2.8,4.8,1.4,Iris-versicolor -6.7,3.0,5.0,1.7,Iris-versicolor -6.0,2.9,4.5,1.5,Iris-versicolor -5.7,2.6,3.5,1.0,Iris-versicolor -5.5,2.4,3.8,1.1,Iris-versicolor -5.5,2.4,3.7,1.0,Iris-versicolor -5.8,2.7,3.9,1.2,Iris-versicolor -6.0,2.7,5.1,1.6,Iris-versicolor -5.4,3.0,4.5,1.5,Iris-versicolor -6.0,3.4,4.5,1.6,Iris-versicolor -6.7,3.1,4.7,1.5,Iris-versicolor -6.3,2.3,4.4,1.3,Iris-versicolor -5.6,3.0,4.1,1.3,Iris-versicolor -5.5,2.5,4.0,1.3,Iris-versicolor -5.5,2.6,4.4,1.2,Iris-versicolor -6.1,3.0,4.6,1.4,Iris-versicolor -5.8,2.6,4.0,1.2,Iris-versicolor -5.0,2.3,3.3,1.0,Iris-versicolor -5.6,2.7,4.2,1.3,Iris-versicolor -5.7,3.0,4.2,1.2,Iris-versicolor -5.7,2.9,4.2,1.3,Iris-versicolor -6.2,2.9,4.3,1.3,Iris-versicolor -5.1,2.5,3.0,1.1,Iris-versicolor -5.7,2.8,4.1,1.3,Iris-versicolor -6.3,3.3,6.0,2.5,Iris-virginica -5.8,2.7,5.1,1.9,Iris-virginica -7.1,3.0,5.9,2.1,Iris-virginica -6.3,2.9,5.6,1.8,Iris-virginica -6.5,3.0,5.8,2.2,Iris-virginica -7.6,3.0,6.6,2.1,Iris-virginica -4.9,2.5,4.5,1.7,Iris-virginica -7.3,2.9,6.3,1.8,Iris-virginica -6.7,2.5,5.8,1.8,Iris-virginica -7.2,3.6,6.1,2.5,Iris-virginica -6.5,3.2,5.1,2.0,Iris-virginica -6.4,2.7,5.3,1.9,Iris-virginica -6.8,3.0,5.5,2.1,Iris-virginica -5.7,2.5,5.0,2.0,Iris-virginica -5.8,2.8,5.1,2.4,Iris-virginica -6.4,3.2,5.3,2.3,Iris-virginica -6.5,3.0,5.5,1.8,Iris-virginica -7.7,3.8,6.7,2.2,Iris-virginica -7.7,2.6,6.9,2.3,Iris-virginica -6.0,2.2,5.0,1.5,Iris-virginica -6.9,3.2,5.7,2.3,Iris-virginica -5.6,2.8,4.9,2.0,Iris-virginica -7.7,2.8,6.7,2.0,Iris-virginica -6.3,2.7,4.9,1.8,Iris-virginica -6.7,3.3,5.7,2.1,Iris-virginica -7.2,3.2,6.0,1.8,Iris-virginica -6.2,2.8,4.8,1.8,Iris-virginica -6.1,3.0,4.9,1.8,Iris-virginica -6.4,2.8,5.6,2.1,Iris-virginica -7.2,3.0,5.8,1.6,Iris-virginica -7.4,2.8,6.1,1.9,Iris-virginica -7.9,3.8,6.4,2.0,Iris-virginica -6.4,2.8,5.6,2.2,Iris-virginica -6.3,2.8,5.1,1.5,Iris-virginica 
-6.1,2.6,5.6,1.4,Iris-virginica -7.7,3.0,6.1,2.3,Iris-virginica -6.3,3.4,5.6,2.4,Iris-virginica -6.4,3.1,5.5,1.8,Iris-virginica -6.0,3.0,4.8,1.8,Iris-virginica -6.9,3.1,5.4,2.1,Iris-virginica -6.7,3.1,5.6,2.4,Iris-virginica -6.9,3.1,5.1,2.3,Iris-virginica -5.8,2.7,5.1,1.9,Iris-virginica -6.8,3.2,5.9,2.3,Iris-virginica -6.7,3.3,5.7,2.5,Iris-virginica -6.7,3.0,5.2,2.3,Iris-virginica -6.3,2.5,5.0,1.9,Iris-virginica -6.5,3.0,5.2,2.0,Iris-virginica -6.2,3.4,5.4,2.3,Iris-virginica -5.9,3.0,5.1,1.8,Iris-virginica - diff --git a/scikits/learn/datasets/iris/src/iris.names b/scikits/learn/datasets/iris/src/iris.names deleted file mode 100644 index 7730e5d2699edf21fc3f600bba26d09962cc19fb..0000000000000000000000000000000000000000 --- a/scikits/learn/datasets/iris/src/iris.names +++ /dev/null @@ -1,62 +0,0 @@ -1. Title: Iris Plants Database - -2. Sources: - (a) Creator: R.A. Fisher - (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) - (c) Date: July, 1988 - -3. Past Usage: - - Publications: too many to mention!!! Here are a few. - 1. Fisher,R.A. "The use of multiple measurements in taxonomic problems" - Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions - to Mathematical Statistics" (John Wiley, NY, 1950). - 2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis. - (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218. - 3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System - Structure and Classification Rule for Recognition in Partially Exposed - Environments". IEEE Transactions on Pattern Analysis and Machine - Intelligence, Vol. PAMI-2, No. 1, 67-71. - -- Results: - -- very low misclassification rates (0% for the setosa class) - 4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE - Transactions on Information Theory, May 1972, 431-433. - -- Results: - -- very low misclassification rates again - 5. See also: 1988 MLC Proceedings, 54-64. 
Cheeseman et al's AUTOCLASS II - conceptual clustering system finds 3 classes in the data. - -4. Relevant Information: - --- This is perhaps the best known database to be found in the pattern - recognition literature. Fisher's paper is a classic in the field - and is referenced frequently to this day. (See Duda & Hart, for - example.) The data set contains 3 classes of 50 instances each, - where each class refers to a type of iris plant. One class is - linearly separable from the other 2; the latter are NOT linearly - separable from each other. - --- Predicted attribute: class of iris plant. - --- This is an exceedingly simple domain. - -5. Number of Instances: 150 (50 in each of three classes) - -6. Number of Attributes: 4 numeric, predictive attributes and the class - -7. Attribute Information: - 1. sepal length in cm - 2. sepal width in cm - 3. petal length in cm - 4. petal width in cm - 5. class: - -- Iris Setosa - -- Iris Versicolour - -- Iris Virginica - -8. Missing Attribute Values: None - -Summary Statistics: - Min Max Mean SD Class Correlation - sepal length: 4.3 7.9 5.84 0.83 0.7826 - sepal width: 2.0 4.4 3.05 0.43 -0.4194 - petal length: 1.0 6.9 3.76 1.76 0.9490 (high!) - petal width: 0.1 2.5 1.20 0.76 0.9565 (high!) - -9. Class Distribution: 33.3% for each of 3 classes.