diff --git a/examples/lda.py b/examples/lda.py index 149cc61421a5452912976f066d79bdb0027835a1..8560026554d0f6c05d3dc255cc3b3306898afc0a 100644 --- a/examples/lda.py +++ b/examples/lda.py @@ -16,7 +16,7 @@ import pylab as pl # The IRIS dataset from scikits.learn import datasets, svm -iris = datasets.load('iris') +iris = datasets.load_iris() # Some noisy data not correlated E = np.random.normal(size=(len(iris.data), 35)) diff --git a/examples/naive_bayes.py b/examples/naive_bayes.py index d4a39baa202a817191f6700dfbd1d118ad459c71..86b3a4cf3f47c0e5498dea0ff9b1473d986815bd 100644 --- a/examples/naive_bayes.py +++ b/examples/naive_bayes.py @@ -16,7 +16,7 @@ import pylab as pl # The IRIS dataset from scikits.learn import datasets, svm -iris = datasets.load('iris') +iris = datasets.load_iris() X = iris.data @@ -29,4 +29,4 @@ gnb = GNB() y_pred = gnb.fit(X, y).predict(X) -print "Number of mislabeled points : %d"%(y != y_pred).sum() +print "Number of mislabeled points : %d" % (y != y_pred).sum() diff --git a/examples/plot_feature_selection.py b/examples/plot_feature_selection.py index 5d5bca900a9984068de40d92622273a4b07e47d1..063b67ffc202219164692f0f3515f3dd47b5705b 100644 --- a/examples/plot_feature_selection.py +++ b/examples/plot_feature_selection.py @@ -28,7 +28,7 @@ import pylab as pl # The IRIS dataset from scikits.learn import datasets, svm -iris = datasets.load('iris') +iris = datasets.load_iris() # Some noisy data not correlated E = np.random.normal(size=(len(iris.data), 35)) diff --git a/examples/plot_neighbors.py b/examples/plot_neighbors.py index c1c932cd4deae6ae5c972e35d5f166b296e86b81..3c22fe3e389bacb87511a37582e5b0d0b3aeebec 100644 --- a/examples/plot_neighbors.py +++ b/examples/plot_neighbors.py @@ -11,7 +11,7 @@ import pylab as pl from scikits.learn import neighbors, datasets # import some data to play with -iris = datasets.load('iris') +iris = datasets.load_iris() X = iris.data[:, :2] # we only take the first two features. 
We could # avoid this ugly slicing by using a two-dim dataset Y = iris.target diff --git a/examples/plot_svm.py b/examples/plot_svm.py index 6d4a56f1730b85c77b611221ccc24b06aed20bab..3a1aadade32a17371531d70db004bdce4fc9a4e0 100644 --- a/examples/plot_svm.py +++ b/examples/plot_svm.py @@ -11,7 +11,7 @@ import pylab as pl from scikits.learn import svm, datasets # import some data to play with -iris = datasets.load('iris') +iris = datasets.load_iris() X = iris.data[:, :2] # we only take the first two features. We could # avoid this ugly slicing by using a two-dim dataset Y = iris.target diff --git a/scikits/learn/datasets/__init__.py b/scikits/learn/datasets/__init__.py index d89713f7dfcbdee04d867a88c89cf1eab9e8c2b2..15bbd36d9f12ce1ca344c9b64d93b547b360790f 100644 --- a/scikits/learn/datasets/__init__.py +++ b/scikits/learn/datasets/__init__.py @@ -1 +1 @@ -from base import load +from base import load_iris, load_digits diff --git a/scikits/learn/datasets/base.py b/scikits/learn/datasets/base.py index 76e0e3f88be37112aa213c9c9d7871af9840bd18..f4de58c45e6ed407fa65020753dda0b436dc4b8d 100644 --- a/scikits/learn/datasets/base.py +++ b/scikits/learn/datasets/base.py @@ -1,67 +1,101 @@ """ -Base object for all datasets +Base IO code for all datasets """ # Copyright (c) 2007 David Cournapeau <cournape@gmail.com> # 2010 Fabian Pedregosa <fabian.pedregosa@inria.fr> # +import csv +import os import numpy as np -class Bunch(dict): - """ - Container for dataset. - Members - ------- - - data : a record array with the actual data - - label : label[i] = label index of data[i] - - class : class[i] is the string corresponding to label index i. - - COPYRIGHT, TITLE, SOURCE, DESCRSHORT, DESCRLONG, - NOTE. Information about the dataset. +class Bunch(dict): + """ Container object for datasets: dictionary-like object that + exposes its keys as attributes. 
""" def __init__(self, **kwargs): dict.__init__(self, kwargs) self.__dict__ = self -def load(dataset): - """load the data and returns them. + +################################################################################ + +def load_iris(): + """load the iris dataset and returns it. Returns ------- data : Bunch - See docstring of bunch for a complete description of its members. - - Available datasets - - iris + Dictionary-like object, the interesting attributes are: + 'data', the data to learn, 'target', the classification labels, + 'target_names', the meaning of the labels, and 'DESCR', the + full description of the dataset. Example ------- Let's say you are interested in the samples 10, 25, and 50, and want to know their class name. - >>> data = load() - >>> print data.label #doctest: +ELLIPSIS - [ 0. 0. ...] + >>> data = load_iris() + >>> print data.target[[10, 25, 50]] + [0 0 1] + >>> print data.target_names[data.target[[10, 25, 50]]] + ['setosa' 'setosa' 'versicolor'] """ - import csv - import os - firis = csv.reader(open(os.path.dirname(__file__) - + '/data/%s.csv' % dataset)) + data_file = csv.reader(open(os.path.dirname(__file__) + + '/data/iris.csv')) fdescr = open(os.path.dirname(__file__) - + '/descr/%s.rst' % dataset) - temp = firis.next() - nsamples = int(temp[0]) - nfeat = int(temp[1]) - targetnames = temp[2:] - data = np.empty((nsamples, nfeat)) - target = np.empty((nsamples,)) - for i, ir in enumerate(firis): + + '/descr/iris.rst') + temp = data_file.next() + n_samples = int(temp[0]) + n_features = int(temp[1]) + target_names = np.array(temp[2:]) + data = np.empty((n_samples, n_features)) + target = np.empty((n_samples,), dtype=np.int) + for i, ir in enumerate(data_file): data[i] = np.asanyarray(ir[:-1], dtype=np.float) - target[i] = np.asanyarray(ir[-1], dtype=np.float) - return Bunch(data=data, target=target, targetnames=targetnames, + target[i] = np.asanyarray(ir[-1], dtype=np.int) + return Bunch(data=data, target=target, 
target_names=target_names, + DESCR=fdescr.read()) + +def load_digits(): + """load the digits dataset and returns it. + + Returns + ------- + data : Bunch + Dictionary-like object, the interesting attributes are: + 'data', the data to learn, `raw_data`, the images corresponding + to each sample, 'target', the classification labels for each + sample, 'target_names', the meaning of the labels, and 'DESCR', + the full description of the dataset. + + Example + ------- + To load the data and visualize the images:: + + import pylab as pl + digits = datasets.load_digits() + pl.gray() + # Visualize the first image: + pl.matshow(digits.raw_data[0]) + +""" + + data = np.loadtxt(os.path.join(os.path.dirname(__file__) + + '/data/digits.csv.gz'), delimiter=',') + fdescr = open(os.path.join(os.path.dirname(__file__) + + '/descr/digits.rst')) + target = data[:, -1] + flat_data = data[:, :-1] + images = flat_data.view() + images.shape = (-1, 8, 8) + return Bunch(data=flat_data, target=target, target_names=np.arange(10), + raw_data=images, DESCR=fdescr.read()) diff --git a/scikits/learn/datasets/data/digits.csv.gz b/scikits/learn/datasets/data/digits.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..e191a13019b607d4477d1031503ab46218e21040 Binary files /dev/null and b/scikits/learn/datasets/data/digits.csv.gz differ diff --git a/scikits/learn/datasets/descr/digits.rst b/scikits/learn/datasets/descr/digits.rst new file mode 100644 index 0000000000000000000000000000000000000000..c8d92ecb3072d84593f46fd5ad069df4bbb48e00 --- /dev/null +++ b/scikits/learn/datasets/descr/digits.rst @@ -0,0 +1,55 @@ + Optical Recognition of Handwritten Digits Data Set + +Data Set Characteristics: + +Source +------- + +Creator: E. Alpaydin (alpaydin '@' boun.edu.tr) +Date: July; 1998 + +This is a copy of the test set of the UCI ML hand-written digits datasets + +The data set contains images of hand-written digits: 10 classes where +each class refers to a digit. 
+ +Preprocessing programs made available by NIST were used to extract +normalized bitmaps of handwritten digits from a preprinted form. From a +total of 43 people, 30 contributed to the training set and different 13 +to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of +4x4 and the number of on pixels are counted in each block. This generates +an input matrix of 8x8 where each element is an integer in the range +0..16. This reduces dimensionality and gives invariance to small +distortions. + +For info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G. +T. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C. +L. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469, +1994. + + +References +----------- + + - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their + Applications to Handwritten Digit Recognition, MSc Thesis, Institute of + Graduate Studies in Science and Engineering, Bogazici University. + - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika. + - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin. + Linear dimensionalityreduction using relevance weighted LDA. School of + Electrical and Electronic Engineering Nanyang Technological University. + 2005. + - Claudio Gentile. A New Approximate Maximal Margin Classification + Algorithm. NIPS. 2000. + - ... + + +Number of Instances: 5620 + +Number of Attributes: 64 + +Attribute Information: 8x8 image of integer pixels in the range 0..16. + +Missing Attribute Values: None + + diff --git a/scikits/learn/datasets/descr/iris.rst b/scikits/learn/datasets/descr/iris.rst index 62a2d8e48404beb4da452a71bd95a173d69dd6ee..c5f4e1fbfb7ad264ac9d12f7ef81a5d5b1625463 100644 --- a/scikits/learn/datasets/descr/iris.rst +++ b/scikits/learn/datasets/descr/iris.rst @@ -62,8 +62,3 @@ Missing Attribute Values: None Class Distribution: 33.3% for each of 3 classes. 
-Example -------- - >>> data = load() - >>> print data.label #doctest: +ELLIPSIS - [ 0. 0. ...][ 0. 0. ...]