diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1e599ed753c7f2a5c73ffed79a3c5fc039772d6
--- /dev/null
+++ b/examples/preprocessing/plot_scaling_importance.py
@@ -0,0 +1,131 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+=========================================================
+Importance of Feature Scaling
+=========================================================
+
+Feature scaling through standardization (or Z-score normalization)
+can be an important preprocessing step for many machine learning
+algorithms. Standardization involves rescaling the features such
+that they have the properties of a standard normal distribution
+with a mean of zero and a standard deviation of one.
+
+While many algorithms (such as SVM, K-nearest neighbors, and logistic
+regression) benefit from normalized features, Principal Component
+Analysis (PCA) provides a prime example of when normalization is
+important. In PCA we are interested in the components that maximize
+the variance. If one feature (e.g. human height) varies less than
+another (e.g. weight) purely because of their respective scales
+(meters vs. kilograms), PCA might determine that the direction of
+maximal variance corresponds more closely with the 'weight' axis when
+the features are not scaled. Since a change in height of one meter is
+far more significant than a change in weight of one kilogram, this is
+clearly incorrect.
+
+To illustrate this, PCA is performed on the data both with and without
+:class:`StandardScaler <sklearn.preprocessing.StandardScaler>` applied,
+and the results are visualized. In the unscaled set, the 1st principal
+component is dominated by feature #13 (proline), whose weight is a
+whole two orders of magnitude above those of the other features. In
+the scaled version, by contrast, the weights are of roughly the same
+order of magnitude across all the features.
+
+The dataset used is the Wine Dataset available at UCI. This dataset
+has continuous features that are heterogeneous in scale due to the
+differing properties that they measure (e.g. alcohol content and
+malic acid).
+
+The transformed data is then used to train a naive Bayes classifier,
+and a clear difference in prediction accuracy is observed: the
+pipeline that scales the data before PCA vastly outperforms the
+unscaled version.
+
+"""
+from __future__ import print_function
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
+from sklearn.naive_bayes import GaussianNB
+from sklearn import metrics
+import matplotlib.pyplot as plt
+from sklearn.datasets import load_wine
+from sklearn.pipeline import make_pipeline
+print(__doc__)
+
+# Code source: Tyler Lanigan <tylerlanigan@gmail.com>
+#              Sebastian Raschka <mail@sebastianraschka.com>
+
+# License: BSD 3 clause
+
+RANDOM_STATE = 42
+FIG_SIZE = (10, 7)
+
+
+features, target = load_wine(return_X_y=True)
+
+# Make a train/test split using 30% of the data for testing
+X_train, X_test, y_train, y_test = train_test_split(features, target,
+                                                    test_size=0.30,
+                                                    random_state=RANDOM_STATE)
+
+# Fit to data and predict using a pipeline of PCA and GaussianNB.
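+# NOTE: make_pipeline names each step after its lowercased class name
+# ('pca', 'standardscaler', 'gaussiannb'); those names are used further
+# below to pull the fitted PCA and scaler back out via named_steps.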
+unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
+unscaled_clf.fit(X_train, y_train)
+pred_test = unscaled_clf.predict(X_test)
+
+# Fit to data and predict using a pipeline of scaling, PCA and GaussianNB.
+std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
+std_clf.fit(X_train, y_train)
+pred_test_std = std_clf.predict(X_test)
+
+# Show prediction accuracies on scaled and unscaled data.
+print('\nPrediction accuracy for the normal test dataset with PCA')
+print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))
+
+print('\nPrediction accuracy for the standardized test dataset with PCA')
+print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))
+
+# Extract the PCA step from each pipeline
+pca = unscaled_clf.named_steps['pca']
+pca_std = std_clf.named_steps['pca']
+
+# Show the first principal components
+print('\nPC 1 without scaling:\n', pca.components_[0])
+print('\nPC 1 with scaling:\n', pca_std.components_[0])
+
+# Transform X_train with each fitted PCA (raw and standardized) for
+# visualization.
+X_train_pca = pca.transform(X_train)
+scaler = std_clf.named_steps['standardscaler']
+X_train_std = pca_std.transform(scaler.transform(X_train))
+
+# Visualize the standardized vs. untouched dataset with PCA performed
+fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)
+
+
+for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
+    ax1.scatter(X_train_pca[y_train == l, 0], X_train_pca[y_train == l, 1],
+                color=c,
+                label='class %s' % l,
+                alpha=0.5,
+                marker=m
+                )
+
+for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
+    ax2.scatter(X_train_std[y_train == l, 0], X_train_std[y_train == l, 1],
+                color=c,
+                label='class %s' % l,
+                alpha=0.5,
+                marker=m
+                )
+
+ax1.set_title('Training dataset after PCA')
+ax2.set_title('Standardized training dataset after PCA')
+
+for ax in (ax1, ax2):
+    ax.set_xlabel('1st principal component')
+    ax.set_ylabel('2nd principal component')
+    ax.legend(loc='upper right')
+    ax.grid()
+
+plt.tight_layout()
+
+plt.show()
diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py
index c38e99acd3d5b17122c41f88cd064c74400217ca..c43c0c4758b10dd5720e42b9e290d676140fbb3c 100644
--- a/sklearn/datasets/__init__.py
+++ b/sklearn/datasets/__init__.py
@@ -3,18 +3,18 @@ The :mod:`sklearn.datasets` module includes utilities to load datasets,
 including methods to load and fetch popular reference datasets. It also
 features some artificial data generators.
""" - +from .base import load_breast_cancer +from .base import load_boston from .base import load_diabetes from .base import load_digits from .base import load_files from .base import load_iris -from .base import load_breast_cancer from .base import load_linnerud -from .base import load_boston -from .base import get_data_home -from .base import clear_data_home from .base import load_sample_images from .base import load_sample_image +from .base import load_wine +from .base import get_data_home +from .base import clear_data_home from .covtype import fetch_covtype from .kddcup99 import fetch_kddcup99 from .mlcomp import load_mlcomp @@ -78,6 +78,7 @@ __all__ = ['clear_data_home', 'load_sample_images', 'load_svmlight_file', 'load_svmlight_files', + 'load_wine', 'make_biclusters', 'make_blobs', 'make_circles', diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index b83f9d4985e4630ad213e6dd95702891e588bdd6..2325d971428d2f74f5a2708c3b24142f81ab2b19 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -242,6 +242,122 @@ def load_files(container_path, description=None, categories=None, DESCR=description) +def load_data(module_path, data_file_name): + """Loads data from module_path/data/data_file_name. + + Parameters + ---------- + data_file_name : String. Name of csv file to be loaded from + module_path/data/data_file_name. For example 'wine_data.csv'. + + Returns + ------- + data : Numpy Array + A 2D array with each row representing one sample and each column + representing the features of a given sample. + + target : Numpy Array + A 1D array holding target variables for all the samples in `data. + For example target[0] is the target varible for data[0]. + + target_names : Numpy Array + A 1D array containing the names of the classifications. For example + target_names[0] is the name of the target[0] class. + """ + with open(join(module_path, 'data', data_file_name)) as csv_file: + data_file = csv.reader(csv_file) + temp = next(data_file) + n_samples = int(temp[0]) + n_features = int(temp[1]) + target_names = np.array(temp[2:]) + data = np.empty((n_samples, n_features)) + target = np.empty((n_samples,), dtype=np.int) + + for i, ir in enumerate(data_file): + data[i] = np.asarray(ir[:-1], dtype=np.float64) + target[i] = np.asarray(ir[-1], dtype=np.int) + + return data, target, target_names + + +def load_wine(return_X_y=False): + """Load and return the wine dataset (classification). + + .. versionadded:: 0.18 + + The wine dataset is a classic and very easy multi-class classification + dataset. + + ================= ============== + Classes 3 + Samples per class [59,71,48] + Samples total 178 + Dimensionality 13 + Features real, positive + ================= ============== + + Read more in the :ref:`User Guide <datasets>`. + + Parameters + ---------- + return_X_y : boolean, default=False. + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + Returns + ------- + data : Bunch + Dictionary-like object, the interesting attributes are: + 'data', the data to learn, 'target', the classification labels, + 'target_names', the meaning of the labels, 'feature_names', the + meaning of the features, and 'DESCR', the + full description of the dataset. 
+ + (data, target) : tuple if ``return_X_y`` is True + + The copy of UCI ML Wine Data Set dataset is + downloaded and modified to fit standard format from: + https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data + + Examples + -------- + Let's say you are interested in the samples 10, 80, and 140, and want to + know their class name. + + >>> from sklearn.datasets import load_wine + >>> data = load_wine() + >>> data.target[[10, 80, 140]] + array([0, 1, 2]) + >>> list(data.target_names) + ['class_0', 'class_1', 'class_2'] + """ + module_path = dirname(__file__) + data, target, target_names = load_data(module_path, 'wine_data.csv') + + with open(join(module_path, 'descr', 'wine_data.rst')) as rst_file: + fdescr = rst_file.read() + + if return_X_y: + return data, target + + return Bunch(data=data, target=target, + target_names=target_names, + DESCR=fdescr, + feature_names=['alcohol', + 'malic_acid', + 'ash', + 'alcalinity_of_ash', + 'magnesium', + 'total_phenols', + 'flavanoids', + 'nonflavanoid_phenols', + 'proanthocyanins', + 'color_intensity', + 'hue', + 'od280/od315_of_diluted_wines', + 'proline']) + + def load_iris(return_X_y=False): """Load and return the iris dataset (classification). @@ -292,18 +408,7 @@ def load_iris(return_X_y=False): ['setosa', 'versicolor', 'virginica'] """ module_path = dirname(__file__) - with open(join(module_path, 'data', 'iris.csv')) as csv_file: - data_file = csv.reader(csv_file) - temp = next(data_file) - n_samples = int(temp[0]) - n_features = int(temp[1]) - target_names = np.array(temp[2:]) - data = np.empty((n_samples, n_features)) - target = np.empty((n_samples,), dtype=np.int) - - for i, ir in enumerate(data_file): - data[i] = np.asarray(ir[:-1], dtype=np.float64) - target[i] = np.asarray(ir[-1], dtype=np.int) + data, target, target_names = load_data(module_path, 'iris.csv') with open(join(module_path, 'descr', 'iris.rst')) as rst_file: fdescr = rst_file.read() @@ -370,18 +475,7 @@ def load_breast_cancer(return_X_y=False): ['malignant', 'benign'] """ module_path = dirname(__file__) - with open(join(module_path, 'data', 'breast_cancer.csv')) as csv_file: - data_file = csv.reader(csv_file) - first_line = next(data_file) - n_samples = int(first_line[0]) - n_features = int(first_line[1]) - target_names = np.array(first_line[2:4]) - data = np.empty((n_samples, n_features)) - target = np.empty((n_samples,), dtype=np.int) - - for count, value in enumerate(data_file): - data[count] = np.asarray(value[:-1], dtype=np.float64) - target[count] = np.asarray(value[-1], dtype=np.int) + data, target, target_names = load_data(module_path, 'breast_cancer.csv') with open(join(module_path, 'descr', 'breast_cancer.rst')) as rst_file: fdescr = rst_file.read() @@ -517,12 +611,12 @@ def load_diabetes(return_X_y=False): (data, target) : tuple if ``return_X_y`` is True - .. versionadded:: 0.18 + .. versionadded:: 0.18 """ base_dir = join(dirname(__file__), 'data') data = np.loadtxt(join(base_dir, 'diabetes_data.csv.gz')) target = np.loadtxt(join(base_dir, 'diabetes_target.csv.gz')) - + if return_X_y: return data, target @@ -554,7 +648,7 @@ def load_linnerud(return_X_y=False): 'targets', the two multivariate datasets, with 'data' corresponding to the exercise and 'targets' corresponding to the physiological measurements, as well as 'feature_names' and 'target_names'. - + (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 @@ -608,7 +702,7 @@ def load_boston(return_X_y=False): (data, target) : tuple if ``return_X_y`` is True - .. 
versionadded:: 0.18 + .. versionadded:: 0.18 Examples -------- diff --git a/sklearn/datasets/data/breast_cancer.csv b/sklearn/datasets/data/breast_cancer.csv index 8eafb958159787376747c74d1e46ae6ed486c0c6..979a3dcb6786a29213bec3ea3a427c514c79975b 100644 --- a/sklearn/datasets/data/breast_cancer.csv +++ b/sklearn/datasets/data/breast_cancer.csv @@ -1,4 +1,4 @@ -569,30,malignant,benign,,,,,,,,,,,,,,,,,,,,,,,,,,, +569,30,malignant,benign 17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0 20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0 19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0 diff --git a/sklearn/datasets/data/wine_data.csv b/sklearn/datasets/data/wine_data.csv new file mode 100644 index 0000000000000000000000000000000000000000..6c7fe81952aa6129023730ced4581b42ecd085af --- /dev/null +++ b/sklearn/datasets/data/wine_data.csv @@ -0,0 +1,179 @@ +178,13,class_0,class_1,class_2 +14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,0 +13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,0 +13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,0 +14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,0 +13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,0 +14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450,0 +14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290,0 +14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295,0 +14.83,1.64,2.17,14,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045,0 +13.86,1.35,2.27,16,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045,0 +14.1,2.16,2.3,18,105,2.95,3.32,0.22,2.38,5.75,1.25,3.17,1510,0 +14.12,1.48,2.32,16.8,95,2.2,2.43,0.26,1.57,5,1.17,2.82,1280,0 +13.75,1.73,2.41,16,89,2.6,2.76,0.29,1.81,5.6,1.15,2.9,1320,0 +14.75,1.73,2.39,11.4,91,3.1,3.69,0.43,2.81,5.4,1.25,2.73,1150,0 +14.38,1.87,2.38,12,102,3.3,3.64,0.29,2.96,7.5,1.2,3,1547,0 +13.63,1.81,2.7,17.2,112,2.85,2.91,0.3,1.46,7.3,1.28,2.88,1310,0 +14.3,1.92,2.72,20,120,2.8,3.14,0.33,1.97,6.2,1.07,2.65,1280,0 +13.83,1.57,2.62,20,115,2.95,3.4,0.4,1.72,6.6,1.13,2.57,1130,0 +14.19,1.59,2.48,16.5,108,3.3,3.93,0.32,1.86,8.7,1.23,2.82,1680,0 +13.64,3.1,2.56,15.2,116,2.7,3.03,0.17,1.66,5.1,0.96,3.36,845,0 +14.06,1.63,2.28,16,126,3,3.17,0.24,2.1,5.65,1.09,3.71,780,0 +12.93,3.8,2.65,18.6,102,2.41,2.41,0.25,1.98,4.5,1.03,3.52,770,0 +13.71,1.86,2.36,16.6,101,2.61,2.88,0.27,1.69,3.8,1.11,4,1035,0 +12.85,1.6,2.52,17.8,95,2.48,2.37,0.26,1.46,3.93,1.09,3.63,1015,0 +13.5,1.81,2.61,20,96,2.53,2.61,0.28,1.66,3.52,1.12,3.82,845,0 +13.05,2.05,3.22,25,124,2.63,2.68,0.47,1.92,3.58,1.13,3.2,830,0 +13.39,1.77,2.62,16.1,93,2.85,2.94,0.34,1.45,4.8,0.92,3.22,1195,0 +13.3,1.72,2.14,17,94,2.4,2.19,0.27,1.35,3.95,1.02,2.77,1285,0 +13.87,1.9,2.8,19.4,107,2.95,2.97,0.37,1.76,4.5,1.25,3.4,915,0 +14.02,1.68,2.21,16,96,2.65,2.33,0.26,1.98,4.7,1.04,3.59,1035,0 +13.73,1.5,2.7,22.5,101,3,3.25,0.29,2.38,5.7,1.19,2.71,1285,0 +13.58,1.66,2.36,19.1,106,2.86,3.19,0.22,1.95,6.9,1.09,2.88,1515,0 +13.68,1.83,2.36,17.2,104,2.42,2.69,0.42,1.97,3.84,1.23,2.87,990,0 +13.76,1.53,2.7,19.5,132,2.95,2.74,0.5,1.35,5.4,1.25,3,1235,0 
+13.51,1.8,2.65,19,110,2.35,2.53,0.29,1.54,4.2,1.1,2.87,1095,0 +13.48,1.81,2.41,20.5,100,2.7,2.98,0.26,1.86,5.1,1.04,3.47,920,0 +13.28,1.64,2.84,15.5,110,2.6,2.68,0.34,1.36,4.6,1.09,2.78,880,0 +13.05,1.65,2.55,18,98,2.45,2.43,0.29,1.44,4.25,1.12,2.51,1105,0 +13.07,1.5,2.1,15.5,98,2.4,2.64,0.28,1.37,3.7,1.18,2.69,1020,0 +14.22,3.99,2.51,13.2,128,3,3.04,0.2,2.08,5.1,0.89,3.53,760,0 +13.56,1.71,2.31,16.2,117,3.15,3.29,0.34,2.34,6.13,0.95,3.38,795,0 +13.41,3.84,2.12,18.8,90,2.45,2.68,0.27,1.48,4.28,0.91,3,1035,0 +13.88,1.89,2.59,15,101,3.25,3.56,0.17,1.7,5.43,0.88,3.56,1095,0 +13.24,3.98,2.29,17.5,103,2.64,2.63,0.32,1.66,4.36,0.82,3,680,0 +13.05,1.77,2.1,17,107,3,3,0.28,2.03,5.04,0.88,3.35,885,0 +14.21,4.04,2.44,18.9,111,2.85,2.65,0.3,1.25,5.24,0.87,3.33,1080,0 +14.38,3.59,2.28,16,102,3.25,3.17,0.27,2.19,4.9,1.04,3.44,1065,0 +13.9,1.68,2.12,16,101,3.1,3.39,0.21,2.14,6.1,0.91,3.33,985,0 +14.1,2.02,2.4,18.8,103,2.75,2.92,0.32,2.38,6.2,1.07,2.75,1060,0 +13.94,1.73,2.27,17.4,108,2.88,3.54,0.32,2.08,8.9,1.12,3.1,1260,0 +13.05,1.73,2.04,12.4,92,2.72,3.27,0.17,2.91,7.2,1.12,2.91,1150,0 +13.83,1.65,2.6,17.2,94,2.45,2.99,0.22,2.29,5.6,1.24,3.37,1265,0 +13.82,1.75,2.42,14,111,3.88,3.74,0.32,1.87,7.05,1.01,3.26,1190,0 +13.77,1.9,2.68,17.1,115,3,2.79,0.39,1.68,6.3,1.13,2.93,1375,0 +13.74,1.67,2.25,16.4,118,2.6,2.9,0.21,1.62,5.85,0.92,3.2,1060,0 +13.56,1.73,2.46,20.5,116,2.96,2.78,0.2,2.45,6.25,0.98,3.03,1120,0 +14.22,1.7,2.3,16.3,118,3.2,3,0.26,2.03,6.38,0.94,3.31,970,0 +13.29,1.97,2.68,16.8,102,3,3.23,0.31,1.66,6,1.07,2.84,1270,0 +13.72,1.43,2.5,16.7,108,3.4,3.67,0.19,2.04,6.8,0.89,2.87,1285,0 +12.37,0.94,1.36,10.6,88,1.98,0.57,0.28,0.42,1.95,1.05,1.82,520,1 +12.33,1.1,2.28,16,101,2.05,1.09,0.63,0.41,3.27,1.25,1.67,680,1 +12.64,1.36,2.02,16.8,100,2.02,1.41,0.53,0.62,5.75,0.98,1.59,450,1 +13.67,1.25,1.92,18,94,2.1,1.79,0.32,0.73,3.8,1.23,2.46,630,1 +12.37,1.13,2.16,19,87,3.5,3.1,0.19,1.87,4.45,1.22,2.87,420,1 +12.17,1.45,2.53,19,104,1.89,1.75,0.45,1.03,2.95,1.45,2.23,355,1 +12.37,1.21,2.56,18.1,98,2.42,2.65,0.37,2.08,4.6,1.19,2.3,678,1 +13.11,1.01,1.7,15,78,2.98,3.18,0.26,2.28,5.3,1.12,3.18,502,1 +12.37,1.17,1.92,19.6,78,2.11,2,0.27,1.04,4.68,1.12,3.48,510,1 +13.34,0.94,2.36,17,110,2.53,1.3,0.55,0.42,3.17,1.02,1.93,750,1 +12.21,1.19,1.75,16.8,151,1.85,1.28,0.14,2.5,2.85,1.28,3.07,718,1 +12.29,1.61,2.21,20.4,103,1.1,1.02,0.37,1.46,3.05,0.906,1.82,870,1 +13.86,1.51,2.67,25,86,2.95,2.86,0.21,1.87,3.38,1.36,3.16,410,1 +13.49,1.66,2.24,24,87,1.88,1.84,0.27,1.03,3.74,0.98,2.78,472,1 +12.99,1.67,2.6,30,139,3.3,2.89,0.21,1.96,3.35,1.31,3.5,985,1 +11.96,1.09,2.3,21,101,3.38,2.14,0.13,1.65,3.21,0.99,3.13,886,1 +11.66,1.88,1.92,16,97,1.61,1.57,0.34,1.15,3.8,1.23,2.14,428,1 +13.03,0.9,1.71,16,86,1.95,2.03,0.24,1.46,4.6,1.19,2.48,392,1 +11.84,2.89,2.23,18,112,1.72,1.32,0.43,0.95,2.65,0.96,2.52,500,1 +12.33,0.99,1.95,14.8,136,1.9,1.85,0.35,2.76,3.4,1.06,2.31,750,1 +12.7,3.87,2.4,23,101,2.83,2.55,0.43,1.95,2.57,1.19,3.13,463,1 +12,0.92,2,19,86,2.42,2.26,0.3,1.43,2.5,1.38,3.12,278,1 +12.72,1.81,2.2,18.8,86,2.2,2.53,0.26,1.77,3.9,1.16,3.14,714,1 +12.08,1.13,2.51,24,78,2,1.58,0.4,1.4,2.2,1.31,2.72,630,1 +13.05,3.86,2.32,22.5,85,1.65,1.59,0.61,1.62,4.8,0.84,2.01,515,1 +11.84,0.89,2.58,18,94,2.2,2.21,0.22,2.35,3.05,0.79,3.08,520,1 +12.67,0.98,2.24,18,99,2.2,1.94,0.3,1.46,2.62,1.23,3.16,450,1 +12.16,1.61,2.31,22.8,90,1.78,1.69,0.43,1.56,2.45,1.33,2.26,495,1 +11.65,1.67,2.62,26,88,1.92,1.61,0.4,1.34,2.6,1.36,3.21,562,1 +11.64,2.06,2.46,21.6,84,1.95,1.69,0.48,1.35,2.8,1,2.75,680,1 
+12.08,1.33,2.3,23.6,70,2.2,1.59,0.42,1.38,1.74,1.07,3.21,625,1 +12.08,1.83,2.32,18.5,81,1.6,1.5,0.52,1.64,2.4,1.08,2.27,480,1 +12,1.51,2.42,22,86,1.45,1.25,0.5,1.63,3.6,1.05,2.65,450,1 +12.69,1.53,2.26,20.7,80,1.38,1.46,0.58,1.62,3.05,0.96,2.06,495,1 +12.29,2.83,2.22,18,88,2.45,2.25,0.25,1.99,2.15,1.15,3.3,290,1 +11.62,1.99,2.28,18,98,3.02,2.26,0.17,1.35,3.25,1.16,2.96,345,1 +12.47,1.52,2.2,19,162,2.5,2.27,0.32,3.28,2.6,1.16,2.63,937,1 +11.81,2.12,2.74,21.5,134,1.6,0.99,0.14,1.56,2.5,0.95,2.26,625,1 +12.29,1.41,1.98,16,85,2.55,2.5,0.29,1.77,2.9,1.23,2.74,428,1 +12.37,1.07,2.1,18.5,88,3.52,3.75,0.24,1.95,4.5,1.04,2.77,660,1 +12.29,3.17,2.21,18,88,2.85,2.99,0.45,2.81,2.3,1.42,2.83,406,1 +12.08,2.08,1.7,17.5,97,2.23,2.17,0.26,1.4,3.3,1.27,2.96,710,1 +12.6,1.34,1.9,18.5,88,1.45,1.36,0.29,1.35,2.45,1.04,2.77,562,1 +12.34,2.45,2.46,21,98,2.56,2.11,0.34,1.31,2.8,0.8,3.38,438,1 +11.82,1.72,1.88,19.5,86,2.5,1.64,0.37,1.42,2.06,0.94,2.44,415,1 +12.51,1.73,1.98,20.5,85,2.2,1.92,0.32,1.48,2.94,1.04,3.57,672,1 +12.42,2.55,2.27,22,90,1.68,1.84,0.66,1.42,2.7,0.86,3.3,315,1 +12.25,1.73,2.12,19,80,1.65,2.03,0.37,1.63,3.4,1,3.17,510,1 +12.72,1.75,2.28,22.5,84,1.38,1.76,0.48,1.63,3.3,0.88,2.42,488,1 +12.22,1.29,1.94,19,92,2.36,2.04,0.39,2.08,2.7,0.86,3.02,312,1 +11.61,1.35,2.7,20,94,2.74,2.92,0.29,2.49,2.65,0.96,3.26,680,1 +11.46,3.74,1.82,19.5,107,3.18,2.58,0.24,3.58,2.9,0.75,2.81,562,1 +12.52,2.43,2.17,21,88,2.55,2.27,0.26,1.22,2,0.9,2.78,325,1 +11.76,2.68,2.92,20,103,1.75,2.03,0.6,1.05,3.8,1.23,2.5,607,1 +11.41,0.74,2.5,21,88,2.48,2.01,0.42,1.44,3.08,1.1,2.31,434,1 +12.08,1.39,2.5,22.5,84,2.56,2.29,0.43,1.04,2.9,0.93,3.19,385,1 +11.03,1.51,2.2,21.5,85,2.46,2.17,0.52,2.01,1.9,1.71,2.87,407,1 +11.82,1.47,1.99,20.8,86,1.98,1.6,0.3,1.53,1.95,0.95,3.33,495,1 +12.42,1.61,2.19,22.5,108,2,2.09,0.34,1.61,2.06,1.06,2.96,345,1 +12.77,3.43,1.98,16,80,1.63,1.25,0.43,0.83,3.4,0.7,2.12,372,1 +12,3.43,2,19,87,2,1.64,0.37,1.87,1.28,0.93,3.05,564,1 +11.45,2.4,2.42,20,96,2.9,2.79,0.32,1.83,3.25,0.8,3.39,625,1 +11.56,2.05,3.23,28.5,119,3.18,5.08,0.47,1.87,6,0.93,3.69,465,1 +12.42,4.43,2.73,26.5,102,2.2,2.13,0.43,1.71,2.08,0.92,3.12,365,1 +13.05,5.8,2.13,21.5,86,2.62,2.65,0.3,2.01,2.6,0.73,3.1,380,1 +11.87,4.31,2.39,21,82,2.86,3.03,0.21,2.91,2.8,0.75,3.64,380,1 +12.07,2.16,2.17,21,85,2.6,2.65,0.37,1.35,2.76,0.86,3.28,378,1 +12.43,1.53,2.29,21.5,86,2.74,3.15,0.39,1.77,3.94,0.69,2.84,352,1 +11.79,2.13,2.78,28.5,92,2.13,2.24,0.58,1.76,3,0.97,2.44,466,1 +12.37,1.63,2.3,24.5,88,2.22,2.45,0.4,1.9,2.12,0.89,2.78,342,1 +12.04,4.3,2.38,22,80,2.1,1.75,0.42,1.35,2.6,0.79,2.57,580,1 +12.86,1.35,2.32,18,122,1.51,1.25,0.21,0.94,4.1,0.76,1.29,630,2 +12.88,2.99,2.4,20,104,1.3,1.22,0.24,0.83,5.4,0.74,1.42,530,2 +12.81,2.31,2.4,24,98,1.15,1.09,0.27,0.83,5.7,0.66,1.36,560,2 +12.7,3.55,2.36,21.5,106,1.7,1.2,0.17,0.84,5,0.78,1.29,600,2 +12.51,1.24,2.25,17.5,85,2,0.58,0.6,1.25,5.45,0.75,1.51,650,2 +12.6,2.46,2.2,18.5,94,1.62,0.66,0.63,0.94,7.1,0.73,1.58,695,2 +12.25,4.72,2.54,21,89,1.38,0.47,0.53,0.8,3.85,0.75,1.27,720,2 +12.53,5.51,2.64,25,96,1.79,0.6,0.63,1.1,5,0.82,1.69,515,2 +13.49,3.59,2.19,19.5,88,1.62,0.48,0.58,0.88,5.7,0.81,1.82,580,2 +12.84,2.96,2.61,24,101,2.32,0.6,0.53,0.81,4.92,0.89,2.15,590,2 +12.93,2.81,2.7,21,96,1.54,0.5,0.53,0.75,4.6,0.77,2.31,600,2 +13.36,2.56,2.35,20,89,1.4,0.5,0.37,0.64,5.6,0.7,2.47,780,2 +13.52,3.17,2.72,23.5,97,1.55,0.52,0.5,0.55,4.35,0.89,2.06,520,2 +13.62,4.95,2.35,20,92,2,0.8,0.47,1.02,4.4,0.91,2.05,550,2 +12.25,3.88,2.2,18.5,112,1.38,0.78,0.29,1.14,8.21,0.65,2,855,2 
+13.16,3.57,2.15,21,102,1.5,0.55,0.43,1.3,4,0.6,1.68,830,2 +13.88,5.04,2.23,20,80,0.98,0.34,0.4,0.68,4.9,0.58,1.33,415,2 +12.87,4.61,2.48,21.5,86,1.7,0.65,0.47,0.86,7.65,0.54,1.86,625,2 +13.32,3.24,2.38,21.5,92,1.93,0.76,0.45,1.25,8.42,0.55,1.62,650,2 +13.08,3.9,2.36,21.5,113,1.41,1.39,0.34,1.14,9.4,0.57,1.33,550,2 +13.5,3.12,2.62,24,123,1.4,1.57,0.22,1.25,8.6,0.59,1.3,500,2 +12.79,2.67,2.48,22,112,1.48,1.36,0.24,1.26,10.8,0.48,1.47,480,2 +13.11,1.9,2.75,25.5,116,2.2,1.28,0.26,1.56,7.1,0.61,1.33,425,2 +13.23,3.3,2.28,18.5,98,1.8,0.83,0.61,1.87,10.52,0.56,1.51,675,2 +12.58,1.29,2.1,20,103,1.48,0.58,0.53,1.4,7.6,0.58,1.55,640,2 +13.17,5.19,2.32,22,93,1.74,0.63,0.61,1.55,7.9,0.6,1.48,725,2 +13.84,4.12,2.38,19.5,89,1.8,0.83,0.48,1.56,9.01,0.57,1.64,480,2 +12.45,3.03,2.64,27,97,1.9,0.58,0.63,1.14,7.5,0.67,1.73,880,2 +14.34,1.68,2.7,25,98,2.8,1.31,0.53,2.7,13,0.57,1.96,660,2 +13.48,1.67,2.64,22.5,89,2.6,1.1,0.52,2.29,11.75,0.57,1.78,620,2 +12.36,3.83,2.38,21,88,2.3,0.92,0.5,1.04,7.65,0.56,1.58,520,2 +13.69,3.26,2.54,20,107,1.83,0.56,0.5,0.8,5.88,0.96,1.82,680,2 +12.85,3.27,2.58,22,106,1.65,0.6,0.6,0.96,5.58,0.87,2.11,570,2 +12.96,3.45,2.35,18.5,106,1.39,0.7,0.4,0.94,5.28,0.68,1.75,675,2 +13.78,2.76,2.3,22,90,1.35,0.68,0.41,1.03,9.58,0.7,1.68,615,2 +13.73,4.36,2.26,22.5,88,1.28,0.47,0.52,1.15,6.62,0.78,1.75,520,2 +13.45,3.7,2.6,23,111,1.7,0.92,0.43,1.46,10.68,0.85,1.56,695,2 +12.82,3.37,2.3,19.5,88,1.48,0.66,0.4,0.97,10.26,0.72,1.75,685,2 +13.58,2.58,2.69,24.5,105,1.55,0.84,0.39,1.54,8.66,0.74,1.8,750,2 +13.4,4.6,2.86,25,112,1.98,0.96,0.27,1.11,8.5,0.67,1.92,630,2 +12.2,3.03,2.32,19,96,1.25,0.49,0.4,0.73,5.5,0.66,1.83,510,2 +12.77,2.39,2.28,19.5,86,1.39,0.51,0.48,0.64,9.899999,0.57,1.63,470,2 +14.16,2.51,2.48,20,91,1.68,0.7,0.44,1.24,9.7,0.62,1.71,660,2 +13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740,2 +13.4,3.91,2.48,23,102,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750,2 +13.27,4.28,2.26,20,120,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835,2 +13.17,2.59,2.37,20,120,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840,2 +14.13,4.1,2.74,24.5,96,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560,2 diff --git a/sklearn/datasets/descr/wine_data.rst b/sklearn/datasets/descr/wine_data.rst new file mode 100644 index 0000000000000000000000000000000000000000..3d3341874a584d57cdf200bd062df20a7656a81b --- /dev/null +++ b/sklearn/datasets/descr/wine_data.rst @@ -0,0 +1,95 @@ +Wine Data Database +==================== + +Notes +----- +Data Set Characteristics: + :Number of Instances: 178 (50 in each of three classes) + :Number of Attributes: 13 numeric, predictive attributes and the class + :Attribute Information: + - 1) Alcohol + - 2) Malic acid + - 3) Ash + - 4) Alcalinity of ash + - 5) Magnesium + - 6) Total phenols + - 7) Flavanoids + - 8) Nonflavanoid phenols + - 9) Proanthocyanins + - 10)Color intensity + - 11)Hue + - 12)OD280/OD315 of diluted wines + - 13)Proline + - class: + - class_0 + - class_1 + - class_2 + + :Summary Statistics: + + ============================= ==== ===== ======= ===== + Min Max Mean SD + ============================= ==== ===== ======= ===== + Alcohol: 11.0 14.8 13.0 0.8 + Malic Acid: 0.74 5.80 2.34 1.12 + Ash: 1.36 3.23 2.36 0.27 + Alcalinity of Ash: 10.6 30.0 19.5 3.3 + Magnesium: 70.0 162.0 99.7 14.3 + Total Phenols: 0.98 3.88 2.29 0.63 + Flavanoids: 0.34 5.08 2.03 1.00 + Nonflavanoid Phenols: 0.13 0.66 0.36 0.12 + Proanthocyanins: 0.41 3.58 1.59 0.57 + Colour Intensity: 1.3 13.0 5.1 2.3 + Hue: 0.48 1.71 0.96 0.23 + OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71 + Proline: 278 1680 746 315 + 
+   ============================= ==== ===== ======= =====
+
+    :Missing Attribute Values: None
+    :Class Distribution: class_0 (59), class_1 (71), class_2 (48)
+    :Creator: Forina, M. et al.
+    :Donor: Stefan Aeberhard
+    :Date: July, 1991
+
+This is a copy of the UCI ML Wine recognition dataset.
+https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
+
+The data are the results of a chemical analysis of wines grown in the same
+region in Italy by three different cultivators. There are thirteen different
+measurements taken for different constituents found in the three types of
+wine.
+
+Original Owners:
+
+Forina, M. et al, PARVUS -
+An Extendible Package for Data Exploration, Classification and Correlation.
+Institute of Pharmaceutical and Food Analysis and Technologies,
+Via Brigata Salerno, 16147 Genoa, Italy.
+
+Citation:
+
+Lichman, M. (2013). UCI Machine Learning Repository
+[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
+School of Information and Computer Science.
+
+References
+----------
+(1) S. Aeberhard, D. Coomans and O. de Vel,
+Comparison of Classifiers in High Dimensional Settings,
+Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of
+Mathematics and Statistics, James Cook University of North Queensland.
+(Also submitted to Technometrics.)
+
+The data were used with many others for comparing various
+classifiers. The classes are separable, though only RDA
+has achieved 100% correct classification.
+(RDA: 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data))
+(All results using the leave-one-out technique.)
+
+(2) S. Aeberhard, D. Coomans and O. de Vel,
+"THE CLASSIFICATION PERFORMANCE OF RDA",
+Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of
+Mathematics and Statistics, James Cook University of North Queensland.
+(Also submitted to Journal of Chemometrics.)
diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
index 92fe96fa106568f0c4124d301e004b08bee4d969..c0dd5101904d911f55bb5df2efe704ef48fe8f32 100644
--- a/sklearn/datasets/tests/test_base.py
+++ b/sklearn/datasets/tests/test_base.py
@@ -17,6 +17,7 @@ from sklearn.datasets import load_linnerud
 from sklearn.datasets import load_iris
 from sklearn.datasets import load_breast_cancer
 from sklearn.datasets import load_boston
+from sklearn.datasets import load_wine
 from sklearn.datasets.base import Bunch
 
 from sklearn.externals.six import b, u
@@ -195,6 +196,7 @@ def test_load_linnerud():
     assert_array_equal(X_y_tuple[0], bunch.data)
     assert_array_equal(X_y_tuple[1], bunch.target)
 
+
 def test_load_iris():
     res = load_iris()
     assert_equal(res.data.shape, (150, 4))
@@ -210,6 +212,21 @@ def test_load_iris():
     assert_array_equal(X_y_tuple[1], bunch.target)
 
 
+def test_load_wine():
+    res = load_wine()
+    assert_equal(res.data.shape, (178, 13))
+    assert_equal(res.target.size, 178)
+    assert_equal(res.target_names.size, 3)
+    assert_true(res.DESCR)
+
+    # test return_X_y option
+    X_y_tuple = load_wine(return_X_y=True)
+    bunch = load_wine()
+    assert_true(isinstance(X_y_tuple, tuple))
+    assert_array_equal(X_y_tuple[0], bunch.data)
+    assert_array_equal(X_y_tuple[1], bunch.target)
+
+
 def test_load_breast_cancer():
     res = load_breast_cancer()
     assert_equal(res.data.shape, (569, 30))
@@ -239,6 +256,7 @@ def test_load_boston():
     assert_array_equal(X_y_tuple[0], bunch.data)
     assert_array_equal(X_y_tuple[1], bunch.target)
 
+
 def test_loads_dumps_bunch():
     bunch = Bunch(x="x")
     bunch_from_pkl = loads(dumps(bunch))
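As a quick end-to-end check of the loader introduced by this patch (a minimal
sketch, not part of the diff itself; it assumes a scikit-learn build containing
this change is importable), the two calling conventions of load_wine can be
exercised directly, mirroring the assertions in test_load_wine above:

    from sklearn.datasets import load_wine

    # Bunch interface: data plus metadata
    bunch = load_wine()
    assert bunch.data.shape == (178, 13)
    assert list(bunch.target_names) == ['class_0', 'class_1', 'class_2']

    # (data, target) tuple interface enabled by return_X_y
    X, y = load_wine(return_X_y=True)
    assert X.shape == (178, 13)
    assert y.shape == (178,)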