diff --git a/doc/developers/utilities.rst b/doc/developers/utilities.rst
index 509807c31d496496d55f3889b4f487facda0d3a3..7313ca976eec162c64ad6fab10db3c1276fa47a8 100644
--- a/doc/developers/utilities.rst
+++ b/doc/developers/utilities.rst
@@ -144,7 +144,7 @@ efficiently process ``scipy.sparse`` data.
 - :func:`sparsefuncs.inplace_csr_column_scale`: can be used to multiply the
   columns of a CSR matrix by a constant scale (one scale per column).
   Used for scaling features to unit standard deviation in
-  :class:`sklearn.preprocessing.Scaler`.
+  :class:`sklearn.preprocessing.StandardScaler`.


 Graph Routines
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 2f63701d15310365f79e20576d24107ddf4a27f9..cc48154098b4c5173519c264fc24c7404e07de63 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -836,12 +836,13 @@ Pairwise metrics
    :toctree: generated/
    :template: class.rst

-   preprocessing.Scaler
-   preprocessing.Normalizer
    preprocessing.Binarizer
+   preprocessing.KernelCenterer
    preprocessing.LabelBinarizer
    preprocessing.LabelEncoder
-   preprocessing.KernelCenterer
+   preprocessing.MinMaxScaler
+   preprocessing.Normalizer
+   preprocessing.StandardScaler

 .. autosummary::
    :toctree: generated/
diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst
index 75b2e871383bf27b9c44d3d0f72f36c53362f6eb..e68439bc97a1e4fee61b34d108d54b8d3b8231e8 100644
--- a/doc/modules/manifold.rst
+++ b/doc/modules/manifold.rst
@@ -427,7 +427,7 @@ Tips on practical use

 * Make sure the same scale is used over all features. Because manifold
   learning methods are based on a nearest-neighbor search, the algorithm
-  may perform poorly otherwise. See :ref:`Scaler <preprocessing_scaler>`
+  may perform poorly otherwise. See :ref:`StandardScaler <preprocessing_scaler>`
   for convenient ways of scaling heterogeneous data.

 * The reconstruction error computed by each routine can be used to choose
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 6853d32632f13034ea65c9d4a851177c72d2db2c..c1dba225fac50f7a5c26a47a62c80eaeb85f2344 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -64,15 +64,15 @@ Scaled data has zero mean and unit variance::
 .. >>> print_options = np.set_printoptions(print_options)

 The ``preprocessing`` module further provides a utility class
-:class:`Scaler` that implements the ``Transformer`` API to compute
+:class:`StandardScaler` that implements the ``Transformer`` API to compute
 the mean and standard deviation on a training set so as to be
 able to later reapply the same transformation on the testing set.
 This class is hence suitable for use in the early steps of a
 :class:`sklearn.pipeline.Pipeline`::

-  >>> scaler = preprocessing.Scaler().fit(X)
+  >>> scaler = preprocessing.StandardScaler().fit(X)
   >>> scaler
-  Scaler(copy=True, with_mean=True, with_std=True)
+  StandardScaler(copy=True, with_mean=True, with_std=True)

   >>> scaler.mean_  # doctest: +ELLIPSIS
   array([ 1. ..., 0. ..., 0.33...])
@@ -94,7 +94,7 @@ same way it did on the training set::

 It is possible to disable either centering or scaling by either
 passing ``with_mean=False`` or ``with_std=False`` to the constructor
-of :class:`Scaler`.
+of :class:`StandardScaler`.

 .. topic:: References:

@@ -115,7 +115,7 @@ of :class:`Scaler`.

 .. topic:: Sparse input

-  :func:`scale` and :class:`Scaler` accept ``scipy.sparse`` matrices
+  :func:`scale` and :class:`StandardScaler` accept ``scipy.sparse`` matrices
   as input **only when with_mean=False is explicitly passed to the constructor**.
   Otherwise a ``ValueError`` will be raised as silently centering
   would break the sparsity and would often crash the
@@ -132,7 +132,7 @@ of :class:`Scaler`.

 .. topic:: Scaling target variables in regression

-  :func:`scale` and :class:`Scaler` work out-of-the-box with 1d arrays.
+  :func:`scale` and :class:`StandardScaler` work out-of-the-box with 1d arrays.
   This is very useful for scaling the target / response variables used
   for regression.

@@ -243,7 +243,7 @@ It is possible to adjust the threshold of the binarizer::
          [ 1., 0., 0.],
          [ 0., 0., 0.]])

-As for the :class:`Scaler` and :class:`Normalizer` classes, the
+As for the :class:`StandardScaler` and :class:`Normalizer` classes, the
 preprocessing module provides a companion function :func:`binarize`
 to be used when the transformer API is not necessary.

diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst
index 92b2ea3b471f5f572209f06acde26ba1b53e798a..c236a7520ff7830851fd7e254611a6c1b4c30751 100644
--- a/doc/modules/sgd.rst
+++ b/doc/modules/sgd.rst
@@ -229,10 +229,10 @@ Tips on Practical Use
     attribute on the input vector X to [0,1] or [-1,+1], or standardize
     it to have mean 0 and variance 1. Note that the *same* scaling
     must be applied to the test vector to obtain meaningful
-    results. This can be easily done using :class:`Scaler`::
+    results. This can be easily done using :class:`StandardScaler`::

-      from sklearn.preprocessing import Scaler
-      scaler = Scaler()
+      from sklearn.preprocessing import StandardScaler
+      scaler = StandardScaler()
       scaler.fit(X_train)  # Don't cheat - fit only on training data
       X_train = scaler.transform(X_train)
       X_test = scaler.transform(X_test)  # apply same transformation to test data
diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py
index 0e114cb1b9833d0aa0601c0fdf68f07f04b49edd..9e10e51935a73b101d6b2bea6e27129be14ccb5f 100644
--- a/examples/cluster/plot_cluster_comparison.py
+++ b/examples/cluster/plot_cluster_comparison.py
@@ -30,7 +30,7 @@ import pylab as pl
 from sklearn import cluster, datasets
 from sklearn.metrics import euclidean_distances
 from sklearn.neighbors import kneighbors_graph
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler

 np.random.seed(0)

@@ -55,7 +55,7 @@ for i_dataset, dataset in enumerate([noisy_circles, noisy_moons, blobs,
                                      no_structure]):
     X, y = dataset
     # normalize dataset for easier parameter selection
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)

     # estimate bandwidth for mean shift
     bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py
index 7393f967ac2545c5a14429bc509962c19603c535..8a89204dddad636c4613d6589426f4bfb37b053d 100644
--- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py
+++ b/examples/linear_model/plot_logistic_l1_l2_sparsity.py
@@ -25,12 +25,12 @@ import pylab as pl

 from sklearn.linear_model import LogisticRegression
 from sklearn import datasets
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler

 digits = datasets.load_digits()

 X, y = digits.data, digits.target
-X = Scaler().fit_transform(X)
+X = StandardScaler().fit_transform(X)

 # classify small against large digits
 y = (y > 4).astype(np.int)
diff --git a/examples/linear_model/plot_sparse_recovery.py b/examples/linear_model/plot_sparse_recovery.py
index 5c080d96489e7bb92394796ce81a8a9abee2094b..05f62341b2d3eea10a8f204dacc7dda77cbb8fcc 100644
--- a/examples/linear_model/plot_sparse_recovery.py
+++ b/examples/linear_model/plot_sparse_recovery.py
@@ -50,7 +50,7 @@ from scipy import linalg
 from sklearn.linear_model import RandomizedLasso, lasso_stability_path, \
     LassoLarsCV
 from sklearn.feature_selection import f_regression
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import auc, precision_recall_curve
 from sklearn.ensemble import ExtraTreesRegressor
 from sklearn.utils.extmath import pinvh
@@ -97,7 +97,7 @@ for conditionning in (1, 1e-4):
     # Keep [Wainwright2006] (26c) constant
     X[:n_relevant_features] /= np.abs(
         linalg.svdvals(X[:n_relevant_features])).max()
-    X = Scaler().fit_transform(X.copy())
+    X = StandardScaler().fit_transform(X.copy())

     # The output variable
     y = np.dot(X, coef)
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index 465098199be2cdab0fee19e850b7e7a20abc775e..96865b65b36336a7799142f7846e5df257107375 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -25,7 +25,7 @@ import numpy as np
 import pylab as pl

 from sklearn.svm import SVC
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import load_iris
 from sklearn.cross_validation import StratifiedKFold
 from sklearn.grid_search import GridSearchCV
@@ -49,7 +49,7 @@ Y_2d -= 1

 # instead of fitting the transformation on the training set and
 # just applying it on the test set.
-scaler = Scaler()
+scaler = StandardScaler()
 X = scaler.fit_transform(X)
 X_2d = scaler.fit_transform(X_2d)

diff --git a/sklearn/cluster/_k_means.pyx b/sklearn/cluster/_k_means.pyx
index ee9714deb218937a41923d9b6d8b93b2dc8249ec..db9427342785959f8c840aacb2b6b3c4ae56a8fc 100644
--- a/sklearn/cluster/_k_means.pyx
+++ b/sklearn/cluster/_k_means.pyx
@@ -243,7 +243,7 @@ def csr_row_norm_l2(X, squared=True):
     """Get L2 norm of each row in CSR matrix X.

     TODO: refactor me in the sklearn.utils.sparsefuncs module once the CSR
-    sklearn.preprocessing.Scaler has been refactored as well.
+    sklearn.preprocessing.StandardScaler has been refactored as well.
     """
     cdef:
         unsigned int n_samples = X.shape[0]
diff --git a/sklearn/linear_model/tests/test_randomized_l1.py b/sklearn/linear_model/tests/test_randomized_l1.py
index 128d6a0917e085ab15d92645d3f4bf3fcf2d3096..df6680a432ebeb7a5d2abb8f1bd67a4aed17bf8a 100644
--- a/sklearn/linear_model/tests/test_randomized_l1.py
+++ b/sklearn/linear_model/tests/test_randomized_l1.py
@@ -8,12 +8,12 @@ from sklearn.linear_model.randomized_l1 import lasso_stability_path, \
     RandomizedLasso, RandomizedLogisticRegression
 from sklearn.datasets import load_diabetes, load_iris
 from sklearn.feature_selection import f_regression, f_classif
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler

 diabetes = load_diabetes()
 X = diabetes.data
 y = diabetes.target
-X = Scaler().fit_transform(X)
+X = StandardScaler().fit_transform(X)
 X = X[:, [2, 3, 6, 7, 8]]

 # test that the feature score of the best features
diff --git a/sklearn/preprocessing.py b/sklearn/preprocessing.py
index 574958bcfc5ee9e686404894031c1c0220edf3c3..087c25f75b2649cf79b956162843f2b289dc53b2 100644
--- a/sklearn/preprocessing.py
+++ b/sklearn/preprocessing.py
@@ -23,7 +23,7 @@ __all__ = ['Binarizer',
            'LabelBinarizer',
            'LabelEncoder',
            'Normalizer',
-           'Scaler',
+           'StandardScaler',
            'binarize',
            'normalize',
            'scale']
@@ -96,7 +96,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):

     See also
     --------
-    :class:`sklearn.preprocessing.Scaler` to perform centering and
+    :class:`sklearn.preprocessing.StandardScaler` to perform centering and
     scaling using the ``Transformer`` API (e.g. as part of a preprocessing
     :class:`sklearn.pipeline.Pipeline`)
     """
@@ -148,8 +148,6 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):

     This standardization is often used as an alternative to zero mean,
     unit variance scaling.
-    It is in particular useful for sparse positive data, as it retains the
-    sparsity structure of the data (if scaled between zero and some number).

     Parameters
     ----------
@@ -158,8 +156,7 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
     copy : boolean, optional, default is True
         Set to False to perform inplace row normalization and avoid a
-        copy (if the input is already a numpy array or a scipy.sparse
-        CSR matrix and if axis is 1).
+        copy (if the input is already a numpy array).

     Attributes
     ----------

@@ -930,7 +927,7 @@ class KernelCenterer(BaseEstimator, TransformerMixin):
     """Center a kernel matrix

     This is equivalent to centering phi(X) with
-    sklearn.preprocessing.Scaler(with_std=False).
+    sklearn.preprocessing.StandardScaler(with_std=False).
     """

     def fit(self, K, y=None):
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 4eafd977ba72fb4bba72f5dea1770ae7b980f2bc..80a87b8e46a559730f66cd2263afe82fa5d0761e 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -18,7 +18,7 @@ from sklearn.utils.testing import assert_greater
 from sklearn.base import clone, ClassifierMixin, RegressorMixin, \
     TransformerMixin, ClusterMixin
 from sklearn.utils import shuffle
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler, Scaler
 #from sklearn.cross_validation import train_test_split
 from sklearn.datasets import load_iris, load_boston, make_blobs
 from sklearn.metrics import zero_one_score, adjusted_rand_score
@@ -117,7 +117,7 @@ def test_transformers():
     X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                       random_state=0, n_features=2, cluster_std=0.1)
     n_samples, n_features = X.shape
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)
     X -= X.min()
     succeeded = True

@@ -196,7 +196,7 @@ def test_transformers_sparse_data():
             continue
         # catch deprecation warnings
         with warnings.catch_warnings(record=True):
-            if Trans is Scaler:
+            if Trans in [Scaler, StandardScaler]:
                 trans = Trans(with_mean=False)
             else:
                 trans = Trans()
@@ -267,7 +267,7 @@ def test_clustering():
     X, y = iris.data, iris.target
     X, y = shuffle(X, y, random_state=7)
     n_samples, n_features = X.shape
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)
     for name, Alg in clustering:
         if Alg is WardAgglomeration:
             # this is clustering on the features
@@ -308,7 +308,7 @@ def test_classifiers_train():
     iris = load_iris()
     X_m, y_m = iris.data, iris.target
     X_m, y_m = shuffle(X_m, y_m, random_state=7)
-    X_m = Scaler().fit_transform(X_m)
+    X_m = StandardScaler().fit_transform(X_m)
     # generate binary problem from multi-class one
     y_b = y_m[y_m != 2]
     X_b = X_m[y_m != 2]
@@ -378,7 +378,7 @@ def test_classifiers_classes():
     iris = load_iris()
     X, y = iris.data, iris.target
     X, y = shuffle(X, y, random_state=7)
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)
     y = 2 * y + 1
     # TODO: make work with next line :)
     #y = y.astype(np.str)
@@ -409,7 +409,7 @@ def test_regressors_int():
     boston = load_boston()
     X, y = boston.data, boston.target
     X, y = shuffle(X, y, random_state=0)
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)
     y = np.random.randint(2, size=X.shape[0])
     for name, Reg in regressors:
         if Reg in dont_test or Reg in meta_estimators or Reg in (CCA,):
@@ -449,8 +449,8 @@
     X, y = shuffle(X, y, random_state=0)
     # TODO: test with intercept
     # TODO: test with multiple responses
-    X = Scaler().fit_transform(X)
-    y = Scaler().fit_transform(y)
+    X = StandardScaler().fit_transform(X)
+    y = StandardScaler().fit_transform(y)
     succeeded = True
     for name, Reg in regressors:
         if Reg in dont_test or Reg in meta_estimators:
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 19c65bb855d12549ac3e8626528d6006e3abb3b7..258c4b0598ee68794ebd7c628c653ffed35ef3ce 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -12,7 +12,7 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.feature_selection import SelectKBest, f_classif
 from sklearn.decomposition.pca import PCA, RandomizedPCA
 from sklearn.datasets import load_iris
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler


 class IncorrectT(BaseEstimator):
@@ -152,7 +152,7 @@
 def test_pipeline_methods_preprocessing_svm():
     y = iris.target
     n_samples = X.shape[0]
     n_classes = len(np.unique(y))
-    scaler = Scaler()
+    scaler = StandardScaler()
     pca = RandomizedPCA(n_components=2, whiten=True)
     clf = SVC(probability=True)
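
For reference, a minimal usage sketch of the renamed class, following the pattern the updated sgd.rst tip describes (fit on the training set only, then reuse the learned statistics on the test set). The data and variable names below are illustrative and not part of the patch:

import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = np.array([[1., -1., 2.],
                    [2., 0., 0.],
                    [0., 1., -1.]])
X_test = np.array([[-1., 1., 0.]])

scaler = StandardScaler().fit(X_train)    # learn per-feature mean and std from the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)  # apply the same transformation to the test set

print(X_train_scaled.mean(axis=0))  # approximately 0 for each feature
print(X_train_scaled.std(axis=0))   # approximately 1 for each feature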