diff --git a/doc/developers/utilities.rst b/doc/developers/utilities.rst
index 509807c31d496496d55f3889b4f487facda0d3a3..7313ca976eec162c64ad6fab10db3c1276fa47a8 100644
--- a/doc/developers/utilities.rst
+++ b/doc/developers/utilities.rst
@@ -144,7 +144,7 @@ efficiently process ``scipy.sparse`` data.
 - :func:`sparsefuncs.inplace_csr_column_scale`: can be used to multiply the
   columns of a CSR matrix by a constant scale (one scale per column).
   Used for scaling features to unit standard deviation in
-  :class:`sklearn.preprocessing.Scaler`.
+  :class:`sklearn.preprocessing.StandardScaler`.
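+
+  A minimal sketch of the call, assuming ``X`` is a ``scipy.sparse`` CSR
+  matrix and ``scale`` is an array with one entry per column::
+
+    from sklearn.utils import sparsefuncs
+    sparsefuncs.inplace_csr_column_scale(X, scale)  # modifies X in place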
 
 
 Graph Routines
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 2f63701d15310365f79e20576d24107ddf4a27f9..cc48154098b4c5173519c264fc24c7404e07de63 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -836,12 +836,13 @@ Pairwise metrics
    :toctree: generated/
    :template: class.rst
 
-   preprocessing.Scaler
-   preprocessing.Normalizer
    preprocessing.Binarizer
+   preprocessing.KernelCenterer
    preprocessing.LabelBinarizer
    preprocessing.LabelEncoder
-   preprocessing.KernelCenterer
+   preprocessing.MinMaxScaler
+   preprocessing.Normalizer
+   preprocessing.StandardScaler
 
 .. autosummary::
    :toctree: generated/
diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst
index 75b2e871383bf27b9c44d3d0f72f36c53362f6eb..e68439bc97a1e4fee61b34d108d54b8d3b8231e8 100644
--- a/doc/modules/manifold.rst
+++ b/doc/modules/manifold.rst
@@ -427,7 +427,7 @@ Tips on practical use
 
 * Make sure the same scale is used over all features. Because manifold
   learning methods are based on a nearest-neighbor search, the algorithm
-  may perform poorly otherwise.  See :ref:`Scaler <preprocessing_scaler>`
+  may perform poorly otherwise.  See :ref:`StandardScaler <preprocessing_scaler>`
   for convenient ways of scaling heterogeneous data.
 
 * The reconstruction error computed by each routine can be used to choose
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 6853d32632f13034ea65c9d4a851177c72d2db2c..c1dba225fac50f7a5c26a47a62c80eaeb85f2344 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -64,15 +64,15 @@ Scaled data has zero mean and unit variance::
 ..    >>> print_options = np.set_printoptions(print_options)
 
 The ``preprocessing`` module further provides a utility class
-:class:`Scaler` that implements the ``Transformer`` API to compute
+:class:`StandardScaler` that implements the ``Transformer`` API to compute
 the mean and standard deviation on a training set so as to be
 able to later reapply the same transformation on the testing set.
 This class is hence suitable for use in the early steps of a
 :class:`sklearn.pipeline.Pipeline`::
 
-  >>> scaler = preprocessing.Scaler().fit(X)
+  >>> scaler = preprocessing.StandardScaler().fit(X)
   >>> scaler
-  Scaler(copy=True, with_mean=True, with_std=True)
+  StandardScaler(copy=True, with_mean=True, with_std=True)
 
   >>> scaler.mean_                                      # doctest: +ELLIPSIS
   array([ 1. ...,  0. ...,  0.33...])
@@ -94,7 +94,7 @@ same way it did on the training set::
 
 It is possible to disable either centering or scaling by either
 passing ``with_mean=False`` or ``with_std=False`` to the constructor
-of :class:`Scaler`.
+of :class:`StandardScaler`.
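+
+For instance, passing ``with_std=False`` yields a transformer that only
+centers the data (a minimal sketch, reusing the ``X`` defined above; the
+output shown is indicative)::
+
+  >>> centerer = preprocessing.StandardScaler(with_std=False).fit(X)
+  >>> centerer.transform(X).mean(axis=0)  # doctest: +SKIP
+  array([ 0.,  0.,  0.])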
 
 
 .. topic:: References:
@@ -115,7 +115,7 @@ of :class:`Scaler`.
 
 .. topic:: Sparse input
 
-  :func:`scale` and :class:`Scaler` accept ``scipy.sparse`` matrices
+  :func:`scale` and :class:`StandardScaler` accept ``scipy.sparse`` matrices
   as input **only when with_mean=False is explicitly passed to the
   constructor**. Otherwise a ``ValueError`` will be raised as
   silently centering would break the sparsity and would often crash the
@@ -132,7 +132,7 @@ of :class:`Scaler`.
 
 .. topic:: Scaling target variables in regression
 
-    :func:`scale` and :class:`Scaler` work out-of-the-box with 1d arrays.
+    :func:`scale` and :class:`StandardScaler` work out-of-the-box with 1d arrays.
     This is very useful for scaling the target / response variables used
     for regression.
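+
+    A minimal sketch, assuming ``y`` holds the 1d array of targets::
+
+      >>> y_scaled = preprocessing.scale(y)  # doctest: +SKIP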
 
@@ -243,7 +243,7 @@ It is possible to adjust the threshold of the binarizer::
          [ 1.,  0.,  0.],
          [ 0.,  0.,  0.]])
 
-As for the :class:`Scaler` and :class:`Normalizer` classes, the
+As for the :class:`StandardScaler` and :class:`Normalizer` classes, the
 preprocessing module provides a companion function :func:`binarize`
 to be used when the transformer API is not necessary.
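+
+A minimal sketch of the function form (the ``threshold`` value is
+illustrative)::
+
+  >>> preprocessing.binarize(X, threshold=1.1)  # doctest: +SKIP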
 
diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst
index 92b2ea3b471f5f572209f06acde26ba1b53e798a..c236a7520ff7830851fd7e254611a6c1b4c30751 100644
--- a/doc/modules/sgd.rst
+++ b/doc/modules/sgd.rst
@@ -229,10 +229,10 @@ Tips on Practical Use
     attribute on the input vector X to [0,1] or [-1,+1], or standardize
     it to have mean 0 and variance 1. Note that the *same* scaling
     must be applied to the test vector to obtain meaningful
-    results. This can be easily done using :class:`Scaler`::
+    results. This can be easily done using :class:`StandardScaler`::
 
-      from sklearn.preprocessing import Scaler
-      scaler = Scaler()
+      from sklearn.preprocessing import StandardScaler
+      scaler = StandardScaler()
       scaler.fit(X_train)  # Don't cheat - fit only on training data
       X_train = scaler.transform(X_train)
       X_test = scaler.transform(X_test)  # apply same transformation to test data
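+
+    To scale to a fixed range such as [0,1] instead, :class:`MinMaxScaler`
+    can be applied with the same fit/transform pattern (a minimal sketch)::
+
+      from sklearn.preprocessing import MinMaxScaler
+      scaler = MinMaxScaler().fit(X_train)  # again, fit on training data only
+      X_train = scaler.transform(X_train)
+      X_test = scaler.transform(X_test)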
diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py
index 0e114cb1b9833d0aa0601c0fdf68f07f04b49edd..9e10e51935a73b101d6b2bea6e27129be14ccb5f 100644
--- a/examples/cluster/plot_cluster_comparison.py
+++ b/examples/cluster/plot_cluster_comparison.py
@@ -30,7 +30,7 @@ import pylab as pl
 from sklearn import cluster, datasets
 from sklearn.metrics import euclidean_distances
 from sklearn.neighbors import kneighbors_graph
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler
 
 np.random.seed(0)
 
@@ -55,7 +55,7 @@ for i_dataset, dataset in enumerate([noisy_circles, noisy_moons, blobs,
                 no_structure]):
     X, y = dataset
     # normalize dataset for easier parameter selection
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)
 
     # estimate bandwidth for mean shift
     bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py
index 7393f967ac2545c5a14429bc509962c19603c535..8a89204dddad636c4613d6589426f4bfb37b053d 100644
--- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py
+++ b/examples/linear_model/plot_logistic_l1_l2_sparsity.py
@@ -25,12 +25,12 @@ import pylab as pl
 
 from sklearn.linear_model import LogisticRegression
 from sklearn import datasets
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler
 
 digits = datasets.load_digits()
 
 X, y = digits.data, digits.target
-X = Scaler().fit_transform(X)
+X = StandardScaler().fit_transform(X)
 
 # classify small against large digits
 y = (y > 4).astype(np.int)
diff --git a/examples/linear_model/plot_sparse_recovery.py b/examples/linear_model/plot_sparse_recovery.py
index 5c080d96489e7bb92394796ce81a8a9abee2094b..05f62341b2d3eea10a8f204dacc7dda77cbb8fcc 100644
--- a/examples/linear_model/plot_sparse_recovery.py
+++ b/examples/linear_model/plot_sparse_recovery.py
@@ -50,7 +50,7 @@ from scipy import linalg
 from sklearn.linear_model import RandomizedLasso, lasso_stability_path, \
                                  LassoLarsCV
 from sklearn.feature_selection import f_regression
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import auc, precision_recall_curve
 from sklearn.ensemble import ExtraTreesRegressor
 from sklearn.utils.extmath import pinvh
@@ -97,7 +97,7 @@ for conditionning in (1, 1e-4):
     # Keep [Wainwright2006] (26c) constant
     X[:n_relevant_features] /= np.abs(
             linalg.svdvals(X[:n_relevant_features])).max()
-    X = Scaler().fit_transform(X.copy())
+    X = StandardScaler().fit_transform(X.copy())
 
     # The output variable
     y = np.dot(X, coef)
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index 465098199be2cdab0fee19e850b7e7a20abc775e..96865b65b36336a7799142f7846e5df257107375 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -25,7 +25,7 @@ import numpy as np
 import pylab as pl
 
 from sklearn.svm import SVC
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import load_iris
 from sklearn.cross_validation import StratifiedKFold
 from sklearn.grid_search import GridSearchCV
@@ -49,7 +49,7 @@ Y_2d -= 1
 # instead of fitting the transformation on the training set and
 # just applying it on the test set.
 
-scaler = Scaler()
+scaler = StandardScaler()
 
 X = scaler.fit_transform(X)
 X_2d = scaler.fit_transform(X_2d)
diff --git a/sklearn/cluster/_k_means.pyx b/sklearn/cluster/_k_means.pyx
index ee9714deb218937a41923d9b6d8b93b2dc8249ec..db9427342785959f8c840aacb2b6b3c4ae56a8fc 100644
--- a/sklearn/cluster/_k_means.pyx
+++ b/sklearn/cluster/_k_means.pyx
@@ -243,7 +243,7 @@ def csr_row_norm_l2(X, squared=True):
     """Get L2 norm of each row in CSR matrix X.
 
     TODO: refactor me in the sklearn.utils.sparsefuncs module once the CSR
-    sklearn.preprocessing.Scaler has been refactored as well.
+    sklearn.preprocessing.StandardScaler has been refactored as well.
     """
     cdef:
         unsigned int n_samples = X.shape[0]
diff --git a/sklearn/linear_model/tests/test_randomized_l1.py b/sklearn/linear_model/tests/test_randomized_l1.py
index 128d6a0917e085ab15d92645d3f4bf3fcf2d3096..df6680a432ebeb7a5d2abb8f1bd67a4aed17bf8a 100644
--- a/sklearn/linear_model/tests/test_randomized_l1.py
+++ b/sklearn/linear_model/tests/test_randomized_l1.py
@@ -8,12 +8,12 @@ from sklearn.linear_model.randomized_l1 import lasso_stability_path, \
         RandomizedLasso, RandomizedLogisticRegression
 from sklearn.datasets import load_diabetes, load_iris
 from sklearn.feature_selection import f_regression, f_classif
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler
 
 diabetes = load_diabetes()
 X = diabetes.data
 y = diabetes.target
-X = Scaler().fit_transform(X)
+X = StandardScaler().fit_transform(X)
 X = X[:, [2, 3, 6, 7, 8]]
 
 # test that the feature score of the best features
diff --git a/sklearn/preprocessing.py b/sklearn/preprocessing.py
index 574958bcfc5ee9e686404894031c1c0220edf3c3..087c25f75b2649cf79b956162843f2b289dc53b2 100644
--- a/sklearn/preprocessing.py
+++ b/sklearn/preprocessing.py
@@ -23,7 +23,7 @@ __all__ = ['Binarizer',
            'LabelBinarizer',
            'LabelEncoder',
            'Normalizer',
-           'Scaler',
+           'StandardScaler',
            'binarize',
            'normalize',
            'scale']
@@ -96,7 +96,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
 
     See also
     --------
-    :class:`sklearn.preprocessing.Scaler` to perform centering and
+    :class:`sklearn.preprocessing.StandardScaler` to perform centering and
     scaling using the ``Transformer`` API (e.g. as part of a preprocessing
     :class:`sklearn.pipeline.Pipeline`)
     """
@@ -148,8 +148,6 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
 
     This standardization is often used as an alternative to zero mean,
     unit variance scaling.
-    It is in particular useful for sparse positive data, as it retains the
-    sparsity structure of the data (if scaled between zero and some number).
 
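+    A sketch of the underlying per-feature arithmetic, writing
+    ``(min, max)`` for ``feature_range``::
+
+        X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
+        X_scaled = X_std * (max - min) + min
+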
     Parameters
     ----------
@@ -158,8 +156,7 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
 
     copy : boolean, optional, default is True
-        Set to False to perform inplace row normalization and avoid a
-        copy (if the input is already a numpy array or a scipy.sparse
-        CSR matrix and if axis is 1).
+        Set to False to perform inplace scaling and avoid a copy (if the
+        input is already a numpy array).
 
     Attributes
     ----------
@@ -930,7 +927,7 @@ class KernelCenterer(BaseEstimator, TransformerMixin):
     """Center a kernel matrix
 
     This is equivalent to centering phi(X) with
-    sklearn.preprocessing.Scaler(with_std=False).
+    sklearn.preprocessing.StandardScaler(with_std=False).
     """
 
     def fit(self, K, y=None):
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 4eafd977ba72fb4bba72f5dea1770ae7b980f2bc..80a87b8e46a559730f66cd2263afe82fa5d0761e 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -18,7 +18,7 @@ from sklearn.utils.testing import assert_greater
 from sklearn.base import clone, ClassifierMixin, RegressorMixin, \
         TransformerMixin, ClusterMixin
 from sklearn.utils import shuffle
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler, Scaler
 #from sklearn.cross_validation import train_test_split
 from sklearn.datasets import load_iris, load_boston, make_blobs
 from sklearn.metrics import zero_one_score, adjusted_rand_score
@@ -117,7 +117,7 @@ def test_transformers():
     X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
             random_state=0, n_features=2, cluster_std=0.1)
     n_samples, n_features = X.shape
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)
     X -= X.min()
 
     succeeded = True
@@ -196,7 +196,7 @@ def test_transformers_sparse_data():
             continue
         # catch deprecation warnings
         with warnings.catch_warnings(record=True):
-            if Trans is Scaler:
+            if Trans in [Scaler, StandardScaler]:
                 trans = Trans(with_mean=False)
             else:
                 trans = Trans()
@@ -267,7 +267,7 @@ def test_clustering():
     X, y = iris.data, iris.target
     X, y = shuffle(X, y, random_state=7)
     n_samples, n_features = X.shape
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)
     for name, Alg in clustering:
         if Alg is WardAgglomeration:
             # this is clustering on the features
@@ -308,7 +308,7 @@ def test_classifiers_train():
     iris = load_iris()
     X_m, y_m = iris.data, iris.target
     X_m, y_m = shuffle(X_m, y_m, random_state=7)
-    X_m = Scaler().fit_transform(X_m)
+    X_m = StandardScaler().fit_transform(X_m)
     # generate binary problem from multi-class one
     y_b = y_m[y_m != 2]
     X_b = X_m[y_m != 2]
@@ -378,7 +378,7 @@ def test_classifiers_classes():
     iris = load_iris()
     X, y = iris.data, iris.target
     X, y = shuffle(X, y, random_state=7)
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)
     y = 2 * y + 1
     # TODO: make work with next line :)
     #y = y.astype(np.str)
@@ -409,7 +409,7 @@ def test_regressors_int():
     boston = load_boston()
     X, y = boston.data, boston.target
     X, y = shuffle(X, y, random_state=0)
-    X = Scaler().fit_transform(X)
+    X = StandardScaler().fit_transform(X)
     y = np.random.randint(2, size=X.shape[0])
     for name, Reg in regressors:
         if Reg in dont_test or Reg in meta_estimators or Reg in (CCA,):
@@ -449,8 +449,8 @@ def test_regressors_train():
     X, y = shuffle(X, y, random_state=0)
     # TODO: test with intercept
     # TODO: test with multiple responses
-    X = Scaler().fit_transform(X)
-    y = Scaler().fit_transform(y)
+    X = StandardScaler().fit_transform(X)
+    y = StandardScaler().fit_transform(y)
     succeeded = True
     for name, Reg in regressors:
         if Reg in dont_test or Reg in meta_estimators:
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 19c65bb855d12549ac3e8626528d6006e3abb3b7..258c4b0598ee68794ebd7c628c653ffed35ef3ce 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -12,7 +12,7 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.feature_selection import SelectKBest, f_classif
 from sklearn.decomposition.pca import PCA, RandomizedPCA
 from sklearn.datasets import load_iris
-from sklearn.preprocessing import Scaler
+from sklearn.preprocessing import StandardScaler
 
 
 class IncorrectT(BaseEstimator):
@@ -152,7 +152,7 @@ def test_pipeline_methods_preprocessing_svm():
     y = iris.target
     n_samples = X.shape[0]
     n_classes = len(np.unique(y))
-    scaler = Scaler()
+    scaler = StandardScaler()
     pca = RandomizedPCA(n_components=2, whiten=True)
     clf = SVC(probability=True)