diff --git a/doc/datasets/rcv1.rst b/doc/datasets/rcv1.rst
index ded38584cedab330a8b121480a9e498b866ece89..280e90f8b82fbe279e77c21116e103b159cbb31b 100644
--- a/doc/datasets/rcv1.rst
+++ b/doc/datasets/rcv1.rst
@@ -35,7 +35,7 @@ Each sample can be identified by its ID, ranging (with gaps) from 2286 to 810596
     array([2286, 2287, 2288], dtype=int32)
 
 ``target_names``:
-The target values are the topics of each sample. Each sample belongs to at least one topic, and to up to 17 topics. 
+The target values are the topics of each sample. Each sample belongs to at least one topic, and to up to 17 topics.
 There are 103 topics, each represented by a string. Their corpus frequencies span five orders of magnitude, from 5 occurrences for 'GMIL', to 381327 for 'CCAT'::
 
     >>> rcv1.target_names[:3].tolist()  # doctest: +SKIP
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index 15cf04b4a9e96388866f2ccd2d05cf5ffa215fc6..fc12805df4a5275cf2742cbe893ab32928326b57 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -10,7 +10,7 @@ Generalized Linear models.
 #         Mathieu Blondel <mathieu@mblondel.org>
 #         Lars Buitinck <L.J.Buitinck@uva.nl>
 #         Maryan Morel <maryan.morel@polytechnique.edu>
-#
+#         Giorgio Patrini <giorgio.patrini@anu.edu.au>
 # License: BSD 3 clause
 
 from __future__ import division
@@ -26,20 +26,16 @@ from scipy import sparse
 from ..externals import six
 from ..externals.joblib import Parallel, delayed
 from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
-from ..utils import as_float_array, check_array, check_X_y, deprecated
-from ..utils import check_random_state, column_or_1d
+from ..utils import check_array, check_X_y, deprecated, as_float_array
+from ..utils.validation import FLOAT_DTYPES
+from ..utils import check_random_state
 from ..utils.extmath import safe_sparse_dot
 from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
 from ..utils.fixes import sparse_lsqr
 from ..utils.seq_dataset import ArrayDataset, CSRDataset
 from ..utils.validation import check_is_fitted
 from ..exceptions import NotFittedError
-
-
-#
-# TODO: intercept for all models
-# We should define a common function to center data instead of
-# repeating the same code inside each fit method.
+from ..preprocessing.data import normalize as f_normalize
 
 # TODO: bayesian_ridge_regression and bayesian_regression_ard
 # should be squashed into its respective objects.
@@ -71,6 +67,8 @@ def make_dataset(X, y, sample_weight, random_state=None):
     return dataset, intercept_decay
 
 
+@deprecated("sparse_center_data will be removed in "
+            "0.20. Use utilities in preprocessing.data instead")
 def sparse_center_data(X, y, fit_intercept, normalize=False):
     """
     Compute information needed to center data to have mean zero along
@@ -87,10 +85,9 @@ def sparse_center_data(X, y, fit_intercept, normalize=False):
         else:
             X = sp.csc_matrix(X, copy=normalize, dtype=np.float64)
 
-        X_mean, X_var = mean_variance_axis(X, axis=0)
+        X_offset, X_var = mean_variance_axis(X, axis=0)
         if normalize:
             # transform variance to std in-place
-            # XXX: currently scaled to variance=n_samples to match center_data
             X_var *= X.shape[0]
             X_std = np.sqrt(X_var, X_var)
             del X_var
@@ -98,22 +95,23 @@ def sparse_center_data(X, y, fit_intercept, normalize=False):
             inplace_column_scale(X, 1. / X_std)
         else:
             X_std = np.ones(X.shape[1])
-        y_mean = y.mean(axis=0)
-        y = y - y_mean
+        y_offset = y.mean(axis=0)
+        y = y - y_offset
     else:
-        X_mean = np.zeros(X.shape[1])
+        X_offset = np.zeros(X.shape[1])
         X_std = np.ones(X.shape[1])
-        y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
+        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
 
-    return X, y, X_mean, y_mean, X_std
+    return X, y, X_offset, y_offset, X_std
 
 
+@deprecated("center_data will be removed in "
+            "0.20. Use utilities in preprocessing.data instead")
 def center_data(X, y, fit_intercept, normalize=False, copy=True,
                 sample_weight=None):
     """
     Centers data to have mean zero along axis 0. This is here because
     nearly all linear models will want their data to be centered.
-
     If sample_weight is not None, then the weighted mean of X and y
     is zero, and not the mean itself
     """
@@ -122,26 +120,95 @@ def center_data(X, y, fit_intercept, normalize=False, copy=True,
         if isinstance(sample_weight, numbers.Number):
             sample_weight = None
         if sp.issparse(X):
-            X_mean = np.zeros(X.shape[1])
+            X_offset = np.zeros(X.shape[1])
             X_std = np.ones(X.shape[1])
         else:
-            X_mean = np.average(X, axis=0, weights=sample_weight)
-            X -= X_mean
+            X_offset = np.average(X, axis=0, weights=sample_weight)
+            X -= X_offset
+            # XXX: currently scaled to variance=n_samples
             if normalize:
-                # XXX: currently scaled to variance=n_samples
                 X_std = np.sqrt(np.sum(X ** 2, axis=0))
                 X_std[X_std == 0] = 1
                 X /= X_std
             else:
                 X_std = np.ones(X.shape[1])
-        y_mean = np.average(y, axis=0, weights=sample_weight)
-        y = y - y_mean
+        y_offset = np.average(y, axis=0, weights=sample_weight)
+        y = y - y_offset
     else:
-        X_mean = np.zeros(X.shape[1])
+        X_offset = np.zeros(X.shape[1])
         X_std = np.ones(X.shape[1])
-        y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
-    return X, y, X_mean, y_mean, X_std
+        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
+    return X, y, X_offset, y_offset, X_std
+
+
+def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
+                     sample_weight=None, return_mean=False):
+    """
+    Centers data to have mean zero along axis 0. If fit_intercept=False or if
+    X is a sparse matrix, no centering is done, but normalization can still
+    be applied. The function returns the statistics necessary to reconstruct
+    the input data, which are X_offset, y_offset, X_scale, such that the output
+
+        X = (X - X_offset) / X_scale
+
+    X_scale is the column-wise L2 norm of X - X_offset (ones if normalize is
+    False). If sample_weight is not None, then the weighted mean of X and y is
+    zero, and not the mean itself. If return_mean=True, the mean, possibly
+    weighted, is returned, independently of whether X was centered (option
+    used for optimization with sparse data in coordinate_descent).
+
+    This is here because nearly all linear models will want their data to be
+    centered.
+    """
+
+    if isinstance(sample_weight, numbers.Number):
+        sample_weight = None
+
+    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
+                    dtype=FLOAT_DTYPES)
+
+    if fit_intercept:
+        if sp.issparse(X):
+            X_offset, X_var = mean_variance_axis(X, axis=0)
+            if not return_mean:
+                X_offset = np.zeros(X.shape[1])
+
+            if normalize:
+
+                # TODO: f_normalize could be used here as well but the function
+                # inplace_csr_row_normalize_l2 must be changed such that it
+                # can return also the norms computed internally
+
+                # transform variance to norm in-place
+                X_var *= X.shape[0]
+                X_scale = np.sqrt(X_var, X_var)
+                del X_var
+                X_scale[X_scale == 0] = 1
+                inplace_column_scale(X, 1. / X_scale)
+            else:
+                X_scale = np.ones(X.shape[1])
+
+        else:
+            X_offset = np.average(X, axis=0, weights=sample_weight)
+            X -= X_offset
+            if normalize:
+                X, X_scale = f_normalize(X, axis=0, copy=False,
+                                         return_norm=True)
+            else:
+                X_scale = np.ones(X.shape[1])
+        y_offset = np.average(y, axis=0, weights=sample_weight)
+        y = y - y_offset
+    else:
+        X_offset = np.zeros(X.shape[1])
+        X_scale = np.ones(X.shape[1])
+        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
+
+    return X, y, X_offset, y_offset, X_scale
+
 
+# TODO: _rescale_data should be factored into _preprocess_data.
+# Currently, the fact that sag implements its own way to deal with
+# sample_weight makes the refactoring tricky.
 
 def _rescale_data(X, y, sample_weight):
     """Rescale data so as to support sample_weight"""
@@ -200,14 +267,14 @@ class LinearModel(six.with_metaclass(ABCMeta, BaseEstimator)):
         """
         return self._decision_function(X)
 
-    _center_data = staticmethod(center_data)
+    _preprocess_data = staticmethod(_preprocess_data)
 
-    def _set_intercept(self, X_mean, y_mean, X_std):
+    def _set_intercept(self, X_offset, y_offset, X_scale):
         """Set the intercept_
         """
         if self.fit_intercept:
-            self.coef_ = self.coef_ / X_std
-            self.intercept_ = y_mean - np.dot(X_mean, self.coef_.T)
+            self.coef_ = self.coef_ / X_scale
+            self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)
         else:
             self.intercept_ = 0.
 
@@ -360,6 +427,13 @@ class LinearRegression(LinearModel, RegressorMixin):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -435,13 +509,12 @@ class LinearRegression(LinearModel, RegressorMixin):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                          y_numeric=True, multi_output=True)
 
-        if ((sample_weight is not None) and np.atleast_1d(
-                sample_weight).ndim > 1):
-            sample_weight = column_or_1d(sample_weight, warn=True)
+        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
+            raise ValueError("Sample weights must be 1D array or scalar")
 
-        X, y, X_mean, y_mean, X_std = self._center_data(
-            X, y, self.fit_intercept, self.normalize, self.copy_X,
-            sample_weight=sample_weight)
+        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
+            X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
+            copy=self.copy_X, sample_weight=sample_weight)
 
         if sample_weight is not None:
             # Sample weight can be implemented via a simple rescaling.
@@ -466,7 +539,7 @@ class LinearRegression(LinearModel, RegressorMixin):
 
         if y.ndim == 1:
             self.coef_ = np.ravel(self.coef_)
-        self._set_intercept(X_mean, y_mean, X_std)
+        self._set_intercept(X_offset, y_offset, X_scale)
         return self
 
 
@@ -476,15 +549,16 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
 
     if sparse.isspmatrix(X):
         precompute = False
-        X, y, X_mean, y_mean, X_std = sparse_center_data(
-            X, y, fit_intercept, normalize)
+        X, y, X_offset, y_offset, X_scale = _preprocess_data(
+            X, y, fit_intercept=fit_intercept, normalize=normalize,
+            return_mean=True)
     else:
         # copy was done in fit if necessary
-        X, y, X_mean, y_mean, X_std = center_data(
-            X, y, fit_intercept, normalize, copy=copy)
+        X, y, X_offset, y_offset, X_scale = _preprocess_data(
+            X, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy)
     if hasattr(precompute, '__array__') and (
-            fit_intercept and not np.allclose(X_mean, np.zeros(n_features))
-            or normalize and not np.allclose(X_std, np.ones(n_features))):
+            fit_intercept and not np.allclose(X_offset, np.zeros(n_features)) or
+            normalize and not np.allclose(X_scale, np.ones(n_features))):
         warnings.warn("Gram matrix was provided but X was centered"
                       " to fit intercept, "
                       "or X was normalized : recomputing Gram matrix.",
@@ -521,4 +595,4 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
                           order='F')
             np.dot(y.T, X, out=Xy.T)
 
-    return X, y, X_mean, y_mean, X_std, precompute, Xy
+    return X, y, X_offset, y_offset, X_scale, precompute, Xy
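
The dense, unweighted branch of the new ``_preprocess_data`` helper and the
matching intercept recovery in ``LinearModel._set_intercept`` can be
summarised with plain NumPy. The sketch below only illustrates the behaviour
of the patch; the name ``preprocess_dense`` and the toy data are invented for
the example::

    import numpy as np

    def preprocess_dense(X, y, fit_intercept=True, normalize=False):
        # Mirror the dense branch: center the columns, optionally rescale
        # each centered column by its L2 norm, and center y.
        X = np.array(X, dtype=float)
        y = np.array(y, dtype=float)
        if fit_intercept:
            X_offset = X.mean(axis=0)
            X = X - X_offset
            if normalize:
                X_scale = np.sqrt((X ** 2).sum(axis=0))
                X_scale[X_scale == 0] = 1
                X = X / X_scale
            else:
                X_scale = np.ones(X.shape[1])
            y_offset = y.mean(axis=0)
            y = y - y_offset
        else:
            X_offset = np.zeros(X.shape[1])
            X_scale = np.ones(X.shape[1])
            y_offset = 0.
        return X, y, X_offset, y_offset, X_scale

    rng = np.random.RandomState(0)
    X, y = rng.randn(20, 3), rng.randn(20)
    Xp, yp, X_offset, y_offset, X_scale = preprocess_dense(X, y, normalize=True)

    # Fit on the preprocessed data, then undo scaling and centering exactly
    # as LinearModel._set_intercept does.
    coef = np.dot(np.linalg.pinv(Xp), yp)
    coef = coef / X_scale
    intercept = y_offset - np.dot(X_offset, coef)

Predictions ``np.dot(X, coef) + intercept`` on the raw data should then
coincide with an ordinary least-squares fit that includes an intercept.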
diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py
index 1cf541721a29397b0d0083169b42ffa006eb1784..79f913b63448964598448b987b7a7a176d791403 100644
--- a/sklearn/linear_model/bayes.py
+++ b/sklearn/linear_model/bayes.py
@@ -65,6 +65,13 @@ class BayesianRidge(LinearModel, RegressorMixin):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -135,11 +142,11 @@ class BayesianRidge(LinearModel, RegressorMixin):
         self : returns an instance of self.
         """
         X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
-        X, y, X_mean, y_mean, X_std = self._center_data(
+        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
         n_samples, n_features = X.shape
 
-        ### Initialization of the values of the parameters
+        # Initialization of the values of the parameters
         alpha_ = 1. / np.var(y)
         lambda_ = 1.
 
@@ -156,10 +163,10 @@ class BayesianRidge(LinearModel, RegressorMixin):
         U, S, Vh = linalg.svd(X, full_matrices=False)
         eigen_vals_ = S ** 2
 
-        ### Convergence loop of the bayesian ridge regression
+        # Convergence loop of the bayesian ridge regression
         for iter_ in range(self.n_iter):
 
-            ### Compute mu and sigma
+            # Compute mu and sigma
             # sigma_ = lambda_ / alpha_ * np.eye(n_features) + np.dot(X.T, X)
             # coef_ = sigma_^-1 * XT * y
             if n_samples > n_features:
@@ -178,28 +185,28 @@ class BayesianRidge(LinearModel, RegressorMixin):
                     logdet_sigma_[:n_samples] += alpha_ * eigen_vals_
                     logdet_sigma_ = - np.sum(np.log(logdet_sigma_))
 
-            ### Update alpha and lambda
+            # Update alpha and lambda
             rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
-            gamma_ = (np.sum((alpha_ * eigen_vals_)
-                      / (lambda_ + alpha_ * eigen_vals_)))
-            lambda_ = ((gamma_ + 2 * lambda_1)
-                       / (np.sum(coef_ ** 2) + 2 * lambda_2))
-            alpha_ = ((n_samples - gamma_ + 2 * alpha_1)
-                      / (rmse_ + 2 * alpha_2))
-
-            ### Compute the objective function
+            gamma_ = (np.sum((alpha_ * eigen_vals_) /
+                      (lambda_ + alpha_ * eigen_vals_)))
+            lambda_ = ((gamma_ + 2 * lambda_1) /
+                       (np.sum(coef_ ** 2) + 2 * lambda_2))
+            alpha_ = ((n_samples - gamma_ + 2 * alpha_1) /
+                      (rmse_ + 2 * alpha_2))
+
+            # Compute the objective function
             if self.compute_score:
                 s = lambda_1 * log(lambda_) - lambda_2 * lambda_
                 s += alpha_1 * log(alpha_) - alpha_2 * alpha_
-                s += 0.5 * (n_features * log(lambda_)
-                            + n_samples * log(alpha_)
-                            - alpha_ * rmse_
-                            - (lambda_ * np.sum(coef_ ** 2))
-                            - logdet_sigma_
-                            - n_samples * log(2 * np.pi))
+                s += 0.5 * (n_features * log(lambda_) +
+                            n_samples * log(alpha_) -
+                            alpha_ * rmse_ -
+                            (lambda_ * np.sum(coef_ ** 2)) -
+                            logdet_sigma_ -
+                            n_samples * log(2 * np.pi))
                 self.scores_.append(s)
 
-            ### Check for convergence
+            # Check for convergence
             if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                 if verbose:
                     print("Convergence after ", str(iter_), " iterations")
@@ -210,7 +217,7 @@ class BayesianRidge(LinearModel, RegressorMixin):
         self.lambda_ = lambda_
         self.coef_ = coef_
 
-        self._set_intercept(X_mean, y_mean, X_std)
+        self._set_intercept(X_offset, y_offset, X_scale)
         return self
 
 
@@ -269,6 +276,13 @@ class ARDRegression(LinearModel, RegressorMixin):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True.
         If True, X will be copied; else, it may be overwritten.
@@ -351,10 +365,10 @@ class ARDRegression(LinearModel, RegressorMixin):
         n_samples, n_features = X.shape
         coef_ = np.zeros(n_features)
 
-        X, y, X_mean, y_mean, X_std = self._center_data(
+        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
 
-        ### Launch the convergence loop
+        # Launch the convergence loop
         keep_lambda = np.ones(n_features, dtype=bool)
 
         lambda_1 = self.lambda_1
@@ -363,51 +377,51 @@ class ARDRegression(LinearModel, RegressorMixin):
         alpha_2 = self.alpha_2
         verbose = self.verbose
 
-        ### Initialization of the values of the parameters
+        # Initialization of the values of the parameters
         alpha_ = 1. / np.var(y)
         lambda_ = np.ones(n_features)
 
         self.scores_ = list()
         coef_old_ = None
 
-        ### Iterative procedure of ARDRegression
+        # Iterative procedure of ARDRegression
         for iter_ in range(self.n_iter):
-            ### Compute mu and sigma (using Woodbury matrix identity)
+            # Compute mu and sigma (using Woodbury matrix identity)
             sigma_ = pinvh(np.eye(n_samples) / alpha_ +
                            np.dot(X[:, keep_lambda] *
                            np.reshape(1. / lambda_[keep_lambda], [1, -1]),
                            X[:, keep_lambda].T))
-            sigma_ = np.dot(sigma_, X[:, keep_lambda]
-                            * np.reshape(1. / lambda_[keep_lambda], [1, -1]))
-            sigma_ = - np.dot(np.reshape(1. / lambda_[keep_lambda], [-1, 1])
-                              * X[:, keep_lambda].T, sigma_)
+            sigma_ = np.dot(sigma_, X[:, keep_lambda] *
+                            np.reshape(1. / lambda_[keep_lambda], [1, -1]))
+            sigma_ = - np.dot(np.reshape(1. / lambda_[keep_lambda], [-1, 1]) *
+                              X[:, keep_lambda].T, sigma_)
             sigma_.flat[::(sigma_.shape[1] + 1)] += 1. / lambda_[keep_lambda]
             coef_[keep_lambda] = alpha_ * np.dot(
                 sigma_, np.dot(X[:, keep_lambda].T, y))
 
-            ### Update alpha and lambda
+            # Update alpha and lambda
             rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
             gamma_ = 1. - lambda_[keep_lambda] * np.diag(sigma_)
-            lambda_[keep_lambda] = ((gamma_ + 2. * lambda_1)
-                                    / ((coef_[keep_lambda]) ** 2
-                                       + 2. * lambda_2))
-            alpha_ = ((n_samples - gamma_.sum() + 2. * alpha_1)
-                      / (rmse_ + 2. * alpha_2))
+            lambda_[keep_lambda] = ((gamma_ + 2. * lambda_1) /
+                                    ((coef_[keep_lambda]) ** 2 +
+                                     2. * lambda_2))
+            alpha_ = ((n_samples - gamma_.sum() + 2. * alpha_1) /
+                      (rmse_ + 2. * alpha_2))
 
-            ### Prune the weights with a precision over a threshold
+            # Prune the weights with a precision over a threshold
             keep_lambda = lambda_ < self.threshold_lambda
             coef_[~keep_lambda] = 0
 
-            ### Compute the objective function
+            # Compute the objective function
             if self.compute_score:
                 s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
                 s += alpha_1 * log(alpha_) - alpha_2 * alpha_
-                s += 0.5 * (fast_logdet(sigma_) + n_samples * log(alpha_)
-                                                + np.sum(np.log(lambda_)))
+                s += 0.5 * (fast_logdet(sigma_) + n_samples * log(alpha_) +
+                            np.sum(np.log(lambda_)))
                 s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_ ** 2).sum())
                 self.scores_.append(s)
 
-            ### Check for convergence
+            # Check for convergence
             if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                 if verbose:
                     print("Converged after %s iterations" % iter_)
@@ -418,5 +432,5 @@ class ARDRegression(LinearModel, RegressorMixin):
         self.alpha_ = alpha_
         self.sigma_ = sigma_
         self.lambda_ = lambda_
-        self._set_intercept(X_mean, y_mean, X_std)
+        self._set_intercept(X_offset, y_offset, X_scale)
         return self
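
The note added to the ``normalize`` docstrings above distinguishes this option
from standardization. A small sketch of the difference on made-up data (only
NumPy is used to keep it self-contained; ``preprocessing.StandardScaler`` with
default settings should give the same result as the manual standardization
shown)::

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3) * np.array([1., 10., 100.])
    Xc = X - X.mean(axis=0)

    X_l2 = Xc / np.sqrt((Xc ** 2).sum(axis=0))   # what normalize=True does
    X_std = Xc / Xc.std(axis=0)                  # what StandardScaler does

    print(np.linalg.norm(X_l2, axis=0))          # ~[1., 1., 1.]
    print(X_std.std(axis=0))                     # ~[1., 1., 1.]

A unit-norm column keeps the same norm whatever ``n_samples`` is (its variance
shrinks roughly as ``1 / n_samples``), which is roughly the sense in which
penalty hyperparameters tuned on normalized data depend less on the sample
size; standardized columns keep unit variance instead.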
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 1711fbe481be5e8a278d158cb83cc8f02bb5e340..cdad75b2f8c9d368aeb35badf4274fc4e251fb15 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -14,7 +14,7 @@ from scipy import sparse
 
 from .base import LinearModel, _pre_fit
 from ..base import RegressorMixin
-from .base import center_data, sparse_center_data
+from .base import _preprocess_data
 from ..utils import check_array, check_X_y, deprecated
 from ..utils.validation import check_random_state
 from ..model_selection import check_cv
@@ -65,7 +65,14 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True,
         Whether to fit an intercept or not
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If ``True``, X will be copied; else, it may be overwritten.
@@ -80,16 +87,17 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True,
                         copy=(copy_X and fit_intercept and not X_sparse))
         if not X_sparse:
             # X can be touched inplace thanks to the above line
-            X, y, _, _, _ = center_data(X, y, fit_intercept,
-                                        normalize, copy=False)
+            X, y, _, _, _ = _preprocess_data(X, y, fit_intercept,
+                                             normalize, copy=False)
         Xy = safe_sparse_dot(X.T, y, dense_output=True)
 
         if sparse_center:
             # Workaround to find alpha_max for sparse matrices.
             # since we should not destroy the sparsity of such matrices.
-            _, _, X_mean, _, X_std = sparse_center_data(X, y, fit_intercept,
-                                                        normalize)
-            mean_dot = X_mean * np.sum(y)
+            _, _, X_offset, _, X_scale = _preprocess_data(X, y, fit_intercept,
+                                                          normalize,
+                                                          return_mean=True)
+            mean_dot = X_offset * np.sum(y)
 
     if Xy.ndim == 1:
         Xy = Xy[:, np.newaxis]
@@ -98,7 +106,7 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True,
         if fit_intercept:
             Xy -= mean_dot[:, np.newaxis]
         if normalize:
-            Xy /= X_std[:, np.newaxis]
+            Xy /= X_scale[:, np.newaxis]
 
     alpha_max = (np.sqrt(np.sum(Xy ** 2, axis=1)).max() /
                  (n_samples * l1_ratio))
@@ -383,17 +391,17 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
 
     # MultiTaskElasticNet does not support sparse matrices
     if not multi_output and sparse.isspmatrix(X):
-        if 'X_mean' in params:
+        if 'X_offset' in params:
             # As sparse matrices are not actually centered we need this
             # to be passed to the CD solver.
-            X_sparse_scaling = params['X_mean'] / params['X_std']
+            X_sparse_scaling = params['X_offset'] / params['X_scale']
         else:
             X_sparse_scaling = np.zeros(n_features)
 
     # X should be normalized and fit already if function is called
     # from ElasticNet.fit
     if check_input:
-        X, y, X_mean, y_mean, X_std, precompute, Xy = \
+        X, y, X_offset, y_offset, X_scale, precompute, Xy = \
             _pre_fit(X, y, Xy, precompute, normalize=False,
                      fit_intercept=False, copy=False)
     if alphas is None:
@@ -529,7 +537,14 @@ class ElasticNet(LinearModel, RegressorMixin):
         data is assumed to be already centered.
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     precompute : True | False | array-like
         Whether to use a precomputed Gram matrix to speed up
@@ -642,6 +657,12 @@ class ElasticNet(LinearModel, RegressorMixin):
                           "well. You are advised to use the LinearRegression "
                           "estimator", stacklevel=2)
 
+        if (isinstance(self.precompute, six.string_types) and
+                self.precompute == 'auto'):
+            warnings.warn("Setting precompute to 'auto' was found to be "
+                          "slower even when n_samples > n_features. Hence "
+                          "it will be removed in 0.18.",
+                          DeprecationWarning, stacklevel=2)
         # We expect X and y to be already float64 Fortran ordered arrays
         # when bypassing checks
         if check_input:
@@ -652,7 +673,7 @@ class ElasticNet(LinearModel, RegressorMixin):
                              multi_output=True, y_numeric=True)
             y = check_array(y, dtype=np.float64, order='F', copy=False,
                             ensure_2d=False)
-        X, y, X_mean, y_mean, X_std, precompute, Xy = \
+        X, y, X_offset, y_offset, X_scale, precompute, Xy = \
             _pre_fit(X, y, None, self.precompute, self.normalize,
                      self.fit_intercept, copy=False)
         if y.ndim == 1:
@@ -689,7 +710,7 @@ class ElasticNet(LinearModel, RegressorMixin):
                           precompute=precompute, Xy=this_Xy,
                           fit_intercept=False, normalize=False, copy_X=True,
                           verbose=False, tol=self.tol, positive=self.positive,
-                          X_mean=X_mean, X_std=X_std, return_n_iter=True,
+                          X_offset=X_offset, X_scale=X_scale, return_n_iter=True,
                           coef_init=coef_[k], max_iter=self.max_iter,
                           random_state=self.random_state,
                           selection=self.selection,
@@ -702,7 +723,7 @@ class ElasticNet(LinearModel, RegressorMixin):
             self.n_iter_ = self.n_iter_[0]
 
         self.coef_, self.dual_gap_ = map(np.squeeze, [coef_, dual_gaps_])
-        self._set_intercept(X_mean, y_mean, X_std)
+        self._set_intercept(X_offset, y_offset, X_scale)
 
         # return self for chaining fit and predict calls
         return self
@@ -741,8 +762,9 @@ class ElasticNet(LinearModel, RegressorMixin):
         """
         check_is_fitted(self, 'n_iter_')
         if sparse.isspmatrix(X):
-            return np.ravel(safe_sparse_dot(self.coef_, X.T, dense_output=True)
-                            + self.intercept_)
+            return np.ravel(safe_sparse_dot(self.coef_, X.T,
+                                            dense_output=True) +
+                            self.intercept_)
         else:
             return super(ElasticNet, self)._decision_function(X)
 
@@ -777,7 +799,14 @@ class Lasso(ElasticNet):
         (e.g. data is expected to be already centered).
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If ``True``, X will be copied; else, it may be overwritten.
@@ -934,14 +963,14 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None,
         # Fall back to default enet_multitask
         precompute = False
 
-    X_train, y_train, X_mean, y_mean, X_std, precompute, Xy = \
+    X_train, y_train, X_offset, y_offset, X_scale, precompute, Xy = \
         _pre_fit(X_train, y_train, None, precompute, normalize, fit_intercept,
                  copy=False)
 
     path_params = path_params.copy()
     path_params['Xy'] = Xy
-    path_params['X_mean'] = X_mean
-    path_params['X_std'] = X_std
+    path_params['X_offset'] = X_offset
+    path_params['X_scale'] = X_scale
     path_params['precompute'] = precompute
     path_params['copy_X'] = False
     path_params['alphas'] = alphas
@@ -958,14 +987,14 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None,
     if y.ndim == 1:
         # Doing this so that it becomes coherent with multioutput.
         coefs = coefs[np.newaxis, :, :]
-        y_mean = np.atleast_1d(y_mean)
+        y_offset = np.atleast_1d(y_offset)
         y_test = y_test[:, np.newaxis]
 
     if normalize:
-        nonzeros = np.flatnonzero(X_std)
-        coefs[:, nonzeros] /= X_std[nonzeros][:, np.newaxis]
+        nonzeros = np.flatnonzero(X_scale)
+        coefs[:, nonzeros] /= X_scale[nonzeros][:, np.newaxis]
 
-    intercepts = y_mean[:, np.newaxis] - np.dot(X_mean, coefs)
+    intercepts = y_offset[:, np.newaxis] - np.dot(X_offset, coefs)
     if sparse.issparse(X_test):
         n_order, n_features, n_alphas = coefs.shape
         # Work around for sparse matices since coefs is a 3-D numpy array.
@@ -1069,7 +1098,7 @@ class LinearModelCV(six.with_metaclass(ABCMeta, LinearModel)):
             X = check_array(X, 'csc', copy=False)
             if sparse.isspmatrix(X):
                 if (hasattr(reference_to_old_X, "data") and
-                        not np.may_share_memory(reference_to_old_X.data, X.data)):
+                   not np.may_share_memory(reference_to_old_X.data, X.data)):
                     # X is a sparse matrix and has been copied
                     copy_X = False
             elif not np.may_share_memory(reference_to_old_X, X):
@@ -1256,7 +1285,14 @@ class LassoCV(LinearModelCV, RegressorMixin):
         (e.g. data is expected to be already centered).
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If ``True``, X will be copied; else, it may be overwritten.
@@ -1403,7 +1439,14 @@ class ElasticNetCV(LinearModelCV, RegressorMixin):
         (e.g. data is expected to be already centered).
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If ``True``, X will be copied; else, it may be overwritten.
@@ -1528,7 +1571,14 @@ class MultiTaskElasticNet(Lasso):
         (e.g. data is expected to be already centered).
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If ``True``, X will be copied; else, it may be overwritten.
@@ -1650,7 +1700,7 @@ class MultiTaskElasticNet(Lasso):
             raise ValueError("X and y have inconsistent dimensions (%d != %d)"
                              % (n_samples, y.shape[0]))
 
-        X, y, X_mean, y_mean, X_std = center_data(
+        X, y, X_offset, y_offset, X_scale = _preprocess_data(
             X, y, self.fit_intercept, self.normalize, copy=False)
 
         if not self.warm_start or self.coef_ is None:
@@ -1671,7 +1721,7 @@ class MultiTaskElasticNet(Lasso):
                 self.coef_, l1_reg, l2_reg, X, y, self.max_iter, self.tol,
                 check_random_state(self.random_state), random)
 
-        self._set_intercept(X_mean, y_mean, X_std)
+        self._set_intercept(X_offset, y_offset, X_scale)
 
         if self.dual_gap_ > self.eps_:
             warnings.warn('Objective did not converge, you might want'
@@ -1707,7 +1757,14 @@ class MultiTaskLasso(MultiTaskElasticNet):
         (e.g. data is expected to be already centered).
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If ``True``, X will be copied; else, it may be overwritten.
@@ -1837,7 +1894,14 @@ class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin):
         (e.g. data is expected to be already centered).
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If ``True``, X will be copied; else, it may be overwritten.
@@ -1995,7 +2059,14 @@ class MultiTaskLassoCV(LinearModelCV, RegressorMixin):
         (e.g. data is expected to be already centered).
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If ``True``, X will be copied; else, it may be overwritten.
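
For sparse input the patch passes ``X_offset / X_scale`` to the
coordinate-descent solver (as ``X_sparse_scaling`` above) rather than
materialising a centered matrix. The sketch below, on made-up data, shows why
that is sufficient: centering only adds a rank-one correction to the
predictions::

    import numpy as np
    import scipy.sparse as sp

    rng = np.random.RandomState(0)
    X = sp.random(20, 5, density=0.3, format='csr', random_state=rng)
    w = rng.randn(5)

    X_offset = np.asarray(X.mean(axis=0)).ravel()
    X_scale = np.ones(5)                     # no normalization in this sketch

    dense_pred = np.dot((X.toarray() - X_offset) / X_scale, w)
    sparse_pred = X.dot(w / X_scale) - np.dot(X_offset / X_scale, w)

    print(np.allclose(dense_pred, sparse_pred))   # True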
diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py
index 8f6531b7d7f5962f6e3eecd169e4b4f137251eb6..70c93c7be5bc8593ee707e5daa662c80a4f16d27 100644
--- a/sklearn/linear_model/least_angle.py
+++ b/sklearn/linear_model/least_angle.py
@@ -295,9 +295,9 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500,
                 # We have degenerate vectors in our active set.
                 # We'll 'drop for good' the last regressor added.
 
-                # Note: this case is very rare. It is no longer triggered by the
-                # test suite. The `equality_tolerance` margin added in 0.16.0 to
-                # get early stopping to work consistently on all versions of
+                # Note: this case is very rare. It is no longer triggered by
+                # the test suite. The `equality_tolerance` margin added in 0.16
+                # to get early stopping to work consistently on all versions of
                 # Python including 32 bit Python under Windows seems to make it
                 # very difficult to trigger the 'drop for good' strategy.
                 warnings.warn('Regressors in active set degenerate. '
@@ -512,7 +512,14 @@ class Lars(LinearModel, RegressorMixin):
         Sets the verbosity amount
 
     normalize : boolean, optional, default False
-        If ``True``, the regressors X will be normalized before regression.
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     precompute : True | False | 'auto' | array-like
         Whether to use a precomputed Gram matrix to speed up
@@ -628,7 +635,7 @@ class Lars(LinearModel, RegressorMixin):
         X, y = check_X_y(X, y, y_numeric=True, multi_output=True)
         n_features = X.shape[1]
 
-        X, y, X_mean, y_mean, X_std = self._center_data(X, y,
+        X, y, X_offset, y_offset, X_scale = self._preprocess_data(X, y,
                                                         self.fit_intercept,
                                                         self.normalize,
                                                         self.copy_X)
@@ -695,7 +702,7 @@ class Lars(LinearModel, RegressorMixin):
             if n_targets == 1:
                 self.alphas_ = self.alphas_[0]
                 self.n_iter_ = self.n_iter_[0]
-        self._set_intercept(X_mean, y_mean, X_std)
+        self._set_intercept(X_offset, y_offset, X_scale)
         return self
 
 
@@ -739,6 +746,13 @@ class LassoLars(Lars):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -889,6 +903,13 @@ def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None,
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     max_iter : integer, optional
         Maximum number of iterations to perform.
@@ -968,6 +989,13 @@ class LarsCV(Lars):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If ``True``, X will be copied; else, it may be overwritten.
@@ -1171,6 +1199,13 @@ class LassoLarsCV(LarsCV):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     precompute : True | False | 'auto' | array-like
         Whether to use a precomputed Gram matrix to speed up
@@ -1299,6 +1334,13 @@ class LassoLarsIC(LassoLars):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -1402,7 +1444,7 @@ class LassoLarsIC(LassoLars):
         self.fit_path = True
         X, y = check_X_y(X, y, y_numeric=True)
 
-        X, y, Xmean, ymean, Xstd = LinearModel._center_data(
+        X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
         max_iter = self.max_iter
 
diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py
index 967122765c438ff51514eef35679776908c77530..269e18fe7029b4dcb976f2c58f19a2bf347859c2 100644
--- a/sklearn/linear_model/omp.py
+++ b/sklearn/linear_model/omp.py
@@ -557,8 +557,15 @@ class OrthogonalMatchingPursuit(LinearModel, RegressorMixin):
         to false, no intercept will be used in calculations
         (e.g. data is expected to be already centered).
 
-    normalize : boolean, optional
-        If False, the regressors X are assumed to be already normalized.
+    normalize : boolean, optional, default False
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to `False`.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     precompute : {True, False, 'auto'}, default 'auto'
         Whether to use a precomputed Gram and Xy matrix to speed up
@@ -629,7 +636,7 @@ class OrthogonalMatchingPursuit(LinearModel, RegressorMixin):
         X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
         n_features = X.shape[1]
 
-        X, y, X_mean, y_mean, X_std, Gram, Xy = \
+        X, y, X_offset, y_offset, X_scale, Gram, Xy = \
             _pre_fit(X, y, None, self.precompute, self.normalize,
                      self.fit_intercept, copy=True)
 
@@ -657,7 +664,7 @@ class OrthogonalMatchingPursuit(LinearModel, RegressorMixin):
                 copy_Gram=True, copy_Xy=True,
                 return_n_iter=True)
         self.coef_ = coef_.T
-        self._set_intercept(X_mean, y_mean, X_std)
+        self._set_intercept(X_offset, y_offset, X_scale)
         return self
 
 
@@ -690,6 +697,13 @@ def _omp_path_residues(X_train, y_train, X_test, y_test, copy=True,
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to `False`.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     max_iter : integer, optional
         Maximum numbers of iterations to perform, therefore maximum features
@@ -748,8 +762,15 @@ class OrthogonalMatchingPursuitCV(LinearModel, RegressorMixin):
         to false, no intercept will be used in calculations
         (e.g. data is expected to be already centered).
 
-    normalize : boolean, optional
-        If False, the regressors X are assumed to be already normalized.
+    normalize : boolean, optional, default False
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to `False`.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     max_iter : integer, optional
         Maximum numbers of iterations to perform, therefore maximum features
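
``OrthogonalMatchingPursuit.fit`` obtains ``Gram`` and ``Xy`` from
``_pre_fit``, which warns and recomputes the Gram matrix whenever a
user-supplied one no longer matches the preprocessed ``X``. A sketch of why,
on made-up data::

    import numpy as np

    rng = np.random.RandomState(42)
    X = rng.rand(30, 4) + 5.                  # columns far from zero mean
    gram_raw = np.dot(X.T, X)                 # Gram a user might precompute

    Xc = X - X.mean(axis=0)                   # effect of fit_intercept=True
    print(np.allclose(gram_raw, np.dot(Xc.T, Xc)))   # False -> recompute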
diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py
index 962438545856d7f8fc9baf7e58db5f9a0783ce33..e4c9ab20d8dd3dfa7dbf93e5bd73c149645d5d0d 100644
--- a/sklearn/linear_model/randomized_l1.py
+++ b/sklearn/linear_model/randomized_l1.py
@@ -15,7 +15,7 @@ from scipy.sparse import issparse
 from scipy import sparse
 from scipy.interpolate import interp1d
 
-from .base import center_data
+from .base import _preprocess_data
 from ..base import BaseEstimator, TransformerMixin
 from ..externals import six
 from ..externals.joblib import Memory, Parallel, delayed
@@ -71,7 +71,7 @@ class BaseRandomizedLinearModel(six.with_metaclass(ABCMeta, BaseEstimator,
     def __init__(self):
         pass
 
-    _center_data = staticmethod(center_data)
+    _preprocess_data = staticmethod(_preprocess_data)
 
     def fit(self, X, y):
         """Fit the model using X, y as training data.
@@ -94,9 +94,8 @@ class BaseRandomizedLinearModel(six.with_metaclass(ABCMeta, BaseEstimator,
         X = as_float_array(X, copy=False)
         n_samples, n_features = X.shape
 
-        X, y, X_mean, y_mean, X_std = self._center_data(X, y,
-                                                        self.fit_intercept,
-                                                        self.normalize)
+        X, y, X_offset, y_offset, X_scale = \
+            self._preprocess_data(X, y, self.fit_intercept, self.normalize)
 
         estimator_func, params = self._make_estimator_and_params(X, y)
         memory = self.memory
@@ -223,8 +222,15 @@ class RandomizedLasso(BaseRandomizedLinearModel):
     verbose : boolean or integer, optional
         Sets the verbosity amount
 
-    normalize : boolean, optional, default True
+    normalize : boolean, optional, default True
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     precompute : True | False | 'auto'
         Whether to use a precomputed Gram matrix to speed up
@@ -405,8 +411,15 @@ class RandomizedLogisticRegression(BaseRandomizedLinearModel):
     verbose : boolean or integer, optional
         Sets the verbosity amount
 
-    normalize : boolean, optional, default=True
+    normalize : boolean, optional, default True
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     tol : float, optional, default=1e-3
          tolerance for stopping criteria of LogisticRegression
@@ -500,11 +513,11 @@ class RandomizedLogisticRegression(BaseRandomizedLinearModel):
                       fit_intercept=self.fit_intercept)
         return _randomized_logistic, params
 
-    def _center_data(self, X, y, fit_intercept, normalize=False):
+    def _preprocess_data(self, X, y, fit_intercept, normalize=False):
         """Center the data in X but not in y"""
-        X, _, Xmean, _, X_std = center_data(X, y, fit_intercept,
-                                            normalize=normalize)
-        return X, y, Xmean, y, X_std
+        X, _, X_offset, _, X_scale = _preprocess_data(X, y, fit_intercept,
+                                                      normalize=normalize)
+        return X, y, X_offset, y, X_scale
 
 
 ###############################################################################
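
``randomized_l1.py``, like the modules above, now goes through the new
``_preprocess_data`` helper, while the old ``center_data`` and
``sparse_center_data`` functions in ``base.py`` stay available behind
``@deprecated``. The sketch below shows what callers of the old helpers see
during the transition; ``old_center`` is a stand-in defined only for the
example::

    import warnings
    import numpy as np
    from sklearn.utils import deprecated

    @deprecated("old_center will be removed in 0.20. "
                "Use utilities in preprocessing.data instead")
    def old_center(X):
        return X - X.mean(axis=0)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        old_center(np.arange(6.).reshape(3, 2))

    print(caught[0].message)   # deprecation message mentioning old_center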
diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py
index 955fa01bdcd41cee92d5246aa02e100cc995063a..913f862cf6c60adbae68d7dd9a7e370f796efce6 100644
--- a/sklearn/linear_model/ridge.py
+++ b/sklearn/linear_model/ridge.py
@@ -280,7 +280,7 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
         If True and if X is sparse, the method also returns the intercept,
         and the solver is automatically changed to 'sag'. This is only a
         temporary fix for fitting the intercept with sparse data. For dense
-        data, use sklearn.linear_model.center_data before your regression.
+        data, use sklearn.linear_model._preprocess_data before your regression.
 
         .. versionadded:: 0.17
 
@@ -463,7 +463,7 @@ class _BaseRidge(six.with_metaclass(ABCMeta, LinearModel)):
                 np.atleast_1d(sample_weight).ndim > 1):
             raise ValueError("Sample weights must be 1D array or scalar")
 
-        X, y, X_mean, y_mean, X_std = self._center_data(
+        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X,
             sample_weight=sample_weight)
 
@@ -474,14 +474,14 @@ class _BaseRidge(six.with_metaclass(ABCMeta, LinearModel)):
                 max_iter=self.max_iter, tol=self.tol, solver=self.solver,
                 random_state=self.random_state, return_n_iter=True,
                 return_intercept=True)
-            self.intercept_ += y_mean
+            self.intercept_ += y_offset
         else:
             self.coef_, self.n_iter_ = ridge_regression(
                 X, y, alpha=self.alpha, sample_weight=sample_weight,
                 max_iter=self.max_iter, tol=self.tol, solver=self.solver,
                 random_state=self.random_state, return_n_iter=True,
                 return_intercept=False)
-            self._set_intercept(X_mean, y_mean, X_std)
+            self._set_intercept(X_offset, y_offset, X_scale)
 
         return self
 
@@ -521,6 +521,13 @@ class Ridge(_BaseRidge, RegressorMixin):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag'}
         Solver to use in the computational routines:
@@ -663,6 +670,13 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag'}
         Solver to use in the computational routines:
@@ -921,7 +935,7 @@ class _RidgeGCV(LinearModel):
 
         n_samples, n_features = X.shape
 
-        X, y, X_mean, y_mean, X_std = LinearModel._center_data(
+        X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X,
             sample_weight=sample_weight)
 
@@ -989,7 +1003,7 @@ class _RidgeGCV(LinearModel):
         self.dual_coef_ = C[best]
         self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)
 
-        self._set_intercept(X_mean, y_mean, X_std)
+        self._set_intercept(X_offset, y_offset, X_scale)
 
         if self.store_cv_values:
             if len(y.shape) == 1:
@@ -1085,6 +1099,13 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     scoring : string, callable or None, optional, default: None
         A string (see model evaluation documentation) or
@@ -1179,6 +1200,13 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learnt more robust and almost independent of the number
+        of samples. The same property is not valid for standardized data.
+        However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     scoring : string, callable or None, optional, default: None
         A string (see model evaluation documentation) or
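
Throughout ridge.py the values returned by ``_preprocess_data`` are now named ``X_offset``, ``y_offset`` and ``X_scale`` and are passed to ``_set_intercept``. As a reminder of what that bookkeeping achieves, here is a hedged NumPy-only sketch (illustrative, not the library code itself)::

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.rand(100, 3)
    true_coef = np.array([1.0, -2.0, 0.5])
    y = np.dot(X, true_coef) + 3.0

    # Center the data, keeping the offsets around; X_scale would hold the
    # per-feature norms when normalize=True, here it stays at 1.
    X_offset = X.mean(axis=0)
    y_offset = y.mean()
    X_scale = np.ones(X.shape[1])
    Xc = (X - X_offset) / X_scale
    yc = y - y_offset

    coef = np.dot(np.linalg.pinv(Xc), yc)

    # Same idea as _set_intercept: undo the scaling, then rebuild the
    # intercept from the stored offsets.
    coef = coef / X_scale
    intercept = y_offset - np.dot(X_offset, coef)
    print(coef, intercept)   # roughly [1.0, -2.0, 0.5] and 3.0
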
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
index f8a2d531e0c21d53facf00f9af73b52160403eec..4c7b326f24f448996b6e35401827df2495a2dec3 100644
--- a/sklearn/linear_model/tests/test_base.py
+++ b/sklearn/linear_model/tests/test_base.py
@@ -6,20 +6,25 @@
 import numpy as np
 from scipy import sparse
 from scipy import linalg
+from itertools import product
+
 
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_equal
+from sklearn.utils.testing import ignore_warnings
 
 from sklearn.linear_model.base import LinearRegression
-from sklearn.linear_model.base import center_data
-from sklearn.linear_model.base import sparse_center_data
+from sklearn.linear_model.base import _preprocess_data
+from sklearn.linear_model.base import sparse_center_data, center_data
 from sklearn.linear_model.base import _rescale_data
 from sklearn.utils import check_random_state
 from sklearn.utils.testing import assert_greater
 from sklearn.datasets.samples_generator import make_sparse_uncorrelated
 from sklearn.datasets.samples_generator import make_regression
 
+rng = np.random.RandomState(0)
+
 
 def test_linear_regression():
     # Test LinearRegression on a simple dataset.
@@ -93,8 +98,6 @@ def test_raises_value_error_if_sample_weights_greater_than_1d():
     n_sampless = [2, 3]
     n_featuress = [3, 2]
 
-    rng = np.random.RandomState(42)
-
     for n_samples, n_features in zip(n_sampless, n_featuress):
         X = rng.randn(n_samples, n_features)
         y = rng.randn(n_samples)
@@ -181,73 +184,69 @@ def test_linear_regression_sparse_multiple_outcome(random_state=0):
     assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
 
 
-def test_center_data():
+def test_preprocess_data():
     n_samples = 200
     n_features = 2
-    rng = check_random_state(0)
     X = rng.rand(n_samples, n_features)
     y = rng.rand(n_samples)
     expected_X_mean = np.mean(X, axis=0)
-    # XXX: currently scaled to variance=n_samples
-    expected_X_std = np.std(X, axis=0) * np.sqrt(X.shape[0])
+    expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0])
     expected_y_mean = np.mean(y, axis=0)
 
-    Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=False,
-                                                normalize=False)
+    Xt, yt, X_mean, y_mean, X_norm = \
+        _preprocess_data(X, y, fit_intercept=False, normalize=False)
     assert_array_almost_equal(X_mean, np.zeros(n_features))
     assert_array_almost_equal(y_mean, 0)
-    assert_array_almost_equal(X_std, np.ones(n_features))
+    assert_array_almost_equal(X_norm, np.ones(n_features))
     assert_array_almost_equal(Xt, X)
     assert_array_almost_equal(yt, y)
 
-    Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
-                                                normalize=False)
+    Xt, yt, X_mean, y_mean, X_norm = \
+        _preprocess_data(X, y, fit_intercept=True, normalize=False)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
-    assert_array_almost_equal(X_std, np.ones(n_features))
+    assert_array_almost_equal(X_norm, np.ones(n_features))
     assert_array_almost_equal(Xt, X - expected_X_mean)
     assert_array_almost_equal(yt, y - expected_y_mean)
 
-    Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
-                                                normalize=True)
+    Xt, yt, X_mean, y_mean, X_norm = \
+        _preprocess_data(X, y, fit_intercept=True, normalize=True)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
-    assert_array_almost_equal(X_std, expected_X_std)
-    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_std)
+    assert_array_almost_equal(X_norm, expected_X_norm)
+    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
     assert_array_almost_equal(yt, y - expected_y_mean)
 
 
-def test_center_data_multioutput():
+def test_preprocess_data_multioutput():
     n_samples = 200
     n_features = 3
     n_outputs = 2
-    rng = check_random_state(0)
     X = rng.rand(n_samples, n_features)
     y = rng.rand(n_samples, n_outputs)
     expected_y_mean = np.mean(y, axis=0)
 
-    args = [(center_data, X), (sparse_center_data, sparse.csc_matrix(X))]
-    for center, X in args:
-        _, yt, _, y_mean, _ = center(X, y, fit_intercept=False,
-                                     normalize=False)
+    args = [X, sparse.csc_matrix(X)]
+    for X in args:
+        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=False,
+                                               normalize=False)
         assert_array_almost_equal(y_mean, np.zeros(n_outputs))
         assert_array_almost_equal(yt, y)
 
-        _, yt, _, y_mean, _ = center(X, y, fit_intercept=True,
-                                     normalize=False)
+        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True,
+                                               normalize=False)
         assert_array_almost_equal(y_mean, expected_y_mean)
         assert_array_almost_equal(yt, y - y_mean)
 
-        _, yt, _, y_mean, _ = center(X, y, fit_intercept=True,
-                                     normalize=True)
+        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True,
+                                               normalize=True)
         assert_array_almost_equal(y_mean, expected_y_mean)
         assert_array_almost_equal(yt, y - y_mean)
 
 
-def test_center_data_weighted():
+def test_preprocess_data_weighted():
     n_samples = 200
     n_features = 2
-    rng = check_random_state(0)
     X = rng.rand(n_samples, n_features)
     y = rng.rand(n_samples)
     sample_weight = rng.rand(n_samples)
@@ -256,75 +255,72 @@ def test_center_data_weighted():
 
     # XXX: if normalize=True, should we expect a weighted standard deviation?
     #      Currently not weighted, but calculated with respect to weighted mean
-    # XXX: currently scaled to variance=n_samples
-    expected_X_std = (np.sqrt(X.shape[0]) *
-                      np.mean((X - expected_X_mean) ** 2, axis=0) ** .5)
+    expected_X_norm = (np.sqrt(X.shape[0]) *
+                       np.mean((X - expected_X_mean) ** 2, axis=0) ** .5)
 
-    Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
-                                                normalize=False,
-                                                sample_weight=sample_weight)
+    Xt, yt, X_mean, y_mean, X_norm = \
+        _preprocess_data(X, y, fit_intercept=True, normalize=False,
+                         sample_weight=sample_weight)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
-    assert_array_almost_equal(X_std, np.ones(n_features))
+    assert_array_almost_equal(X_norm, np.ones(n_features))
     assert_array_almost_equal(Xt, X - expected_X_mean)
     assert_array_almost_equal(yt, y - expected_y_mean)
 
-    Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
-                                                normalize=True,
-                                                sample_weight=sample_weight)
+    Xt, yt, X_mean, y_mean, X_norm = \
+        _preprocess_data(X, y, fit_intercept=True, normalize=True,
+                         sample_weight=sample_weight)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
-    assert_array_almost_equal(X_std, expected_X_std)
-    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_std)
+    assert_array_almost_equal(X_norm, expected_X_norm)
+    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
     assert_array_almost_equal(yt, y - expected_y_mean)
 
 
-def test_sparse_center_data():
+def test_sparse_preprocess_data_with_return_mean():
     n_samples = 200
     n_features = 2
-    rng = check_random_state(0)
     # random_state not supported yet in sparse.rand
     X = sparse.rand(n_samples, n_features, density=.5)  # , random_state=rng
     X = X.tolil()
     y = rng.rand(n_samples)
     XA = X.toarray()
-    # XXX: currently scaled to variance=n_samples
-    expected_X_std = np.std(XA, axis=0) * np.sqrt(X.shape[0])
+    expected_X_norm = np.std(XA, axis=0) * np.sqrt(X.shape[0])
 
-    Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y,
-                                                       fit_intercept=False,
-                                                       normalize=False)
+    Xt, yt, X_mean, y_mean, X_norm = \
+        _preprocess_data(X, y, fit_intercept=False, normalize=False,
+                         return_mean=True)
     assert_array_almost_equal(X_mean, np.zeros(n_features))
     assert_array_almost_equal(y_mean, 0)
-    assert_array_almost_equal(X_std, np.ones(n_features))
+    assert_array_almost_equal(X_norm, np.ones(n_features))
     assert_array_almost_equal(Xt.A, XA)
     assert_array_almost_equal(yt, y)
 
-    Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y,
-                                                       fit_intercept=True,
-                                                       normalize=False)
+    Xt, yt, X_mean, y_mean, X_norm = \
+        _preprocess_data(X, y, fit_intercept=True, normalize=False,
+                         return_mean=True)
     assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
     assert_array_almost_equal(y_mean, np.mean(y, axis=0))
-    assert_array_almost_equal(X_std, np.ones(n_features))
+    assert_array_almost_equal(X_norm, np.ones(n_features))
     assert_array_almost_equal(Xt.A, XA)
     assert_array_almost_equal(yt, y - np.mean(y, axis=0))
 
-    Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y,
-                                                       fit_intercept=True,
-                                                       normalize=True)
+    Xt, yt, X_mean, y_mean, X_norm = \
+        _preprocess_data(X, y, fit_intercept=True, normalize=True,
+                         return_mean=True)
     assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
     assert_array_almost_equal(y_mean, np.mean(y, axis=0))
-    assert_array_almost_equal(X_std, expected_X_std)
-    assert_array_almost_equal(Xt.A, XA / expected_X_std)
+    assert_array_almost_equal(X_norm, expected_X_norm)
+    assert_array_almost_equal(Xt.A, XA / expected_X_norm)
     assert_array_almost_equal(yt, y - np.mean(y, axis=0))
 
 
-def test_csr_sparse_center_data():
-    # Test output format of sparse_center_data, when input is csr
+def test_csr_preprocess_data():
+    # Test output format of _preprocess_data when the input is a CSR matrix
     X, y = make_regression()
     X[X < 2.5] = 0.0
     csr = sparse.csr_matrix(X)
-    csr_, y, _, _, _ = sparse_center_data(csr, y, True)
+    csr_, y, _, _, _ = _preprocess_data(csr, y, True)
     assert_equal(csr_.getformat(), 'csr')
 
 
@@ -332,7 +328,6 @@ def test_rescale_data():
     n_samples = 200
     n_features = 2
 
-    rng = np.random.RandomState(0)
     sample_weight = 1.0 + rng.rand(n_samples)
     X = rng.rand(n_samples, n_features)
     y = rng.rand(n_samples)
@@ -341,3 +336,74 @@ def test_rescale_data():
     rescaled_y2 = y * np.sqrt(sample_weight)
     assert_array_almost_equal(rescaled_X, rescaled_X2)
     assert_array_almost_equal(rescaled_y, rescaled_y2)
+
+
+@ignore_warnings  # all deprecation warnings
+def test_deprecation_center_data():
+    n_samples = 200
+    n_features = 2
+
+    w = 1.0 + rng.rand(n_samples)
+    X = rng.rand(n_samples, n_features)
+    y = rng.rand(n_samples)
+
+    param_grid = product([True, False], [True, False], [True, False],
+                         [None, w])
+
+    for (fit_intercept, normalize, copy, sample_weight) in param_grid:
+
+        XX = X.copy()  # such that we can try copy=False as well
+
+        X1, y1, X1_mean, y1_mean, X1_scale = \
+            center_data(XX, y, fit_intercept=fit_intercept,
+                        normalize=normalize, copy=copy,
+                        sample_weight=sample_weight)
+
+        XX = X.copy()
+
+        X2, y2, X2_mean, y2_mean, X2_scale = \
+            _preprocess_data(XX, y, fit_intercept=fit_intercept,
+                             normalize=normalize, copy=copy,
+                             sample_weight=sample_weight)
+
+        assert_array_almost_equal(X1, X2)
+        assert_array_almost_equal(y1, y2)
+        assert_array_almost_equal(X1_mean, X2_mean)
+        assert_array_almost_equal(y1_mean, y2_mean)
+        assert_array_almost_equal(X1_scale, X2_scale)
+
+    # Sparse cases
+    X = sparse.csr_matrix(X)
+
+    for (fit_intercept, normalize, copy, sample_weight) in param_grid:
+
+        X1, y1, X1_mean, y1_mean, X1_scale = \
+            center_data(X, y, fit_intercept=fit_intercept, normalize=normalize,
+                        copy=copy, sample_weight=sample_weight)
+
+        X2, y2, X2_mean, y2_mean, X2_scale = \
+            _preprocess_data(X, y, fit_intercept=fit_intercept,
+                             normalize=normalize, copy=copy,
+                             sample_weight=sample_weight, return_mean=False)
+
+        assert_array_almost_equal(X1.toarray(), X2.toarray())
+        assert_array_almost_equal(y1, y2)
+        assert_array_almost_equal(X1_mean, X2_mean)
+        assert_array_almost_equal(y1_mean, y2_mean)
+        assert_array_almost_equal(X1_scale, X2_scale)
+
+    for (fit_intercept, normalize) in product([True, False], [True, False]):
+
+        X1, y1, X1_mean, y1_mean, X1_scale = \
+            sparse_center_data(X, y, fit_intercept=fit_intercept,
+                               normalize=normalize)
+
+        X2, y2, X2_mean, y2_mean, X2_scale = \
+            _preprocess_data(X, y, fit_intercept=fit_intercept,
+                             normalize=normalize, return_mean=True)
+
+        assert_array_almost_equal(X1.toarray(), X2.toarray())
+        assert_array_almost_equal(y1, y2)
+        assert_array_almost_equal(X1_mean, X2_mean)
+        assert_array_almost_equal(y1_mean, y2_mean)
+        assert_array_almost_equal(X1_scale, X2_scale)
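
The assertions in ``test_preprocess_data`` above pin down the dense behaviour: with ``fit_intercept=True`` the columns of ``X`` (and ``y``) are mean-centered, and with ``normalize=True`` each centered column is further divided by its Euclidean norm, which equals ``std * sqrt(n_samples)``. A small standalone check of that identity (not taken from the test suite)::

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.rand(200, 2)

    Xc = X - X.mean(axis=0)
    col_norms = np.sqrt((Xc ** 2).sum(axis=0))            # Euclidean norm of each centered column
    scaled_std = np.std(X, axis=0) * np.sqrt(X.shape[0])  # the form used in the tests above

    # Both expressions describe the same scaling factor.
    assert np.allclose(col_norms, scaled_std)
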
diff --git a/sklearn/linear_model/tests/test_randomized_l1.py b/sklearn/linear_model/tests/test_randomized_l1.py
index 4057ebdfa2ffdb24f96501d15e81db191210114c..95bffc0fbd33bd862ba7e14286001d0e80120636 100644
--- a/sklearn/linear_model/tests/test_randomized_l1.py
+++ b/sklearn/linear_model/tests/test_randomized_l1.py
@@ -14,7 +14,7 @@ from sklearn.linear_model.randomized_l1 import (lasso_stability_path,
 from sklearn.datasets import load_diabetes, load_iris
 from sklearn.feature_selection import f_regression, f_classif
 from sklearn.preprocessing import StandardScaler
-from sklearn.linear_model.base import center_data
+from sklearn.linear_model.base import _preprocess_data
 
 diabetes = load_diabetes()
 X = diabetes.data
@@ -111,7 +111,7 @@ def test_randomized_logistic_sparse():
 
     # center here because sparse matrices are usually not centered
     # labels should not be centered
-    X, _, _, _, _ = center_data(X, y, True, True)
+    X, _, _, _, _ = _preprocess_data(X, y, True, True)
 
     X_sp = sparse.csr_matrix(X)
 
diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py
index 44a1b267f2d75c867e71683631326002c170352a..ff54bba9716c46a35e90ccf3e972d2a7c77744b7 100644
--- a/sklearn/linear_model/tests/test_ridge.py
+++ b/sklearn/linear_model/tests/test_ridge.py
@@ -227,21 +227,21 @@ def test_toy_ridge_object():
     # TODO: test also n_samples > n_features
     X = np.array([[1], [2]])
     Y = np.array([1, 2])
-    clf = Ridge(alpha=0.0)
-    clf.fit(X, Y)
+    reg = Ridge(alpha=0.0)
+    reg.fit(X, Y)
     X_test = [[1], [2], [3], [4]]
-    assert_almost_equal(clf.predict(X_test), [1., 2, 3, 4])
+    assert_almost_equal(reg.predict(X_test), [1., 2, 3, 4])
 
-    assert_equal(len(clf.coef_.shape), 1)
-    assert_equal(type(clf.intercept_), np.float64)
+    assert_equal(len(reg.coef_.shape), 1)
+    assert_equal(type(reg.intercept_), np.float64)
 
     Y = np.vstack((Y, Y)).T
 
-    clf.fit(X, Y)
+    reg.fit(X, Y)
     X_test = [[1], [2], [3], [4]]
 
-    assert_equal(len(clf.coef_.shape), 2)
-    assert_equal(type(clf.intercept_), np.ndarray)
+    assert_equal(len(reg.coef_.shape), 2)
+    assert_equal(type(reg.intercept_), np.ndarray)
 
 
 def test_ridge_vs_lstsq():
@@ -417,16 +417,16 @@ def _test_multi_ridge_diabetes(filter_):
 def _test_ridge_classifiers(filter_):
     n_classes = np.unique(y_iris).shape[0]
     n_features = X_iris.shape[1]
-    for clf in (RidgeClassifier(), RidgeClassifierCV()):
-        clf.fit(filter_(X_iris), y_iris)
-        assert_equal(clf.coef_.shape, (n_classes, n_features))
-        y_pred = clf.predict(filter_(X_iris))
+    for reg in (RidgeClassifier(), RidgeClassifierCV()):
+        reg.fit(filter_(X_iris), y_iris)
+        assert_equal(reg.coef_.shape, (n_classes, n_features))
+        y_pred = reg.predict(filter_(X_iris))
         assert_greater(np.mean(y_iris == y_pred), .79)
 
     cv = KFold(5)
-    clf = RidgeClassifierCV(cv=cv)
-    clf.fit(filter_(X_iris), y_iris)
-    y_pred = clf.predict(filter_(X_iris))
+    reg = RidgeClassifierCV(cv=cv)
+    reg.fit(filter_(X_iris), y_iris)
+    y_pred = reg.predict(filter_(X_iris))
     assert_true(np.mean(y_iris == y_pred) >= 0.8)
 
 
@@ -481,63 +481,63 @@ def test_class_weights():
                   [1.0, 1.0], [1.0, 0.0]])
     y = [1, 1, 1, -1, -1]
 
-    clf = RidgeClassifier(class_weight=None)
-    clf.fit(X, y)
-    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
+    reg = RidgeClassifier(class_weight=None)
+    reg.fit(X, y)
+    assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1]))
 
     # we give a small weights to class 1
-    clf = RidgeClassifier(class_weight={1: 0.001})
-    clf.fit(X, y)
+    reg = RidgeClassifier(class_weight={1: 0.001})
+    reg.fit(X, y)
 
     # now the hyperplane should rotate clock-wise and
     # the prediction on this point should shift
-    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
+    assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([-1]))
 
     # check if class_weight = 'balanced' can handle negative labels.
-    clf = RidgeClassifier(class_weight='balanced')
-    clf.fit(X, y)
-    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
+    reg = RidgeClassifier(class_weight='balanced')
+    reg.fit(X, y)
+    assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1]))
 
     # class_weight = 'balanced', and class_weight = None should return
     # same values when y has equal number of all labels
     X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0]])
     y = [1, 1, -1, -1]
-    clf = RidgeClassifier(class_weight=None)
-    clf.fit(X, y)
-    clfa = RidgeClassifier(class_weight='balanced')
-    clfa.fit(X, y)
-    assert_equal(len(clfa.classes_), 2)
-    assert_array_almost_equal(clf.coef_, clfa.coef_)
-    assert_array_almost_equal(clf.intercept_, clfa.intercept_)
+    reg = RidgeClassifier(class_weight=None)
+    reg.fit(X, y)
+    rega = RidgeClassifier(class_weight='balanced')
+    rega.fit(X, y)
+    assert_equal(len(rega.classes_), 2)
+    assert_array_almost_equal(reg.coef_, rega.coef_)
+    assert_array_almost_equal(reg.intercept_, rega.intercept_)
 
 
 def test_class_weight_vs_sample_weight():
     """Check class_weights resemble sample_weights behavior."""
-    for clf in (RidgeClassifier, RidgeClassifierCV):
+    for reg in (RidgeClassifier, RidgeClassifierCV):
 
         # Iris is balanced, so no effect expected for using 'balanced' weights
-        clf1 = clf()
-        clf1.fit(iris.data, iris.target)
-        clf2 = clf(class_weight='balanced')
-        clf2.fit(iris.data, iris.target)
-        assert_almost_equal(clf1.coef_, clf2.coef_)
+        reg1 = reg()
+        reg1.fit(iris.data, iris.target)
+        reg2 = reg(class_weight='balanced')
+        reg2.fit(iris.data, iris.target)
+        assert_almost_equal(reg1.coef_, reg2.coef_)
 
         # Inflate importance of class 1, check against user-defined weights
         sample_weight = np.ones(iris.target.shape)
         sample_weight[iris.target == 1] *= 100
         class_weight = {0: 1., 1: 100., 2: 1.}
-        clf1 = clf()
-        clf1.fit(iris.data, iris.target, sample_weight)
-        clf2 = clf(class_weight=class_weight)
-        clf2.fit(iris.data, iris.target)
-        assert_almost_equal(clf1.coef_, clf2.coef_)
+        reg1 = reg()
+        reg1.fit(iris.data, iris.target, sample_weight)
+        reg2 = reg(class_weight=class_weight)
+        reg2.fit(iris.data, iris.target)
+        assert_almost_equal(reg1.coef_, reg2.coef_)
 
         # Check that sample_weight and class_weight are multiplicative
-        clf1 = clf()
-        clf1.fit(iris.data, iris.target, sample_weight ** 2)
-        clf2 = clf(class_weight=class_weight)
-        clf2.fit(iris.data, iris.target, sample_weight)
-        assert_almost_equal(clf1.coef_, clf2.coef_)
+        reg1 = reg()
+        reg1.fit(iris.data, iris.target, sample_weight ** 2)
+        reg2 = reg(class_weight=class_weight)
+        reg2.fit(iris.data, iris.target, sample_weight)
+        assert_almost_equal(reg1.coef_, reg2.coef_)
 
 
 def test_class_weights_cv():
@@ -546,14 +546,14 @@ def test_class_weights_cv():
                   [1.0, 1.0], [1.0, 0.0]])
     y = [1, 1, 1, -1, -1]
 
-    clf = RidgeClassifierCV(class_weight=None, alphas=[.01, .1, 1])
-    clf.fit(X, y)
+    reg = RidgeClassifierCV(class_weight=None, alphas=[.01, .1, 1])
+    reg.fit(X, y)
 
     # we give a small weights to class 1
-    clf = RidgeClassifierCV(class_weight={1: 0.001}, alphas=[.01, .1, 1, 10])
-    clf.fit(X, y)
+    reg = RidgeClassifierCV(class_weight={1: 0.001}, alphas=[.01, .1, 1, 10])
+    reg.fit(X, y)
 
-    assert_array_equal(clf.predict([[-.2, 2]]), np.array([-1]))
+    assert_array_equal(reg.predict([[-.2, 2]]), np.array([-1]))
 
 
 def test_ridgecv_store_cv_values():
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 094100cd7024cf85e655ea6ddad701791d667853..d447e715c0f27c3678fe78d6084fb8ce30804fd6 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1228,7 +1228,7 @@ class PolynomialFeatures(BaseEstimator, TransformerMixin):
         return XP
 
 
-def normalize(X, norm='l2', axis=1, copy=True):
+def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
     """Scale input vectors individually to unit norm (vector length).
 
     Read more in the :ref:`User Guide <preprocessing_normalization>`.
@@ -1253,6 +1253,9 @@ def normalize(X, norm='l2', axis=1, copy=True):
         copy (if the input is already a numpy array or a scipy.sparse
         CSR matrix and if axis is 1).
 
+    return_norm : boolean, default False
+        Whether to return the computed norms.
+
     See also
     --------
     :class:`sklearn.preprocessing.Normalizer` to perform normalization
@@ -1297,7 +1300,10 @@ def normalize(X, norm='l2', axis=1, copy=True):
     if axis == 0:
         X = X.T
 
-    return X
+    if return_norm:
+        return X, norms
+    else:
+        return X
 
 
 class Normalizer(BaseEstimator, TransformerMixin):
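
To close, a short illustration of the ``return_norm`` flag added to ``normalize`` above, assuming dense input and the default l2 norm (example code, not part of the patch)::

    import numpy as np
    from sklearn.preprocessing import normalize

    X = np.array([[3.0, 4.0],
                  [1.0, 0.0]])

    # With the new flag, the per-row l2 norms come back alongside the scaled data.
    X_normed, norms = normalize(X, norm='l2', axis=1, return_norm=True)
    print(norms)                              # [ 5.  1.]
    print(X_normed * norms[:, np.newaxis])    # recovers the original X
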