From 2653833a0790053887f8522de134f83a0fe1427f Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@nyu.edu>
Date: Tue, 3 Nov 2015 12:23:23 -0500
Subject: [PATCH] DOC some fixes to the doc build.

---
 doc/datasets/index.rst                        | 12 +--
 doc/datasets/rcv1.rst                         |  4 +-
 doc/modules/decomposition.rst                 |  1 +
 doc/modules/feature_selection.rst             |  2 +
 doc/modules/gaussian_process.rst              | 59 ++++++------
 doc/modules/multiclass.rst                    |  8 +-
 doc/modules/neural_networks_supervised.rst    |  6 +-
 doc/modules/outlier_detection.rst             |  1 +
 doc/whats_new.rst                             | 18 ++--
 examples/applications/face_recognition.py     |  4 +-
 examples/gaussian_process/plot_gpr_co2.py     | 48 +++++-----
 sklearn/cross_decomposition/pls_.py           | 23 +++--
 sklearn/datasets/descr/breast_cancer.rst      | 89 ++++++++++---------
 sklearn/datasets/descr/diabetes.rst           |  2 +-
 sklearn/datasets/descr/digits.rst             |  2 +-
 sklearn/datasets/descr/iris.rst               |  2 +
 sklearn/datasets/kddcup99.py                  |  4 +
 sklearn/gaussian_process/gpc.py               | 18 ++--
 sklearn/gaussian_process/gpr.py               | 10 +--
 sklearn/gaussian_process/kernels.py           | 10 +--
 sklearn/linear_model/base.py                  |  2 +-
 sklearn/model_selection/_validation.py        | 14 +--
 .../neural_network/multilayer_perceptron.py   |  4 +-
 sklearn/preprocessing/data.py                 | 14 +--
 sklearn/preprocessing/tests/test_data.py      |  8 +-
 25 files changed, 198 insertions(+), 167 deletions(-)

diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst
index e18cfdd152..e7925f3e94 100644
--- a/doc/datasets/index.rst
+++ b/doc/datasets/index.rst
@@ -267,26 +267,26 @@ features::
 
 .. include:: rcv1.rst
 
-.. _boston_house_prices
+.. _boston_house_prices:
 
 .. include:: ../../sklearn/datasets/descr/boston_house_prices.rst
 
-.. _breast_cancer
+.. _breast_cancer:
 
 .. include:: ../../sklearn/datasets/descr/breast_cancer.rst
 
-.. _diabetes
+.. _diabetes:
 
 .. include:: ../../sklearn/datasets/descr/diabetes.rst
 
-.. _digits
+.. _digits:
 
 .. include:: ../../sklearn/datasets/descr/digits.rst
 
-.. _iris
+.. _iris:
 
 .. include:: ../../sklearn/datasets/descr/iris.rst
 
-.. _linnerud
+.. _linnerud:
 
 .. include:: ../../sklearn/datasets/descr/linnerud.rst
diff --git a/doc/datasets/rcv1.rst b/doc/datasets/rcv1.rst
index 486eeee905..ded38584ce 100644
--- a/doc/datasets/rcv1.rst
+++ b/doc/datasets/rcv1.rst
@@ -41,10 +41,10 @@ There are 103 topics, each represented by a string. Their corpus frequencies spa
     >>> rcv1.target_names[:3].tolist()  # doctest: +SKIP
     ['E11', 'ECAT', 'M11']
 
-The dataset will be downloaded from the `dataset's homepage`_ if necessary.
+The dataset will be downloaded from the `rcv1 homepage`_ if necessary.
 The compressed size is about 656 MB.
 
-.. _dataset's homepage: http://jmlr.csail.mit.edu/papers/volume5/lewis04a/
+.. _rcv1 homepage: http://jmlr.csail.mit.edu/papers/volume5/lewis04a/
 
 
 .. topic:: References
diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index 91d003ce70..f10e105664 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -776,6 +776,7 @@ a corpus with :math:`D` documents and :math:`K` topics:
   2. For each document :math:`d`, draw :math:`\theta_d \sim Dirichlet(\alpha), \: d=1...D`
 
   3. For each word :math:`i` in document :math:`d`:
+
     a. Draw a topic index :math:`z_{di} \sim Multinomial(\theta_d)`
     b. Draw the observed word :math:`w_{ij} \sim Multinomial(\beta_{z_{di}})`
 
diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst
index 88ff7d56d6..60e4d0a38f 100644
--- a/doc/modules/feature_selection.rst
+++ b/doc/modules/feature_selection.rst
@@ -153,6 +153,8 @@ For examples on how it is to be used refer to the sections below.
       most important features from the Boston dataset without knowing the
       threshold beforehand.
 
+.. _l1_feature_selection:
+
 L1-based feature selection
 --------------------------
 
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 52049efc16..b710564dbb 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -67,12 +67,15 @@ level from the data (see example below).
 
 The implementation is based on Algorithm 2.1 of [RW2006]_. In addition to
 the API of standard sklearn estimators, GaussianProcessRegressor:
-     * allows prediction without prior fitting (based on the GP prior)
-     * provides an additional method ``sample_y(X)``, which evaluates samples
-       drawn from the GPR (prior or posterior) at given inputs
-     * exposes a method ``log_marginal_likelihood(theta)``, which can be used
-       externally for other ways of selecting hyperparameters, e.g., via
-       Markov chain Monte Carlo.
+
+* allows prediction without prior fitting (based on the GP prior)
+
+* provides an additional method ``sample_y(X)``, which evaluates samples
+  drawn from the GPR (prior or posterior) at given inputs
+
+* exposes a method ``log_marginal_likelihood(theta)``, which can be used
+  externally for other ways of selecting hyperparameters, e.g., via
+  Markov chain Monte Carlo.
 
 
 GPR examples
@@ -171,26 +174,30 @@ model the CO2 concentration as a function of the time t.
 
 The kernel is composed of several terms that are responsible for explaining
 different properties of the signal:
- - a long term, smooth rising trend is to be explained by an RBF kernel. The
-   RBF kernel with a large length-scale enforces this component to be smooth;
-   it is not enforced that the trend is rising which leaves this choice to the
-   GP. The specific length-scale and the amplitude are free hyperparameters.
- - a seasonal component, which is to be explained by the periodic
-   ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale
-   of this periodic component, controlling its smoothness, is a free parameter.
-   In order to allow decaying away from exact periodicity, the product with an
-   RBF kernel is taken. The length-scale of this RBF component controls the
-   decay time and is a further free parameter.
- - smaller, medium term irregularities are to be explained by a
-   RationalQuadratic kernel component, whose length-scale and alpha parameter,
-   which determines the diffuseness of the length-scales, are to be determined.
-   According to [RW2006]_, these irregularities can better be explained by
-   a RationalQuadratic than an RBF kernel component, probably because it can
-   accommodate several length-scales.
- - a "noise" term, consisting of an RBF kernel contribution, which shall
-   explain the correlated noise components such as local weather phenomena,
-   and a WhiteKernel contribution for the white noise. The relative amplitudes
-   and the RBF's length scale are further free parameters.
+
+- a long-term, smooth rising trend is to be explained by an RBF kernel. The
+  RBF kernel with a large length-scale enforces this component to be smooth;
+  it is not enforced that the trend is rising which leaves this choice to the
+  GP. The specific length-scale and the amplitude are free hyperparameters.
+
+- a seasonal component, which is to be explained by the periodic
+  ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale
+  of this periodic component, controlling its smoothness, is a free parameter.
+  In order to allow decaying away from exact periodicity, the product with an
+  RBF kernel is taken. The length-scale of this RBF component controls the
+  decay time and is a further free parameter.
+
+- smaller, medium-term irregularities are to be explained by a
+  RationalQuadratic kernel component, whose length-scale and alpha parameter,
+  which determines the diffuseness of the length-scales, are to be determined.
+  According to [RW2006]_, these irregularities can better be explained by
+  a RationalQuadratic than an RBF kernel component, probably because it can
+  accommodate several length-scales.
+
+- a "noise" term, consisting of an RBF kernel contribution, which shall
+  explain the correlated noise components such as local weather phenomena,
+  and a WhiteKernel contribution for the white noise. The relative amplitudes
+  and the RBF's length scale are further free parameters.
 
 Maximizing the log-marginal-likelihood after subtracting the target's mean
 yields the following kernel with an LML of -83.214:
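
For reference, a minimal sketch exercising the three GaussianProcessRegressor
extras listed in the bullet list above; the toy data is an assumption for
illustration only::

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF

    X = np.linspace(0, 5, 20)[:, np.newaxis]
    y = np.sin(X).ravel()

    gpr = GaussianProcessRegressor(kernel=RBF(length_scale=1.0))
    y_prior = gpr.predict(X)                   # prediction without fitting (GP prior)
    gpr.fit(X, y)
    y_samples = gpr.sample_y(X, n_samples=3)   # draws from the posterior at X
    lml = gpr.log_marginal_likelihood(gpr.kernel_.theta)  # e.g. for MCMC
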
diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst
index 49ea0d588e..9db951f4c4 100644
--- a/doc/modules/multiclass.rst
+++ b/doc/modules/multiclass.rst
@@ -215,7 +215,7 @@ code book. The code size is the dimensionality of the aforementioned space.
 Intuitively, each class should be represented by a code as unique as
 possible and a good code book should be designed to optimize classification
 accuracy. In this implementation, we simply use a randomly-generated code
-book as advocated in [2]_ although more elaborate methods may be added in the
+book as advocated in [3]_ although more elaborate methods may be added in the
 future.
 
 At fitting time, one binary classifier per bit in the code book is fitted.
@@ -262,16 +262,16 @@ Below is an example of multiclass learning using Output-Codes::
 
 .. topic:: References:
 
-    .. [1] "Solving multiclass learning problems via error-correcting output codes",
+    .. [2] "Solving multiclass learning problems via error-correcting output codes",
         Dietterich T., Bakiri G.,
         Journal of Artificial Intelligence Research 2,
         1995.
 
-    .. [2] "The error coding method and PICTs",
+    .. [3] "The error coding method and PICTs",
         James G., Hastie T.,
         Journal of Computational and Graphical statistics 7,
         1998.
 
-    .. [3] "The Elements of Statistical Learning",
+    .. [4] "The Elements of Statistical Learning",
         Hastie T., Tibshirani R., Friedman J., page 606 (second-edition)
         2008.
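
The usage example referenced at the top of this hunk is elided from the
diff; for context, the output-code pattern looks roughly like this (iris
data assumed for illustration, ``code_size`` chosen arbitrarily)::

    from sklearn.datasets import load_iris
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC

    iris = load_iris()
    # A randomly generated code book, as described above; code_size sets
    # the code length relative to the number of classes.
    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2, random_state=0)
    clf.fit(iris.data, iris.target).predict(iris.data[:5])
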
diff --git a/doc/modules/neural_networks_supervised.rst b/doc/modules/neural_networks_supervised.rst
index a921c2d975..cc17be204c 100644
--- a/doc/modules/neural_networks_supervised.rst
+++ b/doc/modules/neural_networks_supervised.rst
@@ -153,7 +153,7 @@ See the examples below and the doc string of
 
 .. topic:: Examples:
 
- * :ref:`example_plot_mlp_alpha.py`
+ * :ref:`example_neural_networks_plot_mlp_alpha.py`
 
 
 Regression
@@ -175,7 +175,7 @@ Algorithms
 MLP trains using `Stochastic Gradient Descent
 <http://en.wikipedia.org/wiki/Stochastic_gradient_descent>`_,
 `Adam <http://arxiv.org/abs/1412.6980>`_, or
-`L-BFGS <http://en.wikipedia.org/wiki/Limited-memory_BFGS>`_.
+`L-BFGS <http://en.wikipedia.org/wiki/Limited-memory_BFGS>`__.
 Stochastic Gradient Descent (SGD) updates parameters using the gradient of the
 loss function with respect to a parameter that needs adaptation, i.e.
 
@@ -201,7 +201,7 @@ L-BFGS is a fast learning algorithm that approximates the Hessian matrix which
 represents the second-order partial derivatives of a function. Further, it
 approximates the inverse of the Hessian matrix to perform parameter updates.
 The implementation uses the Scipy version of
-`L-BFGS <http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.fmin_l_bfgs_b.html>`_..
+`L-BFGS <http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.fmin_l_bfgs_b.html>`__.
 
 If the selected algorithm is 'L-BFGS', training supports neither online nor
 mini-batch learning.
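
For the SGD update rule described in this hunk, a schematic numpy version
(placeholder gradients; not the estimator's actual implementation)::

    import numpy as np

    rng = np.random.RandomState(0)
    w = rng.randn(3)           # current parameters
    grad_loss = rng.randn(3)   # gradient of the loss w.r.t. w (placeholder)
    alpha, eta = 1e-4, 0.01    # L2 penalty strength and learning rate
    # One SGD step: w <- w - eta * (alpha * dR/dw + dLoss/dw),
    # with dR/dw = w for the L2 regularizer R(w) = ||w||^2 / 2.
    w -= eta * (alpha * w + grad_loss)
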
diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst
index ff24e01225..93de5ad490 100644
--- a/doc/modules/outlier_detection.rst
+++ b/doc/modules/outlier_detection.rst
@@ -169,6 +169,7 @@ This strategy is illustrated below.
      :class:`covariance.MinCovDet`.
 
 .. topic:: References:
+
     .. [LTZ2008] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
            Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
 
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 8587580795..6acd6b8021 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -33,7 +33,7 @@ Enhancements
      that takes in the data and yields a generator for the different splits.
      This change makes it possible to do nested cross-validation with ease,
      facilitated by :class:`model_selection.GridSearchCV` and similar
-     utilities.  (`#4294 https://github.com/scikit-learn/scikit-learn/pull/4294>`_) by `Raghav R V`_.
+     utilities.  (`#4294 <https://github.com/scikit-learn/scikit-learn/pull/4294>`_) by `Raghav R V`_.
 
    - The random forest, extra trees and decision tree estimators now have a
      method ``decision_path`` which returns the decision path of samples in
@@ -56,16 +56,16 @@ Bug fixes
 .........
 
     - The default number of `iterated_power` iterations in :class:`RandomizedPCA` is 2 instead of 3.
-      This is a speed up with a minor precision decrease. (`#5141 https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
+      This is a speed-up with a minor precision decrease. (`#5141 <https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
 
     - :func:`randomized_svd` performs 2 power iterations by default, instead of 0.
       In practice this is often enough for obtaining a good approximation of the
-      true eigenvalues/vectors in the presence of noise. (`#5141 https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
+      true eigenvalues/vectors in the presence of noise. (`#5141 <https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
 
     - :func:`randomized_range_finder` is more numerically stable when many
       power iterations are requested, since it applies LU normalization by default.
       If `n_iter < 2`, numerical issues are unlikely, so no normalization is applied.
-      Other normalization options are available: 'none', 'LU' and 'QR'. (`#5141 https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
+      Other normalization options are available: 'none', 'LU' and 'QR'. (`#5141 <https://github.com/scikit-learn/scikit-learn/pull/5141>`_) by `Giorgio Patrini`_.
 
     - Fixed bug in :func:`manifold.spectral_embedding` where the diagonal of the
       unnormalized Laplacian matrix was incorrectly set to 1. By `Peter Fischer`_.
@@ -85,7 +85,7 @@ API changes summary
    - The :mod:`cross_validation`, :mod:`grid_search` and :mod:`learning_curve`
      modules have been deprecated and their classes and functions have been
      reorganized into the :mod:`model_selection` module.
-     (`#4294 https://github.com/scikit-learn/scikit-learn/pull/4294>`_) by `Raghav R V`_.
+     (`#4294 <https://github.com/scikit-learn/scikit-learn/pull/4294>`_) by `Raghav R V`_.
 
 
 .. _changes_0_17:
@@ -366,7 +366,7 @@ Bug fixes
 
     - Fixed bug in :class:`cross_decomposition.PLS` that yielded unstable and
       platform-dependent output, and failed on `fit_transform`.
-       By `Arthur Mensch`_.
+      By `Arthur Mensch`_.
 
     - Fixed a bug in :class:`linear_model.LogisticRegression` and
       :class:`linear_model.LogisticRegressionCV` when using
@@ -3403,8 +3403,8 @@ Changelog
 
   - New :ref:`gaussian_process` module by Vincent Dubourg. This module
     also has great documentation and some very neat examples. See
-    :ref:`example_gaussian_process_plot_gp_regression.py` or
-    :ref:`example_gaussian_process_plot_gp_probabilistic_classification_after_regression.py`
+    example_gaussian_process_plot_gp_regression.py or
+    example_gaussian_process_plot_gp_probabilistic_classification_after_regression.py
     for a taste of what can be done.
 
   - It is now possible to use liblinear’s Multi-class SVC (option
@@ -3866,4 +3866,4 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Graham Clenaghan: https://github.com/gclenaghan
 .. _Giorgio Patrini: https://github.com/giorgiop
 .. _Elvis Dohmatob: https://github.com/dohmatob
-.. _yelite https://github.com/yelite
+.. _yelite: https://github.com/yelite
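
The #4294 entry above notes that the reworked CV splitters make nested
cross-validation straightforward; a minimal sketch (data, estimator and
grid are assumptions for illustration)::

    from sklearn.datasets import load_iris
    from sklearn.model_selection import GridSearchCV, cross_val_score
    from sklearn.svm import SVC

    iris = load_iris()
    # The inner loop (cv=3) tunes C; the outer loop (cv=4) scores the tuned model.
    grid = GridSearchCV(SVC(), param_grid={"C": [1, 10]}, cv=3)
    scores = cross_val_score(grid, iris.data, iris.target, cv=4)
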
diff --git a/examples/applications/face_recognition.py b/examples/applications/face_recognition.py
index b79599ecb3..b23dda4749 100644
--- a/examples/applications/face_recognition.py
+++ b/examples/applications/face_recognition.py
@@ -12,8 +12,9 @@ The dataset used in this example is a preprocessed excerpt of the
 
 Expected results for the top 5 most represented people in the dataset::
 
+================== ============ ======= ========== =======
                    precision    recall  f1-score   support
-
+================== ============ ======= ========== =======
      Ariel Sharon       0.67      0.92      0.77        13
      Colin Powell       0.75      0.78      0.76        60
   Donald Rumsfeld       0.78      0.67      0.72        27
@@ -23,6 +24,7 @@ Gerhard Schroeder       0.76      0.76      0.76        25
        Tony Blair       0.81      0.69      0.75        36
 
       avg / total       0.80      0.80      0.80       322
+================== ============ ======= ========== =======
 
 """
 from __future__ import print_function
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index 07f1d8214d..b0b271a364 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -13,34 +13,40 @@ model the CO2 concentration as a function of the time t.
 
 The kernel is composed of several terms that are responsible for explaining
 different properties of the signal:
- - a long term, smooth rising trend is to be explained by an RBF kernel. The
-   RBF kernel with a large length-scale enforces this component to be smooth;
-   it is not enforced that the trend is rising which leaves this choice to the
-   GP. The specific length-scale and the amplitude are free hyperparameters.
- - a seasonal component, which is to be explained by the periodic
-   ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale
-   of this periodic component, controlling its smoothness, is a free parameter.
-   In order to allow decaying away from exact periodicity, the product with an
-   RBF kernel is taken. The length-scale of this RBF component controls the
-   decay time and is a further free parameter.
- - smaller, medium term irregularities are to be explained by a
-   RationalQuadratic kernel component, whose length-scale and alpha parameter,
-   which determines the diffuseness of the length-scales, are to be determined.
-   According to [RW2006], these irregularities can better be explained by
-   a RationalQuadratic than an RBF kernel component, probably because it can
-   accommodate several length-scales.
- - a "noise" term, consisting of an RBF kernel contribution, which shall
-   explain the correlated noise components such as local weather phenomena,
-   and a WhiteKernel contribution for the white noise. The relative amplitudes
-   and the RBF's length scale are further free parameters.
+
+- a long-term, smooth rising trend is to be explained by an RBF kernel. The
+  RBF kernel with a large length-scale enforces this component to be smooth;
+  it is not enforced that the trend is rising which leaves this choice to the
+  GP. The specific length-scale and the amplitude are free hyperparameters.
+
+- a seasonal component, which is to be explained by the periodic
+  ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale
+  of this periodic component, controlling its smoothness, is a free parameter.
+  In order to allow decaying away from exact periodicity, the product with an
+  RBF kernel is taken. The length-scale of this RBF component controls the
+  decay time and is a further free parameter.
+
+- smaller, medium-term irregularities are to be explained by a
+  RationalQuadratic kernel component, whose length-scale and alpha parameter,
+  which determines the diffuseness of the length-scales, are to be determined.
+  According to [RW2006], these irregularities can better be explained by
+  a RationalQuadratic than an RBF kernel component, probably because it can
+  accommodate several length-scales.
+
+- a "noise" term, consisting of an RBF kernel contribution, which shall
+  explain the correlated noise components such as local weather phenomena,
+  and a WhiteKernel contribution for the white noise. The relative amplitudes
+  and the RBF's length scale are further free parameters.
 
 Maximizing the log-marginal-likelihood after subtracting the target's mean
-yields the following kernel with an LML of -83.214:
+yields the following kernel with an LML of -83.214::
+
    34.4**2 * RBF(length_scale=41.8)
    + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44,
                                                       periodicity=1)
    + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957)
    + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336)
+
 Thus, most of the target signal (34.4ppm) is explained by a long-term rising
 trend (length-scale 41.8 years). The periodic component has an amplitude of
 3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay
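
For reference, the quoted kernel can be written down directly with the
kernel algebra; the hyperparameter values below are the fitted ones quoted
above, plugged in verbatim::

    from sklearn.gaussian_process.kernels import (
        RBF, ExpSineSquared, RationalQuadratic, WhiteKernel)

    k1 = 34.4 ** 2 * RBF(length_scale=41.8)              # long-term trend
    k2 = 3.27 ** 2 * RBF(length_scale=180) * ExpSineSquared(
        length_scale=1.44, periodicity=1)                # seasonal component
    k3 = 0.446 ** 2 * RationalQuadratic(
        alpha=17.7, length_scale=0.957)                  # medium-term irregularities
    k4 = 0.197 ** 2 * RBF(length_scale=0.138) \
        + WhiteKernel(noise_level=0.0336)                # correlated and white noise
    kernel = k1 + k2 + k3 + k4
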
diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py
index fa481bb4f5..4d77bc2b32 100644
--- a/sklearn/cross_decomposition/pls_.py
+++ b/sklearn/cross_decomposition/pls_.py
@@ -521,7 +521,8 @@ class PLSRegression(_PLS):
 
     Notes
     -----
-    Matrices :
+    Matrices::
+
         T: x_scores_
         U: y_scores_
         W: x_weights_
@@ -529,16 +530,17 @@ class PLSRegression(_PLS):
         P: x_loadings_
         Q: y_loadings_
 
-    Are computed such that:
+    Are computed such that::
+
         X = T P.T + Err and Y = U Q.T + Err
         T[:, k] = Xk W[:, k] for k in range(n_components)
         U[:, k] = Yk C[:, k] for k in range(n_components)
         x_rotations_ = W (P.T W)^(-1)
         y_rotations_ = C (Q.T C)^(-1)
+
     where Xk and Yk are residual matrices at iteration k.
 
-    Slides explaining PLS
-    :ref:http://www.eigenvector.com/Docs/Wise_pls_properties.pdf
+    `Slides explaining PLS <http://www.eigenvector.com/Docs/Wise_pls_properties.pdf>`_
 
     For each component k, find weights u, v that optimize:
     ``max corr(Xk u, Yk v) * std(Xk u) std(Yk v)``, such that ``|u| = 1``
@@ -656,7 +658,8 @@ class PLSCanonical(_PLS):
 
     Notes
     -----
-    Matrices :
+    Matrices::
+
         T: x_scores_
         U: y_scores_
         W: x_weights_
@@ -664,19 +667,21 @@ class PLSCanonical(_PLS):
         P: x_loadings_
         Q: y_loadings_
 
-    Are computed such that:
+    Are computed such that::
+
         X = T P.T + Err and Y = U Q.T + Err
         T[:, k] = Xk W[:, k] for k in range(n_components)
         U[:, k] = Yk C[:, k] for k in range(n_components)
         x_rotations_ = W (P.T W)^(-1)
         y_rotations_ = C (Q.T C)^(-1)
+
     where Xk and Yk are residual matrices at iteration k.
 
-    Slides explaining PLS
-    :ref:http://www.eigenvector.com/Docs/Wise_pls_properties.pdf
+    `Slides explaining PLS <http://www.eigenvector.com/Docs/Wise_pls_properties.pdf>`_
 
     For each component k, find weights u, v that optimize::
-    max corr(Xk u, Yk v) * std(Xk u) std(Yk u), such that ``|u| = |v| = 1``
+
+        max corr(Xk u, Yk v) * std(Xk u) std(Yk v), such that |u| = |v| = 1
 
     Note that it maximizes both the correlations between the scores and the
     intra-block variances.
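
A short sketch tying the docstring's matrix names to estimator attributes
(random data is an assumption for illustration)::

    import numpy as np
    from sklearn.cross_decomposition import PLSRegression

    rng = np.random.RandomState(0)
    X, Y = rng.randn(20, 5), rng.randn(20, 3)
    pls = PLSRegression(n_components=2).fit(X, Y)
    T, U = pls.x_scores_, pls.y_scores_      # scores
    W, P = pls.x_weights_, pls.x_loadings_   # weights and loadings
    # "X = T P.T + Err" above holds for the centered and scaled X,
    # up to the residual matrix Err.
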
diff --git a/sklearn/datasets/descr/breast_cancer.rst b/sklearn/datasets/descr/breast_cancer.rst
index 642484f5e3..6e3dfc708f 100644
--- a/sklearn/datasets/descr/breast_cancer.rst
+++ b/sklearn/datasets/descr/breast_cancer.rst
@@ -19,51 +19,52 @@ Data Set Characteristics:
         - concave points (number of concave portions of the contour)
         - symmetry 
         - fractal dimension ("coastline approximation" - 1)
-		
-		The mean, standard error, and "worst" or largest (mean of the three
-		largest values) of these features were computed for each image,
-		resulting in 30 features.  For instance, field 3 is Mean Radius, field
-		13 is Radius SE, field 23 is Worst Radius.
-		
+        
+        The mean, standard error, and "worst" or largest (mean of the three
+        largest values) of these features were computed for each image,
+        resulting in 30 features.  For instance, field 3 is Mean Radius, field
+        13 is Radius SE, field 23 is Worst Radius.
+        
         - class:
                 - WDBC-Malignant
                 - WDBC-Benign
 
     :Summary Statistics:
-    ===================================== ====== ======
-										   Min    Max
-    ===================================== ====== ====== 
-    radius (mean):   					  6.981  28.11
-    texture (mean):    					  9.71   39.28
-    perimeter (mean):   				  43.79  188.5
-    area (mean):    					  143.5  2501.0
-	smoothness (mean):					  0.053  0.163
-	compactness (mean):					  0.019  0.345
-	concavity (mean):					  0.0    0.427
-	concave points (mean):				  0.0	 0.201
-	symmetry (mean): 					  0.106  0.304
-	fractal dimension (mean):			  0.05	 0.097
-    radius (standard error):   			  0.112  2.873
-    texture (standard error):    		  0.36	 4.885
-    perimeter (standard error):   		  0.757  21.98
-    area (standard error):				  6.802  542.2
-	smoothness (standard error):		  0.002	 0.031
-	compactness (standard error):		  0.002  0.135
-	concavity (standard error):			  0.0    0.396
-	concave points (standard error):	  0.0	 0.053
-	symmetry (standard error):			  0.008  0.079
-	fractal dimension (standard error):   0.001  0.03
-    radius (worst):   					  7.93	 36.04
-    texture (worst):    				  12.02  49.54
-    perimeter (worst):   				  50.41  251.2
-    area (worst):    					  185.2  4254.0
-	smoothness (worst):					  0.071  0.223
-	compactness (worst):				  0.027  1.058
-	concavity (worst):					  0.0    1.252
-	concave points (worst):				  0.0    0.291
-	symmetry (worst): 					  0.156  0.664
-	fractal dimension (worst):			  0.055	 0.208
-    ===================================== ====== ======
+
+    ===================================== ======= ========
+                                           Min     Max
+    ===================================== ======= ========
+    radius (mean):                         6.981   28.11
+    texture (mean):                        9.71    39.28
+    perimeter (mean):                      43.79   188.5
+    area (mean):                           143.5   2501.0
+    smoothness (mean):                     0.053   0.163
+    compactness (mean):                    0.019   0.345
+    concavity (mean):                      0.0     0.427
+    concave points (mean):                 0.0     0.201
+    symmetry (mean):                       0.106   0.304
+    fractal dimension (mean):              0.05    0.097
+    radius (standard error):               0.112   2.873
+    texture (standard error):              0.36    4.885
+    perimeter (standard error):            0.757   21.98
+    area (standard error):                 6.802   542.2
+    smoothness (standard error):           0.002   0.031
+    compactness (standard error):          0.002   0.135
+    concavity (standard error):            0.0     0.396
+    concave points (standard error):       0.0     0.053
+    symmetry (standard error):             0.008   0.079
+    fractal dimension (standard error):    0.001   0.03
+    radius (worst):                        7.93    36.04
+    texture (worst):                       12.02   49.54
+    perimeter (worst):                     50.41   251.2
+    area (worst):                          185.2   4254.0
+    smoothness (worst):                    0.071   0.223
+    compactness (worst):                   0.027   1.058
+    concavity (worst):                     0.0     1.252
+    concave points (worst):                0.0     0.291
+    symmetry (worst):                      0.156   0.664
+    fractal dimension (worst):             0.055   0.208
+    ===================================== ======= ========
 
     :Missing Attribute Values: None
 
@@ -108,11 +109,11 @@ References
 ----------
    - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction 
      for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on 
-	 Electronic Imaging: Science and Technology, volume 1905, pages 861-870, 
-	 San Jose, CA, 1993. 
+     Electronic Imaging: Science and Technology, volume 1905, pages 861-870, 
+     San Jose, CA, 1993. 
    - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and 
      prognosis via linear programming. Operations Research, 43(4), pages 570-577, 
-	 July-August 1995.
+     July-August 1995.
    - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques
      to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) 
-	 163-171.
\ No newline at end of file
+     163-171.
diff --git a/sklearn/datasets/descr/diabetes.rst b/sklearn/datasets/descr/diabetes.rst
index 192d6c055e..df102a1bec 100644
--- a/sklearn/datasets/descr/diabetes.rst
+++ b/sklearn/datasets/descr/diabetes.rst
@@ -29,7 +29,7 @@ Data Set Characteristics:
     :S5:
     :S6:
 
-*Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).
+Note: Each of these 10 feature variables has been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).
 
 Source URL:
 http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
diff --git a/sklearn/datasets/descr/digits.rst b/sklearn/datasets/descr/digits.rst
index 32aaa4de35..a30514474f 100644
--- a/sklearn/datasets/descr/digits.rst
+++ b/sklearn/datasets/descr/digits.rst
@@ -1,4 +1,4 @@
- Optical Recognition of Handwritten Digits Data Set
+Optical Recognition of Handwritten Digits Data Set
 ===================================================
 
 Notes
diff --git a/sklearn/datasets/descr/iris.rst b/sklearn/datasets/descr/iris.rst
index aa44f6e5ef..6e7aba2ec5 100644
--- a/sklearn/datasets/descr/iris.rst
+++ b/sklearn/datasets/descr/iris.rst
@@ -16,6 +16,7 @@ Data Set Characteristics:
                 - Iris-Versicolour
                 - Iris-Virginica
     :Summary Statistics:
+
     ============== ==== ==== ======= ===== ====================
                     Min  Max   Mean    SD   Class Correlation
     ============== ==== ==== ======= ===== ====================
@@ -24,6 +25,7 @@ Data Set Characteristics:
     petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
     petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)
     ============== ==== ==== ======= ===== ====================
+
     :Missing Attribute Values: None
     :Class Distribution: 33.3% for each of 3 classes.
     :Creator: R.A. Fisher
diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py
index 9e7696f68c..589749851e 100644
--- a/sklearn/datasets/kddcup99.py
+++ b/sklearn/datasets/kddcup99.py
@@ -81,6 +81,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
     ================      ==========================================
 
     SA structure:
+
     ================      ==========================================
     Samples total         976158
     Dimensionality        41
@@ -89,6 +90,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
     ================      ==========================================
 
     SF structure:
+
     ================      ==========================================
     Samples total         699691
     Dimensionality        40
@@ -97,6 +99,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
     ================      ==========================================
 
     http structure:
+
     ================      ==========================================
     Samples total         619052
     Dimensionality        39
@@ -105,6 +108,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
     ================      ==========================================
 
     smtp structure:
+
     ================      ==========================================
     Samples total         95373
     Dimensionality        39
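
A minimal sketch of fetching one of the subsets tabulated above, assuming
the loader is exported from ``sklearn.datasets``; note the dataset is
downloaded on the first call, which can take a while::

    from sklearn.datasets import fetch_kddcup99

    data = fetch_kddcup99(subset='http', shuffle=True, random_state=0)
    X, y = data.data, data.target
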
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index f9c00a98be..745fab94a1 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -137,7 +137,7 @@ class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
         of sqrt(W) is stored.
 
     log_marginal_likelihood_value_: float
-        The log-marginal-likelihood of self.kernel_.theta
+        The log-marginal-likelihood of ``self.kernel_.theta``
     """
     def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                  n_restarts_optimizer=0, max_iter_predict=100,
@@ -246,7 +246,7 @@ class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
         Returns
         -------
         C : array, shape = (n_samples,)
-            Predicted target values for X, values are from classes_
+            Predicted target values for X, values are from ``classes_``
         """
         check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"])
 
@@ -270,7 +270,7 @@ class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
         C : array-like, shape = (n_samples, n_classes)
             Returns the probability of the samples for each class in
             the model. The columns correspond to the classes in sorted
-            order, as they appear in the attribute `classes_`.
+            order, as they appear in the attribute ``classes_``.
         """
         check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"])
 
@@ -305,7 +305,7 @@ class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
         theta : array-like, shape = (n_kernel_params,) or None
             Kernel hyperparameters for which the log-marginal likelihood is
             evaluated. If None, the precomputed log_marginal_likelihood
-            of self.kernel_.theta is returned.
+            of ``self.kernel_.theta`` is returned.
 
         eval_gradient : bool, default: False
             If True, the gradient of the log-marginal likelihood with respect
@@ -437,7 +437,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
     """Gaussian process classification (GPC) based on Laplace approximation.
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
-    ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
+    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
     Williams.
 
     Internally, the Laplace approximation is used for approximating the
@@ -539,7 +539,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
         different kernels used in the one-versus-rest classifiers.
 
     log_marginal_likelihood_value_: float
-        The log-marginal-likelihood of self.kernel_.theta
+        The log-marginal-likelihood of ``self.kernel_.theta``
 
     classes_ : array-like, shape = (n_classes,)
         Unique class labels.
@@ -624,7 +624,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
         Returns
         -------
         C : array, shape = (n_samples,)
-            Predicted target values for X, values are from classes_
+            Predicted target values for X, values are from ``classes_``
         """
         check_is_fitted(self, ["classes_", "n_classes_"])
         X = check_array(X)
@@ -675,7 +675,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
             be the  hyperparameters of the compound kernel or of an individual
             kernel. In the latter case, all individual kernels get assigned the
             same theta values. If None, the precomputed log_marginal_likelihood
-            of self.kernel_.theta is returned.
+            of ``self.kernel_.theta`` is returned.
 
         eval_gradient : bool, default: False
             If True, the gradient of the log-marginal likelihood with respect
@@ -720,7 +720,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
                 # theta for compound kernel
                 return np.mean(
                     [estimator.log_marginal_likelihood(
-                        theta[n_dims*i:n_dims*(i+1)])
+                        theta[n_dims * i:n_dims * (i + 1)])
                      for i, estimator in enumerate(estimators)])
             else:
                 raise ValueError("Shape of theta must be either %d or %d. "
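
A small sketch of the ``classes_``/``predict_proba`` relationship noted in
the docstrings above (iris data assumed for illustration)::

    from sklearn.datasets import load_iris
    from sklearn.gaussian_process import GaussianProcessClassifier

    iris = load_iris()
    gpc = GaussianProcessClassifier().fit(iris.data, iris.target)
    proba = gpc.predict_proba(iris.data[:2])
    # Probability columns follow the sorted order of gpc.classes_.
    for label, p in zip(gpc.classes_, proba[0]):
        print(label, p)
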
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 85ff65b8a8..3a46153680 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -20,8 +20,8 @@ from sklearn.utils.validation import check_X_y, check_array
 class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     """Gaussian process regression (GPR).
 
-    The implementation is based on Algorithm 2.1 of ``Gaussian Processes
-    for Machine Learning'' (GPML) by Rasmussen and Williams.
+    The implementation is based on Algorithm 2.1 of Gaussian Processes
+    for Machine Learning (GPML) by Rasmussen and Williams.
 
     In addition to the standard sklearn estimator API, GaussianProcessRegressor:
 
@@ -115,13 +115,13 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
         same as the one passed as parameter but with optimized hyperparameters
 
     L_: array-like, shape = (n_samples, n_samples)
-        Lower-triangular Cholesky decomposition of the kernel in X_train_
+        Lower-triangular Cholesky decomposition of the kernel in ``X_train_``
 
     alpha_: array-like, shape = (n_samples,)
         Dual coefficients of training data points in kernel space
 
     log_marginal_likelihood_value_: float
-        The log-marginal-likelihood of self.kernel_.theta
+        The log-marginal-likelihood of ``self.kernel_.theta``
     """
     def __init__(self, kernel=None, alpha=1e-10,
                  optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0,
@@ -347,7 +347,7 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
         theta : array-like, shape = (n_kernel_params,) or None
             Kernel hyperparameters for which the log-marginal likelihood is
             evaluated. If None, the precomputed log_marginal_likelihood
-            of self.kernel_.theta is returned.
+            of ``self.kernel_.theta`` is returned.
 
         eval_gradient : bool, default: False
             If True, the gradient of the log-marginal likelihood with respect
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index a34286e358..6c36d19712 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -21,7 +21,6 @@ optimization.
 
 from abc import ABCMeta, abstractmethod
 from collections import namedtuple
-import inspect
 import math
 
 import numpy as np
@@ -33,13 +32,14 @@ from ..externals import six
 from ..base import clone
 from sklearn.externals.funcsigs import signature
 
+
 class Hyperparameter(namedtuple('Hyperparameter',
                                 ('name', 'value_type', 'bounds',
                                  'n_elements', 'fixed'))):
     """A kernel hyperparameter's specification in form of a namedtuple.
 
-    Entries
-    -------
+    Attributes
+    ----------
     name : string
         The name of the hyperparameter. Note that a kernel using a
         hyperparameter with name "x" must have the attributes self.x and
@@ -405,7 +405,7 @@ class CompoundKernel(Kernel):
         """
         k_dims = self.k1.n_dims
         for i, kernel in enumerate(self.kernels):
-            kernel.theta = theta[i*k_dims:(i+1)*k_dims]
+            kernel.theta = theta[i * k_dims:(i + 1) * k_dims]
 
     @property
     def bounds(self):
@@ -1316,7 +1316,7 @@ class Matern(RBF):
                     3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis]
             elif self.nu == 2.5:
                 tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis]
-                K_gradient = 5.0/3.0 * D * (tmp + 1) * np.exp(-tmp)
+                K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp)
             else:
                 # approximate gradient numerically
                 def f(theta):  # helper function
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index fa8678d913..4055a00027 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -403,7 +403,7 @@ class LinearRegression(LinearModel, RegressorMixin):
         self.n_jobs = n_jobs
 
     @property
-    @deprecated("residues_ is deprecated and will be removed in 0.19")
+    @deprecated("``residues_`` is deprecated and will be removed in 0.19")
     def residues_(self):
         """Get the residues of the fitted model."""
         return self._residues
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 05673b954e..190ec8c3a9 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -66,7 +66,7 @@ def cross_val_score(estimator, X, y=None, labels=None, scoring=None, cv=None,
                     pre_dispatch='2*n_jobs'):
     """Evaluate a score by cross-validation
 
-    Read more in the :ref:`User Guide <validate>`.
+    Read more in the :ref:`User Guide <cross_validation>`.
 
     Parameters
     ----------
@@ -295,7 +295,7 @@ def cross_val_predict(estimator, X, y=None, labels=None, cv=None, n_jobs=1,
                       verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
     """Generate cross-validated estimates for each input data point
 
-    Read more in the :ref:`User Guide <validate>`.
+    Read more in the :ref:`User Guide <cross_validation>`.
 
     Parameters
     ----------
@@ -394,7 +394,7 @@ def cross_val_predict(estimator, X, y=None, labels=None, cv=None, n_jobs=1,
 def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params):
     """Fit estimator and predict values for a given dataset split.
 
-    Read more in the :ref:`User Guide <validate>`.
+    Read more in the :ref:`User Guide <cross_validation>`.
 
     Parameters
     ----------
@@ -483,7 +483,7 @@ def permutation_test_score(estimator, X, y, labels=None, cv=None,
                            verbose=0, scoring=None):
     """Evaluate the significance of a cross-validated score with permutations
 
-    Read more in the :ref:`User Guide <validate>`.
+    Read more in the :ref:`User Guide <cross_validation>`.
 
     Parameters
     ----------
@@ -520,7 +520,7 @@ def permutation_test_score(estimator, X, y, labels=None, cv=None,
         See the :mod:`sklearn.model_selection` module for the list of
         cross-validation strategies that can be used here.
 
-        Also refer :ref:`cross-validation documentation <_cross_validation>`
+        Also refer to the :ref:`cross-validation documentation <cross_validation>`
 
     n_permutations : integer, optional
         Number of times to permute ``y``.
@@ -618,7 +618,7 @@ def learning_curve(estimator, X, y, labels=None,
     test set will be computed. Afterwards, the scores will be averaged over
     all k runs for each training subset size.
 
-    Read more in the :ref:`User Guide <validate>`.
+    Read more in the :ref:`User Guide <learning_curve>`.
 
     Parameters
     ----------
@@ -836,7 +836,7 @@ def validation_curve(estimator, X, y, param_name, param_range, labels=None,
     will also compute training scores and is merely a utility for plotting the
     results.
 
-    Read more in the :ref:`User Guide <validate>`.
+    Read more in the :ref:`User Guide <learning_curve>`.
 
     Parameters
     ----------
diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index 3912f1f673..c5c6b13e40 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -724,7 +724,7 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin):
         -'constant' is a constant learning rate given by
          'learning_rate_init'.
 
-        -'invscaling' gradually decreases the learning rate 'learning_rate_' at
+        -'invscaling' gradually decreases the learning rate ``learning_rate_`` at
           each time step 't' using an inverse scaling exponent of 'power_t'.
           effective_learning_rate = learning_rate_init / pow(t, power_t)
 
@@ -1077,7 +1077,7 @@ class MLPRegressor(BaseMultilayerPerceptron, RegressorMixin):
         -'constant' is a constant learning rate given by
          'learning_rate_init'.
 
-        -'invscaling' gradually decreases the learning rate 'learning_rate_' at
+        -'invscaling' gradually decreases the learning rate ``learning_rate_`` at
           each time step 't' using an inverse scaling exponent of 'power_t'.
           effective_learning_rate = learning_rate_init / pow(t, power_t)
 
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 98c8b4acf9..3ee4cf91f1 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -233,7 +233,7 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
         Per feature maximum seen in the data
 
     data_range_ : ndarray, shape (n_features,)
-        Per feature range (data_max_ - data_min_) seen in the data
+        Per feature range ``(data_max_ - data_min_)`` seen in the data
     """
 
     def __init__(self, feature_range=(0, 1), copy=True):
@@ -242,13 +242,13 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
 
     @property
     @deprecated("Attribute data_range will be removed in "
-                "0.19. Use data_range_ instead")
+                "0.19. Use ``data_range_`` instead")
     def data_range(self):
         return self.data_range_
 
     @property
     @deprecated("Attribute data_min will be removed in "
-                "0.19. Use data_min_ instead")
+                "0.19. Use ``data_min_`` instead")
     def data_min(self):
         return self.data_min_
 
@@ -290,7 +290,7 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
 
         Parameters
         ----------
-        X : array-like, shape [n_samples_, n_features]
+        X : array-like, shape [n_samples, n_features]
             The data used to compute the per-feature minimum and maximum
             used for later scaling along the features axis.
 
@@ -504,7 +504,7 @@ class StandardScaler(BaseEstimator, TransformerMixin):
         self.copy = copy
 
     @property
-    @deprecated("Attribute std_ will be removed in 0.19. Use scale_ instead")
+    @deprecated("Attribute ``std_`` will be removed in 0.19. Use ``scale_`` instead")
     def std_(self):
         return self.scale_
 
@@ -551,7 +551,7 @@ class StandardScaler(BaseEstimator, TransformerMixin):
 
         Parameters
         ----------
-        X : {array-like, sparse matrix}, shape [n_samples_, n_features]
+        X : {array-like, sparse matrix}, shape [n_samples, n_features]
             The data used to compute the mean and standard deviation
             used for later scaling along the features axis.
 
@@ -742,7 +742,7 @@ class MaxAbsScaler(BaseEstimator, TransformerMixin):
 
         Parameters
         ----------
-        X : {array-like, sparse matrix}, shape [n_samples_, n_features]
+        X : {array-like, sparse matrix}, shape [n_samples, n_features]
             The data used to compute the per-feature maximum absolute value
             used for later scaling along the features axis.
 
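
A short sketch of the renamed attributes referenced in the deprecation
messages above (toy data assumed for illustration)::

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    X = np.array([[1., 2.], [3., 6.], [5., 4.]])
    scaler = MinMaxScaler().fit(X)
    scaler.data_min_    # per-feature minimum:   [1., 2.]
    scaler.data_max_    # per-feature maximum:   [5., 6.]
    scaler.data_range_  # data_max_ - data_min_: [4., 4.]
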
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index d87f92d10c..30a85c8ee7 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -956,13 +956,13 @@ def test_deprecation_minmax_scaler():
     scaler = MinMaxScaler().fit(X)
 
     depr_message = ("Attribute data_range will be removed in "
-                    "0.19. Use data_range_ instead")
+                    "0.19. Use ``data_range_`` instead")
     data_range = assert_warns_message(DeprecationWarning, depr_message,
                                       getattr, scaler, "data_range")
     assert_array_equal(data_range, scaler.data_range)
 
     depr_message = ("Attribute data_min will be removed in "
-                    "0.19. Use data_min_ instead")
+                    "0.19. Use ``data_min_`` instead")
     data_min = assert_warns_message(DeprecationWarning, depr_message,
                                     getattr, scaler, "data_min")
     assert_array_equal(data_min, scaler.data_min)
@@ -1322,8 +1322,8 @@ def test_deprecation_standard_scaler():
     rng = np.random.RandomState(0)
     X = rng.random_sample((5, 4))
     scaler = StandardScaler().fit(X)
-    depr_message = ("Function std_ is deprecated; Attribute std_ will be "
-                    "removed in 0.19. Use scale_ instead")
+    depr_message = ("Function std_ is deprecated; Attribute ``std_`` will be "
+                    "removed in 0.19. Use ``scale_`` instead")
     std_ = assert_warns_message(DeprecationWarning, depr_message, getattr,
                                 scaler, "std_")
     assert_array_equal(std_, scaler.scale_)
-- 
GitLab