diff --git a/doc/faq.rst b/doc/faq.rst
index d5d15a1ed024325d096a4a77807e2e490e58991a..16101bc5c9ba7ed767350c7b4c9de56c11677828 100644
--- a/doc/faq.rst
+++ b/doc/faq.rst
@@ -127,6 +127,7 @@ together with scikit-learn tools. You can implement your favorite algorithm in
 a scikit-learn compatible way, upload it to GitHub and let us know. We will
 list it under :ref:`related_projects`.
 
+.. _selectiveness:
 
 Why are you so selective on what algorithms you include in scikit-learn?
 ------------------------------------------------------------------------
@@ -313,7 +314,7 @@ not close your pull request or discontinue your work solely because of
 this reason.
 
 How do I set a ``random_state`` for an entire execution?
-----------------------------------------------------
+---------------------------------------------------------
 
 For testing and replicability, it is often important to have the entire execution
 controlled by a single seed for the pseudo-random number generator used in
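
A minimal sketch of the single-seed pattern this section describes, assuming a
toy classification task (the estimator and data generator here are arbitrary
illustrations, not prescribed by the FAQ itself)::

    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    RANDOM_STATE = 42  # one integer seed threaded through every component

    # every call that involves randomness receives the same seed
    X, y = make_classification(n_samples=200, random_state=RANDOM_STATE)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=RANDOM_STATE)
    clf = LogisticRegression(random_state=RANDOM_STATE)
    clf.fit(X_train, y_train)
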
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 9f9e7ac19f87eaf528468cfeaaa336fbadfc5619..63e16932cf9af9316507128aeafed0277a4761e4 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -1101,7 +1101,7 @@ Here is a small example of usage of this function:::
 
 .. topic:: Example:
 
-  * See :ref:`sphx_glr_calibration_plot_calibration.py`
+  * See :ref:`sphx_glr_auto_examples_calibration_plot_calibration.py`
     for an example of using the Brier score loss to perform probability
     calibration of classifiers.
 
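
For context, the linked example builds on :func:`sklearn.metrics.brier_score_loss`,
which can also be called directly; a minimal sketch with arbitrary toy labels
and predicted probabilities::

    import numpy as np
    from sklearn.metrics import brier_score_loss

    y_true = np.array([0, 1, 1, 0])
    y_prob = np.array([0.1, 0.9, 0.8, 0.4])  # predicted P(class == 1)

    # mean squared difference between predicted probability and outcome
    brier_score_loss(y_true, y_prob)  # 0.055
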
diff --git a/examples/ensemble/plot_random_forest_regression_multioutput.py b/examples/ensemble/plot_random_forest_regression_multioutput.py
index 5849833fef4bfe566c669e60345cf5180fbdb107..2b795ab3a92b2279a46d1c1139fa5e34f78aaf24 100644
--- a/examples/ensemble/plot_random_forest_regression_multioutput.py
+++ b/examples/ensemble/plot_random_forest_regression_multioutput.py
@@ -4,10 +4,10 @@ Comparing random forests and the multi-output meta estimator
 ============================================================
 
 An example comparing multi-output regression with a random forest and
-the :ref:`multioutput.MultiOutputRegressor <_multiclass>` meta-estimator.
+the :ref:`multioutput.MultiOutputRegressor <multiclass>` meta-estimator.
 
 This example illustrates the use of the
-:ref:`multioutput.MultiOutputRegressor <_multiclass>` meta-estimator
+:ref:`multioutput.MultiOutputRegressor <multiclass>` meta-estimator
 to perform multi-output regression. A random forest regressor is used,
 which supports multi-output regression natively, so the results can be
 compared.
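
A condensed sketch of the comparison this example carries out (the toy data
and hyperparameter values are arbitrary)::

    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.multioutput import MultiOutputRegressor

    # a toy problem with two regression targets
    X, y = make_regression(n_samples=100, n_features=10, n_targets=2,
                           random_state=0)

    # RandomForestRegressor supports multi-output y natively ...
    native = RandomForestRegressor(random_state=0).fit(X, y)

    # ... whereas MultiOutputRegressor fits one forest per target
    wrapped = MultiOutputRegressor(
        RandomForestRegressor(random_state=0)).fit(X, y)

    native.predict(X).shape   # (100, 2)
    wrapped.predict(X).shape  # (100, 2)
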
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
index 05ea557b58bd3471bd16974d074dd59fef7a31c3..c96622c95f1ba2e905c059dec018a01842ac591f 100644
--- a/sklearn/ensemble/iforest.py
+++ b/sklearn/ensemble/iforest.py
@@ -64,6 +64,7 @@ class IsolationForest(BaseBagging):
 
     max_features : int or float, optional (default=1.0)
         The number of features to draw from X to train each base estimator.
+
             - If int, then draw `max_features` features.
             - If float, then draw `max_features * X.shape[1]` features.
 
diff --git a/sklearn/mixture/bayesian_mixture.py b/sklearn/mixture/bayesian_mixture.py
index 97aaacb4564c1d00c8f17524d639c8cf8b2e5bb0..d22ba73691ad9cf3784828ef77613bc6c2fa45e6 100644
--- a/sklearn/mixture/bayesian_mixture.py
+++ b/sklearn/mixture/bayesian_mixture.py
@@ -90,13 +90,14 @@ class BayesianGaussianMixture(BaseMixture):
         close to zero. The number of effective components is therefore smaller
         than n_components.
 
-    covariance_type : {'full', 'tied', 'diag', 'spherical'}, defaults to 'full'.
+    covariance_type : {'full', 'tied', 'diag', 'spherical'}, defaults to 'full'
         String describing the type of covariance parameters to use.
         Must be one of::
-        'full' (each component has its own general covariance matrix),
-        'tied' (all components share the same general covariance matrix),
-        'diag' (each component has its own diagonal covariance matrix),
-        'spherical' (each component has its own single variance).
+
+            'full' (each component has its own general covariance matrix),
+            'tied' (all components share the same general covariance matrix),
+            'diag' (each component has its own diagonal covariance matrix),
+            'spherical' (each component has its own single variance).
 
     tol : float, defaults to 1e-3.
         The convergence threshold. EM iterations will stop when the
@@ -118,14 +119,16 @@ class BayesianGaussianMixture(BaseMixture):
         The method used to initialize the weights, the means and the
         covariances.
         Must be one of::
-        'kmeans' : responsibilities are initialized using kmeans.
-        'random' : responsibilities are initialized randomly.
+
+            'kmeans' : responsibilities are initialized using kmeans.
+            'random' : responsibilities are initialized randomly.
 
     weight_concentration_prior_type : str, defaults to 'dirichlet_process'.
         String describing the type of the weight concentration prior.
         Must be one of::
-        'dirichlet_process' (using the Stick-breaking representation),
-        'dirichlet_distribution' (can favor more uniform weights).
+
+            'dirichlet_process' (using the Stick-breaking representation),
+            'dirichlet_distribution' (can favor more uniform weights).
 
     weight_concentration_prior : float | None, optional.
         The Dirichlet concentration of each component on the weight
@@ -133,7 +136,7 @@ class BayesianGaussianMixture(BaseMixture):
         the center and will lead to more components being active, while a lower
         concentration parameter will lead to more mass at the edge of the
         mixture weights simplex. The value of the parameter must be greater
-        than 0. If it is None, it's set to `1. / n_components`.
+        than 0. If it is None, it's set to ``1. / n_components``.
 
     mean_precision_prior : float | None, optional.
         The precision prior on the mean distribution (Gaussian).
@@ -142,7 +145,7 @@ class BayesianGaussianMixture(BaseMixture):
         The value of the parameter must be greater than 0.
         If it is None, it's set to 1.
 
-    mean_prior : array-like, shape (`n_features`,), optional
+    mean_prior : array-like, shape (n_features,), optional
         The prior on the mean distribution (Gaussian).
         If it is None, it's set to the mean of X.
 
@@ -154,10 +157,11 @@ class BayesianGaussianMixture(BaseMixture):
         The prior on the covariance distribution (Wishart).
         If it is None, the empirical covariance prior is initialized using the
         covariance of X. The shape depends on `covariance_type`::
-            (`n_features`, `n_features`) if 'full',
-            (`n_features`, `n_features`) if 'tied',
-            (`n_features`)               if 'diag',
-            float                        if 'spherical'
+
+                (n_features, n_features) if 'full',
+                (n_features, n_features) if 'tied',
+                (n_features)             if 'diag',
+                float                    if 'spherical'
 
     random_state : RandomState or an int seed, defaults to None.
         A random number generator instance.
@@ -178,15 +182,16 @@ class BayesianGaussianMixture(BaseMixture):
 
     Attributes
     ----------
-    weights_ : array-like, shape (`n_components`,)
+    weights_ : array-like, shape (n_components,)
         The weights of each mixture component.
 
-    means_ : array-like, shape (`n_components`, `n_features`)
+    means_ : array-like, shape (n_components, n_features)
         The mean of each mixture component.
 
     covariances_ : array-like
         The covariance of each mixture component.
         The shape depends on `covariance_type`::
+
             (n_components,)                        if 'spherical',
             (n_features, n_features)               if 'tied',
             (n_components, n_features)             if 'diag',
@@ -199,7 +204,8 @@ class BayesianGaussianMixture(BaseMixture):
         equivalently parameterized by the precision matrices. Storing the
         precision matrices instead of the covariance matrices makes it more
         efficient to compute the log-likelihood of new samples at test time.
-        The shape depends on `covariance_type`::
+        The shape depends on ``covariance_type``::
+
             (n_components,)                        if 'spherical',
             (n_features, n_features)               if 'tied',
             (n_components, n_features)             if 'diag',
@@ -212,7 +218,8 @@ class BayesianGaussianMixture(BaseMixture):
         Gaussian can be equivalently parameterized by the precision matrices.
         Storing the precision matrices instead of the covariance matrices makes
         it more efficient to compute the log-likelihood of new samples at test
-        time. The shape depends on `covariance_type`::
+        time. The shape depends on ``covariance_type``::
+
             (n_components,)                        if 'spherical',
             (n_features, n_features)               if 'tied',
             (n_components, n_features)             if 'diag',
@@ -232,15 +239,17 @@ class BayesianGaussianMixture(BaseMixture):
     weight_concentration_prior_ : tuple or float
         The Dirichlet concentration of each component on the weight
         distribution (Dirichlet). The type depends on
-        `weight_concentration_prior_type`::
+        ``weight_concentration_prior_type``::
+
             (float, float) if 'dirichlet_process' (Beta parameters),
             float          if 'dirichlet_distribution' (Dirichlet parameters).
+
         A higher concentration puts more mass in
         the center and will lead to more components being active, while a lower
         concentration parameter will lead to more mass at the edge of the
         simplex.
 
-    weight_concentration_ : array-like, shape (`n_components`, )
+    weight_concentration_ : array-like, shape (n_components,)
         The Dirichlet concentration of each component on the weight
         distribution (Dirichlet).
 
@@ -250,26 +259,27 @@ class BayesianGaussianMixture(BaseMixture):
         Smaller values concentrate the means of each cluster around
         `mean_prior`.
 
-    mean_precision_ : array-like, shape (`n_components`, )
+    mean_precision_ : array-like, shape (n_components,)
         The precision of each component on the mean distribution (Gaussian).
 
-    means_prior_ : array-like, shape (`n_features`,)
+    means_prior_ : array-like, shape (n_features,)
         The prior on the mean distribution (Gaussian).
 
     degrees_of_freedom_prior_ : float
         The prior of the number of degrees of freedom on the covariance
         distributions (Wishart).
 
-    degrees_of_freedom_ : array-like, shape (`n_components`,)
+    degrees_of_freedom_ : array-like, shape (n_components,)
         The number of degrees of freedom of each component in the model.
 
     covariance_prior_ : float or array-like
         The prior on the covariance distribution (Wishart).
         The shape depends on `covariance_type`::
-            (`n_features`, `n_features`) if 'full',
-            (`n_features`, `n_features`) if 'tied',
-            (`n_features`)               if 'diag',
-            float                        if 'spherical'
+
+            (n_features, n_features) if 'full',
+            (n_features, n_features) if 'tied',
+            (n_features)             if 'diag',
+            float                    if 'spherical'
 
     See Also
     --------
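
A brief sketch tying the parameters above to the fitted attributes and their
documented shapes (the data and ``n_components`` value are arbitrary)::

    import numpy as np
    from sklearn.mixture import BayesianGaussianMixture

    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 5])

    bgm = BayesianGaussianMixture(
        n_components=5,  # an upper bound; surplus components get ~zero weight
        covariance_type='full',
        weight_concentration_prior_type='dirichlet_process',
        random_state=0).fit(X)

    bgm.weights_.shape      # (5,), i.e. (n_components,)
    bgm.covariances_.shape  # (5, 2, 2) for covariance_type='full'
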
diff --git a/sklearn/mixture/gaussian_mixture.py b/sklearn/mixture/gaussian_mixture.py
index 57fa24e891836ecb572d880895abe0c10e21e62a..f4a182a7c95672c42899ae26e9b9ce9431d6a013 100644
--- a/sklearn/mixture/gaussian_mixture.py
+++ b/sklearn/mixture/gaussian_mixture.py
@@ -450,13 +450,14 @@ class GaussianMixture(BaseMixture):
         The number of mixture components.
 
     covariance_type : {'full', 'tied', 'diag', 'spherical'},
-        defaults to 'full'.
+            defaults to 'full'.
         String describing the type of covariance parameters to use.
         Must be one of::
-        'full' (each component has its own general covariance matrix),
-        'tied' (all components share the same general covariance matrix),
-        'diag' (each component has its own diagonal covariance matrix),
-        'spherical' (each component has its own single variance).
+
+            'full' (each component has its own general covariance matrix),
+            'tied' (all components share the same general covariance matrix),
+            'diag' (each component has its own diagonal covariance matrix),
+            'spherical' (each component has its own single variance).
 
     tol : float, defaults to 1e-3.
         The convergence threshold. EM iterations will stop when the
@@ -476,8 +477,9 @@ class GaussianMixture(BaseMixture):
         The method used to initialize the weights, the means and the
         precisions.
         Must be one of::
-        'kmeans' : responsibilities are initialized using kmeans.
-        'random' : responsibilities are initialized randomly.
+
+            'kmeans' : responsibilities are initialized using kmeans.
+            'random' : responsibilities are initialized randomly.
 
     weights_init : array-like, shape (n_components,), optional
         The user-provided initial weights, defaults to None.
@@ -492,6 +494,7 @@ class GaussianMixture(BaseMixture):
         matrices), defaults to None.
         If it is None, precisions are initialized using the 'init_params' method.
         The shape depends on 'covariance_type'::
+
             (n_components,)                        if 'spherical',
             (n_features, n_features)               if 'tied',
             (n_components, n_features)             if 'diag',
@@ -525,6 +528,7 @@ class GaussianMixture(BaseMixture):
     covariances_ : array-like
         The covariance of each mixture component.
         The shape depends on `covariance_type`::
+
             (n_components,)                        if 'spherical',
             (n_features, n_features)               if 'tied',
             (n_components, n_features)             if 'diag',
@@ -538,6 +542,7 @@ class GaussianMixture(BaseMixture):
         precision matrices instead of the covariance matrices makes it more
         efficient to compute the log-likelihood of new samples at test time.
         The shape depends on `covariance_type`::
+
             (n_components,)                        if 'spherical',
             (n_features, n_features)               if 'tied',
             (n_components, n_features)             if 'diag',
@@ -551,6 +556,7 @@ class GaussianMixture(BaseMixture):
         Storing the precision matrices instead of the covariance matrices makes
         it more efficient to compute the log-likelihood of new samples at test
         time. The shape depends on `covariance_type`::
+
             (n_components,)                        if 'spherical',
             (n_features, n_features)               if 'tied',
             (n_components, n_features)             if 'diag',
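
A short sketch confirming the ``covariance_type``-dependent shapes listed
above (toy data; the values here are arbitrary)::

    import numpy as np
    from sklearn.mixture import GaussianMixture

    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(50, 3), rng.randn(50, 3) + 4])  # 3 features

    for cov_type, shape in [('spherical', (2,)),
                            ('tied', (3, 3)),
                            ('diag', (2, 3)),
                            ('full', (2, 3, 3))]:
        gm = GaussianMixture(n_components=2, covariance_type=cov_type,
                             random_state=0).fit(X)
        assert gm.covariances_.shape == shape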