diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py
index 1c9903dc2efe14b935fee37aef3c5c5a4f6403d7..53072e24c4ae2b38c1a1548a494bf6df155e4364 100644
--- a/sklearn/cluster/affinity_propagation_.py
+++ b/sklearn/cluster/affinity_propagation_.py
@@ -71,7 +71,8 @@ def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200,
 
     Notes
     -----
-    See examples/cluster/plot_affinity_propagation.py for an example.
+    For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
+    <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
 
     References
     ----------
@@ -243,7 +244,8 @@ class AffinityPropagation(BaseEstimator, ClusterMixin):
 
     Notes
     -----
-    See examples/cluster/plot_affinity_propagation.py for an example.
+    For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
+    <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
 
     The algorithmic complexity of affinity propagation is quadratic
     in the number of points.
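
All of the :ref: targets introduced in this patch follow the sphinx-gallery
labeling convention: ``sphx_glr_`` plus the rendered gallery path
(``auto_examples/...``) with path separators flattened to underscores. A
minimal sketch of that mapping (the ``gallery_label`` helper is hypothetical,
for illustration only)::

    # Hypothetical helper: derive the sphinx-gallery anchor used throughout
    # this patch from an example's repository path.
    def gallery_label(example_path):
        # examples/ is rendered into the auto_examples/ gallery, so the
        # label prefixes "auto_" and replaces "/" with "_".
        return "sphx_glr_auto_" + example_path.replace("/", "_")

    assert (gallery_label("examples/cluster/plot_dbscan.py")
            == "sphx_glr_auto_examples_cluster_plot_dbscan.py")
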
diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index 6c7bba5af9f8c9385370413bc4872c1d481cd727..115e534b448cbe445c7eeb7e8b7b35df6168acb9 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -89,7 +89,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
 
     Notes
     -----
-    See examples/cluster/plot_dbscan.py for an example.
+    For an example, see :ref:`examples/cluster/plot_dbscan.py
+    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
 
     This implementation bulk-computes all neighborhood queries, which increases
     the memory complexity to O(n.d) where d is the average number of neighbors,
@@ -228,7 +229,8 @@ class DBSCAN(BaseEstimator, ClusterMixin):
 
     Notes
     -----
-    See examples/cluster/plot_dbscan.py for an example.
+    For an example, see :ref:`examples/cluster/plot_dbscan.py
+    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
 
     This implementation bulk-computes all neighborhood queries, which increases
     the memory complexity to O(n.d) where d is the average number of neighbors,
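
The memory note above (all neighborhood queries are bulk-computed, O(n.d))
can be sidestepped by precomputing a sparse neighborhood graph. A minimal
sketch, assuming a scikit-learn version whose DBSCAN accepts sparse
precomputed distances::

    import numpy as np
    from sklearn.cluster import DBSCAN
    from sklearn.neighbors import NearestNeighbors

    X = np.random.RandomState(0).rand(100, 2)
    eps = 0.3
    # Sparse graph holding only pairwise distances within eps of each point.
    graph = NearestNeighbors(radius=eps).fit(X).radius_neighbors_graph(
        X, mode='distance')
    labels = DBSCAN(eps=eps, metric='precomputed').fit_predict(graph)
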
diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py
index 928842f179da7af7583916dfaf4aaf0d6f2141cc..b1680fea3f2e7cbb50ccc80f30af0d78fb127c7b 100644
--- a/sklearn/cluster/mean_shift_.py
+++ b/sklearn/cluster/mean_shift_.py
@@ -172,7 +172,8 @@ def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False,
 
     Notes
     -----
-    See examples/cluster/plot_mean_shift.py for an example.
+    For an example, see :ref:`examples/cluster/plot_mean_shift.py
+    <sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.
 
     """
 
diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
index 518880534f08e17e83f14078e3c5a967c20b75de..556ad9ea45e0533df461804fe8f6fde376b9f06b 100644
--- a/sklearn/datasets/species_distributions.py
+++ b/sklearn/datasets/species_distributions.py
@@ -17,17 +17,19 @@ The two species are:
    also known as the Forest Small Rice Rat, a rodent that lives in Peru,
-   Colombia, Ecuador, Peru, and Venezuela.
+   Colombia, Ecuador, and Venezuela.
 
-References:
+References
+----------
 
- * `"Maximum entropy modeling of species geographic distributions"
-   <http://rob.schapire.net/papers/ecolmod.pdf>`_
-   S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
-   190:231-259, 2006.
+`"Maximum entropy modeling of species geographic distributions"
+<http://rob.schapire.net/papers/ecolmod.pdf>`_ S. J. Phillips,
+R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006.
 
-Notes:
+Notes
+-----
 
- * See examples/applications/plot_species_distribution_modeling.py
-   for an example of using this dataset
+For an example of using this dataset, see
+:ref:`examples/applications/plot_species_distribution_modeling.py
+<sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
 """
 
 # Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
@@ -202,9 +204,9 @@ def fetch_species_distributions(data_home=None,
     Notes
     -----
 
-    * See examples/applications/plot_species_distribution_modeling.py
-      for an example of using this dataset with scikit-learn
-
+    For an example of using this dataset with scikit-learn, see
+    :ref:`examples/applications/plot_species_distribution_modeling.py
+    <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
     """
     data_home = get_data_home(data_home)
     if not exists(data_home):
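
A minimal sketch of loading the dataset described above (the first call
downloads the data; attribute names follow the fetch_species_distributions
docstring)::

    from sklearn.datasets import fetch_species_distributions

    data = fetch_species_distributions()
    print(data.coverages.shape)   # environmental coverage grids
    print(data.train['species'])  # species label per training record
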
diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py
index be58fd2b854b893cba11d2fa831a010889c3ff6c..82153024e33a7698fa6599b408d12f029d25da32 100644
--- a/sklearn/linear_model/bayes.py
+++ b/sklearn/linear_model/bayes.py
@@ -110,7 +110,8 @@ class BayesianRidge(LinearModel, RegressorMixin):
 
     Notes
     -----
-    See examples/linear_model/plot_bayesian_ridge.py for an example.
+    For an example, see :ref:`examples/linear_model/plot_bayesian_ridge.py
+    <sphx_glr_auto_examples_linear_model_plot_bayesian_ridge.py>`.
 
     References
     ----------
@@ -372,8 +373,9 @@ class ARDRegression(LinearModel, RegressorMixin):
     array([ 1.])
 
     Notes
-    --------
-    See examples/linear_model/plot_ard.py for an example.
+    -----
+    For an example, see :ref:`examples/linear_model/plot_ard.py
+    <sphx_glr_auto_examples_linear_model_plot_ard.py>`.
 
     References
     ----------
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 0b950b26a624014f2abb4208b0379ce54fe6b351..6a1061f0a906a99d9b4ac661914e97b730ec1858 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -213,8 +213,9 @@ def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None,
 
     Notes
     -----
-    See examples/linear_model/plot_lasso_coordinate_descent_path.py
-    for an example.
+    For an example, see
+    :ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py
+    <sphx_glr_auto_examples_linear_model_plot_lasso_coordinate_descent_path.py>`.
 
     To avoid unnecessary memory duplication the X argument of the fit method
     should be directly passed as a Fortran-contiguous numpy array.
@@ -368,8 +369,9 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
 
     Notes
     -----
-    See examples/linear_model/plot_lasso_coordinate_descent_path.py for an
-    example.
+    For an example, see
+    :ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py
+    <sphx_glr_auto_examples_linear_model_plot_lasso_coordinate_descent_path.py>`.
 
     See also
     --------
@@ -1329,8 +1331,9 @@ class LassoCV(LinearModelCV, RegressorMixin):
 
     Notes
     -----
-    See examples/linear_model/plot_lasso_model_selection.py
-    for an example.
+    For an example, see
+    :ref:`examples/linear_model/plot_lasso_model_selection.py
+    <sphx_glr_auto_examples_linear_model_plot_lasso_model_selection.py>`.
 
     To avoid unnecessary memory duplication the X argument of the fit method
     should be directly passed as a Fortran-contiguous numpy array.
@@ -1485,8 +1488,9 @@ class ElasticNetCV(LinearModelCV, RegressorMixin):
 
     Notes
     -----
-    See examples/linear_model/plot_lasso_model_selection.py
-    for an example.
+    For an example, see
+    :ref:`examples/linear_model/plot_lasso_model_selection.py
+    <sphx_glr_auto_examples_linear_model_plot_lasso_model_selection.py>`.
 
     To avoid unnecessary memory duplication the X argument of the fit method
     should be directly passed as a Fortran-contiguous numpy array.
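
The Fortran-contiguity note repeated in these hunks amounts to a single
conversion at the call boundary. A minimal sketch, using LassoCV as the
estimator::

    import numpy as np
    from sklearn.linear_model import LassoCV

    rng = np.random.RandomState(0)
    X = np.asfortranarray(rng.rand(50, 10))  # column-major: fit() can skip a copy
    y = rng.rand(50)
    model = LassoCV(cv=3).fit(X, y)
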
diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py
index ac5b89722488e93f5e89a5536037fdb34282dd3d..5ee0782b7f2a2f70748458b4789b5bb7c4a4976b 100644
--- a/sklearn/linear_model/randomized_l1.py
+++ b/sklearn/linear_model/randomized_l1.py
@@ -294,7 +294,8 @@ class RandomizedLasso(BaseRandomizedLinearModel):
 
     Notes
     -----
-    See examples/linear_model/plot_sparse_recovery.py for an example.
+    For an example, see :ref:`examples/linear_model/plot_sparse_recovery.py
+    <sphx_glr_auto_examples_linear_model_plot_sparse_recovery.py>`.
 
     References
     ----------
@@ -486,7 +487,8 @@ class RandomizedLogisticRegression(BaseRandomizedLinearModel):
 
     Notes
     -----
-    See examples/linear_model/plot_sparse_recovery.py for an example.
+    For an example, see :ref:`examples/linear_model/plot_sparse_recovery.py
+    <sphx_glr_auto_examples_linear_model_plot_sparse_recovery.py>`.
 
     References
     ----------
@@ -621,7 +623,8 @@ def lasso_stability_path(X, y, scaling=0.5, random_state=None,
 
     Notes
     -----
-    See examples/linear_model/plot_sparse_recovery.py for an example.
+    For an example, see :ref:`examples/linear_model/plot_sparse_recovery.py
+    <sphx_glr_auto_examples_linear_model_plot_sparse_recovery.py>`.
     """
     X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'])
     rng = check_random_state(random_state)
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 107656702bad95690f14c3bd225fd6614cfb17cf..c9de8a99a0f3d35fa99806bd4e262b8c8a338bd9 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -117,8 +117,9 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
 
     To avoid memory copy the caller should pass a CSC matrix.
 
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
 
     See also
     --------
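
In practice the CSC note above means sparse input is scaled without a copy
only when centering is disabled, since subtracting the mean would densify
the matrix. A minimal sketch::

    from scipy.sparse import random as sparse_random
    from sklearn.preprocessing import scale

    X = sparse_random(100, 20, density=0.1, format='csc', random_state=0)
    X_scaled = scale(X, with_mean=False)  # scales variance; sparsity preserved
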
@@ -248,8 +249,9 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
 
     Notes
     -----
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
     """
 
     def __init__(self, feature_range=(0, 1), copy=True):
@@ -409,8 +411,9 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True):
 
     Notes
     -----
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
     """  # noqa
     # Unlike the scaler object, this function allows 1d input.
     # If copy is required, it will be done inside the scaler object.
@@ -506,8 +509,9 @@ class StandardScaler(BaseEstimator, TransformerMixin):
 
     Notes
     -----
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
     """  # noqa
 
     def __init__(self, copy=True, with_mean=True, with_std=True):
@@ -713,8 +717,9 @@ class MaxAbsScaler(BaseEstimator, TransformerMixin):
 
     Notes
     -----
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
     """
 
     def __init__(self, copy=True):
@@ -845,8 +850,9 @@ def maxabs_scale(X, axis=0, copy=True):
 
     Notes
     -----
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
     """  # noqa
     # Unlike the scaler object, this function allows 1d input.
 
@@ -939,7 +945,9 @@ class RobustScaler(BaseEstimator, TransformerMixin):
 
     Notes
     -----
-    See examples/preprocessing/plot_all_scaling.py for an example.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
 
     https://en.wikipedia.org/wiki/Median_(statistics)
     https://en.wikipedia.org/wiki/Interquartile_range
@@ -1089,8 +1097,9 @@ def robust_scale(X, axis=0, with_centering=True, with_scaling=True,
 
     To avoid memory copy the caller should pass a CSR matrix.
 
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
 
     See also
     --------
@@ -1311,8 +1320,9 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
 
     Notes
     -----
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
     """
     if norm not in ('l1', 'l2', 'max'):
         raise ValueError("'%s' is not a supported norm" % norm)
@@ -1396,8 +1406,9 @@ class Normalizer(BaseEstimator, TransformerMixin):
     This estimator is stateless (besides constructor parameters), the
     fit method does nothing but is useful when used in a pipeline.
 
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
 
     See also
     --------
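
The statelessness note above means Normalizer carries no fitted statistics,
so it composes freely in a Pipeline as a pure per-sample transform. A
minimal sketch with an arbitrary downstream classifier::

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer

    rng = np.random.RandomState(0)
    X, y = rng.randn(30, 4), rng.randint(0, 2, 30)
    clf = make_pipeline(Normalizer(norm='l2'), LogisticRegression()).fit(X, y)
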
@@ -2026,9 +2037,9 @@ class QuantileTransformer(BaseEstimator, TransformerMixin):
 
     Notes
     -----
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
-
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
     """
 
     def __init__(self, n_quantiles=1000, output_distribution='uniform',
@@ -2410,9 +2421,9 @@ def quantile_transform(X, axis=0, n_quantiles=1000,
 
     Notes
     -----
-    See examples/preprocessing/plot_all_scaling.py for a comparison of the
-    different scalers, transformers, and normalizers.
-
+    For a comparison of the different scalers, transformers, and normalizers,
+    see :ref:`examples/preprocessing/plot_all_scaling.py
+    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
     """
     n = QuantileTransformer(n_quantiles=n_quantiles,
                             output_distribution=output_distribution,
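
The comparison drawn by plot_all_scaling, referenced throughout this file,
reduces to running the same skewed feature through each transformer. A
minimal sketch over three of the transformers touched here::

    import numpy as np
    from sklearn.preprocessing import (MinMaxScaler, StandardScaler,
                                       QuantileTransformer)

    X = np.random.RandomState(0).lognormal(size=(1000, 1))  # skewed feature
    for tr in (StandardScaler(), MinMaxScaler(),
               QuantileTransformer(output_distribution='uniform')):
        Xt = tr.fit_transform(X)
        print(type(tr).__name__, Xt.min(), Xt.max())
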