From 04116ce286887d33d1b7e5ec44cf5541605b7790 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=93scar=20N=C3=A1jera?= <najera.oscar@gmail.com>
Date: Mon, 25 Jan 2016 00:13:05 +0100
Subject: [PATCH] Update image source paths to the sphinx-gallery namespace

---
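Note: the example figures are now produced by sphinx-gallery, which prefixes
every generated image file name with ``sphx_glr_``. This patch rewrites the
image source paths in the narrative documentation (plus the few ``:target:``
entries that point directly at those image files) to the new names; no other
content changes.

A rename of this size is easiest to script. The snippet below is only a sketch
of how the substitution could be done (a hypothetical helper, not necessarily
the command used to produce this patch):

    # rename_image_refs.py -- hypothetical helper; run from the repository root
    import pathlib
    import re

    # Only touch paths of the form .../auto_examples/.../images/plot_*.png,
    # leaving the .html example targets untouched.
    pattern = re.compile(r"(auto_examples/\S*images/)plot_")

    for rst in pathlib.Path("doc").rglob("*.rst"):
        text = rst.read_text(encoding="utf-8")
        new_text = pattern.sub(r"\1sphx_glr_plot_", text)
        if new_text != text:
            rst.write_text(new_text, encoding="utf-8")
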
 doc/datasets/index.rst                        |  6 +--
 doc/modules/biclustering.rst                  |  8 +--
 doc/modules/calibration.rst                   | 14 +++---
 doc/modules/clustering.rst                    | 50 +++++++++----------
 doc/modules/computational_performance.rst     | 16 +++---
 doc/modules/covariance.rst                    | 10 ++--
 doc/modules/cross_decomposition.rst           |  2 +-
 doc/modules/decomposition.rst                 | 42 ++++++++--------
 doc/modules/density.rst                       | 10 ++--
 doc/modules/ensemble.rst                      | 14 +++---
 doc/modules/feature_extraction.rst            |  2 +-
 doc/modules/feature_selection.rst             |  2 +-
 doc/modules/gaussian_process.rst              | 28 +++++------
 doc/modules/isotonic.rst                      |  2 +-
 doc/modules/kernel_approximation.rst          |  2 +-
 doc/modules/kernel_ridge.rst                  |  4 +-
 doc/modules/label_propagation.rst             |  2 +-
 doc/modules/lda_qda.rst                       |  4 +-
 doc/modules/learning_curve.rst                |  8 +--
 doc/modules/linear_model.rst                  | 36 ++++++-------
 doc/modules/manifold.rst                      | 26 +++++-----
 doc/modules/mixture.rst                       | 12 ++---
 doc/modules/model_evaluation.rst              |  6 +--
 doc/modules/multiclass.rst                    |  2 +-
 doc/modules/neighbors.rst                     | 22 ++++----
 doc/modules/neural_networks_unsupervised.rst  |  2 +-
 doc/modules/outlier_detection.rst             | 36 ++++++-------
 doc/modules/random_projection.rst             |  4 +-
 doc/modules/scaling_strategies.rst            |  4 +-
 doc/modules/sgd.rst                           |  8 +--
 doc/modules/svm.rst                           | 10 ++--
 doc/modules/tree.rst                          | 10 ++--
 doc/tutorial/basic/tutorial.rst               |  2 +-
 .../statistical_inference/model_selection.rst |  2 +-
 .../putting_together.rst                      |  6 +--
 .../statistical_inference/settings.rst        |  2 +-
 .../supervised_learning.rst                   | 32 ++++++------
 .../unsupervised_learning.rst                 | 26 +++++-----
 doc/whats_new.rst                             |  6 +--
 39 files changed, 239 insertions(+), 241 deletions(-)

diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst
index 49d056902c..d8045a3d48 100644
--- a/doc/datasets/index.rst
+++ b/doc/datasets/index.rst
@@ -82,7 +82,7 @@ and pipeline on 2D data.
    load_sample_images
    load_sample_image
 
-.. image:: ../auto_examples/cluster/images/plot_color_quantization_001.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_color_quantization_001.png
    :target: ../auto_examples/cluster/plot_color_quantization.html
    :scale: 30
    :align: right
@@ -130,7 +130,7 @@ per class; and linear transformations of the feature space.
 near-equal-size classes separated by concentric hyperspheres.
 :func:`make_hastie_10_2` generates a similar binary, 10-dimensional problem.
 
-.. image:: ../auto_examples/datasets/images/plot_random_dataset_001.png
+.. image:: ../auto_examples/datasets/images/sphx_glr_plot_random_dataset_001.png
    :target: ../auto_examples/datasets/plot_random_dataset.html
    :scale: 50
    :align: center
@@ -159,7 +159,7 @@ respect to true bag-of-words mixtures include:
 * Documents without labels words at random, rather than from a base
   distribution.
 
-.. image:: ../auto_examples/datasets/images/plot_random_multilabel_dataset_001.png
+.. image:: ../auto_examples/datasets/images/sphx_glr_plot_random_multilabel_dataset_001.png
    :target: ../auto_examples/datasets/plot_random_multilabel_dataset.html
    :scale: 50
    :align: center
diff --git a/doc/modules/biclustering.rst b/doc/modules/biclustering.rst
index 4595eec40f..10de7b6a4f 100644
--- a/doc/modules/biclustering.rst
+++ b/doc/modules/biclustering.rst
@@ -44,8 +44,8 @@ biclusters on the diagonal. Here is an example of this structure
 where biclusters have higher average values than the other rows and
 columns:
 
-.. figure:: ../auto_examples/bicluster/images/plot_spectral_coclustering_003.png
-   :target: ../auto_examples/bicluster/images/plot_spectral_coclustering_003.png
+.. figure:: ../auto_examples/bicluster/images/sphx_glr_plot_spectral_coclustering_003.png
+   :target: ../auto_examples/bicluster/images/sphx_glr_plot_spectral_coclustering_003.png
    :align: center
    :scale: 50
 
@@ -56,8 +56,8 @@ each column belongs to all row clusters. Here is an example of this
 structure where the variance of the values within each bicluster is
 small:
 
-.. figure:: ../auto_examples/bicluster/images/plot_spectral_biclustering_003.png
-   :target: ../auto_examples/bicluster/images/plot_spectral_biclustering_003.png
+.. figure:: ../auto_examples/bicluster/images/sphx_glr_plot_spectral_biclustering_003.png
+   :target: ../auto_examples/bicluster/images/sphx_glr_plot_spectral_biclustering_003.png
    :align: center
    :scale: 50
 
diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst
index 1a11d6f568..cd74f414fe 100644
--- a/doc/modules/calibration.rst
+++ b/doc/modules/calibration.rst
@@ -22,7 +22,7 @@ such that among the samples to which it gave a predict_proba value close to 0.8,
 approximately 80% actually belong to the positive class. The following plot compares
 how well the probabilistic predictions of different classifiers are calibrated:
 
-.. figure:: ../auto_examples/calibration/images/plot_compare_calibration_001.png
+.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_compare_calibration_001.png
    :target: ../auto_examples/calibration/plot_compare_calibration.html
    :align: center
 
@@ -87,7 +87,7 @@ The first image present a dataset with 2 classes and 3 blobs of
 data. The blob in the middle contains random samples of each class.
 The probability for the samples in this blob should be 0.5.
 
-.. figure:: ../auto_examples/calibration/images/plot_calibration_001.png
+.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_001.png
    :target: ../auto_examples/calibration/plot_calibration.html
    :align: center
 
@@ -98,7 +98,7 @@ calibration. One can observe that the non-parametric model
 provides the most accurate probability estimates for samples
 in the middle, i.e., 0.5.
 
-.. figure:: ../auto_examples/calibration/images/plot_calibration_002.png
+.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_002.png
    :target: ../auto_examples/calibration/plot_calibration.html
    :align: center
 
@@ -113,7 +113,7 @@ both isotonic calibration and sigmoid calibration. The calibration performance
 is evaluated with Brier score :func:`brier_score_loss`, reported in the legend
 (the smaller the better).
 
-.. figure:: ../auto_examples/calibration/images/plot_calibration_curve_002.png
+.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_curve_002.png
    :target: ../auto_examples/calibration/plot_calibration_curve.html
    :align: center
 
@@ -126,7 +126,7 @@ kinds of calibration can fix this issue and yield nearly identical results.
 The next figure shows the calibration curve of Gaussian naive Bayes on
 the same data, with both kinds of calibration and also without calibration.
 
-.. figure:: ../auto_examples/calibration/images/plot_calibration_curve_001.png
+.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_curve_001.png
    :target: ../auto_examples/calibration/plot_calibration_curve.html
    :align: center
 
@@ -167,7 +167,7 @@ probability vectors predicted by the same classifier after sigmoid calibration
 on a hold-out validation set. Colors indicate the true class of an instance
 (red: class 1, green: class 2, blue: class 3).
 
-.. figure:: ../auto_examples/calibration/images/plot_calibration_multiclass_000.png
+.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_multiclass_000.png
    :target: ../auto_examples/calibration/plot_calibration_multiclass.html
    :align: center
 
@@ -179,7 +179,7 @@ method='sigmoid' on the remaining 200 datapoints reduces the confidence of the
 predictions, i.e., moves the probability vectors from the edges of the simplex
 towards the center:
 
-.. figure:: ../auto_examples/calibration/images/plot_calibration_multiclass_001.png
+.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_calibration_multiclass_001.png
    :target: ../auto_examples/calibration/plot_calibration_multiclass.html
    :align: center
 
diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index e0264da054..0ad450358b 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -29,7 +29,7 @@ data can be found in the ``labels_`` attribute.
 Overview of clustering methods
 ===============================
 
-.. figure:: ../auto_examples/cluster/images/plot_cluster_comparison_001.png
+.. figure:: ../auto_examples/cluster/images/sphx_glr_plot_cluster_comparison_001.png
    :target: ../auto_examples/cluster/plot_cluster_comparison.html
    :align: center
    :scale: 50
@@ -152,7 +152,7 @@ It suffers from various drawbacks:
   prior to k-means clustering can alleviate this problem
   and speed up the computations.
 
-.. image:: ../auto_examples/cluster/images/plot_kmeans_assumptions_001.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_kmeans_assumptions_001.png
    :target: ../auto_examples/cluster/plot_kmeans_assumptions.html
    :align: center
    :scale: 50
@@ -168,7 +168,7 @@ and the new centroids are computed and the algorithm repeats these last two
 steps until this value is less than a threshold. In other words, it repeats
 until the centroids do not move significantly.
 
-.. image:: ../auto_examples/cluster/images/plot_kmeans_digits_001.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_kmeans_digits_001.png
    :target: ../auto_examples/cluster/plot_kmeans_digits.html
    :align: right
    :scale: 35
@@ -254,7 +254,7 @@ convergence or a predetermined number of iterations is reached.
 of the results is reduced. In practice this difference in quality can be quite
 small, as shown in the example and cited reference.
 
-.. figure:: ../auto_examples/cluster/images/plot_mini_batch_kmeans_001.png
+.. figure:: ../auto_examples/cluster/images/sphx_glr_plot_mini_batch_kmeans_001.png
    :target: ../auto_examples/cluster/plot_mini_batch_kmeans.html
    :align: center
    :scale: 100
@@ -292,7 +292,7 @@ values from other pairs. This updating happens iteratively until convergence,
 at which point the final exemplars are chosen, and hence the final clustering
 is given.
 
-.. figure:: ../auto_examples/cluster/images/plot_affinity_propagation_001.png
+.. figure:: ../auto_examples/cluster/images/sphx_glr_plot_affinity_propagation_001.png
    :target: ../auto_examples/cluster/plot_affinity_propagation.html
    :align: center
    :scale: 50
@@ -392,7 +392,7 @@ Labelling a new sample is performed by finding the nearest centroid for a
 given sample.
 
 
-.. figure:: ../auto_examples/cluster/images/plot_mean_shift_001.png
+.. figure:: ../auto_examples/cluster/images/sphx_glr_plot_mean_shift_001.png
    :target: ../auto_examples/cluster/plot_mean_shift.html
    :align: center
    :scale: 50
@@ -432,11 +432,11 @@ graph vertices are pixels, and edges of the similarity graph are a
 function of the gradient of the image.
 
 
-.. |noisy_img| image:: ../auto_examples/cluster/images/plot_segmentation_toy_001.png
+.. |noisy_img| image:: ../auto_examples/cluster/images/sphx_glr_plot_segmentation_toy_001.png
     :target: ../auto_examples/cluster/plot_segmentation_toy.html
     :scale: 50
 
-.. |segmented_img| image:: ../auto_examples/cluster/images/plot_segmentation_toy_002.png
+.. |segmented_img| image:: ../auto_examples/cluster/images/sphx_glr_plot_segmentation_toy_002.png
     :target: ../auto_examples/cluster/plot_segmentation_toy.html
     :scale: 50
 
@@ -463,11 +463,11 @@ function of the gradient of the image.
  * :ref:`example_cluster_plot_face_segmentation.py`: Spectral clustering
    to split the image of the raccoon face in regions.
 
-.. |face_kmeans| image:: ../auto_examples/cluster/images/plot_face_segmentation_001.png
+.. |face_kmeans| image:: ../auto_examples/cluster/images/sphx_glr_plot_face_segmentation_001.png
     :target: ../auto_examples/cluster/plot_face_segmentation.html
     :scale: 65
 
-.. |face_discretize| image:: ../auto_examples/cluster/images/plot_face_segmentation_002.png
+.. |face_discretize| image:: ../auto_examples/cluster/images/sphx_glr_plot_face_segmentation_002.png
     :target: ../auto_examples/cluster/plot_face_segmentation.html
     :scale: 65
 
@@ -553,15 +553,15 @@ Different linkage type: Ward, complete and average linkage
 :class:`AgglomerativeClustering` supports Ward, average, and complete
 linkage strategies.
 
-.. image:: ../auto_examples/cluster/images/plot_digits_linkage_001.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_digits_linkage_001.png
     :target: ../auto_examples/cluster/plot_digits_linkage.html
     :scale: 43
 
-.. image:: ../auto_examples/cluster/images/plot_digits_linkage_002.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_digits_linkage_002.png
     :target: ../auto_examples/cluster/plot_digits_linkage.html
     :scale: 43
 
-.. image:: ../auto_examples/cluster/images/plot_digits_linkage_003.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_digits_linkage_003.png
     :target: ../auto_examples/cluster/plot_digits_linkage.html
     :scale: 43
 
@@ -590,11 +590,11 @@ constraints forbid the merging of points that are not adjacent on the swiss
 roll, and thus avoid forming clusters that extend across overlapping folds of
 the roll.
 
-.. |unstructured| image:: ../auto_examples/cluster/images/plot_ward_structured_vs_unstructured_001.png
+.. |unstructured| image:: ../auto_examples/cluster/images/sphx_glr_plot_ward_structured_vs_unstructured_001.png
         :target: ../auto_examples/cluster/plot_ward_structured_vs_unstructured.html
         :scale: 49
 
-.. |structured| image:: ../auto_examples/cluster/images/plot_ward_structured_vs_unstructured_002.png
+.. |structured| image:: ../auto_examples/cluster/images/sphx_glr_plot_ward_structured_vs_unstructured_002.png
         :target: ../auto_examples/cluster/plot_ward_structured_vs_unstructured.html
         :scale: 49
 
@@ -642,19 +642,19 @@ enable only merging of neighboring pixels on an image, as in the
     clusters and almost empty ones. (see the discussion in
     :ref:`example_cluster_plot_agglomerative_clustering.py`).
 
-.. image:: ../auto_examples/cluster/images/plot_agglomerative_clustering_001.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_agglomerative_clustering_001.png
     :target: ../auto_examples/cluster/plot_agglomerative_clustering.html
     :scale: 38
 
-.. image:: ../auto_examples/cluster/images/plot_agglomerative_clustering_002.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_agglomerative_clustering_002.png
     :target: ../auto_examples/cluster/plot_agglomerative_clustering.html
     :scale: 38
 
-.. image:: ../auto_examples/cluster/images/plot_agglomerative_clustering_003.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_agglomerative_clustering_003.png
     :target: ../auto_examples/cluster/plot_agglomerative_clustering.html
     :scale: 38
 
-.. image:: ../auto_examples/cluster/images/plot_agglomerative_clustering_004.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_agglomerative_clustering_004.png
     :target: ../auto_examples/cluster/plot_agglomerative_clustering.html
     :scale: 38
 
@@ -678,15 +678,15 @@ The guidelines for choosing a metric is to use one that maximizes the
 distance between samples in different classes, and minimizes that within
 each class.
 
-.. image:: ../auto_examples/cluster/images/plot_agglomerative_clustering_metrics_005.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_agglomerative_clustering_metrics_005.png
     :target: ../auto_examples/cluster/plot_agglomerative_clustering_metrics.html
     :scale: 32
 
-.. image:: ../auto_examples/cluster/images/plot_agglomerative_clustering_metrics_006.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_agglomerative_clustering_metrics_006.png
     :target: ../auto_examples/cluster/plot_agglomerative_clustering_metrics.html
     :scale: 32
 
-.. image:: ../auto_examples/cluster/images/plot_agglomerative_clustering_metrics_007.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_agglomerative_clustering_metrics_007.png
     :target: ../auto_examples/cluster/plot_agglomerative_clustering_metrics.html
     :scale: 32
 
@@ -734,7 +734,7 @@ indicating core samples found by the algorithm. Smaller circles are non-core
 samples that are still part of a cluster. Moreover, the outliers are indicated
 by black points below.
 
-.. |dbscan_results| image:: ../auto_examples/cluster/images/plot_dbscan_001.png
+.. |dbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_dbscan_001.png
         :target: ../auto_examples/cluster/plot_dbscan.html
         :scale: 50
 
@@ -865,7 +865,7 @@ the user is advised
  4. Call ``partial_fit`` finally with no arguments, i.e ``brc.partial_fit()``
     which performs the global clustering.
 
-.. image:: ../auto_examples/cluster/images/plot_birch_vs_minibatchkmeans_001.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_birch_vs_minibatchkmeans_001.png
     :target: ../auto_examples/cluster/plot_birch_vs_minibatchkmeans.html
 
 .. topic:: References:
@@ -1278,7 +1278,7 @@ Drawbacks
   smaller sample sizes or larger number of clusters it is safer to use
   an adjusted index such as the Adjusted Rand Index (ARI)**.
 
-.. figure:: ../auto_examples/cluster/images/plot_adjusted_for_chance_measures_001.png
+.. figure:: ../auto_examples/cluster/images/sphx_glr_plot_adjusted_for_chance_measures_001.png
    :target: ../auto_examples/cluster/plot_adjusted_for_chance_measures.html
    :align: center
    :scale: 100
diff --git a/doc/modules/computational_performance.rst b/doc/modules/computational_performance.rst
index a3a488ca6d..9434267f70 100644
--- a/doc/modules/computational_performance.rst
+++ b/doc/modules/computational_performance.rst
@@ -51,13 +51,13 @@ linear algebra libraries optimizations etc.). Here we see on a setting
 with few features that independently of estimator choice the bulk mode is
 always faster, and for some of them by 1 to 2 orders of magnitude:
 
-.. |atomic_prediction_latency| image::  ../auto_examples/applications/images/plot_prediction_latency_001.png
+.. |atomic_prediction_latency| image::  ../auto_examples/applications/images/sphx_glr_plot_prediction_latency_001.png
     :target: ../auto_examples/applications/plot_prediction_latency.html
     :scale: 80
 
 .. centered:: |atomic_prediction_latency|
 
-.. |bulk_prediction_latency| image::  ../auto_examples/applications/images/plot_prediction_latency_002.png
+.. |bulk_prediction_latency| image::  ../auto_examples/applications/images/sphx_glr_plot_prediction_latency_002.png
     :target: ../auto_examples/applications/plot_prediction_latency.html
     :scale: 80
 
@@ -79,7 +79,7 @@ From a computing perspective it also means that the number of basic operations
 too. Here is a graph of the evolution of the prediction latency with the
 number of features:
 
-.. |influence_of_n_features_on_latency| image::  ../auto_examples/applications/images/plot_prediction_latency_003.png
+.. |influence_of_n_features_on_latency| image::  ../auto_examples/applications/images/sphx_glr_plot_prediction_latency_003.png
     :target: ../auto_examples/applications/plot_prediction_latency.html
     :scale: 80
 
@@ -148,7 +148,7 @@ describe it fully. Of course sparsity influences in turn the prediction time
 as the sparse dot-product takes time roughly proportional to the number of
 non-zero coefficients.
 
-.. |en_model_complexity| image::  ../auto_examples/applications/images/plot_model_complexity_influence_001.png
+.. |en_model_complexity| image::  ../auto_examples/applications/images/sphx_glr_plot_model_complexity_influence_001.png
     :target: ../auto_examples/applications/plot_model_complexity_influence.html
     :scale: 80
 
@@ -163,7 +163,7 @@ support vector. In the following graph the ``nu`` parameter of
 :class:`sklearn.svm.classes.NuSVR` was used to influence the number of
 support vectors.
 
-.. |nusvr_model_complexity| image::  ../auto_examples/applications/images/plot_model_complexity_influence_002.png
+.. |nusvr_model_complexity| image::  ../auto_examples/applications/images/sphx_glr_plot_model_complexity_influence_002.png
     :target: ../auto_examples/applications/plot_model_complexity_influence.html
     :scale: 80
 
@@ -175,7 +175,7 @@ important role. Latency and throughput should scale linearly with the number
 of trees. In this case we used directly the ``n_estimators`` parameter of
 :class:`sklearn.ensemble.gradient_boosting.GradientBoostingRegressor`.
 
-.. |gbt_model_complexity| image::  ../auto_examples/applications/images/plot_model_complexity_influence_003.png
+.. |gbt_model_complexity| image::  ../auto_examples/applications/images/sphx_glr_plot_model_complexity_influence_003.png
     :target: ../auto_examples/applications/plot_model_complexity_influence.html
     :scale: 80
 
@@ -199,7 +199,7 @@ files, tokenizing the text and hashing it into a common vector space) is
 taking 100 to 500 times more time than the actual prediction code, depending on
 the chosen model.
 
- .. |prediction_time| image::  ../auto_examples/applications/images/plot_out_of_core_classification_004.png
+ .. |prediction_time| image::  ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_004.png
     :target: ../auto_examples/applications/plot_out_of_core_classification.html
     :scale: 80
 
@@ -218,7 +218,7 @@ time. Here is a benchmark from the
 :ref:`example_applications_plot_prediction_latency.py` example that measures
 this quantity for a number of estimators on synthetic data:
 
-.. |throughput_benchmark| image::  ../auto_examples/applications/images/plot_prediction_latency_004.png
+.. |throughput_benchmark| image::  ../auto_examples/applications/images/sphx_glr_plot_prediction_latency_004.png
     :target: ../auto_examples/applications/plot_prediction_latency.html
     :scale: 80
 
diff --git a/doc/modules/covariance.rst b/doc/modules/covariance.rst
index 37512d86e6..12cc1706ce 100644
--- a/doc/modules/covariance.rst
+++ b/doc/modules/covariance.rst
@@ -133,7 +133,7 @@ with the :meth:`oas` function of the `sklearn.covariance`
 package, or it can be otherwise obtained by fitting an :class:`OAS`
 object to the same sample.
 
-.. figure:: ../auto_examples/covariance/images/plot_covariance_estimation_001.png
+.. figure:: ../auto_examples/covariance/images/sphx_glr_plot_covariance_estimation_001.png
    :target: ../auto_examples/covariance/plot_covariance_estimation.html
    :align: center
    :scale: 65%
@@ -155,7 +155,7 @@ object to the same sample.
      an :class:`OAS` estimator of the covariance.
 
 
-.. figure:: ../auto_examples/covariance/images/plot_lw_vs_oas_001.png
+.. figure:: ../auto_examples/covariance/images/sphx_glr_plot_lw_vs_oas_001.png
    :target: ../auto_examples/covariance/plot_lw_vs_oas.html
    :align: center
    :scale: 75%
@@ -187,7 +187,7 @@ the precision matrix: the higher its ``alpha`` parameter, the more sparse
 the precision matrix. The corresponding :class:`GraphLassoCV` object uses
 cross-validation to automatically set the ``alpha`` parameter.
 
-.. figure:: ../auto_examples/covariance/images/plot_sparse_cov_001.png
+.. figure:: ../auto_examples/covariance/images/sphx_glr_plot_sparse_cov_001.png
    :target: ../auto_examples/covariance/plot_sparse_cov.html
    :align: center
    :scale: 60%
@@ -309,11 +309,11 @@ attributes of a :class:`MinCovDet` robust covariance estimator object.
      :class:`MinCovDet` covariance estimators in terms of Mahalanobis distance
      (so we get a better estimate of the precision matrix too).
 
-.. |robust_vs_emp| image:: ../auto_examples/covariance/images/plot_robust_vs_empirical_covariance_001.png
+.. |robust_vs_emp| image:: ../auto_examples/covariance/images/sphx_glr_plot_robust_vs_empirical_covariance_001.png
    :target: ../auto_examples/covariance/plot_robust_vs_empirical_covariance.html
    :scale: 49%
 
-.. |mahalanobis| image:: ../auto_examples/covariance/images/plot_mahalanobis_distances_001.png
+.. |mahalanobis| image:: ../auto_examples/covariance/images/sphx_glr_plot_mahalanobis_distances_001.png
    :target: ../auto_examples/covariance/plot_mahalanobis_distances.html
    :scale: 49%
 
diff --git a/doc/modules/cross_decomposition.rst b/doc/modules/cross_decomposition.rst
index c55a216845..1dda2e2ef0 100644
--- a/doc/modules/cross_decomposition.rst
+++ b/doc/modules/cross_decomposition.rst
@@ -13,7 +13,7 @@ These families of algorithms are useful to find linear relations between two
 multivariate datasets: the ``X`` and ``Y`` arguments of the ``fit`` method
 are 2D arrays.
 
-.. figure:: ../auto_examples/cross_decomposition/images/plot_compare_cross_decomposition_001.png
+.. figure:: ../auto_examples/cross_decomposition/images/sphx_glr_plot_compare_cross_decomposition_001.png
    :target: ../auto_examples/cross_decomposition/plot_compare_cross_decomposition.html
    :scale: 75%
    :align: center
diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index c08b9d0c8c..2879c8e4f2 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -33,7 +33,7 @@ clustering algorithm.
 Below is an example of the iris dataset, which is comprised of 4
 features, projected on the 2 dimensions that explain most variance:
 
-.. figure:: ../auto_examples/decomposition/images/plot_pca_vs_lda_001.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_pca_vs_lda_001.png
     :target: ../auto_examples/decomposition/plot_pca_vs_lda.html
     :align: center
     :scale: 75%
@@ -44,7 +44,7 @@ probabilistic interpretation of the PCA that can give a likelihood of
 data based on the amount of variance it explains. As such it implements a
 `score` method that can be used in cross-validation:
 
-.. figure:: ../auto_examples/decomposition/images/plot_pca_vs_fa_model_selection_001.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_pca_vs_fa_model_selection_001.png
     :target: ../auto_examples/decomposition/plot_pca_vs_fa_model_selection.html
     :align: center
     :scale: 75%
@@ -80,12 +80,12 @@ in order update ``explained_variance_ratio_`` incrementally. This is why
 memory usage depends on the number of samples per batch, rather than the
 number of samples to be processed in the dataset.
 
-.. figure:: ../auto_examples/decomposition/images/plot_incremental_pca_001.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_incremental_pca_001.png
     :target: ../auto_examples/decomposition/plot_incremental_pca.html
     :align: center
     :scale: 75%
 
-.. figure:: ../auto_examples/decomposition/images/plot_incremental_pca_002.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_incremental_pca_002.png
     :target: ../auto_examples/decomposition/plot_incremental_pca.html
     :align: center
     :scale: 75%
@@ -128,11 +128,11 @@ singular vectors reshaped as portraits. Since we only require the top
 and :math:`n_{features} = 64 \times 64 = 4096`, the computation time is
 less than 1s:
 
-.. |orig_img| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_001.png
+.. |orig_img| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_001.png
    :target: ../auto_examples/decomposition/plot_faces_decomposition.html
    :scale: 60%
 
-.. |pca_img| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_002.png
+.. |pca_img| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_002.png
    :target: ../auto_examples/decomposition/plot_faces_decomposition.html
    :scale: 60%
 
@@ -181,7 +181,7 @@ has many applications including denoising, compression and structured
 prediction (kernel dependency estimation). :class:`KernelPCA` supports both
 ``transform`` and ``inverse_transform``.
 
-.. figure:: ../auto_examples/decomposition/images/plot_kernel_pca_001.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png
     :target: ../auto_examples/decomposition/plot_kernel_pca.html
     :align: center
     :scale: 75%
@@ -231,7 +231,7 @@ norms that take into account adjacency and different kinds of structure; see
 For more details on how to use Sparse PCA, see the Examples section, below.
 
 
-.. |spca_img| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_005.png
+.. |spca_img| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_005.png
    :target: ../auto_examples/decomposition/plot_faces_decomposition.html
    :scale: 60%
 
@@ -435,11 +435,11 @@ dictionary fixed, and then updating the dictionary to best fit the sparse code.
                 0 \leq k < n_{atoms}
 
 
-.. |pca_img2| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_002.png
+.. |pca_img2| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_002.png
    :target: ../auto_examples/decomposition/plot_faces_decomposition.html
    :scale: 60%
 
-.. |dict_img2| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_006.png
+.. |dict_img2| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_006.png
    :target: ../auto_examples/decomposition/plot_faces_decomposition.html
    :scale: 60%
 
@@ -454,7 +454,7 @@ The following image shows how a dictionary learned from 4x4 pixel image patches
 extracted from part of the image of a raccoon face looks like.
 
 
-.. figure:: ../auto_examples/decomposition/images/plot_image_denoising_001.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_image_denoising_001.png
     :target: ../auto_examples/decomposition/plot_image_denoising.html
     :align: center
     :scale: 50%
@@ -492,7 +492,7 @@ does not fit into the memory.
 
 .. currentmodule:: sklearn.cluster
 
-.. image:: ../auto_examples/cluster/images/plot_dict_face_patches_001.png
+.. image:: ../auto_examples/cluster/images/sphx_glr_plot_dict_face_patches_001.png
     :target: ../auto_examples/cluster/plot_dict_face_patches.html
     :scale: 50%
     :align: right
@@ -567,11 +567,11 @@ Factor analysis *can* produce similar components (the columns of its loading
 matrix) to :class:`PCA`. However, one can not make any general statements
 about these components (e.g. whether they are orthogonal):
 
-.. |pca_img3| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_002.png
+.. |pca_img3| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_002.png
     :target: ../auto_examples/decomposition/plot_faces_decomposition.html
     :scale: 60%
 
-.. |fa_img3| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_009.png
+.. |fa_img3| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_009.png
     :target: ../auto_examples/decomposition/plot_faces_decomposition.html
     :scale: 60%
 
@@ -581,7 +581,7 @@ The main advantage for Factor Analysis (over :class:`PCA` is that
 it can model the variance in every direction of the input space independently
 (heteroscedastic noise):
 
-.. figure:: ../auto_examples/decomposition/images/plot_faces_decomposition_008.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_008.png
     :target: ../auto_examples/decomposition/plot_faces_decomposition.html
     :align: center
     :scale: 75%
@@ -589,7 +589,7 @@ it can model the variance in every direction of the input space independently
 This allows better model selection than probabilistic PCA in the presence
 of heteroscedastic noise:
 
-.. figure:: ../auto_examples/decomposition/images/plot_pca_vs_fa_model_selection_002.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_pca_vs_fa_model_selection_002.png
     :target: ../auto_examples/decomposition/plot_pca_vs_fa_model_selection.html
     :align: center
     :scale: 75%
@@ -616,7 +616,7 @@ of the PCA variants.
 It is classically used to separate mixed signals (a problem known as
 *blind source separation*), as in the example below:
 
-.. figure:: ../auto_examples/decomposition/images/plot_ica_blind_source_separation_001.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_ica_blind_source_separation_001.png
     :target: ../auto_examples/decomposition/plot_ica_blind_source_separation.html
     :align: center
     :scale: 60%
@@ -625,11 +625,11 @@ It is classically used to separate mixed signals (a problem known as
 ICA can also be used as yet another non linear decomposition that finds
 components with some sparsity:
 
-.. |pca_img4| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_002.png
+.. |pca_img4| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_002.png
     :target: ../auto_examples/decomposition/plot_faces_decomposition.html
     :scale: 60%
 
-.. |ica_img4| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_004.png
+.. |ica_img4| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_004.png
     :target: ../auto_examples/decomposition/plot_faces_decomposition.html
     :scale: 60%
 
@@ -673,11 +673,11 @@ resulting in interpretable models. The following example displays 16
 sparse components found by :class:`NMF` from the images in the Olivetti
 faces dataset, in comparison with the PCA eigenfaces.
 
-.. |pca_img5| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_002.png
+.. |pca_img5| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_002.png
     :target: ../auto_examples/decomposition/plot_faces_decomposition.html
     :scale: 60%
 
-.. |nmf_img5| image:: ../auto_examples/decomposition/images/plot_faces_decomposition_003.png
+.. |nmf_img5| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_003.png
     :target: ../auto_examples/decomposition/plot_faces_decomposition.html
     :scale: 60%
 
diff --git a/doc/modules/density.rst b/doc/modules/density.rst
index 66a679eaa4..05b885ca77 100644
--- a/doc/modules/density.rst
+++ b/doc/modules/density.rst
@@ -24,7 +24,7 @@ A histogram is a simple visualization of data where bins are defined, and the
 number of data points within each bin is tallied.  An example of a histogram
 can be seen in the upper-left panel of the following figure:
 
-.. |hist_to_kde| image:: ../auto_examples/neighbors/images/plot_kde_1d_001.png
+.. |hist_to_kde| image:: ../auto_examples/neighbors/images/sphx_glr_plot_kde_1d_001.png
    :target: ../auto_examples/neighbors/plot_kde_1d.html
    :scale: 80
 
@@ -68,7 +68,7 @@ dimensionality causes its performance to degrade in high dimensions.
 In the following figure, 100 points are drawn from a bimodal distribution,
 and the kernel density estimates are shown for three choices of kernels:
 
-.. |kde_1d_distribution| image:: ../auto_examples/neighbors/images/plot_kde_1d_003.png
+.. |kde_1d_distribution| image:: ../auto_examples/neighbors/images/sphx_glr_plot_kde_1d_003.png
    :target: ../auto_examples/neighbors/plot_kde_1d.html
    :scale: 80
 
@@ -103,7 +103,7 @@ to an unsmooth (i.e. high-variance) density distribution.
 :class:`sklearn.neighbors.KernelDensity` implements several common kernel
 forms, which are shown in the following figure:
 
-.. |kde_kernels| image:: ../auto_examples/neighbors/images/plot_kde_1d_002.png
+.. |kde_kernels| image:: ../auto_examples/neighbors/images/sphx_glr_plot_kde_1d_002.png
    :target: ../auto_examples/neighbors/plot_kde_1d.html
    :scale: 80
 
@@ -145,7 +145,7 @@ is an example of using a kernel density estimate for a visualization
 of geospatial data, in this case the distribution of observations of two
 different species on the South American continent:
 
-.. |species_kde| image:: ../auto_examples/neighbors/images/plot_species_kde_001.png
+.. |species_kde| image:: ../auto_examples/neighbors/images/sphx_glr_plot_species_kde_001.png
    :target: ../auto_examples/neighbors/plot_species_kde.html
    :scale: 80
 
@@ -158,7 +158,7 @@ Here is an example of using this process to
 create a new set of hand-written digits, using a Gaussian kernel learned
 on a PCA projection of the data:
 
-.. |digits_kde| image:: ../auto_examples/neighbors/images/plot_digits_kde_sampling_001.png
+.. |digits_kde| image:: ../auto_examples/neighbors/images/sphx_glr_plot_digits_kde_sampling_001.png
    :target: ../auto_examples/neighbors/plot_digits_kde_sampling.html
    :scale: 80
 
diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index ad31d172c3..6df2e07654 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -183,7 +183,7 @@ in bias::
     >>> scores.mean() > 0.999
     True
 
-.. figure:: ../auto_examples/ensemble/images/plot_forest_iris_001.png
+.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_forest_iris_001.png
     :target: ../auto_examples/ensemble/plot_forest_iris.html
     :align: center
     :scale: 75%
@@ -263,7 +263,7 @@ The following example shows a color-coded representation of the relative
 importances of each individual pixel for a face recognition task using
 a :class:`ExtraTreesClassifier` model.
 
-.. figure:: ../auto_examples/ensemble/images/plot_forest_importances_faces_001.png
+.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_forest_importances_faces_001.png
    :target: ../auto_examples/ensemble/plot_forest_importances_faces.html
    :align: center
    :scale: 75
@@ -342,7 +342,7 @@ ever-increasing influence. Each subsequent weak learner is thereby forced to
 concentrate on the examples that are missed by the previous ones in the sequence
 [HTF]_.
 
-.. figure:: ../auto_examples/ensemble/images/plot_adaboost_hastie_10_2_001.png
+.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_adaboost_hastie_10_2_001.png
    :target: ../auto_examples/ensemble/plot_adaboost_hastie_10_2.html
    :align: center
    :scale: 75
@@ -506,7 +506,7 @@ to determine the optimal number of trees (i.e. ``n_estimators``) by early stoppi
 The plot on the right shows the feature importances which can be obtained via
 the ``feature_importances_`` property.
 
-.. figure:: ../auto_examples/ensemble/images/plot_gradient_boosting_regression_001.png
+.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_gradient_boosting_regression_001.png
    :target: ../auto_examples/ensemble/plot_gradient_boosting_regression.html
    :align: center
    :scale: 75
@@ -707,7 +707,7 @@ outperforms no-shrinkage. Subsampling with shrinkage can further increase
 the accuracy of the model. Subsampling without shrinkage, on the other hand,
 does poorly.
 
-.. figure:: ../auto_examples/ensemble/images/plot_gradient_boosting_regularization_001.png
+.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_gradient_boosting_regularization_001.png
    :target: ../auto_examples/ensemble/plot_gradient_boosting_regularization.html
    :align: center
    :scale: 75
@@ -801,7 +801,7 @@ usually chosen among the most important features.
 The Figure below shows four one-way and one two-way partial dependence plots
 for the California housing dataset:
 
-.. figure:: ../auto_examples/ensemble/images/plot_partial_dependence_001.png
+.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_partial_dependence_001.png
    :target: ../auto_examples/ensemble/plot_partial_dependence.html
    :align: center
    :scale: 70
@@ -1030,7 +1030,7 @@ Vector Machine, a Decision Tree, and a K-nearest neighbor classifier::
    >>> clf3 = clf3.fit(X,y)
    >>> eclf = eclf.fit(X,y)
 
-.. figure:: ../auto_examples/ensemble/images/plot_voting_decision_regions_001.png
+.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_voting_decision_regions_001.png
     :target: ../auto_examples/ensemble/plot_voting_decision_regions.html
     :align: center
     :scale: 75%
diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst
index c01d726804..b8de3916ce 100644
--- a/doc/modules/feature_extraction.rst
+++ b/doc/modules/feature_extraction.rst
@@ -892,7 +892,7 @@ features or samples. For instance Ward clustering
 (:ref:`hierarchical_clustering`) can cluster together only neighboring pixels
 of an image, thus forming contiguous patches:
 
-.. figure:: ../auto_examples/cluster/images/plot_face_ward_segmentation_001.png
+.. figure:: ../auto_examples/cluster/images/sphx_glr_plot_face_ward_segmentation_001.png
    :target: ../auto_examples/cluster/plot_face_ward_segmentation.html
    :align: center
    :scale: 40
diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst
index 7826cb923f..3cb908fca8 100644
--- a/doc/modules/feature_selection.rst
+++ b/doc/modules/feature_selection.rst
@@ -264,7 +264,7 @@ settings, using the Lasso, while :class:`RandomizedLogisticRegression` uses the
 logistic regression and is suitable for classification tasks. To get a full
 path of stability scores you can use :func:`lasso_stability_path`.
 
-.. figure:: ../auto_examples/linear_model/images/plot_sparse_recovery_003.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_sparse_recovery_003.png
    :target: ../auto_examples/linear_model/plot_sparse_recovery.html
    :align: center
    :scale: 60
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 44e4eec877..52211a154d 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -88,14 +88,14 @@ estimate the noise level of data. An illustration of the
 log-marginal-likelihood (LML) landscape shows that there exist two local
 maxima of LML.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_000.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_000.png
    :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html
    :align: center
 
 The first corresponds to a model with a high noise level and a
 large length scale, which explains all variations in the data by noise.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_001.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_001.png
    :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html
    :align: center
 
@@ -106,7 +106,7 @@ hyperparameters, the gradient-based optimization might also converge to the
 high-noise solution. It is thus important to repeat the optimization several
 times for different initializations.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_002.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_002.png
    :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html
    :align: center
 
@@ -142,7 +142,7 @@ Moreover, the noise level
 of the data is learned explicitly by GPR by an additional WhiteKernel component
 in the kernel and by the regularization parameter alpha of KRR.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_compare_gpr_krr_001.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_compare_gpr_krr_001.png
    :target: ../auto_examples/gaussian_process/plot_compare_gpr_krr.html
    :align: center
 
@@ -220,7 +220,7 @@ overall noise level is very small, indicating that the data can be very well
 explained by the model. The figure shows also that the model makes very
 confident predictions until around 2015
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_co2_001.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_co2_001.png
    :target: ../auto_examples/gaussian_process/plot_gpr_co2.html
    :align: center
 
@@ -306,11 +306,11 @@ The second figure shows the log-marginal-likelihood for different choices of
 the kernel's hyperparameters, highlighting the two choices of the
 hyperparameters used in the first figure by black dots.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_000.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpc_000.png
    :target: ../auto_examples/gaussian_process/plot_gpc.html
    :align: center
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_001.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpc_001.png
    :target: ../auto_examples/gaussian_process/plot_gpc.html
    :align: center
 
@@ -326,7 +326,7 @@ dataset, the `DotProduct` kernel obtains considerably better results because the
 class-boundaries are linear and coincide with the coordinate axes. In practice,
 however, stationary kernels such as :class:`RBF` often obtain better results.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_xor_001.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpc_xor_001.png
    :target: ../auto_examples/gaussian_process/plot_gpc_xor.html
    :align: center
 
@@ -342,7 +342,7 @@ This illustrates the applicability of GPC to non-binary classification.
 The anisotropic RBF kernel obtains slightly higher log-marginal-likelihood by
 assigning different length-scales to the two feature dimensions.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_iris_001.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpc_iris_001.png
    :target: ../auto_examples/gaussian_process/plot_gpc_iris.html
    :align: center
 
@@ -493,7 +493,7 @@ kernel as covariance function have mean square derivatives of all orders, and ar
 very smooth. The prior and posterior of a GP resulting from an RBF kernel are shown in
 the following figure:
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_000.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_prior_posterior_000.png
    :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
    :align: center
 
@@ -534,7 +534,7 @@ allows adapting to the properties of the true underlying functional relation.
 The prior and posterior of a GP resulting from a Matérn kernel are shown in
 the following figure:
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_004.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_prior_posterior_004.png
    :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
    :align: center
 
@@ -556,7 +556,7 @@ The kernel is given by:
 The prior and posterior of a GP resulting from an RBF kernel are shown in
 the following figure:
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_001.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_prior_posterior_001.png
    :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
    :align: center
 
@@ -574,7 +574,7 @@ The kernel is given by:
 The prior and posterior of a GP resulting from an ExpSineSquared kernel are shown in
 the following figure:
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_002.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_prior_posterior_002.png
    :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
    :align: center
 
@@ -594,7 +594,7 @@ is called the homogeneous linear kernel, otherwise it is inhomogeneous. The kern
 The :class:`DotProduct` kernel is commonly combined with exponentiation. An example with exponent 2 is
 shown in the following figure:
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_003.png
+.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_prior_posterior_003.png
    :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
    :align: center
 
diff --git a/doc/modules/isotonic.rst b/doc/modules/isotonic.rst
index c12fc3d4f2..097da60584 100644
--- a/doc/modules/isotonic.rst
+++ b/doc/modules/isotonic.rst
@@ -18,6 +18,6 @@ arbitrary real number. It yields the vector which is composed of non-decreasing
 elements the closest in terms of mean squared error. In practice this list
 of elements forms a function that is piecewise linear.
 
-.. figure:: ../auto_examples/images/plot_isotonic_regression_001.png
+.. figure:: ../auto_examples/images/sphx_glr_plot_isotonic_regression_001.png
    :target: ../auto_examples/plot_isotonic_regression.html
    :align: center
diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst
index fd0fe7be0b..79ae90d0e5 100644
--- a/doc/modules/kernel_approximation.rst
+++ b/doc/modules/kernel_approximation.rst
@@ -88,7 +88,7 @@ For a given value of ``n_components`` :class:`RBFSampler` is often less accurate
 as :class:`Nystroem`. :class:`RBFSampler` is cheaper to compute, though, making
 use of larger feature spaces more efficient.
 
-.. figure:: ../auto_examples/images/plot_kernel_approximation_002.png
+.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_approximation_002.png
     :target: ../auto_examples/plot_kernel_approximation.html
     :scale: 50%
     :align: center
diff --git a/doc/modules/kernel_ridge.rst b/doc/modules/kernel_ridge.rst
index 6016604755..3d032b52bb 100644
--- a/doc/modules/kernel_ridge.rst
+++ b/doc/modules/kernel_ridge.rst
@@ -32,7 +32,7 @@ using grid-search. The learned functions are very similar; however, fitting
 than three times faster with SVR since it has learned a sparse model using only
 approx. 1/3 of the 100 training datapoints as support vectors.
 
-.. figure:: ../auto_examples/images/plot_kernel_ridge_regression_001.png
+.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_001.png
    :target: ../auto_examples/plot_kernel_ridge_regression.html
    :align: center
 
@@ -46,7 +46,7 @@ the learned sparse solution. Note that the degree of sparsity and thus the
 prediction time depends on the parameters :math:`\epsilon` and :math:`C` of the
 :class:`SVR`; :math:`\epsilon = 0` would correspond to a dense model.
 
-.. figure:: ../auto_examples/images/plot_kernel_ridge_regression_002.png
+.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_002.png
    :target: ../auto_examples/plot_kernel_ridge_regression.html
    :align: center
 
diff --git a/doc/modules/label_propagation.rst b/doc/modules/label_propagation.rst
index f4af92eff5..80b65e3f4f 100644
--- a/doc/modules/label_propagation.rst
+++ b/doc/modules/label_propagation.rst
@@ -37,7 +37,7 @@ A few features available in this model:
 :class:`LabelPropagation` and :class:`LabelSpreading`. Both work by
 constructing a similarity graph over all items in the input dataset. 
 
-.. figure:: ../auto_examples/semi_supervised/images/plot_label_propagation_structure_001.png
+.. figure:: ../auto_examples/semi_supervised/images/sphx_glr_plot_label_propagation_structure_001.png
     :target: ../auto_examples/semi_supervised/plot_label_propagation_structure.html
     :align: center
     :scale: 60%
diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst
index f399db28d7..f47a883d0e 100644
--- a/doc/modules/lda_qda.rst
+++ b/doc/modules/lda_qda.rst
@@ -17,7 +17,7 @@ These classifiers are attractive because they have closed-form solutions that
 can be easily computed, are inherently multiclass, have proven to work well in
 practice and have no hyperparameters to tune.
 
-.. |ldaqda| image:: ../auto_examples/classification/images/plot_lda_qda_001.png
+.. |ldaqda| image:: ../auto_examples/classification/images/sphx_glr_plot_lda_qda_001.png
         :target: ../auto_examples/classification/plot_lda_qda.html
         :scale: 80
 
@@ -148,7 +148,7 @@ an estimate for the covariance matrix). Setting this parameter to a value
 between these two extrema will estimate a shrunk version of the covariance
 matrix.
 
-.. |shrinkage| image:: ../auto_examples/classification/images/plot_lda_001.png
+.. |shrinkage| image:: ../auto_examples/classification/images/sphx_glr_plot_lda_001.png
         :target: ../auto_examples/classification/plot_lda.html
         :scale: 75
 
diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst
index 39ecbcbe76..ef6d2fb47f 100644
--- a/doc/modules/learning_curve.rst
+++ b/doc/modules/learning_curve.rst
@@ -21,7 +21,7 @@ the second estimator approximates it almost perfectly and the last estimator
 approximates the training data perfectly but does not fit the true function
 very well, i.e. it is very sensitive to varying training data (high variance).
 
-.. figure:: ../auto_examples/model_selection/images/plot_underfitting_overfitting_001.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_underfitting_overfitting_001.png
    :target: ../auto_examples/model_selection/plot_underfitting_overfitting.html
    :align: center
    :scale: 50%
@@ -98,7 +98,7 @@ training score and a high validation score is usually not possible. All three
 cases can be found in the plot below where we vary the parameter
 :math:`\gamma` of an SVM on the digits dataset.
 
-.. figure:: ../auto_examples/model_selection/images/plot_validation_curve_001.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_validation_curve_001.png
    :target: ../auto_examples/model_selection/plot_validation_curve.html
    :align: center
    :scale: 50%
@@ -118,7 +118,7 @@ size of the training set, we will not benefit much from more training data.
 In the following plot you can see an example: naive Bayes roughly converges
 to a low score.
 
-.. figure:: ../auto_examples/model_selection/images/plot_learning_curve_001.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_learning_curve_001.png
    :target: ../auto_examples/model_selection/plot_learning_curve.html
    :align: center
    :scale: 50%
@@ -130,7 +130,7 @@ the maximum number of training samples, adding more training samples will
 most likely increase generalization. In the following plot you can see that
 the SVM could benefit from more training examples.
 
-.. figure:: ../auto_examples/model_selection/images/plot_learning_curve_002.png
+.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_learning_curve_002.png
    :target: ../auto_examples/model_selection/plot_learning_curve.html
    :align: center
    :scale: 50%
diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index 8344233f61..ce0ca79441 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -33,7 +33,7 @@ solves a problem of the form:
 
 .. math:: \underset{w}{min\,} {|| X w - y||_2}^2
 
-.. figure:: ../auto_examples/linear_model/images/plot_ols_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_ols_001.png
    :target: ../auto_examples/linear_model/plot_ols.html
    :align: center
    :scale: 50%
@@ -90,7 +90,7 @@ Here, :math:`\alpha \geq 0` is a complexity parameter that controls the amount
 of shrinkage: the larger the value of :math:`\alpha`, the greater the amount
 of shrinkage and thus the coefficients become more robust to collinearity.
 
-.. figure:: ../auto_examples/linear_model/images/plot_ridge_path_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_ridge_path_001.png
    :target: ../auto_examples/linear_model/plot_ridge_path.html
    :align: center
    :scale: 50%
@@ -231,11 +231,11 @@ the advantage of exploring more relevant values of `alpha` parameter, and
 if the number of samples is very small compared to the number of
 observations, it is often faster than :class:`LassoCV`.
 
-.. |lasso_cv_1| image:: ../auto_examples/linear_model/images/plot_lasso_model_selection_002.png
+.. |lasso_cv_1| image:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_model_selection_002.png
     :target: ../auto_examples/linear_model/plot_lasso_model_selection.html
     :scale: 48%
 
-.. |lasso_cv_2| image:: ../auto_examples/linear_model/images/plot_lasso_model_selection_003.png
+.. |lasso_cv_2| image:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_model_selection_003.png
     :target: ../auto_examples/linear_model/plot_lasso_model_selection.html
     :scale: 48%
 
@@ -256,7 +256,7 @@ is correct, i.e. that the data are actually generated by this model.
 They also tend to break when the problem is badly conditioned
 (more features than samples).
 
-.. figure:: ../auto_examples/linear_model/images/plot_lasso_model_selection_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_model_selection_001.png
     :target: ../auto_examples/linear_model/plot_lasso_model_selection.html
     :align: center
     :scale: 50%
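
A minimal sketch comparing a cross-validated choice of ``alpha`` with an
information-criterion based one (the diabetes data is used purely for
illustration)::

    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import LassoCV, LassoLarsIC

    diabetes = load_diabetes()
    X, y = diabetes.data, diabetes.target

    # Cross-validation based choice of the regularization strength.
    model_cv = LassoCV(cv=5).fit(X, y)
    # BIC-based choice; only reliable when samples clearly outnumber features.
    model_bic = LassoLarsIC(criterion='bic').fit(X, y)
    print(model_cv.alpha_, model_bic.alpha_)
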
@@ -282,11 +282,11 @@ with a simple Lasso or a MultiTaskLasso. The Lasso estimates yield
 scattered non-zeros while the non-zeros of the MultiTaskLasso are full
 columns.
 
-.. |multi_task_lasso_1| image:: ../auto_examples/linear_model/images/plot_multi_task_lasso_support_001.png
+.. |multi_task_lasso_1| image:: ../auto_examples/linear_model/images/sphx_glr_plot_multi_task_lasso_support_001.png
     :target: ../auto_examples/linear_model/plot_multi_task_lasso_support.html
     :scale: 48%
 
-.. |multi_task_lasso_2| image:: ../auto_examples/linear_model/images/plot_multi_task_lasso_support_002.png
+.. |multi_task_lasso_2| image:: ../auto_examples/linear_model/images/sphx_glr_plot_multi_task_lasso_support_002.png
     :target: ../auto_examples/linear_model/plot_multi_task_lasso_support.html
     :scale: 48%
 
@@ -343,7 +343,7 @@ The objective function to minimize is in this case
     \frac{\alpha(1-\rho)}{2} ||w||_2 ^ 2}
 
 
-.. figure:: ../auto_examples/linear_model/images/plot_lasso_coordinate_descent_path_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_coordinate_descent_path_001.png
    :target: ../auto_examples/linear_model/plot_lasso_coordinate_descent_path.html
    :align: center
    :scale: 50%
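
In the estimator, the mixing parameter :math:`\rho` above corresponds to the
``l1_ratio`` argument; a minimal sketch with arbitrary toy data::

    from sklearn.linear_model import ElasticNet

    # alpha scales the whole penalty, l1_ratio is the L1/L2 mixing parameter rho.
    enet = ElasticNet(alpha=0.1, l1_ratio=0.7)
    enet.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
    print(enet.coef_, enet.intercept_)
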
@@ -433,7 +433,7 @@ algorithm, and unlike the implementation based on coordinate_descent,
 this yields the exact solution, which is piecewise linear as a
 function of the norm of its coefficients.
 
-.. figure:: ../auto_examples/linear_model/images/plot_lasso_lars_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_lars_001.png
    :target: ../auto_examples/linear_model/plot_lasso_lars.html
    :align: center
    :scale: 50%
@@ -593,7 +593,7 @@ log likelihood*.
 By default :math:`\alpha_1 = \alpha_2 = \lambda_1 = \lambda_2 = 10^{-6}`.
 
 
-.. figure:: ../auto_examples/linear_model/images/plot_bayesian_ridge_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_bayesian_ridge_001.png
    :target: ../auto_examples/linear_model/plot_bayesian_ridge.html
    :align: center
    :scale: 50%
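
A minimal sketch showing that these hyperparameters are ordinary constructor
arguments (the values written out are simply the documented defaults)::

    from sklearn.linear_model import BayesianRidge

    reg = BayesianRidge(alpha_1=1e-6, alpha_2=1e-6,
                        lambda_1=1e-6, lambda_2=1e-6)
    reg.fit([[0., 0.], [1., 1.], [2., 2.]], [0., 1., 2.])
    print(reg.coef_, reg.alpha_, reg.lambda_)
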
@@ -660,7 +660,7 @@ has its own standard deviation :math:`\lambda_i`. The prior over all
 :math:`\lambda_i` is chosen to be the same gamma distribution given by
 hyperparameters :math:`\lambda_1` and :math:`\lambda_2`.
 
-.. figure:: ../auto_examples/linear_model/images/plot_ard_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_ard_001.png
    :target: ../auto_examples/linear_model/plot_ard.html
    :align: center
    :scale: 50%
@@ -857,7 +857,7 @@ Robustness regression: outliers and modeling errors
 Robust regression is interested in fitting a regression model in the
 presence of corrupt data: either outliers, or error in the model.
 
-.. figure:: ../auto_examples/linear_model/images/plot_theilsen_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_theilsen_001.png
    :target: ../auto_examples/linear_model/plot_theilsen.html
    :scale: 50%
    :align: center
@@ -868,15 +868,15 @@ Different scenarios and useful concepts
 There are different things to keep in mind when dealing with data
 corrupted by outliers:
 
-.. |y_outliers| image:: ../auto_examples/linear_model/images/plot_robust_fit_003.png
+.. |y_outliers| image:: ../auto_examples/linear_model/images/sphx_glr_plot_robust_fit_003.png
    :target: ../auto_examples/linear_model/plot_robust_fit.html
    :scale: 60%
 
-.. |X_outliers| image:: ../auto_examples/linear_model/images/plot_robust_fit_002.png
+.. |X_outliers| image:: ../auto_examples/linear_model/images/sphx_glr_plot_robust_fit_002.png
    :target: ../auto_examples/linear_model/plot_robust_fit.html
    :scale: 60%
 
-.. |large_y_outliers| image:: ../auto_examples/linear_model/images/plot_robust_fit_005.png
+.. |large_y_outliers| image:: ../auto_examples/linear_model/images/sphx_glr_plot_robust_fit_005.png
    :target: ../auto_examples/linear_model/plot_robust_fit.html
    :scale: 60%
 
@@ -954,7 +954,7 @@ which may be subject to noise, and outliers, which are e.g. caused by erroneous
 measurements or invalid hypotheses about the data. The resulting model is then
 estimated only from the determined inliers.
 
-.. figure:: ../auto_examples/linear_model/images/plot_ransac_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_ransac_001.png
    :target: ../auto_examples/linear_model/plot_ransac.html
    :align: center
    :scale: 50%
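
A minimal sketch of that inlier/outlier split on synthetic data, using an
ordinary least-squares base estimator::

    import numpy as np
    from sklearn.linear_model import LinearRegression, RANSACRegressor

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 1))
    y = 3 * X.ravel() + rng.normal(size=100)
    y[:10] += 50                       # a handful of gross outliers

    ransac = RANSACRegressor(LinearRegression(), random_state=0)
    ransac.fit(X, y)
    print(ransac.estimator_.coef_)     # refit on the detected inliers only
    print(ransac.inlier_mask_.sum(), "samples kept as inliers")
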
@@ -1037,7 +1037,7 @@ setting, Theil-Sen has a breakdown point of about 29.3% in case of a
 simple linear regression, which means that it can tolerate arbitrarily
 corrupted data of up to 29.3%.
 
-.. figure:: ../auto_examples/linear_model/images/plot_theilsen_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_theilsen_001.png
    :target: ../auto_examples/linear_model/plot_theilsen.html
    :align: center
    :scale: 50%
@@ -1170,7 +1170,7 @@ flexibility to fit a much broader range of data.
 Here is an example of applying this idea to one-dimensional data, using
 polynomial features of varying degrees:
 
-.. figure:: ../auto_examples/linear_model/images/plot_polynomial_interpolation_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_polynomial_interpolation_001.png
    :target: ../auto_examples/linear_model/plot_polynomial_interpolation.html
    :align: center
    :scale: 50%
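
A minimal sketch of that construction, chaining a polynomial expansion with
ordinary least squares (degree and toy target chosen arbitrarily)::

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import PolynomialFeatures

    x = np.linspace(0, 1, 20)[:, np.newaxis]
    y = np.cos(1.5 * np.pi * x).ravel()

    # Expand to degree-4 polynomial features, then fit a linear model on them.
    model = make_pipeline(PolynomialFeatures(degree=4), LinearRegression())
    model.fit(x, y)
    print(model.predict([[0.5]]))
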
diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst
index c697fe76d8..4c09fe4fcc 100644
--- a/doc/modules/manifold.rst
+++ b/doc/modules/manifold.rst
@@ -20,7 +20,7 @@ Manifold learning
 
 
 
-.. figure:: ../auto_examples/manifold/images/plot_compare_methods_001.png
+.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_compare_methods_001.png
    :target: ../auto_examples/manifold/plot_compare_methods.html
    :align: center
    :scale: 60
@@ -46,11 +46,11 @@ to be desired.  In a random projection, it is likely that the more
 interesting structure within the data will be lost.
 
 
-.. |digits_img| image:: ../auto_examples/manifold/images/plot_lle_digits_001.png
+.. |digits_img| image:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_001.png
     :target: ../auto_examples/manifold/plot_lle_digits.html
     :scale: 50
 
-.. |projected_img| image::  ../auto_examples/manifold/images/plot_lle_digits_002.png
+.. |projected_img| image::  ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_002.png
     :target: ../auto_examples/manifold/plot_lle_digits.html
     :scale: 50
 
@@ -66,11 +66,11 @@ These methods can be powerful, but often miss important non-linear
 structure in the data.
 
 
-.. |PCA_img| image:: ../auto_examples/manifold/images/plot_lle_digits_003.png
+.. |PCA_img| image:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_003.png
     :target: ../auto_examples/manifold/plot_lle_digits.html
     :scale: 50
 
-.. |LDA_img| image::  ../auto_examples/manifold/images/plot_lle_digits_004.png
+.. |LDA_img| image::  ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_004.png
     :target: ../auto_examples/manifold/plot_lle_digits.html
     :scale: 50
 
@@ -106,7 +106,7 @@ Isomap seeks a lower-dimensional embedding which maintains geodesic
 distances between all points.  Isomap can be performed with the object
 :class:`Isomap`.
 
-.. figure:: ../auto_examples/manifold/images/plot_lle_digits_005.png
+.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_005.png
    :target: ../auto_examples/manifold/plot_lle_digits.html
    :align: center
    :scale: 50
@@ -163,7 +163,7 @@ Locally linear embedding can be performed with function
 :func:`locally_linear_embedding` or its object-oriented counterpart
 :class:`LocallyLinearEmbedding`.
 
-.. figure:: ../auto_examples/manifold/images/plot_lle_digits_006.png
+.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_006.png
    :target: ../auto_examples/manifold/plot_lle_digits.html
    :align: center
    :scale: 50
@@ -217,7 +217,7 @@ linear embedding* (MLLE).  MLLE can be  performed with function
 :class:`LocallyLinearEmbedding`, with the keyword ``method = 'modified'``.
 It requires ``n_neighbors > n_components``.
 
-.. figure:: ../auto_examples/manifold/images/plot_lle_digits_007.png
+.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_007.png
    :target: ../auto_examples/manifold/plot_lle_digits.html
    :align: center
    :scale: 50
@@ -267,7 +267,7 @@ for small output dimension.  HLLE can be  performed with function
 :class:`LocallyLinearEmbedding`, with the keyword ``method = 'hessian'``.
 It requires ``n_neighbors > n_components * (n_components + 3) / 2``.
 
-.. figure:: ../auto_examples/manifold/images/plot_lle_digits_008.png
+.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_008.png
    :target: ../auto_examples/manifold/plot_lle_digits.html
    :align: center
    :scale: 50
@@ -359,7 +359,7 @@ tangent spaces to learn the embedding.  LTSA can be performed with function
 :func:`locally_linear_embedding` or its object-oriented counterpart
 :class:`LocallyLinearEmbedding`, with the keyword ``method = 'ltsa'``.
 
-.. figure:: ../auto_examples/manifold/images/plot_lle_digits_009.png
+.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_009.png
    :target: ../auto_examples/manifold/plot_lle_digits.html
    :align: center
    :scale: 50
@@ -417,7 +417,7 @@ version, the algorithms will try to preserve the order of the distances, and
 hence seek a monotonic relationship between the distances in the embedded
 space and the similarities/dissimilarities.
 
-.. figure:: ../auto_examples/manifold/images/plot_lle_digits_010.png
+.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_010.png
    :target: ../auto_examples/manifold/plot_lle_digits.html
    :align: center
    :scale: 50
@@ -452,7 +452,7 @@ A trivial solution to this problem is to set all the points on the origin. In
 order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized.
 
 
-.. figure:: ../auto_examples/manifold/images/plot_mds_001.png
+.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_mds_001.png
    :target: ../auto_examples/manifold/plot_mds.html
    :align: center
    :scale: 60
@@ -513,7 +513,7 @@ The disadvantages to using t-SNE are roughly:
   initializing points with PCA (using `init='pca'`).
 
 
-.. figure:: ../auto_examples/manifold/images/plot_lle_digits_013.png
+.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_013.png
    :target: ../auto_examples/manifold/plot_lle_digits.html
    :align: center
    :scale: 50
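
A minimal sketch of such an embedding on the digits data, using the PCA
initialization mentioned in the caveats above::

    from sklearn.datasets import load_digits
    from sklearn.manifold import TSNE

    digits = load_digits()
    # init='pca' is usually more stable than the default random initialization.
    tsne = TSNE(n_components=2, init='pca', random_state=0)
    X_2d = tsne.fit_transform(digits.data)
    print(X_2d.shape)
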
diff --git a/doc/modules/mixture.rst b/doc/modules/mixture.rst
index 8249b217da..585ddca26b 100644
--- a/doc/modules/mixture.rst
+++ b/doc/modules/mixture.rst
@@ -14,7 +14,7 @@ matrices supported), sample them, and estimate them from
 data. Facilities to help determine the appropriate number of
 components are also provided.
 
- .. figure:: ../auto_examples/mixture/images/plot_gmm_pdf_001.png
+ .. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_pdf_001.png
    :target: ../auto_examples/mixture/plot_gmm_pdf.html
    :align: center
    :scale: 50%
@@ -55,7 +55,7 @@ The :class:`GaussianMixture` comes with different options to constrain the
 covariance of the different classes estimated: spherical, diagonal, tied or
 full covariance.
 
-.. figure:: ../auto_examples/mixture/images/plot_gmm_covariances_001.png
+.. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_covariances_001.png
    :target: ../auto_examples/mixture/plot_gmm_covariances.html
    :align: center
    :scale: 75%
@@ -102,7 +102,7 @@ only in the asymptotic regime (i.e. if much data is available).
 Note that using a :ref:`DPGMM <dpgmm>` avoids the specification of the
 number of components for a Gaussian mixture model.
 
-.. figure:: ../auto_examples/mixture/images/plot_gmm_selection_001.png
+.. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_selection_001.png
    :target: ../auto_examples/mixture/plot_gmm_selection.html
    :align: center
    :scale: 50%
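
A minimal sketch of BIC-based selection of the number of components on
synthetic two-blob data (the candidate range is arbitrary)::

    import numpy as np
    from sklearn.mixture import GaussianMixture

    rng = np.random.RandomState(0)
    X = np.vstack([rng.normal(0, 1, size=(200, 2)),
                   rng.normal(5, 1, size=(200, 2))])

    # Keep the number of components with the lowest BIC.
    bic = [GaussianMixture(n_components=n, covariance_type='full',
                           random_state=0).fit(X).bic(X)
           for n in range(1, 6)]
    print(np.argmin(bic) + 1, "components selected")
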
@@ -208,11 +208,11 @@ components, and at the expense of extra computational time the user
 only needs to specify a loose upper bound on this number and a
 concentration parameter.
 
-.. |plot_gmm| image:: ../auto_examples/mixture/images/plot_gmm_001.png
+.. |plot_gmm| image:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_001.png
    :target: ../auto_examples/mixture/plot_gmm.html
    :scale: 48%
 
-.. |plot_gmm_sin| image:: ../auto_examples/mixture/images/plot_gmm_sin_001.png
+.. |plot_gmm_sin| image:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_sin_001.png
    :target: ../auto_examples/mixture/plot_gmm_sin.html
    :scale: 48%
 
@@ -321,5 +321,3 @@ complexity, not the actual number of components used).
     :hidden:
 
     dp-derivation.rst
-
-
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 1e82c3943c..ba511a735b 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -395,7 +395,7 @@ predicted to be in group :math:`j`. Here is an example::
 Here is a visual representation of such a confusion matrix (this figure comes
 from the :ref:`example_model_selection_plot_confusion_matrix.py` example):
 
-.. image:: ../auto_examples/model_selection/images/plot_confusion_matrix_001.png
+.. image:: ../auto_examples/model_selection/images/sphx_glr_plot_confusion_matrix_001.png
    :target: ../auto_examples/model_selection/plot_confusion_matrix.html
    :scale: 75
    :align: center
@@ -935,7 +935,7 @@ Here is a small example of how to use the :func:`roc_curve` function::
 
 This figure shows an example of such an ROC curve:
 
-.. image:: ../auto_examples/model_selection/images/plot_roc_001.png
+.. image:: ../auto_examples/model_selection/images/sphx_glr_plot_roc_001.png
    :target: ../auto_examples/model_selection/plot_roc.html
    :scale: 75
    :align: center
@@ -963,7 +963,7 @@ F1 score, ROC doesn't require optimizing a threshold for each label. The
 if the predicted outputs have been binarized.
 
 
-.. image:: ../auto_examples/model_selection/images/plot_roc_002.png
+.. image:: ../auto_examples/model_selection/images/sphx_glr_plot_roc_002.png
    :target: ../auto_examples/model_selection/plot_roc.html
    :scale: 75
    :align: center
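
A minimal sketch of the :func:`roc_curve` / :func:`roc_auc_score` pair on a
toy binary problem::

    import numpy as np
    from sklearn.metrics import roc_auc_score, roc_curve

    y_true = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.35, 0.8])

    fpr, tpr, thresholds = roc_curve(y_true, scores)
    print(fpr, tpr)
    print(roc_auc_score(y_true, scores))   # area under the curve above
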
diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst
index dfaafe56d3..70aeff5d64 100644
--- a/doc/modules/multiclass.rst
+++ b/doc/modules/multiclass.rst
@@ -151,7 +151,7 @@ To use this feature, feed the classifier an indicator matrix, in which cell
 [i, j] indicates the presence of label j in sample i.
 
 
-.. figure:: ../auto_examples/images/plot_multilabel_001.png
+.. figure:: ../auto_examples/images/sphx_glr_plot_multilabel_001.png
     :target: ../auto_examples/plot_multilabel.html
     :align: center
     :scale: 75%
diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst
index 2a1d61ca8b..450e8e2475 100644
--- a/doc/modules/neighbors.rst
+++ b/doc/modules/neighbors.rst
@@ -184,11 +184,11 @@ distance can be supplied which is used to compute the weights.
 
 
 
-.. |classification_1| image:: ../auto_examples/neighbors/images/plot_classification_001.png
+.. |classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_001.png
    :target: ../auto_examples/neighbors/plot_classification.html
    :scale: 50
 
-.. |classification_2| image:: ../auto_examples/neighbors/images/plot_classification_002.png
+.. |classification_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_002.png
    :target: ../auto_examples/neighbors/plot_classification.html
    :scale: 50
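
A minimal sketch of the two weighting schemes compared in these figures, on
the iris data::

    from sklearn.datasets import load_iris
    from sklearn.neighbors import KNeighborsClassifier

    iris = load_iris()
    for weights in ('uniform', 'distance'):
        clf = KNeighborsClassifier(n_neighbors=15, weights=weights)
        clf.fit(iris.data, iris.target)
        print(weights, clf.score(iris.data, iris.target))
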
 
@@ -227,7 +227,7 @@ weights proportional to the inverse of the distance from the query point.
 Alternatively, a user-defined function of the distance can be supplied,
 which will be used to compute the weights.
 
-.. figure:: ../auto_examples/neighbors/images/plot_regression_001.png
+.. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_regression_001.png
    :target: ../auto_examples/neighbors/plot_regression.html
    :align: center
    :scale: 75
@@ -237,7 +237,7 @@ The use of multi-output nearest neighbors for regression is demonstrated in
 X are the pixels of the upper half of faces and the outputs Y are the pixels of
 the lower half of those faces.
 
-.. figure:: ../auto_examples/images/plot_multioutput_face_completion_001.png
+.. figure:: ../auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png
    :target: ../auto_examples/plot_multioutput_face_completion.html
    :scale: 75
    :align: center
@@ -497,11 +497,11 @@ This is useful, for example, for removing noisy features.
 In the example below, using a small shrink threshold increases the accuracy of
 the model from 0.81 to 0.82.
 
-.. |nearest_centroid_1| image:: ../auto_examples/neighbors/images/plot_nearest_centroid_001.png
+.. |nearest_centroid_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nearest_centroid_001.png
    :target: ../auto_examples/neighbors/plot_nearest_centroid.html
    :scale: 50
 
-.. |nearest_centroid_2| image:: ../auto_examples/neighbors/images/plot_nearest_centroid_002.png
+.. |nearest_centroid_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_nearest_centroid_002.png
    :target: ../auto_examples/neighbors/plot_nearest_centroid.html
    :scale: 50
 
@@ -551,12 +551,12 @@ Hashing <mathematical_description_of_lsh>`).
 ``n_candidates``. The accuracy of queries can be controlled using these
 parameters as demonstrated in the following plots:
 
-.. figure:: ../auto_examples/neighbors/images/plot_approximate_nearest_neighbors_hyperparameters_001.png
+.. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_approximate_nearest_neighbors_hyperparameters_001.png
    :target: ../auto_examples/neighbors/plot_approximate_nearest_neighbors_hyperparameters.html
    :align: center
    :scale: 50
 
-.. figure:: ../auto_examples/neighbors/images/plot_approximate_nearest_neighbors_hyperparameters_002.png
+.. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_approximate_nearest_neighbors_hyperparameters_002.png
    :target: ../auto_examples/neighbors/plot_approximate_nearest_neighbors_hyperparameters.html
    :align: center
    :scale: 50
@@ -574,17 +574,17 @@ environment such as availability of BLAS optimizations, number of CPU cores and
 size of the CPU caches. The following graphs depict the scalability of LSHForest queries
 with index size.
 
-.. figure:: ../auto_examples/neighbors/images/plot_approximate_nearest_neighbors_scalability_001.png
+.. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_approximate_nearest_neighbors_scalability_001.png
    :target: ../auto_examples/neighbors/plot_approximate_nearest_neighbors_scalability.html
    :align: center
    :scale: 50
 
-.. figure:: ../auto_examples/neighbors/images/plot_approximate_nearest_neighbors_scalability_002.png
+.. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_approximate_nearest_neighbors_scalability_002.png
    :target: ../auto_examples/neighbors/plot_approximate_nearest_neighbors_scalability.html
    :align: center
    :scale: 50
 
-.. figure:: ../auto_examples/neighbors/images/plot_approximate_nearest_neighbors_scalability_003.png
+.. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_approximate_nearest_neighbors_scalability_003.png
    :target: ../auto_examples/neighbors/plot_approximate_nearest_neighbors_scalability.html
    :align: center
    :scale: 50
diff --git a/doc/modules/neural_networks_unsupervised.rst b/doc/modules/neural_networks_unsupervised.rst
index ec0dfc146f..a7fe5cda87 100644
--- a/doc/modules/neural_networks_unsupervised.rst
+++ b/doc/modules/neural_networks_unsupervised.rst
@@ -32,7 +32,7 @@ density estimation.
 The method gained popularity for initializing deep neural networks with the
 weights of independent RBMs. This method is known as unsupervised pre-training.
 
-.. figure:: ../auto_examples/neural_networks/images/plot_rbm_logistic_classification_001.png
+.. figure:: ../auto_examples/neural_networks/images/sphx_glr_plot_rbm_logistic_classification_001.png
    :target: ../auto_examples/neural_networks/plot_rbm_logistic_classification.html
    :align: center
    :scale: 100%
diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst
index d3b1281054..c609f51ad6 100644
--- a/doc/modules/outlier_detection.rst
+++ b/doc/modules/outlier_detection.rst
@@ -12,7 +12,7 @@ belongs to the same distribution as existing observations (it is an
 Often, this ability is used to clean real data sets. Two important
 distinctions must be made:
 
-:novelty detection: 
+:novelty detection:
   The training data is not polluted by outliers, and we are interested in
   detecting anomalies in new observations.
 
@@ -53,7 +53,7 @@ coming from the same population as the initial
 observations. Otherwise, if they lay outside the frontier, we can say
 that they are abnormal with a given confidence in our assessment.
 
-The One-Class SVM has been introduced by Schölkopf et al. for that purpose 
+The One-Class SVM has been introduced by Schölkopf et al. for that purpose
 and implemented in the :ref:`svm` module in the
 :class:`svm.OneClassSVM` object. It requires the choice of a
 kernel and a scalar parameter to define a frontier.  The RBF kernel is
@@ -66,20 +66,20 @@ but regular, observation outside the frontier.
 .. topic:: References:
 
     * `Estimating the support of a high-dimensional distribution
-      <http://dl.acm.org/citation.cfm?id=1119749>`_ Schölkopf, 
+      <http://dl.acm.org/citation.cfm?id=1119749>`_ Schölkopf,
       Bernhard, et al. Neural computation 13.7 (2001): 1443-1471.
-      
+
 .. topic:: Examples:
 
    * See :ref:`example_svm_plot_oneclass.py` for visualizing the
      frontier learned around some data by a
      :class:`svm.OneClassSVM` object.
 
-.. figure:: ../auto_examples/svm/images/plot_oneclass_001.png
+.. figure:: ../auto_examples/svm/images/sphx_glr_plot_oneclass_001.png
    :target: ../auto_examples/svm/plot_oneclass.html
    :align: center
    :scale: 75%
-   
+
 
 Outlier Detection
 =================
@@ -98,7 +98,7 @@ One common way of performing outlier detection is to assume that the
 regular data come from a known distribution (e.g. data are Gaussian
 distributed). From this assumption, we generally try to define the
 "shape" of the data, and can define outlying observations as
-observations which stand far enough from the fit shape. 
+observations which stand far enough from the fit shape.
 
 The scikit-learn provides an object
 :class:`covariance.EllipticEnvelope` that fits a robust covariance
@@ -111,7 +111,7 @@ without being influenced by outliers). The Mahalanobis distances
 obtained from this estimate are used to derive a measure of outlyingness.
 This strategy is illustrated below.
 
-.. figure:: ../auto_examples/covariance/images/plot_mahalanobis_distances_001.png
+.. figure:: ../auto_examples/covariance/images/sphx_glr_plot_mahalanobis_distances_001.png
    :target: ../auto_examples/covariance/plot_mahalanobis_distances.html
    :align: center
    :scale: 75%
@@ -153,7 +153,7 @@ lengths for particular samples, they are highly likely to be anomalies.
 
 This strategy is illustrated below.
 
-.. figure:: ../auto_examples/ensemble/images/plot_isolation_forest_001.png
+.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_isolation_forest_001.png
    :target: ../auto_examples/ensemble/plot_isolation_forest.html
    :align: center
    :scale: 75%
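
A minimal sketch of that scoring on synthetic data (training set and query
points are arbitrary)::

    import numpy as np
    from sklearn.ensemble import IsolationForest

    rng = np.random.RandomState(42)
    X_train = rng.normal(size=(200, 2))
    X_new = np.array([[0.1, 0.2], [6.0, 6.0]])   # one regular point, one anomaly

    clf = IsolationForest(random_state=rng)
    clf.fit(X_train)
    print(clf.predict(X_new))                    # +1 for inliers, -1 for outliers
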
@@ -174,7 +174,7 @@ This strategy is illustrated below.
     .. [LTZ2008] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
            Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
 
-     
+
 One-class SVM versus Elliptic Envelope versus Isolation Forest
 --------------------------------------------------------------
 
@@ -190,15 +190,15 @@ The examples below illustrate how the performance of the
 less unimodal. The :class:`svm.OneClassSVM` works better on data with
 multiple modes and :class:`ensemble.IsolationForest` performs well in every case.
 
-.. |outlier1| image:: ../auto_examples/covariance/images/plot_outlier_detection_001.png
+.. |outlier1| image:: ../auto_examples/covariance/images/sphx_glr_plot_outlier_detection_001.png
    :target: ../auto_examples/covariance/plot_outlier_detection.html
    :scale: 50%
 
-.. |outlier2| image:: ../auto_examples/covariance/images/plot_outlier_detection_002.png
+.. |outlier2| image:: ../auto_examples/covariance/images/sphx_glr_plot_outlier_detection_002.png
    :target: ../auto_examples/covariance/plot_outlier_detection.html
    :scale: 50%
 
-.. |outlier3| image:: ../auto_examples/covariance/images/plot_outlier_detection_003.png
+.. |outlier3| image:: ../auto_examples/covariance/images/sphx_glr_plot_outlier_detection_003.png
    :target: ../auto_examples/covariance/plot_outlier_detection.html
    :scale: 50%
 
@@ -214,9 +214,9 @@ multiple modes and :class:`ensemble.IsolationForest` performs well in every case
         :class:`covariance.EllipticEnvelope` learns an ellipse, which
         fits well the inlier distribution. The :class:`ensemble.IsolationForest`
 	performs as well.
-      - |outlier1| 
+      - |outlier1|
 
-   * 
+   *
       - As the inlier distribution becomes bimodal, the
         :class:`covariance.EllipticEnvelope` does not fit well the
         inliers. However, we can see that both :class:`ensemble.IsolationForest`
@@ -224,10 +224,10 @@ multiple modes and :class:`ensemble.IsolationForest` performs well in every case
 	and that the :class:`svm.OneClassSVM`
         tends to overfit: because it has no model of inliers, it
         interprets a region where, by chance some outliers are
-        clustered, as inliers. 
-      - |outlier2| 
+        clustered, as inliers.
+      - |outlier2|
 
-   * 
+   *
       - If the inlier distribution is strongly non Gaussian, the
         :class:`svm.OneClassSVM` is able to recover a reasonable
         approximation as well as :class:`ensemble.IsolationForest`,
diff --git a/doc/modules/random_projection.rst b/doc/modules/random_projection.rst
index d0f733b532..0877a0ba4f 100644
--- a/doc/modules/random_projection.rst
+++ b/doc/modules/random_projection.rst
@@ -64,12 +64,12 @@ bounded distortion introduced by the random projection::
   >>> johnson_lindenstrauss_min_dim(n_samples=[1e4, 1e5, 1e6], eps=0.1)
   array([ 7894,  9868, 11841])
 
-.. figure:: ../auto_examples/images/plot_johnson_lindenstrauss_bound_001.png
+.. figure:: ../auto_examples/images/sphx_glr_plot_johnson_lindenstrauss_bound_001.png
    :target: ../auto_examples/plot_johnson_lindenstrauss_bound.html
    :scale: 75
    :align: center
 
-.. figure:: ../auto_examples/images/plot_johnson_lindenstrauss_bound_002.png
+.. figure:: ../auto_examples/images/sphx_glr_plot_johnson_lindenstrauss_bound_002.png
    :target: ../auto_examples/plot_johnson_lindenstrauss_bound.html
    :scale: 75
    :align: center
diff --git a/doc/modules/scaling_strategies.rst b/doc/modules/scaling_strategies.rst
index e0f561b3bf..9d4e40ee27 100644
--- a/doc/modules/scaling_strategies.rst
+++ b/doc/modules/scaling_strategies.rst
@@ -97,7 +97,7 @@ systems and demonstrates most of the notions discussed above.
 Furthermore, it also shows the evolution of the performance of different
 algorithms with the number of processed examples.
 
-.. |accuracy_over_time| image::  ../auto_examples/applications/images/plot_out_of_core_classification_001.png
+.. |accuracy_over_time| image::  ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_001.png
     :target: ../auto_examples/applications/plot_out_of_core_classification.html
     :scale: 80
 
@@ -109,7 +109,7 @@ algorithms, ``MultinomialNB`` is the most expensive, but its overhead can be
 mitigated by increasing the size of the mini-batches (exercise: change
 ``minibatch_size`` to 100 and 10000 in the program and compare).
 
-.. |computation_time| image::  ../auto_examples/applications/images/plot_out_of_core_classification_003.png
+.. |computation_time| image::  ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_003.png
     :target: ../auto_examples/applications/plot_out_of_core_classification.html
     :scale: 80
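
A minimal sketch of the incremental pattern these plots measure, calling
``partial_fit`` on successive mini-batches (``iter_minibatches`` is a
hypothetical stand-in for whatever streams batches from disk)::

    import numpy as np
    from sklearn.naive_bayes import MultinomialNB

    def iter_minibatches(n_batches=5, batch_size=100, seed=0):
        """Stand-in generator yielding (X_batch, y_batch) pairs."""
        rng = np.random.RandomState(seed)
        for _ in range(n_batches):
            X = rng.randint(0, 5, size=(batch_size, 20))
            y = rng.randint(0, 2, size=batch_size)
            yield X, y

    clf = MultinomialNB()
    classes = np.array([0, 1])           # the full label set, declared up front
    for X_batch, y_batch in iter_minibatches():
        clf.partial_fit(X_batch, y_batch, classes=classes)
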
 
diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst
index 2ac5647002..587f32e6c0 100644
--- a/doc/modules/sgd.rst
+++ b/doc/modules/sgd.rst
@@ -46,7 +46,7 @@ The class :class:`SGDClassifier` implements a plain stochastic gradient
 descent learning routine which supports different loss functions and
 penalties for classification.
 
-.. figure:: ../auto_examples/linear_model/images/plot_sgd_separating_hyperplane_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_sgd_separating_hyperplane_001.png
    :target: ../auto_examples/linear_model/plot_sgd_separating_hyperplane.html
    :align: center
    :scale: 75
@@ -137,7 +137,7 @@ below illustrates the OVA approach on the iris dataset.  The dashed
 lines represent the three OVA classifiers; the background colors show
 the decision surface induced by the three classifiers.
 
-.. figure:: ../auto_examples/linear_model/images/plot_sgd_iris_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_sgd_iris_001.png
    :target: ../auto_examples/linear_model/plot_sgd_iris.html
    :align: center
    :scale: 75
@@ -304,7 +304,7 @@ Different choices for :math:`L` entail different classifiers such as
 All of the above loss functions can be regarded as an upper bound on the
 misclassification error (Zero-one loss) as shown in the Figure below.
 
-.. figure:: ../auto_examples/linear_model/images/plot_sgd_loss_functions_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_sgd_loss_functions_001.png
     :target: ../auto_examples/linear_model/plot_sgd_loss_functions.html
     :align: center
     :scale: 75
@@ -319,7 +319,7 @@ Popular choices for the regularization term :math:`R` include:
 The Figure below shows the contours of the different regularization terms
 in the parameter space when :math:`R(w) = 1`.
 
-.. figure:: ../auto_examples/linear_model/images/plot_sgd_penalties_001.png
+.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_sgd_penalties_001.png
     :target: ../auto_examples/linear_model/plot_sgd_penalties.html
     :align: center
     :scale: 75
diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst
index 7e2d8b0dba..a7041f0942 100644
--- a/doc/modules/svm.rst
+++ b/doc/modules/svm.rst
@@ -51,7 +51,7 @@ Classification
 capable of performing multi-class classification on a dataset.
 
 
-.. figure:: ../auto_examples/svm/images/plot_iris_001.png
+.. figure:: ../auto_examples/svm/images/sphx_glr_plot_iris_001.png
    :target: ../auto_examples/svm/plot_iris.html
    :align: center
 
@@ -254,7 +254,7 @@ classes or certain individual samples keywords ``class_weight`` and
 ``{class_label : value}``, where value is a floating point number > 0
 that sets the parameter ``C`` of class ``class_label`` to ``C * value``.
 
-.. figure:: ../auto_examples/svm/images/plot_separating_hyperplane_unbalanced_001.png
+.. figure:: ../auto_examples/svm/images/sphx_glr_plot_separating_hyperplane_unbalanced_001.png
    :target: ../auto_examples/svm/plot_separating_hyperplane_unbalanced.html
    :align: center
    :scale: 75
@@ -266,7 +266,7 @@ that sets the parameter ``C`` of class ``class_label`` to ``C * value``.
 set the parameter ``C`` for the i-th example to ``C * sample_weight[i]``.
 
 
-.. figure:: ../auto_examples/svm/images/plot_weighted_samples_001.png
+.. figure:: ../auto_examples/svm/images/sphx_glr_plot_weighted_samples_001.png
    :target: ../auto_examples/svm/plot_weighted_samples.html
    :align: center
    :scale: 75
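
A minimal sketch of both keywords on toy data::

    import numpy as np
    from sklearn.svm import SVC

    X = np.array([[0, 0], [1, 1], [1, 0], [0, 1]])
    y = np.array([0, 0, 1, 1])

    # Double the penalty C for errors on class 1.
    clf = SVC(class_weight={1: 2.0}).fit(X, y)

    # Alternatively, weight individual samples at fit time.
    clf = SVC().fit(X, y, sample_weight=[1.0, 1.0, 5.0, 1.0])
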
@@ -341,7 +341,7 @@ will only take as input an array X, as there are no class labels.
 
 See, section :ref:`outlier_detection` for more details on this usage.
 
-.. figure:: ../auto_examples/svm/images/plot_oneclass_001.png
+.. figure:: ../auto_examples/svm/images/sphx_glr_plot_oneclass_001.png
    :target: ../auto_examples/svm/plot_oneclass.html
    :align: center
    :scale: 75
@@ -556,7 +556,7 @@ margin), since in general the larger the margin the lower the
 generalization error of the classifier.
 
 
-.. figure:: ../auto_examples/svm/images/plot_separating_hyperplane_001.png
+.. figure:: ../auto_examples/svm/images/sphx_glr_plot_separating_hyperplane_001.png
    :align: center
    :scale: 75
 
diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst
index 07cc8ac578..70d6dd4968 100644
--- a/doc/modules/tree.rst
+++ b/doc/modules/tree.rst
@@ -16,7 +16,7 @@ For instance, in the example below, decision trees learn from data to
 approximate a sine curve with a set of if-then-else decision rules. The deeper
 the tree, the more complex the decision rules and the fitter the model.
 
-.. figure:: ../auto_examples/tree/images/plot_tree_regression_001.png
+.. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_001.png
    :target: ../auto_examples/tree/plot_tree_regression.html
    :scale: 75
    :align: center
@@ -186,7 +186,7 @@ fraction of training samples of the same class in a leaf::
     >>> clf.predict_proba(iris.data[:1, :])
     array([[ 1.,  0.,  0.]])
 
-.. figure:: ../auto_examples/tree/images/plot_iris_001.png
+.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_001.png
    :target: ../auto_examples/tree/plot_iris.html
    :align: center
    :scale: 75
@@ -201,7 +201,7 @@ fraction of training samples of the same class in a leaf::
 Regression
 ==========
 
-.. figure:: ../auto_examples/tree/images/plot_tree_regression_001.png
+.. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_001.png
    :target: ../auto_examples/tree/plot_tree_regression.html
    :scale: 75
    :align: center
@@ -265,7 +265,7 @@ The use of multi-output trees for regression is demonstrated in
 :ref:`example_tree_plot_tree_regression_multioutput.py`. In this example, the input
 X is a single real value and the outputs Y are the sine and cosine of X.
 
-.. figure:: ../auto_examples/tree/images/plot_tree_regression_multioutput_001.png
+.. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_multioutput_001.png
    :target: ../auto_examples/tree/plot_tree_regression_multioutput.html
    :scale: 75
    :align: center
@@ -275,7 +275,7 @@ The use of multi-output trees for classification is demonstrated in
 X are the pixels of the upper half of faces and the outputs Y are the pixels of
 the lower half of those faces.
 
-.. figure:: ../auto_examples/images/plot_multioutput_face_completion_001.png
+.. figure:: ../auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png
    :target: ../auto_examples/plot_multioutput_face_completion.html
    :scale: 75
    :align: center
diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst
index 181729fb81..1c1ecbe708 100644
--- a/doc/tutorial/basic/tutorial.rst
+++ b/doc/tutorial/basic/tutorial.rst
@@ -190,7 +190,7 @@ which we have not used to train the classifier::
 
 The corresponding image is the following:
 
-.. image:: ../../auto_examples/datasets/images/plot_digits_last_image_001.png
+.. image:: ../../auto_examples/datasets/images/sphx_glr_plot_digits_last_image_001.png
     :target: ../../auto_examples/datasets/plot_digits_last_image.html
     :align: center
     :scale: 50
diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
index 67215503cb..e67584ab56 100644
--- a/doc/tutorial/statistical_inference/model_selection.rst
+++ b/doc/tutorial/statistical_inference/model_selection.rst
@@ -183,7 +183,7 @@ scoring method.
 .. topic:: **Exercise**
    :class: green
 
-   .. image:: ../../auto_examples/exercises/images/plot_cv_digits_001.png
+   .. image:: ../../auto_examples/exercises/images/sphx_glr_plot_cv_digits_001.png
         :target: ../../auto_examples/exercises/plot_cv_digits.html
         :align: right
         :scale: 90
diff --git a/doc/tutorial/statistical_inference/putting_together.rst b/doc/tutorial/statistical_inference/putting_together.rst
index 7ca7abfc5a..513eb2ef41 100644
--- a/doc/tutorial/statistical_inference/putting_together.rst
+++ b/doc/tutorial/statistical_inference/putting_together.rst
@@ -11,7 +11,7 @@ Pipelining
 We have seen that some estimators can transform data and that some estimators
 can predict variables. We can also create combined estimators:
 
-.. image:: ../../auto_examples/images/plot_digits_pipe_001.png
+.. image:: ../../auto_examples/images/sphx_glr_plot_digits_pipe_001.png
    :target: ../../auto_examples/plot_digits_pipe.html
    :scale: 65
    :align: right
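
A minimal sketch of such a combined estimator, chaining a transformer and a
predictor (the number of components is arbitrary)::

    from sklearn.datasets import load_digits
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    digits = load_digits()
    pipe = Pipeline([('pca', PCA(n_components=20)),
                     ('logistic', LogisticRegression())])
    pipe.fit(digits.data, digits.target)
    print(pipe.score(digits.data, digits.target))
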
@@ -34,10 +34,10 @@ The dataset used in this example is a preprocessed excerpt of the
 
 .. literalinclude:: ../../auto_examples/applications/face_recognition.py
 
-.. |prediction| image:: ../../images/plot_face_recognition_1.png
+.. |prediction| image:: ../../images/sphx_glr_plot_face_recognition_1.png
    :scale: 50
  
-.. |eigenfaces| image:: ../../images/plot_face_recognition_2.png
+.. |eigenfaces| image:: ../../images/sphx_glr_plot_face_recognition_2.png
    :scale: 50
 
 .. list-table::
diff --git a/doc/tutorial/statistical_inference/settings.rst b/doc/tutorial/statistical_inference/settings.rst
index 3537151d01..368915c24a 100644
--- a/doc/tutorial/statistical_inference/settings.rst
+++ b/doc/tutorial/statistical_inference/settings.rst
@@ -31,7 +31,7 @@ needs to be preprocessed in order to be used by scikit-learn.
 
 .. topic:: An example of reshaping data would be the digits dataset
 
-    .. image:: ../../auto_examples/datasets/images/plot_digits_last_image_001.png
+    .. image:: ../../auto_examples/datasets/images/sphx_glr_plot_digits_last_image_001.png
         :target: ../../auto_examples/datasets/plot_digits_last_image.html
         :align: right
         :scale: 60
diff --git a/doc/tutorial/statistical_inference/supervised_learning.rst b/doc/tutorial/statistical_inference/supervised_learning.rst
index eaa8a4cd3c..65bc21ae62 100644
--- a/doc/tutorial/statistical_inference/supervised_learning.rst
+++ b/doc/tutorial/statistical_inference/supervised_learning.rst
@@ -38,7 +38,7 @@ Nearest neighbor and the curse of dimensionality
 
 .. topic:: Classifying irises:
 
-    .. image:: ../../auto_examples/datasets/images/plot_iris_dataset_001.png
+    .. image:: ../../auto_examples/datasets/images/sphx_glr_plot_iris_dataset_001.png
         :target: ../../auto_examples/datasets/plot_iris_dataset.html
         :align: right
 	:scale: 65
@@ -75,7 +75,7 @@ Scikit-learn documentation for more information about this type of classifier.)
 
 **KNN (k nearest neighbors) classification example**:
 
-.. image:: ../../auto_examples/neighbors/images/plot_classification_001.png
+.. image:: ../../auto_examples/neighbors/images/sphx_glr_plot_classification_001.png
    :target: ../../auto_examples/neighbors/plot_classification.html
    :align: center
    :scale: 70
@@ -159,7 +159,7 @@ in its simplest form, fits a linear model to the data set by adjusting
 a set of parameters in order to make the sum of the squared residuals
 of the model as small as possible.
 
-.. image:: ../../auto_examples/linear_model/images/plot_ols_001.png
+.. image:: ../../auto_examples/linear_model/images/sphx_glr_plot_ols_001.png
    :target: ../../auto_examples/linear_model/plot_ols.html
    :scale: 40
    :align: right
@@ -200,7 +200,7 @@ Shrinkage
 If there are few data points per dimension, noise in the observations
 induces high variance:
 
-.. image:: ../../auto_examples/linear_model/images/plot_ols_ridge_variance_001.png
+.. image:: ../../auto_examples/linear_model/images/sphx_glr_plot_ols_ridge_variance_001.png
    :target: ../../auto_examples/linear_model/plot_ols_ridge_variance.html
    :scale: 70
    :align: right
@@ -229,7 +229,7 @@ regression coefficients to zero: any two randomly chosen sets of
 observations are likely to be uncorrelated. This is called :class:`Ridge`
 regression:
 
-.. image:: ../../auto_examples/linear_model/images/plot_ols_ridge_variance_002.png
+.. image:: ../../auto_examples/linear_model/images/sphx_glr_plot_ols_ridge_variance_002.png
    :target: ../../auto_examples/linear_model/plot_ols_ridge_variance.html
    :scale: 70
    :align: right
@@ -275,15 +275,15 @@ Sparsity
 ----------
 
 
-.. |diabetes_ols_1| image:: ../../auto_examples/linear_model/images/plot_ols_3d_001.png
+.. |diabetes_ols_1| image:: ../../auto_examples/linear_model/images/sphx_glr_plot_ols_3d_001.png
    :target: ../../auto_examples/linear_model/plot_ols_3d.html
    :scale: 65
 
-.. |diabetes_ols_3| image:: ../../auto_examples/linear_model/images/plot_ols_3d_003.png
+.. |diabetes_ols_3| image:: ../../auto_examples/linear_model/images/sphx_glr_plot_ols_3d_003.png
    :target: ../../auto_examples/linear_model/plot_ols_3d.html
    :scale: 65
 
-.. |diabetes_ols_2| image:: ../../auto_examples/linear_model/images/plot_ols_3d_002.png
+.. |diabetes_ols_2| image:: ../../auto_examples/linear_model/images/sphx_glr_plot_ols_3d_002.png
    :target: ../../auto_examples/linear_model/plot_ols_3d.html
    :scale: 65
 
@@ -350,7 +350,7 @@ application of Occam's razor: *prefer simpler models*.
 Classification
 ---------------
 
-.. image:: ../../auto_examples/linear_model/images/plot_logistic_001.png
+.. image:: ../../auto_examples/linear_model/images/sphx_glr_plot_logistic_001.png
    :target: ../../auto_examples/linear_model/plot_logistic.html
    :scale: 65
    :align: right
@@ -377,7 +377,7 @@ function or **logistic** function:
 
 This is known as :class:`LogisticRegression`.
 
-.. image:: ../../auto_examples/linear_model/images/plot_iris_logistic_001.png
+.. image:: ../../auto_examples/linear_model/images/sphx_glr_plot_iris_logistic_001.png
    :target: ../../auto_examples/linear_model/plot_iris_logistic.html
    :scale: 83
 
@@ -425,11 +425,11 @@ the separating line (less regularization).
 
 .. currentmodule :: sklearn.svm
 
-.. |svm_margin_unreg| image:: ../../auto_examples/svm/images/plot_svm_margin_001.png
+.. |svm_margin_unreg| image:: ../../auto_examples/svm/images/sphx_glr_plot_svm_margin_001.png
    :target: ../../auto_examples/svm/plot_svm_margin.html
    :scale: 70
 
-.. |svm_margin_reg| image:: ../../auto_examples/svm/images/plot_svm_margin_002.png
+.. |svm_margin_reg| image:: ../../auto_examples/svm/images/sphx_glr_plot_svm_margin_002.png
    :target: ../../auto_examples/svm/plot_svm_margin.html
    :scale: 70
 
@@ -476,11 +476,11 @@ build a decision function that is not linear but may be polynomial instead.
 This is done using the *kernel trick* that can be seen as
 creating a decision energy by positioning *kernels* on observations:
 
-.. |svm_kernel_linear| image:: ../../auto_examples/svm/images/plot_svm_kernels_001.png
+.. |svm_kernel_linear| image:: ../../auto_examples/svm/images/sphx_glr_plot_svm_kernels_001.png
    :target: ../../auto_examples/svm/plot_svm_kernels.html
    :scale: 65
 
-.. |svm_kernel_poly| image:: ../../auto_examples/svm/images/plot_svm_kernels_002.png
+.. |svm_kernel_poly| image:: ../../auto_examples/svm/images/sphx_glr_plot_svm_kernels_002.png
    :target: ../../auto_examples/svm/plot_svm_kernels.html
    :scale: 65
 
@@ -518,7 +518,7 @@ creating a decision energy by positioning *kernels* on observations:
 
 
 
-.. |svm_kernel_rbf| image:: ../../auto_examples/svm/images/plot_svm_kernels_003.png
+.. |svm_kernel_rbf| image:: ../../auto_examples/svm/images/sphx_glr_plot_svm_kernels_003.png
    :target: ../../auto_examples/svm/plot_svm_kernels.html
    :scale: 65
 
@@ -551,7 +551,7 @@ creating a decision energy by positioning *kernels* on observations:
    ``svm_gui.py``; add data points of both classes with right and left button,
    fit the model and change parameters and data.
 
-.. image:: ../../auto_examples/datasets/images/plot_iris_dataset_001.png
+.. image:: ../../auto_examples/datasets/images/sphx_glr_plot_iris_dataset_001.png
     :target: ../../auto_examples/datasets/plot_iris_dataset.html
     :align: right
     :scale: 70
diff --git a/doc/tutorial/statistical_inference/unsupervised_learning.rst b/doc/tutorial/statistical_inference/unsupervised_learning.rst
index 69b2aa49fa..fc448d0400 100644
--- a/doc/tutorial/statistical_inference/unsupervised_learning.rst
+++ b/doc/tutorial/statistical_inference/unsupervised_learning.rst
@@ -24,7 +24,7 @@ Note that there exist a lot of different clustering criteria and associated
 algorithms. The simplest clustering algorithm is
 :ref:`k_means`.
 
-.. image:: ../../auto_examples/cluster/images/plot_cluster_iris_002.png
+.. image:: ../../auto_examples/cluster/images/sphx_glr_plot_cluster_iris_002.png
     :target: ../../auto_examples/cluster/plot_cluster_iris.html
     :scale: 70
     :align: right
@@ -45,15 +45,15 @@ algorithms. The simplest clustering algorithm is
     >>> print(y_iris[::10])
     [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2]
 
-.. |k_means_iris_bad_init| image:: ../../auto_examples/cluster/images/plot_cluster_iris_003.png
+.. |k_means_iris_bad_init| image:: ../../auto_examples/cluster/images/sphx_glr_plot_cluster_iris_003.png
    :target: ../../auto_examples/cluster/plot_cluster_iris.html
    :scale: 63
 
-.. |k_means_iris_8| image:: ../../auto_examples/cluster/images/plot_cluster_iris_001.png
+.. |k_means_iris_8| image:: ../../auto_examples/cluster/images/sphx_glr_plot_cluster_iris_001.png
    :target: ../../auto_examples/cluster/plot_cluster_iris.html
    :scale: 63
 
-.. |cluster_iris_truth| image:: ../../auto_examples/cluster/images/plot_cluster_iris_004.png
+.. |cluster_iris_truth| image:: ../../auto_examples/cluster/images/sphx_glr_plot_cluster_iris_004.png
    :target: ../../auto_examples/cluster/plot_cluster_iris.html
    :scale: 63
 
@@ -85,19 +85,19 @@ algorithms. The simplest clustering algorithm is
 
     **Don't over-interpret clustering results**
 
-.. |face| image:: ../../auto_examples/cluster/images/plot_face_compress_001.png
+.. |face| image:: ../../auto_examples/cluster/images/sphx_glr_plot_face_compress_001.png
    :target: ../../auto_examples/cluster/plot_face_compress.html
    :scale: 60
 
-.. |face_regular| image:: ../../auto_examples/cluster/images/plot_face_compress_002.png
+.. |face_regular| image:: ../../auto_examples/cluster/images/sphx_glr_plot_face_compress_002.png
    :target: ../../auto_examples/cluster/plot_face_compress.html
    :scale: 60
 
-.. |face_compressed| image:: ../../auto_examples/cluster/images/plot_face_compress_003.png
+.. |face_compressed| image:: ../../auto_examples/cluster/images/sphx_glr_plot_face_compress_003.png
    :target: ../../auto_examples/cluster/plot_face_compress.html
    :scale: 60
 
-.. |face_histogram| image:: ../../auto_examples/cluster/images/plot_face_compress_004.png
+.. |face_histogram| image:: ../../auto_examples/cluster/images/sphx_glr_plot_face_compress_004.png
    :target: ../../auto_examples/cluster/plot_face_compress.html
    :scale: 60
 
@@ -177,7 +177,7 @@ This can be useful, for instance, to retrieve connected regions (sometimes
 also referred to as connected components) when
 clustering an image:
 
-.. image:: ../../auto_examples/cluster/images/plot_face_ward_segmentation_001.png
+.. image:: ../../auto_examples/cluster/images/sphx_glr_plot_face_ward_segmentation_001.png
     :target: ../../auto_examples/cluster/plot_face_ward_segmentation.html
     :scale: 40
     :align: right
@@ -200,7 +200,7 @@ features: **feature agglomeration**. This approach can be implemented by
 clustering in the feature direction, in other words clustering the
 transposed data.
 
-.. image:: ../../auto_examples/cluster/images/plot_digits_agglomeration_001.png
+.. image:: ../../auto_examples/cluster/images/sphx_glr_plot_digits_agglomeration_001.png
     :target: ../../auto_examples/cluster/plot_digits_agglomeration.html
     :align: right
     :scale: 57
@@ -242,11 +242,11 @@ Principal component analysis: PCA
 :ref:`PCA` selects the successive components that
 explain the maximum variance in the signal.
 
-.. |pca_3d_axis| image:: ../../auto_examples/decomposition/images/plot_pca_3d_001.png
+.. |pca_3d_axis| image:: ../../auto_examples/decomposition/images/sphx_glr_plot_pca_3d_001.png
    :target: ../../auto_examples/decomposition/plot_pca_3d.html
    :scale: 70
 
-.. |pca_3d_aligned| image:: ../../auto_examples/decomposition/images/plot_pca_3d_002.png
+.. |pca_3d_aligned| image:: ../../auto_examples/decomposition/images/sphx_glr_plot_pca_3d_002.png
    :target: ../../auto_examples/decomposition/plot_pca_3d.html
    :scale: 70
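
A minimal sketch on synthetic data whose three features lie close to a plane,
so the third principal component carries almost no variance::

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.RandomState(0)
    x1, x2 = rng.normal(size=(2, 100))
    X = np.c_[x1, x2, x1 + x2 + 0.1 * rng.normal(size=100)]

    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(X)
    print(pca.explained_variance_ratio_)
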
 
@@ -295,7 +295,7 @@ Independent Component Analysis: ICA
 a maximum amount of independent information. It is able to recover
 **non-Gaussian** independent signals:
 
-.. image:: ../../auto_examples/decomposition/images/plot_ica_blind_source_separation_001.png
+.. image:: ../../auto_examples/decomposition/images/sphx_glr_plot_ica_blind_source_separation_001.png
    :target: ../../auto_examples/decomposition/plot_ica_blind_source_separation.html
    :scale: 70
    :align: center
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index c74c6b9237..1fac81d69b 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -3368,13 +3368,13 @@ This release also includes the dictionary-learning work developed by
 
 
 
-.. |banner1| image:: ./auto_examples/manifold/images/thumb/plot_compare_methods.png
+.. |banner1| image:: ./auto_examples/manifold/images/thumb/sphx_glr_plot_compare_methods_thumb.png
    :target: auto_examples/manifold/plot_compare_methods.html
 
-.. |banner2| image:: ./auto_examples/linear_model/images/thumb/plot_omp.png
+.. |banner2| image:: ./auto_examples/linear_model/images/thumb/sphx_glr_plot_omp_thumb.png
    :target: auto_examples/linear_model/plot_omp.html
 
-.. |banner3| image:: ./auto_examples/decomposition/images/thumb/plot_kernel_pca.png
+.. |banner3| image:: ./auto_examples/decomposition/images/thumb/sphx_glr_plot_kernel_pca_thumb.png
    :target: auto_examples/decomposition/plot_kernel_pca.html
 
 .. |center-div| raw:: html
-- 
GitLab