diff --git a/examples/applications/plot_outlier_detection_housing.py b/examples/applications/plot_outlier_detection_housing.py
index 7aa12598182b393969a1b3b7bcdea07a77040a67..7c2576827a77dca9016e77c115b3c5636dd45c93 100644
--- a/examples/applications/plot_outlier_detection_housing.py
+++ b/examples/applications/plot_outlier_detection_housing.py
@@ -75,7 +75,7 @@ legend2 = {}
 # Learn a frontier for outlier detection with several classifiers
 xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
 xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
-for i, (clf_name, clf) in enumerate(classifiers.iteritems()):
+for i, (clf_name, clf) in enumerate(classifiers.items()):
     plt.figure(1)
     clf.fit(X1)
     Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
@@ -89,6 +89,9 @@ for i, (clf_name, clf) in enumerate(classifiers.iteritems()):
     legend2[clf_name] = plt.contour(
         xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i])
 
+legend1_values_list = list(legend1.values())
+legend1_keys_list = list(legend1.keys())
+
 # Plot the results (= shape of the data points cloud)
 plt.figure(1)  # two clusters
 plt.title("Outlier detection on a real data set (boston housing)")
@@ -100,24 +103,27 @@ plt.annotate("several confounded points", xy=(24, 19),
              xytext=(13, 10), bbox=bbox_args, arrowprops=arrow_args)
 plt.xlim((xx1.min(), xx1.max()))
 plt.ylim((yy1.min(), yy1.max()))
-plt.legend((legend1.values()[0].collections[0],
-            legend1.values()[1].collections[0],
-            legend1.values()[2].collections[0]),
-           (legend1.keys()[0], legend1.keys()[1], legend1.keys()[2]),
+plt.legend((legend1_values_list[0].collections[0],
+            legend1_values_list[1].collections[0],
+            legend1_values_list[2].collections[0]),
+           (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
            loc="upper center",
            prop=matplotlib.font_manager.FontProperties(size=12))
 plt.ylabel("accessibility to radial highways")
 plt.xlabel("pupil-teatcher ratio by town")
 
+legend2_values_list = list(legend2.values())
+legend2_keys_list = list(legend2.keys())
+
 plt.figure(2)  # "banana" shape
 plt.title("Outlier detection on a real data set (boston housing)")
 plt.scatter(X2[:, 0], X2[:, 1], color='black')
 plt.xlim((xx2.min(), xx2.max()))
 plt.ylim((yy2.min(), yy2.max()))
-plt.legend((legend2.values()[0].collections[0],
-            legend2.values()[1].collections[0],
-            legend2.values()[2].collections[0]),
-           (legend2.keys()[0], legend2.keys()[1], legend2.keys()[2]),
+plt.legend((legend2_values_list[0].collections[0],
+            legend2_values_list[1].collections[0],
+            legend2_values_list[2].collections[0]),
+           (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
            loc="upper center",
            prop=matplotlib.font_manager.FontProperties(size=12))
 plt.ylabel("% lower status of the population")
diff --git a/examples/applications/plot_stock_market.py b/examples/applications/plot_stock_market.py
index 60ec4974934759b37c4830fd5bbeb67328539434..7e2ada521b459115ab4e8cc9e79c334d5425db52 100644
--- a/examples/applications/plot_stock_market.py
+++ b/examples/applications/plot_stock_market.py
@@ -78,8 +78,8 @@ from sklearn import cluster, covariance, manifold
 
 # Choose a time period reasonnably calm (not too long ago so that we get
 # high-tech firms, and before the 2008 crash)
-d1 = datetime.datetime(2003, 01, 01)
-d2 = datetime.datetime(2008, 01, 01)
+d1 = datetime.datetime(2003, 1, 1)
+d2 = datetime.datetime(2008, 1, 1)
 
 # kraft symbol has now changed from KFT to MDLZ in yahoo
 symbol_dict = {
@@ -144,7 +144,7 @@ symbol_dict = {
     'CAT': 'Caterpillar',
     'DD': 'DuPont de Nemours'}
 
-symbols, names = np.array(symbol_dict.items()).T
+symbols, names = np.array(list(symbol_dict.items())).T
 
 quotes = [finance.quotes_historical_yahoo(symbol, d1, d2, asobject=True)
           for symbol in symbols]
diff --git a/examples/bicluster/plot_spectral_biclustering.py b/examples/bicluster/plot_spectral_biclustering.py
index 7446d44fa2bfc6e9a9e0ad2714ba70dbf1265580..fdcbfcdcf7fc5afb0464ef33d3ef554f4bc532f4 100644
--- a/examples/bicluster/plot_spectral_biclustering.py
+++ b/examples/bicluster/plot_spectral_biclustering.py
@@ -46,7 +46,7 @@ model.fit(data)
 score = consensus_score(model.biclusters_,
                         (rows[:, row_idx], columns[:, col_idx]))
 
-print "consensus score: {:.1f}".format(score)
+print("consensus score: {:.1f}".format(score))
 
 fit_data = data[np.argsort(model.row_labels_)]
 fit_data = fit_data[:, np.argsort(model.column_labels_)]
diff --git a/examples/bicluster/plot_spectral_coclustering.py b/examples/bicluster/plot_spectral_coclustering.py
index 00c4cc474dd3df7d756b463543a24d07f5a9a5d5..dbf53f269835eb1fca0bd0035e51e9444df8adc5 100644
--- a/examples/bicluster/plot_spectral_coclustering.py
+++ b/examples/bicluster/plot_spectral_coclustering.py
@@ -43,7 +43,7 @@ model.fit(data)
 score = consensus_score(model.biclusters_,
                         (rows[:, row_idx], columns[:, col_idx]))
 
-print "consensus score: {:.3f}".format(score)
+print("consensus score: {:.3f}".format(score))
 
 fit_data = data[np.argsort(model.row_labels_)]
 fit_data = fit_data[:, np.argsort(model.column_labels_)]
diff --git a/examples/cluster/plot_cluster_iris.py b/examples/cluster/plot_cluster_iris.py
index 9c820cb96dd75f3dd62bf9f095f86453740a0460..6c117bc7a7eeca5f78f50d9524f32dd177726700 100644
--- a/examples/cluster/plot_cluster_iris.py
+++ b/examples/cluster/plot_cluster_iris.py
@@ -45,7 +45,7 @@ estimators = {'k_means_iris_3': KMeans(n_clusters=3),
 
 
 fignum = 1
-for name, est in estimators.iteritems():
+for name, est in estimators.items():
     fig = pl.figure(fignum, figsize=(4, 3))
     pl.clf()
     ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
diff --git a/examples/covariance/plot_outlier_detection.py b/examples/covariance/plot_outlier_detection.py
index 9af56c93c7ba88e1fc18bcc901ee468b3de89549..302eafe6a058d6cc9990340aa730946eaf062904 100644
--- a/examples/covariance/plot_outlier_detection.py
+++ b/examples/covariance/plot_outlier_detection.py
@@ -63,7 +63,7 @@ for i, offset in enumerate(clusters_separation):
 
     # Fit the model with the One-Class SVM
     pl.figure(figsize=(10, 5))
-    for i, (clf_name, clf) in enumerate(classifiers.iteritems()):
+    for i, (clf_name, clf) in enumerate(classifiers.items()):
         # fit the data and tag outliers
         clf.fit(X)
         y_pred = clf.decision_function(X).ravel()
diff --git a/examples/ensemble/plot_forest_iris.py b/examples/ensemble/plot_forest_iris.py
index 1722162bdd35228df76616f1b305f7d5358b8013..63a77ca11770b354a6eb208f941caaf91dbc8aee 100644
--- a/examples/ensemble/plot_forest_iris.py
+++ b/examples/ensemble/plot_forest_iris.py
@@ -99,7 +99,7 @@ for pair in ([0, 1], [0, 2], [2, 3]):
         model_details = model_title
         if hasattr(model, "estimators_"):
             model_details += " with {} estimators".format(len(model.estimators_))
-        print model_details + " with features", pair, "has a score of", scores
+        print(model_details + " with features", pair, "has a score of", scores)
 
         pl.subplot(3, 4, plot_idx)
         if plot_idx <= len(models):
diff --git a/examples/ensemble/plot_gradient_boosting_oob.py b/examples/ensemble/plot_gradient_boosting_oob.py
index 0aee3f41e49996b6f29a3aa128dd1d16ca0faf30..8370daed171d96f2cae398ead9492928b0dc9a52 100644
--- a/examples/ensemble/plot_gradient_boosting_oob.py
+++ b/examples/ensemble/plot_gradient_boosting_oob.py
@@ -106,9 +106,9 @@ cv_score -= cv_score[0]
 cv_best_iter = x[np.argmin(cv_score)]
 
 # color brew for the three curves
-oob_color = map(lambda x: x / 256.0, (190, 174, 212))
-test_color = map(lambda x: x / 256.0, (127, 201, 127))
-cv_color = map(lambda x: x / 256.0, (253, 192, 134))
+oob_color = list(map(lambda x: x / 256.0, (190, 174, 212)))
+test_color = list(map(lambda x: x / 256.0, (127, 201, 127)))
+cv_color = list(map(lambda x: x / 256.0, (253, 192, 134)))
 
 # plot curves and vertical lines for best iterations
 plt.plot(x, cumsum, label='OOB loss', color=oob_color)
@@ -122,7 +122,7 @@ plt.axvline(x=cv_best_iter, color=cv_color)
 xticks = plt.xticks()
 xticks_pos = np.array(xticks[0].tolist() +
                       [oob_best_iter, cv_best_iter, test_best_iter])
-xticks_label = np.array(map(lambda t: int(t), xticks[0]) +
+xticks_label = np.array(list(map(lambda t: int(t), xticks[0])) +
                         ['OOB', 'CV', 'Test'])
 ind = np.argsort(xticks_pos)
 xticks_pos = xticks_pos[ind]
diff --git a/examples/exercises/plot_cv_digits.py b/examples/exercises/plot_cv_digits.py
index 6861a3354a2b614d6482293ddda55f073c9c6747..1783c757ab775f4398606b19b111324d50390978 100644
--- a/examples/exercises/plot_cv_digits.py
+++ b/examples/exercises/plot_cv_digits.py
@@ -37,7 +37,7 @@ pl.semilogx(C_s, scores)
 pl.semilogx(C_s, np.array(scores) + np.array(scores_std), 'b--')
 pl.semilogx(C_s, np.array(scores) - np.array(scores_std), 'b--')
 locs, labels = pl.yticks()
-pl.yticks(locs, map(lambda x: "%g" % x, locs))
+pl.yticks(locs, list(map(lambda x: "%g" % x, locs)))
 pl.ylabel('CV score')
 pl.xlabel('Parameter C')
 pl.ylim(0, 1.1)
diff --git a/examples/linear_model/plot_ols_ridge_variance.py b/examples/linear_model/plot_ols_ridge_variance.py
index 836f25935e9a7b38d1398dbca303f70b2664ae34..a68ed005aef4c773f430176222636644f741d109 100644
--- a/examples/linear_model/plot_ols_ridge_variance.py
+++ b/examples/linear_model/plot_ols_ridge_variance.py
@@ -43,7 +43,7 @@ classifiers = dict(ols=linear_model.LinearRegression(),
                    ridge=linear_model.Ridge(alpha=.1))
 
 fignum = 1
-for name, clf in classifiers.iteritems():
+for name, clf in classifiers.items():
     fig = plt.figure(fignum, figsize=(4, 3))
     plt.clf()
     plt.title(name)
diff --git a/examples/linear_model/plot_ransac.py b/examples/linear_model/plot_ransac.py
index 9b7257f5b16e783908404f46ed513c0b3204a4b9..5e93e55290bdb2d11860165fd194018fbb94c315 100644
--- a/examples/linear_model/plot_ransac.py
+++ b/examples/linear_model/plot_ransac.py
@@ -42,8 +42,8 @@ line_y = model.predict(line_X[:, np.newaxis])
 line_y_ransac = model_ransac.predict(line_X[:, np.newaxis])
 
 # Compare estimated coefficients
-print "Estimated coefficients (true, normal, RANSAC):"
-print coef, model.coef_, model_ransac.estimator_.coef_
+print("Estimated coefficients (true, normal, RANSAC):")
+print(coef, model.coef_, model_ransac.estimator_.coef_)
 
 plt.plot(X[inlier_mask], y[inlier_mask], '.g', label='Inliers')
 plt.plot(X[outlier_mask], y[outlier_mask], '.r', label='Outliers')
diff --git a/examples/mixture/plot_gmm_classifier.py b/examples/mixture/plot_gmm_classifier.py
index 682f0671a157757ef9417f1b06d45886dd3614cd..77c342c1036a8638e9f3552325a8be337a7ae2cf 100644
--- a/examples/mixture/plot_gmm_classifier.py
+++ b/examples/mixture/plot_gmm_classifier.py
@@ -79,7 +79,7 @@ pl.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05,
                    left=.01, right=.99)
 
 
-for index, (name, classifier) in enumerate(classifiers.iteritems()):
+for index, (name, classifier) in enumerate(classifiers.items()):
     # Since we have class labels for the training data, we can
     # initialize the GMM parameters in a supervised manner.
     classifier.means_ = np.array([X_train[y_train == i].mean(axis=0)
diff --git a/examples/neighbors/plot_digits_kde_sampling.py b/examples/neighbors/plot_digits_kde_sampling.py
index 9c572c26cdbe499cb1e071d899868e27b96aae62..4680a41780aedbade427b7c74468105b195e5ef0 100644
--- a/examples/neighbors/plot_digits_kde_sampling.py
+++ b/examples/neighbors/plot_digits_kde_sampling.py
@@ -31,7 +31,7 @@ params = {'bandwidth': np.logspace(-1, 1, 20)}
 grid = GridSearchCV(KernelDensity(), params)
 grid.fit(data)
 
-print "best bandwidth: {0}".format(grid.best_estimator_.bandwidth)
+print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
 
 # use the best estimator to compute the kernel density estimate
 kde = grid.best_estimator_
diff --git a/examples/plot_classification_probability.py b/examples/plot_classification_probability.py
index e7d93e616913d48ff3b51145dc33ee53a4580909..f375299a9d150c91a2d565831007b910149f6bf6 100644
--- a/examples/plot_classification_probability.py
+++ b/examples/plot_classification_probability.py
@@ -42,7 +42,7 @@ n_classifiers = len(classifiers)
 pl.figure(figsize=(3 * 2, n_classifiers * 2))
 pl.subplots_adjust(bottom=.2, top=.95)
 
-for index, (name, classifier) in enumerate(classifiers.iteritems()):
+for index, (name, classifier) in enumerate(classifiers.items()):
     classifier.fit(X, y)
 
     y_pred = classifier.predict(X)
diff --git a/examples/plot_digits_classification.py b/examples/plot_digits_classification.py
index 989f3be59ea67203c71519299c6f05c565f41659..bddc994a0e7b159fc59eb05777a6b2b9ac8e6b01 100644
--- a/examples/plot_digits_classification.py
+++ b/examples/plot_digits_classification.py
@@ -29,7 +29,8 @@ digits = datasets.load_digits()
 # attribute of the dataset. If we were working from image files, we
 # could load them using pylab.imread. For these images know which
 # digit they represent: it is given in the 'target' of the dataset.
-for index, (image, label) in enumerate(zip(digits.images, digits.target)[:4]):
+for index, (image, label) in enumerate(list(zip(digits.images,
+                                                digits.target))[:4]):
     pl.subplot(2, 4, index + 1)
     pl.axis('off')
     pl.imshow(image, cmap=pl.cm.gray_r, interpolation='nearest')
@@ -55,7 +56,7 @@ print("Classification report for classifier %s:\n%s\n"
 print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
 
 for index, (image, prediction) in enumerate(
-        zip(digits.images[n_samples / 2:], predicted)[:4]):
+        list(zip(digits.images[n_samples // 2:], predicted))[:4]):
     pl.subplot(2, 4, index + 5)
     pl.axis('off')
     pl.imshow(image, cmap=pl.cm.gray_r, interpolation='nearest')