diff --git a/benchmarks/bench_covertype.py b/benchmarks/bench_covertype.py
index 93ce585c79a3cdad188fc124c8cbd4bc2e394404..d71a940bc483a7353217cbc479d70af54d4a7473 100644
--- a/benchmarks/bench_covertype.py
+++ b/benchmarks/bench_covertype.py
@@ -40,9 +40,9 @@ The same task has been used in a number of papers including:
 [1] http://archive.ics.uci.edu/ml/datasets/Covertype
 
 """
-from __future__ import division
+from __future__ import division, print_function
 
-print __doc__
+print(__doc__)
 
-# Author: Peter Prettenhoer <peter.prettenhofer@gmail.com>
+# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
 # License: BSD Style.
@@ -109,7 +109,7 @@ def load_data(dtype=np.float32, order='F'):
     if not os.path.exists(original_archive):
         # Download the data
         import urllib
-        print "Downloading data, Please Wait (11MB)..."
+        print("Downloading data, Please Wait (11MB)...")
         opener = urllib.urlopen(
             'http://archive.ics.uci.edu/ml/'
             'machine-learning-databases/covtype/covtype.data.gz')
@@ -172,12 +172,14 @@ print("%s %d" % ("number of features:".ljust(25),
 print("%s %d" % ("number of classes:".ljust(25),
                  np.unique(y_train).shape[0]))
 print("%s %s" % ("data type:".ljust(25), X_train.dtype))
-print("%s %d (pos=%d, neg=%d, size=%dMB)" % ("number of train samples:".ljust(25),
-                          X_train.shape[0], np.sum(y_train == 1),
-                          np.sum(y_train == -1), int(X_train.nbytes / 1e6)))
-print("%s %d (pos=%d, neg=%d, size=%dMB)" % ("number of test samples:".ljust(25),
-                          X_test.shape[0], np.sum(y_test == 1),
-                          np.sum(y_test == -1), int(X_test.nbytes / 1e6)))
+print("%s %d (pos=%d, neg=%d, size=%dMB)"
+      % ("number of train samples:".ljust(25),
+         X_train.shape[0], np.sum(y_train == 1),
+         np.sum(y_train == -1), int(X_train.nbytes / 1e6)))
+print("%s %d (pos=%d, neg=%d, size=%dMB)"
+      % ("number of test samples:".ljust(25),
+         X_test.shape[0], np.sum(y_test == 1),
+         np.sum(y_test == -1), int(X_test.nbytes / 1e6)))
 
 
 classifiers = dict()
@@ -255,10 +257,10 @@ for name in selected_classifiers:
         op.error('classifier %r unknown' % name)
         sys.exit(1)
 
-print("")
+print()
 print("Training Classifiers")
 print("====================")
-print("")
+print()
 err, train_time, test_time = {}, {}, {}
 for name in sorted(selected_classifiers):
     print("Training %s ..." % name)
@@ -266,10 +268,10 @@ for name in sorted(selected_classifiers):
 
 ######################################################################
 ## Print classification performance
-print("")
+print()
 print("Classification performance:")
 print("===========================")
-print("")
+print()
 
 
 def print_row(clf_type, train_time, test_time, err):
@@ -284,5 +286,5 @@ print("-" * 44)
 
 for name in sorted(selected_classifiers, key=lambda name: err[name]):
     print_row(name, train_time[name], test_time[name], err[name])
-print("")
-print("")
+print()
+print()
diff --git a/benchmarks/bench_glm.py b/benchmarks/bench_glm.py
index fed5e5bc106f867c9f5d64d05b2db264709436bc..e5cfb943262530660aa085e2bcc772df4c3a42d3 100644
--- a/benchmarks/bench_glm.py
+++ b/benchmarks/bench_glm.py
@@ -24,7 +24,7 @@ if __name__ == '__main__':
 
     for i in range(n_iter):
 
-        print 'Iteration %s of %s' % (i, n_iter)
+        print('Iteration %s of %s' % (i, n_iter))
 
         n_samples, n_features = 10 * i + 3, 10 * i + 3
 
diff --git a/benchmarks/bench_glmnet.py b/benchmarks/bench_glmnet.py
index b9c77fbed06097fec5e1e69a9b3b89ed6a691d66..441e6ed519d0f0b13a85f80672671bade4a22728 100644
--- a/benchmarks/bench_glmnet.py
+++ b/benchmarks/bench_glmnet.py
@@ -38,9 +38,9 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef):
     delta = (time() - tstart)
     # stop time
 
-    print "duration: %0.3fs" % delta
-    print "rmse: %f" % rmse(Y_test, clf.predict(X_test))
-    print "mean coef abs diff: %f" % abs(ref_coef - clf.coef_.ravel()).mean()
+    print("duration: %0.3fs" % delta)
+    print("rmse: %f" % rmse(Y_test, clf.predict(X_test)))
+    print("mean coef abs diff: %f" % abs(ref_coef - clf.coef_.ravel()).mean())
     return delta
 
 
@@ -58,9 +58,9 @@ if __name__ == '__main__':
     n_informative = n_features / 10
     n_test_samples = 1000
     for i in range(1, n + 1):
-        print '=================='
-        print 'Iteration %s of %s' % (i, n)
-        print '=================='
+        print('==================')
+        print('Iteration %s of %s' % (i, n))
+        print('==================')
 
         X, Y, coef_ = make_regression(
             n_samples=(i * step) + n_test_samples, n_features=n_features,
@@ -71,9 +71,9 @@ if __name__ == '__main__':
         X = X[:(i * step)]
         Y = Y[:(i * step)]
 
-        print "benching scikit: "
+        print("benching scikit-learn: ")
         scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_))
-        print "benching glmnet: "
+        print("benching glmnet: ")
         glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_))
 
     pl.clf()
@@ -96,9 +96,9 @@ if __name__ == '__main__':
     n_samples = 500
 
     for i in range(1, n + 1):
-        print '=================='
-        print 'Iteration %02d of %02d' % (i, n)
-        print '=================='
+        print('==================')
+        print('Iteration %02d of %02d' % (i, n))
+        print('==================')
         n_features = i * step
         n_informative = n_features / 10
 
@@ -111,9 +111,9 @@ if __name__ == '__main__':
         X = X[:n_samples]
         Y = Y[:n_samples]
 
-        print "benching scikit: "
+        print("benching scikit-learn: ")
         scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_))
-        print "benching glmnet: "
+        print("benching glmnet: ")
         glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_))
 
     xx = np.arange(100, 100 + n * step, step)
diff --git a/benchmarks/bench_lasso.py b/benchmarks/bench_lasso.py
index c72b21eb54a0a51d3729e9a8467205b55d78db94..ccc20b684930caecef24d1b8a3398a4ccee98c07 100644
--- a/benchmarks/bench_lasso.py
+++ b/benchmarks/bench_lasso.py
@@ -19,20 +19,18 @@ from sklearn.datasets.samples_generator import make_regression
 
 
 def compute_bench(alpha, n_samples, n_features, precompute):
-
     lasso_results = []
     lars_lasso_results = []
 
-    n_test_samples = 0
     it = 0
 
     for ns in n_samples:
         for nf in n_features:
             it += 1
-            print '=================='
-            print 'Iteration %s of %s' % (it, max(len(n_samples),
-                                          len(n_features)))
-            print '=================='
+            print('==================')
+            print('Iteration %s of %s' % (it, max(len(n_samples),
+                                          len(n_features))))
+            print('==================')
             n_informative = nf // 10
             X, Y, coef_ = make_regression(n_samples=ns, n_features=nf,
                                           n_informative=n_informative,
@@ -41,7 +39,7 @@ def compute_bench(alpha, n_samples, n_features, precompute):
             X /= np.sqrt(np.sum(X ** 2, axis=0))  # Normalize data
 
             gc.collect()
-            print "- benching Lasso"
+            print("- benching Lasso")
             clf = Lasso(alpha=alpha, fit_intercept=False,
                         precompute=precompute)
             tstart = time()
@@ -49,7 +47,7 @@ def compute_bench(alpha, n_samples, n_features, precompute):
             lasso_results.append(time() - tstart)
 
             gc.collect()
-            print "- benching LassoLars"
+            print("- benching LassoLars")
             clf = LassoLars(alpha=alpha, fit_intercept=False,
                             normalize=False, precompute=precompute)
             tstart = time()
diff --git a/benchmarks/bench_plot_fastkmeans.py b/benchmarks/bench_plot_fastkmeans.py
index 974136b75857725af166c7d1c71df7c745a00a50..c58dde0159da8cdbc6b3dd9deb164ae58ea3d60c 100644
--- a/benchmarks/bench_plot_fastkmeans.py
+++ b/benchmarks/bench_plot_fastkmeans.py
@@ -1,6 +1,7 @@
-from time import time
+from __future__ import print_function
 
 from collections import defaultdict
+from time import time
 
 import numpy as np
 from numpy import random as nr
@@ -11,7 +12,6 @@ from sklearn.cluster.k_means_ import KMeans, MiniBatchKMeans
 def compute_bench(samples_range, features_range):
 
     it = 0
-    iterations = 200
     results = defaultdict(lambda: [])
     chunk = 100
 
@@ -19,26 +19,25 @@ def compute_bench(samples_range, features_range):
     for n_samples in samples_range:
         for n_features in features_range:
             it += 1
-            print '=============================='
-            print 'Iteration %03d of %03d' % (it, max_it)
-            print '=============================='
-            print ''
+            print('==============================')
+            print('Iteration %03d of %03d' % (it, max_it))
+            print('==============================')
+            print()
             data = nr.random_integers(-50, 50, (n_samples, n_features))
 
-            print 'K-Means'
+            print('K-Means')
             tstart = time()
-            kmeans = KMeans(init='k-means++',
-                            k=10).fit(data)
+            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)
 
             delta = time() - tstart
-            print "Speed: %0.3fs" % delta
-            print "Inertia: %0.5f" % kmeans.inertia_
-            print ''
+            print("Speed: %0.3fs" % delta)
+            print("Inertia: %0.5f" % kmeans.inertia_)
+            print()
 
             results['kmeans_speed'].append(delta)
             results['kmeans_quality'].append(kmeans.inertia_)
 
-            print 'Fast K-Means'
+            print('Fast K-Means')
             # let's prepare the data in small chunks
             mbkmeans = MiniBatchKMeans(init='k-means++',
-                                      k=10,
+                                      n_clusters=10,
@@ -46,10 +45,10 @@ def compute_bench(samples_range, features_range):
             tstart = time()
             mbkmeans.fit(data)
             delta = time() - tstart
-            print "Speed: %0.3fs" % delta
-            print "Inertia: %f" % mbkmeans.inertia_
-            print ''
-            print ''
+            print("Speed: %0.3fs" % delta)
+            print("Inertia: %f" % mbkmeans.inertia_)
+            print()
+            print()
 
             results['minibatchkmeans_speed'].append(delta)
             results['minibatchkmeans_quality'].append(mbkmeans.inertia_)
@@ -69,22 +68,22 @@ def compute_bench_2(chunks):
     it = 0
     for chunk in chunks:
         it += 1
-        print '=============================='
-        print 'Iteration %03d of %03d' % (it, max_it)
-        print '=============================='
-        print ''
+        print('==============================')
+        print('Iteration %03d of %03d' % (it, max_it))
+        print('==============================')
+        print()
 
-        print 'Fast K-Means'
+        print('Fast K-Means')
         tstart = time()
         mbkmeans = MiniBatchKMeans(init='k-means++',
-                                    k=8,
-                                    batch_size=chunk)
+                                   n_clusters=8,
+                                   batch_size=chunk)
 
         mbkmeans.fit(X)
         delta = time() - tstart
-        print "Speed: %0.3fs" % delta
-        print "Inertia: %0.3fs" % mbkmeans.inertia_
-        print ''
+        print("Speed: %0.3fs" % delta)
+        print("Inertia: %0.3fs" % mbkmeans.inertia_)
+        print()
 
         results['minibatchkmeans_speed'].append(delta)
         results['minibatchkmeans_quality'].append(mbkmeans.inertia_)
diff --git a/benchmarks/bench_plot_lasso_path.py b/benchmarks/bench_plot_lasso_path.py
index d79d81ca30a19407c7fd7018905f3bdce1b70fd9..e0ee81521e66b47b53fe15cfdc018d67850cfe62 100644
--- a/benchmarks/bench_plot_lasso_path.py
+++ b/benchmarks/bench_plot_lasso_path.py
@@ -2,12 +2,14 @@
 
-The input data is mostly low rank but is a fat infinite tail.
+The input data is mostly low rank but has a fat, infinite tail.
 """
+from __future__ import print_function
+
+from collections import defaultdict
 import gc
-from time import time
 import sys
+from time import time
 
 import numpy as np
-from collections import defaultdict
 
 from sklearn.linear_model import lars_path
 from sklearn.linear_model import lasso_path
@@ -24,9 +26,9 @@ def compute_bench(samples_range, features_range):
     for n_samples in samples_range:
         for n_features in features_range:
             it += 1
-            print '===================='
-            print 'Iteration %03d of %03d' % (it, max_it)
-            print '===================='
+            print('====================')
+            print('Iteration %03d of %03d' % (it, max_it))
+            print('====================')
             dataset_kwargs = {
                 'n_samples': n_samples,
                 'n_features': n_features,
@@ -35,46 +37,46 @@ def compute_bench(samples_range, features_range):
                 #'effective_rank': None,
                 'bias': 0.0,
             }
-            print "n_samples: %d" % n_samples
-            print "n_features: %d" % n_features
+            print("n_samples: %d" % n_samples)
+            print("n_features: %d" % n_features)
             X, y = make_regression(**dataset_kwargs)
 
             gc.collect()
-            print "benching lars_path (with Gram):",
+            print("benching lars_path (with Gram):", end='')
             sys.stdout.flush()
             tstart = time()
             G = np.dot(X.T, X)  # precomputed Gram matrix
             Xy = np.dot(X.T, y)
             lars_path(X, y, Xy=Xy, Gram=G, method='lasso')
             delta = time() - tstart
-            print "%0.3fs" % delta
+            print("%0.3fs" % delta)
             results['lars_path (with Gram)'].append(delta)
 
             gc.collect()
-            print "benching lars_path (without Gram):",
+            print("benching lars_path (without Gram):", end='')
             sys.stdout.flush()
             tstart = time()
             lars_path(X, y, method='lasso')
             delta = time() - tstart
-            print "%0.3fs" % delta
+            print("%0.3fs" % delta)
             results['lars_path (without Gram)'].append(delta)
 
             gc.collect()
-            print "benching lasso_path (with Gram):",
+            print("benching lasso_path (with Gram):", end='')
             sys.stdout.flush()
             tstart = time()
             lasso_path(X, y, precompute=True)
             delta = time() - tstart
-            print "%0.3fs" % delta
+            print("%0.3fs" % delta)
             results['lasso_path (with Gram)'].append(delta)
 
             gc.collect()
-            print "benching lasso_path (without Gram):",
+            print("benching lasso_path (without Gram):", end='')
             sys.stdout.flush()
             tstart = time()
             lasso_path(X, y, precompute=False)
             delta = time() - tstart
-            print "%0.3fs" % delta
+            print("%0.3fs" % delta)
             results['lasso_path (without Gram)'].append(delta)
 
     return results
diff --git a/benchmarks/bench_plot_neighbors.py b/benchmarks/bench_plot_neighbors.py
index 3cfc9883997ecabbe2551af6a9553c478a1adf79..2cc116ab5717dc5968705e4b4f57f82f3cb96d9c 100644
--- a/benchmarks/bench_plot_neighbors.py
+++ b/benchmarks/bench_plot_neighbors.py
@@ -43,8 +43,8 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11),
     N_results_query = dict([(alg, np.zeros(len(Nrange)))
                             for alg in algorithms])
 
-    for i, NN in enumerate(Nrange):
-        print "N = %i (%i out of %i)" % (NN, i + 1, len(Nrange))
+    for i, NN in enumerate(Nrange, 1):
+        print("N = %i (%i out of %i)" % (NN, i, len(Nrange)))
         X = get_data(NN, D, dataset)
         for algorithm in algorithms:
             nbrs = neighbors.NearestNeighbors(n_neighbors=min(NN, k),
@@ -66,8 +66,8 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11),
     D_results_query = dict([(alg, np.zeros(len(Drange)))
                             for alg in algorithms])
 
-    for i, DD in enumerate(Drange):
-        print "D = %i (%i out of %i)" % (DD, i + 1, len(Drange))
+    for i, DD in enumerate(Drange, 1):
+        print("D = %i (%i out of %i)" % (DD, i, len(Drange)))
         X = get_data(N, DD, dataset)
         for algorithm in algorithms:
             nbrs = neighbors.NearestNeighbors(n_neighbors=k,
@@ -91,8 +91,8 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11),
 
     X = get_data(N, DD, dataset)
 
-    for i, kk in enumerate(krange):
-        print "k = %i (%i out of %i)" % (kk, i + 1, len(krange))
+    for i, kk in enumerate(krange, 1):
+        print("k = %i (%i out of %i)" % (kk, i, len(krange)))
         for algorithm in algorithms:
             nbrs = neighbors.NearestNeighbors(n_neighbors=kk,
                                               algorithm=algorithm,
diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py
index 27c903e055a8c85745b633585fb70ec0c0e3c667..abfd3208d18cecc0a0560f7874aece39c030943e 100644
--- a/benchmarks/bench_plot_nmf.py
+++ b/benchmarks/bench_plot_nmf.py
@@ -75,20 +75,20 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
     for n_samples in samples_range:
         for n_features in features_range:
             it += 1
-            print '===================='
-            print 'Iteration %03d of %03d' % (it, max_it)
-            print '===================='
+            print('====================')
+            print('Iteration %03d of %03d' % (it, max_it))
+            print('====================')
             X = np.abs(make_low_rank_matrix(n_samples, n_features,
                        effective_rank=rank,  tail_strength=0.2))
 
             gc.collect()
-            print "benching nndsvd-nmf: "
+            print("benching nndsvd-nmf: ")
             tstart = time()
             m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
             tend = time() - tstart
             timeset['nndsvd-nmf'].append(tend)
             err['nndsvd-nmf'].append(m.reconstruction_err_)
-            print m.reconstruction_err_, tend
+            print(m.reconstruction_err_, tend)
 
             gc.collect()
             print "benching nndsvda-nmf: "
@@ -98,36 +98,36 @@ def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
             tend = time() - tstart
             timeset['nndsvda-nmf'].append(tend)
             err['nndsvda-nmf'].append(m.reconstruction_err_)
-            print m.reconstruction_err_, tend
+            print(m.reconstruction_err_, tend)
 
             gc.collect()
-            print "benching nndsvdar-nmf: "
+            print("benching nndsvdar-nmf: ")
             tstart = time()
             m = NMF(n_components=30, init='nndsvdar',
                     tol=tolerance).fit(X)
             tend = time() - tstart
             timeset['nndsvdar-nmf'].append(tend)
             err['nndsvdar-nmf'].append(m.reconstruction_err_)
-            print m.reconstruction_err_, tend
+            print(m.reconstruction_err_, tend)
 
             gc.collect()
-            print "benching random-nmf"
+            print("benching random-nmf")
             tstart = time()
             m = NMF(n_components=30, init=None, max_iter=1000,
                     tol=tolerance).fit(X)
             tend = time() - tstart
             timeset['random-nmf'].append(tend)
             err['random-nmf'].append(m.reconstruction_err_)
-            print m.reconstruction_err_, tend
+            print(m.reconstruction_err_, tend)
 
             gc.collect()
-            print "benching alt-random-nmf"
+            print("benching alt-random-nmf")
             tstart = time()
             W, H = alt_nnmf(X, r=30, R=None, tol=tolerance)
             tend = time() - tstart
             timeset['alt-random-nmf'].append(tend)
             err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
-            print np.linalg.norm(X - np.dot(W, H)), tend
+            print(np.linalg.norm(X - np.dot(W, H)), tend)
 
     return timeset, err
 
diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py
index 8b50df658b4a37db33811a9a879f5ca13acfe54e..f9d9225c1a216be80d82f8f5fde71505c4ffa3f3 100644
--- a/benchmarks/bench_plot_omp_lars.py
+++ b/benchmarks/bench_plot_omp_lars.py
@@ -3,9 +3,11 @@ regression (:ref:`least_angle_regression`)
 
-The input data is mostly low rank but is a fat infinite tail.
+The input data is mostly low rank but has a fat, infinite tail.
 """
+from __future__ import print_function
+
 import gc
-from time import time
 import sys
+from time import time
 
 import numpy as np
 
@@ -28,9 +30,9 @@ def compute_bench(samples_range, features_range):
         for i_f, n_features in enumerate(features_range):
             it += 1
             n_informative = n_features / 10
-            print '===================='
-            print 'Iteration %03d of %03d' % (it, max_it)
-            print '===================='
+            print('====================')
+            print('Iteration %03d of %03d' % (it, max_it))
+            print('====================')
             # dataset_kwargs = {
             #     'n_train_samples': n_samples,
             #     'n_test_samples': 2,
@@ -47,49 +49,49 @@ def compute_bench(samples_range, features_range):
                 'n_nonzero_coefs': n_informative,
                 'random_state': 0
             }
-            print "n_samples: %d" % n_samples
-            print "n_features: %d" % n_features
+            print("n_samples: %d" % n_samples)
+            print("n_features: %d" % n_features)
             y, X, _ = make_sparse_coded_signal(**dataset_kwargs)
             X = np.asfortranarray(X)
 
             gc.collect()
-            print "benching lars_path (with Gram):",
+            print("benching lars_path (with Gram):", end='')
             sys.stdout.flush()
             tstart = time()
             G = np.dot(X.T, X)  # precomputed Gram matrix
             Xy = np.dot(X.T, y)
             lars_path(X, y, Xy=Xy, Gram=G, max_iter=n_informative)
             delta = time() - tstart
-            print "%0.3fs" % delta
+            print("%0.3fs" % delta)
             lars_gram[i_f, i_s] = delta
 
             gc.collect()
-            print "benching lars_path (without Gram):",
+            print("benching lars_path (without Gram):", end='')
             sys.stdout.flush()
             tstart = time()
             lars_path(X, y, Gram=None, max_iter=n_informative)
             delta = time() - tstart
-            print "%0.3fs" % delta
+            print("%0.3fs" % delta)
             lars[i_f, i_s] = delta
 
             gc.collect()
-            print "benching orthogonal_mp (with Gram):",
+            print("benching orthogonal_mp (with Gram):", end='')
             sys.stdout.flush()
             tstart = time()
             orthogonal_mp(X, y, precompute_gram=True,
                           n_nonzero_coefs=n_informative)
             delta = time() - tstart
-            print "%0.3fs" % delta
+            print("%0.3fs" % delta)
             omp_gram[i_f, i_s] = delta
 
             gc.collect()
-            print "benching orthogonal_mp (without Gram):",
+            print("benching orthogonal_mp (without Gram):", end='')
             sys.stdout.flush()
             tstart = time()
             orthogonal_mp(X, y, precompute_gram=False,
                           n_nonzero_coefs=n_informative)
             delta = time() - tstart
-            print "%0.3fs" % delta
+            print("%0.3fs" % delta)
             omp[i_f, i_s] = delta
 
     results['time(LARS) / time(OMP)\n (w/ Gram)'] = (lars_gram / omp_gram)
diff --git a/benchmarks/bench_plot_svd.py b/benchmarks/bench_plot_svd.py
index b8a551c63da1a88e30b0fd69453c7e8a99011e42..ed8af0202837eb2046d87332aeaea7c189860df8 100644
--- a/benchmarks/bench_plot_svd.py
+++ b/benchmarks/bench_plot_svd.py
@@ -22,29 +22,29 @@ def compute_bench(samples_range, features_range, n_iter=3, rank=50):
     for n_samples in samples_range:
         for n_features in features_range:
             it += 1
-            print '===================='
-            print 'Iteration %03d of %03d' % (it, max_it)
-            print '===================='
+            print('====================')
+            print('Iteration %03d of %03d' % (it, max_it))
+            print('====================')
             X = make_low_rank_matrix(n_samples, n_features,
                                   effective_rank=rank,
                                   tail_strength=0.2)
 
             gc.collect()
-            print "benching scipy svd: "
+            print("benching scipy svd: ")
             tstart = time()
             svd(X, full_matrices=False)
             results['scipy svd'].append(time() - tstart)
 
             gc.collect()
-            print "benching scikit-learn randomized_svd: n_iter=0"
+            print("benching scikit-learn randomized_svd: n_iter=0")
             tstart = time()
             randomized_svd(X, rank, n_iter=0)
             results['scikit-learn randomized_svd (n_iter=0)'].append(
                 time() - tstart)
 
             gc.collect()
-            print ("benching scikit-learn randomized_svd: n_iter=%d "
-                   % n_iter)
+            print("benching scikit-learn randomized_svd: n_iter=%d "
+                  % n_iter)
             tstart = time()
             randomized_svd(X, rank, n_iter=n_iter)
             results['scikit-learn randomized_svd (n_iter=%d)'
diff --git a/benchmarks/bench_sgd_regression.py b/benchmarks/bench_sgd_regression.py
index e07bd0add590b7fac6b6e594640542eb505629d5..60d32c8a93e0ccb7c65d4c9aee8dd0f9c80bed8b 100644
--- a/benchmarks/bench_sgd_regression.py
+++ b/benchmarks/bench_sgd_regression.py
@@ -5,7 +5,7 @@ Compares SGD regression against coordinate descent and Ridge
-on synthetik data.
+on synthetic data.
 """
 
-print __doc__
+print(__doc__)
 
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
 # License: BSD Style.
@@ -41,10 +41,10 @@ if __name__ == "__main__":
             X_test = X[n_train:]
             y_test = y[n_train:]
 
-            print "======================="
-            print "Round %d %d" % (i, j)
-            print "n_features:", n_features
-            print "n_samples:", n_train
+            print("=======================")
+            print("Round %d %d" % (i, j))
+            print("n_features:", n_features)
+            print("n_samples:", n_train)
 
             # Shuffle data
             idx = np.arange(n_train)
@@ -64,7 +64,7 @@ if __name__ == "__main__":
             y_test = (y_test - mean) / std
 
             gc.collect()
-            print "- benching ElasticNet"
+            print("- benching ElasticNet")
             clf = ElasticNet(alpha=alpha, rho=0.5, fit_intercept=False)
             tstart = time()
             clf.fit(X_train, y_train)
@@ -73,7 +73,7 @@ if __name__ == "__main__":
             elnet_results[i, j, 1] = time() - tstart
 
             gc.collect()
-            print "- benching SGD"
+            print("- benching SGD")
             n_iter = np.ceil(10 ** 4.0 / n_train)
             clf = SGDRegressor(alpha=alpha, fit_intercept=False,
                                n_iter=n_iter, learning_rate="invscaling",
@@ -86,7 +86,7 @@ if __name__ == "__main__":
             sgd_results[i, j, 1] = time() - tstart
 
             gc.collect()
-            print "- benching RidgeRegression"
+            print("- benching RidgeRegression")
             clf = Ridge(alpha=alpha, fit_intercept=False)
             tstart = time()
             clf.fit(X_train, y_train)
diff --git a/benchmarks/bench_tree.py b/benchmarks/bench_tree.py
index 77f76d8e7049a56d2a837eb5b09936a01aeb4477..3d4c85a89159667b7c1108669bc2e1f2166696e5 100644
--- a/benchmarks/bench_tree.py
+++ b/benchmarks/bench_tree.py
@@ -63,9 +63,9 @@ def bench_scikit_tree_regressor(X, Y):
 
 if __name__ == '__main__':
 
-    print '============================================'
-    print 'Warning: this is going to take a looong time'
-    print '============================================'
+    print('============================================')
+    print('Warning: this is going to take a looong time')
+    print('============================================')
 
     n = 10
     step = 10000
@@ -73,9 +73,9 @@ if __name__ == '__main__':
     dim = 10
     n_classes = 10
     for i in range(n):
-        print '============================================'
-        print 'Entering iteration %s of %s' % (i, n)
-        print '============================================'
+        print('============================================')
+        print('Entering iteration %s of %s' % (i, n))
+        print('============================================')
         n_samples += step
         X = np.random.randn(n_samples, dim)
         Y = np.random.randint(0, n_classes, (n_samples,))
@@ -102,9 +102,9 @@ if __name__ == '__main__':
 
     dim = start_dim
     for i in range(0, n):
-        print '============================================'
-        print 'Entering iteration %s of %s' % (i, n)
-        print '============================================'
+        print('============================================')
+        print('Entering iteration %s of %s' % (i, n))
+        print('============================================')
         dim += step
         X = np.random.randn(100, dim)
         Y = np.random.randint(0, n_classes, (100,))
diff --git a/doc/datasets/labeled_faces.rst b/doc/datasets/labeled_faces.rst
index cc7d14cffeea58d050a431ea41867b5da055a022..bcda19bd82f20b9ecd9015c6069970beee64afd5 100644
--- a/doc/datasets/labeled_faces.rst
+++ b/doc/datasets/labeled_faces.rst
@@ -43,7 +43,7 @@ classification task (hence supervised learning)::
   >>> lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
 
   >>> for name in lfw_people.target_names:
-  ...     print name
+  ...     print(name)
   ...
   Ariel Sharon
   Colin Powell
diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 71671162e60972724897ee850cc7cdc6b2edbd1a..576b99f5f8209b63fb4725a701b00ab68adf9920 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -78,7 +78,7 @@ the data and fitting a model and computing the score 5 consecutive times
 The mean score and the standard deviation of the score estimate are hence given
 by::
 
-  >>> print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
+  >>> print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))
   Accuracy: 0.97 (+/- 0.02)
 
 By default, the score computed at each CV iteration is the ``score``
@@ -158,11 +158,11 @@ Example of 2-fold::
   >>> Y = np.array([0, 1, 0, 1])
 
   >>> kf = KFold(len(Y), n_folds=2, indices=False)
-  >>> print kf
+  >>> print(kf)
   sklearn.cross_validation.KFold(n=4, n_folds=2)
 
   >>> for train, test in kf:
-  ...     print train, test
+  ...     print(train, test)
   [False False  True  True] [ True  True False False]
   [ True  True False False] [False False  True  True]
 
@@ -181,7 +181,7 @@ when creating the cross-validation procedure::
 
   >>> kf = KFold(len(Y), n_folds=2, indices=True)
   >>> for train, test in kf:
-  ...    print train, test
+  ...    print(train, test)
   [2 3] [0 1]
   [0 1] [2 3]
 
@@ -206,11 +206,11 @@ Example of stratified 2-fold::
   >>> Y = [0, 0, 0, 1, 1, 1, 0]
 
   >>> skf = StratifiedKFold(Y, 2)
-  >>> print skf
+  >>> print(skf)
   sklearn.cross_validation.StratifiedKFold(labels=[0 0 0 1 1 1 0], n_folds=2)
 
   >>> for train, test in skf:
-  ...     print train, test
+  ...     print(train, test)
   [1 4 6] [0 2 3 5]
   [0 2 3 5] [1 4 6]
 
@@ -229,11 +229,11 @@ not waste much data as only one sample is removed from the learning set::
   >>> Y = np.array([0, 1, 0, 1])
 
   >>> loo = LeaveOneOut(len(Y))
-  >>> print loo
+  >>> print(loo)
   sklearn.cross_validation.LeaveOneOut(n=4)
 
   >>> for train, test in loo:
-  ...    print train, test
+  ...    print(train, test)
   [1 2 3] [0]
   [0 2 3] [1]
   [0 1 3] [2]
@@ -253,11 +253,11 @@ Example of Leave-2-Out::
   >>> Y = [0, 1, 0, 1]
 
   >>> lpo = LeavePOut(len(Y), 2)
-  >>> print lpo
+  >>> print(lpo)
   sklearn.cross_validation.LeavePOut(n=4, p=2)
 
   >>> for train, test in lpo:
-  ...     print train, test
+  ...     print(train, test)
   [2 3] [0 1]
   [1 3] [0 2]
   [1 2] [0 3]
@@ -287,11 +287,11 @@ a training set using the samples of all the experiments except one::
   >>> labels = [1, 1, 2, 2]
 
   >>> lolo = LeaveOneLabelOut(labels)
-  >>> print lolo
+  >>> print(lolo)
   sklearn.cross_validation.LeaveOneLabelOut(labels=[1, 1, 2, 2])
 
   >>> for train, test in lolo:
-  ...     print train, test
+  ...     print(train, test)
   [2 3] [0 1]
   [0 1] [2 3]
 
@@ -314,11 +314,11 @@ Example of Leave-2-Label Out::
   >>> labels = [1, 1, 2, 2, 3, 3]
 
   >>> lplo = LeavePLabelOut(labels, 2)
-  >>> print lplo
+  >>> print(lplo)
   sklearn.cross_validation.LeavePLabelOut(labels=[1, 1, 2, 2, 3, 3], p=2)
 
   >>> for train, test in lplo:
-  ...     print train, test
+  ...     print(train, test)
   [4 5] [0 1 2 3]
   [2 3] [0 1 4 5]
   [0 1] [2 3 4 5]
@@ -344,11 +344,11 @@ Here is a usage example::
   ...     random_state=0)
   >>> len(ss)
   3
-  >>> print ss                                            # doctest: +ELLIPSIS
+  >>> print(ss)                                           # doctest: +ELLIPSIS
   ShuffleSplit(5, n_iter=3, test_size=0.25, indices=True, ...)
 
   >>> for train_index, test_index in ss:
-  ...    print train_index, test_index
+  ...    print(train_index, test_index)
   ...
   [1 3 4] [2 0]
   [1 4 3] [0 2]
@@ -390,11 +390,11 @@ smaller than the total dataset if it is very large.
   >>> bs = cross_validation.Bootstrap(9, random_state=0)
   >>> len(bs)
   3
-  >>> print bs
+  >>> print(bs)
   Bootstrap(9, n_iter=3, train_size=5, test_size=4, random_state=0)
 
   >>> for train_index, test_index in bs:
-  ...    print train_index, test_index
+  ...    print(train_index, test_index)
   ...
   [1 8 7 7 8] [0 3 0 5]
   [5 4 2 4 2] [6 7 1 0]
diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst
index b591b733449ca0640ffb288eae7ff47e5fe3713d..ed2fdcbf687268756a64ca90d7b6e860cf97a42d 100644
--- a/doc/modules/naive_bayes.rst
+++ b/doc/modules/naive_bayes.rst
@@ -93,7 +93,7 @@ are estimated using maximum likelihood.
     >>> from sklearn.naive_bayes import GaussianNB
     >>> gnb = GaussianNB()
     >>> y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
-    >>> print "Number of mislabeled points : %d" % (iris.target != y_pred).sum()
+    >>> print("Number of mislabeled points : %d" % (iris.target != y_pred).sum())
     Number of mislabeled points : 6
 
 .. _multinomial_naive_bayes:
diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst
index 367d5e9309e0ee5524d743a8ca2fd441d63de757..756c1e4fd143fcbcf7a53505e1133a3fb3339736 100644
--- a/doc/modules/neighbors.rst
+++ b/doc/modules/neighbors.rst
@@ -380,7 +380,7 @@ methods that do not make this assumption. Usage of the default
     >>> clf = NearestCentroid()
     >>> clf.fit(X, y)
     NearestCentroid(metric='euclidean', shrink_threshold=None)
-    >>> print clf.predict([[-0.8, -1]])
+    >>> print(clf.predict([[-0.8, -1]]))
     [1]
 
 
diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst
index 7d9ec2738b09c30dd13b13cccb5539debe46764f..aa8f732d140f2d45da8c1d13f423013208981035 100644
--- a/doc/tutorial/basic/tutorial.rst
+++ b/doc/tutorial/basic/tutorial.rst
@@ -93,7 +93,7 @@ section <datasets>`.
 For instance, in the case of the digits dataset, ``digits.data`` gives
 access to the features that can be used to classify the digits samples::
 
-  >>> print digits.data  # doctest: +NORMALIZE_WHITESPACE
+  >>> print(digits.data)  # doctest: +NORMALIZE_WHITESPACE
   [[  0.   0.   5. ...,   0.   0.   0.]
    [  0.   0.   0. ...,  10.   0.   0.]
    [  0.   0.   0. ...,  16.   9.   0.]
diff --git a/doc/tutorial/statistical_inference/model_selection.rst b/doc/tutorial/statistical_inference/model_selection.rst
index 0ec9050fb5834b1355ff69c9d3ad948990a2d33e..81e839b9595b8656bafc4b846e2f5ea3ba7615ab 100644
--- a/doc/tutorial/statistical_inference/model_selection.rst
+++ b/doc/tutorial/statistical_inference/model_selection.rst
@@ -38,7 +38,7 @@ data in *folds* that we use for training and testing::
     ...     y_test  = y_train.pop(k)
     ...     y_train = np.concatenate(y_train)
     ...     scores.append(svc.fit(X_train, y_train).score(X_test, y_test))
-    >>> print scores
+    >>> print(scores)
     [0.93489148580968284, 0.95659432387312182, 0.93989983305509184]
 
 .. currentmodule:: sklearn.cross_validation
@@ -59,7 +59,7 @@ of indices for this purpose::
     >>> from sklearn import cross_validation
     >>> k_fold = cross_validation.KFold(n=6, n_folds=3, indices=True)
     >>> for train_indices, test_indices in k_fold:
-    ...      print 'Train: %s | test: %s' % (train_indices, test_indices)
+    ...      print('Train: %s | test: %s' % (train_indices, test_indices))
     Train: [2 3 4 5] | test: [0 1]
     Train: [0 1 4 5] | test: [2 3]
     Train: [0 1 2 3] | test: [4 5]
diff --git a/doc/tutorial/statistical_inference/supervised_learning.rst b/doc/tutorial/statistical_inference/supervised_learning.rst
index 3bfdef0e1199caf90e171c0803d062c08285ff04..6c4b5a3e7d059ecce382b618af96671b3f201a41 100644
--- a/doc/tutorial/statistical_inference/supervised_learning.rst
+++ b/doc/tutorial/statistical_inference/supervised_learning.rst
@@ -175,7 +175,7 @@ Linear models: :math:`y = X\beta + \epsilon`
     >>> regr = linear_model.LinearRegression()
     >>> regr.fit(diabetes_X_train, diabetes_y_train)
     LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
-    >>> print regr.coef_
+    >>> print(regr.coef_)
     [   0.30349955 -237.63931533  510.53060544  327.73698041 -814.13170937
       492.81458798  102.84845219  184.60648906  743.51961675   76.09517222]
 
@@ -328,7 +328,7 @@ application of Occam's razor: `prefer simpler models`.
     Lasso(alpha=0.025118864315095794, copy_X=True, fit_intercept=True,
        max_iter=1000, normalize=False, positive=False, precompute='auto',
        tol=0.0001, warm_start=False)
-    >>> print regr.coef_
+    >>> print(regr.coef_)
     [   0.         -212.43764548  517.19478111  313.77959962 -160.8303982    -0.
      -187.19554705   69.38229038  508.66011217   71.84239008]
 
diff --git a/doc/tutorial/statistical_inference/unsupervised_learning.rst b/doc/tutorial/statistical_inference/unsupervised_learning.rst
index 6990ab07d7d4ec263c8017676d7d02eef9116b16..13d0b4e67c7ad5c4b9af6a38a8470ce4cb181e7b 100644
--- a/doc/tutorial/statistical_inference/unsupervised_learning.rst
+++ b/doc/tutorial/statistical_inference/unsupervised_learning.rst
@@ -40,9 +40,9 @@ algorithms. The simplest clustering algorithm is
     >>> k_means = cluster.KMeans(n_clusters=3)
     >>> k_means.fit(X_iris) # doctest: +ELLIPSIS
     KMeans(copy_x=True, init='k-means++', ...
-    >>> print k_means.labels_[::10]
+    >>> print(k_means.labels_[::10])
     [1 1 1 1 1 0 0 0 0 0 2 2 2 2 2]
-    >>> print y_iris[::10]
+    >>> print(y_iris[::10])
     [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2]
 
 .. |k_means_iris_bad_init| image:: ../../auto_examples/cluster/images/plot_cluster_iris_3.png
@@ -276,7 +276,7 @@ data by projecting on a principal subspace.
     >>> pca = decomposition.PCA()
     >>> pca.fit(X)
     PCA(copy=True, n_components=None, whiten=False)
-    >>> print pca.explained_variance_  # doctest: +SKIP
+    >>> print(pca.explained_variance_)  # doctest: +SKIP
     [  2.18565811e+00   1.19346747e+00   8.43026679e-32]
 
     >>> # As we can see, only the 2 first components are useful
diff --git a/examples/ensemble/plot_adaboost_hastie_10_2.py b/examples/ensemble/plot_adaboost_hastie_10_2.py
index db9da0dca04dd980d3bcbe1f6173035568d33152..4c8f83aeae5040f94376314459373b7d4e2b58a4 100644
--- a/examples/ensemble/plot_adaboost_hastie_10_2.py
+++ b/examples/ensemble/plot_adaboost_hastie_10_2.py
@@ -18,7 +18,7 @@ whereas real SAMME.R uses the predicted class probabilities.
 .. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
 
 """
-print __doc__
+print(__doc__)
 
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>,
 #         Noel Dawe <noel.dawe@gmail.com>
diff --git a/examples/ensemble/plot_adaboost_multiclass.py b/examples/ensemble/plot_adaboost_multiclass.py
index e70f923b3983850a19e13e035f55f1f6437c0d08..3e9ec6e53b4cf280c7e6cc2a17409e4e2444f6af 100644
--- a/examples/ensemble/plot_adaboost_multiclass.py
+++ b/examples/ensemble/plot_adaboost_multiclass.py
@@ -23,7 +23,7 @@ therefore are not shown.
 .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
 
 """
-print __doc__
+print(__doc__)
 
 # Author: Noel Dawe <noel.dawe@gmail.com>
 #
diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py
index 0daf557d6dafa88be0411611d4ed763188dc4dba..9a419c2ba4dbd7ade9d45d71b976baff1cc64158 100644
--- a/examples/ensemble/plot_adaboost_regression.py
+++ b/examples/ensemble/plot_adaboost_regression.py
@@ -12,7 +12,7 @@ detail.
 .. [1] H. Drucker, "Improving Regressors using Boosting Techniques", 1997.
 
 """
-print __doc__
+print(__doc__)
 
 import numpy as np
 
diff --git a/examples/ensemble/plot_adaboost_twoclass.py b/examples/ensemble/plot_adaboost_twoclass.py
index 18afe629f978e698264ae712ab65e53dd25a0be7..290e841fb64a5588cd270209a639eb4273441b00 100644
--- a/examples/ensemble/plot_adaboost_twoclass.py
+++ b/examples/ensemble/plot_adaboost_twoclass.py
@@ -16,7 +16,7 @@ containing a desired purity of class B, for example, by only selecting samples
 with a decision score above some value.
 
 """
-print __doc__
+print(__doc__)
 
 import pylab as pl
 import numpy as np
diff --git a/examples/exercises/plot_cv_digits.py b/examples/exercises/plot_cv_digits.py
index 6529df4d9ecfeee52ec66ea21313f5cc2fcda5ac..dde717902d6b1a302d6aba018f1fae98657a6022 100644
--- a/examples/exercises/plot_cv_digits.py
+++ b/examples/exercises/plot_cv_digits.py
@@ -6,7 +6,7 @@ Cross-validation on Digits Dataset Exercise
 This exercise is used in the :ref:`cv_generators_tut` part of the
 :ref:`model_selection_tut` section of the :ref:`stat_learn_tut_index`.
 """
-print __doc__
+print(__doc__)
 
 
 import numpy as np
diff --git a/examples/mixture/plot_gmm_classifier.py b/examples/mixture/plot_gmm_classifier.py
index af6ffeb766f61367595ebd5427d8717cfb8d436f..03b043cd3273a919a4a46e51e17493b333ffc925 100644
--- a/examples/mixture/plot_gmm_classifier.py
+++ b/examples/mixture/plot_gmm_classifier.py
@@ -19,7 +19,7 @@ crosses. The iris dataset is four-dimensional. Only the first two
 dimensions are shown here, and thus some points are separated in other
 dimensions.
 """
-print __doc__
+print(__doc__)
 
 # Author: Ron Weiss <ronweiss@gmail.com>, Gael Varoquaux
 # License: BSD Style.
diff --git a/examples/mixture/plot_gmm_selection.py b/examples/mixture/plot_gmm_selection.py
index 4f9b46a95fad04029b167221bf06ce0753866798..4fa015b9523559426ce5871286d4f0403532c7bb 100644
--- a/examples/mixture/plot_gmm_selection.py
+++ b/examples/mixture/plot_gmm_selection.py
@@ -14,7 +14,7 @@ Unlike Bayesian procedures, such inferences are prior-free.
 In that case, the model with 2 components and full covariance
 (which corresponds to the true generative model) is selected.
 """
-print __doc__
+print(__doc__)
 
 import itertools
 
diff --git a/examples/plot_confusion_matrix.py b/examples/plot_confusion_matrix.py
index d4de1c4d93fd58382df6b11912f61f2ed901b0f0..4585303dc3c110103d2a0b5fe99fa097ec8105e6 100644
--- a/examples/plot_confusion_matrix.py
+++ b/examples/plot_confusion_matrix.py
@@ -31,7 +31,7 @@ y_ = classifier.fit(X[:half], y[:half]).predict(X[half:])
 # Compute confusion matrix
 cm = confusion_matrix(y[half:], y_)
 
-print cm
+print(cm)
 
 # Show confusion matrix
 pl.matshow(cm)
diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py
index 67aa0ddb896ec39b7f632fdedd5b6c7cbb0473a3..876435a27dda5d28d067be55129d9057b2194e0a 100644
--- a/sklearn/cluster/spectral.py
+++ b/sklearn/cluster/spectral.py
@@ -137,7 +137,7 @@ def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20,
                 U, S, Vh = np.linalg.svd(t_svd)
                 svd_restarts += 1
             except LinAlgError:
-                print "SVD did not converge, randomizing and trying again"
+                print("SVD did not converge, randomizing and trying again")
                 break
 
             ncut_value = 2.0 * (n_samples - S.sum())
diff --git a/sklearn/covariance/tests/test_covariance.py b/sklearn/covariance/tests/test_covariance.py
index 177b4fb8b998e010d487de9e63118a44a27bc557..d5606117d929d73f8a28e8be49c5be7e901bb68b 100644
--- a/sklearn/covariance/tests/test_covariance.py
+++ b/sklearn/covariance/tests/test_covariance.py
@@ -44,7 +44,7 @@ def test_covariance():
                   cov.error_norm, emp_cov, norm='foo')
     # Mahalanobis distances computation test
     mahal_dist = cov.mahalanobis(X)
-    print np.amin(mahal_dist), np.amax(mahal_dist)
+    print(np.amin(mahal_dist), np.amax(mahal_dist))
     assert(np.amin(mahal_dist) > 0)
 
     # test with n_features = 1
diff --git a/sklearn/covariance/tests/test_robust_covariance.py b/sklearn/covariance/tests/test_robust_covariance.py
index 7f953ee5dc554b05649930e8068fcbf3b79a0805..51e0b0e3878c58b751accf736b232de0a0bb61d7 100644
--- a/sklearn/covariance/tests/test_robust_covariance.py
+++ b/sklearn/covariance/tests/test_robust_covariance.py
@@ -82,7 +82,7 @@ def test_outlier_detection():
     rnd = np.random.RandomState(0)
     X = rnd.randn(100, 10)
     clf = EllipticEnvelope(contamination=0.1)
-    print clf.threshold
+    print(clf.threshold)
     assert_raises(Exception, clf.predict, X)
     assert_raises(Exception, clf.decision_function, X)
     clf.fit(X)
diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index da957030df5253dc1768ec6272dbd4529b3f4bce..97a2268c4815e7d42b29afa0bce263dddbc1cd93 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -283,7 +283,7 @@ def load_digits(n_class=10):
 
         >>> from sklearn.datasets import load_digits
         >>> digits = load_digits()
-        >>> print digits.data.shape
+        >>> print(digits.data.shape)
         (1797, 64)
         >>> import pylab as pl #doctest: +SKIP
         >>> pl.gray() #doctest: +SKIP
@@ -391,7 +391,7 @@ def load_boston():
     --------
     >>> from sklearn.datasets import load_boston
     >>> boston = load_boston()
-    >>> print boston.data.shape
+    >>> print(boston.data.shape)
     (506, 13)
     """
     module_path = dirname(__file__)
diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
index c421a9227698e4ef5f1d6fb4aebecaeb172ea8ef..7c866df8ba3ffb2cf31566f7afad6f69b8e64a0b 100644
--- a/sklearn/datasets/california_housing.py
+++ b/sklearn/datasets/california_housing.py
@@ -60,8 +60,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
     if not exists(data_home):
         makedirs(data_home)
     if not exists(join(data_home, TARGET_FILENAME)):
-        print 'downloading Cal. housing from %s to %s' % (DATA_URL,
-                                                          data_home)
+        print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
         fhandle = urllib2.urlopen(DATA_URL)
         buf = BytesIO(fhandle.read())
         zip_file = ZipFile(buf)
diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py
index 31ac8f0233c6125d30d4a6bad85bf173f3f41516..0aa9eaee4e8ea3609d578172e393c3f0f4ad7dff 100644
--- a/sklearn/datasets/samples_generator.py
+++ b/sklearn/datasets/samples_generator.py
@@ -638,7 +638,7 @@ def make_blobs(n_samples=100, n_features=2, centers=3, cluster_std=1.0,
     >>> from sklearn.datasets.samples_generator import make_blobs
     >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
     ...                   random_state=0)
-    >>> print X.shape
+    >>> print(X.shape)
     (10, 2)
     >>> y
     array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])
diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py
index 71efb637115a5a6f93267ac6481300a3cde30093..f63ddaf4af214f35d08da1666067d875b1b105e5 100644
--- a/sklearn/decomposition/factor_analysis.py
+++ b/sklearn/decomposition/factor_analysis.py
@@ -162,7 +162,7 @@ class FactorAnalysis(BaseEstimator, TransformerMixin):
             psi = np.maximum(var - np.sum(W ** 2, axis=0), SMALL)
         else:
             if self.verbose:
-                print "Did not converge"
+                print("Did not converge")
 
         self.components_ = W
         self.noise_variance_ = psi
diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py
index 0d8221c899a8e99677e28898f1b008be3031ff12..6ea91845b2612d55fa6a0d33080311101b941c44 100644
--- a/sklearn/feature_extraction/image.py
+++ b/sklearn/feature_extraction/image.py
@@ -280,7 +280,7 @@ def extract_patches_2d(image, patch_size, max_patches=None, random_state=None):
            [ 8,  9, 10, 11],
            [12, 13, 14, 15]])
     >>> patches = image.extract_patches_2d(one_image, (2, 2))
-    >>> print patches.shape
+    >>> print(patches.shape)
     (9, 2, 2)
     >>> patches[0]
     array([[0, 1],
diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py
index 46931c48b284cfe80f76b2ae2b5a6d86614f5862..229e6fad5ac739b5b4683ad8f59e3c6f23442b77 100644
--- a/sklearn/linear_model/tests/test_least_angle.py
+++ b/sklearn/linear_model/tests/test_least_angle.py
@@ -168,7 +168,7 @@ def test_no_path_all_precomputed():
 
     alphas_, active_, coef_path_ = linear_model.lars_path(
         X, y, method="lasso", Gram=G, Xy=Xy, alpha_min=0.9)
-    print "---"
+    print("---")
     alpha_, active, coef = linear_model.lars_path(
         X, y, method="lasso", Gram=G, Xy=Xy, alpha_min=0.9, return_path=False)
 
diff --git a/sklearn/manifold/mds.py b/sklearn/manifold/mds.py
index 380d98fa052270492540556559b2e609ff5e268c..5e96f98025de4199b303d5e6eec74fdec0d5094b 100644
--- a/sklearn/manifold/mds.py
+++ b/sklearn/manifold/mds.py
@@ -119,12 +119,12 @@ def _smacof_single(similarities, metric=True, n_components=2, init=None,
 
         dis = np.sqrt((X ** 2).sum(axis=1)).sum()
         if verbose == 2:
-            print 'it: %d, stress %s' % (it, stress)
+            print('it: %d, stress %s' % (it, stress))
         if old_stress is not None:
             if(old_stress - stress / dis) < eps:
                 if verbose:
-                    print 'breaking at iteration %d with stress %s' % (it,
-                                                                       stress)
+                    print('breaking at iteration %d with stress %s' % (it,
+                                                                       stress))
                 break
         old_stress = stress / dis
 
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index a4b999013e3733eafe394fbee35c75ad3153e754..9e687c3f598678e2777fba40bac77a429e1fa796 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -312,20 +312,20 @@ def homogeneity_score(labels_true, labels_pred):
-    Non-pefect labelings that futher split classes into more clusters can be
+    Non-perfect labelings that further split classes into more clusters can be
     perfectly homogeneous::
 
-      >>> print ("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2]))
+      >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2]))
       ...                                                  # doctest: +ELLIPSIS
       1.0...
-      >>> print ("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3]))
+      >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3]))
       ...                                                  # doctest: +ELLIPSIS
       1.0...
 
     Clusters that include samples from different classes do not make for an
     homogeneous labeling::
 
-      >>> print ("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 0, 1]))
+      >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 0, 1]))
       ...                                                  # doctest: +ELLIPSIS
       0.0...
-      >>> print ("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 0, 0]))
+      >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 0, 0]))
       ...                                                  # doctest: +ELLIPSIS
       0.0...
 
@@ -384,17 +384,17 @@ def completeness_score(labels_true, labels_pred):
-    Non-pefect labelings that assign all classes members to the same clusters
+    Non-perfect labelings that assign all class members to the same clusters
     are still complete::
 
-      >>> print completeness_score([0, 0, 1, 1], [0, 0, 0, 0])
+      >>> print(completeness_score([0, 0, 1, 1], [0, 0, 0, 0]))
       1.0
-      >>> print completeness_score([0, 1, 2, 3], [0, 0, 1, 1])
+      >>> print(completeness_score([0, 1, 2, 3], [0, 0, 1, 1]))
       1.0
 
-    If classes members are splitted across different clusters, the
+    If class members are split across different clusters, the
     assignment cannot be complete::
 
-      >>> print completeness_score([0, 0, 1, 1], [0, 1, 0, 1])
+      >>> print(completeness_score([0, 0, 1, 1], [0, 1, 0, 1]))
       0.0
-      >>> print completeness_score([0, 0, 0, 0], [0, 1, 2, 3])
+      >>> print(completeness_score([0, 0, 0, 0], [0, 1, 2, 3]))
       0.0
 
     """
diff --git a/sklearn/neighbors/nearest_centroid.py b/sklearn/neighbors/nearest_centroid.py
index 5a9d0c1a6ca896e4e159b1570639ca7e4ac0236a..b313e5121dc69e818b16e12cade5b80d26761178 100644
--- a/sklearn/neighbors/nearest_centroid.py
+++ b/sklearn/neighbors/nearest_centroid.py
@@ -46,7 +46,7 @@ class NearestCentroid(BaseEstimator, ClassifierMixin):
     >>> clf = NearestCentroid()
     >>> clf.fit(X, y)
     NearestCentroid(metric='euclidean', shrink_threshold=None)
-    >>> print clf.predict([[-0.8, -1]])
+    >>> print(clf.predict([[-0.8, -1]]))
     [1]
 
     See also
diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py
index 23949eead75ec45a3d166483aeff836a9e24880a..3ae7ec49d60f2a97b5b8dd470a2d5710e3feead5 100644
--- a/sklearn/svm/base.py
+++ b/sklearn/svm/base.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+
 import numpy as np
 import scipy.sparse as sp
 import warnings
@@ -180,7 +182,7 @@ class BaseLibSVM(BaseEstimator):
 
         fit = self._sparse_fit if self._sparse else self._dense_fit
         if self.verbose:  # pragma: no cover
-            print '[LibSVM]',
+            print('[LibSVM]', end='')
         fit(X, y, sample_weight, solver_type, kernel)
 
         self.shape_fit_ = X.shape
@@ -681,7 +683,7 @@ class BaseLibLinear(BaseEstimator):
 
         rnd = check_random_state(self.random_state)
         if self.verbose:
-            print '[LibLinear]',
+            print('[LibLinear]', end='')
         self.raw_coef_ = train(X, y, self._get_solver_type(), self.tol,
                                self._get_bias(), self.C,
                                self.class_weight_,
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 7314be71b2ea7803e3b73e051ca590fa82507398..65c1592c4619754cbc9c7f802ed4e29d76497666 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -5,6 +5,8 @@ General tests for all estimators in sklearn.
 # Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
 #          Gael Varoquaux gael.varoquaux@normalesup.org
 # License: BSD Style.
+from __future__ import print_function
+
 import os
 import warnings
 import sys
@@ -139,13 +141,13 @@ def test_estimators_sparse_data():
             clf.fit(X, y)
         except TypeError, e:
             if not 'sparse' in repr(e):
-                print ("Estimator %s doesn't seem to fail gracefully on "
-                       "sparse data" % name)
+                print("Estimator %s doesn't seem to fail gracefully on "
+                      "sparse data" % name)
                 traceback.print_exc(file=sys.stdout)
                 raise e
         except Exception, exc:
-            print ("Estimator %s doesn't seem to fail gracefully on "
-                   "sparse data" % name)
+            print("Estimator %s doesn't seem to fail gracefully on "
+                  "sparse data" % name)
             traceback.print_exc(file=sys.stdout)
             raise exc
 
@@ -207,9 +209,9 @@ def test_transformers():
             else:
                 assert_equal(X_pred.shape[0], n_samples)
         except Exception as e:
-            print trans
-            print e
-            print
+            print(trans)
+            print(e)
+            print()
             succeeded = False
             continue
 
@@ -263,13 +265,13 @@ def test_transformers_sparse_data():
             trans.fit(X, y)
         except TypeError, e:
             if not 'sparse' in repr(e):
-                print ("Estimator %s doesn't seem to fail gracefully on "
-                       "sparse data" % name)
+                print("Estimator %s doesn't seem to fail gracefully on "
+                      "sparse data" % name)
                 traceback.print_exc(file=sys.stdout)
                 raise e
         except Exception, exc:
-            print ("Estimator %s doesn't seem to fail gracefully on "
-                   "sparse data" % name)
+            print("Estimator %s doesn't seem to fail gracefully on "
+                  "sparse data" % name)
             traceback.print_exc(file=sys.stdout)
             raise exc
 
@@ -623,8 +625,8 @@ def test_regressors_train():
                 assert_greater(reg.score(X, y_), 0.5)
         except Exception as e:
             print(reg)
-            print e
-            print
+            print(e)
+            print()
             succeeded = False
 
     assert_true(succeeded)
diff --git a/sklearn/utils/arpack.py b/sklearn/utils/arpack.py
index 17dbcaafc1e2396f12ad2940d18c0ec216c26d63..8aa753d081da68dfdd1f16cff58c5a332b230140 100644
--- a/sklearn/utils/arpack.py
+++ b/sklearn/utils/arpack.py
@@ -1408,7 +1408,7 @@ def _eigsh(A, k=6, M=None, sigma=None, which='LM', v0=None, ncv=None,
     >>> vals, vecs = eigsh(id, k=6)
     >>> vals # doctest: +SKIP
     array([ 1.+0.j,  1.+0.j,  1.+0.j,  1.+0.j,  1.+0.j,  1.+0.j])
-    >>> print vecs.shape
+    >>> print(vecs.shape)
     (13, 6)
 
     References