From 0f6c525d7e6f90522ce985dd033255279eed7d6b Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Sat, 31 Dec 2011 14:31:55 +0100
Subject: [PATCH] ENH: renamed fast_svd to randomized_svd + related
 improvements

---
 benchmarks/bench_plot_svd.py                  | 19 ++++---
 doc/developers/utilities.rst                  |  4 +-
 doc/whats_new.rst                             |  6 +++
 .../wikipedia_principal_eigenvector.py        |  6 +--
 sklearn/decomposition/dict_learning.py        |  4 +-
 sklearn/decomposition/nmf.py                  |  4 +-
 sklearn/decomposition/pca.py                  | 10 ++--
 sklearn/utils/extmath.py                      | 49 ++++++++++---------
 sklearn/utils/tests/test_svd.py               | 37 +++++++-------
 9 files changed, 79 insertions(+), 60 deletions(-)

diff --git a/benchmarks/bench_plot_svd.py b/benchmarks/bench_plot_svd.py
index 6b24964ed3..9f76a23dcc 100644
--- a/benchmarks/bench_plot_svd.py
+++ b/benchmarks/bench_plot_svd.py
@@ -8,11 +8,11 @@ import numpy as np
 from collections import defaultdict
 
 from scipy.linalg import svd
-from sklearn.utils.extmath import fast_svd
+from sklearn.utils.extmath import randomized_svd
 from sklearn.datasets.samples_generator import make_low_rank_matrix
 
 
-def compute_bench(samples_range, features_range, q=3, rank=50):
+def compute_bench(samples_range, features_range, n_iterations=3, rank=50):
 
     it = 0
 
@@ -36,16 +36,19 @@ def compute_bench(samples_range, features_range, q=3, rank=50):
             results['scipy svd'].append(time() - tstart)
 
             gc.collect()
-            print "benching scikit-learn fast_svd: q=0"
+            print "benching scikit-learn randomized_svd: n_iterations=0"
             tstart = time()
-            fast_svd(X, rank, q=0)
-            results['scikit-learn fast_svd (q=0)'].append(time() - tstart)
+            randomized_svd(X, rank, n_iterations=0)
+            results['scikit-learn randomized_svd (n_iterations=0)'].append(
+                time() - tstart)
 
             gc.collect()
-            print "benching scikit-learn fast_svd: q=%d " % q
+            print ("benching scikit-learn randomized_svd: n_iterations=%d "
+                   % n_iterations)
             tstart = time()
-            fast_svd(X, rank, q=q)
-            results['scikit-learn fast_svd (q=%d)' % q].append(time() - tstart)
+            randomized_svd(X, rank, n_iterations=n_iterations)
+            results['scikit-learn randomized_svd (n_iterations=%d)'
+                    % n_iterations].append(time() - tstart)
 
     return results
 
diff --git a/doc/developers/utilities.rst b/doc/developers/utilities.rst
index a7dd46009f..f1fa601790 100644
--- a/doc/developers/utilities.rst
+++ b/doc/developers/utilities.rst
@@ -76,9 +76,9 @@ Efficient Linear Algebra & Array Operations
 
 - :func:`extmath.randomized_range_finder`: construct an orthonormal matrix
   whose range approximates the range of the input.  This is used in
-  :func:`extmath.fast_svd`, below.
+  :func:`extmath.randomized_svd`, below.
 
-- :func:`extmath.fast_svd`: compute the k-truncated randomized SVD.
+- :func:`extmath.randomized_svd`: compute the k-truncated randomized SVD.
   This algorithm finds the exact truncated singular values decomposition
   using randomization to speed up the computations. It is particularly
   fast on large matrices on which you wish to extract only a small
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 61d08dd4b6..a39259fa64 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -136,6 +136,12 @@ version 0.9:
 
   - ``BaseDictionaryLearning`` class replaced by ``SparseCodingMixin``.
 
+  - :func:`sklearn.utils.extmath.fast_svd` has been renamed
+    :func:`sklearn.utils.extmath.randomized_svd` and the default
+    oversampling is now fixed to 10 additional random vectors instead
+    of doubling the number of components to extract. The new behavior
+    follows the reference paper.
+
 
 .. _changes_0_9:
 
diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py
index 729f30793c..cbf4e5be80 100644
--- a/examples/applications/wikipedia_principal_eigenvector.py
+++ b/examples/applications/wikipedia_principal_eigenvector.py
@@ -43,7 +43,7 @@ import numpy as np
 
 from scipy import sparse
 
-from sklearn.utils.extmath import fast_svd
+from sklearn.utils.extmath import randomized_svd
 from sklearn.externals.joblib import Memory
 
 
@@ -170,9 +170,9 @@ X, redirects, index_map = get_adjacency_matrix(
     redirects_filename, page_links_filename, limit=5000000)
 names = dict((i, name) for name, i in index_map.iteritems())
 
-print "Computing the principal singular vectors using fast_svd"
+print "Computing the principal singular vectors using randomized_svd"
 t0 = time()
-U, s, V = fast_svd(X, 5, q=3)
+U, s, V = randomized_svd(X, 5, q=3)
 print "done in %0.3fs" % (time() - t0)
 
 # print the names of the wikipedia related strongest compenents of the the
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index 8fdd3208a2..20afe75089 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -17,7 +17,7 @@ from numpy.lib.stride_tricks import as_strided
 from ..base import BaseEstimator, TransformerMixin
 from ..externals.joblib import Parallel, delayed, cpu_count
 from ..utils import array2d, check_random_state, gen_even_slices, deprecated
-from ..utils.extmath import fast_svd
+from ..utils.extmath import randomized_svd
 from ..linear_model import Lasso, orthogonal_mp_gram, lars_path
 
 
@@ -622,7 +622,7 @@ def dict_learning_online(X, n_atoms, alpha, n_iter=100, return_code=True,
     if dict_init is not None:
         dictionary = dict_init
     else:
-        _, S, dictionary = fast_svd(X, n_atoms)
+        _, S, dictionary = randomized_svd(X, n_atoms)
         dictionary = S[:, np.newaxis] * dictionary
     r = len(dictionary)
     if n_atoms <= r:
diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py
index 5a8a74ceb1..e1ace319d2 100644
--- a/sklearn/decomposition/nmf.py
+++ b/sklearn/decomposition/nmf.py
@@ -12,7 +12,7 @@ from __future__ import division
 
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import atleast2d_or_csr, check_random_state
-from ..utils.extmath import fast_svd, safe_sparse_dot
+from ..utils.extmath import randomized_svd, safe_sparse_dot
 
 import numpy as np
 from scipy.optimize import nnls
@@ -106,7 +106,7 @@ def _initialize_nmf(X, n_components, variant=None, eps=1e-6,
     if variant not in (None, 'a', 'ar'):
         raise ValueError("Invalid variant name")
 
-    U, S, V = fast_svd(X, n_components)
+    U, S, V = randomized_svd(X, n_components)
     W, H = np.zeros(U.shape), np.zeros(V.shape)
 
     # The leading singular triplet is non-negative
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
index 976ff79daa..bfd67f4b5e 100644
--- a/sklearn/decomposition/pca.py
+++ b/sklearn/decomposition/pca.py
@@ -11,7 +11,9 @@ from scipy import linalg
 
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import array2d, check_random_state, as_float_array
-from ..utils.extmath import fast_logdet, fast_svd, safe_sparse_dot
+from ..utils.extmath import fast_logdet
+from ..utils.extmath import safe_sparse_dot
+from ..utils.extmath import randomized_svd
 
 
 def _assess_dimension_(spectrum, rank, n_samples, dim):
@@ -458,9 +460,9 @@ class RandomizedPCA(BaseEstimator, TransformerMixin):
             self.mean_ = np.mean(X, axis=0)
             X -= self.mean_
 
-        U, S, V = fast_svd(X, self.n_components,
-                           n_iterations=self.iterated_power,
-                           random_state=self.random_state)
+        U, S, V = randomized_svd(X, self.n_components,
+                                 n_iterations=self.iterated_power,
+                                 random_state=self.random_state)
 
         self.explained_variance_ = (S ** 2) / n_samples
         self.explained_variance_ratio_ = self.explained_variance_ / \
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index d27d046d55..ad30eec603 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -4,10 +4,12 @@ Extended math utilities.
 # Authors: G. Varoquaux, A. Gramfort, A. Passos, O. Grisel
 # License: BSD
 
+import warnings
 import numpy as np
 from scipy import linalg
 
 from . import check_random_state
+from . import deprecated
 from .fixes import qr_economic
 
 
@@ -115,8 +117,8 @@ def randomized_range_finder(A, size, n_iterations, random_state=None):
     Y = safe_sparse_dot(A, R)
     del R
 
-    # apply q power iterations on Y to make to further 'imprint' the top
-    # singular values of A in Y
+    # perform power iterations with Y to further 'imprint' the top
+    # singular vectors of A in Y
     for i in xrange(n_iterations):
         Y = safe_sparse_dot(A, safe_sparse_dot(A.T, Y))
 
@@ -125,20 +127,22 @@ def randomized_range_finder(A, size, n_iterations, random_state=None):
     return Q
 
 
-def fast_svd(M, k, p=None, n_iterations=0, transpose='auto', random_state=0):
-    """Computes the k-truncated randomized SVD
+def randomized_svd(M, n_components, n_oversamples=10, n_iterations=0,
+                   transpose='auto', random_state=0):
+    """Computes a truncated randomized SVD
 
     Parameters
     ----------
     M: ndarray or sparse matrix
         Matrix to decompose
 
-    k: int
+    n_components: int
         Number of singular values and vectors to extract.
 
-    p: int (default is k)
-        Additional number of samples of the range of M to ensure proper
-        conditioning. See the notes below.
+    n_oversamples: int (default is 10)
+        Additional number of random vectors to sample the range of M so as
+        to ensure proper conditioning. The total number of random vectors
+        used to find the range of M is n_components + n_oversamples.
 
     n_iterations: int (default is 0)
         Number of power iterations (can be used to deal with very noisy
@@ -156,14 +160,10 @@ def fast_svd(M, k, p=None, n_iterations=0, transpose='auto', random_state=0):
 
     Notes
     -----
-    This algorithm finds the exact truncated singular values decomposition
-    using randomization to speed up the computations. It is particularly
-    fast on large matrices on which you whish to extract only a small
-    number of components.
-
-    (k + p) should be strictly higher than the rank of M. This can be
-    checked by ensuring that the lowest extracted singular value is on
-    the order of the machine precision of floating points.
+    This algorithm finds a (usually very good) approximate truncated
+    singular value decomposition using randomization to speed up the
+    computations. It is particularly fast on large matrices on which
+    you wish to extract only a small number of components.
 
     **References**:
 
@@ -174,10 +174,8 @@ def fast_svd(M, k, p=None, n_iterations=0, transpose='auto', random_state=0):
     * A randomized algorithm for the decomposition of matrices
       Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
     """
-    if p == None:
-        p = k
-
     random_state = check_random_state(random_state)
+    n_random = n_components + n_oversamples
     n_samples, n_features = M.shape
 
     if transpose == 'auto' and n_samples > n_features:
@@ -186,7 +184,7 @@ def fast_svd(M, k, p=None, n_iterations=0, transpose='auto', random_state=0):
         # this implementation is a bit faster with smaller shape[1]
         M = M.T
 
-    Q = randomized_range_finder(M, k + p, n_iterations, random_state)
+    Q = randomized_range_finder(M, n_random, n_iterations, random_state)
 
     # project M to the (k + p) dimensional space using the basis vectors
     B = safe_sparse_dot(Q.T, M)
@@ -199,9 +197,16 @@ def fast_svd(M, k, p=None, n_iterations=0, transpose='auto', random_state=0):
 
     if transpose:
         # transpose back the results according to the input convention
-        return V[:k, :].T, s[:k], U[:, :k].T
+        return V[:n_components, :].T, s[:n_components], U[:, :n_components].T
     else:
-        return U[:, :k], s[:k], V[:k, :]
+        return U[:, :n_components], s[:n_components], V[:n_components, :]
+
+
+@deprecated("fast_svd is deprecated in 0.10 and will be removed in 0.12: "
+            "use randomized_svd instead")
+def fast_svd(M, k, p=10, n_iterations=0, transpose='auto', random_state=0):
+    return randomized_svd(M, k, n_oversamples=p, n_iterations=n_iterations,
+                          transpose='auto', random_state=random_state)
 
 
 def logsumexp(arr, axis=0):
diff --git a/sklearn/utils/tests/test_svd.py b/sklearn/utils/tests/test_svd.py
index d714a1f8ec..ea4b125e78 100644
--- a/sklearn/utils/tests/test_svd.py
+++ b/sklearn/utils/tests/test_svd.py
@@ -8,12 +8,12 @@ from scipy import linalg
 from numpy.testing import assert_equal
 from numpy.testing import assert_almost_equal
 
-from sklearn.utils.extmath import fast_svd
+from sklearn.utils.extmath import randomized_svd
 from sklearn.datasets.samples_generator import make_low_rank_matrix
 
 
-def test_fast_svd_low_rank():
-    """Check that extmath.fast_svd is consistent with linalg.svd"""
+def test_randomized_svd_low_rank():
+    """Check that extmath.randomized_svd is consistent with linalg.svd"""
     n_samples = 100
     n_features = 500
     rank = 5
@@ -29,7 +29,7 @@ def test_fast_svd_low_rank():
     U, s, V = linalg.svd(X, full_matrices=False)
 
     # compute the singular values of X using the fast approximate method
-    Ua, sa, Va = fast_svd(X, k)
+    Ua, sa, Va = randomized_svd(X, k)
     assert_equal(Ua.shape, (n_samples, k))
     assert_equal(sa.shape, (k,))
     assert_equal(Va.shape, (k, n_features))
@@ -45,12 +45,12 @@ def test_fast_svd_low_rank():
     X = sparse.csr_matrix(X)
 
     # compute the singular values of X using the fast approximate method
-    Ua, sa, Va = fast_svd(X, k)
+    Ua, sa, Va = randomized_svd(X, k)
     assert_almost_equal(s[:rank], sa[:rank])
 
 
-def test_fast_svd_low_rank_with_noise():
-    """Check that extmath.fast_svd can handle noisy matrices"""
+def test_randomized_svd_low_rank_with_noise():
+    """Check that extmath.randomized_svd can handle noisy matrices"""
     n_samples = 100
     n_features = 500
     rank = 5
@@ -67,21 +67,21 @@ def test_fast_svd_low_rank_with_noise():
 
     # compute the singular values of X using the fast approximate method
     # without the iterated power method
-    _, sa, _ = fast_svd(X, k, n_iterations=0)
+    _, sa, _ = randomized_svd(X, k, n_iterations=0)
 
     # the approximation does not tolerate the noise:
     assert np.abs(s[:k] - sa).max() > 0.05
 
     # compute the singular values of X using the fast approximate method with
     # iterated power method
-    _, sap, _ = fast_svd(X, k, n_iterations=5)
+    _, sap, _ = randomized_svd(X, k, n_iterations=5)
 
     # the iterated power method is helping getting rid of the noise:
     assert_almost_equal(s[:k], sap, decimal=3)
 
 
-def test_fast_svd_infinite_rank():
-    """Check that extmath.fast_svd can handle noisy matrices"""
+def test_randomized_svd_infinite_rank():
+    """Check that extmath.randomized_svd can handle noisy matrices"""
     n_samples = 100
     n_features = 500
     rank = 5
@@ -98,21 +98,21 @@ def test_fast_svd_infinite_rank():
 
     # compute the singular values of X using the fast approximate method
     # without the iterated power method
-    _, sa, _ = fast_svd(X, k, n_iterations=0)
+    _, sa, _ = randomized_svd(X, k, n_iterations=0)
 
     # the approximation does not tolerate the noise:
     assert np.abs(s[:k] - sa).max() > 0.1
 
     # compute the singular values of X using the fast approximate method with
     # iterated power method
-    _, sap, _ = fast_svd(X, k, n_iterations=5)
+    _, sap, _ = randomized_svd(X, k, n_iterations=5)
 
     # the iterated power method is still managing to get most of the structure
     # at the requested rank
     assert_almost_equal(s[:k], sap, decimal=3)
 
 
-def test_fast_svd_transpose_consistency():
+def test_randomized_svd_transpose_consistency():
     """Check that transposing the design matrix has limit impact"""
     n_samples = 100
     n_features = 500
@@ -123,9 +123,12 @@ def test_fast_svd_transpose_consistency():
         effective_rank=rank, tail_strength=0.5, random_state=0)
     assert_equal(X.shape, (n_samples, n_features))
 
-    U1, s1, V1 = fast_svd(X, k, n_iterations=3, transpose=False, random_state=0)
-    U2, s2, V2 = fast_svd(X, k, n_iterations=3, transpose=True, random_state=0)
-    U3, s3, V3 = fast_svd(X, k, n_iterations=3, transpose='auto', random_state=0)
+    U1, s1, V1 = randomized_svd(X, k, n_iterations=3, transpose=False,
+                                random_state=0)
+    U2, s2, V2 = randomized_svd(X, k, n_iterations=3, transpose=True,
+                                random_state=0)
+    U3, s3, V3 = randomized_svd(X, k, n_iterations=3, transpose='auto',
+                                random_state=0)
     U4, s4, V4 = linalg.svd(X, full_matrices=False)
 
     assert_almost_equal(s1, s4[:k], decimal=3)
-- 
GitLab