OPTIM: fast_svd now has a auto transpose mode that switch to the fastest impl

cdeee0d4 · Olivier Grisel · e61c7df1 · cdeee0d4 · cdeee0d4
Commit cdeee0d4 authored 14 years ago by Olivier Grisel
--- a/scikits/learn/utils/extmath.py
+++ b/scikits/learn/utils/extmath.py
@@ -94,8 +94,8 @@ def _sparsedot(a, b):
    else:
        return np.dot(a,b)

-def fast_svd(M, k, p=None, rng=0, q=0):
-    """Computes the k-truncated SVD using random projections
+def fast_svd(M, k, p=None, q=0, transpose='auto', rng=0):
+    """Computes the k-truncated randomized SVD

    Parameters
    ===========
@@ -113,6 +113,13 @@ def fast_svd(M, k, p=None, rng=0, q=0):
        Number of power iterations (can be used to deal with very noisy
        problems).

+    transpose: True, False or 'auto' (default)
+        Whether the algorithm should be applied to M.T instead of M. The
+        result should approximately be the same. The 'auto' mode will
+        trigger the transposition if M.shape[1] > M.shape[0] since this
+        implementation of randomized SVD tend to be a little faster in that
+        case).
+
    rng: RandomState or an int seed (0 by default)
        A random number generator instance to make behavior

@@ -136,15 +143,25 @@ def fast_svd(M, k, p=None, rng=0, q=0):
    A randomized algorithm for the decomposition of matrices
    Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
    """
+    # lazy import of scipy sparse, because it is very slow.
+    from scipy import sparse
+
+    if p == None:
+        p = k
+
    if rng is None:
        rng = np.random.RandomState()
    elif isinstance(rng, int):
        rng = np.random.RandomState(rng)

-    # lazy import of scipy sparse, because it is very slow.
-    from scipy import sparse
-    if p == None:
-        p = k
+    n_samples, n_features = M.shape
+
+    if transpose == 'auto' and n_samples > n_features:
+        transpose = True
+    if transpose:
+        # this implementation is a bit faster with smaller shape[1]
+        M = M.T
+
   # generating random gaussian vectors r with shape: (M.shape[1], k + p)
    r = rng.normal(size=(M.shape[1], k + p))

@@ -168,5 +185,10 @@ def fast_svd(M, k, p=None, rng=0, q=0):
    Uhat, s, V = linalg.svd(B, full_matrices=False)
    del B
    U = np.dot(Q, Uhat)
+
+    if transpose:
+        # transpose back the results according to the input convention
+        return V[:k, :].T, s[:k], U[:, :k].T
+    else:
        return U[:, :k], s[:k], V[:k, :]

--- a/scikits/learn/utils/tests/test_svd.py
+++ b/scikits/learn/utils/tests/test_svd.py
@@ -36,11 +36,10 @@ def test_fast_svd_low_rank():

    # ensure that the singular values of both methods are equal up to the real
    # rank of the matrix
-    assert_almost_equal(s[:rank], sa[:rank])
+    assert_almost_equal(s[:k], sa)

    # check the singular vectors too (while not checking the sign)
-    assert_almost_equal(np.dot(U[:, :rank], V[:rank, :]),
-                        np.dot(Ua[:, :rank], Va[:rank, :]))
+    assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

    # check the sparse matrix representation
    X = sparse.csr_matrix(X)
@@ -71,14 +70,14 @@ def test_fast_svd_low_rank_with_noise():
    _, sa, _ = fast_svd(X, k, q=0)

    # the approximation does not tolerate the noise:
-    assert np.abs(s[:rank] - sa[:rank]).max() > 0.1
+    assert np.abs(s[:k] - sa).max() > 0.05

    # compute the singular values of X using the fast approximate method with
    # iterated power method
-    _, sap, _ = fast_svd(X, k, q=3)
+    _, sap, _ = fast_svd(X, k, q=5)

    # the iterated power method is helping getting rid of the noise:
-    assert_almost_equal(s[:rank], sap[:rank], decimal=5)
+    assert_almost_equal(s[:k], sap, decimal=3)


 def test_fast_svd_infinite_rank():
@@ -102,14 +101,44 @@ def test_fast_svd_infinite_rank():
    _, sa, _ = fast_svd(X, k, q=0)

    # the approximation does not tolerate the noise:
-    assert np.abs(s[:rank] - sa[:rank]).max() > 0.1
+    assert np.abs(s[:k] - sa).max() > 0.1

    # compute the singular values of X using the fast approximate method with
    # iterated power method
-    _, sap, _ = fast_svd(X, k, q=7)
+    _, sap, _ = fast_svd(X, k, q=5)

    # the iterated power method is still managing to get most of the structure
    # at the requested rank
-    assert_almost_equal(s[:rank], sap[:rank], decimal=5)
+    assert_almost_equal(s[:k], sap, decimal=3)
+
+
+def test_fast_svd_transpose_consistency():
+    """Check that transposing the design matrix has limit impact"""
+    n_samples = 100
+    n_features = 500
+    rank = 4
+    k = 10
+
+    X = low_rank_fat_tail(n_samples, n_features, effective_rank=rank,
+                          tail_strength=0.5, seed=0)
+    assert_equal(X.shape, (n_samples, n_features))
+
+    U1, s1, V1 = fast_svd(X, k, q=3, transpose=False, rng=0)
+    U2, s2, V2 = fast_svd(X, k, q=3, transpose=True, rng=0)
+    U3, s3, V3 = fast_svd(X, k, q=3, transpose='auto', rng=0)
+    U4, s4, V4 = linalg.svd(X, full_matrices=False)
+
+    assert_almost_equal(s1, s4[:k], decimal=3)
+    assert_almost_equal(s2, s4[:k], decimal=3)
+    assert_almost_equal(s3, s4[:k], decimal=3)
+
+    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
+                        decimal=2)
+    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
+                        decimal=2)
+
+
+    # in this case 'auto' is equivalent to transpose
+    assert_almost_equal(s2, s3)