From 7e2777d43412fa6c4d2f8459b576b472723a7260 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 8 Dec 2010 11:11:43 +0100
Subject: [PATCH] switch whitening off in PCA by default + ensure unit scale +
 better docstring

---
 scikits/learn/pca.py            | 16 +++++++++++-----
 scikits/learn/tests/test_pca.py | 30 ++++++++++++++++++------------
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index e8bb5120d4..1a19ebfaca 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -127,10 +127,15 @@ class PCA(BaseEstimator):
         explained variances is equal to 1.0
 
     whiten: bool, optional
-        If True (default) the components_ vectors are divided by the
-        singular values to ensure uncorrelated outputs with identical
+        When True (False by default) the components_ vectors are divided
+        by the singular values to ensure uncorrelated outputs with unit
         component-wise variances.
 
+        Whitening will remove some information from the transformed signal
+        (the relative variance scales of the components) but can sometimes
+        improve the predictive accuracy of the downstream estimators by
+        making their data respect some hard-wired assumptions.
+
     iterated_power: int, optional
         Number of iteration for the power method if do_fast_svd is True. 3 by
         default.
@@ -147,7 +152,7 @@ class PCA(BaseEstimator):
     >>> from scikits.learn.pca import PCA
     >>> pca = PCA(n_comp=2)
     >>> pca.fit(X)
-    PCA(do_fast_svd=False, n_comp=2, copy=True, whiten=True, iterated_power=3)
+    PCA(do_fast_svd=False, n_comp=2, copy=True, whiten=False, iterated_power=3)
     >>> print pca.explained_variance_ratio_
     [ 0.99244289  0.00755711]
 
@@ -157,7 +162,7 @@ class PCA(BaseEstimator):
 
     """
     def __init__(self, n_comp=None, copy=True, do_fast_svd=False,
-                 iterated_power=3, whiten=True):
+                 iterated_power=3, whiten=False):
         self.n_comp = n_comp
         self.copy = copy
         self.do_fast_svd = do_fast_svd
@@ -188,7 +193,8 @@ class PCA(BaseEstimator):
                                         self.explained_variance_.sum()
 
         if self.whiten:
-            self.components_ = np.dot(V.T, np.diag(1.0 / S))
+            n = X.shape[0]
+            self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
         else:
             self.components_ = V.T
 
diff --git a/scikits/learn/tests/test_pca.py b/scikits/learn/tests/test_pca.py
index f65694e774..1282213625 100644
--- a/scikits/learn/tests/test_pca.py
+++ b/scikits/learn/tests/test_pca.py
@@ -26,31 +26,37 @@ def test_pca():
 def test_whitening():
     """Check that PCA output has unit-variance"""
     np.random.seed(0)
+    n_samples = 100
+    n_features = 80
+    n_components = 30
+    rank = 50
 
     # some low rank data with correlated features
-    X = np.dot(randn(100, 50),
-               np.dot(np.diag(np.linspace(10.0, 1.0, 50)),
-                      randn(50, 80)))
+    X = np.dot(randn(n_samples, rank),
+               np.dot(np.diag(np.linspace(10.0, 1.0, rank)),
+                      randn(rank, n_features)))
     # the component-wise variance of the first 50 features is 3 times the
     # mean component-wise variance of the remaingin 30 features
     X[:, :50] *= 3
 
-    assert_equal(X.shape, (100, 80))
+    assert_equal(X.shape, (n_samples, n_features))
 
     # the component-wise variance is thus highly varying:
     assert_almost_equal(X.std(axis=0).std(), 43.9, 1)
 
-    # whiten by default
-    X_whitened = PCA(n_comp=30).fit(X).transform(X)
-    assert_equal(X_whitened.shape, (100, 30))
+    # whiten the data while projecting to the lower dim subspace
+    pca = PCA(n_comp=n_components, whiten=True).fit(X)
+    X_whitened = pca.transform(X)
+    assert_equal(X_whitened.shape, (n_samples, n_components))
 
-    # all output component have identical variance
-    assert_almost_equal(X_whitened.std(axis=0).std(), 0.0, 3)
+    # all output components have unit variances
+    assert_almost_equal(X_whitened.std(axis=0), np.ones(n_components))
 
     # is possible to project on the low dim space without scaling by the
     # singular values
-    X_unwhitened = PCA(n_comp=30, whiten=False).fit(X).transform(X)
-    assert_equal(X_unwhitened.shape, (100, 30))
+    pca = PCA(n_comp=n_components, whiten=False).fit(X)
+    X_unwhitened = pca.transform(X)
+    assert_equal(X_unwhitened.shape, (n_samples, n_components))
 
     # in that case the output components still have varying variances
     assert_almost_equal(X_unwhitened.std(axis=0).std(), 74.1, 1)
@@ -182,7 +188,7 @@ def test_probabilistic_pca_4():
     Xt = randn(n, p) + randn(n, 1)*np.array([3, 4, 5]) + np.array([1, 0, 7])
     ll = np.zeros(p)
     for k in range(p):
-        ppca = ProbabilisticPCA(n_comp=k, whiten=False)
+        ppca = ProbabilisticPCA(n_comp=k)
         ppca.fit(Xl)
         ll[k] = ppca.score(Xt).mean()
 
-- 
GitLab