From 7e2777d43412fa6c4d2f8459b576b472723a7260 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 8 Dec 2010 11:11:43 +0100
Subject: [PATCH] switch whitening off in PCA by default + ensure unit scale +
 better docstring

---
 scikits/learn/pca.py            | 16 +++++++++++-----
 scikits/learn/tests/test_pca.py | 30 ++++++++++++++++++------------
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/scikits/learn/pca.py b/scikits/learn/pca.py
index e8bb5120d4..1a19ebfaca 100644
--- a/scikits/learn/pca.py
+++ b/scikits/learn/pca.py
@@ -127,10 +127,15 @@ class PCA(BaseEstimator):
         explained variances is equal to 1.0
 
     whiten: bool, optional
-        If True (default) the components_ vectors are divided by the
-        singular values to ensure uncorrelated outputs with identical
+        When True (False by default) the components_ vectors are divided
+        by the singular values to ensure uncorrelated outputs with unit
         component-wise variances.
 
+        Whitening will remove some information from the transformed signal
+        (the relative variance scales of the components) but can sometimes
+        improve the predictive accuracy of the downstream estimators by
+        making their data respect some hard-wired assumptions.
+
     iterated_power: int, optional
         Number of iterations for the power method if do_fast_svd is True.
         3 by default.
@@ -147,7 +152,7 @@ class PCA(BaseEstimator):
     >>> from scikits.learn.pca import PCA
     >>> pca = PCA(n_comp=2)
     >>> pca.fit(X)
-    PCA(do_fast_svd=False, n_comp=2, copy=True, whiten=True, iterated_power=3)
+    PCA(do_fast_svd=False, n_comp=2, copy=True, whiten=False, iterated_power=3)
     >>> print pca.explained_variance_ratio_
     [ 0.99244289  0.00755711]
 
@@ -157,7 +162,7 @@ class PCA(BaseEstimator):
     """
     def __init__(self, n_comp=None, copy=True, do_fast_svd=False,
-                 iterated_power=3, whiten=True):
+                 iterated_power=3, whiten=False):
        self.n_comp = n_comp
        self.copy = copy
        self.do_fast_svd = do_fast_svd
@@ -188,7 +193,8 @@ class PCA(BaseEstimator):
             self.explained_variance_.sum()
 
         if self.whiten:
-            self.components_ = np.dot(V.T, np.diag(1.0 / S))
+            n = X.shape[0]
+            self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
         else:
             self.components_ = V.T
 
diff --git a/scikits/learn/tests/test_pca.py b/scikits/learn/tests/test_pca.py
index f65694e774..1282213625 100644
--- a/scikits/learn/tests/test_pca.py
+++ b/scikits/learn/tests/test_pca.py
@@ -26,31 +26,37 @@ def test_pca():
 def test_whitening():
     """Check that PCA output has unit-variance"""
     np.random.seed(0)
+    n_samples = 100
+    n_features = 80
+    n_components = 30
+    rank = 50
     # some low rank data with correlated features
-    X = np.dot(randn(100, 50),
-               np.dot(np.diag(np.linspace(10.0, 1.0, 50)),
-                      randn(50, 80)))
+    X = np.dot(randn(n_samples, rank),
+               np.dot(np.diag(np.linspace(10.0, 1.0, rank)),
+                      randn(rank, n_features)))
 
     # the component-wise variance of the first 50 features is 3 times the
     # mean component-wise variance of the remaining 30 features
     X[:, :50] *= 3
 
-    assert_equal(X.shape, (100, 80))
+    assert_equal(X.shape, (n_samples, n_features))
 
     # the component-wise variance is thus highly varying:
     assert_almost_equal(X.std(axis=0).std(), 43.9, 1)
 
-    # whiten by default
-    X_whitened = PCA(n_comp=30).fit(X).transform(X)
-    assert_equal(X_whitened.shape, (100, 30))
+    # whiten the data while projecting to the lower dim subspace
+    pca = PCA(n_comp=n_components, whiten=True).fit(X)
+    X_whitened = pca.transform(X)
+    assert_equal(X_whitened.shape, (n_samples, n_components))
 
-    # all output component have identical variance
-    assert_almost_equal(X_whitened.std(axis=0).std(), 0.0, 3)
+    # all output components have unit variance
+    assert_almost_equal(X_whitened.std(axis=0), np.ones(n_components))
 
     # it is possible to project on the low dim space without scaling by the
     # singular values
-    X_unwhitened = PCA(n_comp=30, whiten=False).fit(X).transform(X)
-    assert_equal(X_unwhitened.shape, (100, 30))
+    pca = PCA(n_comp=n_components, whiten=False).fit(X)
+    X_unwhitened = pca.transform(X)
+    assert_equal(X_unwhitened.shape, (n_samples, n_components))
 
     # in that case the output components still have varying variances
     assert_almost_equal(X_unwhitened.std(axis=0).std(), 74.1, 1)
@@ -182,7 +188,7 @@ def test_probabilistic_pca_4():
     Xt = randn(n, p) + randn(n, 1)*np.array([3, 4, 5]) + np.array([1, 0, 7])
     ll = np.zeros(p)
     for k in range(p):
-        ppca = ProbabilisticPCA(n_comp=k, whiten=False)
+        ppca = ProbabilisticPCA(n_comp=k)
         ppca.fit(Xl)
         ll[k] = ppca.score(Xt).mean()
 
-- 
GitLab
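As a sanity check of the unit-scale change above: for centered X with SVD
X = U * S * Vt, projecting onto V * diag(1/S) * sqrt(n) reduces X to
U * sqrt(n), whose columns have exactly zero mean and unit variance. Below
is a minimal standalone sketch of that identity (not part of the patch;
all variable names here are illustrative):

    # Sketch only: reproduces the whitening scale used in the patch,
    # assuming centered data and a full-rank SVD.
    import numpy as np

    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 5
    X = rng.randn(n_samples, n_features)
    X -= X.mean(axis=0)  # PCA centers the data before the SVD

    U, S, Vt = np.linalg.svd(X, full_matrices=False)
    # divide by singular values, then rescale by sqrt(n_samples)
    components = np.dot(Vt.T, np.diag(1.0 / S)) * np.sqrt(n_samples)
    X_whitened = np.dot(X, components)

    print(X_whitened.std(axis=0))  # ~[ 1.  1.  1.  1.  1.]

Without the sqrt(n) factor, X @ V @ diag(1/S) equals U, whose columns have
variance 1/n rather than 1, which is what the added rescaling corrects.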