diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 4de836363f7a7390a19be38005ffdd3d548e976d..616b13f1e5327454df5001e86f521b5023b0866c 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -122,6 +122,11 @@ Bug fixes
      when a numpy array is passed in for weights. :issue:`7983` by
      :user:`Vincent Pham <vincentpham1991>`.
 
+   - Fix a bug in :class:`sklearn.decomposition.LatentDirichletAllocation`
+     where the ``perplexity`` method was returning incorrect results because
+     the ``transform`` method returns normalized document topic distributions
+     as of version 0.18. :issue:`7954` by :user:`Gary Foreman <garyForeman>`.
+
    - Fix a bug where :class:`sklearn.ensemble.GradientBoostingClassifier` and
      :class:`sklearn.ensemble.GradientBoostingRegressor` ignored the
      ``min_impurity_split`` parameter.
@@ -135,6 +140,12 @@ API changes summary
      ensemble estimators (deriving from :class:`ensemble.BaseEnsemble`)
      now only have ``self.estimators_`` available after ``fit``.
      :issue:`7464` by `Lars Buitinck`_ and `Loic Esteve`_.
+
+   - Deprecate the ``doc_topic_distr`` argument of the ``perplexity`` method
+     in :class:`sklearn.decomposition.LatentDirichletAllocation` because the
+     user no longer has access to the unnormalized document topic distribution
+     needed for the perplexity calculation. :issue:`7954` by
+     :user:`Gary Foreman <garyForeman>`.
 
 .. _changes_0_18_1:
 
diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index 3e7a54cc2a6cf89ded00c5adc3a83076c6531113..8e0c5bfe6b4151a7eef7532942df52c8a9e5b8ce 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -505,7 +505,7 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
             warnings.warn("The default value for 'learning_method' will be "
                           "changed from 'online' to 'batch' in the release 0.20. "
                           "This warning was introduced in 0.18.",
-                         DeprecationWarning)
+                          DeprecationWarning)
             learning_method = 'online'
 
         batch_size = self.batch_size
@@ -531,8 +531,8 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
                     doc_topics_distr, _ = self._e_step(X, cal_sstats=False,
                                                        random_init=False,
                                                        parallel=parallel)
-                    bound = self.perplexity(X, doc_topics_distr,
-                                            sub_sampling=False)
+                    bound = self._perplexity_precomp_distr(X, doc_topics_distr,
+                                                           sub_sampling=False)
                     if self.verbose:
                         print('iteration: %d, perplexity: %.4f'
                               % (i + 1, bound))
@@ -541,10 +541,18 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
                         break
                     last_bound = bound
                 self.n_iter_ += 1
+
+            # calculate final perplexity value on train set
+            doc_topics_distr, _ = self._e_step(X, cal_sstats=False,
+                                               random_init=False,
+                                               parallel=parallel)
+            self.bound_ = self._perplexity_precomp_distr(X, doc_topics_distr,
+                                                         sub_sampling=False)
+
         return self
 
-    def transform(self, X):
-        """Transform data X according to the fitted model.
+    def _unnormalized_transform(self, X):
+        """Transform data X according to fitted model.
 
         Parameters
         ----------
@@ -556,7 +564,6 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
         doc_topic_distr : shape=(n_samples, n_topics)
             Document topic distribution for X.
         """
-
         if not hasattr(self, 'components_'):
             raise NotFittedError("no 'components_' attribute in model."
                                  " Please fit model first.")
@@ -572,7 +579,26 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
 
         doc_topic_distr, _ = self._e_step(X, cal_sstats=False,
                                           random_init=False)
-        # normalize doc_topic_distr
+
+        return doc_topic_distr
+
+    def transform(self, X):
+        """Transform data X according to the fitted model.
+
+        .. versionchanged:: 0.18
+            *doc_topic_distr* is now normalized
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape=(n_samples, n_features)
+            Document word matrix.
+
+        Returns
+        -------
+        doc_topic_distr : shape=(n_samples, n_topics)
+            Document topic distribution for X.
+        """
+        doc_topic_distr = self._unnormalized_transform(X)
         doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]
 
         return doc_topic_distr
@@ -665,15 +691,16 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
         score : float
             Use approximate bound as score.
         """
-        X = self._check_non_neg_array(X, "LatentDirichletAllocation.score")
 
-        doc_topic_distr = self.transform(X)
+        doc_topic_distr = self._unnormalized_transform(X)
         score = self._approx_bound(X, doc_topic_distr, sub_sampling=False)
         return score
 
-    def perplexity(self, X, doc_topic_distr=None, sub_sampling=False):
-        """Calculate approximate perplexity for data X.
+    def _perplexity_precomp_distr(self, X, doc_topic_distr=None,
+                                  sub_sampling=False):
+        """Calculate approximate perplexity for data X with the ability to
+        accept a precomputed doc_topic_distr.
 
         Perplexity is defined as exp(-1. * log-likelihood per word)
 
         Parameters
@@ -699,7 +726,7 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
                                       "LatentDirichletAllocation.perplexity")
 
         if doc_topic_distr is None:
-            doc_topic_distr = self.transform(X)
+            doc_topic_distr = self._unnormalized_transform(X)
         else:
             n_samples, n_topics = doc_topic_distr.shape
             if n_samples != X.shape[0]:
@@ -719,3 +746,35 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
         perword_bound = bound / word_cnt
 
         return np.exp(-1.0 * perword_bound)
+
+    def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False):
+        """Calculate approximate perplexity for data X.
+
+        Perplexity is defined as exp(-1. * log-likelihood per word)
+
+        .. versionchanged:: 0.19
+            *doc_topic_distr* argument has been deprecated because the user
+            no longer has access to the unnormalized distribution
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, [n_samples, n_features]
+            Document word matrix.
+
+        doc_topic_distr : None or array, shape=(n_samples, n_topics)
+            Document topic distribution.
+            If it is None, it will be generated by applying transform on X.
+
+            .. deprecated:: 0.19
+
+        Returns
+        -------
+        score : float
+            Perplexity score.
+        """
+        if doc_topic_distr != 'deprecated':
+            warnings.warn("Argument 'doc_topic_distr' is deprecated and will "
+                          "be ignored as of 0.19. "
+                          "Support for this argument "
+                          "will be removed in 0.21.", DeprecationWarning)
+
+        return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling)
diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py
index d55bf5a453d03df03db042412582605dd56d5b1c..c3a221fe4800a7e34feeadd71e72593fa092a20f 100644
--- a/sklearn/decomposition/tests/test_online_lda.py
+++ b/sklearn/decomposition/tests/test_online_lda.py
@@ -14,6 +14,7 @@ from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_greater_equal
 from sklearn.utils.testing import assert_raises_regexp
 from sklearn.utils.testing import if_safe_multiprocessing_with_blas
+from sklearn.utils.testing import assert_warns
 
 from sklearn.exceptions import NotFittedError
 from sklearn.externals.six.moves import xrange
@@ -238,12 +239,12 @@ def test_lda_preplexity_mismatch():
     lda.fit(X)
     # invalid samples
     invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_topics))
-    assert_raises_regexp(ValueError, r'Number of samples', lda.perplexity, X,
-                         invalid_n_samples)
+    assert_raises_regexp(ValueError, r'Number of samples',
+                         lda._perplexity_precomp_distr, X, invalid_n_samples)
     # invalid topic number
     invalid_n_topics = rng.randint(4, size=(n_samples, n_topics + 1))
-    assert_raises_regexp(ValueError, r'Number of topics', lda.perplexity, X,
-                         invalid_n_topics)
+    assert_raises_regexp(ValueError, r'Number of topics',
+                         lda._perplexity_precomp_distr, X, invalid_n_topics)
 
 
 def test_lda_perplexity():
@@ -257,15 +258,15 @@ def test_lda_perplexity():
         lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                           learning_method=method,
                                           total_samples=100, random_state=0)
-        distr_1 = lda_1.fit_transform(X)
-        perp_1 = lda_1.perplexity(X, distr_1, sub_sampling=False)
+        lda_1.fit(X)
+        perp_1 = lda_1.perplexity(X, sub_sampling=False)
 
-        distr_2 = lda_2.fit_transform(X)
-        perp_2 = lda_2.perplexity(X, distr_2, sub_sampling=False)
+        lda_2.fit(X)
+        perp_2 = lda_2.perplexity(X, sub_sampling=False)
         assert_greater_equal(perp_1, perp_2)
 
-        perp_1_subsampling = lda_1.perplexity(X, distr_1, sub_sampling=True)
-        perp_2_subsampling = lda_2.perplexity(X, distr_2, sub_sampling=True)
+        perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
+        perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
         assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
 
 
@@ -295,12 +296,10 @@ def test_perplexity_input_format():
     lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
                                     learning_method='batch',
                                     total_samples=100, random_state=0)
-    distr = lda.fit_transform(X)
+    lda.fit(X)
     perp_1 = lda.perplexity(X)
-    perp_2 = lda.perplexity(X, distr)
-    perp_3 = lda.perplexity(X.toarray(), distr)
+    perp_2 = lda.perplexity(X.toarray())
     assert_almost_equal(perp_1, perp_2)
-    assert_almost_equal(perp_1, perp_3)
 
 
 def test_lda_score_perplexity():
@@ -308,14 +307,45 @@ def test_lda_score_perplexity():
     n_topics, X = _build_sparse_mtx()
     lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                     random_state=0)
-    distr = lda.fit_transform(X)
-    perplexity_1 = lda.perplexity(X, distr, sub_sampling=False)
+    lda.fit(X)
+    perplexity_1 = lda.perplexity(X, sub_sampling=False)
 
     score = lda.score(X)
     perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
     assert_almost_equal(perplexity_1, perplexity_2)
 
 
+def test_lda_fit_perplexity():
+    # Test that the perplexity computed during fit is consistent with what is
+    # returned by the perplexity method
+    n_topics, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
+                                    learning_method='batch', random_state=0,
+                                    evaluate_every=1)
+    lda.fit(X)
+
+    # Perplexity computed at end of fit method
+    perplexity1 = lda.bound_
+
+    # Result of perplexity method on the train set
+    perplexity2 = lda.perplexity(X)
+
+    assert_almost_equal(perplexity1, perplexity2)
+
+
+def test_doc_topic_distr_deprecation():
+    # Test that the appropriate warning message is displayed when a user
+    # attempts to pass the doc_topic_distr argument to the perplexity method
+    n_topics, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
+                                    learning_method='batch',
+                                    total_samples=100, random_state=0)
+    distr1 = lda.fit_transform(X)
+    distr2 = None
+    assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
+    assert_warns(DeprecationWarning, lda.perplexity, X, distr2)
+
+
 def test_lda_empty_docs():
     """Test LDA on empty document (all-zero rows)."""
     Z = np.zeros((5, 4))
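
Usage note (not part of the patch): a minimal sketch of how ``perplexity`` is
called after this change. It assumes the ``n_topics`` constructor argument used
in the tests above; later scikit-learn releases rename it to ``n_components``::

    import numpy as np
    from sklearn.decomposition import LatentDirichletAllocation

    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))  # small document-word count matrix

    lda = LatentDirichletAllocation(n_topics=3, learning_method='batch',
                                    max_iter=10, random_state=0)
    lda.fit(X)

    # The unnormalized document-topic distribution is computed internally,
    # so no second argument is passed.
    print(lda.perplexity(X))

    # Passing doc_topic_distr explicitly still runs, but the value is now
    # ignored and a DeprecationWarning is raised (removal planned for 0.21).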