From dbbdbe40e62d8287678f9ba3a6bc2a1bcb9e3010 Mon Sep 17 00:00:00 2001
From: Mathieu Blondel <mathieu@mblondel.org>
Date: Wed, 21 Dec 2011 10:17:03 +0100
Subject: [PATCH] Better doc for the 20newsgroup dataset loader.

---
 doc/datasets/twenty_newsgroups.rst    | 17 ++++++++---------
 sklearn/datasets/twenty_newsgroups.py |  3 +++
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/doc/datasets/twenty_newsgroups.rst b/doc/datasets/twenty_newsgroups.rst
index f60f405b94..0f4d728f71 100644
--- a/doc/datasets/twenty_newsgroups.rst
+++ b/doc/datasets/twenty_newsgroups.rst
@@ -7,15 +7,14 @@ and the other one for testing (or for performance evaluation). The split
 between the train and test set is based upon a messages posted before
 and after a specific date.
 
-The 20 newsgroups dataset is also available through the generic
-``mldata`` dataset loader introduced earlier. However mldata
-provides a version where the data is already vectorized.
-
-This is not the case for this loader. Instead, it returns the list of
-the raw text files that can be fed to  text feature extractors such as
-:class:`sklearn.feature_extraction.text.Vectorizer` with custom
-parameters so as to extract feature vectors.
-
+This module contains two loaders. The first one,
+``sklearn.datasets.fetch_20newsgroups``,
+returns a list of the raw text files that can be fed to text feature
+extractors such as :class:`sklearn.feature_extraction.text.Vectorizer`
+with custom parameters so as to extract feature vectors.
+The second one, ``sklearn.datasets.fetch_20newsgroups_tfidf``,
+returns ready-to-use features, i.e., it is not necessary to use a feature
+extractor.
 
 Usage
 -----
diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py
index e8ebaf8f8d..706eb96467 100644
--- a/sklearn/datasets/twenty_newsgroups.py
+++ b/sklearn/datasets/twenty_newsgroups.py
@@ -215,6 +215,9 @@ def fetch_20newsgroups_tfidf(subset="train", data_home=None):
     -------
 
     bunch : Bunch object
+        bunch.data: sparse matrix, shape [n_samples, n_features]
+        bunch.target: array, shape [n_samples]
+        bunch.target_names: list, length [n_classes]
     """
     def _vectorize(data_train, data_test):
         vectorizer = Vectorizer()
-- 
GitLab