From dbbdbe40e62d8287678f9ba3a6bc2a1bcb9e3010 Mon Sep 17 00:00:00 2001 From: Mathieu Blondel <mathieu@mblondel.org> Date: Wed, 21 Dec 2011 10:17:03 +0100 Subject: [PATCH] Better doc for the 20newsgroup dataset loader. --- doc/datasets/twenty_newsgroups.rst | 17 ++++++++--------- sklearn/datasets/twenty_newsgroups.py | 3 +++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/doc/datasets/twenty_newsgroups.rst b/doc/datasets/twenty_newsgroups.rst index f60f405b94..0f4d728f71 100644 --- a/doc/datasets/twenty_newsgroups.rst +++ b/doc/datasets/twenty_newsgroups.rst @@ -7,15 +7,14 @@ and the other one for testing (or for performance evaluation). The split between the train and test set is based upon a messages posted before and after a specific date. -The 20 newsgroups dataset is also available through the generic -``mldata`` dataset loader introduced earlier. However mldata -provides a version where the data is already vectorized. - -This is not the case for this loader. Instead, it returns the list of -the raw text files that can be fed to text feature extractors such as -:class:`sklearn.feature_extraction.text.Vectorizer` with custom -parameters so as to extract feature vectors. - +This module contains two loaders. The first one, +``sklearn.datasets.fetch_20newsgroups``, +returns a list of the raw text files that can be fed to text feature +extractors such as :class:`sklearn.feature_extraction.text.Vectorizer` +with custom parameters so as to extract feature vectors. +The second one, ``sklearn.datasets.fetch_20newsgroups_tfidf``, +returns ready-to-use features, i.e., it is not necessary to use a feature +extractor. Usage ----- diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py index e8ebaf8f8d..706eb96467 100644 --- a/sklearn/datasets/twenty_newsgroups.py +++ b/sklearn/datasets/twenty_newsgroups.py @@ -215,6 +215,9 @@ def fetch_20newsgroups_tfidf(subset="train", data_home=None): ------- bunch : Bunch object + bunch.data: sparse matrix, shape [n_samples, n_features] + bunch.target: array, shape [n_samples] + bunch.target_names: list, length [n_classes] """ def _vectorize(data_train, data_test): vectorizer = Vectorizer() -- GitLab