diff --git a/doc/datasets/twenty_newsgroups.rst b/doc/datasets/twenty_newsgroups.rst index c4fd379e2111ac07ae2cba8011bc9b9206f99134..f60f405b94a443483fc9925df2ae9302dc152236 100644 --- a/doc/datasets/twenty_newsgroups.rst +++ b/doc/datasets/twenty_newsgroups.rst @@ -99,6 +99,9 @@ zero features):: >>> vectors.nnz / vectors.shape[0] 118 +``sklearn.datasets.fetch_20newsgroups_tfidf`` is a function which returns +ready-to-use TF-IDF features instead of file names. + .. _`20 newsgroups website`: http://people.csail.mit.edu/jrennie/20Newsgroups/ .. _`TF-IDF`: http://en.wikipedia.org/wiki/Tf-idf diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py index 5ce9cdb4add899e44ab0260da4d08cc9fe2803ce..e8ebaf8f8dc711552adf9c244c85e5873dc8247d 100644 --- a/sklearn/datasets/twenty_newsgroups.py +++ b/sklearn/datasets/twenty_newsgroups.py @@ -97,7 +97,7 @@ def download_20newsgroups(target_dir, cache_path): def fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, download_if_missing=True): - """Load the filenames of the 20 newsgroups dataset + """Load the filenames of the 20 newsgroups dataset. Parameters ---------- @@ -225,6 +225,7 @@ def fetch_20newsgroups_tfidf(subset="train", data_home=None): data_home = get_data_home(data_home=data_home) mem = Memory(cachedir=data_home, verbose=False) + # we shuffle but use a fixed seed for the memoization data_train = fetch_20newsgroups(data_home=data_home, subset='train', categories=None,