From 3d2ea0d3e6d49be8acd57ed06f20d9e0ed318695 Mon Sep 17 00:00:00 2001 From: Mathieu Blondel <mathieu@mblondel.org> Date: Wed, 21 Dec 2011 11:37:33 +0100 Subject: [PATCH] Use int16 for more compactness. --- doc/datasets/twenty_newsgroups.rst | 4 ++-- doc/modules/classes.rst | 2 +- sklearn/datasets/__init__.py | 2 +- sklearn/datasets/tests/test_20news.py | 11 +++++++---- sklearn/datasets/twenty_newsgroups.py | 16 ++++++++++++---- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/doc/datasets/twenty_newsgroups.rst b/doc/datasets/twenty_newsgroups.rst index 0f4d728f71..e82b5e4112 100644 --- a/doc/datasets/twenty_newsgroups.rst +++ b/doc/datasets/twenty_newsgroups.rst @@ -12,7 +12,7 @@ This module contains two loaders. The first one, returns a list of the raw text files that can be fed to text feature extractors such as :class:`sklearn.feature_extraction.text.Vectorizer` with custom parameters so as to extract feature vectors. -The second one, ``sklearn.datasets.fetch_20newsgroups_tfidf``, +The second one, ``sklearn.datasets.fetch_20newsgroups_vectorized``, returns ready-to-use features, i.e., it is not necessary to use a feature extractor. @@ -98,7 +98,7 @@ zero features):: >>> vectors.nnz / vectors.shape[0] 118 -``sklearn.datasets.fetch_20newsgroups_tfidf`` is a function which returns +``sklearn.datasets.fetch_20newsgroups_vectorized`` is a function which returns ready-to-use tfidf features instead of file names. .. _`20 newsgroups website`: http://people.csail.mit.edu/jrennie/20Newsgroups/ diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 6eb5103980..4d089e1448 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -130,7 +130,7 @@ Loaders datasets.fetch_lfw_people datasets.load_20newsgroups datasets.fetch_20newsgroups - datasets.fetch_20newsgroups_tfidf + datasets.fetch_20newsgroups_vectorized datasets.fetch_olivetti_faces Samples generator diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index efa1015b99..74fb2e7108 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -20,7 +20,7 @@ from .lfw import load_lfw_people from .lfw import fetch_lfw_pairs from .lfw import fetch_lfw_people from .twenty_newsgroups import fetch_20newsgroups -from .twenty_newsgroups import fetch_20newsgroups_tfidf +from .twenty_newsgroups import fetch_20newsgroups_vectorized from .twenty_newsgroups import load_20newsgroups from .mldata import fetch_mldata, mldata_filename from .samples_generator import make_classification diff --git a/sklearn/datasets/tests/test_20news.py b/sklearn/datasets/tests/test_20news.py index 22fa6256e3..61d428a6a6 100644 --- a/sklearn/datasets/tests/test_20news.py +++ b/sklearn/datasets/tests/test_20news.py @@ -34,18 +34,21 @@ def test_20news(): assert_equal(entry1, entry2) -def test_20news_tfidf(): +def test_20news_vectorized(): # This test is slow. raise SkipTest - bunch = datasets.fetch_20newsgroups_tfidf(subset="train") + bunch = datasets.fetch_20newsgroups_vectorized(subset="train") assert_equal(bunch.data.shape, (11314, 107130)) assert_equal(bunch.target.shape[0], 11314) + assert_equal(bunch.data.dtype, np.float64) - bunch = datasets.fetch_20newsgroups_tfidf(subset="test") + bunch = datasets.fetch_20newsgroups_vectorized(subset="test") assert_equal(bunch.data.shape, (7532, 107130)) assert_equal(bunch.target.shape[0], 7532) + assert_equal(bunch.data.dtype, np.float64) - bunch = datasets.fetch_20newsgroups_tfidf(subset="all") + bunch = datasets.fetch_20newsgroups_vectorized(subset="all") assert_equal(bunch.data.shape, (11314 + 7532, 107130)) assert_equal(bunch.target.shape[0], 11314 + 7532) + assert_equal(bunch.data.dtype, np.float64) diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py index ab66d4f5a6..ea06d74c0a 100644 --- a/sklearn/datasets/twenty_newsgroups.py +++ b/sklearn/datasets/twenty_newsgroups.py @@ -50,8 +50,9 @@ from .base import Bunch from .base import load_files from ..utils import check_random_state, deprecated from ..utils.fixes import in1d -from ..feature_extraction.text import Vectorizer -from sklearn.externals import joblib +from ..feature_extraction.text import CountVectorizer +from ..preprocessing import normalize +from ..externals import joblib logger = logging.getLogger(__name__) @@ -192,7 +193,7 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, return data -def fetch_20newsgroups_tfidf(subset="train", data_home=None): +def fetch_20newsgroups_vectorized(subset="train", data_home=None): """Load the 20 newsgroups dataset and transform it into tf-idf vectors This is a convenience function; the tf-idf transformation is done using the @@ -238,11 +239,18 @@ def fetch_20newsgroups_tfidf(subset="train", data_home=None): if os.path.exists(target_file): X_train, X_test = joblib.load(target_file) else: - vectorizer = Vectorizer() + vectorizer = CountVectorizer(dtype=np.int16) X_train = vectorizer.fit_transform(data_train.data) X_test = vectorizer.transform(data_test.data) joblib.dump((X_train, X_test), target_file) + # the data is stored as int16 for compactness + # but normalize needs floats + X_train = X_train.astype(np.float64) + X_test = X_test.astype(np.float64) + normalize(X_train, copy=False) + normalize(X_test, copy=False) + target_names = data_train.target_names if subset == "train": -- GitLab