diff --git a/examples/mlcomp_document_classification.py b/examples/mlcomp_document_classification.py
deleted file mode 100644
index e1936abd3533f3e45d1c6327ae5c3cacea026473..0000000000000000000000000000000000000000
--- a/examples/mlcomp_document_classification.py
+++ /dev/null
@@ -1,104 +0,0 @@
-"""
-================================
-Classification of text documents
-================================
-
-This is an example showing how the scikit-learn can be used to classify
-documents by topics using a bag-of-words approach.
-
-The dataset used in this example is the 20 newsgroups dataset and should be
-downloaded from the http://mlcomp.org (free registration required):
-
-  http://mlcomp.org/datasets/379
-
-Once downloaded unzip the arhive somewhere on your filesystem. For instance in::
-
-  % mkdir -p ~/data/mlcomp
-  % cd ~/data/mlcomp
-  % unzip /path/to/dataset-379-20news-18828_XXXXX.zip
-
-You should get a folder ``~/data/mlcomp/379`` with a file named ``metadata`` and
-subfolders ``raw``, ``train`` and ``test`` holding the text documents organized by
-newsgroups.
-
-Then set the ``MLCOMP_DATASETS_HOME`` environment variable pointing to
-the root folder holding the uncompressed archive::
-
-  % export MLCOMP_DATASETS_HOME="~/data/mlcomp"
-
-Then you are ready to run this example using your favorite python shell::
-
-  % ipython examples/mlcomp_document_classification.py
-
-"""
-# Author: Olivier Grisel <olivier.grisel@ensta.org>
-# License: Simplified BSD
-
-from time import time
-import sys
-import os
-import numpy as np
-import pylab as pl
-
-from scikits.learn.datasets import load_mlcomp
-from scikits.learn.svm import LinearSVC
-from scikits.learn.metrics import confusion_matrix
-from scikits.learn.metrics import classification_report
-
-if 'MLCOMP_DATASETS_HOME' not in os.environ:
-    print "Please follow those instructions to get started:"
-    print __doc__
-    sys.exit(0)
-
-# Load the training set
-print "Loading 20 newsgroups training set... "
-t0 = time()
-news_train = load_mlcomp('20news-18828', 'train')
-print "done in %fs" % (time() - t0)
-
-# The documents have been hashed into TF-IDF (Term Frequencies times Inverse
-# Document Frequencies) vectors of a fixed dimension.
-# Currently most scikits.learn wrappers or algorithm implementations are unable
-# to leverage efficiently a sparse datastracture; hence we use a dense
-# representation of a text dataset. Efficient handling of sparse data
-# structures should be expected in an upcoming version of scikits.learn
-print "n_samples: %d, n_features: %d" % news_train.data.shape
-
-print "Training a linear classification model with L1 penalty... "
-parameters = {
-    'loss': 'l1',
-    'penalty': 'l2',
-    'C': 10,
-    'dual': True,
-    'eps': 1e-4,
-}
-print "parameters:", parameters
-t0 = time()
-clf = LinearSVC(**parameters).fit(news_train.data, news_train.target)
-print "done in %fs" % (time() - t0)
-print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100)
-
-print "Loading 20 newsgroups test set... "
-t0 = time()
-news_test = load_mlcomp('20news-18828', 'test')
-print "done in %fs" % (time() - t0)
-
-print "Predicting the labels of the test set..."
-t0 = time()
-pred = clf.predict(news_test.data)
-print "done in %fs" % (time() - t0)
-
-print "Classification report on test set:"
-print classification_report(news_test.target, pred,
-                            class_names=news_test.target_names)
-
-
-cm = confusion_matrix(news_test.target, pred)
-print "Confusion matrix:"
-print cm
-
-# Show confusion matrix
-pl.matshow(cm)
-pl.title('Confusion matrix')
-pl.colorbar()
-pl.show()
diff --git a/scikits/learn/datasets/mlcomp.py b/scikits/learn/datasets/mlcomp.py
index 4ea38494df254332ced437ce55b04758f0b4f1f9..0096990ff9c5fbb5c71a743a07850db40c27dca5 100644
--- a/scikits/learn/datasets/mlcomp.py
+++ b/scikits/learn/datasets/mlcomp.py
@@ -5,9 +5,6 @@ import os
 import numpy as np
 
 from scikits.learn.datasets.base import load_files
-from scikits.learn.feature_extraction.text import HashingVectorizer
-from scikits.learn.feature_extraction.text.sparse import HashingVectorizer as \
-    SparseCountVectorizer
 
 
 def _load_document_classification(dataset_path, metadata, set_=None):
diff --git a/scikits/learn/feature_extraction/tests/test_text.py b/scikits/learn/feature_extraction/tests/test_text.py
index bf5ce70b1e81f9b4310b44ae25944304cbb226d7..f477694d3fb6d5f9e09535348605d6bea9ca955f 100644
--- a/scikits/learn/feature_extraction/tests/test_text.py
+++ b/scikits/learn/feature_extraction/tests/test_text.py
@@ -5,14 +5,12 @@ from scikits.learn.feature_extraction.text import strip_accents
 from scikits.learn.feature_extraction.text import CountVectorizer
 from scikits.learn.feature_extraction.text import TfidfTransformer
 from scikits.learn.feature_extraction.text import Vectorizer
-from scikits.learn.feature_extraction.text import HashingVectorizer
 
 import scikits.learn.feature_extraction.text.sparse as st
 SparseCountVectorizer = st.CountVectorizer
 SparseTfidfTransformer = st.TfidfTransformer
 SparseVectorizer = st.Vectorizer
-SparseHashingVectorizer = st.HashingVectorizer
 
 from scikits.learn.grid_search import GridSearchCV
 from scikits.learn.pipeline import Pipeline
 
@@ -108,72 +106,12 @@ def test_char_ngram_analyzer():
     assert_equal(cnga.analyze(text)[-5:], expected)
 
 
-def test_dense_hashed_tf_idf():
-    hv = HashingVectorizer(dim=1000, probes=3)
-    hv.vectorize(JUNK_FOOD_DOCS)
-    hv.vectorize(NOTJUNK_FOOD_DOCS)
-
-    # extract the TF-IDF data
-    X = hv.get_tfidf()
-    assert_equal(X.shape, (11, 1000))
-
-    # label junk food as -1, the others as +1
-    y = np.ones(X.shape[0])
-    y[:6] = -1
-
-    # train and test a classifier
-    clf = DenseLinearSVC(C=10).fit(X[1:-1], y[1:-1])
-    assert_equal(clf.predict([X[0]]), [-1])
-    assert_equal(clf.predict([X[-1]]), [1])
-
-
-def test_sparse_hashed_tf_idf():
-    hv = SparseHashingVectorizer(dim=1000000, probes=3)
-    hv.vectorize(JUNK_FOOD_DOCS)
-    hv.vectorize(NOTJUNK_FOOD_DOCS)
-
-    # extract the TF-IDF data
-    X = hv.get_tfidf()
-    assert_equal(X.shape, (11, 1000000))
-
-    # label junk food as -1, the others as +1
-    y = np.ones(X.shape[0])
-    y[:6] = -1
-
-    # train and test a classifier
-    clf = SparseLinearSVC(C=10).fit(X[1:-1], y[1:-1])
-    assert_equal(clf.predict(X[0, :]), [-1])
-    assert_equal(clf.predict(X[-1, :]), [1])
-
-
-def test_dense_sparse_hashed_tf_idf_sanity():
-
-    hv = HashingVectorizer(dim=100, probes=3)
-    shv = SparseHashingVectorizer(dim=100, probes=3)
-
-    hv.vectorize(JUNK_FOOD_DOCS)
-    shv.vectorize(JUNK_FOOD_DOCS)
-
-    # check that running TF IDF estimates are the same
-    dense_tf_idf = hv.get_tfidf()
-    sparse_tfidf = shv.get_tfidf().todense()
-
-    assert_array_almost_equal(dense_tf_idf, sparse_tfidf)
-
-    # check that incremental behaviour stays the same
-    hv.vectorize(NOTJUNK_FOOD_DOCS)
-    shv.vectorize(NOTJUNK_FOOD_DOCS)
-
-    dense_tf_idf = hv.get_tfidf()
-    sparse_tfidf = shv.get_tfidf().todense()
-
-    assert_array_almost_equal(dense_tf_idf, sparse_tfidf)
-
 def toarray(a):
     if hasattr(a, "toarray"):
         a = a.toarray()
     return a
 
+
 def _test_vectorizer(cv_class, tf_class, v_class):
     # results to be compared
     res = []
diff --git a/scikits/learn/feature_extraction/text/__init__.py b/scikits/learn/feature_extraction/text/__init__.py
index 8919688b392842b3af97ead4517f838b12233e63..585ea1c806a29ce5c6c81a0e8c8cd4d943a9e0ba 100644
--- a/scikits/learn/feature_extraction/text/__init__.py
+++ b/scikits/learn/feature_extraction/text/__init__.py
@@ -2,5 +2,4 @@
 from .dense import ENGLISH_STOP_WORDS, strip_accents, strip_tags, \
                    DefaultPreprocessor, DEFAULT_PREPROCESSOR, \
                    WordNGramAnalyzer, CharNGramAnalyzer, DEFAULT_ANALYZER, \
-                   CountVectorizer, TfidfTransformer, Vectorizer, \
-                   HashingVectorizer
+                   CountVectorizer, TfidfTransformer, Vectorizer
diff --git a/scikits/learn/feature_extraction/text/dense.py b/scikits/learn/feature_extraction/text/dense.py
index a923c496bf8505272f38f29a7fd00b4b572a68f4..777191633654966fa2b45f0530eaaf09d3c0d87e 100644
--- a/scikits/learn/feature_extraction/text/dense.py
+++ b/scikits/learn/feature_extraction/text/dense.py
@@ -481,100 +481,3 @@ class Vectorizer(BaseVectorizer):
         self.tfidf = TfidfTransformer(use_tf, use_idf)
 
 
-# TODO: refactor the HashingVectorizer implementation to reuse the
-# BaseVectorizer infrastructure as mush as possible and align the API
-
-class HashingVectorizer(object):
-    """Compute term frequencies vectors using hashed term space
-
-    See the Hashing-trick related papers referenced by John Langford on this
-    page to get a grasp on the usefulness of this representation:
-
-      http://hunch.net/~jl/projects/hash_reps/index.html
-
-    dim is the number of buckets, higher dim means lower collision rate but
-    also higher memory requirements and higher processing times on the
-    resulting tfidf vectors.
-
-    Documents is a sequence of lists of tokens to initialize the DF estimates.
-
-    TODO handle bigrams in a smart way such as demonstrated here:
-
-      http://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
-
-    """
-    # TODO: implement me using the murmurhash that might be faster: but profile
-    # me first :)
-
-    def __init__(self, dim=5000, probes=1, use_idf=True,
-                 analyzer=DEFAULT_ANALYZER):
-        self.dim = dim
-        self.probes = probes
-        self.analyzer = analyzer
-        self.use_idf = use_idf
-
-        # start counts at one to avoid zero division while
-        # computing IDF
-        self.df_counts = np.ones(dim, dtype=long)
-        self.tf_vectors = None
-
-    def hash_sign(self, token, probe=0):
-        """Compute the hash of token with number proble and hashed sign"""
-        h = hash(token + (probe * u"#"))
-        return abs(h) % self.dim, 1.0 if h % 2 == 0 else -1.0
-
-    def _sample_document(self, text, tf_vector, update_estimates=True):
-        """Extract features from text and update running freq estimates"""
-        tokens = self.analyzer.analyze(text)
-        for token in tokens:
-            # TODO add support for cooccurence tokens in a sentence
-            # window
-            for probe in xrange(self.probes):
-                i, incr = self.hash_sign(token, probe)
-                tf_vector[i] += incr
-        tf_vector /= len(tokens) * self.probes
-
-        if update_estimates and self.use_idf:
-            # update the running DF estimate
-            self.df_counts += tf_vector != 0.0
-        return tf_vector
-
-    def get_idf(self):
-        n_samples = float(len(self.tf_vectors))
-        return np.log(n_samples / self.df_counts)
-
-    def get_tfidf(self):
-        """Compute the TF-log(IDF) vectors of the sampled documents"""
-        if self.tf_vectors is None:
-            return None
-        return self.tf_vectors * self.get_idf()
-
-    def vectorize(self, text_documents):
-        """Vectorize a batch of documents in python utf-8 strings or unicode"""
-        tf_vectors = np.zeros((len(text_documents), self.dim))
-        for i, text in enumerate(text_documents):
-            self._sample_document(text, tf_vectors[i])
-
-        if self.tf_vectors is None:
-            self.tf_vectors = tf_vectors
-        else:
-            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))
-
-    def vectorize_files(self, document_filepaths):
-        """Vectorize a batch of documents stored in utf-8 text files"""
-        tf_vectors = np.zeros((len(document_filepaths), self.dim))
-        for i, filepath in enumerate(document_filepaths):
-            self._sample_document(file(filepath).read(), tf_vectors[i])
-
-        if self.tf_vectors is None:
-            self.tf_vectors = tf_vectors
-        else:
-            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))
-
-    def get_vectors(self):
-        if self.use_idf:
-            return self.get_tfidf()
-        else:
-            return self.tf_vectors
-
-
diff --git a/scikits/learn/feature_extraction/text/sparse.py b/scikits/learn/feature_extraction/text/sparse.py
index c49214d106cf2100212974604d332d390499a664..55bc123cc1621c9047acb3da5412ef97e4c0a075 100644
--- a/scikits/learn/feature_extraction/text/sparse.py
+++ b/scikits/learn/feature_extraction/text/sparse.py
@@ -99,95 +99,3 @@ class Vectorizer(BaseVectorizer):
         self.tfidf = TfidfTransformer(use_tf, use_idf)
 
 
-# TODO: refactor the HashingVectorizer implementation to reuse the
-# BaseVectorizer infrastructure as mush as possible and align the API
-
-class HashingVectorizer(object):
-    """Compute term freq vectors using hashed term space in a sparse matrix
-
-    The logic is the same as HashingVectorizer but it is possible to use much
-    larger dimension vectors without memory issues thanks to the usage of
-    scipy.sparse datastructure to store the tf vectors.
-
-    This function requires scipy 0.7 or higher.
-    """
-
-    def __init__(self, dim=100000, probes=1, use_idf=True,
-                 analyzer=DEFAULT_ANALYZER):
-        self.dim = dim
-        self.probes = probes
-        self.analyzer = analyzer
-        self.use_idf = use_idf
-
-        # start counts at one to avoid zero division while
-        # computing IDF
-        self.df_counts = np.ones(dim, dtype=long)
-        self.tf_vectors = None
-
-    def hash_sign(self, token, probe=0):
-        h = hash(token + (probe * u"#"))
-        return abs(h) % self.dim, 1.0 if h % 2 == 0 else -1.0
-
-    def _sample_document(self, text, tf_vectors, idx=0, update_estimates=True):
-        """Extract features from text and update running freq estimates"""
-
-        tokens = self.analyzer.analyze(text)
-        counts = defaultdict(lambda: 0.0)
-        for token in tokens:
-            # TODO add support for cooccurence tokens in a sentence
-            # window
-            for probe in xrange(self.probes):
-                i, incr = self.hash_sign(token, probe)
-                counts[i] += incr
-        for k, v in counts.iteritems():
-            if v == 0.0:
-                # can happen if equally frequent conflicting features
-                continue
-            tf_vectors[idx, k] = v / (len(tokens) * self.probes)
-
-            if update_estimates and self.use_idf:
-                # update the running DF estimate
-                self.df_counts[k] += 1
-
-    def get_idf(self):
-        n_samples = float(self.tf_vectors.shape[0])
-        return np.log(n_samples / self.df_counts)
-
-    def get_tfidf(self):
-        """Compute the TF-log(IDF) vectors of the sampled documents"""
-        coo = self.tf_vectors.tocoo()
-        tf_idf = sp.lil_matrix(coo.shape)
-        idf = self.get_idf()
-        data, row, col = coo.data, coo.row, coo.col
-        for i in xrange(len(data)):
-            tf_idf[row[i], col[i]] = data[i] * idf[col[i]]
-        return tf_idf.tocsr()
-
-    def vectorize(self, text_documents):
-        """Vectorize a batch of documents in python utf-8 strings or unicode"""
-        tf_vectors = sp.dok_matrix((len(text_documents), self.dim))
-        for i, text in enumerate(text_documents):
-            self._sample_document(text, tf_vectors, i)
-
-        if self.tf_vectors is None:
-            self.tf_vectors = tf_vectors
-        else:
-            self.tf_vectors = sp.vstack((self.tf_vectors, tf_vectors))
-
-    def vectorize_files(self, document_filepaths):
-        """Vectorize a batch of utf-8 text files"""
-        tf_vectors = sp.dok_matrix((len(document_filepaths), self.dim))
-        for i, filepath in enumerate(document_filepaths):
-            self._sample_document(file(filepath).read(), tf_vectors, i)
-
-        if self.tf_vectors is None:
-            self.tf_vectors = tf_vectors
-        else:
-            self.tf_vectors = sp.vstack((self.tf_vectors, tf_vectors))
-
-    def get_vectors(self):
-        if self.use_idf:
-            return self.get_tfidf()
-        else:
-            return self.tf_vectors
-