diff --git a/scikits/learn/features/tests/test_text.py b/scikits/learn/features/tests/test_text.py
index 421361466d8b7fad268b7e89b9eb713c851e95ef..688882c50722b8c2504b19055edcd36802d1f102 100644
--- a/scikits/learn/features/tests/test_text.py
+++ b/scikits/learn/features/tests/test_text.py
@@ -2,6 +2,7 @@
 from scikits.learn.features.text import strip_accents
 from scikits.learn.features.text import SimpleAnalyzer
 from scikits.learn.features.text import HashingVectorizer
 from scikits.learn.logistic import LogisticRegression
+import numpy as np
 
 from nose.tools import *
@@ -44,24 +45,27 @@ def test_tf_idf():
     hv = HashingVectorizer(dim=1000, probes=3)
 
     # junk food documents
-    hv.sample_document("the pizza pizza beer", label=-1)
-    hv.sample_document("the pizza pizza beer", label=-1)
-    hv.sample_document("the the pizza beer beer", label=-1)
-    hv.sample_document("the pizza beer beer", label=-1)
-    hv.sample_document("the coke beer coke", label=-1)
-    hv.sample_document("the coke pizza pizza", label=-1)
+    hv.sample_document("the pizza pizza beer")
+    hv.sample_document("the pizza pizza beer")
+    hv.sample_document("the the pizza beer beer")
+    hv.sample_document("the pizza beer beer")
+    hv.sample_document("the coke beer coke")
+    hv.sample_document("the coke pizza pizza")
 
     # not-junk food documents
-    hv.sample_document("the salad celeri", label=1)
-    hv.sample_document("the salad salad sparkling water", label=1)
-    hv.sample_document("the the celeri celeri", label=1)
-    hv.sample_document("the tomato tomato salad water", label=1)
-    hv.sample_document("the tomato salad water", label=1)
+    hv.sample_document("the salad celeri")
+    hv.sample_document("the salad salad sparkling water")
+    hv.sample_document("the the celeri celeri")
+    hv.sample_document("the tomato tomato salad water")
+    hv.sample_document("the tomato salad water")
 
     # extract the TF-IDF data
-    X, y = hv.get_tfidf(), hv.labels
+    X = hv.get_tfidf()
     assert_equal(X.shape, (11, 1000))
-    assert_equal(len(y), 11)
+
+    # label junk food as -1, the others as +1
+    y = np.ones(X.shape[0])
+    y[:6] = -1
 
     # train and test a classifier
     clf = LogisticRegression().fit(X[1:-1], y[1:-1])
diff --git a/scikits/learn/features/text.py b/scikits/learn/features/text.py
index adc9f2e0d9f806fc38a4a68b1d12fbb6d86854f9..41766e58e4e2d7f1f5fb2422de2766df9d40ba2e 100644
--- a/scikits/learn/features/text.py
+++ b/scikits/learn/features/text.py
@@ -63,14 +63,13 @@ class HashingVectorizer(object):
         # computing IDF
         self.df_counts = np.ones(dim, dtype=long)
         self.tf_vectors = None
-        self.labels = []
         self.sampled = 0
 
     def hash_sign(self, token, probe=0):
         h = hash(token + (probe * u"#"))
         return abs(h) % self.dim, 1.0 if h % 2 == 0 else -1.0
 
-    def sample_document(self, text, tf_vector=None, label=None):
+    def sample_document(self, text, tf_vector=None, update_estimates=True):
         """Extract features from text and update running freq estimates"""
         if tf_vector is None:
             # allocate term frequency vector and stack to history
@@ -80,8 +79,6 @@ class HashingVectorizer(object):
         else:
             self.tf_vectors = np.vstack((self.tf_vectors, tf_vector))
             tf_vector = self.tf_vectors[-1]
-        if label is not None:
-            self.labels.append(label)
 
         tokens = self.analyzer.analyze(text)
         for token in tokens:
@@ -92,9 +89,10 @@ class HashingVectorizer(object):
                 tf_vector[i] += incr
         tf_vector /= len(tokens) * self.probes
 
-        # update the running DF estimate
-        self.df_counts += tf_vector != 0.0
-        self.sampled += 1
+        if update_estimates:
+            # update the running DF estimate
+            self.df_counts += tf_vector != 0.0
+            self.sampled += 1
         return tf_vector
 
     def get_tfidf(self):