diff --git a/scikits/learn/features/tests/test_text.py b/scikits/learn/features/tests/test_text.py
index 688882c50722b8c2504b19055edcd36802d1f102..2e3462dcb2a8655a9309cd0dddc122c36c3b6958 100644
--- a/scikits/learn/features/tests/test_text.py
+++ b/scikits/learn/features/tests/test_text.py
@@ -41,7 +41,7 @@ def test_simple_analyzer():
     assert_equal(sa.analyze(text), expected)
 
 
-def test_tf_idf():
+def test_dense_tf_idf():
     hv = HashingVectorizer(dim=1000, probes=3)
 
     # junk food documents
diff --git a/scikits/learn/features/text.py b/scikits/learn/features/text.py
index b483f2ecab035b77d79c335e246bc91fce62996c..a0162cd79335ef214ee050a0f59d64301ff3f007 100644
--- a/scikits/learn/features/text.py
+++ b/scikits/learn/features/text.py
@@ -84,6 +84,102 @@ class SimpleAnalyzer(object):
         return tokens
 
 
+class HashingVectorizer(object):
+    """Compute term frequencies vectors using hashed term space
+
+    See the Hashing-trick related papers referenced by John Langford on this
+    page to get a grasp on the usefulness of this representation:
+
+      http://hunch.net/~jl/projects/hash_reps/index.html
+
+    dim is the number of buckets, higher dim means lower collision rate but
+    also higher memory requirements and higher processing times on the
+    resulting tfidf vectors.
+
+    Documents is a sequence of lists of tokens to initialize the DF estimates.
+
+    TODO handle bigrams in a smart way such as demonstrated here:
+
+      http://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
+
+    """
+    # TODO: implement me using the murmurhash that might be faster: but profile
+    # me first :)
+
+    # TODO: make it possible to select between the current dense representation
+    # and sparse alternatives from scipy.sparse once the liblinear and libsvm
+    # wrappers have been updated to be able to handle it efficiently
+
+    def __init__(self, dim=5000, probes=1, analyzer=SimpleAnalyzer(),
+                 use_idf=True):
+        self.dim = dim
+        self.probes = probes
+        self.analyzer = analyzer
+        self.use_idf = use_idf
+
+        # start counts at one to avoid zero division while
+        # computing IDF
+        self.df_counts = np.ones(dim, dtype=long)
+        self.tf_vectors = None
+        self.sampled = 0
+
+    def hash_sign(self, token, probe=0):
+        h = hash(token + (probe * u"#"))
+        return abs(h) % self.dim, 1.0 if h % 2 == 0 else -1.0
+
+    def sample_document(self, text, tf_vector=None, update_estimates=True):
+        """Extract features from text and update running freq estimates"""
+        if tf_vector is None:
+            # allocate term frequency vector and stack to history
+            tf_vector = np.zeros(self.dim, np.float64)
+            if self.tf_vectors is None:
+                self.tf_vectors = tf_vector.reshape((1, self.dim))
+            else:
+                self.tf_vectors = np.vstack((self.tf_vectors, tf_vector))
+            tf_vector = self.tf_vectors[-1]
+
+        tokens = self.analyzer.analyze(text)
+        for token in tokens:
+            # TODO add support for cooccurence tokens in a sentence
+            # window
+            for probe in xrange(self.probes):
+                i, incr = self.hash_sign(token, probe)
+                tf_vector[i] += incr
+        tf_vector /= len(tokens) * self.probes
+
+        if update_estimates and self.use_idf:
+            # update the running DF estimate
+            self.df_counts += tf_vector != 0.0
+            self.sampled += 1
+        return tf_vector
+
+    def get_idf(self):
+        return np.log(float(self.sampled) / self.df_counts)
+
+    def get_tfidf(self):
+        """Compute the TF-log(IDF) vectors of the sampled documents"""
+        if self.tf_vectors is None:
+            return None
+        return self.tf_vectors * self.get_idf()
+
+    def vectorize(self, document_filepaths):
+        """Vectorize a batch of documents"""
+        tf_vectors = np.zeros((len(document_filepaths), self.dim))
+        for i, filepath in enumerate(document_filepaths):
+            self.sample_document(file(filepath).read(), tf_vectors[i])
+
+        if self.tf_vectors is None:
+            self.tf_vectors = tf_vectors
+        else:
+            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))
+
+    def get_vectors(self):
+        if self.use_idf:
+            return self.get_tfidf()
+        else:
+            return self.tf_vectors
+
+
 class SparseHashingVectorizer(object):
     """Compute term frequencies vectors using hashed term space in sparse matrix
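
For quick reference, here is a minimal usage sketch of the dense HashingVectorizer introduced by this patch (not part of the diff itself; the import path mirrors the file touched above, and the two sample strings and the resulting shape are illustrative assumptions):

    from scikits.learn.features.text import HashingVectorizer

    # 1000 hash buckets, each token hashed with 3 signed probes
    hv = HashingVectorizer(dim=1000, probes=3)

    # hash each document into a signed term-frequency row and update
    # the running document-frequency estimates used for the IDF weights
    hv.sample_document(u"the quick brown fox jumps over the lazy dog")
    hv.sample_document(u"the lazy dog sleeps all day long")

    # with use_idf=True (the default) this returns the TF-log(IDF) matrix,
    # one row per sampled document: shape (2, 1000) here
    vectors = hv.get_vectors()

The vectorize() method does the same for a batch of document file paths, and get_vectors() falls back to the raw term-frequency rows when use_idf=False.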