Commit 4efdd779 authored by Olivier Grisel

readd the dense version of the vectorizer

parent 3b9b8b46
@@ -41,7 +41,7 @@ def test_simple_analyzer():
     assert_equal(sa.analyze(text), expected)

-def test_tf_idf():
+def test_dense_tf_idf():
     hv = HashingVectorizer(dim=1000, probes=3)
     # junk food documents
...
@@ -84,6 +84,102 @@ class SimpleAnalyzer(object):
        return tokens

class HashingVectorizer(object):
    """Compute term frequencies vectors using hashed term space

    See the Hashing-trick related papers referenced by John Langford on this
    page to get a grasp on the usefulness of this representation:

      http://hunch.net/~jl/projects/hash_reps/index.html

    dim is the number of buckets: higher dim means a lower collision rate but
    also higher memory requirements and higher processing times on the
    resulting tfidf vectors.

    Documents is a sequence of lists of tokens to initialize the DF estimates.

    TODO handle bigrams in a smart way such as demonstrated here:

      http://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/

    """
    # TODO: implement me using murmurhash, which might be faster: but profile
    # me first :)

    # TODO: make it possible to select between the current dense representation
    # and sparse alternatives from scipy.sparse once the liblinear and libsvm
    # wrappers have been updated to be able to handle it efficiently

    def __init__(self, dim=5000, probes=1, analyzer=SimpleAnalyzer(),
                 use_idf=True):
        self.dim = dim
        self.probes = probes
        self.analyzer = analyzer
        self.use_idf = use_idf

        # start counts at one to avoid zero division while
        # computing IDF
        self.df_counts = np.ones(dim, dtype=long)
        self.tf_vectors = None
        self.sampled = 0

    def hash_sign(self, token, probe=0):
        """Hash a token (suffixed by the probe number) into a bucket index
        and a +1.0 / -1.0 sign"""
        h = hash(token + (probe * u"#"))
        return abs(h) % self.dim, 1.0 if h % 2 == 0 else -1.0

    def sample_document(self, text, tf_vector=None, update_estimates=True):
        """Extract features from text and update running freq estimates"""
        if tf_vector is None:
            # allocate term frequency vector and stack to history
            tf_vector = np.zeros(self.dim, np.float64)
            if self.tf_vectors is None:
                self.tf_vectors = tf_vector.reshape((1, self.dim))
            else:
                self.tf_vectors = np.vstack((self.tf_vectors, tf_vector))
            tf_vector = self.tf_vectors[-1]

        tokens = self.analyzer.analyze(text)
        for token in tokens:
            # TODO: add support for co-occurrence tokens in a sentence window
            for probe in xrange(self.probes):
                i, incr = self.hash_sign(token, probe)
                tf_vector[i] += incr
        tf_vector /= len(tokens) * self.probes

        if update_estimates and self.use_idf:
            # update the running DF estimate
            self.df_counts += tf_vector != 0.0
            self.sampled += 1
        return tf_vector

    def get_idf(self):
        return np.log(float(self.sampled) / self.df_counts)

    def get_tfidf(self):
        """Compute the TF-log(IDF) vectors of the sampled documents"""
        if self.tf_vectors is None:
            return None
        return self.tf_vectors * self.get_idf()

    def vectorize(self, document_filepaths):
        """Vectorize a batch of documents"""
        tf_vectors = np.zeros((len(document_filepaths), self.dim))
        for i, filepath in enumerate(document_filepaths):
            self.sample_document(file(filepath).read(), tf_vectors[i])

        if self.tf_vectors is None:
            self.tf_vectors = tf_vectors
        else:
            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))

    def get_vectors(self):
        if self.use_idf:
            return self.get_tfidf()
        else:
            return self.tf_vectors
class SparseHashingVectorizer(object):
    """Compute term frequencies vectors using hashed term space in sparse matrix
...
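
A minimal usage sketch of the re-added dense vectorizer, based only on the API visible in this diff (HashingVectorizer(dim, probes), sample_document, get_vectors / get_tfidf). The import path and the example documents are illustrative assumptions, not part of the commit.

# Minimal usage sketch (illustrative, not part of this commit).
# Import path is an assumption: use whatever module the diff above belongs to,
# e.g. something like: from scikits.learn.features.text import HashingVectorizer

hv = HashingVectorizer(dim=1000, probes=3)

# made-up documents, in the spirit of the "junk food documents" used in the test
documents = [
    u"the pizza burger beer copyright",
    u"the pizza pizza beer copyright",
    u"the the copyright copyright",
]

for text in documents:
    # hashes each token into the 1000-bucket space (3 probes per token)
    # and updates the running document-frequency estimates
    hv.sample_document(text)

# with use_idf=True (the default) this returns the TF-log(IDF) matrix,
# one dense row of length dim per sampled document
vectors = hv.get_vectors()
print vectors.shape  # -> (3, 1000)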