diff --git a/scikits/learn/feature_extraction/text.py b/scikits/learn/feature_extraction/text.py
index d01c914ebda02576629f505e0df678a534971488..d9a6b0e74fc0892263855ce87ef688cd235e4632 100644
--- a/scikits/learn/feature_extraction/text.py
+++ b/scikits/learn/feature_extraction/text.py
@@ -63,13 +63,13 @@ def strip_tags(s):
     return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub("", s)
 
 
-class Filter(object):
+class DefaultPreprocessor(object):
 
-    def filter(self, text):
+    def preprocess(self, text):
         return strip_accents(strip_tags(text.lower()))
 
 
-DEFAULT_FILTER = Filter()
+DEFAULT_PREPROCESSOR = DefaultPreprocessor()
 
 
 class WordNGramAnalyzer(BaseEstimator):
@@ -86,12 +86,12 @@ class WordNGramAnalyzer(BaseEstimator):
     token_pattern = re.compile(r"\b\w\w+\b", re.UNICODE)
 
     def __init__(self, charset='utf-8', min_n=1, max_n=1,
-                 filter=DEFAULT_FILTER, stop_words=None):
+                 preprocessor=DEFAULT_PREPROCESSOR, stop_words=None):
         self.charset = charset
         self.stop_words = stop_words
         self.min_n = min_n
         self.max_n = max_n
-        self.filter = filter
+        self.preprocessor = preprocessor
 
     def analyze(self, text_document):
         if isinstance(text_document, file):
@@ -100,7 +100,7 @@ class WordNGramAnalyzer(BaseEstimator):
         if isinstance(text_document, str):
             text_document = text_document.decode(self.charset, 'ignore')
 
-        text_document = self.filter.filter(text_document)
+        text_document = self.preprocessor.preprocess(text_document)
 
         # word boundaries tokenizer
         tokens = self.token_pattern.findall(text_document)
@@ -135,15 +135,17 @@ class CharNGramAnalyzer(BaseEstimator):
 
     white_spaces = re.compile(r"\s\s+")
 
-    def __init__(self, charset='utf-8', min_n=3, max_n=6):
+    def __init__(self, charset='utf-8', preprocessor=DEFAULT_PREPROCESSOR,
+                 min_n=3, max_n=6):
         self.charset = charset
         self.min_n = min_n
         self.max_n = max_n
+        self.preprocessor = preprocessor
 
     def analyze(self, text_document):
         if isinstance(text_document, str):
             text_document = text_document.decode(self.charset, 'ignore')
-        text_document = strip_accents(text_document.lower())
+        text_document = self.preprocessor.preprocess(text_document)
 
         # normalize white spaces
         text_document = self.white_spaces.sub(" ", text_document)
@@ -162,8 +164,7 @@ DEFAULT_ANALYZER = WordNGramAnalyzer(min_n=1, max_n=1)
 
 
 class BaseCountVectorizer(BaseEstimator):
-    """
-    Convert a collection of raw documents to a matrix of token counts.
+    """Convert a collection of raw documents to a matrix of token counts
 
     This class can't be used directly, use either CountVectorizer or
     SparseCountVectorizer.
@@ -235,8 +236,7 @@ class BaseCountVectorizer(BaseEstimator):
         return vectors
 
     def fit(self, raw_documents, y=None):
-        """
-        Learn the vocabulary dictionary.
+        """Learn a vocabulary dictionary of all tokens in the raw documents
 
         Parameters
         ----------
@@ -249,12 +249,10 @@ class BaseCountVectorizer(BaseEstimator):
         self
         """
         self.fit_transform(raw_documents)
-
         return self
 
     def fit_transform(self, raw_documents, y=None):
-        """
-        Learn the vocabulary dictionary and return the vectors.
+        """Learn the vocabulary dictionary and return the count vectors
 
         This is more efficient than calling fit followed by transform.
 
@@ -269,12 +267,10 @@ class BaseCountVectorizer(BaseEstimator):
         vectors: array, [n_samples, n_features]
         """
         vectors, self.vocabulary = self._build_vectors_and_vocab(raw_documents)
-
         return vectors
 
     def transform(self, raw_documents):
-        """
-        Return the vectors.
+        """Extract token counts out of raw text documents
 
         Parameters
         ----------
@@ -291,11 +287,40 @@ class BaseCountVectorizer(BaseEstimator):
         """
         return self._build_vectors(raw_documents)
 
+
 class CountVectorizer(BaseCountVectorizer):
+    """Convert a collection of raw documents to a matrix of token counts
+
+    This implementation produces a dense representation of the counts using
+    a numpy array.
+
+    If you do not provide an a-priori dictionary and you do not use an
+    analyzer that does some kind of feature selection, then the number of
+    features (the vocabulary size found by analysing the data) might be very
+    large and the count vectors might not fit in memory.
+
+    In this case it is recommended to use either the SparseCountVectorizer
+    variant of this class or a HashingVectorizer that will reduce the
+    dimensionality to an arbitrary number by using random projection.
+
+    Parameters
+    ----------
+    analyzer: WordNGramAnalyzer or CharNGramAnalyzer, optional
+
+    vocabulary: dict, optional
+        A dictionary where keys are tokens and values are indices in the
+        matrix.
+        This is useful in order to fix the vocabulary in advance.
+
+    dtype: type, optional
+        Type of the matrix returned by fit_transform() or transform().
+    """
+
 
     def _init_matrix(self, shape):
         return np.zeros(shape, dtype=self.dtype)
 
+
 class BaseTfidfTransformer(BaseEstimator):
     """
     Transform a count matrix to a TF (term-frequency)
@@ -320,11 +345,12 @@ class BaseTfidfTransformer(BaseEstimator):
         self.use_idf = use_idf
         self.idf = None
 
+
 class TfidfTransformer(BaseTfidfTransformer):
+    # TODO: write docstring!
 
     def fit(self, X, y=None):
-        """
-        Learn the IDF vector (global term weights).
+        """Learn the IDF vector (global term weights)
 
         Parameters
         ----------
@@ -341,8 +367,7 @@ class TfidfTransformer(BaseTfidfTransformer):
         return self
 
     def transform(self, X, copy=True):
-        """
-        Transform a count matrix to a TF or TF-IDF representation.
+        """Transform a count matrix to a TF or TF-IDF representation
 
         Parameters
         ----------
@@ -364,9 +389,9 @@ class TfidfTransformer(BaseTfidfTransformer):
 
         return X
 
+
 class BaseVectorizer(BaseEstimator):
-    """
-    Convert a collection of raw documents to a matrix.
+    """Convert a collection of raw documents to a matrix
 
     This class can't be used directly, use either Vectorizer or
     SparseVectorizer.
@@ -413,9 +438,9 @@ class BaseVectorizer(BaseEstimator):
         X = self.tc.transform(raw_documents)
         return self.tfidf.transform(X, copy)
 
+
 class Vectorizer(BaseVectorizer):
-    """
-    Convert a collection of raw documents to a matrix.
+    """Convert a collection of raw documents to a matrix
 
     Equivalent to CountVectorizer followed by TfidfTransformer.
     """
@@ -428,6 +453,9 @@ class Vectorizer(BaseVectorizer):
         self.tfidf = TfidfTransformer(use_tf, use_idf)
 
 
+# TODO: refactor the HashingVectorizer implementation to reuse the
+# BaseVectorizer infrastructure as much as possible and align the API
+
 class HashingVectorizer(object):
     """Compute term frequencies vectors using hashed term space