diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst
index 0b1222e20f1ad7dc7a1e55129e9f737d5bd95ad3..97ec275924c7066881feaff68d8bd5a58cb126ea 100644
--- a/doc/modules/feature_extraction.rst
+++ b/doc/modules/feature_extraction.rst
@@ -288,7 +288,7 @@ This model has many parameters, however the default values are quite
 reasonable (please see the :ref:`reference documentation
 <text_feature_extraction_ref>` for the details)::
 
-  >>> vectorizer = CountVectorizer(min_df=1)
+  >>> vectorizer = CountVectorizer()
   >>> vectorizer                     # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
   CountVectorizer(analyzer=...'word', binary=False, decode_error=...'strict',
           dtype=<... 'numpy.int64'>, encoding=...'utf-8', input=...'content',
@@ -545,7 +545,7 @@ class called :class:`TfidfVectorizer` that combines all the options of
 :class:`CountVectorizer` and :class:`TfidfTransformer` in a single model::
 
   >>> from sklearn.feature_extraction.text import TfidfVectorizer
-  >>> vectorizer = TfidfVectorizer(min_df=1)
+  >>> vectorizer = TfidfVectorizer()
   >>> vectorizer.fit_transform(corpus)
   ...                                # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
   <4x9 sparse matrix of type '<... 'numpy.float64'>'
@@ -695,7 +695,7 @@ A character 2-gram representation, however, would find the documents
 matching in 4 out of 8 features, which may help the preferred classifier
 decide better::
 
-  >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
+  >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
   >>> counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
   >>> ngram_vectorizer.get_feature_names() == (
   ...     [' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp'])
@@ -709,7 +709,7 @@ only from characters inside word boundaries (padded with space on each
 side). The ``'char'`` analyzer, alternatively, creates n-grams that
 span across words::
 
-  >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5), min_df=1)
+  >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5))
   >>> ngram_vectorizer.fit_transform(['jumpy fox'])
   ...                                # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
   <1x4 sparse matrix of type '<... 'numpy.int64'>'
@@ -718,7 +718,7 @@ span across words::
   ...     [' fox ', ' jump', 'jumpy', 'umpy '])
   True
 
-  >>> ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5), min_df=1)
+  >>> ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5))
   >>> ngram_vectorizer.fit_transform(['jumpy fox'])
   ...                                # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
   <1x5 sparse matrix of type '<... 'numpy.int64'>'
@@ -915,6 +915,33 @@ Some tips and tricks:
 
     (Note that this will not filter out punctuation.)
 
+
+    The following example will, for instance, transform some British spelling
+    to American spelling::
+
+      >>> import re
+      >>> def to_british(tokens):
+      ...     for t in tokens:
+      ...         t = re.sub(r"(...)our$", r"\1or", t)
+      ...         t = re.sub(r"([bt])re$", r"\1er", t)
+      ...         t = re.sub(r"([iy])s(e$|ing|ation)", r"\1z\2", t)
+      ...         t = re.sub(r"ogue$", "og", t)
+      ...         yield t
+      ...
+      >>> class CustomVectorizer(CountVectorizer):
+      ...     def build_tokenizer(self):
+      ...         tokenize = super(CustomVectorizer, self).build_tokenizer()
+      ...         return lambda doc: list(to_british(tokenize(doc)))
+      ...
+      >>> print(CustomVectorizer().build_analyzer()(u"color colour"))  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+      [...'color', ...'color']
+
+    The same approach can be used for other styles of preprocessing, such as
+    stemming or lemmatization; normalizing numerical tokens is illustrated in:
+
+    * :ref:`sphx_glr_auto_examples_bicluster_plot_bicluster_newsgroups.py`
+
+
 Customizing the vectorizer can also be useful when handling Asian languages
 that do not use an explicit word separator such as whitespace.
 
diff --git a/examples/bicluster/plot_bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py
index f576e01eb5efbaa4ad37cb4043ac1446b24bd9d1..12d42e23a0f1979b9328a257bd6a749d1c6b3284 100644
--- a/examples/bicluster/plot_bicluster_newsgroups.py
+++ b/examples/bicluster/plot_bicluster_newsgroups.py
@@ -26,7 +26,6 @@ from __future__ import print_function
 
 from collections import defaultdict
 import operator
-import re
 from time import time
 
 import numpy as np
@@ -41,18 +40,20 @@ from sklearn.metrics.cluster import v_measure_score
 
 print(__doc__)
 
-def number_aware_tokenizer(doc):
-    """ Tokenizer that maps all numeric tokens to a placeholder.
+def number_normalizer(tokens):
+    """ Map all numeric tokens to a placeholder.
 
     For many applications, tokens that begin with a number are not directly
     useful, but the fact that such a token exists can be relevant.  By applying
     this form of dimensionality reduction, some methods may perform better.
     """
-    token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
-    tokens = token_pattern.findall(doc)
-    tokens = ["#NUMBER" if token[0] in "0123456789_" else token
-              for token in tokens]
-    return tokens
+    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)
+
+
+class NumberNormalizingVectorizer(TfidfVectorizer):
+    def build_tokenizer(self):
+        tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
+        return lambda doc: list(number_normalizer(tokenize(doc)))
 
 
 # exclude 'comp.os.ms-windows.misc'
@@ -67,8 +68,7 @@ categories = ['alt.atheism', 'comp.graphics',
 newsgroups = fetch_20newsgroups(categories=categories)
 y_true = newsgroups.target
 
-vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
-                             tokenizer=number_aware_tokenizer)
+vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
 cocluster = SpectralCoclustering(n_clusters=len(categories),
                                  svd_method='arpack', random_state=0)
 kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
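For reviewers who want to try the ``build_tokenizer`` override outside the patched example, a minimal self-contained sketch is shown below; it re-defines the helper and vectorizer exactly as in the patch, while the sample sentence and the ``build_analyzer`` call are illustrative only and assume a standard scikit-learn install::

    from sklearn.feature_extraction.text import TfidfVectorizer

    def number_normalizer(tokens):
        # Map every token that starts with a digit to a single placeholder.
        return ("#NUMBER" if token[0].isdigit() else token for token in tokens)

    class NumberNormalizingVectorizer(TfidfVectorizer):
        def build_tokenizer(self):
            # Wrap the stock tokenizer so its output is normalized before
            # n-gram extraction and stop-word filtering take place.
            tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
            return lambda doc: list(number_normalizer(tokenize(doc)))

    vectorizer = NumberNormalizingVectorizer(stop_words='english')
    # The analyzer now emits the placeholder instead of numeric tokens;
    # this is expected to print something like ['gpu', '#NUMBER', 'gb', 'ram'].
    print(vectorizer.build_analyzer()(u"The GPU has 12 GB of RAM"))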