diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst
index 0b1222e20f1ad7dc7a1e55129e9f737d5bd95ad3..97ec275924c7066881feaff68d8bd5a58cb126ea 100644
--- a/doc/modules/feature_extraction.rst
+++ b/doc/modules/feature_extraction.rst
@@ -288,7 +288,7 @@ This model has many parameters, however the default values are quite
 reasonable (please see  the :ref:`reference documentation
 <text_feature_extraction_ref>` for the details)::
 
-  >>> vectorizer = CountVectorizer(min_df=1)
+  >>> vectorizer = CountVectorizer()
   >>> vectorizer                     # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
   CountVectorizer(analyzer=...'word', binary=False, decode_error=...'strict',
           dtype=<... 'numpy.int64'>, encoding=...'utf-8', input=...'content',
@@ -545,7 +545,7 @@ class called :class:`TfidfVectorizer` that combines all the options of
 :class:`CountVectorizer` and :class:`TfidfTransformer` in a single model::
 
   >>> from sklearn.feature_extraction.text import TfidfVectorizer
-  >>> vectorizer = TfidfVectorizer(min_df=1)
+  >>> vectorizer = TfidfVectorizer()
   >>> vectorizer.fit_transform(corpus)
   ...                                # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
   <4x9 sparse matrix of type '<... 'numpy.float64'>'
@@ -695,7 +695,7 @@ A character 2-gram representation, however, would find the documents
 matching in 4 out of 8 features, which may help the preferred classifier
 decide better::
 
-  >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
+  >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
   >>> counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
   >>> ngram_vectorizer.get_feature_names() == (
   ...     [' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp'])
@@ -709,7 +709,7 @@ only from characters inside word boundaries (padded with space on each
 side). The ``'char'`` analyzer, alternatively, creates n-grams that
 span across words::
 
-  >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5), min_df=1)
+  >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5))
   >>> ngram_vectorizer.fit_transform(['jumpy fox'])
   ...                                # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
   <1x4 sparse matrix of type '<... 'numpy.int64'>'
@@ -718,7 +718,7 @@ span across words::
   ...     [' fox ', ' jump', 'jumpy', 'umpy '])
   True
 
-  >>> ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5), min_df=1)
+  >>> ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5))
   >>> ngram_vectorizer.fit_transform(['jumpy fox'])
   ...                                # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
   <1x5 sparse matrix of type '<... 'numpy.int64'>'
@@ -915,6 +915,33 @@ Some tips and tricks:
 
     (Note that this will not filter out punctuation.)
 
+
+    The following example will, for instance, transform some British spelling
+    to American spelling::
+
+        >>> import re
+        >>> def to_american(tokens):
+        ...     for t in tokens:
+        ...         t = re.sub(r"(...)our$", r"\1or", t)
+        ...         t = re.sub(r"([bt])re$", r"\1er", t)
+        ...         t = re.sub(r"([iy])s(e$|ing|ation)", r"\1z\2", t)
+        ...         t = re.sub(r"ogue$", "og", t)
+        ...         yield t
+        ...
+        >>> class CustomVectorizer(CountVectorizer):
+        ...     def build_tokenizer(self):
+        ...         tokenize = super(CustomVectorizer, self).build_tokenizer()
+        ...         return lambda doc: list(to_american(tokenize(doc)))
+        ...
+        >>> print(CustomVectorizer().build_analyzer()(u"color colour")) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+        [...'color', ...'color']
+
+    The same kind of override can be used for other styles of preprocessing,
+    such as stemming (a sketch follows below), lemmatization, or normalizing
+    numerical tokens; the latter is illustrated in:
+
+    * :ref:`sphx_glr_auto_examples_bicluster_plot_bicluster_newsgroups.py`
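+
+    Stemming can be plugged in through the same override. The snippet below
+    is only an illustrative sketch: it assumes the third-party NLTK package,
+    which provides ``SnowballStemmer``, is installed (hence the
+    ``# doctest: +SKIP`` markers)::
+
+        >>> from nltk.stem import SnowballStemmer          # doctest: +SKIP
+        >>> class StemmingVectorizer(CountVectorizer):
+        ...     def build_tokenizer(self):
+        ...         # reuse the default regexp tokenizer, then stem each token
+        ...         tokenize = super(StemmingVectorizer, self).build_tokenizer()
+        ...         stemmer = SnowballStemmer('english')
+        ...         return lambda doc: [stemmer.stem(t) for t in tokenize(doc)]
+        ...
+        >>> StemmingVectorizer().build_analyzer()(u"cats running")  # doctest: +SKIP
+        [...'cat', ...'run']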
+
+
 Customizing the vectorizer can also be useful when handling Asian languages
 that do not use an explicit word separator such as whitespace.
 
diff --git a/examples/bicluster/plot_bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py
index f576e01eb5efbaa4ad37cb4043ac1446b24bd9d1..12d42e23a0f1979b9328a257bd6a749d1c6b3284 100644
--- a/examples/bicluster/plot_bicluster_newsgroups.py
+++ b/examples/bicluster/plot_bicluster_newsgroups.py
@@ -26,7 +26,6 @@ from __future__ import print_function
 
 from collections import defaultdict
 import operator
-import re
 from time import time
 
 import numpy as np
@@ -41,18 +40,20 @@ from sklearn.metrics.cluster import v_measure_score
 print(__doc__)
 
 
-def number_aware_tokenizer(doc):
-    """ Tokenizer that maps all numeric tokens to a placeholder.
+def number_normalizer(tokens):
+    """ Map all numeric tokens to a placeholder.
 
     For many applications, tokens that begin with a number are not directly
     useful, but the fact that such a token exists can be relevant.  By applying
     this form of dimensionality reduction, some methods may perform better.
     """
-    token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
-    tokens = token_pattern.findall(doc)
-    tokens = ["#NUMBER" if token[0] in "0123456789_" else token
-              for token in tokens]
-    return tokens
+    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)
+
+
+class NumberNormalizingVectorizer(TfidfVectorizer):
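+    """TfidfVectorizer that maps tokens starting with a digit to "#NUMBER"."""
+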
+    def build_tokenizer(self):
+        tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
+        return lambda doc: list(number_normalizer(tokenize(doc)))
 
 
 # exclude 'comp.os.ms-windows.misc'
@@ -67,8 +68,7 @@ categories = ['alt.atheism', 'comp.graphics',
 newsgroups = fetch_20newsgroups(categories=categories)
 y_true = newsgroups.target
 
-vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
-                             tokenizer=number_aware_tokenizer)
+vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
 cocluster = SpectralCoclustering(n_clusters=len(categories),
                                  svd_method='arpack', random_state=0)
 kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,