diff --git a/scikits/learn/feature_extraction/tests/test_text.py b/scikits/learn/feature_extraction/tests/test_text.py
index f5326d9e0f7f6382dff579f0abed487fb3d79d3a..ef85ae791f0db6e2129f83ac8dac5f4fa895d66c 100644
--- a/scikits/learn/feature_extraction/tests/test_text.py
+++ b/scikits/learn/feature_extraction/tests/test_text.py
@@ -1,6 +1,7 @@
 from scikits.learn.feature_extraction.text import CharNGramAnalyzer
 from scikits.learn.feature_extraction.text import WordNGramAnalyzer
 from scikits.learn.feature_extraction.text import strip_accents
+from scikits.learn.feature_extraction.text import to_ascii
 from scikits.learn.feature_extraction.text import CountVectorizer
 from scikits.learn.feature_extraction.text import TfidfTransformer
 
@@ -66,6 +67,27 @@ def test_strip_accents():
     assert_equal(strip_accents(a), expected)
 
 
+def test_to_ascii():
+    # check some classical latin accentuated symbols
+    a = u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
+    expected = u'aaaaaaceeee'
+    assert_equal(to_ascii(a), expected)
+
+    a = u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
+    expected = u'iiiinooooouuuuy'
+    assert_equal(to_ascii(a), expected)
+
+    # check some arabic
+    a = u'\u0625'  # alef with a hamza below
+    expected = u''  # alef has no direct ascii match
+    assert_equal(to_ascii(a), expected)
+
+    # mix letters accentuated and not
+    a = u"this is \xe0 test"
+    expected = u'this is a test'
+    assert_equal(to_ascii(a), expected)
+
+
 def test_word_analyzer_unigrams():
     wa = WordNGramAnalyzer(min_n=1, max_n=1, stop_words=None)
diff --git a/scikits/learn/feature_extraction/text/__init__.py b/scikits/learn/feature_extraction/text/__init__.py
index 585ea1c806a29ce5c6c81a0e8c8cd4d943a9e0ba..70404c438dc55b08afc04eb37102e0fb99c9b07e 100644
--- a/scikits/learn/feature_extraction/text/__init__.py
+++ b/scikits/learn/feature_extraction/text/__init__.py
@@ -1,5 +1,18 @@
+"""Utilities to preprocess text content and vectorize it
 
-from .dense import ENGLISH_STOP_WORDS, strip_accents, strip_tags, \
-                   DefaultPreprocessor, DEFAULT_PREPROCESSOR, \
-                   WordNGramAnalyzer, CharNGramAnalyzer, DEFAULT_ANALYZER, \
-                   CountVectorizer, TfidfTransformer, Vectorizer
+The vectorizers are able to output both dense and sparse representations based
+on the implementation used.
+"""
+
+from .dense import CharNGramAnalyzer
+from .dense import CountVectorizer
+from .dense import DEFAULT_ANALYZER
+from .dense import DEFAULT_PREPROCESSOR
+from .dense import ENGLISH_STOP_WORDS
+from .dense import RomanPreprocessor
+from .dense import TfidfTransformer
+from .dense import Vectorizer
+from .dense import WordNGramAnalyzer
+from .dense import strip_accents
+from .dense import strip_tags
+from .dense import to_ascii
diff --git a/scikits/learn/feature_extraction/text/dense.py b/scikits/learn/feature_extraction/text/dense.py
index 86da448fbea58353e2495b10a0b42f2a25fb8c60..2838187173abe221e2d84dd6e64b45705a582007 100644
--- a/scikits/learn/feature_extraction/text/dense.py
+++ b/scikits/learn/feature_extraction/text/dense.py
@@ -2,7 +2,7 @@
 #          Mathieu Blondel
 #
 # License: BSD Style.
-"""Utilities to build feature vectors from text documents""" +"""Utilities to build dense feature vectors from text documents""" from operator import itemgetter import re @@ -55,24 +55,47 @@ ENGLISH_STOP_WORDS = set([ def strip_accents(s): - """Transform accentuated unicode symbols into their simple counterpart""" - return ''.join((c for c in unicodedata.normalize('NFD', s) - if unicodedata.category(c) != 'Mn')) + """Transform accentuated unicode symbols into their simple counterpart + + Warning: the python-level loop and join operations make this implementation + 20 times slower than the to_ascii basic normalization. + """ + return u''.join([c for c in unicodedata.normalize('NFKD', s) + if not unicodedata.combining(c)]) + + +def to_ascii(s): + """Transform accentuated unicode symbols into ascii or nothing + + Warning: this solution is only suited for roman languages that have a direct + transliteration to ASCII symbols. + + A better solution would be to use transliteration based on a precomputed + unidecode map to be used by translate as explained here: + + http://stackoverflow.com/questions/2854230/ + + """ + nkfd_form = unicodedata.normalize('NFKD', s) + only_ascii = nkfd_form.encode('ASCII', 'ignore') + return only_ascii + def strip_tags(s): return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub("", s) -class DefaultPreprocessor(object): +class RomanPreprocessor(object): + """Fast preprocessor suitable for roman languages""" - def preprocess(self, text): - return strip_accents(strip_tags(text.lower())) + def preprocess(self, unicode_text): + return to_ascii(strip_tags(unicode_text.lower())) def __repr__(self): - return "DefaultPreprocessor()" + return "RomanPreprocessor()" -DEFAULT_PREPROCESSOR = DefaultPreprocessor() +DEFAULT_PREPROCESSOR = RomanPreprocessor() class WordNGramAnalyzer(BaseEstimator): @@ -119,7 +142,7 @@ class WordNGramAnalyzer(BaseEstimator): if n_original_tokens < n: continue for i in xrange(n_original_tokens - n + 1): - tokens.append(" ".join(original_tokens[i: i + n])) + tokens.append(u" ".join(original_tokens[i: i + n])) # handle stop words if self.stop_words is not None: