diff --git a/scikits/learn/feature_extraction/tests/test_text.py b/scikits/learn/feature_extraction/tests/test_text.py
index f5326d9e0f7f6382dff579f0abed487fb3d79d3a..ef85ae791f0db6e2129f83ac8dac5f4fa895d66c 100644
--- a/scikits/learn/feature_extraction/tests/test_text.py
+++ b/scikits/learn/feature_extraction/tests/test_text.py
@@ -1,6 +1,7 @@
 from scikits.learn.feature_extraction.text import CharNGramAnalyzer
 from scikits.learn.feature_extraction.text import WordNGramAnalyzer
 from scikits.learn.feature_extraction.text import strip_accents
+from scikits.learn.feature_extraction.text import to_ascii
 from scikits.learn.feature_extraction.text import CountVectorizer
 from scikits.learn.feature_extraction.text import TfidfTransformer
 
@@ -66,6 +67,27 @@ def test_strip_accents():
     assert_equal(strip_accents(a), expected)
 
 
+def test_to_ascii():
+    # check some classical latin accentuated symbols
+    a = u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
+    expected = u'aaaaaaceeee'
+    assert_equal(to_ascii(a), expected)
+
+    a = u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
+    expected = u'iiiinooooouuuuy'
+    assert_equal(to_ascii(a), expected)
+
+    # check some arabic
+    a = u'\u0625'  # alef with a hamza below
+    expected = u''  # alef has no direct ascii match
+    assert_equal(to_ascii(a), expected)
+
+    # mix letters accentuated and not
+    a = u"this is \xe0 test"
+    expected = u'this is a test'
+    assert_equal(to_ascii(a), expected)
+
+
 def test_word_analyzer_unigrams():
     wa = WordNGramAnalyzer(min_n=1, max_n=1, stop_words=None)
diff --git a/scikits/learn/feature_extraction/text/__init__.py b/scikits/learn/feature_extraction/text/__init__.py
index 585ea1c806a29ce5c6c81a0e8c8cd4d943a9e0ba..70404c438dc55b08afc04eb37102e0fb99c9b07e 100644
--- a/scikits/learn/feature_extraction/text/__init__.py
+++ b/scikits/learn/feature_extraction/text/__init__.py
@@ -1,5 +1,18 @@
+"""Utilities to preprocess text content and vectorize it
 
-from .dense import ENGLISH_STOP_WORDS, strip_accents, strip_tags, \
-                   DefaultPreprocessor, DEFAULT_PREPROCESSOR, \
-                   WordNGramAnalyzer, CharNGramAnalyzer, DEFAULT_ANALYZER, \
-                   CountVectorizer, TfidfTransformer, Vectorizer
+The vectorizers are able to output both dense and sparse representations based
+on the implementation used.
+"""
+
+from .dense import CharNGramAnalyzer
+from .dense import CountVectorizer
+from .dense import DEFAULT_ANALYZER
+from .dense import DEFAULT_PREPROCESSOR
+from .dense import ENGLISH_STOP_WORDS
+from .dense import RomanPreprocessor
+from .dense import TfidfTransformer
+from .dense import Vectorizer
+from .dense import WordNGramAnalyzer
+from .dense import strip_accents
+from .dense import strip_tags
+from .dense import to_ascii
diff --git a/scikits/learn/feature_extraction/text/dense.py b/scikits/learn/feature_extraction/text/dense.py
index 86da448fbea58353e2495b10a0b42f2a25fb8c60..2838187173abe221e2d84dd6e64b45705a582007 100644
--- a/scikits/learn/feature_extraction/text/dense.py
+++ b/scikits/learn/feature_extraction/text/dense.py
@@ -2,7 +2,7 @@
 #          Mathieu Blondel
 #
 # License: BSD Style.
-"""Utilities to build feature vectors from text documents""" +"""Utilities to build dense feature vectors from text documents""" from operator import itemgetter import re @@ -55,24 +55,47 @@ ENGLISH_STOP_WORDS = set([ def strip_accents(s): - """Transform accentuated unicode symbols into their simple counterpart""" - return ''.join((c for c in unicodedata.normalize('NFD', s) - if unicodedata.category(c) != 'Mn')) + """Transform accentuated unicode symbols into their simple counterpart + + Warning: the python-level loop and join operations make this implementation + 20 times slower than the to_ascii basic normalization. + """ + return u''.join([c for c in unicodedata.normalize('NFKD', s) + if not unicodedata.combining(c)]) + + +def to_ascii(s): + """Transform accentuated unicode symbols into ascii or nothing + + Warning: this solution is only suited for roman languages that have a direct + transliteration to ASCII symbols. + + A better solution would be to use transliteration based on a precomputed + unidecode map to be used by translate as explained here: + + http://stackoverflow.com/questions/2854230/ + + """ + nkfd_form = unicodedata.normalize('NFKD', s) + only_ascii = nkfd_form.encode('ASCII', 'ignore') + return only_ascii + def strip_tags(s): return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub("", s) -class DefaultPreprocessor(object): +class RomanPreprocessor(object): + """Fast preprocessor suitable for roman languages""" - def preprocess(self, text): - return strip_accents(strip_tags(text.lower())) + def preprocess(self, unicode_text): + return to_ascii(strip_tags(unicode_text.lower())) def __repr__(self): - return "DefaultPreprocessor()" + return "RomanPreprocessor()" -DEFAULT_PREPROCESSOR = DefaultPreprocessor() +DEFAULT_PREPROCESSOR = RomanPreprocessor() class WordNGramAnalyzer(BaseEstimator): @@ -119,7 +142,7 @@ class WordNGramAnalyzer(BaseEstimator): if n_original_tokens < n: continue for i in xrange(n_original_tokens - n + 1): - tokens.append(" ".join(original_tokens[i: i + n])) + tokens.append(u" ".join(original_tokens[i: i + n])) # handle stop words if self.stop_words is not None: