diff --git a/scikits/learn/features/tests/test_text.py b/scikits/learn/features/tests/test_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..421361466d8b7fad268b7e89b9eb713c851e95ef
--- /dev/null
+++ b/scikits/learn/features/tests/test_text.py
@@ -0,0 +1,70 @@
+from scikits.learn.features.text import strip_accents
+from scikits.learn.features.text import SimpleAnalyzer
+from scikits.learn.features.text import HashingVectorizer
+from scikits.learn.logistic import LogisticRegression
+from nose.tools import *
+
+
+def test_strip_accents():
+    # check some classical latin accented characters
+    a = u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
+    expected = u'aaaaaaceeee'
+    assert_equal(strip_accents(a), expected)
+
+    a = u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
+    expected = u'iiiinooooouuuuy'
+    assert_equal(strip_accents(a), expected)
+
+    # check some arabic
+    a = u'\u0625'  # alef with a hamza below
+    expected = u'\u0627'  # plain alef
+    assert_equal(strip_accents(a), expected)
+
+    # mix of accented and unaccented letters
+    a = u"this is \xe0 test"
+    expected = u'this is a test'
+    assert_equal(strip_accents(a), expected)
+
+
+def test_simple_analyzer():
+    sa = SimpleAnalyzer()
+
+    text = u"J'ai mang\xe9 du kangourou ce midi, c'\xe9tait pas tr\xe8s bon."
+    expected = [u'ai', u'mange', u'du', u'kangourou', u'ce', u'midi',
+                u'etait', u'pas', u'tres', u'bon']
+    assert_equal(sa.analyze(text), expected)
+
+    text = "This is a test, really.\n\n I met Harry yesterday."
+    expected = [u'this', u'is', u'test', u'really', u'met', u'harry',
+                u'yesterday']
+    assert_equal(sa.analyze(text), expected)
+
+
+def test_tf_idf():
+    hv = HashingVectorizer(dim=1000, probes=3)
+
+    # junk food documents
+    hv.sample_document("the pizza pizza beer", label=-1)
+    hv.sample_document("the pizza pizza beer", label=-1)
+    hv.sample_document("the the pizza beer beer", label=-1)
+    hv.sample_document("the pizza beer beer", label=-1)
+    hv.sample_document("the coke beer coke", label=-1)
+    hv.sample_document("the coke pizza pizza", label=-1)
+
+    # non-junk food documents
+    hv.sample_document("the salad celeri", label=1)
+    hv.sample_document("the salad salad sparkling water", label=1)
+    hv.sample_document("the the celeri celeri", label=1)
+    hv.sample_document("the tomato tomato salad water", label=1)
+    hv.sample_document("the tomato salad water", label=1)
+
+    # extract the TF-IDF data
+    X, y = hv.get_tfidf(), hv.labels
+    assert_equal(X.shape, (11, 1000))
+    assert_equal(len(y), 11)
+
+    # train and test a classifier
+    clf = LogisticRegression().fit(X[1:-1], y[1:-1])
+    assert_equal(clf.predict([X[0]]), [-1])
+    assert_equal(clf.predict([X[-1]]), [1])
+
diff --git a/scikits/learn/features/text.py b/scikits/learn/features/text.py
index 4f35ed8e7d1ed03ed8c539aa0f14ba3e40670353..adc9f2e0d9f806fc38a4a68b1d12fbb6d86854f9 100644
--- a/scikits/learn/features/text.py
+++ b/scikits/learn/features/text.py
@@ -101,18 +101,14 @@ class HashingVectorizer(object):
         """Compute the TF-log(IDF) vectors of the sampled documents"""
         return self.tf_vectors * np.log(float(self.sampled) / self.df_counts)
 
+    def vectorize(self, root_folder):
+        """Scan a folder structure for text documents and estimate frequencies
 
-if __name__ == "__main__":
+        If this is a two-level folder structure, the first level is assumed to
+        hold the categories to be used as labels for supervised learning.
+        """
+        # TODO: implement me!
+        pass
-
-    # TODO: write unittests instead!
-    hv = HashingVectorizer(dim=10, probes=2)
-    print hv.analyzer.analyze(u"This is a s\xe9ntence named Mary; with puncts...")
-    print hv.sample_document("This is a test document.")
-    print hv.sample_document("This is not a test.")
-    print hv.sample_document("document document document this toto")
-    print hv.tf_vectors
-    print hv.df_counts
-    print hv.sampled
-    print hv.get_tfidf()
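
Note on the TF-log(IDF) weighting visible as context in the second hunk:
get_tfidf scales each term-frequency vector by log(sampled / df_counts), so a
hash bucket hit by every sampled document gets weight zero. A standalone numpy
sketch of that formula with toy values (illustration only, not using the
patched module):

    import numpy as np

    # term frequencies of 2 documents over 4 hash buckets
    tf_vectors = np.array([[0.5, 0.5, 0.0,  0.0],
                           [0.0, 0.5, 0.25, 0.25]])
    sampled = 2                                  # number of sampled documents
    df_counts = np.array([1.0, 2.0, 1.0, 1.0])  # documents hitting each bucket

    # bucket 1 appears in both documents, so log(2/2) zeroes it out
    tfidf = tf_vectors * np.log(float(sampled) / df_counts)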
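
The new vectorize method is committed as a stub ("# TODO: implement me!"). A
minimal sketch of one possible implementation, assuming plain-text document
files and that sample_document accepts the folder name as a label (the tests
above only use the integer labels -1 and 1):

    import os

    def vectorize(self, root_folder):
        """Scan a folder structure for text documents and estimate frequencies"""
        for category in sorted(os.listdir(root_folder)):
            category_folder = os.path.join(root_folder, category)
            if not os.path.isdir(category_folder):
                continue
            # first folder level gives the supervised learning label
            for filename in sorted(os.listdir(category_folder)):
                filepath = os.path.join(category_folder, filename)
                if os.path.isfile(filepath):
                    document = open(filepath).read()
                    self.sample_document(document, label=category)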