diff --git a/scikits/learn/feature_extraction/sparse/text.py b/scikits/learn/feature_extraction/sparse/text.py
index 290f4d9f4b17f050c684cb9267d0b77b922c770d..dcc69bcfbb003b0def4bcb9dc0985ecccc5a2ef9 100644
--- a/scikits/learn/feature_extraction/sparse/text.py
+++ b/scikits/learn/feature_extraction/sparse/text.py
@@ -64,13 +64,6 @@ class TfidfTransformer(BaseTfidfTransformer):
             d.setdiag(self.idf)
             X = X * d
 
-        if self.normalize:
-            norms = X.multiply(X).sum(axis=1)
-            norms = np.sqrt(np.array(norms).ravel())
-
-            for doc, token in zip(*X.nonzero()):
-                X[doc, token] /= norms[doc]
-
         return X
 
 class Vectorizer(BaseVectorizer):
@@ -83,10 +76,9 @@ class Vectorizer(BaseVectorizer):
     def __init__(self,
                  analyzer=DEFAULT_ANALYZER,
                  use_tf=True,
-                 use_idf=True,
-                 normalize=False):
+                 use_idf=True):
         self.tc = CountVectorizer(analyzer, dtype=np.float64)
-        self.tfidf = TfidfTransformer(use_tf, use_idf, normalize)
+        self.tfidf = TfidfTransformer(use_tf, use_idf)
 
 class HashingVectorizer(object):
     """Compute term freq vectors using hashed term space in a sparse matrix
diff --git a/scikits/learn/feature_extraction/tests/test_text.py b/scikits/learn/feature_extraction/tests/test_text.py
index 2ce2e325b1ab5ee7855968c961bd29010a1bb19e..6a10c80f471488d0a7058aef40fc666050ba1bae 100644
--- a/scikits/learn/feature_extraction/tests/test_text.py
+++ b/scikits/learn/feature_extraction/tests/test_text.py
@@ -217,11 +217,6 @@ def _test_vectorizer(cv_class, tf_class, v_class):
     assert_array_almost_equal(np.sum(tf, axis=1),
                               [1.0] * n_train)
 
-    # test normalization
-    t3 = tf_class(normalize=True)
-    tfidf_n = toarray(t3.fit(counts_train).transform(counts_train))
-    assert_equal(la.norm(tfidf_n[0]), 1.0)
-
     # test the direct tfidf vectorizer
     # (equivalent to term count vectorizer + tfidf transformer)
     train_data = iter(JUNK_FOOD_DOCS[:-1])
diff --git a/scikits/learn/feature_extraction/text.py b/scikits/learn/feature_extraction/text.py
index 26a7c81ff8c44c2b0cd63a61f62798104f2c68fa..418dcdc8d367fdf5785e9885fda4be705223b9d5 100644
--- a/scikits/learn/feature_extraction/text.py
+++ b/scikits/learn/feature_extraction/text.py
@@ -313,15 +313,11 @@ class BaseTfidfTransformer(BaseEstimator):
 
         use_idf: boolean
             enable inverse-document-frequency reweighting
-
-        normalize: boolean
-            normalize vectors to unit-length
     """
 
-    def __init__(self, use_tf=True, use_idf=True, normalize=False):
+    def __init__(self, use_tf=True, use_idf=True):
         self.use_tf = use_tf
         self.use_idf = use_idf
-        self.normalize = normalize
         self.idf = None
 
 class TfidfTransformer(BaseTfidfTransformer):
@@ -366,9 +362,6 @@ class TfidfTransformer(BaseTfidfTransformer):
         if self.use_idf:
             X *= self.idf
 
-        if self.normalize:
-            X /= np.sqrt(np.sum(X ** 2, axis=1))[:,np.newaxis]
-
         return X
 
 class BaseVectorizer(BaseEstimator):
@@ -428,10 +421,9 @@ class Vectorizer(BaseVectorizer):
     def __init__(self,
                  analyzer=DEFAULT_ANALYZER,
                  use_tf=True,
-                 use_idf=True,
-                 normalize=False):
+                 use_idf=True):
         self.tc = CountVectorizer(analyzer, dtype=np.float64)
-        self.tfidf = TfidfTransformer(use_tf, use_idf, normalize)
+        self.tfidf = TfidfTransformer(use_tf, use_idf)
 
 class HashingVectorizer(object):
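
Note (not part of the patch): with the normalize option removed, callers that still want unit-length rows can apply the equivalent row-wise L2 scaling themselves. The sketch below mirrors what the deleted code paths computed, for both dense arrays and sparse matrices; the helper name l2_normalize_rows is hypothetical, not a scikits.learn API.

    import numpy as np
    import scipy.sparse as sp

    def l2_normalize_rows(X):
        # Hypothetical helper: scale each row to unit Euclidean length,
        # reproducing the behaviour of the removed normalize=True path.
        if sp.issparse(X):
            X = sp.csr_matrix(X, dtype=np.float64, copy=True)
            norms = np.sqrt(np.asarray(X.multiply(X).sum(axis=1)).ravel())
            norms[norms == 0.0] = 1.0          # leave all-zero rows untouched
            # divide the stored values of each row by that row's norm
            X.data /= np.repeat(norms, np.diff(X.indptr))
            return X
        X = np.asarray(X, dtype=np.float64)
        norms = np.sqrt((X ** 2).sum(axis=1))
        norms[norms == 0.0] = 1.0
        return X / norms[:, np.newaxis]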