diff --git a/scikits/learn/datasets/mlcomp.py b/scikits/learn/datasets/mlcomp.py
index fa14da4015ed21af274b528c7b1ada5da384561c..6bdc20fa9962b5b99ee32916b07c7f586d4df0f2 100644
--- a/scikits/learn/datasets/mlcomp.py
+++ b/scikits/learn/datasets/mlcomp.py
@@ -6,7 +6,8 @@ import os
 import numpy as np
 from scikits.learn.datasets.base import load_text_files
 from scikits.learn.feature_extraction.text import HashingVectorizer
-from scikits.learn.feature_extraction.sparse.text import SparseHashingVectorizer
+from scikits.learn.feature_extraction.sparse.text import HashingVectorizer as \
+    SparseHashingVectorizer


 def _load_document_classification(dataset_path, metadata, set_=None):
diff --git a/scikits/learn/feature_extraction/sparse/__init__.py b/scikits/learn/feature_extraction/sparse/__init__.py
index 890ce8f7f14e58c0b8abe1f998cb92f7cfe7cf63..fa0f61b5bd0db51b5ab30989174ba4560966983e 100644
--- a/scikits/learn/feature_extraction/sparse/__init__.py
+++ b/scikits/learn/feature_extraction/sparse/__init__.py
@@ -1,3 +1,3 @@
-from .text import SparseCountVectorizer, SparseTfidfTransformer, \
-    SparseVectorizer, SparseHashingVectorizer
+from .text import CountVectorizer, TfidfTransformer, Vectorizer, \
+    HashingVectorizer

diff --git a/scikits/learn/feature_extraction/sparse/text.py b/scikits/learn/feature_extraction/sparse/text.py
index 1ba7dd71746f1cb3c64ad94f83d2def54aa3b9e7..290f4d9f4b17f050c684cb9267d0b77b922c770d 100644
--- a/scikits/learn/feature_extraction/sparse/text.py
+++ b/scikits/learn/feature_extraction/sparse/text.py
@@ -11,12 +11,12 @@ import scipy.sparse as sp
 from ..text import BaseCountVectorizer, BaseTfidfTransformer, BaseVectorizer, \
     DEFAULT_ANALYZER

-class SparseCountVectorizer(BaseCountVectorizer):
+class CountVectorizer(BaseCountVectorizer):
     def _init_matrix(self, shape):
         return sp.dok_matrix(shape, dtype=self.dtype)


-class SparseTfidfTransformer(BaseTfidfTransformer):
+class TfidfTransformer(BaseTfidfTransformer):

     def fit(self, X, y=None):
         """
@@ -73,11 +73,11 @@ class SparseTfidfTransformer(BaseTfidfTransformer):
         return X


-class SparseVectorizer(BaseVectorizer):
+class Vectorizer(BaseVectorizer):
     """
     Convert a collection of raw documents to a sparse matrix.

-    Equivalent to SparseCountVectorizer followed by SparseTfidfTransformer.
+    Equivalent to CountVectorizer followed by TfidfTransformer.
""" def __init__(self, @@ -85,10 +85,10 @@ class SparseVectorizer(BaseVectorizer): use_tf=True, use_idf=True, normalize=False): - self.tc = SparseCountVectorizer(analyzer, dtype=np.float64) - self.tfidf = SparseTfidfTransformer(use_tf, use_idf, normalize) + self.tc = CountVectorizer(analyzer, dtype=np.float64) + self.tfidf = TfidfTransformer(use_tf, use_idf, normalize) -class SparseHashingVectorizer(object): +class HashingVectorizer(object): """Compute term freq vectors using hashed term space in a sparse matrix The logic is the same as HashingVectorizer but it is possible to use much diff --git a/scikits/learn/feature_extraction/tests/test_text.py b/scikits/learn/feature_extraction/tests/test_text.py index f741ce6a486035e0552dbbd6e2a02843aedb0b96..2ce2e325b1ab5ee7855968c961bd29010a1bb19e 100644 --- a/scikits/learn/feature_extraction/tests/test_text.py +++ b/scikits/learn/feature_extraction/tests/test_text.py @@ -1,14 +1,19 @@ from scikits.learn.feature_extraction.text import CharNGramAnalyzer +from scikits.learn.feature_extraction.text import WordNGramAnalyzer +from scikits.learn.feature_extraction.text import strip_accents + from scikits.learn.feature_extraction.text import CountVectorizer -from scikits.learn.feature_extraction.text import HashingVectorizer from scikits.learn.feature_extraction.text import TfidfTransformer from scikits.learn.feature_extraction.text import Vectorizer -from scikits.learn.feature_extraction.sparse.text import SparseCountVectorizer -from scikits.learn.feature_extraction.sparse.text import SparseHashingVectorizer -from scikits.learn.feature_extraction.sparse.text import SparseTfidfTransformer -from scikits.learn.feature_extraction.sparse.text import SparseVectorizer -from scikits.learn.feature_extraction.text import WordNGramAnalyzer -from scikits.learn.feature_extraction.text import strip_accents +from scikits.learn.feature_extraction.text import HashingVectorizer + +import scikits.learn.feature_extraction.sparse.text as st + +SparseCountVectorizer = st.CountVectorizer +SparseTfidfTransformer = st.TfidfTransformer +SparseVectorizer = st.Vectorizer +SparseHashingVectorizer = st.HashingVectorizer + from scikits.learn.grid_search import GridSearchCV from scikits.learn.pipeline import Pipeline from scikits.learn.svm import LinearSVC as DenseLinearSVC