diff --git a/scikits/learn/datasets/mlcomp.py b/scikits/learn/datasets/mlcomp.py
index 9e87eebb6434eebedbf2dd324e7c5d7a89cfd85e..bde4a7f1ffb10cec865e268e68723f7f1f1ebe4f 100644
--- a/scikits/learn/datasets/mlcomp.py
+++ b/scikits/learn/datasets/mlcomp.py
@@ -4,11 +4,29 @@
 import os
 
 from scikits.learn.datasets.base import Bunch
+from scikits.learn.features.text import HashingVectorizer
 
 
-def load_document_classification(dataset_path, metadata, **kw):
-    return Bunch(data=None, target=None, target_names=None,
-                 DESCR=metadata.get('description'))
+def load_document_classification(dataset_path, metadata, set_, **kw):
+    """Loader implementation for the DocumentClassification format"""
+    target = []
+    target_names = {}
+    vectorizer = kw.get('vectorizer', HashingVectorizer())
+
+    dataset_path = os.path.join(dataset_path, set_)
+    folders = [f for f in sorted(os.listdir(dataset_path))
+               if os.path.isdir(os.path.join(dataset_path, f))]
+    for label, folder in enumerate(folders):
+        target_names[label] = folder
+        folder_path = os.path.join(dataset_path, folder)
+        documents = [os.path.join(folder_path, d)
+                     for d in sorted(os.listdir(folder_path))]
+        vectorizer.vectorize(documents)
+        target.extend(len(documents) * [label])
+
+    return Bunch(data=vectorizer.get_vectors(), target=target,
+                 target_names=target_names, DESCR=metadata.get('description'))
+
 
 LOADERS = {
     'DocumentClassification': load_document_classification,
@@ -16,7 +34,7 @@ LOADERS = {
 }
 
 
-def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
+def load_mlcomp(name_or_id, mlcomp_root=None, set_="raw", **kwargs):
     """Load a datasets as downloaded from http://mlcomp.org
 
     Parameters
@@ -29,6 +47,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
         are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME
         environment variable is looked up instead.
 
+    set_ : select the portion to load: 'train', 'test' or 'raw'
+
     **kwargs : domain specific kwargs to be passed to the dataset loader.
 
     Returns
@@ -60,7 +80,6 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
     if not os.path.exists(mlcomp_root):
         raise ValueError("Could not find folder: " + mlcomp_root)
 
-    # dataset lookup
     if isinstance(name_or_id, int):
         # id lookup
@@ -95,9 +114,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
     loader = LOADERS.get(format)
     if loader is None:
         raise ValueError("No loader implemented for format: " + format)
-    return loader(dataset_path, metadata, **kwargs)
+    return loader(dataset_path, metadata, set_=set_, **kwargs)
 
 
 if __name__ == "__main__":
-    print load_mlcomp('20news-18828')
-    print load_mlcomp(379)
+    twentynews = load_mlcomp('20news-18828')
diff --git a/scikits/learn/features/text.py b/scikits/learn/features/text.py
index 41766e58e4e2d7f1f5fb2422de2766df9d40ba2e..e35e00b1d7a3d78329feaf90faa40e6a886e9156 100644
--- a/scikits/learn/features/text.py
+++ b/scikits/learn/features/text.py
@@ -26,9 +26,12 @@ class SimpleAnalyzer(object):
 
     token_pattern = re.compile(r"\b\w\w+\b", re.U)
 
+    def __init__(self, default_charset='utf-8'):
+        self.charset = default_charset
+
     def analyze(self, text_document):
         if isinstance(text_document, str):
-            text_document = text_document.decode("utf-8")
+            text_document = text_document.decode(self.charset, 'ignore')
         text_document = strip_accents(text_document.lower())
         return re.findall(self.token_pattern, text_document)
 
@@ -54,10 +57,12 @@ class HashingVectorizer(object):
     # TODO: implement me using the murmurhash that might be faster: but profile
     # me first :)
 
-    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer()):
+    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer(),
+                 use_idf=True):
         self.dim = dim
         self.probes = probes
         self.analyzer = analyzer
+        self.use_idf = use_idf
 
         # start counts at one to avoid zero division while
         # computing IDF
@@ -89,24 +94,39 @@ class HashingVectorizer(object):
             tf_vector[i] += incr
         tf_vector /= len(tokens) * self.probes
 
-        if update_estimates:
+        if update_estimates and self.use_idf:
             # update the running DF estimate
             self.df_counts += tf_vector != 0.0
             self.sampled += 1
         return tf_vector
 
+    def get_idf(self):
+        return np.log(float(self.sampled) / self.df_counts)
+
     def get_tfidf(self):
         """Compute the TF-log(IDF) vectors of the sampled documents"""
-        return self.tf_vectors * np.log(float(self.sampled) / self.df_counts)
+        if self.tf_vectors is None:
+            return None
+        return self.tf_vectors * self.get_idf()
+
+    def vectorize(self, document_filepaths):
+        """Vectorize a batch of documents"""
+        tf_vectors = np.zeros((len(document_filepaths), self.dim))
+        for i, filepath in enumerate(document_filepaths):
+            self.sample_document(file(filepath).read(), tf_vectors[i])
+
+        if self.tf_vectors is None:
+            self.tf_vectors = tf_vectors
+        else:
+            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))
+
 
-    def vectorize(self, root_folder):
-        """Scan a folder structure for text documents and estimate frequencies
+    def get_vectors(self):
+        if self.use_idf:
+            return self.get_tfidf()
+        else:
+            return self.tf_vectors
 
-        If this is a 2 level folder structure the first level is assumed to be
-        categories to be used as labels for supervised learning.
-        """
-        # TODO: implement me!
-        pass
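
For context, a minimal usage sketch of the loader API this patch introduces. It assumes the 20news-18828 dataset has been downloaded from mlcomp.org into the folder pointed to by the MLCOMP_DATASETS_HOME environment variable; the `dim` and `use_idf` values below are only illustrative, the `vectorizer` kwarg can be omitted entirely.

    # Usage sketch, not part of the patch: load the 'train' portion of the
    # hypothetical local copy of the 20news-18828 MLComp dataset.
    from scikits.learn.datasets.mlcomp import load_mlcomp
    from scikits.learn.features.text import HashingVectorizer

    # The vectorizer kwarg is optional: the loader falls back to a default
    # HashingVectorizer() when it is not given.
    news_train = load_mlcomp('20news-18828', set_='train',
                             vectorizer=HashingVectorizer(dim=5000,
                                                          use_idf=True))

    print news_train.data.shape    # (n_documents, dim) TF-IDF vectors
    print news_train.target[:5]    # one integer label per document
    print news_train.target_names  # dict mapping label -> folder name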