Commit cba81c15 authored by Olivier Grisel

more work on document classification dataset loader

parent ea20d1c3
@@ -4,11 +4,29 @@
import os
from scikits.learn.datasets.base import Bunch
from scikits.learn.features.text import HashingVectorizer
-def load_document_classification(dataset_path, metadata, **kw):
-    return Bunch(data=None, target=None, target_names=None,
-                 DESCR=metadata.get('description'))
+def load_document_classification(dataset_path, metadata, set_, **kw):
+    """Loader implementation for the DocumentClassification format"""
+    target = []
+    target_names = {}
+    vectorizer = kw.get('vectorizer', HashingVectorizer())
+    dataset_path = os.path.join(dataset_path, set_)
+    folders = [f for f in sorted(os.listdir(dataset_path))
+               if os.path.isdir(os.path.join(dataset_path, f))]
+    for label, folder in enumerate(folders):
+        target_names[label] = folder
+        folder_path = os.path.join(dataset_path, folder)
+        documents = [os.path.join(folder_path, d)
+                     for d in sorted(os.listdir(folder_path))]
+        vectorizer.vectorize(documents)
+        target.extend(len(documents) * [label])
+    return Bunch(data=vectorizer.get_vectors(), target=target,
+                 target_names=target_names, DESCR=metadata.get('description'))
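Not part of the diff, just for orientation: a minimal sketch of how the reworked loader might be called, with a made-up dataset path and metadata dict (Bunch and HashingVectorizer as imported above):

metadata = {'description': 'example document classification dataset'}
vec = HashingVectorizer(dim=5000, probes=3)
# set_ selects the 'train', 'test' or 'raw' sub-folder of the dataset
bunch = load_document_classification('/path/to/dataset', metadata, 'train',
                                     vectorizer=vec)
print bunch.target_names    # e.g. {0: 'alt.atheism', 1: 'comp.graphics', ...}
print bunch.data.shape      # (n_documents, 5000) array of hashed TF-IDF values
print len(bunch.target)     # one integer label per document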
LOADERS = {
    'DocumentClassification': load_document_classification,
@@ -16,7 +34,7 @@ LOADERS = {
}
-def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
+def load_mlcomp(name_or_id, mlcomp_root=None, set_="raw", **kwargs):
    """Load a dataset as downloaded from http://mlcomp.org
    Parameters
@@ -29,6 +47,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
        are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME
        environment variable is looked up instead.
+    set_ : select the portion to load: 'train', 'test' or 'raw'
    **kwargs : domain specific kwargs to be passed to the dataset loader.
    Returns
@@ -60,7 +80,6 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
    if not os.path.exists(mlcomp_root):
        raise ValueError("Could not find folder: " + mlcomp_root)
    # dataset lookup
    if isinstance(name_or_id, int):
        # id lookup
@@ -95,9 +114,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
    loader = LOADERS.get(format)
    if loader is None:
        raise ValueError("No loader implemented for format: " + format)
-    return loader(dataset_path, metadata, **kwargs)
+    return loader(dataset_path, metadata, set_=set_, **kwargs)
if __name__ == "__main__":
-    print load_mlcomp('20news-18828')
-    print load_mlcomp(379)
+    twentynews = load_mlcomp('20news-18828')
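Again outside the diff: with the new set_ argument, the train and test portions of an mlcomp.org dump could be loaded separately, assuming MLCOMP_DATASETS_HOME is set as described in the docstring (dataset name taken from the __main__ block above):

news_train = load_mlcomp('20news-18828', set_='train')
news_test = load_mlcomp('20news-18828', set_='test')
print news_train.data.shape    # one hashed TF-IDF row per training document
print len(news_test.target)    # integer labels for the test documents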
scikits/learn/features/text.py
@@ -26,9 +26,12 @@ class SimpleAnalyzer(object):
    token_pattern = re.compile(r"\b\w\w+\b", re.U)
+    def __init__(self, default_charset='utf-8'):
+        self.charset = default_charset
    def analyze(self, text_document):
        if isinstance(text_document, str):
-            text_document = text_document.decode("utf-8")
+            text_document = text_document.decode(self.charset, 'ignore')
        text_document = strip_accents(text_document.lower())
        return re.findall(self.token_pattern, text_document)
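For illustration (not in the diff), the analyzer lower-cases, strips accents and keeps tokens of at least two word characters, now decoding str input with the configured charset instead of a hard-coded utf-8:

analyzer = SimpleAnalyzer(default_charset='latin-1')
print analyzer.analyze(u"Caf\xe9s serve hot coffee")
# expected: ['cafes', 'serve', 'hot', 'coffee']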
@@ -54,10 +57,12 @@ class HashingVectorizer(object):
    # TODO: implement me using the murmurhash that might be faster: but profile
    # me first :)
-    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer()):
+    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer(),
+                 use_idf=True):
        self.dim = dim
        self.probes = probes
        self.analyzer = analyzer
+        self.use_idf = use_idf
        # start counts at one to avoid zero division while
        # computing IDF
@@ -89,24 +94,39 @@
                tf_vector[i] += incr
        tf_vector /= len(tokens) * self.probes
-        if update_estimates:
+        if update_estimates and self.use_idf:
            # update the running DF estimate
            self.df_counts += tf_vector != 0.0
            self.sampled += 1
        return tf_vector
+    def get_idf(self):
+        return np.log(float(self.sampled) / self.df_counts)
    def get_tfidf(self):
        """Compute the TF-log(IDF) vectors of the sampled documents"""
-        return self.tf_vectors * np.log(float(self.sampled) / self.df_counts)
+        if self.tf_vectors is None:
+            return None
+        return self.tf_vectors * self.get_idf()
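In effect the new get_idf factors the smoothed IDF out of get_tfidf: each stored TF row is scaled by log(sampled / df_counts), with the counters started at one (see the __init__ comment above) so the division never hits zero. A tiny numeric sketch with made-up counts:

import numpy as np
sampled = 100.0                   # documents seen so far
df_j = 1.0 + 5.0                  # bucket j: initial count of one plus 5 occurrences
idf_j = np.log(sampled / df_j)    # ~ 2.81
print 0.02 * idf_j                # a TF of 0.02 in bucket j becomes ~ 0.056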
-    def vectorize(self, root_folder):
-        """Scan a folder structure for text documents and estimate frequencies
+    def vectorize(self, document_filepaths):
+        """Vectorize a batch of documents"""
+        tf_vectors = np.zeros((len(document_filepaths), self.dim))
+        for i, filepath in enumerate(document_filepaths):
+            self.sample_document(file(filepath).read(), tf_vectors[i])
+        if self.tf_vectors is None:
+            self.tf_vectors = tf_vectors
+        else:
+            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))
+    def get_vectors(self):
+        if self.use_idf:
+            return self.get_tfidf()
+        else:
+            return self.tf_vectors
-        If this is a 2 level folder structure the first level is assumed to be
-        categories to be used as labels for supervised learning.
-        """
-        # TODO: implement me!
-        pass
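Finally, a hedged end-to-end sketch of the updated HashingVectorizer API shown in this hunk (file paths are hypothetical): successive vectorize calls stack their TF rows, and get_vectors returns either raw TF or TF-IDF depending on use_idf.

hv = HashingVectorizer(dim=1000, probes=2, use_idf=True)
hv.vectorize(['/data/rec.autos/001.txt', '/data/rec.autos/002.txt'])
hv.vectorize(['/data/sci.space/001.txt'])   # stacked under the first batch
X = hv.get_vectors()                        # TF * log(IDF), shape (3, 1000)

The fixed-size hashing trick keeps memory bounded regardless of vocabulary size, and the multiple probes hash each token into several buckets, which helps soften collisions.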