Commit cba81c15 authored by Olivier Grisel

more work on document classification dataset loader

parent ea20d1c3
@@ -4,11 +4,29 @@
import os
from scikits.learn.datasets.base import Bunch
from scikits.learn.features.text import HashingVectorizer
-def load_document_classification(dataset_path, metadata, **kw):
-    return Bunch(data=None, target=None, target_names=None,
-                 DESCR=metadata.get('description'))
+def load_document_classification(dataset_path, metadata, set_, **kw):
+    """Loader implementation for the DocumentClassification format"""
+    target = []
+    target_names = {}
+    vectorizer = kw.get('vectorizer', HashingVectorizer())
+    dataset_path = os.path.join(dataset_path, set_)
+    folders = [f for f in sorted(os.listdir(dataset_path))
+               if os.path.isdir(os.path.join(dataset_path, f))]
+    for label, folder in enumerate(folders):
+        target_names[label] = folder
+        folder_path = os.path.join(dataset_path, folder)
+        documents = [os.path.join(folder_path, d)
+                     for d in sorted(os.listdir(folder_path))]
+        vectorizer.vectorize(documents)
+        target.extend(len(documents) * [label])
+    return Bunch(data=vectorizer.get_vectors(), target=target,
+                 target_names=target_names, DESCR=metadata.get('description'))
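Not part of the diff, just for orientation: a minimal sketch of how the reworked loader might be called, with a made-up dataset path and metadata dict (Bunch and HashingVectorizer as imported above):

metadata = {'description': 'example document classification dataset'}
vec = HashingVectorizer(dim=5000, probes=3)
# set_ selects the 'train', 'test' or 'raw' sub-folder of the dataset
bunch = load_document_classification('/path/to/dataset', metadata, 'train',
                                     vectorizer=vec)
print bunch.target_names    # e.g. {0: 'alt.atheism', 1: 'comp.graphics', ...}
print bunch.data.shape      # (n_documents, 5000) array of hashed TF-IDF values
print len(bunch.target)     # one integer label per document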
LOADERS = {
    'DocumentClassification': load_document_classification,
@@ -16,7 +34,7 @@ LOADERS = {
}
-def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
+def load_mlcomp(name_or_id, mlcomp_root=None, set_="raw", **kwargs):
    """Load a dataset as downloaded from http://mlcomp.org
    Parameters
@@ -29,6 +47,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
        are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME
        environment variable is looked up instead.
+    set_ : select the portion to load: 'train', 'test' or 'raw'
    **kwargs : domain specific kwargs to be passed to the dataset loader.
    Returns
@@ -60,7 +80,6 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
    if not os.path.exists(mlcomp_root):
        raise ValueError("Could not find folder: " + mlcomp_root)
    # dataset lookup
    if isinstance(name_or_id, int):
        # id lookup
@@ -95,9 +114,8 @@ def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
    loader = LOADERS.get(format)
    if loader is None:
        raise ValueError("No loader implemented for format: " + format)
-    return loader(dataset_path, metadata, **kwargs)
+    return loader(dataset_path, metadata, set_=set_, **kwargs)
if __name__ == "__main__":
-    print load_mlcomp('20news-18828')
-    print load_mlcomp(379)
+    twentynews = load_mlcomp('20news-18828')
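Again outside the diff: with the new set_ argument, the train and test portions of an mlcomp.org dump could be loaded separately, assuming MLCOMP_DATASETS_HOME is set as described in the docstring (dataset name taken from the __main__ block above):

news_train = load_mlcomp('20news-18828', set_='train')
news_test = load_mlcomp('20news-18828', set_='test')
print news_train.data.shape    # one hashed TF-IDF row per training document
print len(news_test.target)    # integer labels for the test documents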
scikits/learn/features/text.py
@@ -26,9 +26,12 @@ class SimpleAnalyzer(object):
    token_pattern = re.compile(r"\b\w\w+\b", re.U)
+    def __init__(self, default_charset='utf-8'):
+        self.charset = default_charset
    def analyze(self, text_document):
        if isinstance(text_document, str):
-            text_document = text_document.decode("utf-8")
+            text_document = text_document.decode(self.charset, 'ignore')
        text_document = strip_accents(text_document.lower())
        return re.findall(self.token_pattern, text_document)
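For illustration (not in the diff), the analyzer lower-cases, strips accents and keeps tokens of at least two word characters, now decoding str input with the configured charset instead of a hard-coded utf-8:

analyzer = SimpleAnalyzer(default_charset='latin-1')
print analyzer.analyze(u"Caf\xe9s serve hot coffee")
# expected: ['cafes', 'serve', 'hot', 'coffee']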
@@ -54,10 +57,12 @@ class HashingVectorizer(object):
    # TODO: implement me using the murmurhash that might be faster: but profile
    # me first :)
-    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer()):
+    def __init__(self, dim=5000, probes=3, analyzer=SimpleAnalyzer(),
+                 use_idf=True):
        self.dim = dim
        self.probes = probes
        self.analyzer = analyzer
+        self.use_idf = use_idf
        # start counts at one to avoid zero division while
        # computing IDF
@@ -89,24 +94,39 @@
                tf_vector[i] += incr
        tf_vector /= len(tokens) * self.probes
-        if update_estimates:
+        if update_estimates and self.use_idf:
            # update the running DF estimate
            self.df_counts += tf_vector != 0.0
            self.sampled += 1
        return tf_vector
+    def get_idf(self):
+        return np.log(float(self.sampled) / self.df_counts)
    def get_tfidf(self):
        """Compute the TF-log(IDF) vectors of the sampled documents"""
-        return self.tf_vectors * np.log(float(self.sampled) / self.df_counts)
+        if self.tf_vectors is None:
+            return None
+        return self.tf_vectors * self.get_idf()
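In effect the new get_idf factors the smoothed IDF out of get_tfidf: each stored TF row is scaled by log(sampled / df_counts), with the counters started at one (see the __init__ comment above) so the division never hits zero. A tiny numeric sketch with made-up counts:

import numpy as np
sampled = 100.0                   # documents seen so far
df_j = 1.0 + 5.0                  # bucket j: initial count of one plus 5 occurrences
idf_j = np.log(sampled / df_j)    # ~ 2.81
print 0.02 * idf_j                # a TF of 0.02 in bucket j becomes ~ 0.056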
-    def vectorize(self, root_folder):
-        """Scan a folder structure for text documents and estimate frequencies
+    def vectorize(self, document_filepaths):
+        """Vectorize a batch of documents"""
+        tf_vectors = np.zeros((len(document_filepaths), self.dim))
+        for i, filepath in enumerate(document_filepaths):
+            self.sample_document(file(filepath).read(), tf_vectors[i])
+        if self.tf_vectors is None:
+            self.tf_vectors = tf_vectors
+        else:
+            self.tf_vectors = np.vstack((self.tf_vectors, tf_vectors))
+    def get_vectors(self):
+        if self.use_idf:
+            return self.get_tfidf()
+        else:
+            return self.tf_vectors
-        If this is a 2 level folder structure the first level is assumed to be
-        categories to be used as labels for supervised learning.
-        """
-        # TODO: implement me!
-        pass
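Finally, a hedged end-to-end sketch of the updated HashingVectorizer API shown in this hunk (file paths are hypothetical): successive vectorize calls stack their TF rows, and get_vectors returns either raw TF or TF-IDF depending on use_idf.

hv = HashingVectorizer(dim=1000, probes=2, use_idf=True)
hv.vectorize(['/data/rec.autos/001.txt', '/data/rec.autos/002.txt'])
hv.vectorize(['/data/sci.space/001.txt'])   # stacked under the first batch
X = hv.get_vectors()                        # TF * log(IDF), shape (3, 1000)

The fixed-size hashing trick keeps memory bounded regardless of vocabulary size, and the multiple probes hash each token into several buckets, which helps soften collisions.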