From 7759f8b0a58e8bc7ceffa38577d64984a74cc5ef Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Sun, 4 Jul 2010 16:16:56 +0200
Subject: [PATCH] checkpointing work in progress on MLComp dataset integration

---
 scikits/learn/datasets/__init__.py |   2 +
 scikits/learn/datasets/mlcomp.py   | 103 +++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 scikits/learn/datasets/mlcomp.py

diff --git a/scikits/learn/datasets/__init__.py b/scikits/learn/datasets/__init__.py
index 1e1ca225dd..3719162262 100644
--- a/scikits/learn/datasets/__init__.py
+++ b/scikits/learn/datasets/__init__.py
@@ -1 +1,3 @@
 from base import load_iris, load_digits, load_diabetes
+from mlcomp import load_mlcomp
+
diff --git a/scikits/learn/datasets/mlcomp.py b/scikits/learn/datasets/mlcomp.py
new file mode 100644
index 0000000000..9e87eebb64
--- /dev/null
+++ b/scikits/learn/datasets/mlcomp.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2010 Olivier Grisel <olivier.grisel@ensta.org>
+# License: Simplified BSD
+"""Glue code to load http://mlcomp.org data as a scikit.learn dataset"""
+
+import os
+from scikits.learn.datasets.base import Bunch
+
+
+def load_document_classification(dataset_path, metadata, **kw):
+    description = metadata.get('description')  # only DESCR is filled so far
+    return Bunch(data=None, target=None, target_names=None, DESCR=description)
+
+LOADERS = {
+    'DocumentClassification': load_document_classification,
+    # keyed by the 'format' metadata value; TODO: remaining domain formats
+}
+
+
+def _read_stripped_lines(path):
+    """Return the whitespace-stripped lines of a text file, then close it."""
+    fd = open(path)
+    try:
+        return [line.strip() for line in fd]
+    finally:
+        fd.close()
+
+
+def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
+    """Load a dataset as downloaded from http://mlcomp.org
+
+    Parameters
+    ----------
+    name_or_id : the integer id or the string name metadata of the MLComp
+                 dataset to load
+    mlcomp_root : the filesystem path to the root folder where MLComp datasets
+                  are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME
+                  environment variable is looked up instead.
+    **kwargs : domain specific kwargs to be passed to the dataset loader.
+
+    Returns
+    -------
+    data : Bunch
+        Dictionary-like object, the interesting attributes are:
+        'data', the data to learn, 'target', the classification labels,
+        'target_names', the meaning of the labels, and 'DESCR', the
+        full description of the dataset.
+
+    Raises
+    ------
+    ValueError : if MLCOMP_DATASETS_HOME is needed but undefined, or if the
+                 root folder, dataset, metadata file or loader is missing.
+
+    Note on the lookup process: depending on the type of name_or_id,
+    will choose between integer id lookup or metadata name lookup by
+    looking at the unzipped archives and metadata file.
+
+    TODO: implement zip dataset loading too
+    """
+    if mlcomp_root is None:
+        try:
+            mlcomp_root = os.environ['MLCOMP_DATASETS_HOME']
+        except KeyError:
+            raise ValueError("MLCOMP_DATASETS_HOME env variable is undefined")
+    mlcomp_root = os.path.normpath(
+        os.path.abspath(os.path.expanduser(mlcomp_root)))
+    if not os.path.exists(mlcomp_root):
+        raise ValueError("Could not find folder: " + mlcomp_root)
+
+    if isinstance(name_or_id, int):  # id lookup: folder named after the id
+        dataset_path = os.path.join(mlcomp_root, str(name_or_id))
+    else:  # assume name based lookup: scan the metadata of every folder
+        expected_name_line = "name: " + name_or_id
+        for dataset in os.listdir(mlcomp_root):
+            metadata_file = os.path.join(mlcomp_root, dataset, 'metadata')
+            if not os.path.exists(metadata_file):
+                continue
+            if expected_name_line in _read_stripped_lines(metadata_file):
+                dataset_path = os.path.join(mlcomp_root, dataset)
+                break  # first match wins, skip the remaining folders
+        else:  # the scan ended without hitting `break`: nothing matched
+            raise ValueError("Could not find dataset with metadata line: " +
+                             expected_name_line)
+
+    # loading the dataset metadata ("key: value" lines, others are ignored)
+    metadata = dict()
+    metadata_file = os.path.join(dataset_path, 'metadata')
+    if not os.path.exists(metadata_file):
+        raise ValueError(dataset_path + ' is not a valid MLComp dataset')
+    for line in _read_stripped_lines(metadata_file):
+        if ":" in line:
+            key, value = line.split(":", 1)
+            metadata[key.strip()] = value.strip()
+
+    format_name = metadata.get('format', 'unknow')
+    loader = LOADERS.get(format_name)
+    if loader is None:
+        raise ValueError("No loader implemented for format: " + format_name)
+    return loader(dataset_path, metadata, **kwargs)
+
+
+if __name__ == "__main__":
+    for example in ('20news-18828', 379):  # lookup by name, then by id
+        print(load_mlcomp(example))
-- 
GitLab