From 7759f8b0a58e8bc7ceffa38577d64984a74cc5ef Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Sun, 4 Jul 2010 16:16:56 +0200
Subject: [PATCH] checkpointing work in progress on MLComp dataset integration

---
Review notes (this section is ignored by `git am`): the new-file hunk below
was revised, so the `index` blob id recorded for mlcomp.py no longer matches
its content.

 scikits/learn/datasets/__init__.py |   2 +
 scikits/learn/datasets/mlcomp.py   | 132 +++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 scikits/learn/datasets/mlcomp.py

diff --git a/scikits/learn/datasets/__init__.py b/scikits/learn/datasets/__init__.py
index 1e1ca225dd..3719162262 100644
--- a/scikits/learn/datasets/__init__.py
+++ b/scikits/learn/datasets/__init__.py
@@ -1 +1,3 @@
 from base import load_iris, load_digits, load_diabetes
+from mlcomp import load_mlcomp
+
diff --git a/scikits/learn/datasets/mlcomp.py b/scikits/learn/datasets/mlcomp.py
new file mode 100644
index 0000000000..9e87eebb64
--- /dev/null
+++ b/scikits/learn/datasets/mlcomp.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2010 Olivier Grisel <olivier.grisel@ensta.org>
+# License: Simplified BSD
+"""Glue code to load http://mlcomp.org data as a scikit.learn dataset"""
+
+import os
+from scikits.learn.datasets.base import Bunch
+
+
+def _read_lines(path):
+    """Return the lines of the text file at `path`, closing it promptly"""
+    # explicit try / finally rather than relying on garbage collection to
+    # release the file handle
+    f = open(path)
+    try:
+        return f.readlines()
+    finally:
+        f.close()
+
+
+def load_document_classification(dataset_path, metadata, **kw):
+    """Placeholder loader for the 'DocumentClassification' MLComp format
+
+    Only the dataset description is filled in for now: data, target and
+    target_names are left to None until the actual parsing is implemented.
+    """
+    return Bunch(data=None, target=None, target_names=None,
+                 DESCR=metadata.get('description'))
+
+
+# maps the value of the 'format' metadata entry to the matching loader
+LOADERS = {
+    'DocumentClassification': load_document_classification,
+    # TODO: implement the remaining domain formats
+}
+
+
+def load_mlcomp(name_or_id, mlcomp_root=None, **kwargs):
+    """Load a dataset as downloaded from http://mlcomp.org
+
+    Parameters
+    ----------
+
+    name_or_id : the integer id or the string name metadata of the MLComp
+                 dataset to load
+
+    mlcomp_root : the filesystem path to the root folder where MLComp datasets
+                  are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME
+                  environment variable is looked up instead.
+
+    **kwargs : domain specific kwargs to be passed to the dataset loader.
+
+    Returns
+    -------
+
+    data : Bunch
+        Dictionary-like object, the interesting attributes are:
+        'data', the data to learn, 'target', the classification labels,
+        'target_names', the meaning of the labels, and 'DESCR', the
+        full description of the dataset.
+
+    Raises
+    ------
+
+    ValueError : if the root folder cannot be determined or found, if no
+                 matching dataset folder with a metadata file exists, or if
+                 no loader is registered for the format of the dataset.
+
+    Note on the lookup process: depending on the type of name_or_id,
+    will choose between integer id lookup or metadata name lookup by
+    looking at the unzipped archives and metadata file.
+
+    TODO: implement zip dataset loading too
+    """
+
+    if mlcomp_root is None:
+        try:
+            mlcomp_root = os.environ['MLCOMP_DATASETS_HOME']
+        except KeyError:
+            raise ValueError("MLCOMP_DATASETS_HOME env variable is undefined")
+
+    mlcomp_root = os.path.expanduser(mlcomp_root)
+    mlcomp_root = os.path.abspath(mlcomp_root)
+    mlcomp_root = os.path.normpath(mlcomp_root)
+
+    if not os.path.exists(mlcomp_root):
+        raise ValueError("Could not find folder: " + mlcomp_root)
+
+    # dataset lookup
+    if isinstance(name_or_id, int):
+        # id lookup
+        dataset_path = os.path.join(mlcomp_root, str(name_or_id))
+    else:
+        # assume name based lookup
+        dataset_path = None
+        expected_name_line = "name: " + name_or_id
+        for dataset in os.listdir(mlcomp_root):
+            metadata_file = os.path.join(mlcomp_root, dataset, 'metadata')
+            if not os.path.exists(metadata_file):
+                continue
+            for line in _read_lines(metadata_file):
+                if line.strip() == expected_name_line:
+                    dataset_path = os.path.join(mlcomp_root, dataset)
+                    break
+            if dataset_path is not None:
+                # stop scanning the remaining folders at the first match
+                break
+        if dataset_path is None:
+            raise ValueError("Could not find dataset with metadata line: " +
+                             expected_name_line)
+
+    # loading the dataset metadata
+    metadata = dict()
+    metadata_file = os.path.join(dataset_path, 'metadata')
+    if not os.path.exists(metadata_file):
+        raise ValueError(dataset_path + ' is not a valid MLComp dataset')
+    for line in _read_lines(metadata_file):
+        if ":" in line:
+            key, value = line.split(":", 1)
+            metadata[key.strip()] = value.strip()
+
+    # not named `format` to avoid shadowing the builtin
+    dataset_format = metadata.get('format', 'unknown')
+    loader = LOADERS.get(dataset_format)
+    if loader is None:
+        raise ValueError("No loader implemented for format: " +
+                         dataset_format)
+    return loader(dataset_path, metadata, **kwargs)
+
+
+if __name__ == "__main__":
+    print(load_mlcomp('20news-18828'))
+    print(load_mlcomp(379))
-- 
GitLab