From 6f6b2f8afbc053e3bc0ce5f7bf51cb1d9d21c520 Mon Sep 17 00:00:00 2001 From: Olivier Grisel <olivier.grisel@ensta.org> Date: Sun, 4 Jul 2010 19:48:35 +0200 Subject: [PATCH] make it easy to find the raw source document --- scikits/learn/datasets/mlcomp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scikits/learn/datasets/mlcomp.py b/scikits/learn/datasets/mlcomp.py index bde4a7f1ff..f6a79eb8f9 100644 --- a/scikits/learn/datasets/mlcomp.py +++ b/scikits/learn/datasets/mlcomp.py @@ -11,6 +11,7 @@ def load_document_classification(dataset_path, metadata, set_, **kw): """Loader implementation for the DocumentClassification format""" target = [] target_names = {} + filenames = [] vectorizer = kw.get('vectorizer', HashingVectorizer()) dataset_path = os.path.join(dataset_path, set_) @@ -23,9 +24,11 @@ def load_document_classification(dataset_path, metadata, set_, **kw): for d in sorted(os.listdir(folder_path))] vectorizer.vectorize(documents) target.extend(len(documents) * [label]) + filenames.extend(documents) return Bunch(data=vectorizer.get_vectors(), target=target, - target_names=target_names, DESCR=metadata.get('description')) + target_names=target_names, filenames=filenames, + DESCR=metadata.get('description')) LOADERS = { -- GitLab