diff --git a/scikits/learn/datasets/mlcomp.py b/scikits/learn/datasets/mlcomp.py index bde4a7f1ffb10cec865e268e68723f7f1f1ebe4f..f6a79eb8f94a1a7a690bf457e94efe0d5b89b655 100644 --- a/scikits/learn/datasets/mlcomp.py +++ b/scikits/learn/datasets/mlcomp.py @@ -11,6 +11,7 @@ def load_document_classification(dataset_path, metadata, set_, **kw): """Loader implementation for the DocumentClassification format""" target = [] target_names = {} + filenames = [] vectorizer = kw.get('vectorizer', HashingVectorizer()) dataset_path = os.path.join(dataset_path, set_) @@ -23,9 +24,11 @@ def load_document_classification(dataset_path, metadata, set_, **kw): for d in sorted(os.listdir(folder_path))] vectorizer.vectorize(documents) target.extend(len(documents) * [label]) + filenames.extend(documents) return Bunch(data=vectorizer.get_vectors(), target=target, - target_names=target_names, DESCR=metadata.get('description')) + target_names=target_names, filenames=filenames, + DESCR=metadata.get('description')) LOADERS = {