diff --git a/examples/text/mlcomp_sparse_document_classification.py b/examples/text/mlcomp_sparse_document_classification.py deleted file mode 100644 index de8f94725eafde55f2d3e724415dc8d00038e6bd..0000000000000000000000000000000000000000 --- a/examples/text/mlcomp_sparse_document_classification.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -======================================================== -Classification of text documents: using a MLComp dataset -======================================================== - -This is an example showing how the scikit-learn can be used to classify -documents by topics using a bag-of-words approach. This example uses -a scipy.sparse matrix to store the features instead of standard numpy arrays. - -The dataset used in this example is the 20 newsgroups dataset and should be -downloaded from the http://mlcomp.org (free registration required): - - http://mlcomp.org/datasets/379 - -Once downloaded unzip the archive somewhere on your filesystem. -For instance in:: - - % mkdir -p ~/data/mlcomp - % cd ~/data/mlcomp - % unzip /path/to/dataset-379-20news-18828_XXXXX.zip - -You should get a folder ``~/data/mlcomp/379`` with a file named ``metadata`` -and subfolders ``raw``, ``train`` and ``test`` holding the text documents -organized by newsgroups. - -Then set the ``MLCOMP_DATASETS_HOME`` environment variable pointing to -the root folder holding the uncompressed archive:: - - % export MLCOMP_DATASETS_HOME="~/data/mlcomp" - -Then you are ready to run this example using your favorite python shell:: - - % ipython examples/mlcomp_sparse_document_classification.py - -""" - -# Author: Olivier Grisel <olivier.grisel@ensta.org> -# License: BSD 3 clause - -from __future__ import print_function - -from time import time -import sys -import os -import numpy as np -import scipy.sparse as sp -import matplotlib.pyplot as plt - -from sklearn.datasets import load_mlcomp -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import SGDClassifier -from sklearn.metrics import confusion_matrix -from sklearn.metrics import classification_report -from sklearn.naive_bayes import MultinomialNB - - -print(__doc__) - -if 'MLCOMP_DATASETS_HOME' not in os.environ: - print("MLCOMP_DATASETS_HOME not set; please follow the above instructions") - sys.exit(0) - -# Load the training set -print("Loading 20 newsgroups training set... ") -news_train = load_mlcomp('20news-18828', 'train') -print(news_train.DESCR) -print("%d documents" % len(news_train.filenames)) -print("%d categories" % len(news_train.target_names)) - -print("Extracting features from the dataset using a sparse vectorizer") -t0 = time() -vectorizer = TfidfVectorizer(encoding='latin1') -X_train = vectorizer.fit_transform((open(f).read() - for f in news_train.filenames)) -print("done in %fs" % (time() - t0)) -print("n_samples: %d, n_features: %d" % X_train.shape) -assert sp.issparse(X_train) -y_train = news_train.target - -print("Loading 20 newsgroups test set... ") -news_test = load_mlcomp('20news-18828', 'test') -t0 = time() -print("done in %fs" % (time() - t0)) - -print("Predicting the labels of the test set...") -print("%d documents" % len(news_test.filenames)) -print("%d categories" % len(news_test.target_names)) - -print("Extracting features from the dataset using the same vectorizer") -t0 = time() -X_test = vectorizer.transform((open(f).read() for f in news_test.filenames)) -y_test = news_test.target -print("done in %fs" % (time() - t0)) -print("n_samples: %d, n_features: %d" % X_test.shape) - - -############################################################################### -# Benchmark classifiers -def benchmark(clf_class, params, name): - print("parameters:", params) - t0 = time() - clf = clf_class(**params).fit(X_train, y_train) - print("done in %fs" % (time() - t0)) - - if hasattr(clf, 'coef_'): - print("Percentage of non zeros coef: %f" - % (np.mean(clf.coef_ != 0) * 100)) - print("Predicting the outcomes of the testing set") - t0 = time() - pred = clf.predict(X_test) - print("done in %fs" % (time() - t0)) - - print("Classification report on test set for classifier:") - print(clf) - print() - print(classification_report(y_test, pred, - target_names=news_test.target_names)) - - cm = confusion_matrix(y_test, pred) - print("Confusion matrix:") - print(cm) - - # Show confusion matrix - plt.matshow(cm) - plt.title('Confusion matrix of the %s classifier' % name) - plt.colorbar() - - -print("Testbenching a linear classifier...") -parameters = { - 'loss': 'hinge', - 'penalty': 'l2', - 'n_iter': 50, - 'alpha': 0.00001, - 'fit_intercept': True, -} - -benchmark(SGDClassifier, parameters, 'SGD') - -print("Testbenching a MultinomialNB classifier...") -parameters = {'alpha': 0.01} - -benchmark(MultinomialNB, parameters, 'MultinomialNB') - -plt.show() diff --git a/sklearn/datasets/mlcomp.py b/sklearn/datasets/mlcomp.py index 545492834c18c6348690a3b52df3e6d03b566fcd..e97ab047a4fb404043e450c0da1652ae0b9f277f 100644 --- a/sklearn/datasets/mlcomp.py +++ b/sklearn/datasets/mlcomp.py @@ -5,6 +5,7 @@ import os import numbers from sklearn.datasets.base import load_files +from sklearn.utils import deprecated def _load_document_classification(dataset_path, metadata, set_=None, **kwargs): @@ -19,6 +20,9 @@ LOADERS = { } +@deprecated("since the http://mlcomp.org/ website will shut down " + "in March 2017, the load_mlcomp function was deprecated " + "in version 0.19 and will be removed in 0.21.") def load_mlcomp(name_or_id, set_="raw", mlcomp_root=None, **kwargs): """Load a datasets as downloaded from http://mlcomp.org