diff --git a/examples/mlcomp_document_classification.py b/examples/mlcomp_document_classification.py index bdb6d87fb841343bd99e52e96a1797c5bc519893..e1936abd3533f3e45d1c6327ae5c3cacea026473 100644 --- a/examples/mlcomp_document_classification.py +++ b/examples/mlcomp_document_classification.py @@ -43,6 +43,7 @@ import pylab as pl from scikits.learn.datasets import load_mlcomp from scikits.learn.svm import LinearSVC from scikits.learn.metrics import confusion_matrix +from scikits.learn.metrics import classification_report if 'MLCOMP_DATASETS_HOME' not in os.environ: print "Please follow those instructions to get started:" @@ -86,7 +87,11 @@ print "Predicting the labels of the test set..." t0 = time() pred = clf.predict(news_test.data) print "done in %fs" % (time() - t0) -print "Classification accuracy: %f" % (np.mean(pred == news_test.target) * 100) + +print "Classification report on test set:" +print classification_report(news_test.target, pred, + class_names=news_test.target_names) + cm = confusion_matrix(news_test.target, pred) print "Confusion matrix:" diff --git a/examples/mlcomp_sparse_document_classification.py b/examples/mlcomp_sparse_document_classification.py index 639678cf29a17325fa90cf4ff61c750e940aca30..77093fd9ab08c1b037809ddf778a00766c31ac9f 100644 --- a/examples/mlcomp_sparse_document_classification.py +++ b/examples/mlcomp_sparse_document_classification.py @@ -45,6 +45,7 @@ import pylab as pl from scikits.learn.datasets import load_mlcomp from scikits.learn.svm.sparse import LinearSVC from scikits.learn.metrics import confusion_matrix +from scikits.learn.metrics import classification_report if 'MLCOMP_DATASETS_HOME' not in os.environ: print "Please follow those instructions to get started:" @@ -87,7 +88,11 @@ print "Predicting the labels of the test set..." t0 = time() pred = clf.predict(news_test.data) print "done in %fs" % (time() - t0) -print "Classification accuracy: %f" % (np.mean(pred == news_test.target) * 100) + +print "Classification report on test set:" +print classification_report(news_test.target, pred, + class_names=news_test.target_names) + cm = confusion_matrix(news_test.target, pred) print "Confusion matrix:" diff --git a/examples/sgd/mlcomp_sparse_document_classification_sgd.py b/examples/sgd/mlcomp_sparse_document_classification_sgd.py index d56af4fdf37a79250475ee0e04874ece265377d8..d1cdb281da5bb8f61b06a595012bafc92c228d93 100644 --- a/examples/sgd/mlcomp_sparse_document_classification_sgd.py +++ b/examples/sgd/mlcomp_sparse_document_classification_sgd.py @@ -44,6 +44,7 @@ import numpy as np from scikits.learn.datasets import load_mlcomp from scikits.learn.metrics import confusion_matrix +from scikits.learn.metrics import classification_report # from scikits.learn.svm.sparse import LinearSVC from scikits.learn.sgd.sparse import SGD @@ -79,7 +80,6 @@ print "Training a linear SVM (hinge loss and L2 regularizer) using SGD.\n"\ "SGD(n_iter=50, alpha=0.00001, fit_intercept=True)" t0 = time() clf = SGD(n_iter=50, alpha=0.00001, fit_intercept=True) -#clf = LinearSVC(**parameters) clf.fit(data, target) print "done in %fs" % (time() - t0) print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100) @@ -95,12 +95,15 @@ neg_idx = np.where(target == neg)[0] idx = np.concatenate((pos_idx, neg_idx)) data = news_test.data[idx] target = news_test.target[idx] +target_names = news_test.target_names[:2] print "Predicting the labels of the test set..." t0 = time() pred = clf.predict(data) print "done in %fs" % (time() - t0) -print "Classification accuracy: %f" % (np.mean(pred == target) * 100) +print "Classification report on test set:" +print classification_report(target, pred, class_names=target_names) + cm = confusion_matrix(target, pred) print "Confusion matrix:" diff --git a/scikits/learn/datasets/mlcomp.py b/scikits/learn/datasets/mlcomp.py index 232543b6943468b34cea83ce21b0e08fd6122452..22b35dd22484e216bffe94404d75a9795ec1b68a 100644 --- a/scikits/learn/datasets/mlcomp.py +++ b/scikits/learn/datasets/mlcomp.py @@ -12,7 +12,7 @@ from scikits.learn.feature_extraction.text import SparseHashingVectorizer def _load_document_classification(dataset_path, metadata, set_, sparse, **kw): """Loader implementation for the DocumentClassification format""" target = [] - target_names = {} + target_names = [] filenames = [] vectorizer = kw.get('vectorizer') if vectorizer is None: @@ -31,7 +31,7 @@ def _load_document_classification(dataset_path, metadata, set_, sparse, **kw): folders = [f for f in sorted(os.listdir(dataset_path)) if os.path.isdir(os.path.join(dataset_path, f))] for label, folder in enumerate(folders): - target_names[label] = folder + target_names.append(folder) folder_path = os.path.join(dataset_path, folder) documents = [os.path.join(folder_path, d) for d in sorted(os.listdir(folder_path))] diff --git a/scikits/learn/metrics.py b/scikits/learn/metrics.py index 8ef121d86b7ca16fb7a505cca888e7b5f96f4ba1..c04582dac9ca97fea58be3c6c1e2896b26f78791 100644 --- a/scikits/learn/metrics.py +++ b/scikits/learn/metrics.py @@ -341,6 +341,7 @@ def classification_report(y_true, y_pred, labels=None, class_names=None): class_names = ['%d' % l for l in labels] else: width = max(len(cn) for cn in class_names) + width = max(width, len('mean')) headers = ["precision", "recall", "f1-score"]