diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 46f8c4017bc47d3c925f28cc597e9739dda008dc..dd73a52078188c91715807acd21f1b6ad2bb26b4 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -783,7 +783,6 @@ details.
    metrics.classification_report
    metrics.cohen_kappa_score
    metrics.confusion_matrix
-   metrics.dcg_score
    metrics.f1_score
    metrics.fbeta_score
    metrics.hamming_loss
@@ -791,7 +790,6 @@ details.
    metrics.jaccard_similarity_score
    metrics.log_loss
    metrics.matthews_corrcoef
-   metrics.ndcg_score
    metrics.precision_recall_curve
    metrics.precision_recall_fscore_support
    metrics.precision_score
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index c5384d49fa6582eca4e9d5e214798bb4ca155ae3..0f5f0937a751d840f2175ec94e3d6f46a4549b2f 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -308,14 +308,6 @@ Some also work in the multilabel case:
    recall_score
    zero_one_loss
 
-Some are typically used for ranking:
-
-.. autosummary::
-   :template: function.rst
-
-   dcg_score
-   ndcg_score
-
 And some work with binary and multilabel (but not multiclass) problems:
 
 .. autosummary::
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 93d21a146619a00f582fa0490f7bb697236a7412..eb7cf3c01d115a32b29b895c02b5a426d423c485 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -12,8 +12,6 @@ from .ranking import label_ranking_loss
 from .ranking import precision_recall_curve
 from .ranking import roc_auc_score
 from .ranking import roc_curve
-from .ranking import dcg_score
-from .ranking import ndcg_score
 
 from .classification import accuracy_score
 from .classification import classification_report
@@ -118,6 +116,4 @@ __all__ = [
     'v_measure_score',
     'zero_one_loss',
     'brier_score_loss',
-    'dcg_score',
-    'ndcg_score'
 ]
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index 62d8db89e6a5b242b0825efef8477b67be0e2ddc..b7867ce2605c458938d0af259401559f25d5e846 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -26,7 +26,7 @@ from scipy.stats import rankdata
 
 from ..utils import assert_all_finite
 from ..utils import check_consistent_length
-from ..utils import column_or_1d, check_array, check_X_y
+from ..utils import column_or_1d, check_array
 from ..utils.multiclass import type_of_target
 from ..utils.extmath import stable_cumsum
 from ..utils.sparsefuncs import count_nonzero
@@ -788,91 +788,3 @@ def label_ranking_loss(y_true, y_score, sample_weight=None):
     loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.
 
     return np.average(loss, weights=sample_weight)
-
-
-def dcg_score(y_true, y_score, k=5):
-    """Discounted cumulative gain (DCG) at rank K.
-
-    Parameters
-    ----------
-    y_true : array, shape = [n_samples]
-        Ground truth (true relevance labels).
-    y_score : array, shape = [n_samples]
-        Predicted scores.
-    k : int
-        Rank.
-
-    Returns
-    -------
-    score : float
-
-    References
-    ----------
-    .. [1] `Wikipedia entry for the Discounted Cumulative Gain
-           <https://en.wikipedia.org/wiki/Discounted_cumulative_gain>`_
-    """
-    order = np.argsort(y_score)[::-1]
-    y_true = np.take(y_true, order[:k])
-
-    gain = 2 ** y_true - 1
-
-    discounts = np.log2(np.arange(len(y_true)) + 2)
-    return np.sum(gain / discounts)
-
-
-def ndcg_score(y_true, y_score, k=5):
-    """Normalized discounted cumulative gain (NDCG) at rank K.
-
-    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
-    recommendation system based on the graded relevance of the recommended
-    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
-    ranking of the entities.
-
-    Parameters
-    ----------
-    y_true : array, shape = [n_samples]
-        Ground truth (true labels represended as integers).
-    y_score : array, shape = [n_samples, n_classes]
-        Predicted probabilities.
-    k : int
-        Rank.
-
-    Returns
-    -------
-    score : float
-
-    Examples
-    --------
-    >>> y_true = [1, 0, 2]
-    >>> y_score = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
-    >>> ndcg_score(y_true, y_score, k=2)
-    1.0
-    >>> y_score = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
-    >>> ndcg_score(y_true, y_score, k=2)
-    0.66666666666666663
-
-    References
-    ----------
-    .. [1] `Kaggle entry for the Normalized Discounted Cumulative Gain
-           <https://www.kaggle.com/wiki/NormalizedDiscountedCumulativeGain>`_
-    """
-    y_score, y_true = check_X_y(y_score, y_true)
-
-    # Make sure we use all the labels (max between the length and the higher
-    # number in the array)
-    lb = LabelBinarizer()
-    lb.fit(np.arange(max(np.max(y_true) + 1, len(y_true))))
-    binarized_y_true = lb.transform(y_true)
-
-    if binarized_y_true.shape != y_score.shape:
-        raise ValueError("y_true and y_score have different value ranges")
-
-    scores = []
-
-    # Iterate over each y_value_true and compute the DCG score
-    for y_value_true, y_value_score in zip(binarized_y_true, y_score):
-        actual = dcg_score(y_value_true, y_value_score, k)
-        best = dcg_score(y_value_true, y_value_true, k)
-        scores.append(actual / best)
-
-    return np.mean(scores)
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index db80691663606b3946295e88df2ace846a16e699..951992132f3e3d6029cba8446fbdc0c18525fada 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -29,7 +29,6 @@ from sklearn.metrics import precision_recall_curve
 from sklearn.metrics import label_ranking_loss
 from sklearn.metrics import roc_auc_score
 from sklearn.metrics import roc_curve
-from sklearn.metrics import ndcg_score
 
 from sklearn.exceptions import UndefinedMetricWarning
 
@@ -738,38 +737,6 @@ def check_zero_or_all_relevant_labels(lrap_score):
                                    [[0.5], [0.5], [0.5], [0.5]]), 1.)
 
 
-def test_ndcg_score():
-    # Check perfect ranking
-    y_true = [1, 0, 2]
-    y_score = [
-        [0.15, 0.55, 0.2],
-        [0.7, 0.2, 0.1],
-        [0.06, 0.04, 0.9]
-    ]
-    perfect = ndcg_score(y_true, y_score)
-    assert_equal(perfect, 1.0)
-
-    # Check bad ranking with a small K
-    y_true = [0, 2, 1]
-    y_score = [
-        [0.15, 0.55, 0.2],
-        [0.7, 0.2, 0.1],
-        [0.06, 0.04, 0.9]
-    ]
-    short_k = ndcg_score(y_true, y_score, k=1)
-    assert_equal(short_k, 0.0)
-
-    # Check a random scoring
-    y_true = [2, 1, 0]
-    y_score = [
-        [0.15, 0.55, 0.2],
-        [0.7, 0.2, 0.1],
-        [0.06, 0.04, 0.9]
-    ]
-    average_ranking = ndcg_score(y_true, y_score, k=2)
-    assert_almost_equal(average_ranking, 0.63092975)
-
-
 def check_lrap_error_raised(lrap_score):
     # Raise value error if not appropriate format
     assert_raises(ValueError, lrap_score,
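Note (not part of the patch): for reference, the removed helpers computed DCG over the top-k items ranked by predicted score using an exponential gain with a logarithmic position discount, `(2**relevance - 1) / log2(rank + 2)`, and NDCG as that value divided by the DCG of the ideal ordering, averaged per sample after binarizing `y_true`. Below is a minimal standalone sketch of that computation; the names `dcg_at_k` and `ndcg_at_k` are illustrative only and are not scikit-learn API.

```python
import numpy as np


def dcg_at_k(relevance, scores, k=5):
    # Rank items by predicted score (descending) and keep the top k.
    order = np.argsort(scores)[::-1][:k]
    ranked_relevance = np.take(relevance, order)
    # Exponential gain with a logarithmic position discount, mirroring the
    # removed implementation: (2**rel - 1) / log2(rank + 2).
    gain = 2 ** ranked_relevance - 1
    discounts = np.log2(np.arange(len(ranked_relevance)) + 2)
    return np.sum(gain / discounts)


def ndcg_at_k(relevance, scores, k=5):
    # Normalize by the DCG of the ideal ordering (ranking by true relevance).
    best = dcg_at_k(relevance, relevance, k)
    return dcg_at_k(relevance, scores, k) / best


# Example: binarized relevance for a single sample whose true label is 1,
# scored as in the removed docstring example.
relevance = np.array([0, 1, 0])
scores = np.array([0.15, 0.55, 0.2])
print(ndcg_at_k(relevance, scores, k=2))  # 1.0 -- the true label is ranked first
```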