From 564134ed898e0104c7dccb68d933de658c8ab01c Mon Sep 17 00:00:00 2001
From: Olivier Hervieu <olivier.hervieu@tinyclues.com>
Date: Tue, 30 Aug 2011 16:05:57 +0200
Subject: [PATCH] Refactor roc_curve method. The new implementation, even if it
 looks very naive, reduces the computation time of fpr/tpr vectors. roc_curve
 computation time depends now only on the length of y_score. For comparison,
 here are the results between the old and the new implementation for the
 following vectors:

- 10^6 length vector  (y_score has 1000 unique values):
    - old impl.: 28.29 seconds
    - new impl.: 3.14 seconds

- 10^6 length vector (y_score has 10000 unique values):
    - old impl.: 267.61 seconds
    - new impl.: 3.64 seconds
---
 scikits/learn/metrics/metrics.py | 36 ++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/scikits/learn/metrics/metrics.py b/scikits/learn/metrics/metrics.py
index e5d2b8bf99..dd653c245e 100644
--- a/scikits/learn/metrics/metrics.py
+++ b/scikits/learn/metrics/metrics.py
@@ -118,16 +118,39 @@ def roc_curve(y_true, y_score):
 
     y_score = y_score.ravel()
     thresholds = np.sort(np.unique(y_score))[::-1]
-    n_thresholds = thresholds.size
 
-    tpr = np.empty(n_thresholds)  # True positive rate
-    fpr = np.empty(n_thresholds)  # False positive rate
     n_pos = float(np.sum(y_true == classes[1]))  # nb of true positive
     n_neg = float(np.sum(y_true == classes[0]))  # nb of true negative
 
-    for i, t in enumerate(thresholds):
-        tpr[i] = np.sum(y_true[y_score >= t] == classes[1]) / n_pos
-        fpr[i] = np.sum(y_true[y_score >= t] == classes[0]) / n_neg
+    thresholds = np.unique(y_score)
+    neg_value, pos_value = classes[0], classes[1]
+
+    tpr = np.empty(thresholds.size)  # True positive rate
+    fpr = np.empty(thresholds.size)  # False positive rate
+
+    # Buid tpr/fpr vector
+    dpos = dneg = sum_pos = sum_neg = idx = 0
+
+    sorted_signal = sorted(zip(y_score, y_true), reverse=True)
+    last_input = sorted_signal[0][0]
+    for each, value in sorted_signal:
+        if each == last_input:
+            if value == pos_value:
+                dpos += 1
+            else:
+                dneg += 1
+        else:
+            tpr[idx] = (sum_pos + dpos) / n_pos
+            fpr[idx] = (sum_neg + dneg) / n_neg
+            sum_pos += dpos
+            sum_neg += dneg
+            dpos = 1 if value == pos_value else 0
+            dneg = 1 if value == neg_value else 0
+            idx += 1
+            last_input = each
+    else:
+        tpr[-1] = (sum_pos + dpos) / n_pos
+        fpr[-1] = (sum_neg + dneg) / n_neg
 
     # hard decisions, add (0,0)
     if fpr.shape[0] == 2:
@@ -137,6 +160,7 @@ def roc_curve(y_true, y_score):
     elif fpr.shape[0] == 1:
         fpr = np.array([0.0, fpr[0], 1.0])
         tpr = np.array([0.0, tpr[0], 1.0])
+
     return fpr, tpr, thresholds
 
 
-- 
GitLab