From 8f4b1568bca1602e1489e4e8c7602fb5b5f2b9b2 Mon Sep 17 00:00:00 2001
From: Alexandre Gramfort <alexandre.gramfort@inria.fr>
Date: Mon, 15 Nov 2010 18:44:31 +0100
Subject: [PATCH] ENH: add support for predict_log_proba in Naive Bayes

---
 scikits/learn/naive_bayes.py            | 38 +++++++++++++++++--------
 scikits/learn/tests/test_naive_bayes.py |  6 +++-
 2 files changed, 31 insertions(+), 13 deletions(-)
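
Note on the implementation: predict_log_proba normalizes the joint
log-likelihood directly in log space with the log-sum-exp trick, so rows
whose likelihoods underflow np.exp do not turn into 0/0. The standalone
sketch below shows the same normalization on a plain array (the helper name
log_normalize is illustrative and not part of the module, and the extra
masking of -inf entries done in the patch is omitted here):

    import numpy as np

    def log_normalize(jll):
        """Turn rows of joint log-likelihoods into rows of log-posteriors."""
        jll = np.asarray(jll, dtype=np.float64)
        # Shift each row by its maximum so np.exp cannot underflow to all zeros.
        B = np.max(jll, axis=1)[:, np.newaxis]
        log_norm = np.log(np.sum(np.exp(jll - B), axis=1))[:, np.newaxis] + B
        return jll - log_norm

    # Two samples, two classes; the first row would underflow with plain exp.
    jll = np.array([[-1000.0, -1001.0], [-2.0, -1.0]])
    print(np.exp(log_normalize(jll)))  # each row sums to 1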

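For a quick sanity check of the new method (the toy data below is
illustrative, not the arrays used in the test file):

    import numpy as np
    from scikits.learn.naive_bayes import GNB

    X = np.array([[-2., -1.], [-1., -1.], [-1., -2.],
                  [1., 1.], [1., 2.], [2., 1.]])
    y = np.array([1, 1, 1, 2, 2, 2])

    clf = GNB().fit(X, y)
    # On data where the probabilities do not underflow, the two methods agree.
    np.testing.assert_array_almost_equal(np.log(clf.predict_proba(X)),
                                         clf.predict_log_proba(X), decimal=8)
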
diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py
index a4635827f8..8f3a4a53e9 100644
--- a/scikits/learn/naive_bayes.py
+++ b/scikits/learn/naive_bayes.py
@@ -7,6 +7,7 @@ import numpy as np
 
 from .base import BaseEstimator, ClassifierMixin
 
+
 class GNB(BaseEstimator, ClassifierMixin):
     """
     Gaussian Naive Bayes (GNB)
@@ -56,6 +57,7 @@ class GNB(BaseEstimator, ClassifierMixin):
     --------
 
     """
+
     def __init__(self):
         pass
 
@@ -65,8 +67,8 @@ class GNB(BaseEstimator, ClassifierMixin):
         proba_y = []
         unique_y = np.unique(y)
         for yi in unique_y:
-            theta.append(np.mean(X[y==yi,:], 0))
-            sigma.append(np.var(X[y==yi,:], 0))
+            theta.append(np.mean(X[y==yi, :], 0))
+            sigma.append(np.var(X[y==yi, :], 0))
             proba_y.append(np.float(np.sum(y==yi)) / np.size(y))
         self.theta = np.array(theta)
         self.sigma = np.array(sigma)
@@ -74,23 +76,35 @@ class GNB(BaseEstimator, ClassifierMixin):
         self.unique_y = unique_y
         return self
 
-
     def predict(self, X):
-        y_pred = self.unique_y[np.argmax(self.predict_proba(X),1)]
+        y_pred = self.unique_y[np.argmax(self.predict_proba(X), 1)]
         return y_pred
 
-
-    def predict_proba(self, X):
+    def _joint_log_likelihood(self, X):
         joint_log_likelihood = []
         for i in range(np.size(self.unique_y)):
             jointi = np.log(self.proba_y[i])
-            n_ij = - 0.5 * np.sum(np.log(np.pi*self.sigma[i,:]))
-            n_ij -= 0.5 * np.sum( ((X - self.theta[i,:])**2) /\
-                                    (self.sigma[i,:]),1)
-            joint_log_likelihood.append(jointi+n_ij)
+            n_ij = - 0.5 * np.sum(np.log(np.pi*self.sigma[i, :]))
+            n_ij -= 0.5 * np.sum(((X - self.theta[i, :])**2) /
+                                    (self.sigma[i, :]), 1)
+            joint_log_likelihood.append(jointi + n_ij)
         joint_log_likelihood = np.array(joint_log_likelihood).T
+        return joint_log_likelihood
+
+    def predict_proba(self, X):
+        joint_log_likelihood = self._joint_log_likelihood(X)
         proba = np.exp(joint_log_likelihood)
-        proba = proba / np.sum(proba,1)[:,np.newaxis]
+        proba = proba / np.sum(proba, 1)[:, np.newaxis]
         return proba
 
-
+    def predict_log_proba(self, X):
+        log_proba = self._joint_log_likelihood(X)
+        # Subtract the log-sum-exp of each row without underflow; equivalent to:
+        # log_proba -= np.log(np.sum(np.exp(log_proba), axis=1))[:, np.newaxis]
+        B = np.max(log_proba, axis=1)[:, np.newaxis]
+        logaB = log_proba - B
+        sup = logaB > -np.inf
+        aB = np.zeros_like(logaB)
+        aB[sup] = np.exp(logaB[sup])
+        log_proba -= np.log(np.sum(aB, axis=1))[:, np.newaxis] + B
+        return log_proba
diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py
index 9bbf7a4519..0f4f74437e 100644
--- a/scikits/learn/tests/test_naive_bayes.py
+++ b/scikits/learn/tests/test_naive_bayes.py
@@ -1,5 +1,5 @@
 import numpy as np
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_array_equal, assert_array_almost_equal
 
 from .. import naive_bayes
 
@@ -19,3 +19,7 @@ def test_gnb():
     y_pred = clf.fit(X, y).predict(X)
     assert_array_equal(y_pred, y)
 
+    y_pred_proba = clf.predict_proba(X)
+    y_pred_log_proba = clf.predict_log_proba(X)
+    assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
+
-- 
GitLab