diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index a4635827f87addbb43ef360e0174f8281440920c..8f3a4a53e96d53aa26f1b1f34d45d5794cd57697 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -7,6 +7,7 @@ import numpy as np from .base import BaseEstimator, ClassifierMixin + class GNB(BaseEstimator, ClassifierMixin): """ Gaussian Naive Bayes (GNB) @@ -56,6 +57,7 @@ class GNB(BaseEstimator, ClassifierMixin): -------- """ + def __init__(self): pass @@ -65,8 +67,8 @@ class GNB(BaseEstimator, ClassifierMixin): proba_y = [] unique_y = np.unique(y) for yi in unique_y: - theta.append(np.mean(X[y==yi,:], 0)) - sigma.append(np.var(X[y==yi,:], 0)) + theta.append(np.mean(X[y==yi, :], 0)) + sigma.append(np.var(X[y==yi, :], 0)) proba_y.append(np.float(np.sum(y==yi)) / np.size(y)) self.theta = np.array(theta) self.sigma = np.array(sigma) @@ -74,23 +76,35 @@ class GNB(BaseEstimator, ClassifierMixin): self.unique_y = unique_y return self - def predict(self, X): - y_pred = self.unique_y[np.argmax(self.predict_proba(X),1)] + y_pred = self.unique_y[np.argmax(self.predict_proba(X), 1)] return y_pred - - def predict_proba(self, X): + def _joint_log_likelihood(self, X): joint_log_likelihood = [] for i in range(np.size(self.unique_y)): jointi = np.log(self.proba_y[i]) - n_ij = - 0.5 * np.sum(np.log(np.pi*self.sigma[i,:])) - n_ij -= 0.5 * np.sum( ((X - self.theta[i,:])**2) /\ - (self.sigma[i,:]),1) - joint_log_likelihood.append(jointi+n_ij) + n_ij = - 0.5 * np.sum(np.log(np.pi*self.sigma[i, :])) + n_ij -= 0.5 * np.sum(((X - self.theta[i, :])**2) / \ + (self.sigma[i, :]), 1) + joint_log_likelihood.append(jointi + n_ij) joint_log_likelihood = np.array(joint_log_likelihood).T + return joint_log_likelihood + + def predict_proba(self, X): + joint_log_likelihood = self._joint_log_likelihood(X) proba = np.exp(joint_log_likelihood) - proba = proba / np.sum(proba,1)[:,np.newaxis] + proba = proba / np.sum(proba, 1)[:, np.newaxis] return proba - + def predict_log_proba(self, X): + log_proba = self._joint_log_likelihood(X) + # Compute a sum of logs without underflow. Equivalent to: + # log_proba -= np.log(np.sum(np.exp(log_proba), axis=1))[:, np.newaxis] + B = np.max(log_proba, axis=1)[:, np.newaxis] + logaB = log_proba - B + sup = logaB > -np.inf + aB = np.zeros_like(logaB) + aB[sup] = np.exp(logaB[sup]) + log_proba -= np.log(np.sum(aB, axis=1))[:, np.newaxis] + B + return log_proba diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py index 9bbf7a451946abc76f4dfe4a80178f49c6d6b2f6..0f4f74437e4307e51e985240a9240b68a02947ef 100644 --- a/scikits/learn/tests/test_naive_bayes.py +++ b/scikits/learn/tests/test_naive_bayes.py @@ -1,5 +1,5 @@ import numpy as np -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_array_almost_equal from .. import naive_bayes @@ -19,3 +19,7 @@ def test_gnb(): y_pred = clf.fit(X, y).predict(X) assert_array_equal(y_pred, y) + y_pred_proba = clf.predict_proba(X) + y_pred_log_proba = clf.predict_log_proba(X) + assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) +