diff --git a/scikits/learn/__init__.py b/scikits/learn/__init__.py
index df11355d259d7564752d73affb7b50d0775d21da..3d2cce01fdd3720a152f00229f4fe60dd007e84a 100644
--- a/scikits/learn/__init__.py
+++ b/scikits/learn/__init__.py
@@ -18,9 +18,11 @@ from .base import clone
 
 try:
     from numpy.testing import nosetester
+
     class NoseTester(nosetester.NoseTester):
         """ Subclass numpy's NoseTester to add doctests by default
         """
+
         def test(self, label='fast', verbose=1, extra_argv=['--exe'],
                  doctests=True, coverage=False):
             return super(NoseTester, self).test(label=label, verbose=verbose,
@@ -33,9 +35,8 @@ except:
     pass
 
 
-__all__ = ['cross_val', 'ball_tree', 'cluster', 'covariance', 'datasets', 'gmm',
-           'linear_model', 'logistic', 'lda', 'metrics', 'svm', 'features', 'clone',
-           'test', 'gaussian_process']
+__all__ = ['cross_val', 'ball_tree', 'cluster', 'covariance', 'datasets',
+           'gmm', 'linear_model', 'logistic', 'lda', 'metrics', 'svm',
+           'features', 'clone', 'test', 'gaussian_process']
 
 __version__ = '0.6.git'
-
diff --git a/scikits/learn/cluster/mean_shift_.py b/scikits/learn/cluster/mean_shift_.py
index 1b9934d09880c6fbd13162fcc569dd90d59ef129..8257942a21c9732398d028cbf1ca35dcc1039d11 100644
--- a/scikits/learn/cluster/mean_shift_.py
+++ b/scikits/learn/cluster/mean_shift_.py
@@ -9,38 +9,7 @@ from math import floor
 import numpy as np
 
 from ..base import BaseEstimator
-
-def euclidian_distances(X, Y=None):
-    """
-    Considering the rows of X (and Y=X) as vectors, compute the
-    distance matrix between each pair of vector
-
-    Parameters
-    ----------
-    X, array of shape (n_samples_1, n_features)
-
-    Y, array of shape (n_samples_2, n_features), default None
-        if Y is None, then Y=X is used instead
-
-    Returns
-    -------
-    distances, array of shape (n_samples_1, n_samples_2)
-    """
-    if Y is None:
-        Y = X
-    if X.shape[1] != Y.shape[1]:
-        raise ValueError, "incompatible dimension for X and Y matrices"
-
-    XX = np.sum(X * X, axis=1)[:,np.newaxis]
-    if Y is None:
-        YY = XX.T
-    else:
-        YY = np.sum(Y * Y, axis=1)[np.newaxis,:]
-    distances = XX + YY # Using broadcasting
-    distances -= 2 * np.dot(X, Y.T)
-    distances = np.maximum(distances, 0)
-    distances = np.sqrt(distances)
-    return distances
+from ..metrics.pairwise import euclidian_distances
 
 
 def estimate_bandwidth(X, quantile=0.3):
@@ -101,7 +70,7 @@ def mean_shift(X, bandwidth=None):
     n_clusters = 0
     bandwidth_squared = bandwidth**2
     points_idx_init = np.arange(n_points)
-    stop_thresh = 1e-3*bandwidth # when mean has converged
+    stop_thresh = 1e-3*bandwidth  # when mean has converged
     cluster_centers = []  # center of clusters
     # track if a points been seen already
     been_visited_flag = np.zeros(n_points, dtype=np.bool)
@@ -114,12 +83,12 @@
 
     while n_points_init:
         # pick a random seed point
-        tmp_index = random_state.randint(n_points_init) 
+        tmp_index = random_state.randint(n_points_init)
        # use this point as start of mean
-        start_idx = points_idx_init[tmp_index] 
-        my_mean = X[start_idx, :] # intilize mean to this points location
+        start_idx = points_idx_init[tmp_index]
+        my_mean = X[start_idx, :]  # initialize mean to this point's location
        # points that will get added to this cluster
-        my_members = np.zeros(n_points, dtype=np.bool) 
+        my_members = np.zeros(n_points, dtype=np.bool)
        # used to resolve conflicts on cluster membership
        this_cluster_votes = np.zeros(n_points, dtype=np.uint16)
 
@@ -133,10 +102,10 @@
 
             # add a vote for all the in points belonging to this cluster
             this_cluster_votes[in_idx] += 1
 
-            my_old_mean = my_mean # save the old mean
-            my_mean = np.mean(X[in_idx,:], axis=0) # compute the new mean
+            my_old_mean = my_mean  # save the old mean
+            my_mean = np.mean(X[in_idx, :], axis=0)  # compute the new mean
             # add any point within bandwidth to the cluster
-            my_members = np.logical_or(my_members, in_idx) 
+            my_members = np.logical_or(my_members, in_idx)
             # mark that these points have been visited
             been_visited_flag[my_members] = True
@@ -169,7 +138,7 @@
 
         # we can initialize with any of the points not yet visited
         points_idx_init = np.where(been_visited_flag == False)[0]
-        n_points_init = points_idx_init.size # number of active points in set
+        n_points_init = points_idx_init.size  # number of active points in set
 
     # a point belongs to the cluster with the most votes
     labels = np.argmax(cluster_votes, axis=0)
@@ -177,7 +146,8 @@
 
     return cluster_centers, labels
 
 
-################################################################################
+##############################################################################
+
 class MeanShift(BaseEstimator):
     """MeanShift clustering
@@ -233,5 +203,3 @@
         self._set_params(**params)
         self.cluster_centers_, self.labels_ = mean_shift(X, self.bandwidth)
         return self
-
-
diff --git a/scikits/learn/metrics/__init__.py b/scikits/learn/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..78e3131f9e013868b99e7254cbd8be32c7cca7af
--- /dev/null
+++ b/scikits/learn/metrics/__init__.py
@@ -0,0 +1,10 @@
+"""
+Metrics module with score functions, performance metrics and
+pairwise metrics or distances computation
+"""
+
+from .metrics import confusion_matrix, roc_curve, auc, precision_score, \
+    recall_score, fbeta_score, f1_score, \
+    precision_recall_fscore_support, classification_report, \
+    precision_recall_curve, explained_variance_score, r2_score, \
+    zero_one, mean_square_error
diff --git a/scikits/learn/metrics.py b/scikits/learn/metrics/metrics.py
similarity index 98%
rename from scikits/learn/metrics.py
rename to scikits/learn/metrics/metrics.py
index c2e646146201dcc17f33b845791e79a3e067187f..6d8100107b210830d0eaafa278eb325a572adee9 100644
--- a/scikits/learn/metrics.py
+++ b/scikits/learn/metrics/metrics.py
@@ -166,7 +166,7 @@ def precision_score(y_true, y_pred, pos_label=1):
         precision of the positive class in binary classification or
         weighted avergage of the precision of each class for the
         multiclass task
-    
+
     """
     p, _, _, s = precision_recall_fscore_support(y_true, y_pred)
     if p.shape[0] == 2:
@@ -417,7 +417,8 @@ def classification_report(y_true, y_pred, labels=None, class_names=None):
     report = fmt % tuple(headers)
     report += '\n'
 
-    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, labels=labels)
+    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
+                                                  labels=labels)
     for i, label in enumerate(labels):
         values = [class_names[i]]
         for v in (p[i], r[i], f1[i]):
@@ -512,7 +513,7 @@ def explained_variance_score(y_true, y_pred):
     y_pred : array-like
     """
     return 1 - np.var(y_true - y_pred) / np.var(y_true)
-    
+
 
 def r2_score(y_true, y_pred):
     """R^2 (coefficient of determination) regression score function
@@ -529,7 +530,8 @@
 
     y_pred : array-like
 
     """
-    return 1 - ((y_true - y_pred)**2).sum() / ((y_true - y_true.mean())**2).sum()
+    return 1 - (((y_true - y_pred)**2).sum() /
+                ((y_true - y_true.mean())**2).sum())
 
 
@@ -539,7 +541,7 @@ def r2_score(y_true, y_pred):
 def zero_one(y_true, y_pred):
"""Zero-One classification loss - Positive integer (number of misclassifications). The best performance + Positive integer (number of misclassifications). The best performance is 0. Return the number of errors @@ -575,6 +577,3 @@ def mean_square_error(y_true, y_pred): loss : float """ return np.linalg.norm(y_pred - y_true) ** 2 - - - diff --git a/scikits/learn/metrics/pairwise.py b/scikits/learn/metrics/pairwise.py new file mode 100644 index 0000000000000000000000000000000000000000..ef32cae79729271a17b0d611977dd4dd17d45aa2 --- /dev/null +++ b/scikits/learn/metrics/pairwise.py @@ -0,0 +1,44 @@ +"""Utilities to evaluate pairwise distances or metrics between 2 +sets of points. + +""" + +# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr> +# License: BSD Style. + +import numpy as np + + +def euclidian_distances(X, Y=None): + """ + Considering the rows of X (and Y=X) as vectors, compute the + distance matrix between each pair of vector + + Parameters + ---------- + X, array of shape (n_samples_1, n_features) + + Y, array of shape (n_samples_2, n_features), default None + if Y is None, then Y=X is used instead + + Returns + ------- + distances, array of shape (n_samples_1, n_samples_2) + """ + X = np.asanyarray(X) + Y = np.asanyarray(Y) + if Y is None: + Y = X + if X.shape[1] != Y.shape[1]: + raise ValueError, "incompatible dimension for X and Y matrices" + + XX = np.sum(X * X, axis=1)[:, np.newaxis] + if Y is None: + YY = XX.T + else: + YY = np.sum(Y * Y, axis=1)[np.newaxis, :] + distances = XX + YY # Using broadcasting + distances -= 2 * np.dot(X, Y.T) + distances = np.maximum(distances, 0) + distances = np.sqrt(distances) + return distances diff --git a/scikits/learn/metrics/tests/__init__.py b/scikits/learn/metrics/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scikits/learn/tests/test_metrics.py b/scikits/learn/metrics/tests/test_metrics.py similarity index 99% rename from scikits/learn/tests/test_metrics.py rename to scikits/learn/metrics/tests/test_metrics.py index f3c8896651d5c508ac54929819c1a821ae622b90..212e4b2a1a8f4eaf21b80230e38f66fc86c31f37 100644 --- a/scikits/learn/tests/test_metrics.py +++ b/scikits/learn/metrics/tests/test_metrics.py @@ -7,8 +7,8 @@ from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal from numpy.testing import assert_equal, assert_almost_equal -from .. import datasets -from .. import svm +from ... import datasets +from ... 
+from ... import svm
 from ..metrics import auc
 from ..metrics import classification_report
 from ..metrics import confusion_matrix
diff --git a/scikits/learn/metrics/tests/test_pairwise.py b/scikits/learn/metrics/tests/test_pairwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d8b1dc54eb8d033517e61536b116691ffe8ae87
--- /dev/null
+++ b/scikits/learn/metrics/tests/test_pairwise.py
@@ -0,0 +1,11 @@
+from numpy.testing import assert_array_almost_equal
+
+from ..pairwise import euclidian_distances
+
+
+def test_euclidian_distances():
+    """Check the pairwise Euclidean distances computation"""
+    X = [[0]]
+    Y = [[1], [2]]
+    D = euclidian_distances(X, Y)
+    assert_array_almost_equal(D, [[1., 2.]])
diff --git a/scikits/learn/setup.py b/scikits/learn/setup.py
index b50541f2ae919a23b494ac73d1fc6537421c7e9e..b2e940a486a73e66103b892ca5c98c39f2016ef2 100644
--- a/scikits/learn/setup.py
+++ b/scikits/learn/setup.py
@@ -3,6 +3,7 @@ import warnings
 import numpy
 import sys
 
+
 def configuration(parent_package='', top_path=None):
     from numpy.distutils.misc_util import Configuration
     from numpy.distutils.system_info import get_info, BlasNotFoundError
@@ -26,14 +27,15 @@ def configuration(parent_package='', top_path=None):
     config.add_subpackage('externals')
     config.add_subpackage('gaussian_process')
     config.add_subpackage('gaussian_process/tests')
+    config.add_subpackage('metrics')
+    config.add_subpackage('metrics/tests')
 
     # some libs needs cblas, fortran-compiled BLAS will not be sufficient
     blas_info = get_info('blas_opt', 0)
     if (not blas_info) or (
-        ('NO_ATLAS_INFO', 1) in blas_info.get('define_macros', [])) :
+        ('NO_ATLAS_INFO', 1) in blas_info.get('define_macros', [])):
         config.add_library('cblas',
-                           sources=[join('src', 'cblas', '*.c')]
-                           )
+                           sources=[join('src', 'cblas', '*.c')])
         cblas_libs = ['cblas']
         blas_info.pop('libraries', None)
         warnings.warn(BlasNotFoundError.__doc__)
@@ -43,8 +45,7 @@
 
     config.add_extension('ball_tree',
                          sources=[join('src', 'BallTree.cpp')],
-                         include_dirs=[numpy.get_include()]
-                         )
+                         include_dirs=[numpy.get_include()])
 
     # the following packages depend on cblas, so they have to be build