From ca1fa138ebf796335f1dee8ee0f9a8cf85d90b77 Mon Sep 17 00:00:00 2001
From: Fabian Pedregosa <fabian.pedregosa@inria.fr>
Date: Tue, 5 Jan 2010 15:30:22 +0000
Subject: [PATCH] * bump to 0.5.6
 * various cosmetic changes

From: cdavid <cdavid@cb17146a-f446-4be1-a4f7-bd7c5bb65646>

git-svn-id: https://scikit-learn.svn.sourceforge.net/svnroot/scikit-learn/trunk@92 22fbfee3-77ab-4535-9bad-27d1bd3bc7d8
---
 scikits/learn/pyem/Changelog                       |  8 ++
 scikits/learn/pyem/TODO                            | 10 +--
 scikits/learn/pyem/_c_densities.py                 | 10 +--
 scikits/learn/pyem/densities.py                    |  2 +-
 scikits/learn/pyem/gauss_mix.py                    | 53 +++++++++---
 scikits/learn/pyem/gmm_em.py                       | 83 +++++++++++++++++--
 scikits/learn/pyem/info.py                         |  2 +-
 scikits/learn/pyem/misc.py                         | 21 +++++
 .../pyem/profile_data/profile_densities.py         |  4 +-
 .../learn/pyem/profile_data/profile_gmm.py         |  6 +-
 scikits/learn/pyem/setup.py                        |  9 +-
 scikits/learn/pyem/test_reg.py                     | 44 ++++++++++
 scikits/learn/pyem/tests/test_densities.py         |  2 +-
 13 files changed, 209 insertions(+), 45 deletions(-)
 create mode 100644 scikits/learn/pyem/misc.py
 create mode 100644 scikits/learn/pyem/test_reg.py

diff --git a/scikits/learn/pyem/Changelog b/scikits/learn/pyem/Changelog
index f15c970fdf..bca9b599c9 100644
--- a/scikits/learn/pyem/Changelog
+++ b/scikits/learn/pyem/Changelog
@@ -1,3 +1,11 @@
+pyem (0.5.6) Thu, 16 Nov 2006 14:18:19 +0900
+
+        * bump to 0.5.6
+        * Add __str__ and __repr__ for GM and GMM classes
+        * Add regularization methods (not used yet)
+        * Change '<f8' to N.float64 for the ctypes-enabled densities
+        * Move 'magic numbers' into a separate python file, misc.py
+
 pyem (0.5.5) Tue, 24 Oct 2006 18:30:54 +0900

         * Fix a bug in multiple_gaussian_den which prevents

diff --git a/scikits/learn/pyem/TODO b/scikits/learn/pyem/TODO
index 030c467f61..49489e0e1c 100644
--- a/scikits/learn/pyem/TODO
+++ b/scikits/learn/pyem/TODO
@@ -1,12 +1,12 @@
-# Last Change: Fri Oct 20 12:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J

 Things which must be implemented for a 1.0 version (in order of importance):
-   - A class for learning + a classifier
-   - test for various length and model size of gaussian densities/GM and GMM
-   - a small help/tutorial
-   - be complient with scipy module dev guidelines (DEVELOPERS.TXT)
+   - A classifier
+   - basic regularization

 Things which would be nice (after 1.0 version):
+   - Bayes prior (hard, supposes MCMC)
+   - variational Bayes (hard? Not sure yet)
    - Integrate libem (libem should be modified so that it would be easy to
    package and distribute)
    - Other initialization schemes

diff --git a/scikits/learn/pyem/_c_densities.py b/scikits/learn/pyem/_c_densities.py
index c9f35d4b1c..7979532c0a 100644
--- a/scikits/learn/pyem/_c_densities.py
+++ b/scikits/learn/pyem/_c_densities.py
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Thu Oct 19 06:00 PM 2006 J
+# Last Change: Thu Nov 09 05:00 PM 2006 J

 # This module uses a C implementation through ctypes, for diagonal cases
 # TODO:
@@ -26,12 +26,12 @@ if ctypes_major < 1:

 # Requirements for diag gden
 _gden = load_library('c_gden.so', __file__)
-arg1 = ndpointer(dtype='<f8')
+arg1 = ndpointer(dtype=N.float64)
 arg2 = c_uint
 arg3 = c_uint
-arg4 = ndpointer(dtype='<f8')
-arg5 = ndpointer(dtype='<f8')
-arg6 = ndpointer(dtype='<f8')
+arg4 = ndpointer(dtype=N.float64)
+arg5 = ndpointer(dtype=N.float64)
+arg6 = ndpointer(dtype=N.float64)

 _gden.gden_diag.argtypes = [arg1, arg2, arg3, arg4, arg5, arg6]
 _gden.gden_diag.restype = c_int
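A note on the '<f8' -> N.float64 change above: '<f8' hard-codes a little-endian
8-byte float, while N.float64 means the platform's native double, so the ctypes
wrappers keep working on big-endian machines. A minimal sketch of the check that
ndpointer performs (no compiled c_gden library is needed; the arrays here are
made up for the demo):

    import numpy as N
    from numpy.ctypeslib import ndpointer

    # ndpointer builds a ctypes-compatible type; ctypes calls its
    # from_param on every argument, so the dtype check can be tried directly.
    arg = ndpointer(dtype=N.float64)
    arg.from_param(N.zeros(4, dtype=N.float64))    # accepted: native double
    try:
        arg.from_param(N.zeros(4, dtype=N.float32))
    except TypeError, e:
        print "rejected:", e    # wrong dtype is refused before the C call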
diff --git a/scikits/learn/pyem/densities.py b/scikits/learn/pyem/densities.py
index df9ef2522a..6adcfa74e3 100644
--- a/scikits/learn/pyem/densities.py
+++ b/scikits/learn/pyem/densities.py
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Thu Oct 05 07:00 PM 2006 J
+# Last Change: Fri Nov 10 10:00 AM 2006 J

 import numpy as N
 import numpy.linalg as lin

diff --git a/scikits/learn/pyem/gauss_mix.py b/scikits/learn/pyem/gauss_mix.py
index 79d44ce442..fdc66be1c4 100644
--- a/scikits/learn/pyem/gauss_mix.py
+++ b/scikits/learn/pyem/gauss_mix.py
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Tue Oct 03 06:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J

 # Module to implement GaussianMixture class.

@@ -7,9 +7,7 @@ import numpy as N
 from numpy.random import randn, rand
 import numpy.linalg as lin
 import densities
-
-MAX_DEV = 1e-10
-MAX_COND = 1e10
+from misc import _MAX_DBL_DEV

 # Right now, two main usages of a Gaussian Model are possible
 # - init a Gaussian Model with meta-parameters, and train it
@@ -17,9 +15,6 @@
 # of confidences. In this case, we would like to init it with
 # known values of parameters. This can be done with the class method
 # fromvalues
-#
-# For now, we have to init with meta-parameters, and set
-# the parameters afterward. There should be a better way ?

 # TODO:
 # - change bounds methods of GM class instantiations so that it cannot
@@ -48,16 +43,20 @@
 class GM:
     """Gaussian Mixture class. This is a simple container class to hold
     Gaussian Mixture parameters (weights, mean, etc...). It can also draw
     itself (confidence ellipses) and sample itself.
-
-    Is initiated by giving dimension, number of components and
-    covariance mode"""
+    """

     # I am not sure it is useful to have a spherical mode...
     _cov_mod = ['diag', 'full']

+    #===============================
+    # Methods to construct a mixture
+    #===============================
     def __init__(self, d, k, mode = 'diag'):
-        """Init a Gaussian model of k components, each component being a
-        d multi-variate Gaussian, with covariance matrix of style mode"""
+        """Init a Gaussian Mixture of k components, each component being
+        a d-dimensional multivariate Gaussian, with a covariance matrix
+        of style mode.
+
+        If you want to build a Gaussian Mixture with known weights, means
+        and variances, you can use the GM.fromvalues class method directly."""

         if mode not in self._cov_mod:
             raise GmParamError("mode %s not recognized" % str(mode))
@@ -116,6 +115,9 @@ class GM:
         res.set_param(weights, mu, sigma)
         return res

+    #=====================================================
+    # Fundamental facilities (sampling, confidence, etc..)
+    #=====================================================
     def sample(self, nframes):
         """ Sample nframes frames from the model """
         if not self.is_valid:
@@ -242,6 +244,15 @@ class GM:

     gen_param = classmethod(gen_param)

+    #=======================
+    # Regularization methods
+    #=======================
+    def _regularize(self):
+        raise NotImplementedError("No regularization implemented yet")
+
+    #=================
+    # Plotting methods
+    #=================
     def plot(self, *args, **kargs):
         """Plot the ellipsoids directly for the model

@@ -327,6 +338,22 @@ class GM:
         except ImportError:
             raise GmParamError("matplotlib not found, cannot plot...")

+    # Syntactic sugar
+    def __repr__(self):
+        msg  = ""
+        msg += "Gaussian Mixture:\n"
+        msg += " -> %d dimensions\n" % self.d
+        msg += " -> %d components\n" % self.k
+        msg += " -> %s covariance\n" % self.mode
+        if self.is_valid:
+            msg += "Has initial values"
+        else:
+            msg += "Has no initial values yet"
+        return msg
+
+    def __str__(self):
+        return self.__repr__()
+
 # Function to generate a random index: this is kept outside any class,
 # as the function can be useful for other purposes
 def gen_rand_index(p, n):
@@ -366,7 +393,7 @@ def check_gmm_param(w, mu, va):
     """
     # Check that w is valid
-    if N.fabs(N.sum(w, 0) - 1) > MAX_DEV:
+    if N.fabs(N.sum(w, 0) - 1) > _MAX_DBL_DEV:
         raise GmParamError('weight does not sum to 1')

     if not len(w.shape) == 1:
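The new __init__ docstring above points users at GM.fromvalues for building a
mixture directly from known parameters, and __repr__/__str__ make the object
printable. A short usage sketch (parameter values are made up; diagonal mode is
assumed, so va holds one row of variances per component, mirroring the usage in
test_reg.py below):

    import numpy as N
    from scipy.sandbox.pyem import GM

    w  = N.array([0.3, 0.7])               # weights, must sum to 1
    mu = N.array([[0., 0.], [3., 3.]])     # one mean per row
    va = N.array([[1., 1.], [0.5, 0.5]])   # diagonal variances, one row per component
    gm = GM.fromvalues(w, mu, va)          # skip the init-then-set_param dance
    data = gm.sample(500)                  # draw 500 frames from the mixture
    print gm                               # uses the __repr__ added by this patch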
+ #===================================================== def sample(self, nframes): """ Sample nframes frames from the model """ if not self.is_valid: @@ -242,6 +244,15 @@ class GM: gen_param = classmethod(gen_param) + #======================= + # Regularization methods + #======================= + def _regularize(self): + raise NotImplemented("No regularization") + + #================= + # Plotting methods + #================= def plot(self, *args, **kargs): """Plot the ellipsoides directly for the model @@ -327,6 +338,22 @@ class GM: except ImportError: raise GmParamError("matplotlib not found, cannot plot...") + # Syntactic sugar + def __repr__(self): + repr = "" + repr += "Gaussian Mixture:\n" + repr += " -> %d dimensions\n" % self.d + repr += " -> %d components\n" % self.k + repr += " -> %s covariance \n" % self.mode + if self.is_valid: + repr += "Has initial values""" + else: + repr += "Has no initial values yet""" + return repr + + def __str__(self): + return self.__repr__() + # Function to generate a random index: this is kept outside any class, # as the function can be useful for other def gen_rand_index(p, n): @@ -366,7 +393,7 @@ def check_gmm_param(w, mu, va): """ # Check that w is valid - if N.fabs(N.sum(w, 0) - 1) > MAX_DEV: + if N.fabs(N.sum(w, 0) - 1) > _MAX_DBL_DEV: raise GmParamError('weight does not sum to 1') if not len(w.shape) == 1: diff --git a/scikits/learn/pyem/gmm_em.py b/scikits/learn/pyem/gmm_em.py index 0617fb9713..d5d6aa9f67 100644 --- a/scikits/learn/pyem/gmm_em.py +++ b/scikits/learn/pyem/gmm_em.py @@ -1,5 +1,5 @@ # /usr/bin/python -# Last Change: Tue Oct 24 06:00 PM 2006 J +# Last Change: Thu Nov 16 02:00 PM 2006 J # TODO: # - which methods to avoid va shrinking to 0 ? There are several options, @@ -15,6 +15,8 @@ import densities from kmean import kmean from gauss_mix import GM +from misc import _DEF_ALPHA, _MIN_DBL_DELTA, _MIN_INV_COND + # Error classes class GmmError(Exception): """Base class for exceptions in this module.""" @@ -38,12 +40,9 @@ class GmmParamError: # sense to use inheritance for # interface specification in python, since its # dynamic type systeme. -# Anyway, a mixture class should encapsulates all details concerning a mixture model: -# - internal parameters for the pdfs -# - can compute sufficient statistics for EM -# - can sample a model -# - can generate random valid parameters for a new pdf (using class method) -class MixtureModel: +# Anyway, a mixture model class should encapsulates all details +# concerning getting sufficient statistics (SS), likelihood and bic. +class MixtureModel(object): pass class ExpMixtureModel(MixtureModel): @@ -90,7 +89,7 @@ class GMM(ExpMixtureModel): """ Init the model at random.""" k = self.gm.k d = self.gm.d - if mode == 'diag': + if self.gm.mode == 'diag': w = N.ones(k) / k mu = randn(k, d) va = N.fabs(randn(k, d)) @@ -120,6 +119,7 @@ class GMM(ExpMixtureModel): self.init = init_methods[init] self.isinit = False + self.initst = init def sufficient_statistics(self, data): """ Return normalized and non-normalized sufficient statistics @@ -168,7 +168,10 @@ class GMM(ExpMixtureModel): elif self.gm.mode == 'full': # In full mode, this is the bottleneck: the triple loop # kills performances. This is pretty straightforward - # algebra, so computing it in C should not be too difficult + # algebra, so computing it in C should not be too difficult. The + # real problem is to have valid covariance matrices, and to keep + # them positive definite, maybe with special storage... 
diff --git a/scikits/learn/pyem/info.py b/scikits/learn/pyem/info.py
index 511df8ad0a..080edc6c41 100644
--- a/scikits/learn/pyem/info.py
+++ b/scikits/learn/pyem/info.py
@@ -60,7 +60,7 @@ Bibliography:
 Copyright: David Cournapeau 2006
 License: BSD-style (see LICENSE.txt in main source directory)
 """
-version = '0.5.5'
+version = '0.5.6'

 depends = ['linalg', 'stats']
 ignore = False

diff --git a/scikits/learn/pyem/misc.py b/scikits/learn/pyem/misc.py
new file mode 100644
index 0000000000..37549160f0
--- /dev/null
+++ b/scikits/learn/pyem/misc.py
@@ -0,0 +1,21 @@
+# Last Change: Fri Nov 10 10:00 AM 2006 J
+
+#=====================================================================
+# "Magic numbers", that is the constants used to control
+# regularization and the like. Change them at your own risk!
+#=====================================================================
+
+# max deviation allowed when comparing doubles (this is crude; comparing
+# a number of decimals would be a better criterion)
+_MAX_DBL_DEV = 1e-10
+
+# max condition number allowed
+_MAX_COND = 1e8
+_MIN_INV_COND = 1/_MAX_COND
+
+# Default alpha for regularization
+_DEF_ALPHA = 1e-1
+
+# Default min delta for regularization
+_MIN_DBL_DELTA = 1e-5
+
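_MIN_INV_COND is what the (commented-out) training loop in gmm_em.py compares
diagonal variances against before regularizing. A small sketch of that guard,
assuming variances of order 1 so that a tiny entry signals an ill-conditioned
covariance (the helper name is made up for illustration):

    import numpy as N

    _MAX_COND     = 1e8          # from misc.py
    _MIN_INV_COND = 1/_MAX_COND

    def needs_regularization(va):
        # For a diagonal covariance the condition number is max(va)/min(va),
        # so a variance below 1/_MAX_COND (relative to O(1) variances) means
        # the matrix is close to singular. Note the elementwise test.
        return (va < _MIN_INV_COND).any()

    print needs_regularization(N.array([1.0, 2.0]))     # False
    print needs_regularization(N.array([1.0, 1e-12]))   # True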
diff --git a/scikits/learn/pyem/profile_data/profile_densities.py b/scikits/learn/pyem/profile_data/profile_densities.py
index 013e4045aa..f2f0c47b25 100644
--- a/scikits/learn/pyem/profile_data/profile_densities.py
+++ b/scikits/learn/pyem/profile_data/profile_densities.py
@@ -1,7 +1,7 @@
 import numpy as N
 from numpy.random import randn
-from pyem import densities as D
-from pyem import _c_densities as DC
+from scipy.sandbox.pyem import densities as D
+from scipy.sandbox.pyem import _c_densities as DC
 #import tables

 def bench(func, mode = 'diag'):

diff --git a/scikits/learn/pyem/profile_data/profile_gmm.py b/scikits/learn/pyem/profile_data/profile_gmm.py
index 93f10704c0..b9f260a5ba 100644
--- a/scikits/learn/pyem/profile_data/profile_gmm.py
+++ b/scikits/learn/pyem/profile_data/profile_gmm.py
@@ -1,16 +1,14 @@
 import numpy as N
-from pyem import GM, GMM
+from scipy.sandbox.pyem import GM, GMM
 import copy

-from pyem._c_densities import gauss_den
-
 def bench1(mode = 'diag'):
     #===========================================
     # GMM of 30 components, 15 dimensions, 1e5 frames
     #===========================================
     d = 15
     k = 30
-    nframes = 1e4
+    nframes = 1e5
     niter = 10
     mode = 'diag'

diff --git a/scikits/learn/pyem/setup.py b/scikits/learn/pyem/setup.py
index 548c06a2a6..6bfeb3c258 100644
--- a/scikits/learn/pyem/setup.py
+++ b/scikits/learn/pyem/setup.py
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Thu Oct 19 07:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J

 # TODO:
 # - check how to handle cmd line build options with distutils and use
 #   them in the building process
@@ -16,14 +16,14 @@ VERSION = pyem_version
 DESCRIPTION  = 'A python module for Expectation Maximization learning of mixture pdfs'
 AUTHOR       = 'David Cournapeau'
 AUTHOR_EMAIL = 'david@ar.media.kyoto-u.ac.jp'
-URL          = 'http://ar.media.kyoto-u.ac.jp/members/david',
+URL          = 'http://ar.media.kyoto-u.ac.jp/members/david/softwares/pyem'

 def configuration(parent_package='', top_path=None, package_name='pyem'):
     from numpy.distutils.misc_util import Configuration
     config = Configuration(package_name, parent_package, top_path,
              version = VERSION)
     config.add_data_dir('tests')
-    config.add_subpackage('profile_data')
+    config.add_data_dir('profile_data')
     config.add_extension('c_gden',
                          #define_macros=[('LIBSVM_EXPORTS', None),
                          #               ('LIBSVM_DLL', None)],
@@ -34,7 +34,8 @@ def configuration(parent_package='', top_path=None, package_name='pyem'):
 if __name__ == "__main__":
     from numpy.distutils.core import setup
     #setup(**configuration(top_path='').todict())
-    setup(**configuration(top_path='',))
+    #setup(**configuration(top_path=''))
+    setup(configuration=configuration)
 # from distutils.core import setup, Extension
 # from pyem import version as pyem_version
 #
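The last setup.py hunk hands numpy.distutils the configuration callable itself
rather than expanding its result. A minimal standalone sketch of this pattern
(the package name 'example_pkg' and the data dir are placeholders):

    # setup.py
    def configuration(parent_package='', top_path=None):
        from numpy.distutils.misc_util import Configuration
        config = Configuration('example_pkg', parent_package, top_path)
        config.add_data_dir('tests')    # ship as data files, not a subpackage
        return config

    if __name__ == "__main__":
        from numpy.distutils.core import setup
        # passing the callable lets numpy.distutils invoke it with the
        # right top_path instead of us guessing it at import time
        setup(configuration=configuration)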
diff --git a/scikits/learn/pyem/test_reg.py b/scikits/learn/pyem/test_reg.py
new file mode 100644
index 0000000000..026c1def7e
--- /dev/null
+++ b/scikits/learn/pyem/test_reg.py
@@ -0,0 +1,44 @@
+import numpy as N
+
+from gauss_mix import GM
+from gmm_em import GMM, EM
+
+from numpy.random import seed
+
+def test_reg():
+    seed(0)
+    # Generate data with a few components
+    d = 2
+    k = 1
+    n = 500
+
+    w, mu, va = GM.gen_param(d, k)
+    gm = GM.fromvalues(w, mu, va)
+
+    data = gm.sample(n)
+
+    # Try to learn with an insane number of components
+    gmm = GMM(GM(d, 30), 'random')
+
+    em = EM()
+    like = em.train(data, gmm, 20, 1e-20)
+
+    # import pylab as P
+    # P.subplot(2, 1, 1)
+    # P.plot(data[:, 0], data[:, 1], '.')
+    # gmm.gm.plot()
+    # P.subplot(2, 1, 2)
+    # P.plot(like)
+    # print like
+    # P.show()
+
+if __name__ == "__main__":
+    # import hotshot, hotshot.stats
+    # profile_file = 'manyk.prof'
+    # prof = hotshot.Profile(profile_file, lineevents=1)
+    # prof.runcall(test_reg)
+    # p = hotshot.stats.load(profile_file)
+    # print p.sort_stats('cumulative').print_stats(20)
+    # prof.close()
+    test_reg()
+

diff --git a/scikits/learn/pyem/tests/test_densities.py b/scikits/learn/pyem/tests/test_densities.py
index c383356d2b..02fec1b57b 100644
--- a/scikits/learn/pyem/tests/test_densities.py
+++ b/scikits/learn/pyem/tests/test_densities.py
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Thu Oct 19 07:00 PM 2006 J
+# Last Change: Thu Nov 09 05:00 PM 2006 J

 # TODO:
 # - having "fake tests" to check that all modes (scalar, diag and full) are
--
GitLab