diff --git a/scikits/learn/pyem/Changelog b/scikits/learn/pyem/Changelog
index f15c970fdf742434f7bf8299ff4f6ace555d309d..bca9b599c979fd02c0f62c122f685903db518098 100644
--- a/scikits/learn/pyem/Changelog
+++ b/scikits/learn/pyem/Changelog
@@ -1,3 +1,11 @@
+pyem (0.5.6) Thu, 16 Nov 2006 14:18:19 +0900
+
+  * bump to 0.5.6
+  * Add __str__ and __repr__ for GM and GMM classes
+  * Add regularization method (but not used yet).
+  * Change '<f8' to N.float64 for ctypes-enabled densities
+  * Move 'Magic numbers' into a separate python file, misc.py
+
 pyem (0.5.5) Tue, 24 Oct 2006 18:30:54 +0900
 
   * Fix a bug inmultiple_gaussian_den which prevents
diff --git a/scikits/learn/pyem/TODO b/scikits/learn/pyem/TODO
index 030c467f613ee7b7c883ea6c4285a0c03f8fb340..49489e0e1c1ebcd8e8a2b3321daa6ced4672be01 100644
--- a/scikits/learn/pyem/TODO
+++ b/scikits/learn/pyem/TODO
@@ -1,12 +1,12 @@
-# Last Change: Fri Oct 20 12:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J
 
 Things which must be implemented for a 1.0 version (in importante order)
-    - A class for learning + a classifier
-    - test for various length and model size of gaussian densities/GM and GMM
-    - a small help/tutorial
-    - be complient with scipy module dev guidelines (DEVELOPERS.TXT)
+    - A classifier
+    - basic regularization
 
 Things which would be nice (after 1.0 version):
+    - Bayes prior (hard, supposes MCMC)
+    - variational Bayes (hard ? Not sure yet)
     - Integrate libem (libem should be modified so that it would be easy to
     package and distribute)
     - Other initialization schemes
diff --git a/scikits/learn/pyem/_c_densities.py b/scikits/learn/pyem/_c_densities.py
index c9f35d4b1c6fdaee22cbbe141c52fb2ce57b269d..7979532c0a9354899545a3f46fa17535b7af580e 100644
--- a/scikits/learn/pyem/_c_densities.py
+++ b/scikits/learn/pyem/_c_densities.py
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Thu Oct 19 06:00 PM 2006 J
+# Last Change: Thu Nov 09 05:00 PM 2006 J
 
 # This module uses a C implementation through ctypes, for diagonal cases
 # TODO:
@@ -26,12 +26,12 @@ if ctypes_major < 1:
 
 # Requirements for diag gden
 _gden = load_library('c_gden.so', __file__)
-arg1 = ndpointer(dtype='<f8')
+arg1 = ndpointer(dtype=N.float64)
 arg2 = c_uint
 arg3 = c_uint
-arg4 = ndpointer(dtype='<f8')
-arg5 = ndpointer(dtype='<f8')
-arg6 = ndpointer(dtype='<f8')
+arg4 = ndpointer(dtype=N.float64)
+arg5 = ndpointer(dtype=N.float64)
+arg6 = ndpointer(dtype=N.float64)
 _gden.gden_diag.argtypes = [arg1, arg2, arg3, arg4, arg5, arg6]
 _gden.gden_diag.restype = c_int
 
diff --git a/scikits/learn/pyem/densities.py b/scikits/learn/pyem/densities.py
index df9ef2522afc5e6e9f3cebefb1026321a7c8c5a8..6adcfa74e3b97c7cd4bc1f37b73455078de499cf 100644
--- a/scikits/learn/pyem/densities.py
+++ b/scikits/learn/pyem/densities.py
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Thu Oct 05 07:00 PM 2006 J
+# Last Change: Fri Nov 10 10:00 AM 2006 J
 
 import numpy as N
 import numpy.linalg as lin
diff --git a/scikits/learn/pyem/gauss_mix.py b/scikits/learn/pyem/gauss_mix.py
index 79d44ce442ff5ee3e6f6c0e91addd4a35fa1b766..fdc66be1c47a51c59681c79baf6e186dcbfea6b6 100644
--- a/scikits/learn/pyem/gauss_mix.py
+++ b/scikits/learn/pyem/gauss_mix.py
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Tue Oct 03 06:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J
 
 # Module to implement GaussianMixture class.
 
@@ -7,9 +7,7 @@ import numpy as N
 from numpy.random import randn, rand
 import numpy.linalg as lin
 import densities
-
-MAX_DEV = 1e-10
-MAX_COND = 1e10
+from misc import _MAX_DBL_DEV
 
 # Right now, two main usages of a Gaussian Model are possible
 #   - init a Gaussian Model with meta-parameters, and trains it
@@ -17,9 +15,6 @@ MAX_COND = 1e10
 #   of confidences. In this case, we would like to init it with
 #   known values of parameters. This can be done with the class method
 #   fromval
-#
-# For now, we have to init with meta-parameters, and set
-# the parameters afterward. There should be a better way ?
 
 # TODO:
 # - change bounds methods of GM class instanciations so that it cannot
@@ -48,16 +43,20 @@ class GM:
     """Gaussian Mixture class. This is a simple container class to hold
     Gaussian Mixture parameters (weights, mean, etc...). It can also draw
     itself (confidence ellipses) and samples itself.
-
-    Is initiated by giving dimension, number of components and
-    covariance mode"""
+    """
 
     # I am not sure it is useful to have a spherical mode...
    _cov_mod = ['diag', 'full']
 
+    #===============================
+    # Methods to construct a mixture
+    #===============================
     def __init__(self, d, k, mode = 'diag'):
-        """Init a Gaussian model of k components, each component being a
-        d multi-variate Gaussian, with covariance matrix of style mode"""
+        """Init a Gaussian Mixture of k components, each component being a
+        d multi-variate Gaussian, with covariance matrix of style mode.
+
+        If you want to build a Gaussian Mixture with known weights, means
+        and variances, you can use the GM.fromvalues method directly"""
         if mode not in self._cov_mod:
             raise GmParamError("mode %s not recognized" + str(mode))
 
@@ -116,6 +115,9 @@ class GM:
         res.set_param(weights, mu, sigma)
         return res
 
+    #=====================================================
+    # Fundamental facilities (sampling, confidence, etc..)
+    #=====================================================
     def sample(self, nframes):
         """ Sample nframes frames from the model """
         if not self.is_valid:
@@ -242,6 +244,15 @@ class GM:
 
     gen_param = classmethod(gen_param)
 
+    #=======================
+    # Regularization methods
+    #=======================
+    def _regularize(self):
+        raise NotImplementedError("No regularization")
+
+    #=================
+    # Plotting methods
+    #=================
     def plot(self, *args, **kargs):
         """Plot the ellipsoides directly for the model
 
@@ -327,6 +338,22 @@ class GM:
         except ImportError:
             raise GmParamError("matplotlib not found, cannot plot...")
 
+    # Syntactic sugar
+    def __repr__(self):
+        msg = ""
+        msg += "Gaussian Mixture:\n"
+        msg += " -> %d dimensions\n" % self.d
+        msg += " -> %d components\n" % self.k
+        msg += " -> %s covariance\n" % self.mode
+        if self.is_valid:
+            msg += "Has initial values"
+        else:
+            msg += "Has no initial values yet"
+        return msg
+
+    def __str__(self):
+        return self.__repr__()
+
 # Function to generate a random index: this is kept outside any class,
 # as the function can be useful for other
 def gen_rand_index(p, n):
@@ -366,7 +393,7 @@ def check_gmm_param(w, mu, va):
     """
 
     # Check that w is valid
-    if N.fabs(N.sum(w, 0) - 1) > MAX_DEV:
+    if N.fabs(N.sum(w, 0) - 1) > _MAX_DBL_DEV:
         raise GmParamError('weight does not sum to 1')
 
     if not len(w.shape) == 1:
diff --git a/scikits/learn/pyem/gmm_em.py b/scikits/learn/pyem/gmm_em.py
index 0617fb97136e5436e0f6e5424064e52d9a04e323..d5d6aa9f676349315256727764c5681d1c2a0faa 100644
--- a/scikits/learn/pyem/gmm_em.py
+++ b/scikits/learn/pyem/gmm_em.py
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Tue Oct 24 06:00 PM 2006 J
+# Last Change: Thu Nov 16 02:00 PM 2006 J
 
 # TODO:
 # - which methods to avoid va shrinking to 0 ? There are several options,
@@ -15,6 +15,8 @@ import densities
 from kmean import kmean
 from gauss_mix import GM
 
+from misc import _DEF_ALPHA, _MIN_DBL_DELTA, _MIN_INV_COND
+
 # Error classes
 class GmmError(Exception):
     """Base class for exceptions in this module."""
@@ -38,12 +40,9 @@ class GmmParamError:
 # sense to use inheritance for
 # interface specification in python, since its
 # dynamic type systeme.
-# Anyway, a mixture class should encapsulates all details concerning a mixture model:
-#   - internal parameters for the pdfs
-#   - can compute sufficient statistics for EM
-#   - can sample a model
-#   - can generate random valid parameters for a new pdf (using class method)
-class MixtureModel:
+# Anyway, a mixture model class should encapsulate all details
+# concerning getting sufficient statistics (SS), likelihood and BIC.
+class MixtureModel(object):
     pass
 
 class ExpMixtureModel(MixtureModel):
@@ -90,7 +89,7 @@ class GMM(ExpMixtureModel):
         """ Init the model at random."""
         k = self.gm.k
         d = self.gm.d
-        if mode == 'diag':
+        if self.gm.mode == 'diag':
             w = N.ones(k) / k
             mu = randn(k, d)
             va = N.fabs(randn(k, d))
@@ -120,6 +119,7 @@ class GMM(ExpMixtureModel):
 
         self.init = init_methods[init]
         self.isinit = False
+        self.initst = init
 
     def sufficient_statistics(self, data):
         """ Return normalized and non-normalized sufficient statistics
@@ -168,7 +168,10 @@ class GMM(ExpMixtureModel):
         elif self.gm.mode == 'full':
             # In full mode, this is the bottleneck: the triple loop
             # kills performances. This is pretty straightforward
-            # algebra, so computing it in C should not be too difficult
+            # algebra, so computing it in C should not be too difficult. The
+            # real problem is to have valid covariance matrices, and to keep
+            # them positive definite, maybe with special storage... Not sure
+            # it is really worth the risk
 
             mu = N.zeros((k, d))
             va = N.zeros((k*d, d))
@@ -239,6 +242,13 @@ class GMM(ExpMixtureModel):
         n = N.shape(data)[0]
         return bic(lk, free_deg, n)
 
+    # syntactic sugar
+    def __repr__(self):
+        repre = ""
+        repre += "Gaussian Mixture Model\n"
+        repre += " -> initialized by %s\n" % str(self.initst)
+        repre += self.gm.__repr__()
+        return repre
 
 class EM:
     """An EM trainer. An EM trainer
@@ -265,6 +275,8 @@ class EM:
 
             Returns: likelihood (one value per iteration).
         """
+        if not isinstance(model, MixtureModel):
+            raise TypeError("expected a MixtureModel instance as model")
         # Initialize the data (may do nothing depending on the model)
         model.init(data)
@@ -282,9 +294,62 @@ class EM:
             model.update_em(data, g)
             if has_em_converged(like[i], like[i-1], thresh):
                 return like[0:i]
+        # # Em computation, with computation of the likelihood
+        # g, tgd = model.sufficient_statistics(data)
+        # like[0] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+        # model.update_em(data, g)
+        # for i in range(1, maxiter):
+        #     print "=== Iteration %d ===" % i
+        #     isreg = False
+        #     for j in range(model.gm.k):
+        #         va = model.gm.va[j]
+        #         if va.any() < _MIN_INV_COND:
+        #             isreg = True
+        #             print "\tregularization detected"
+        #             print "\t" + str(va)
+        #             model.gm.va[j] = regularize_diag(va)
+        #             print "\t" + str(va) + ", " + str(model.gm.va[j])
+        #             print "\t" + str(gauss_den(data, model.gm.mu[j], model.gm.va[j]))
+        #             print "\tend regularization detected"
+        #             var = va
+        #
+        #     g, tgd = model.sufficient_statistics(data)
+        #     try:
+        #         assert not( (N.isnan(tgd)).any() )
+        #         if isreg:
+        #             print var
+        #     except AssertionError:
+        #         print "tgd is nan..."
+        #         print model.gm.va[13,:]
+        #         print 1/model.gm.va[13,:]
+        #         print densities.gauss_den(data, model.gm.mu[13], model.gm.va[13])
+        #         print N.isnan((multiple_gauss_den(data, model.gm.mu, model.gm.va))).any()
+        #         print "Exiting"
+        #         import sys
+        #         sys.exit(-1)
+        #     like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
+        #     model.update_em(data, g)
+        #     assert not( model.gm.va.any() < 1e-6)
+        #     if has_em_converged(like[i], like[i-1], thresh):
+        #         return like[0:i]
 
         return like
 
+def regularize_diag(variance, alpha = _DEF_ALPHA):
+    delta = N.sum(variance) / variance.size   # mean of the diagonal variances
+    if delta > _MIN_DBL_DELTA:
+        return variance + alpha * delta
+    else:
+        return variance + alpha * _MIN_DBL_DELTA
+
+def regularize_full(variance, alpha = _DEF_ALPHA):
+    # Trace of a positive definite matrix is always > 0
+    delta = N.trace(variance) / variance.shape[0]
+    if delta > _MIN_DBL_DELTA:
+        return variance + alpha * delta * N.eye(variance.shape[0])
+    else:
+        return variance + alpha * _MIN_DBL_DELTA * N.eye(variance.shape[0])
+
 # Misc functions
 def bic(lk, deg, n):
     """ Expects lk to be log likelihood """
diff --git a/scikits/learn/pyem/info.py b/scikits/learn/pyem/info.py
index 511df8ad0aa94da99347a25e397151f545898e1d..080edc6c4155ac8aca7ced7b99709093a5a8bd30 100644
--- a/scikits/learn/pyem/info.py
+++ b/scikits/learn/pyem/info.py
@@ -60,7 +60,7 @@ Bibliography:
 Copyright: David Cournapeau 2006
 License: BSD-style (see LICENSE.txt in main source directory)
 """
-version = '0.5.5'
+version = '0.5.6'
 
 depends = ['linalg', 'stats']
 ignore = False
diff --git a/scikits/learn/pyem/misc.py b/scikits/learn/pyem/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..37549160f06b8f0476e02028eba6ef2c8cc8691d
--- /dev/null
+++ b/scikits/learn/pyem/misc.py
@@ -0,0 +1,21 @@
+# Last Change: Fri Nov 10 10:00 AM 2006 J
+
+#=====================================================================
+# "Magic numbers", that is numbers used to control regularization and co.
+# Change them at your own risk !
+#=====================================================================
+
+# max deviation allowed when comparing doubles (this is crude; comparing
+# a number of decimals would be more robust)
+_MAX_DBL_DEV = 1e-10
+
+# max condition number allowed
+_MAX_COND = 1e8
+_MIN_INV_COND = 1/_MAX_COND
+
+# Default alpha for regularization
+_DEF_ALPHA = 1e-1
+
+# Default min delta for regularization
+_MIN_DBL_DELTA = 1e-5
+
diff --git a/scikits/learn/pyem/profile_data/profile_densities.py b/scikits/learn/pyem/profile_data/profile_densities.py
index 013e4045aac13aed8791328e24073e5950121366..f2f0c47b25e438ec4ce441abe7104ab046a8b7ec 100644
--- a/scikits/learn/pyem/profile_data/profile_densities.py
+++ b/scikits/learn/pyem/profile_data/profile_densities.py
@@ -1,7 +1,7 @@
 import numpy as N
 from numpy.random import randn
-from pyem import densities as D
-from pyem import _c_densities as DC
+from scipy.sandbox.pyem import densities as D
+from scipy.sandbox.pyem import _c_densities as DC
 #import tables
 
 def bench(func, mode = 'diag'):
diff --git a/scikits/learn/pyem/profile_data/profile_gmm.py b/scikits/learn/pyem/profile_data/profile_gmm.py
index 93f10704c03c1b64d82f9c17e516c06a3299c70a..b9f260a5ba2795a42632aee66273e2575b2e1647 100644
--- a/scikits/learn/pyem/profile_data/profile_gmm.py
+++ b/scikits/learn/pyem/profile_data/profile_gmm.py
@@ -1,16 +1,14 @@
 import numpy as N
-from pyem import GM, GMM
+from scipy.sandbox.pyem import GM, GMM
 import copy
 
-from pyem._c_densities import gauss_den
-
 def bench1(mode = 'diag'):
     #===========================================
     # GMM of 20 comp, 20 dimension, 1e4 frames
     #===========================================
     d = 15
     k = 30
-    nframes = 1e4
+    nframes = 1e5
     niter = 10
     mode = 'diag'
diff --git a/scikits/learn/pyem/setup.py b/scikits/learn/pyem/setup.py
index 548c06a2a602c5f79a90dc243ffdfe09f0cf192b..6bfeb3c258aaf73029a41c83b32bb87aa18c920e 100644
--- a/scikits/learn/pyem/setup.py
+++ b/scikits/learn/pyem/setup.py
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Thu Oct 19 07:00 PM 2006 J
+# Last Change: Thu Nov 09 06:00 PM 2006 J
 # TODO:
 # - check how to handle cmd line build options with distutils and use
 # it in the building process
@@ -16,14 +16,14 @@ VERSION = pyem_version
 DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf',
 AUTHOR ='David Cournapeau',
 AUTHOR_EMAIL='david@ar.media.kyoto-u.ac.jp',
-URL ='http://ar.media.kyoto-u.ac.jp/members/david',
+URL ='http://ar.media.kyoto-u.ac.jp/members/david/softwares/pyem',
 
 def configuration(parent_package='',top_path=None, package_name='pyem'):
     from numpy.distutils.misc_util import Configuration
     config = Configuration(package_name,parent_package,top_path,
              version = VERSION)
     config.add_data_dir('tests')
-    config.add_subpackage('profile_data')
+    config.add_data_dir('profile_data')
     config.add_extension('c_gden',
                          #define_macros=[('LIBSVM_EXPORTS', None),
                          #               ('LIBSVM_DLL', None)],
@@ -34,7 +34,8 @@ def configuration(parent_package='',top_path=None, package_name='pyem'):
 if __name__ == "__main__":
     from numpy.distutils.core import setup
     #setup(**configuration(top_path='').todict())
-    setup(**configuration(top_path='',))
+    #setup(**configuration(top_path=''))
+    setup(configuration=configuration)
 # from distutils.core import setup, Extension
 # from pyem import version as pyem_version
 #
diff --git a/scikits/learn/pyem/test_reg.py b/scikits/learn/pyem/test_reg.py
new file mode 100644
index 0000000000000000000000000000000000000000..026c1def7ebb467bd11c1266fc19ac655a6ba1d5
--- /dev/null
+++ b/scikits/learn/pyem/test_reg.py
@@ -0,0 +1,44 @@
+import numpy as N
+
+from gauss_mix import GM
+from gmm_em import GMM, EM
+
+from numpy.random import seed
+
+def test_reg():
+    seed(0)
+    # Generate data from a small mixture
+    d = 2
+    k = 1
+    n = 500
+
+    w, mu, va = GM.gen_param(d, k)
+    gm = GM.fromvalues(w, mu, va)
+
+    data = gm.sample(n)
+
+    # Try to learn with an insane number of components
+    gmm = GMM(GM(d, 30), 'random')
+
+    em = EM()
+    like = em.train(data, gmm, 20, 1e-20)
+
+    # import pylab as P
+    # P.subplot(2, 1, 1)
+    # P.plot(data[:, 0], data[:, 1], '.')
+    # gmm.gm.plot()
+    # P.subplot(2, 1, 2)
+    # P.plot(like)
+    # print like
+    # P.show()
+
+if __name__ == "__main__":
+    # import hotshot, hotshot.stats
+    # profile_file = 'manyk.prof'
+    # prof = hotshot.Profile(profile_file, lineevents=1)
+    # prof.runcall(test_reg)
+    # p = hotshot.stats.load(profile_file)
+    # print p.sort_stats('cumulative').print_stats(20)
+    # prof.close()
+    test_reg()
+
diff --git a/scikits/learn/pyem/tests/test_densities.py b/scikits/learn/pyem/tests/test_densities.py
index c383356d2b8c10f12677c15e0ca754f654ec7aa8..02fec1b57b019bd20e44b354d6535ceac2bd398a 100644
--- a/scikits/learn/pyem/tests/test_densities.py
+++ b/scikits/learn/pyem/tests/test_densities.py
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Thu Oct 19 07:00 PM 2006 J
+# Last Change: Thu Nov 09 05:00 PM 2006 J
 
 # TODO:
 # - having "fake tests" to check that all mode (scalar, diag and full) are
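
Note on the new regularization helpers: they ship in this patch but are not
wired into EM yet (see the commented-out training loop in gmm_em.py). The
standalone sanity check below is not part of the patch; the constants are
copied from misc.py, the function body mirrors regularize_diag from
gmm_em.py, and the collapsed variance vector is made up for illustration:

    import numpy as N

    # Constants copied from the new misc.py
    _DEF_ALPHA = 1e-1
    _MIN_DBL_DELTA = 1e-5

    def regularize_diag(variance, alpha = _DEF_ALPHA):
        # Inflate a diagonal variance vector by a fraction of its mean
        # value, with a floor so a fully collapsed component is lifted too.
        delta = N.sum(variance) / variance.size
        if delta > _MIN_DBL_DELTA:
            return variance + alpha * delta
        else:
            return variance + alpha * _MIN_DBL_DELTA

    # A component whose variances collapsed during EM: delta is ~1.5e-12,
    # far below _MIN_DBL_DELTA, so the floor applies and every variance is
    # lifted by alpha * _MIN_DBL_DELTA = 1e-6, keeping it safely invertible.
    va = N.array([1e-12, 2e-12])
    print(regularize_diag(va))

The commented-out loop in gmm_em.py shows where this is meant to plug in:
whenever a component's variance falls below _MIN_INV_COND during training.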
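
Note on the '<f8' to N.float64 change in _c_densities.py: '<f8' names a
little-endian 8-byte float, so the old ndpointer specs would reject native
float64 arrays on big-endian machines, while N.float64 always matches the
platform's native double. A minimal sketch of the same binding pattern,
assuming c_gden.so has been built and sits next to the module as in the
patch (argument order follows the arg1..arg6 assignments there):

    import numpy as N
    from ctypes import c_int, c_uint
    from numpy.ctypeslib import load_library, ndpointer

    # Load the shared library from the directory containing this file
    _gden = load_library('c_gden.so', __file__)
    _gden.gden_diag.argtypes = [ndpointer(dtype=N.float64), c_uint, c_uint,
                                ndpointer(dtype=N.float64),
                                ndpointer(dtype=N.float64),
                                ndpointer(dtype=N.float64)]
    _gden.gden_diag.restype = c_int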