diff --git a/scikits/learn/pyem/.bzrignore b/scikits/learn/pyem/.bzrignore index 8bf93bdb40b90500ec7fc5c198b8dff9146be799..3026759a50838b265847032a7d6c56ab6c7c9f16 100644 --- a/scikits/learn/pyem/.bzrignore +++ b/scikits/learn/pyem/.bzrignore @@ -10,3 +10,5 @@ test.py profile_gmm_em.py data.h5 gmmprof +valgrind-python.supp +valgrind-python.supp diff --git a/scikits/learn/pyem/Changelog b/scikits/learn/pyem/Changelog index 28dfe592086aec80d38fdf831d3861a567ebe14c..7bbc351f92ae785a027fab24094bd04928802504 100644 --- a/scikits/learn/pyem/Changelog +++ b/scikits/learn/pyem/Changelog @@ -1,3 +1,12 @@ +pyem (0.5.2) Thu, 24 Aug 2006 19:42:28 +0900 + + * put version to 0.5.2 + * Correct a bug with init method in GMM (change class method + to object method). + * modify the setup for a more flexible system + +-- David Cournapeau <david@ar.media.kyoto-u.ac.jp> + pyem (0.5.1) Thu, 17 Aug 2006 11:54:41 +0900 * put version to 0.5.1 diff --git a/scikits/learn/pyem/TODO b/scikits/learn/pyem/TODO index 546bf2eb488cda4e6275059dbbcfb6337f93f857..89c7cd49b296e0eef1494693cc6745075fd036a1 100644 --- a/scikits/learn/pyem/TODO +++ b/scikits/learn/pyem/TODO @@ -1,8 +1,10 @@ -# Last Change: Fri Aug 04 11:00 PM 2006 J +# Last Change: Tue Aug 22 10:00 PM 2006 J Things which must be implemented for a 1.0 version: - test for various length and model size. - a small help/tutorial + - use scipy.cluster.vq instead of custom kmeans algo (find + a way to get label from vq.kmeans) Things which would be nice: - C implementation of Gaussian densities at least. diff --git a/scikits/learn/pyem/pyem/__init__.py b/scikits/learn/pyem/pyem/__init__.py index ea210869ad3aaa962724d57705749c9b8f8d0cd0..8af616ae53cd2dfcd61c59e0986953c9a9c4cf0a 100644 --- a/scikits/learn/pyem/pyem/__init__.py +++ b/scikits/learn/pyem/pyem/__init__.py @@ -1,7 +1,7 @@ #! /usr/bin/env python -# Last Change: Thu Aug 17 03:00 PM 2006 J +# Last Change: Thu Aug 24 07:00 PM 2006 J -version = '0.5.1' +version = '0.5.2' from gauss_mix import GmParamError, GM from gmm_em import GmmParamError, GMM diff --git a/scikits/learn/pyem/pyem/densities.py b/scikits/learn/pyem/pyem/densities.py index 83d6d9f4adad510e3ee68ceb666d0e208aa760ac..acf63da0b93563ba5838206881eaeb2e74a0533e 100644 --- a/scikits/learn/pyem/pyem/densities.py +++ b/scikits/learn/pyem/pyem/densities.py @@ -1,14 +1,13 @@ #! /usr/bin/python # # Copyrighted David Cournapeau -# Last Change: Thu Aug 17 03:00 PM 2006 J +# Last Change: Tue Aug 22 08:00 PM 2006 J import numpy as N import numpy.linalg as lin from numpy.random import randn from scipy.stats import chi2 - # Error classes class DenError(Exception): """Base class for exceptions in this module. @@ -103,6 +102,12 @@ def _scalar_gauss_den(x, mu, va, log): return y +#from ctypes import cdll, c_uint, c_int, c_double, POINTER +#_gden = cdll.LoadLibrary('src/libgden.so') +#_gden.gden_diag.restype = c_int +#_gden.gden_diag.argtypes = [POINTER(c_double), c_uint, c_uint, +# POINTER(c_double), POINTER(c_double), POINTER(c_double)] + def _diag_gauss_den(x, mu, va, log): """ This function is the actual implementation of gaussian pdf in scalar case. It assumes all args @@ -119,9 +124,20 @@ def _diag_gauss_den(x, mu, va, log): y = (x[:,0] - mu[0,0]) ** 2 * inva * -0.5 for i in range(1, d): inva = 1/va[0,i] - fac *= (2*N.pi) ** (-d/2.0) * N.sqrt(inva) + #fac *= (2*N.pi) ** (-d/2.0) * N.sqrt(inva) + fac *= N.sqrt(inva) y += (x[:,i] - mu[0,i]) ** 2 * inva * -0.5 y = fac * N.exp(y) + #d = mu.size + #n = x.shape[0] + #if not log: + # y = N.zeros(n) + # inva= 1/va + # _gden.gden_diag(x.ctypes.data_as(POINTER(c_double)), + # n, d, + # mu.ctypes.data_as(POINTER(c_double)), + # (inva).ctypes.data_as(POINTER(c_double)), + # y.ctypes.data_as(POINTER(c_double))) else: y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log) for i in range(1, d): diff --git a/scikits/learn/pyem/pyem/gauss_mix.py b/scikits/learn/pyem/pyem/gauss_mix.py index 3b8d9db471eeb1128be0d98f9b67e7bd79b60780..7482b89c0825d05ce4b8c171400318632a622883 100644 --- a/scikits/learn/pyem/pyem/gauss_mix.py +++ b/scikits/learn/pyem/pyem/gauss_mix.py @@ -1,5 +1,5 @@ # /usr/bin/python -# Last Change: Thu Aug 17 03:00 PM 2006 J +# Last Change: Tue Aug 22 09:00 PM 2006 J # Module to implement GaussianMixture class. @@ -46,7 +46,7 @@ class GM: """Init a Gaussian model of k components, each component being a d multi-variate Gaussian, with covariance matrix of style mode""" if mode not in self._cov_mod: - raise GmmParamError("mode %s not recognized" + str(mode)) + raise GmParamError("mode %s not recognized" + str(mode)) self.d = d self.k = k @@ -67,13 +67,13 @@ class GM: """Set parameters of the model""" k, d, mode = check_gmm_param(weights, mu, sigma) if not k == self.k: - raise GmmParamError("Number of given components is %d, expected %d" + raise GmParamError("Number of given components is %d, expected %d" % (shape(k), shape(self.k))) if not d == self.d: - raise GmmParamError("Dimension of the given model is %d, expected %d" + raise GmParamError("Dimension of the given model is %d, expected %d" % (shape(d), shape(self.d))) if not mode == self.mode: - raise GmmParamError("Given covariance mode is %s, expected %d" + raise GmParamError("Given covariance mode is %s, expected %d" % (mode, self.mode)) self.w = weights self.mu = mu @@ -84,7 +84,7 @@ class GM: def sample(self, nframes): """ Sample nframes frames from the model """ if not self.is_valid: - raise GmmParamError("""Parameters of the model has not been + raise GmParamError("""Parameters of the model has not been set yet, please set them using self.set_param()""") # State index (ie hidden var) @@ -106,7 +106,7 @@ class GM: tmpind = N.where(S == s)[0] X[tmpind] = N.dot(X[tmpind], cho[s].transpose()) + self.mu[s] else: - raise GmmParamError('cov matrix mode not recognized, this is a bug !') + raise GmParamError('cov matrix mode not recognized, this is a bug !') return X @@ -172,7 +172,7 @@ class GM: va[k*d:k*d+d] = N.dot( va[k*d:k*d+d], va[k*d:k*d+d].transpose()) else: - raise GmmParamError('cov matrix mode not recognized') + raise GmParamError('cov matrix mode not recognized') return w, mu, va @@ -219,32 +219,32 @@ def check_gmm_param(w, mu, va): # Check that w is valid if N.fabs(N.sum(w, 0) - 1) > MAX_DEV: - raise GmmParamError('weight does not sum to 1') + raise GmParamError('weight does not sum to 1') if not len(w.shape) == 1: - raise GmmParamError('weight is not a vector') + raise GmParamError('weight is not a vector') # Check that mean and va have the same number of components K = len(w) if N.ndim(mu) < 2: msg = "mu should be a K,d matrix, and a row vector if only 1 comp" - raise GmmParamError(msg) + raise GmParamError(msg) if N.ndim(va) < 2: msg = """va should be a K,d / K *d, d matrix, and a row vector if only 1 diag comp""" - raise GmmParamError(msg) + raise GmParamError(msg) (Km, d) = mu.shape (Ka, da) = va.shape if not K == Km: msg = "not same number of component in mean and weights" - raise GmmParamError(msg) + raise GmParamError(msg) if not d == da: msg = "not same number of dimensions in mean and variances" - raise GmmParamError(msg) + raise GmParamError(msg) if Km == Ka: mode = 'diag' @@ -252,7 +252,7 @@ def check_gmm_param(w, mu, va): mode = 'full' if not Ka == Km*d: msg = "not same number of dimensions in mean and variances" - raise GmmParamError(msg) + raise GmParamError(msg) return K, d, mode diff --git a/scikits/learn/pyem/pyem/gmm_em.py b/scikits/learn/pyem/pyem/gmm_em.py index ffde042d66e6d72a3ac1eff150499357be06994c..5a352e38f2f7b48cbdfcef542cb1d62b0a8157dd 100644 --- a/scikits/learn/pyem/pyem/gmm_em.py +++ b/scikits/learn/pyem/pyem/gmm_em.py @@ -1,5 +1,5 @@ # /usr/bin/python -# Last Change: Thu Aug 17 03:00 PM 2006 J +# Last Change: Thu Aug 24 02:00 PM 2006 J import numpy as N import numpy.linalg as lin @@ -95,8 +95,6 @@ class GMM(ExpMixtureModel): self.gm.mu = mu self.gm.va = va - # Possible init methods - _init_methods = {'kmean': init_kmean, 'random' : init_random} # TODO: # - format of parameters ? For variances, list of variances matrix, # keep the current format, have 3d matrices ? @@ -107,10 +105,13 @@ class GMM(ExpMixtureModel): method for training init (kmean by default)""" self.gm = gm - if init not in self._init_methods: + # Possible init methods + init_methods = {'kmean': self.init_kmean, 'random' : self.init_random} + + if init not in init_methods: raise GmmParamError('init method %s not recognized' + str(init)) - GMM.init = self._init_methods[init] + self.init = init_methods[init] def sufficient_statistics(self, data): """ Return normalized and non-normalized sufficient statistics @@ -285,6 +286,9 @@ if __name__ == "__main__": gmm.init(data) # Keep the initialized model for drawing gm0 = copy.copy(lgm) + print gm0.w + print gm0.mu + print gm0.va # The actual EM, with likelihood computation niter = 10 @@ -329,6 +333,11 @@ if __name__ == "__main__": P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_') P.legend(loc = 0) + #from scipy.cluster.vq import kmeans + #code = kmeans(data, k)[0] + #print code + #P.plot(code[:,0], code[:, 1], 'oy') + #P.plot(gm0.mu[:,0], gm0.mu[:, 1], 'ok') P.subplot(2, 1, 2) P.plot(like) P.title('log likelihood') diff --git a/scikits/learn/pyem/pyem/kmean.py b/scikits/learn/pyem/pyem/kmean.py index 01c8f01910cf163bffe8287b317889a378873128..634204231f390a2b98ca83127b26ad59da4a2cf5 100644 --- a/scikits/learn/pyem/pyem/kmean.py +++ b/scikits/learn/pyem/pyem/kmean.py @@ -1,5 +1,5 @@ # /usr/bin/python -# Last Change: Thu Aug 17 03:00 PM 2006 J +# Last Change: Tue Aug 22 10:00 PM 2006 J import numpy as N @@ -46,12 +46,22 @@ def _py_vq(data, code): # Try to import pyrex function for vector quantization. If not available, # falls back on pure python implementation. +#%KMEANIMPORT% try: - from c_gmm import _vq -except: - print """c_gmm._vq not found, using pure python implementation instead. + from scipy.cluster.vq import kmeans as kmean +except ImportError: + try: + from c_gmm import _vq + except: + print """c_gmm._vq not found, using pure python implementation instead. Kmean will be REALLY slow""" - _vq = _py_vq + _vq = _py_vq +#try: +# from c_gmm import _vq +#except: +# print """c_gmm._vq not found, using pure python implementation instead. +# Kmean will be REALLY slow""" +# _vq = _py_vq # Test functions usable for now def test_kmean(): diff --git a/scikits/learn/pyem/pyem/profile_gmm.py b/scikits/learn/pyem/pyem/profile_gmm.py index c2162cc590f48f56ce86a1fa48400d9943902a2f..a1a771a48d73a36cd86d4ca41b1dbe297d326dad 100644 --- a/scikits/learn/pyem/pyem/profile_gmm.py +++ b/scikits/learn/pyem/pyem/profile_gmm.py @@ -9,7 +9,7 @@ def bench1(mode = 'diag'): d = 20 k = 20 nframes = 1e4 - niter = 1 + niter = 10 mode = 'diag' #+++++++++++++++++++++++++++++++++++++++++++ @@ -38,7 +38,6 @@ def bench1(mode = 'diag'): gm0 = copy.copy(lgm) # The actual EM, with likelihood computation - niter = 10 like = N.zeros(niter) print "computing..." diff --git a/scikits/learn/pyem/pyem/src/Makefile b/scikits/learn/pyem/pyem/src/Makefile index e968b88a48ea6def4a0de67c00ccac83d452441e..54c26900c15e10d4a8cffe6dc153b291448f6281 100644 --- a/scikits/learn/pyem/pyem/src/Makefile +++ b/scikits/learn/pyem/pyem/src/Makefile @@ -5,11 +5,17 @@ PYREX = python2.4-pyrexc PYTHONINC = -I/usr/include/python2.4 NUMPYINC = -I/usr/lib/python2.4/site-packages/numpy/core/include -OPTIMS = -O3 -ffast-math -march=pentium4 +OPTIMS = -O2 -ffast-math -march=pentium4 WARN = -W -Wall CFLAGS = $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN) +libgden.so: c_gden.o + $(LD) -shared -o $@ $< -Wl,-soname,$@ + +c_gden.o: c_gden.c + $(CC) -c $(CFLAGS) -fPIC $< + c_gmm.so: c_gmm.o $(LD) -shared -o $@ $< -Wl,-soname,$@ diff --git a/scikits/learn/pyem/setup.py b/scikits/learn/pyem/setup.py index 9cf524f4244f59cd161a6aa9e4b1f477388e0f4c..f64a1cd76a602456f97a4995bc03c74411b1da66 100644 --- a/scikits/learn/pyem/setup.py +++ b/scikits/learn/pyem/setup.py @@ -1,5 +1,9 @@ #! /usr/bin/env python -# Last Change: Fri Jul 14 04:00 PM 2006 J +# Last Change: Mon Aug 21 12:00 PM 2006 J +# TODO: +# - check how to handle cmd line build options with distutils and use +# it in the building process + """ pyem is a small python package to estimate Gaussian Mixtures Models from data, using Expectation Maximization""" from distutils.core import setup, Extension @@ -9,6 +13,8 @@ from pyem import version as pyem_version import os if os.path.exists('MANIFEST'): os.remove('MANIFEST') +import re + from numpy.distutils.misc_util import get_numpy_include_dirs NUMPYINC = get_numpy_include_dirs()[0] @@ -24,34 +30,82 @@ AUTHOR ='David Cournapeau', AUTHOR_EMAIL='david@ar.media.kyoto-u.ac.jp', URL ='http://ar.media.kyoto-u.ac.jp/members/david', -def setup_pyrex(): - setup(name=DISTNAME, - version=VERSION, - description=DESCRIPTION, - author=AUTHOR, - author_email=AUTHOR_EMAIL, - url=URL, - packages=['pyem'], - ext_modules=[Extension('pyem/c_gmm', ['pyem/src/c_gmm.pyx'], - include_dirs=[NUMPYINC])], - cmdclass = {'build_ext': build_ext}, - ) - -def setup_nopyrex(): - setup(name=DISTNAME, - version=VERSION, - description=DESCRIPTION, - author=AUTHOR, - author_email=AUTHOR_EMAIL, - url=URL, - packages=['pyem'], - #ext_modules=[Extension('_hello', ['hellomodule.c'])], - ) - -try: - from Pyrex.Distutils import build_ext - setup_pyrex() -except ImportError: - print "Pyrex not found, C extension won't be available" - setup_nopyrex() +# Source files for extensions + +# Functions used to substitute values in File. +# Mainly use to replace config.h capabilities +def do_subst_in_file(sourcefile, targetfile, dict): + """Replace all instances of the keys of dict with their values. + For example, if dict is {'%VERSION%': '1.2345', '%BASE%': 'MyProg'}, + then all instances of %VERSION% in the file will be replaced with 1.2345 etc. + """ + try: + f = open(sourcefile, 'rb') + contents = f.read() + f.close() + except: + raise IOError, "Can't read source file %s"%sourcefile + + for (k,v) in dict.items(): + contents = re.sub(k, v, contents) + try: + f = open(targetfile, 'wb') + f.write(contents) + f.close() + except: + raise IOError, "Can't read source file %s"%sourcefile + return 0 # success + +class SetupOption: + def __init__(self): + self.kmean = 'py' + self.ext_modules= [] + self.cmdclass = {} + self.subsdic = {'%KMEANIMPORT%': []} + + def _config_kmean(self): + # Check in this order: + # - kmean in scipy.cluster, + # - custom vq with pyrex + # - custom pure python vq + #try: + # from scipy.cluster.vq import kmeans + # self.kmean = 'scipy' + # #self.subsdic['%KMEANIMPORT%'] = scipy_kmean + #except ImportError: + # try: + # from Pyrex.Distutils import build_ext + # self.kmean = 'pyrex' + # self.ext_modules.append(Extension('pyem/c_gmm', + # ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC])) + # self.cmdclass['build_ext'] = build_ext + # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean + # except ImportError: + # self.kmean = 'py' + # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean + try: + from Pyrex.Distutils import build_ext + self.kmean = 'pyrex' + self.ext_modules.append(Extension('pyem/c_gmm', + ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC])) + self.cmdclass['build_ext'] = build_ext + #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean + except ImportError: + self.kmean = 'py' + #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean + def setup(self): + self._config_kmean() + import time + #do_subst_in_file('pyem/kmean.py.in', 'pyem/kmean.py', self.subsdic) + setup(name = DISTNAME, + version = VERSION, + description = DESCRIPTION, + author = AUTHOR, + author_email= AUTHOR_EMAIL, + url = URL, + packages = ['pyem'], + ext_modules = self.ext_modules, + cmdclass = self.cmdclass) +stpobj = SetupOption() +stpobj.setup()