diff --git a/scikits/learn/pyem/__init__.py b/scikits/learn/pyem/__init__.py index 27479a9166855cfb324161922e3ddf816a64db87..73ecd4f5316396602f422b2183536e3a11a1b1cc 100644 --- a/scikits/learn/pyem/__init__.py +++ b/scikits/learn/pyem/__init__.py @@ -1,5 +1,5 @@ #! /usr/bin/env python -# Last Change: Mon May 28 01:00 PM 2007 J +# Last Change: Sat Jun 09 10:00 PM 2007 J from info import __doc__ @@ -8,7 +8,7 @@ from gmm_em import GmmParamError, GMM, EM #from online_em import OnGMM as _OnGMM #import examples as _examples -__all__ = filter(lambda s:not s.startswith('_'),dir()) +__all__ = filter(lambda s:not s.startswith('_'), dir()) from numpy.testing import NumpyTest test = NumpyTest().test diff --git a/scikits/learn/pyem/_c_densities.py b/scikits/learn/pyem/_c_densities.py index 7979532c0a9354899545a3f46fa17535b7af580e..3e9cd2921079c4564cf75da8c808fdcd6b085ffe 100644 --- a/scikits/learn/pyem/_c_densities.py +++ b/scikits/learn/pyem/_c_densities.py @@ -1,28 +1,34 @@ #! /usr/bin/python # # Copyrighted David Cournapeau -# Last Change: Thu Nov 09 05:00 PM 2006 J +# Last Change: Sat Jun 09 10:00 PM 2007 J + +"""This module implements some function of densities module in C for efficiency +reasons. gaussian, such as pdf estimation, confidence interval/ellipsoids, +etc...""" + +__docformat__ = 'restructuredtext' # This module uses a C implementation through ctypes, for diagonal cases # TODO: # - portable way to find/open the shared library # - full cov matrice +# - test before inclusion import numpy as N import numpy.linalg as lin -from numpy.random import randn -from scipy.stats import chi2 -import densities as D +#from numpy.random import randn +#from scipy.stats import chi2 +#import densities as D import ctypes -from ctypes import cdll, c_uint, c_int, c_double, POINTER +from ctypes import c_uint, c_int from numpy.ctypeslib import ndpointer, load_library ctypes_major = int(ctypes.__version__.split('.')[0]) if ctypes_major < 1: - msg = "version of ctypes is %s, expected at least %s" \ - % (ctypes.__version__, '1.0.0') - raise ImportError(msg) + raise ImportError(msg = "version of ctypes is %s, expected at least %s"\ + % (ctypes.__version__, '1.0.1')) # Requirements for diag gden _gden = load_library('c_gden.so', __file__) @@ -75,9 +81,9 @@ def gauss_den(x, mu, va, log = False): if len(N.shape(x)) != 2: raise DenError("x is not rank 2") - (n, d) = x.shape - (dm0, dm1) = mu.shape - (dv0, dv1) = va.shape + (n, d) = N.shape(x) + (dm0, dm1) = N.shape(mu) + (dv0, dv1) = N.shape(va) # Check x and mu same dimension if dm0 != 1: @@ -165,9 +171,9 @@ def _diag_gauss_den(x, mu, va, log): # inva.ctypes.data_as(POINTER(c_double)), # y.ctypes.data_as(POINTER(c_double))) else: - y = _scalar_gauss_den(x[:,0], mu[0,0], va[0,0], log) + y = _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log) for i in range(1, d): - y += _scalar_gauss_den(x[:,i], mu[0,i], va[0,i], log) + y += _scalar_gauss_den(x[:, i], mu[0, i], va[0, i], log) return y def _full_gauss_den(x, mu, va, log): @@ -199,19 +205,20 @@ def _full_gauss_den(x, mu, va, log): return y if __name__ == "__main__": - #========================================= - # Test accuracy between pure and C python - #========================================= - mu = N.array([2.0, 3]) - va = N.array([5.0, 3]) - - # Generate a multivariate gaussian of mean mu and covariance va - nframes = 1e4 - X = randn(nframes, 2) - Yc = N.dot(N.diag(N.sqrt(va)), X.transpose()) - Yc = Yc.transpose() + mu - - Y = D.gauss_den(Yc, mu, va) - Yt = gauss_den(Yc, mu, va) - - print "Diff is " + 
str(N.sqrt(N.sum((Y-Yt) ** 2))/nframes/2) + pass + ##========================================= + ## Test accuracy between pure and C python + ##========================================= + #mu = N.array([2.0, 3]) + #va = N.array([5.0, 3]) + + ## Generate a multivariate gaussian of mean mu and covariance va + #nframes = 1e4 + #X = randn(nframes, 2) + #Yc = N.dot(N.diag(N.sqrt(va)), X.transpose()) + #Yc = Yc.transpose() + mu + + #Y = D.gauss_den(Yc, mu, va) + #Yt = gauss_den(Yc, mu, va) + + #print "Diff is " + str(N.sqrt(N.sum((Y-Yt) ** 2))/nframes/2) diff --git a/scikits/learn/pyem/densities.py b/scikits/learn/pyem/densities.py index 28292e9b14d7df6c103c826d2f553110de06cdf8..ca5f7cbd337a18e0283d1317225e4f94927b9a02 100644 --- a/scikits/learn/pyem/densities.py +++ b/scikits/learn/pyem/densities.py @@ -1,8 +1,8 @@ #! /usr/bin/python # # Copyrighted David Cournapeau -# Last Change: Sat Jun 09 08:00 PM 2007 J -"""This module implements various bsic functions related to multivariate +# Last Change: Sat Jun 09 10:00 PM 2007 J +"""This module implements various basic functions related to multivariate gaussian, such as pdf estimation, confidence interval/ellipsoids, etc...""" __docformat__ = 'restructuredtext' @@ -50,10 +50,10 @@ def gauss_den(x, mu, va, log = False): pdf : ndarray Returns a rank 1 array of the pdf at points x. - Notes - ----- - Vector are row vectors, except va which can be a matrix - (row vector variance for diagonal variance).""" + Note + ---- + Vector are row vectors, except va which can be a matrix + (row vector variance for diagonal variance).""" lmu = N.atleast_2d(mu) lva = N.atleast_2d(va) diff --git a/scikits/learn/pyem/doc/Makefile b/scikits/learn/pyem/doc/Makefile index b84ab7b7dc860b07a1538a6113660d5a8d16f6e1..6a0f4d586e608329963d9433d6754276285ccd7e 100644 --- a/scikits/learn/pyem/doc/Makefile +++ b/scikits/learn/pyem/doc/Makefile @@ -1,4 +1,4 @@ -# Last Change: Mon May 28 10:00 AM 2007 J +# Last Change: Sat Jun 09 05:00 PM 2007 J # This makefile is used to build the pdf from the rest file and inlined code # from python examples @@ -7,7 +7,7 @@ py2tex = PYTHONPATH=/home/david/local/lib/python2.4/site-packages pygmentize -l rst2tex = PYTHONPATH=/home/david/local/lib/python2.4/site-packages rst2newlatex.py \ --stylesheet-path base.tex --user-stylesheet user.tex -pytexfiles = pyem.tex basic_example1.tex basic_example2.tex basic_example3.tex +pytexfiles = pyem.tex basic_example1.tex basic_example2.tex basic_example3.tex pdfestimation.tex SOURCEPATH = $(PWD) @@ -24,13 +24,16 @@ pyem.pdf: $(pytexfiles) pyem.tex: index.txt $(rst2tex) $< > $@ -basic_example1.tex: examples/basic_example1.py +basic_example1.tex: ../examples/basic_example1.py $(py2tex) $< > $@ -basic_example2.tex: examples/basic_example2.py +basic_example2.tex: ../examples/basic_example2.py $(py2tex) $< > $@ -basic_example3.tex: examples/basic_example3.py +basic_example3.tex: ../examples/basic_example3.py + $(py2tex) $< > $@ + +pdfestimation.tex: ../examples/pdfestimation.py $(py2tex) $< > $@ clean: diff --git a/scikits/learn/pyem/doc/index.txt b/scikits/learn/pyem/doc/index.txt index 309318d2198bae7296cb5abc19066bb8c1d1d335..37213067469debf031d361804b1d9a7142332e58 100644 --- a/scikits/learn/pyem/doc/index.txt +++ b/scikits/learn/pyem/doc/index.txt @@ -13,7 +13,7 @@ file: Bic_example.png /restindex -.. Last Change: Mon May 28 10:00 AM 2007 J +.. 
Last Change: Sat Jun 09 07:00 PM 2007 J =================================================== PyEM, a python package for Gaussian mixture models @@ -176,14 +176,36 @@ useful for complex drawing. Examples ========= -TODO. +Using EM for pdf estimation +--------------------------- + +The following example uses the old faithful dataset and is available in the +example directory. It models the joint distribution (d(t), w(t+1)), where d(t) +is the duration time, and w(t+1) the waiting time for the next eruption. It +selects the best model using the BIC. + +.. raw:: latex + + \input{pdfestimation.tex} + +.. figure:: pdfestimation.png + :width: 500 + :height: 400 + + isodensity curves for the old faithful data modeled by a 1, 2, 3 and 4 + componenits model (up to bottom, left to right). + Using EM for clustering ----------------------- +TODO (this is fundamentally the same than pdf estimation, though) + Using PyEM for supervised learning ---------------------------------- +TODO + Note on performances ==================== diff --git a/scikits/learn/pyem/doc/pdfestimation.png b/scikits/learn/pyem/doc/pdfestimation.png new file mode 100644 index 0000000000000000000000000000000000000000..e128ffaa53c725b6d0237904a2e6ca482369c9ff Binary files /dev/null and b/scikits/learn/pyem/doc/pdfestimation.png differ diff --git a/scikits/learn/pyem/doc/tutorial.pdf b/scikits/learn/pyem/doc/tutorial.pdf index 197213b3a65bc2466f1397a48ece86b58f8e2577..333118d6274450824cd0e5a2edd2d1b0480b3d31 100644 Binary files a/scikits/learn/pyem/doc/tutorial.pdf and b/scikits/learn/pyem/doc/tutorial.pdf differ diff --git a/scikits/learn/pyem/examples/pdfestimation.py b/scikits/learn/pyem/examples/pdfestimation.py index 1c64f4f9cb0e26373f03eecc5c71f5e5b585d77c..3e7ce1752870efbc725fae3b8ff17a47e96eced9 100644 --- a/scikits/learn/pyem/examples/pdfestimation.py +++ b/scikits/learn/pyem/examples/pdfestimation.py @@ -1,15 +1,11 @@ #! /usr/bin/env python -# Last Change: Sat Jun 09 03:00 PM 2007 J +# Last Change: Sat Jun 09 07:00 PM 2007 J # Example of doing pdf estimation with EM algorithm. Requires matplotlib. import numpy as N -from numpy.testing import set_package_path, restore_path - import pylab as P -set_package_path() -import pyem -restore_path() +from scipy.sandbox import pyem import utils oldfaithful = utils.get_faithful() @@ -45,6 +41,8 @@ for k in range(1, 5): X, Y, Z, V = gm.density_on_grid() P.contour(X, Y, Z, V) P.plot(dt[:, 0], dt[:, 1], '.') + P.xlabel('duration time (scaled)') + P.ylabel('waiting time (scaled)') print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1) P.show() diff --git a/scikits/learn/pyem/gauss_mix.py b/scikits/learn/pyem/gauss_mix.py index d7c92f752d1c4ec15a2485f25d36a159ae04015f..010f05c34f9cd2dbea3071fa46d140a573dd05e5 100644 --- a/scikits/learn/pyem/gauss_mix.py +++ b/scikits/learn/pyem/gauss_mix.py @@ -1,7 +1,13 @@ # /usr/bin/python -# Last Change: Sat Jun 09 08:00 PM 2007 J +# Last Change: Sat Jun 09 10:00 PM 2007 J -# Module to implement GaussianMixture class. +"""Module implementing GM, a class which represents Gaussian mixtures. + +GM instances can be used to create, sample mixtures. 
They also provide +different plotting facilities, such as isodensity contour for multi dimensional +models, ellipses of confidence.""" + +__docformat__ = 'restructuredtext' import numpy as N from numpy.random import randn, rand @@ -21,12 +27,12 @@ import misc # be used as long as w, mu and va are not set # - We have to use scipy now for chisquare pdf, so there may be other # methods to be used, ie for implementing random index. -# - there is no check on internal state of the GM, that is does w, mu and va values -# make sense (eg singular values) -# - plot1d is still very rhough. There should be a sensible way to -# modify the result plot (maybe returns a dic with global pdf, component pdf and -# fill matplotlib handles). Should be coherent with plot -class GmParamError: +# - there is no check on internal state of the GM, that is does w, mu and va +# values make sense (eg singular values) - plot1d is still very rhough. There +# should be a sensible way to modify the result plot (maybe returns a dic +# with global pdf, component pdf and fill matplotlib handles). Should be +# coherent with plot +class GmParamError(Exception): """Exception raised for errors in gmm params Attributes: @@ -34,6 +40,7 @@ class GmParamError: message -- explanation of the error """ def __init__(self, message): + Exception.__init__(self) self.message = message def __str__(self): @@ -52,11 +59,27 @@ class GM: # Methods to construct a mixture #=============================== def __init__(self, d, k, mode = 'diag'): - """Init a Gaussian Mixture of k components, each component being a - d multi-variate Gaussian, with covariance matrix of style mode. - - If you want to build a Gaussian Mixture with knowns weights, means - and variances, you can use GM.fromvalues method directly""" + """Init a Gaussian Mixture. + + :Parameters: + d : int + dimension of the mixture. + k : int + number of component in the mixture. + mode : string + mode of covariance + + :Returns: + an instance of GM. + + Note + ---- + + Only full and diag mode are supported for now. + + :SeeAlso: + If you want to build a Gaussian Mixture with knowns weights, means + and variances, you can use GM.fromvalues method directly""" if mode not in self._cov_mod: raise GmParamError("mode %s not recognized" + str(mode)) @@ -80,16 +103,42 @@ class GM: self.is1d = True def set_param(self, weights, mu, sigma): - """Set parameters of the model. Args should - be conformant with metparameters d and k given during - initialisation""" + """Set parameters of the model. + + Args should be conformant with metparameters d and k given during + initialisation. + + :Parameters: + weights : ndarray + weights of the mixture (k elements) + mu : ndarray + means of the mixture. One component's mean per row, k row for k + components. + sigma : ndarray + variances of the mixture. For diagonal models, one row contains + the diagonal elements of the covariance matrix. For full + covariance, d rows for one variance. 
+ + Examples + -------- + Create a 3 component, 2 dimension mixture with full covariance matrices + + >>> w = numpy.array([0.2, 0.5, 0.3]) + >>> mu = numpy.array([[0., 0.], [1., 1.]]) + >>> va = numpy.array([[1., 0.], [0., 1.], [2., 0.5], [0.5, 1]]) + >>> gm = GM(2, 3, 'full') + >>> gm.set_param(w, mu, va) + + :SeeAlso: + If you know already the parameters when creating the model, you can + simply use the method class GM.fromvalues.""" k, d, mode = check_gmm_param(weights, mu, sigma) if not k == self.k: raise GmParamError("Number of given components is %d, expected %d" % (k, self.k)) if not d == self.d: - raise GmParamError("Dimension of the given model is %d, expected %d" - % (d, self.d)) + raise GmParamError("Dimension of the given model is %d, "\ + "expected %d" % (d, self.d)) if not mode == self.mode and not d == 1: raise GmParamError("Given covariance mode is %s, expected %s" % (mode, self.mode)) @@ -104,16 +153,34 @@ class GM: """This class method can be used to create a GM model directly from its parameters weights, mean and variance - w, mu, va = GM.gen_param(d, k) - gm = GM(d, k) - gm.set_param(w, mu, va) + :Parameters: + weights : ndarray + weights of the mixture (k elements) + mu : ndarray + means of the mixture. One component's mean per row, k row for k + components. + sigma : ndarray + variances of the mixture. For diagonal models, one row contains + the diagonal elements of the covariance matrix. For full + covariance, d rows for one variance. + + :Returns: + gm : GM + an instance of GM. + + Examples + -------- + + >>> w, mu, va = GM.gen_param(d, k) + >>> gm = GM(d, k) + >>> gm.set_param(w, mu, va) and - w, mu, va = GM.gen_param(d, k) - gm = GM.fromvalue(w, mu, va) + >>> w, mu, va = GM.gen_param(d, k) + >>> gm = GM.fromvalue(w, mu, va) - Are equivalent """ + are strictly equivalent.""" k, d, mode = check_gmm_param(weights, mu, sigma) res = cls(d, k, mode) res.set_param(weights, mu, sigma) @@ -123,7 +190,15 @@ class GM: # Fundamental facilities (sampling, confidence, etc..) #===================================================== def sample(self, nframes): - """ Sample nframes frames from the model """ + """ Sample nframes frames from the model. + + :Parameters: + nframes : int + number of samples to draw. + + :Returns: + samples : ndarray + samples in the format one sample per row (nframes, d).""" if not self.is_valid: raise GmParamError("""Parameters of the model has not been set yet, please set them using self.set_param()""") @@ -134,47 +209,60 @@ class GM: X = randn(nframes, self.d) if self.mode == 'diag': - X = self.mu[S, :] + X * N.sqrt(self.va[S,:]) + X = self.mu[S, :] + X * N.sqrt(self.va[S, :]) elif self.mode == 'full': # Faster: cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1])) for i in range(self.k): # Using cholesky looks more stable than sqrtm; sqrtm is not # available in numpy anyway, only in scipy... - cho[i] = lin.cholesky(self.va[i*self.d:i*self.d+self.d,:]) + cho[i] = lin.cholesky(self.va[i*self.d:i*self.d+self.d, :]) for s in range(self.k): tmpind = N.where(S == s)[0] X[tmpind] = N.dot(X[tmpind], cho[s].transpose()) + self.mu[s] else: - raise GmParamError('cov matrix mode not recognized, this is a bug !') + raise GmParamError("cov matrix mode not recognized, "\ + "this is a bug !") return X - def conf_ellipses(self, dim = misc.DEF_VIS_DIM, npoints = misc.DEF_ELL_NP, + def conf_ellipses(self, dim = misc.DEF_VIS_DIM, npoints = misc.DEF_ELL_NP, level = misc.DEF_LEVEL): """Returns a list of confidence ellipsoids describing the Gmm defined by mu and va. 
Check densities.gauss_ell for details - Returns: - -Xe: a list of x coordinates for the ellipses (Xe[i] is - the array containing x coordinates of the ith Gaussian) - -Ye: a list of y coordinates for the ellipses - - Example: + :Parameters: + dim : sequence + sequences of two integers which represent the dimensions where to + project the ellipsoid. + npoints : int + number of points to generate for the ellipse. + level : float + level of confidence (between 0 and 1). + + :Returns: + Xe : sequence + a list of x coordinates for the ellipses (Xe[i] is the array + containing x coordinates of the ith Gaussian) + Ye : sequence + a list of y coordinates for the ellipses. + + Examples + -------- Suppose we have w, mu and va as parameters for a mixture, then: - gm = GM(d, k) - gm.set_param(w, mu, va) - X = gm.sample(1000) - Xe, Ye = gm.conf_ellipsoids() - pylab.plot(X[:,0], X[:, 1], '.') - for k in len(w): - pylab.plot(Xe[k], Ye[k], 'r') + >>> gm = GM(d, k) + >>> gm.set_param(w, mu, va) + >>> X = gm.sample(1000) + >>> Xe, Ye = gm.conf_ellipsoids() + >>> pylab.plot(X[:,0], X[:, 1], '.') + >>> for k in len(w): + ... pylab.plot(Xe[k], Ye[k], 'r') Will plot samples X draw from the mixture model, and plot the ellipses of equi-probability from the mean with - fixed level of confidence 0.39. """ + default level of confidence.""" if self.is1d: raise ValueError("This function does not make sense for 1d " "mixtures.") @@ -187,14 +275,14 @@ class GM: Ye = [] if self.mode == 'diag': for i in range(self.k): - xe, ye = densities.gauss_ell(self.mu[i,:], self.va[i,:], + xe, ye = densities.gauss_ell(self.mu[i, :], self.va[i, :], dim, npoints, level) Xe.append(xe) Ye.append(ye) elif self.mode == 'full': for i in range(self.k): - xe, ye = densities.gauss_ell(self.mu[i,:], - self.va[i*self.d:i*self.d+self.d,:], + xe, ye = densities.gauss_ell(self.mu[i, :], + self.va[i*self.d:i*self.d+self.d, :], dim, npoints, level) Xe.append(xe) Ye.append(ye) @@ -202,7 +290,10 @@ class GM: return Xe, Ye def check_state(self): - """ + """Returns true if the parameters of the model are valid. + + For Gaussian mixtures, this means weights summing to 1, and variances + to be positive definite. """ if not self.is_valid: raise GmParamError("""Parameters of the model has not been @@ -222,18 +313,33 @@ class GM: cond = N.zeros(self.k) ava = N.absolute(self.va) for c in range(self.k): - cond[c] = N.amax(ava[c,:]) / N.amin(ava[c,:]) + cond[c] = N.amax(ava[c, :]) / N.amin(ava[c, :]) print cond - def gen_param(self, d, nc, varmode = 'diag', spread = 1): - """Generate valid parameters for a gaussian mixture model. - d is the dimension, nc the number of components, and varmode - the mode for cov matrices. - + @classmethod + def gen_param(cls, d, nc, varmode = 'diag', spread = 1): + """Generate random, valid parameters for a gaussian mixture model. + + :Parameters: + d : int + the dimension + nc : int + the number of components + varmode : string + covariance matrix mode ('full' or 'diag'). + + :Returns: + w : ndarray + weights of the mixture + mu : ndarray + means of the mixture + w : ndarray + variances of the mixture + + Notes + ----- This is a class method. 
- - Returns: w, mu, va """ w = abs(randn(nc)) w = w / sum(w, 0) @@ -251,13 +357,13 @@ class GM: return w, mu, va - gen_param = classmethod(gen_param) + #gen_param = classmethod(gen_param) - #======================= - # Regularization methods - #======================= - def _regularize(self): - raise NotImplemented("No regularization") + # #======================= + # # Regularization methods + # #======================= + # def _regularize(self): + # raise NotImplemented("No regularization") #================= # Plotting methods @@ -266,10 +372,29 @@ class GM: level = misc.DEF_LEVEL): """Plot the ellipsoides directly for the model - Returns a list of lines, so that their style can be modified. By default, - the style is red color, and nolegend for all of them. + Returns a list of lines handle, so that their style can be modified. By + default, the style is red color, and nolegend for all of them. + + :Parameters: + dim : sequence + sequence of two integers, the dimensions of interest. + npoints : int + Number of points to use for the ellipsoids. + level : int + level of confidence (to use with fill argument) + + :Returns: + h : sequence + Returns a list of lines handle so that their properties + can be modified (eg color, label, etc...): + + Note + ---- + Does not work for 1d. Requires matplotlib - Does not work for 1d""" + :SeeAlso: + conf_ellipses is used to compute the ellipses. Use this if you want + to plot with something else than matplotlib.""" if self.is1d: raise ValueError("This function does not make sense for 1d " "mixtures.") @@ -282,22 +407,32 @@ class GM: Xe, Ye = self.conf_ellipses(dim, npoints, level) try: import pylab as P - return [P.plot(Xe[i], Ye[i], 'r', label='_nolegend_')[0] for i in range(k)] + return [P.plot(Xe[i], Ye[i], 'r', label='_nolegend_')[0] for i in + range(k)] #for i in range(k): # P.plot(Xe[i], Ye[i], 'r') except ImportError: raise GmParamError("matplotlib not found, cannot plot...") - def plot1d(self, level = 0.5, fill = 0, gpdf = 0): - """This function plots the pdfs of each component of the model. - If gpdf is 1, also plots the global pdf. If fill is 1, fill confidence - areas using level argument as a level value + def plot1d(self, level = misc.DEF_LEVEL, fill = False, gpdf = False): + """Plots the pdf of each component of the 1d mixture. - Returns a dictionary h of plot handles so that their properties can - be modified (eg color, label, etc...): - - h['pdf'] is a list of lines, one line per component pdf - - h['gpdf'] is the line for the global pdf - - h['conf'] is a list of filling area + :Parameters: + level : int + level of confidence (to use with fill argument) + fill : bool + if True, the area of the pdf corresponding to the given + confidence intervales is filled. + gpdf : bool + if True, the global pdf is plot. 
+ + :Returns: + h : dict + Returns a dictionary h of plot handles so that their properties + can be modified (eg color, label, etc...): + - h['pdf'] is a list of lines, one line per component pdf + - h['gpdf'] is the line for the global pdf + - h['conf'] is a list of filling area """ if not self.is1d: raise ValueError("This function does not make sense for " @@ -310,12 +445,12 @@ class GM: raise GmParamError("the model is not one dimensional model") from scipy.stats import norm nrm = norm(0, 1) - pval = N.sqrt(self.va[:,0]) * nrm.ppf((1+level)/2) + pval = N.sqrt(self.va[:, 0]) * nrm.ppf((1+level)/2) # Compute reasonable min/max for the normal pdf: [-mc * std, mc * std] # gives the range we are taking in account for each gaussian mc = 3 - std = N.sqrt(self.va[:,0]) + std = N.sqrt(self.va[:, 0]) m = N.amin(self.mu[:, 0] - mc * std) M = N.amax(self.mu[:, 0] + mc * std) @@ -326,7 +461,7 @@ class GM: # Prepare the dic of plot handles to return ks = ['pdf', 'conf', 'gpdf'] - hp = dict((i,[]) for i in ks) + hp = dict((i, []) for i in ks) try: import pylab as P for c in range(self.k): @@ -336,7 +471,8 @@ class GM: h = P.plot(x, y, 'r', label ='_nolegend_') hp['pdf'].extend(h) if fill: - #P.axvspan(-pval[c] + self.mu[c][0], pval[c] + self.mu[c][0], + #P.axvspan(-pval[c] + self.mu[c][0], pval[c] + + #self.mu[c][0], # facecolor = 'b', alpha = 0.2) id1 = -pval[c] + self.mu[c] id2 = pval[c] + self.mu[c] @@ -350,7 +486,8 @@ class GM: facecolor = 'b', alpha = 0.1, label='_nolegend_') hp['conf'].extend(h) #P.fill([xc[0], xc[0], xc[-1], xc[-1]], - # [0, Yf[0], Yf[-1], 0], facecolor = 'b', alpha = 0.2) + # [0, Yf[0], Yf[-1], 0], facecolor = 'b', alpha = + # 0.2) if gpdf: h = P.plot(x, Yt, 'r:', label='_nolegend_') hp['gpdf'] = h @@ -363,7 +500,7 @@ class GM: the pdf of the mixture.""" # XXX: have a public function to compute the pdf at given points # instead... - std = N.sqrt(self.va[:,0]) + std = N.sqrt(self.va[:, 0]) retval = N.empty((x.size, self.k)) for c in range(self.k): retval[:, c] = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \ @@ -373,9 +510,30 @@ class GM: def density_on_grid(self, dim = misc.DEF_VIS_DIM, nx = 50, ny = 50, maxlevel = 0.95): - """Do all the necessary computation for contour plot of mixture's density. + """Do all the necessary computation for contour plot of mixture's + density. + + :Parameters: + dim : sequence + sequence of two integers, the dimensions of interest. + nx : int + Number of points to use for the x axis of the grid + ny : int + Number of points to use for the y axis of the grid - Returns X, Y, Z and V as expected by mpl contour function.""" + :Returns: + X : ndarray + points of the x axis of the grid + Y : ndarray + points of the y axis of the grid + Z : ndarray + values of the density on X and Y + V : ndarray + Contour values to display. + + Note + ---- + X, Y, Z and V are as expected by matplotlib contour function.""" if self.is1d: raise ValueError("This function does not make sense for 1d " "mixtures.") @@ -397,13 +555,14 @@ class GM: X, Y, den = self._densityctr(N.linspace(ax[0]-0.2*w, ax[1]+0.2*w, nx), \ N.linspace(ax[2]-0.2*h, ax[3]+0.2*h, ny), dim = dim) lden = N.log(den) + # XXX: how to find "good" values for level ? 
V = [-5, -3, -1, -0.5, ] V.extend(N.linspace(0, N.max(lden), 4).tolist()) return X, Y, lden, N.array(V) - def _densityctr(self, xrange, yrange, dim = misc.DEF_VIS_DIM): + def _densityctr(self, rangex, rangey, dim = misc.DEF_VIS_DIM): """Helper function to compute density contours on a grid.""" - gr = N.meshgrid(xrange, yrange) + gr = N.meshgrid(rangex, rangey) X = gr[0].flatten() Y = gr[1].flatten() xdata = N.concatenate((X[:, N.newaxis], Y[:, N.newaxis]), axis = 1) @@ -412,7 +571,7 @@ class GM: dva = self._get_va(dim) den = densities.multiple_gauss_den(xdata, dmu, dva) * self.w den = N.sum(den, 1) - den = den.reshape(len(yrange), len(xrange)) + den = den.reshape(len(rangey), len(rangex)) X = gr[0] Y = gr[1] @@ -435,16 +594,16 @@ class GM: # Syntactic sugar def __repr__(self): - repr = "" - repr += "Gaussian Mixture:\n" - repr += " -> %d dimensions\n" % self.d - repr += " -> %d components\n" % self.k - repr += " -> %s covariance \n" % self.mode + msg = "" + msg += "Gaussian Mixture:\n" + msg += " -> %d dimensions\n" % self.d + msg += " -> %d components\n" % self.k + msg += " -> %s covariance \n" % self.mode if self.is_valid: - repr += "Has initial values""" + msg += "Has initial values""" else: - repr += "Has no initial values yet""" - return repr + msg += "Has no initial values yet""" + return msg def __str__(self): return self.__repr__() @@ -472,19 +631,26 @@ def gen_rand_index(p, n): def check_gmm_param(w, mu, va): """Check that w, mu and va are valid parameters for - a mixture of gaussian: w should sum to 1, there should - be the same number of component in each param, the variances - should be positive definite, etc... + a mixture of gaussian. + + w should sum to 1, there should be the same number of component in each + param, the variances should be positive definite, etc... 
- Params: - w = vector or list of weigths of the mixture (K elements) - mu = matrix: K * d - va = list of variances (vector K * d or square matrices Kd * d) - - returns: - K = number of components - d = dimension - mode = 'diag' if diagonal covariance, 'full' of full matrices + :Parameters: + w : ndarray + vector or list of weigths of the mixture (K elements) + mu : ndarray + matrix: K * d + va : ndarray + list of variances (vector K * d or square matrices Kd * d) + + :Returns: + k : int + number of components + d : int + dimension + mode : string + 'diag' if diagonal covariance, 'full' of full matrices """ # Check that w is valid @@ -527,34 +693,35 @@ def check_gmm_param(w, mu, va): return K, d, mode if __name__ == '__main__': - # Meta parameters: - # - k = number of components - # - d = dimension - # - mode : mode of covariance matrices - d = 5 - k = 4 - - # Now, drawing a model - mode = 'full' - nframes = 1e3 - - # Build a model with random parameters - w, mu, va = GM.gen_param(d, k, mode, spread = 3) - gm = GM.fromvalues(w, mu, va) - - # Sample nframes frames from the model - X = gm.sample(nframes) - - # Plot the data - import pylab as P - P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_') - - # Real confidence ellipses with confidence level - level = 0.50 - h = gm.plot(level=level) - - # set the first ellipse label, which will appear in the legend - h[0].set_label('confidence ell at level ' + str(level)) - - P.legend(loc = 0) - P.show() + pass + ## # Meta parameters: + ## # - k = number of components + ## # - d = dimension + ## # - mode : mode of covariance matrices + ## d = 5 + ## k = 4 + + ## # Now, drawing a model + ## mode = 'full' + ## nframes = 1e3 + + ## # Build a model with random parameters + ## w, mu, va = GM.gen_param(d, k, mode, spread = 3) + ## gm = GM.fromvalues(w, mu, va) + + ## # Sample nframes frames from the model + ## X = gm.sample(nframes) + + ## # Plot the data + ## import pylab as P + ## P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_') + + ## # Real confidence ellipses with confidence level + ## level = 0.50 + ## h = gm.plot(level=level) + + ## # set the first ellipse label, which will appear in the legend + ## h[0].set_label('confidence ell at level ' + str(level)) + + ## P.legend(loc = 0) + ## P.show() diff --git a/scikits/learn/pyem/gmm_em.py b/scikits/learn/pyem/gmm_em.py index 780361d62a2ab0ff290c7388753d132c25af330c..6e03a58c454232e44b53c2897a6e333688ac665f 100644 --- a/scikits/learn/pyem/gmm_em.py +++ b/scikits/learn/pyem/gmm_em.py @@ -1,5 +1,11 @@ # /usr/bin/python -# Last Change: Fri Jun 08 08:00 PM 2007 J +# Last Change: Sat Jun 09 10:00 PM 2007 J + +"""Module implementing GMM, a class to estimate Gaussian mixture models using +EM, and EM, a class which use GMM instances to estimate models parameters using +the ExpectationMaximization algorithm.""" + +__docformat__ = 'restructuredtext' # TODO: # - which methods to avoid va shrinking to 0 ? 
There are several options, @@ -8,22 +14,23 @@ # - online EM import numpy as N -import numpy.linalg as lin +#import numpy.linalg as lin from numpy.random import randn #import _c_densities as densities import densities #from kmean import kmean from scipy.cluster.vq import kmeans2 as kmean -from gauss_mix import GM +#from gauss_mix import GM -from misc import _DEF_ALPHA, _MIN_DBL_DELTA, _MIN_INV_COND +#from misc import _DEF_ALPHA, _MIN_DBL_DELTA, _MIN_INV_COND # Error classes class GmmError(Exception): """Base class for exceptions in this module.""" - pass + def __init__(self): + Exception.__init__(self) -class GmmParamError: +class GmmParamError(GmmError): """Exception raised for errors in gmm params Attributes: @@ -31,41 +38,33 @@ class GmmParamError: message -- explanation of the error """ def __init__(self, message): + GmmError.__init__(self) self.message = message def __str__(self): return self.message -# Not sure yet about how to design different mixture models. Most of the code -# is different # (pdf, update part of EM, etc...) and I am not sure it makes -# sense to use inheritance for # interface specification in python, since its -# dynamic type systeme. - -# Anyway, a mixture model class should encapsulates all details -# concerning getting sufficient statistics (SS), likelihood and bic. class MixtureModel(object): pass class ExpMixtureModel(MixtureModel): - """Class to model mixture of exponential pdf (eg Gaussian, exponential, Laplace, - etc..). This is a special case because some parts of EM are common for those - models...""" + """Class to model mixture of exponential pdf (eg Gaussian, exponential, + Laplace, etc..). This is a special case because some parts of EM are common + for those models...""" pass class GMM(ExpMixtureModel): - """ A class to model a Gaussian Mixture Model (GMM). An instance of - this class is created by giving weights, mean and variances in the ctor. - An instanciated object can be sampled, trained by EM. - - The class method gen_model can be used without instanciation.""" - + """ A class to model a Gaussian Mixture Model (GMM). An instance of this + class is created by giving weights, mean and variances in the ctor. An + instanciated object can be sampled, trained by EM. 
""" def init_kmean(self, data, niter = 5): """ Init the model with kmean.""" k = self.gm.k d = self.gm.d init = data[0:k, :] - # XXX: This is bogus: should do better (in kmean or here, do not know yet) + # XXX: This is bogus initialization should do better (in kmean or here, + # do not know yet): should (code, label) = kmean(data, init, niter, minit = 'matrix') w = N.ones(k) / k @@ -74,14 +73,15 @@ class GMM(ExpMixtureModel): va = N.zeros((k, d)) for i in range(k): for j in range(d): - va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0) + va[i, j] = N.cov(data[N.where(label==i), j], rowvar = 0) elif self.gm.mode == 'full': va = N.zeros((k*d, d)) for i in range(k): - va[i*d:i*d+d,:] = \ + va[i*d:i*d+d, :] = \ N.cov(data[N.where(label==i)], rowvar = 0) else: - raise GmmParamError("mode " + str(mode) + " not recognized") + raise GmmParamError("mode " + str(self.gm.mode) + \ + " not recognized") self.gm.set_param(w, mu, va) @@ -96,8 +96,8 @@ class GMM(ExpMixtureModel): mu = randn(k, d) va = N.fabs(randn(k, d)) else: - raise GmmParamError("""init_random not implemented for - mode %s yet""", mode) + raise GmmParamError("init_random not implemented for " + "mode %s yet", self.gm.mode) self.gm.set_param(w, mu, va) @@ -109,8 +109,18 @@ class GMM(ExpMixtureModel): # - To handle the different modes, we could do something "fancy" such as # replacing methods, to avoid checking cases everywhere and unconsistency. def __init__(self, gm, init = 'kmean'): - """ Initialize a GMM with weight w, mean mu and variances va, and initialization - method for training init (kmean by default)""" + """Initialize a mixture model. + + Initialize the model from a GM instance. This class implements all the + necessary functionalities for EM. + + :Parameters: + gm : GM + the mixture model to train. + init : string + initialization method to use. + + """ self.gm = gm # Possible init methods @@ -124,17 +134,18 @@ class GMM(ExpMixtureModel): self.initst = init def sufficient_statistics(self, data): - """ Return normalized and non-normalized sufficient statistics - from the model. + """Compute responsabilities. + + Return normalized and non-normalized sufficient statistics from the + model. - Computes the latent variable distribution (a - posteriori probability) knowing the explicit data - for the Gaussian model (w, mu, var): gamma(t, i) = - P[state = i | observation = data(t); w, mu, va] + Note + ---- + Computes the latent variable distribution (a posteriori probability) + knowing the explicit data for the Gaussian model (w, mu, var): gamma(t, + i) = P[state = i | observation = data(t); w, mu, va] This is basically the E step of EM for GMM.""" - n = data.shape[0] - # compute the gaussian pdf tgd = densities.multiple_gauss_den(data, self.gm.mu, self.gm.va) # multiply by the weight @@ -149,22 +160,22 @@ class GMM(ExpMixtureModel): from the a posteriori pdf, computed by gmm_posterior (E step). 
""" - k = self.gm.k - d = self.gm.d - n = data.shape[0] - invn = 1.0/n - mGamma = N.sum(gamma, axis = 0) + k = self.gm.k + d = self.gm.d + n = data.shape[0] + invn = 1.0/n + mGamma = N.sum(gamma, axis = 0) if self.gm.mode == 'diag': - mu = N.zeros((k, d)) - va = N.zeros((k, d)) - gamma = gamma.T + mu = N.zeros((k, d)) + va = N.zeros((k, d)) + gamma = gamma.T for c in range(k): - x = N.dot(gamma[c:c+1,:], data)[0,:] - xx = N.dot(gamma[c:c+1,:], data ** 2)[0,:] + x = N.dot(gamma[c:c+1, :], data)[0, :] + xx = N.dot(gamma[c:c+1, :], data ** 2)[0, :] - mu[c,:] = x / mGamma[c] - va[c,:] = xx / mGamma[c] - mu[c,:] ** 2 + mu[c, :] = x / mGamma[c] + va[c, :] = xx / mGamma[c] - mu[c, :] ** 2 w = invn * mGamma elif self.gm.mode == 'full': @@ -177,21 +188,22 @@ class GMM(ExpMixtureModel): mu = N.zeros((k, d)) va = N.zeros((k*d, d)) - gamma = gamma.transpose() + gamma = gamma.transpose() for c in range(k): #x = N.sum(N.outer(gamma[:, c], # N.ones((1, d))) * data, axis = 0) - x = N.dot(gamma[c:c+1,:], data)[0,:] - xx = N.zeros((d, d)) + x = N.dot(gamma[c:c+1, :], data)[0, :] + xx = N.zeros((d, d)) # This should be much faster than recursing on n... for i in range(d): for j in range(d): - xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[c,:], axis = 0) + xx[i, j] = N.sum(data[:, i] * data[:, j] * gamma[c, :], + axis = 0) - mu[c,:] = x / mGamma[c] - va[c*d:c*d+d,:] = xx / mGamma[c] - \ - N.outer(mu[c,:], mu[c,:]) + mu[c, :] = x / mGamma[c] + va[c*d:c*d+d, :] = xx / mGamma[c] \ + - N.outer(mu[c, :], mu[c, :]) w = invn * mGamma else: raise GmmParamError("varmode not recognized") @@ -226,19 +238,17 @@ class GMM(ExpMixtureModel): of the definition given here. """ if self.gm.mode == 'diag': - """ for a diagonal model, we have - k - 1 (k weigths, but one constraint of normality) - + k * d (means) + k * d (variances) """ + # for a diagonal model, we have k - 1 (k weigths, but one + # constraint of normality) + k * d (means) + k * d (variances) free_deg = self.gm.k * (self.gm.d * 2 + 1) - 1 elif self.gm.mode == 'full': - """ for a full model, we have - k - 1 (k weigths, but one constraint of normality) - + k * d (means) + k * d * d / 2 (each covariance matrice - has d **2 params, but with positivity constraint) """ + # for a full model, we have k - 1 (k weigths, but one constraint of + # normality) + k * d (means) + k * d * d / 2 (each covariance + # matrice has d **2 params, but with positivity constraint) if self.gm.d == 1: - free_deg = self.gm.k * 3 - 1 + free_deg = self.gm.k * 3 - 1 else: - free_deg = self.gm.k * (self.gm.d + 1 + self.gm.d ** 2 / 2) - 1 + free_deg = self.gm.k * (self.gm.d + 1 + self.gm.d ** 2 / 2) - 1 lk = self.likelihood(data) n = N.shape(data)[0] @@ -261,21 +271,32 @@ class EM: pass def train(self, data, model, maxiter = 10, thresh = 1e-5): - """ - Train a model using data, and stops when the likelihood fails - behind a threshold, or when the number of iterations > niter, - whichever comes first - - Args: - - data: contains the observed features, one row is one frame, ie one - observation of dimension d - - model: object of class Mixture - - maxiter: maximum number of iterations - - The model is trained, and its parameters updated accordingly. - - Returns: - likelihood (one value per iteration). + """Train a model using EM. 
+ + Train a model using data, and stops when the likelihood increase + between two consecutive iteration fails behind a threshold, or when the + number of iterations > niter, whichever comes first + + :Parameters: + data : ndarray + contains the observed features, one row is one frame, ie one + observation of dimension d + model : GMM + GMM instance. + maxiter : int + maximum number of iterations + thresh : threshold + if the slope of the likelihood falls below this value, the + algorithm stops. + + :Returns: + likelihood : ndarray + one value per iteration. + + Note + ---- + The model is trained, and its parameters updated accordingly, eg the + results are put in the GMM instance. """ if not isinstance(model, MixtureModel): raise TypeError("expect a MixtureModel as a model") @@ -296,61 +317,23 @@ class EM: model.update_em(data, g) if has_em_converged(like[i], like[i-1], thresh): return like[0:i] - # # Em computation, with computation of the likelihood - # g, tgd = model.sufficient_statistics(data) - # like[0] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) - # model.update_em(data, g) - # for i in range(1, maxiter): - # print "=== Iteration %d ===" % i - # isreg = False - # for j in range(model.gm.k): - # va = model.gm.va[j] - # if va.any() < _MIN_INV_COND: - # isreg = True - # print "\tregularization detected" - # print "\t" + str(va) - # model.gm.va[j] = regularize_diag(va) - # print "\t" + str(va) + ", " + str(model.gm.va[j]) - # print "\t" + str(gauss_den(data, model.gm.mu[j], model.gm.va[j])) - # print "\tend regularization detected" - # var = va - # - # g, tgd = model.sufficient_statistics(data) - # try: - # assert not( (N.isnan(tgd)).any() ) - # if isreg: - # print var - # except AssertionError: - # print "tgd is nan..." - # print model.gm.va[13,:] - # print 1/model.gm.va[13,:] - # print densities.gauss_den(data, model.gm.mu[13], model.gm.va[13]) - # print N.isnan((multiple_gauss_den(data, model.gm.mu, model.gm.va))).any() - # print "Exciting" - # import sys - # sys.exit(-1) - # like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) - # model.update_em(data, g) - # assert not( model.gm.va.any() < 1e-6) - # if has_em_converged(like[i], like[i-1], thresh): - # return like[0:i] return like -def regularize_diag(variance, alpha = _DEF_ALPHA): - delta = N.sum(variance) / variance.size - if delta > _MIN_DBL_DELTA: - return variance + alpha * delta - else: - return variance + alpha * _MIN_DBL_DELTA - -def regularize_full(variance): - # Trace of a positive definite matrix is always > 0 - delta = N.trace(variance) / variance.shape[0] - if delta > _MIN_DBL_DELTA: - return variance + alpha * delta - else: - return variance + alpha * _MIN_DBL_DELTA +#def regularize_diag(variance, alpha = _DEF_ALPHA): +# delta = N.sum(variance) / variance.size +# if delta > _MIN_DBL_DELTA: +# return variance + alpha * delta +# else: +# return variance + alpha * _MIN_DBL_DELTA +# +#def regularize_full(variance): +# # Trace of a positive definite matrix is always > 0 +# delta = N.trace(variance) / variance.shape[0] +# if delta > _MIN_DBL_DELTA: +# return variance + alpha * delta +# else: +# return variance + alpha * _MIN_DBL_DELTA # Misc functions def bic(lk, deg, n): @@ -369,127 +352,129 @@ def has_em_converged(like, plike, thresh): return False if __name__ == "__main__": - import copy - #============================= - # Simple GMM with 5 components - #============================= - - #+++++++++++++++++++++++++++++ - # Meta parameters of the model - # - k: Number of components - # - d: dimension of each Gaussian - # - mode: 
Mode of covariance matrix: full or diag - # - nframes: number of frames (frame = one data point = one - # row of d elements - k = 2 - d = 1 - mode = 'full' - nframes = 1e3 - - #+++++++++++++++++++++++++++++++++++++++++++ - # Create an artificial GMM model, samples it - #+++++++++++++++++++++++++++++++++++++++++++ - print "Generating the mixture" - # Generate a model with k components, d dimensions - w, mu, va = GM.gen_param(d, k, mode, spread = 3) - gm = GM(d, k, mode) - gm.set_param(w, mu, va) - - # Sample nframes frames from the model - data = gm.sample(nframes) - - #++++++++++++++++++++++++ - # Learn the model with EM - #++++++++++++++++++++++++ - - # Init the model - print "Init a model for learning, with kmean for initialization" - lgm = GM(d, k, mode) - gmm = GMM(lgm, 'kmean') - gmm.init(data) - - # Keep the initialized model for drawing - gm0 = copy.copy(lgm) - - # The actual EM, with likelihood computation - niter = 10 - like = N.zeros(niter) - - print "computing..." - for i in range(niter): - g, tgd = gmm.sufficient_statistics(data) - like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) - gmm.update_em(data, g) - # # Alternative form, by using EM class: as the EM class - # # is quite rudimentary now, it is not very useful, just save - # # a few lines - # em = EM() - # like = em.train(data, gmm, niter) - - #+++++++++++++++ - # Draw the model - #+++++++++++++++ - print "drawing..." - import pylab as P - P.subplot(2, 1, 1) - - if not d == 1: - # Draw what is happening - P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_') - - # Real confidence ellipses - Xre, Yre = gm.conf_ellipses() - P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides') - for i in range(1,k): - P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_') - - # Initial confidence ellipses as found by kmean - X0e, Y0e = gm0.conf_ellipses() - P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides') - for i in range(1,k): - P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_') - - # Values found by EM - Xe, Ye = lgm.conf_ellipses() - P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM') - for i in range(1,k): - P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_') - P.legend(loc = 0) - else: - # Real confidence ellipses - h = gm.plot1d() - [i.set_color('g') for i in h['pdf']] - h['pdf'][0].set_label('true pdf') - - # Initial confidence ellipses as found by kmean - h0 = gm0.plot1d() - [i.set_color('k') for i in h0['pdf']] - h0['pdf'][0].set_label('initial pdf') - - # Values found by EM - hl = lgm.plot1d(fill = 1, level = 0.66) - [i.set_color('r') for i in hl['pdf']] - hl['pdf'][0].set_label('pdf found by EM') - - P.legend(loc = 0) - - P.subplot(2, 1, 2) - P.plot(like) - P.title('log likelihood') - - # #++++++++++++++++++ - # # Export the figure - # #++++++++++++++++++ - # F = P.gcf() - # DPI = F.get_dpi() - # DefaultSize = F.get_size_inches() - # # the default is 100dpi for savefig: - # F.savefig("example1.png") - - # # Now make the image twice as big, while keeping the fonts and all the - # # same size - # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) ) - # Size = F.get_size_inches() - # print "Size in Inches", Size - # F.savefig("example2.png") - P.show() + pass + ## import copy + ## #============================= + ## # Simple GMM with 5 components + ## #============================= + + ## #+++++++++++++++++++++++++++++ + ## # Meta parameters of the model + ## # - k: Number of components + ## # - d: dimension of each Gaussian + ## # - mode: Mode of covariance matrix: full or diag 
+ ## # - nframes: number of frames (frame = one data point = one + ## # row of d elements + ## k = 2 + ## d = 1 + ## mode = 'full' + ## nframes = 1e3 + + ## #+++++++++++++++++++++++++++++++++++++++++++ + ## # Create an artificial GMM model, samples it + ## #+++++++++++++++++++++++++++++++++++++++++++ + ## print "Generating the mixture" + ## # Generate a model with k components, d dimensions + ## w, mu, va = GM.gen_param(d, k, mode, spread = 3) + ## gm = GM(d, k, mode) + ## gm.set_param(w, mu, va) + + ## # Sample nframes frames from the model + ## data = gm.sample(nframes) + + ## #++++++++++++++++++++++++ + ## # Learn the model with EM + ## #++++++++++++++++++++++++ + + ## # Init the model + ## print "Init a model for learning, with kmean for initialization" + ## lgm = GM(d, k, mode) + ## gmm = GMM(lgm, 'kmean') + ## gmm.init(data) + + ## # Keep the initialized model for drawing + ## gm0 = copy.copy(lgm) + + ## # The actual EM, with likelihood computation + ## niter = 10 + ## like = N.zeros(niter) + + ## print "computing..." + ## for i in range(niter): + ## g, tgd = gmm.sufficient_statistics(data) + ## like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) + ## gmm.update_em(data, g) + ## # # Alternative form, by using EM class: as the EM class + ## # # is quite rudimentary now, it is not very useful, just save + ## # # a few lines + ## # em = EM() + ## # like = em.train(data, gmm, niter) + + ## #+++++++++++++++ + ## # Draw the model + ## #+++++++++++++++ + ## print "drawing..." + ## import pylab as P + ## P.subplot(2, 1, 1) + + ## if not d == 1: + ## # Draw what is happening + ## P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_') + + ## # Real confidence ellipses + ## Xre, Yre = gm.conf_ellipses() + ## P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides') + ## for i in range(1,k): + ## P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_') + + ## # Initial confidence ellipses as found by kmean + ## X0e, Y0e = gm0.conf_ellipses() + ## P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides') + ## for i in range(1,k): + ## P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_') + + ## # Values found by EM + ## Xe, Ye = lgm.conf_ellipses() + ## P.plot(Xe[0], Ye[0], 'r', label = "confidence ellipsoides found by" + ## "EM") + ## for i in range(1,k): + ## P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_') + ## P.legend(loc = 0) + ## else: + ## # Real confidence ellipses + ## h = gm.plot1d() + ## [i.set_color('g') for i in h['pdf']] + ## h['pdf'][0].set_label('true pdf') + + ## # Initial confidence ellipses as found by kmean + ## h0 = gm0.plot1d() + ## [i.set_color('k') for i in h0['pdf']] + ## h0['pdf'][0].set_label('initial pdf') + + ## # Values found by EM + ## hl = lgm.plot1d(fill = 1, level = 0.66) + ## [i.set_color('r') for i in hl['pdf']] + ## hl['pdf'][0].set_label('pdf found by EM') + + ## P.legend(loc = 0) + + ## P.subplot(2, 1, 2) + ## P.plot(like) + ## P.title('log likelihood') + + ## # #++++++++++++++++++ + ## # # Export the figure + ## # #++++++++++++++++++ + ## # F = P.gcf() + ## # DPI = F.get_dpi() + ## # DefaultSize = F.get_size_inches() + ## # # the default is 100dpi for savefig: + ## # F.savefig("example1.png") + + ## # # Now make the image twice as big, while keeping the fonts and all the + ## # # same size + ## # F.set_figsize_inches( (DefaultSize[0]*2, DefaultSize[1]*2) ) + ## # Size = F.get_size_inches() + ## # print "Size in Inches", Size + ## # F.savefig("example2.png") + ## P.show() diff --git a/scikits/learn/pyem/info.py b/scikits/learn/pyem/info.py index 
cf107708a96adad4d51370f49c9a136f69bb88e8..eaf502c1905e82fcdb4f67dbce81ae1c2f0d03b0 100644 --- a/scikits/learn/pyem/info.py +++ b/scikits/learn/pyem/info.py @@ -1,61 +1,63 @@ """ -Routines for Gaussian Mixture Models -and learning with Expectation Maximization -========================================== +Routines for Gaussian Mixture Models and learning with Expectation Maximization +=============================================================================== -This module contains classes and function to compute multivariate Gaussian densities -(diagonal and full covariance matrices), Gaussian mixtures, Gaussian mixtures models -and an Em trainer. +This module contains classes and function to compute multivariate Gaussian +densities (diagonal and full covariance matrices), Gaussian mixtures, Gaussian +mixtures models and an Em trainer. More specifically, the module defines the following classes, functions: - densities.gauss_den: function to compute multivariate Gaussian pdf -- gauss_mix.GM: defines the GM (Gaussian Mixture) class. A Gaussian Mixture can be -created from its parameters weights, mean and variances, or from its meta parameters -d (dimension of the Gaussian) and k (number of components in the mixture). A Gaussian -Model can then be sampled or plot (if d>1, plot confidence ellipsoids projected on -2 chosen dimensions, if d == 1, plot the pdf of each component and fill the zone -of confidence for a given level) -- gmm_em.GMM: defines a class GMM (Gaussian Mixture Model). This class is constructed -from a GM model gm, and can be used to train gm. The GMM can be initiated by -kmean or at random, and can compute sufficient statistics, and update its parameters -from the sufficient statistics. -- kmean.kmean: implements a kmean algorithm. We cannot use scipy.cluster.vq kmeans, since -its does not give membership of observations. +- gauss_mix.GM: defines the GM (Gaussian Mixture) class. A Gaussian Mixture can + be created from its parameters weights, mean and variances, or from its meta + parameters d (dimension of the Gaussian) and k (number of components in the + mixture). A Gaussian Model can then be sampled or plot (if d>1, plot + confidence ellipsoids projected on 2 chosen dimensions, if d == 1, plot the + pdf of each component and fill the zone of confidence for a given level) +- gmm_em.GMM: defines a class GMM (Gaussian Mixture Model). This class is + constructed from a GM model gm, and can be used to train gm. The GMM can be + initiated by kmean or at random, and can compute sufficient statistics, and + update its parameters from the sufficient statistics. +- kmean.kmean: implements a kmean algorithm. We cannot use scipy.cluster.vq + kmeans, since its does not give membership of observations. Example of use: - #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - # Create an artificial 2 dimension, 3 clusters GM model, samples it - #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - w, mu, va = GM.gen_param(2, 3, 'diag', spread = 1.5) - gm = GM.fromvalues(w, mu, va) +--------------- - # Sample 1000 frames from the model - data = gm.sample(1000) - - #++++++++++++++++++++++++ - # Learn the model with EM - #++++++++++++++++++++++++ - # Init the model - lgm = GM(d, k, mode) - gmm = GMM(lgm, 'kmean') - - # The actual EM, with likelihood computation. 
The threshold - # is compared to the (linearly appromixated) derivative of the likelihood - em = EM() - like = em.train(data, gmm, maxiter = 30, thresh = 1e-8) +>>> #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +>>> # Create an artificial 2 dimension, 3 clusters GM model, samples it +>>> #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +>>> w, mu, va = GM.gen_param(2, 3, 'diag', spread = 1.5) +>>> gm = GM.fromvalues(w, mu, va) +>>> +>>> # Sample 1000 frames from the model +>>> data = gm.sample(1000) +>>> +>>> #++++++++++++++++++++++++ +>>> # Learn the model with EM +>>> #++++++++++++++++++++++++ +>>> # Init the model +>>> lgm = GM(d, k, mode) +>>> gmm = GMM(lgm, 'kmean') +>>> +>>> # The actual EM, with likelihood computation. The threshold +>>> # is compared to the (linearly appromixated) derivative of the likelihood +>>> em = EM() +>>> like = em.train(data, gmm, maxiter = 30, thresh = 1e-8) Files example.py and example2.py show more capabilities of the toolbox, including plotting capabilities (using matplotlib) and model selection using Bayesian Information Criterion (BIC). Bibliography: - * Maximum likelihood from incomplete data via the EM algorithm in Journal of - the Royal Statistical Society, Series B, 39(1):1--38, 1977, by A. P. Dempster, - N. M. Laird, and D. B. Rubin - * Bayesian Approaches to Gaussian Mixture Modelling (1998) by - Stephen J. Roberts, Dirk Husmeier, Iead Rezek, William Penny in - IEEE Transactions on Pattern Analysis and Machine Intelligence + +- Maximum likelihood from incomplete data via the EM algorithm in Journal of + the Royal Statistical Society, Series B, 39(1):1--38, 1977, by A. P. + Dempster, N. M. Laird, and D. B. Rubin +- Bayesian Approaches to Gaussian Mixture Modelling (1998) by Stephen J. + Roberts, Dirk Husmeier, Iead Rezek, William Penny in IEEE Transactions on + Pattern Analysis and Machine Intelligence Copyright: David Cournapeau 2006 License: BSD-style (see LICENSE.txt in main source directory) diff --git a/scikits/learn/pyem/online_em.py b/scikits/learn/pyem/online_em.py index 4d599a75c085916bc4facd679ba854cb57716204..e8539038b5980421c8fc10515f9564b258e215d1 100644 --- a/scikits/learn/pyem/online_em.py +++ b/scikits/learn/pyem/online_em.py @@ -1,28 +1,27 @@ # /usr/bin/python -# Last Change: Fri Jun 08 08:00 PM 2007 J +# Last Change: Sat Jun 09 10:00 PM 2007 J -#--------------------------------------------- -# This is not meant to be used yet !!!! I am -# not sure how to integrate this stuff inside -# the package yet. The cases are: -# - we have a set of data, and we want to test online EM -# compared to normal EM -# - we do not have all the data before putting them in online EM: -# eg current frame depends on previous frame in some way. +# This is not meant to be used yet !!!! I am not sure how to integrate this +# stuff inside the package yet. The cases are: +# - we have a set of data, and we want to test online EM compared to normal +# EM +# - we do not have all the data before putting them in online EM: eg current +# frame depends on previous frame in some way. # TODO: # - Add biblio -# - Look back at articles for discussion for init, regularization and +# - Look back at articles for discussion for init, regularization and # convergence rates -# - the function sufficient_statistics does not really return SS. This is not a -# big problem, but it would be better to really return them as the name implied. +# - the function sufficient_statistics does not really return SS. 
This is not +# a big problem, but it would be better to really return them as the name +# implied. import numpy as N from numpy import mean from numpy.testing import assert_array_almost_equal, assert_array_equal -from gmm_em import ExpMixtureModel, GMM, EM -from gauss_mix import GM +from gmm_em import ExpMixtureModel#, GMM, EM +#from gauss_mix import GM from scipy.cluster.vq import kmeans2 as kmean import densities2 as D @@ -60,22 +59,24 @@ class OnGMM(ExpMixtureModel): k = self.gm.k d = self.gm.d if self.gm.mode == 'diag': - w = N.ones(k) / k + w = N.ones(k) / k # Init the internal state of EM - self.cx = N.outer(w, mean(init_data, 0)) - self.cxx = N.outer(w, mean(init_data ** 2, 0)) + self.cx = N.outer(w, mean(init_data, 0)) + self.cxx = N.outer(w, mean(init_data ** 2, 0)) # w, mu and va init is the same that in the standard case - (code, label) = kmean(init_data, init_data[0:k, :], iter = niter, minit = 'matrix') - mu = code.copy() - va = N.zeros((k, d)) + (code, label) = kmean(init_data, init_data[0:k, :], iter = 10, + minit = 'matrix') + mu = code.copy() + va = N.zeros((k, d)) for i in range(k): for j in range(d): - va [i,j] = N.cov(init_data[N.where(label==i), j], rowvar = 0) + va [i, j] = N.cov(init_data[N.where(label==i), j], + rowvar = 0) else: raise OnGmmParamError("""init_online not implemented for - mode %s yet""", mode) + mode %s yet""", self.gm.mode) self.gm.set_param(w, mu, va) # c* are the parameters which are computed at every step (ie @@ -95,22 +96,24 @@ class OnGMM(ExpMixtureModel): k = self.gm.k d = self.gm.d if self.gm.mode == 'diag': - w = N.ones(k) / k + w = N.ones(k) / k # Init the internal state of EM - self.cx = N.outer(w, mean(init_data, 0)) - self.cxx = N.outer(w, mean(init_data ** 2, 0)) + self.cx = N.outer(w, mean(init_data, 0)) + self.cxx = N.outer(w, mean(init_data ** 2, 0)) # w, mu and va init is the same that in the standard case - (code, label) = kmean(init_data, init_data[0:k, :], iter = niter, minit = 'matrix') - mu = code.copy() - va = N.zeros((k, d)) + (code, label) = kmean(init_data, init_data[0:k, :], + iter = niter, minit = 'matrix') + mu = code.copy() + va = N.zeros((k, d)) for i in range(k): for j in range(d): - va [i,j] = N.cov(init_data[N.where(label==i), j], rowvar = 0) + va[i, j] = N.cov(init_data[N.where(label==i), j], + rowvar = 0) else: raise OnGmmParamError("""init_online not implemented for - mode %s yet""", mode) + mode %s yet""", self.gm.mode) self.gm.set_param(w, mu, va) # c* are the parameters which are computed at every step (ie @@ -278,132 +281,133 @@ def multiple_gauss_den_frame(data, mu, va): if __name__ == '__main__': - d = 1 - k = 2 - mode = 'diag' - nframes = int(5e3) - emiter = 4 - seed(5) - - #+++++++++++++++++++++++++++++++++++++++++++++++++ - # Generate a model with k components, d dimensions - #+++++++++++++++++++++++++++++++++++++++++++++++++ - w, mu, va = GM.gen_param(d, k, mode, spread = 1.5) - gm = GM.fromvalues(w, mu, va) - # Sample nframes frames from the model - data = gm.sample(nframes) - - #++++++++++++++++++++++++++++++++++++++++++ - # Approximate the models with classical EM - #++++++++++++++++++++++++++++++++++++++++++ - # Init the model - lgm = GM(d, k, mode) - gmm = GMM(lgm, 'kmean') - gmm.init(data) - - gm0 = copy.copy(gmm.gm) - # The actual EM, with likelihood computation - like = N.zeros(emiter) - for i in range(emiter): - g, tgd = gmm.sufficient_statistics(data) - like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) - gmm.update_em(data, g) - - #++++++++++++++++++++++++++++++++++++++++ - # Approximate the 
models with online EM - #++++++++++++++++++++++++++++++++++++++++ - ogm = GM(d, k, mode) - ogmm = OnGMM(ogm, 'kmean') - init_data = data[0:nframes / 20, :] - ogmm.init(init_data) - - # Forgetting param - ku = 0.005 - t0 = 200 - lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0) - nu0 = 0.2 - nu = N.zeros((len(lamb), 1)) - nu[0] = nu0 - for i in range(1, len(lamb)): - nu[i] = 1./(1 + lamb[i] / nu[i-1]) - - print "meth1" - # object version of online EM - for t in range(nframes): - ogmm.compute_sufficient_statistics_frame(data[t], nu[t]) - ogmm.update_em_frame() - - ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva) - - # 1d optimized version - ogm2 = GM(d, k, mode) - ogmm2 = OnGMM1d(ogm2, 'kmean') - ogmm2.init(init_data[:, 0]) - - print "meth2" - # object version of online EM - for t in range(nframes): - ogmm2.compute_sufficient_statistics_frame(data[t, 0], nu[t]) - ogmm2.update_em_frame() - - #ogmm2.gm.set_param(ogmm2.cw, ogmm2.cmu, ogmm2.cva) - - print ogmm.cw - print ogmm2.cw - #+++++++++++++++ - # Draw the model - #+++++++++++++++ - print "drawing..." - import pylab as P - P.subplot(2, 1, 1) - - if not d == 1: - # Draw what is happening - P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_') - - h = gm.plot() - [i.set_color('g') for i in h] - h[0].set_label('true confidence ellipsoides') - - h = gm0.plot() - [i.set_color('k') for i in h] - h[0].set_label('initial confidence ellipsoides') - - h = lgm.plot() - [i.set_color('r') for i in h] - h[0].set_label('confidence ellipsoides found by EM') - - h = ogmm.gm.plot() - [i.set_color('m') for i in h] - h[0].set_label('confidence ellipsoides found by Online EM') - - # P.legend(loc = 0) - else: - # Real confidence ellipses - h = gm.plot1d() - [i.set_color('g') for i in h['pdf']] - h['pdf'][0].set_label('true pdf') - - # Initial confidence ellipses as found by kmean - h0 = gm0.plot1d() - [i.set_color('k') for i in h0['pdf']] - h0['pdf'][0].set_label('initial pdf') - - # Values found by EM - hl = lgm.plot1d(fill = 1, level = 0.66) - [i.set_color('r') for i in hl['pdf']] - hl['pdf'][0].set_label('pdf found by EM') - - P.legend(loc = 0) - - # Values found by Online EM - hl = ogmm.gm.plot1d(fill = 1, level = 0.66) - [i.set_color('m') for i in hl['pdf']] - hl['pdf'][0].set_label('pdf found by Online EM') - - P.legend(loc = 0) - - P.subplot(2, 1, 2) - P.plot(nu) - P.title('Learning rate') - P.show() + pass + #d = 1 + #k = 2 + #mode = 'diag' + #nframes = int(5e3) + #emiter = 4 + #seed(5) + + ##+++++++++++++++++++++++++++++++++++++++++++++++++ + ## Generate a model with k components, d dimensions + ##+++++++++++++++++++++++++++++++++++++++++++++++++ + #w, mu, va = GM.gen_param(d, k, mode, spread = 1.5) + #gm = GM.fromvalues(w, mu, va) + ## Sample nframes frames from the model + #data = gm.sample(nframes) + + ##++++++++++++++++++++++++++++++++++++++++++ + ## Approximate the models with classical EM + ##++++++++++++++++++++++++++++++++++++++++++ + ## Init the model + #lgm = GM(d, k, mode) + #gmm = GMM(lgm, 'kmean') + #gmm.init(data) + + #gm0 = copy.copy(gmm.gm) + ## The actual EM, with likelihood computation + #like = N.zeros(emiter) + #for i in range(emiter): + # g, tgd = gmm.sufficient_statistics(data) + # like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) + # gmm.update_em(data, g) + + ##++++++++++++++++++++++++++++++++++++++++ + ## Approximate the models with online EM + ##++++++++++++++++++++++++++++++++++++++++ + #ogm = GM(d, k, mode) + #ogmm = OnGMM(ogm, 'kmean') + #init_data = data[0:nframes / 20, :] + #ogmm.init(init_data) + + ## Forgetting param + 
#ku = 0.005 + #t0 = 200 + #lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0) + #nu0 = 0.2 + #nu = N.zeros((len(lamb), 1)) + #nu[0] = nu0 + #for i in range(1, len(lamb)): + # nu[i] = 1./(1 + lamb[i] / nu[i-1]) + + #print "meth1" + ## object version of online EM + #for t in range(nframes): + # ogmm.compute_sufficient_statistics_frame(data[t], nu[t]) + # ogmm.update_em_frame() + + #ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva) + + ## 1d optimized version + #ogm2 = GM(d, k, mode) + #ogmm2 = OnGMM1d(ogm2, 'kmean') + #ogmm2.init(init_data[:, 0]) + + #print "meth2" + ## object version of online EM + #for t in range(nframes): + # ogmm2.compute_sufficient_statistics_frame(data[t, 0], nu[t]) + # ogmm2.update_em_frame() + + ##ogmm2.gm.set_param(ogmm2.cw, ogmm2.cmu, ogmm2.cva) + + #print ogmm.cw + #print ogmm2.cw + ##+++++++++++++++ + ## Draw the model + ##+++++++++++++++ + #print "drawing..." + #import pylab as P + #P.subplot(2, 1, 1) + + #if not d == 1: + # # Draw what is happening + # P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_') + + # h = gm.plot() + # [i.set_color('g') for i in h] + # h[0].set_label('true confidence ellipsoides') + + # h = gm0.plot() + # [i.set_color('k') for i in h] + # h[0].set_label('initial confidence ellipsoides') + + # h = lgm.plot() + # [i.set_color('r') for i in h] + # h[0].set_label('confidence ellipsoides found by EM') + + # h = ogmm.gm.plot() + # [i.set_color('m') for i in h] + # h[0].set_label('confidence ellipsoides found by Online EM') + + # # P.legend(loc = 0) + #else: + # # Real confidence ellipses + # h = gm.plot1d() + # [i.set_color('g') for i in h['pdf']] + # h['pdf'][0].set_label('true pdf') + + # # Initial confidence ellipses as found by kmean + # h0 = gm0.plot1d() + # [i.set_color('k') for i in h0['pdf']] + # h0['pdf'][0].set_label('initial pdf') + + # # Values found by EM + # hl = lgm.plot1d(fill = 1, level = 0.66) + # [i.set_color('r') for i in hl['pdf']] + # hl['pdf'][0].set_label('pdf found by EM') + + # P.legend(loc = 0) + + # # Values found by Online EM + # hl = ogmm.gm.plot1d(fill = 1, level = 0.66) + # [i.set_color('m') for i in hl['pdf']] + # hl['pdf'][0].set_label('pdf found by Online EM') + + # P.legend(loc = 0) + + #P.subplot(2, 1, 2) + #P.plot(nu) + #P.title('Learning rate') + #P.show()
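
The online EM demo above is now disabled (the ``__main__`` block only contains
``pass``), but the forgetting-factor schedule and the per-frame update loop it
used are easy to reproduce on their own. The snippet below is a minimal,
standalone sketch of that schedule and loop, reconstructed from the
commented-out demo; it reuses the same constants (ku = 0.005, t0 = 200,
nu0 = 0.2, 5000 frames) and assumes ``GM`` and ``OnGMM`` can be imported from
``gauss_mix`` and ``online_em`` as in this patch. It is an illustration of the
intended usage, not part of the patch itself::

    import numpy as N

    from gauss_mix import GM      # GM model container, as used in the demo
    from online_em import OnGMM   # online EM class defined in this module

    d, k, mode = 1, 2, 'diag'
    nframes = int(5e3)

    # Build a reference model and sample some data from it.
    w, mu, va = GM.gen_param(d, k, mode, spread = 1.5)
    gm = GM.fromvalues(w, mu, va)
    data = gm.sample(nframes)

    # Forgetting-factor schedule: lamb[t] grows towards 1 with t, and the
    # learning rate follows the recursion nu[t] = 1 / (1 + lamb[t] / nu[t-1]).
    ku = 0.005
    t0 = 200
    nu0 = 0.2
    lamb = 1 - 1. / (N.arange(-1, nframes - 1) * ku + t0)
    nu = N.zeros((len(lamb), 1))
    nu[0] = nu0
    for i in range(1, len(lamb)):
        nu[i] = 1. / (1 + lamb[i] / nu[i - 1])

    # Online EM: initialize on a small chunk, then update frame by frame.
    ogm = GM(d, k, mode)
    ogmm = OnGMM(ogm, 'kmean')
    ogmm.init(data[0:nframes // 20, :])
    for t in range(nframes):
        ogmm.compute_sufficient_statistics_frame(data[t], nu[t])
        ogmm.update_em_frame()
    # Push the accumulated parameters back into the GM object.
    ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva)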