diff --git a/scikits/learn/pyem/.bzrignore b/scikits/learn/pyem/.bzrignore
index 3796250d8864adefc4ba425edab4f038dc6e807c..4d275174e7262fd00235790414a1cfd4466522cc 100644
--- a/scikits/learn/pyem/.bzrignore
+++ b/scikits/learn/pyem/.bzrignore
@@ -19,3 +19,6 @@ pyem/tmp/kmean.py
 pyem/tmp/blop.py
 pyem/tmp/
 pyem/tmp
+matcode
+../MSG
+MSG
diff --git a/scikits/learn/pyem/Changelog b/scikits/learn/pyem/Changelog
index 9550e650640546d934ef32ef184a3e6963e0b564..4be45d61b0e50f751b6ea922004fccb2ffcc0968 100644
--- a/scikits/learn/pyem/Changelog
+++ b/scikits/learn/pyem/Changelog
@@ -1,3 +1,14 @@
+pyem (0.5.2) Tue, 29 Aug 2006 14:53:49 +0900
+
+	* Add a class method to GM to create a model directly from
+	w, mean and var values (uses a decorator: python 2.4)
+	* Add a plot1d method to GM to plot a GM in one dimension (where
+	the confidence-ellipse kind of plot does not make sense). This draws
+	each Gaussian pdf and fills the area within the confidence interval
+	(matplotlib is really cool)
+
+-- David Cournapeau <david@ar.media.kyoto-u.ac.jp>
+
 pyem (0.5.2) Mon, 28 Aug 2006 13:20:13 +0900
 
 	* Add a plot method to Gm class, so that it is easier
diff --git a/scikits/learn/pyem/pyem/densities.py b/scikits/learn/pyem/pyem/densities.py
index 0f1f1941b8db2739aaac029742d20be5e95aabdf..71e190d433fff570a22e2709a739d2e34ce00a9a 100644
--- a/scikits/learn/pyem/pyem/densities.py
+++ b/scikits/learn/pyem/pyem/densities.py
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Mon Aug 28 12:00 PM 2006 J
+# Last Change: Tue Aug 29 04:00 PM 2006 J
 
 import numpy as N
 import numpy.linalg as lin
@@ -204,8 +204,6 @@ def gauss_ell(mu, va, dim = [0, 1], npoints = 100, level = 0.39):
     else:
         raise DenError("mean and variance are not dim conformant")
 
-    # TODO: Get a confidence interval using the Chi2 distribution
-    # of points at a given mahalanobis distance...
     chi22d  = chi2(2)
     mahal   = N.sqrt(chi22d.ppf(level))
 
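Note on the gauss_ell hunk above: the removed TODO is obsolete because the code already derives the ellipse radius from the chi-square quantile. For a 2-d Gaussian, the squared Mahalanobis distance follows a chi2 distribution with 2 degrees of freedom, so the radius enclosing a given probability mass is the square root of the chi2 ppf. A standalone sketch of that relationship (variable names here are illustrative, not pyem API):

    import numpy as N
    from scipy.stats import chi2

    level = 0.39                          # default confidence level in gauss_ell
    mahal = N.sqrt(chi2(2).ppf(level))    # Mahalanobis radius enclosing `level` mass

    # points of the corresponding circle for a standard 2-d Gaussian; gauss_ell
    # then maps such a circle through the actual mean and covariance
    theta  = N.linspace(0, 2 * N.pi, 100)
    circle = mahal * N.vstack((N.cos(theta), N.sin(theta)))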
diff --git a/scikits/learn/pyem/pyem/gauss_mix.py b/scikits/learn/pyem/pyem/gauss_mix.py
index ac6ff3268305fe8267cacabb18c23af2ae22d3ad..947931374b62d819508c2c44fd86659e6f33704d 100644
--- a/scikits/learn/pyem/pyem/gauss_mix.py
+++ b/scikits/learn/pyem/pyem/gauss_mix.py
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Mon Aug 28 01:00 PM 2006 J
+# Last Change: Tue Aug 29 08:00 PM 2006 J
 
 # Module to implement GaussianMixture class.
 
@@ -8,13 +8,15 @@ from numpy.random import randn, rand
 import numpy.linalg as lin
 import densities
 
-MAX_DEV = 1e-10
+MAX_DEV     = 1e-10
+MAX_COND    = 1e10
 
 # Right now, two main usages of a Gaussian Model are possible
 #   - init a Gaussian Model with meta-parameters, and trains it
 #   - set-up a Gaussian Model to sample it, draw ellipsoides
 #   of confidences. In this case, we would like to init it with
-#   known values of parameters.
+#   known values of parameters. This can be done with the class method
+#   fromvalues
 #
 # For now, we have to init with meta-parameters, and set
 # the parameters afterward. There should be a better way ?
@@ -24,6 +26,8 @@ MAX_DEV = 1e-10
 #   be used as long as w, mu and va are not set
 #   - We have to use scipy now for chisquare pdf, so there may be other
 #   methods to be used, ie for implementing random index.
+#   - there is no check on the internal state of the GM, that is, whether
+#   the w, mu and va values make sense (eg no singular covariance)
 
 class GmParamError:
     """Exception raised for errors in gmm params
@@ -70,7 +74,9 @@
         self.is_valid   = False
 
     def set_param(self, weights, mu, sigma):
-        """Set parameters of the model"""
+        """Set parameters of the model. Args should
+        be conformant with the meta-parameters d and k
+        given during initialisation."""
         k, d, mode  = check_gmm_param(weights, mu, sigma)
         if not k == self.k:
             raise GmParamError("Number of given components is %d, expected %d"
@@ -87,6 +93,26 @@
 
         self.is_valid   = True
 
+    @classmethod
+    def fromvalues(cls, weights, mu, sigma):
+        """This class method can be used to create a GM model
+        directly from its parameters weights, mean and variance:
+
+        w, mu, va   = GM.gen_param(d, k)
+        gm  = GM(d, k)
+        gm.set_param(w, mu, va)
+
+        and
+
+        w, mu, va   = GM.gen_param(d, k)
+        gm  = GM.fromvalues(w, mu, va)
+
+        are equivalent."""
+        k, d, mode  = check_gmm_param(weights, mu, sigma)
+        res = cls(d, k, mode)
+        res.set_param(weights, mu, sigma)
+        return res
+
     def sample(self, nframes):
         """ Sample nframes frames from the model """
         if not self.is_valid:
@@ -139,6 +165,10 @@
         Will plot samples X draw from the mixture model, and plot the ellipses
         of equi-probability from the mean with fixed level of confidence 0.39.
         """
+        if not self.is_valid:
+            raise GmParamError("""Parameters of the model have not been
+            set yet, please set them using self.set_param()""")
+
         Xe  = []
         Ye  = []
         if self.mode == 'diag':
@@ -157,6 +187,31 @@
 
         return Xe, Ye
 
+    def check_state(self):
+        """Check that the model parameters look numerically sane
+        (for now, only the conditioning of diagonal covariances)."""
+        if not self.is_valid:
+            raise GmParamError("""Parameters of the model have not been
+            set yet, please set them using self.set_param()""")
+
+        if self.mode == 'full':
+            raise NotImplementedError, "not implemented for full mode yet"
+
+        # # How to check w: if one component is negligible, what shall
+        # # we do ?
+        # M = N.max(self.w)
+        # m = N.min(self.w)
+
+        # maxc  = m / M
+
+        # Check condition number for cov matrix
+        cond    = N.zeros(self.k)
+        ava     = N.absolute(self.va)
+        for c in range(self.k):
+            cond[c] = N.amax(ava[c,:]) / N.amin(ava[c,:])
+
+        print cond
+
     def gen_param(self, d, nc, varmode = 'diag', spread = 1):
         """Generate valid parameters for a gaussian mixture model.
         d is the dimension, nc the number of components, and varmode
@@ -188,7 +243,13 @@
         """Plot the ellipsoides directly for the model
 
         Returns a list of lines, so that their style can be modified. By default,
-        the style is red color, and nolegend for all of them"""
+        the style is red color, and nolegend for all of them.
+
+        Does not work for 1d (use plot1d instead)"""
+        if not self.is_valid:
+            raise GmParamError("""Parameters of the model have not been
+            set yet, please set them using self.set_param()""")
+
         k       = self.k
         Xe, Ye  = self.conf_ellipses(*args, **kargs)
         try:
@@ -199,6 +260,50 @@
         except ImportError:
             raise GmParamError("matplotlib not found, cannot plot...")
 
+    def plot1d(self, level = 0.5):
+        """Plot the mixture in 1d, filling each component's confidence interval"""
+        # This is not optimized at all and may be slow. Should not be
+        # difficult to make much faster, but it is late, and I am lazy
+        if not self.d == 1:
+            raise GmParamError("the model is not a one dimensional model")
+        from scipy.stats import norm
+        nrm     = norm(0, 1)
+        pval    = N.sqrt(self.va[:,0]) * nrm.ppf((1+level)/2)
+
+        # Compute reasonable min/max for the normal pdf
+        mc  = 5
+        std = N.sqrt(self.va[:,0])
+        m   = N.amin(self.mu[:, 0] - mc * std)
+        M   = N.amax(self.mu[:, 0] + mc * std)
+
+        np  = 500
+        x   = N.linspace(m, M, np)
+        Yf  = N.zeros(np)
+        Yt  = N.zeros(np)
+        try:
+            import pylab as P
+            for c in range(self.k):
+                y   = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
+                        N.exp(-(x-self.mu[c][0])**2/(2*std[c]**2))
+                Yt  += y
+                P.plot(x, y, 'r:')
+                #P.axvspan(-pval[c] + self.mu[c][0], pval[c] + self.mu[c][0],
+                #        facecolor = 'b', alpha = 0.2)
+                id1 = -pval[c] + self.mu[c]
+                id2 = pval[c] + self.mu[c]
+                xc  = x[N.where(x>id1)[0]]
+                xc  = xc[N.where(xc<id2)[0]]
+                Yf  = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
+                        N.exp(-(xc-self.mu[c][0])**2/(2*std[c]**2))
+                xc  = N.concatenate(([xc[0]], xc, [xc[-1]]))
+                Yf  = N.concatenate(([0], Yf, [0]))
+                P.fill(xc, Yf, facecolor = 'b', alpha = 0.2)
+                #P.fill([xc[0], xc[0], xc[-1], xc[-1]],
+                #        [0, Yf[0], Yf[-1], 0], facecolor = 'b', alpha = 0.2)
+            P.plot(x, Yt, 'r')
+        except ImportError:
+            raise GmParamError("matplotlib not found, cannot plot...")
+
 # Function to generate a random index: this is kept outside any class,
 # as the function can be useful for other
 def gen_rand_index(p, n):
@@ -283,13 +388,14 @@
     #   - mode : mode of covariance matrices
     d       = 5
     k       = 4
+
+    # Now, build a model and sample it
     mode    = 'full'
     nframes = 1e3
 
     # Build a model with random parameters
     w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
-    gm          = GM(d, k, mode)
-    gm.set_param(w, mu, va)
+    gm          = GM.fromvalues(w, mu, va)
 
     # Sample nframes frames from the model
     X   = gm.sample(nframes)
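The fromvalues class method added above removes the init-then-set dance its docstring describes, and combines naturally with plot1d for visualizing a known 1-d model. A minimal usage sketch (the parameter values are made up for illustration; plot1d imports pylab, so matplotlib must be installed):

    import numpy as N
    from gauss_mix import GM

    # a two-component 1-d mixture with arbitrary parameters
    w  = N.array([0.4, 0.6])
    mu = N.array([[-2.], [3.]])
    va = N.array([[1.], [0.5]])

    gm = GM.fromvalues(w, mu, va)   # replaces GM(d, k) + gm.set_param(w, mu, va)
    gm.plot1d(level = 0.5)          # each pdf dotted, 50% confidence area filled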
diff --git a/scikits/learn/pyem/pyem/gmm_em.py b/scikits/learn/pyem/pyem/gmm_em.py
index 4bcd5bffa92e9b1342e841ea20cebfc870ffd4ee..3d60edb064af147c785e55a6f1e0dadadb4b3650 100644
--- a/scikits/learn/pyem/pyem/gmm_em.py
+++ b/scikits/learn/pyem/pyem/gmm_em.py
@@ -1,8 +1,10 @@
 # /usr/bin/python
-# Last Change: Mon Aug 28 05:00 PM 2006 J
+# Last Change: Tue Aug 29 03:00 PM 2006 J
 
 # TODO:
-#   - which methods to avoid va shrinking to 0 ?
+#   - which method to use to avoid va shrinking to 0? There are several
+#   options, not sure which ones are appropriate
+#   - improve EM trainer
 #   - online EM
 
 import numpy as N
@@ -163,22 +165,27 @@
             w   = invn * mGamma
 
         elif self.gm.mode == 'full':
+            # In full mode, this is the bottleneck: the triple loop
+            # kills performance. This is pretty straightforward
+            # algebra, so computing it in C should not be too difficult
             mu  = N.zeros((k, d))
             va  = N.zeros((k*d, d))
+            gamma   = gamma.transpose()
             for c in range(k):
-                x   = N.sum(N.outerproduct(gamma[:, c],
-                            N.ones((1, d))) * data, axis = 0)
+                #x  = N.sum(N.outer(gamma[:, c],
+                #            N.ones((1, d))) * data, axis = 0)
+                x   = N.dot(gamma[c:c+1,:], data)[0,:]
                 xx  = N.zeros((d, d))
 
                 # This should be much faster than recursing on n...
                 for i in range(d):
                     for j in range(d):
-                        xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[:,c], axis = 0)
+                        xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[c,:], axis = 0)
 
                 mu[c,:] = x / mGamma[c]
                 va[c*d:c*d+d,:] = xx / mGamma[c] - \
-                        N.outerproduct(mu[c,:], mu[c,:])
+                        N.outer(mu[c,:], mu[c,:])
 
             w   = invn * mGamma
         else:
             raise GmmParamError("varmode not recognized")
@@ -223,11 +230,6 @@
 
         return like
 
-class OnlineEM:
-    "An online EM trainer. "
-    def __init__(self):
-        raise GmmError("not implemented yet")
-
 # Misc functions
 def multiple_gauss_den(data, mu, va):
     """Helper function to generate several Gaussian
@@ -265,7 +267,7 @@
     #   row of d elements
     k       = 3
     d       = 2
-    mode    = 'diag'
+    mode    = 'full'
     nframes = 1e3
 
     #+++++++++++++++++++++++++++++++++++++++++++
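On the bottleneck comment in the full-mode update above: before dropping to C, the inner i/j loop can be collapsed into a single matrix product in numpy, since sum_t gamma[c,t] * data[t,i] * data[t,j] is just a weighted Gram matrix. A sketch of the idea under made-up shapes (not a drop-in patch; `gamma` is taken already transposed to (k, n) as in the patched code):

    import numpy as N

    n, d, k = 1000, 2, 3
    data  = N.random.randn(n, d)
    gamma = N.random.rand(k, n)     # responsibilities, one row per component

    c  = 0
    # each column t of data.T is scaled by gamma[c, t]; one (d, n) x (n, d)
    # product then yields xx[i, j] = sum_t gamma[c, t] * data[t, i] * data[t, j]
    xx = N.dot(data.T * gamma[c, :], data)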
" - def __init__(self): - raise GmmError("not implemented yet") - # Misc functions def multiple_gauss_den(data, mu, va): """Helper function to generate several Gaussian @@ -265,7 +267,7 @@ if __name__ == "__main__": # row of d elements k = 3 d = 2 - mode = 'diag' + mode = 'full' nframes = 1e3 #+++++++++++++++++++++++++++++++++++++++++++ diff --git a/scikits/learn/pyem/pyem/kmean.py b/scikits/learn/pyem/pyem/kmean.py index d29d6cc0153a0a2fef4421fc2d42cf348a9bd46a..0cdcdcc8982c3c9938692766bc828a06a76dbe14 100644 --- a/scikits/learn/pyem/pyem/kmean.py +++ b/scikits/learn/pyem/pyem/kmean.py @@ -1,5 +1,5 @@ # /usr/bin/python -# Last Change: Mon Aug 28 09:00 PM 2006 J +# Last Change: Tue Aug 29 12:00 PM 2006 J import numpy as N @@ -29,7 +29,7 @@ def _py_vq(data, code): # Kmean will be REALLY slow""" # _vq = _py_vq try: - from sccipy.cluster.vq import vq + from scipy.cluster.vq import vq print "using scipy.cluster.vq" def _vq(*args, **kw): return vq(*args, **kw)[0] except ImportError: @@ -42,7 +42,7 @@ except ImportError: _vq = _py_vq def kmean(data, init, iter = 10): - """Simple kmean implementation for EM + """Simple kmean implementation for EM. Runs iter iterations. returns a tuple (code, label), where code are the final centroids, and label are the class label indec for each diff --git a/scikits/learn/pyem/pyem/profile_densities.py b/scikits/learn/pyem/pyem/profile_densities.py index 296a5fa609a58aa2c9a1d68193aa82ebd92bcc08..8f20efd3ce325408edcd46f9a873e0979faa685a 100644 --- a/scikits/learn/pyem/pyem/profile_densities.py +++ b/scikits/learn/pyem/pyem/profile_densities.py @@ -46,6 +46,12 @@ def benchpy(): def benchc(): bench(DC.gauss_den) +def benchpyfull(): + bench(D.gauss_den, 'full') + +def benchcfull(): + bench(DC.gauss_den, 'full') + if __name__ == "__main__": import profile import pstats diff --git a/scikits/learn/pyem/pyem/profile_gmm.py b/scikits/learn/pyem/pyem/profile_gmm.py index 0e9c7c9da1885e152bac1550fd90f802ec5da87e..04ccac529331a3a5a5903eb862ed4ee2933af783 100644 --- a/scikits/learn/pyem/pyem/profile_gmm.py +++ b/scikits/learn/pyem/pyem/profile_gmm.py @@ -10,7 +10,7 @@ def bench1(mode = 'diag'): k = 20 nframes = 1e4 niter = 10 - mode = 'diag' + mode = 'full' #+++++++++++++++++++++++++++++++++++++++++++ # Create an artificial GMM model, samples it @@ -42,7 +42,7 @@ def bench1(mode = 'diag'): print "computing..." for i in range(niter): - print "iter %d" % i + print "iteration %d" % i g, tgd = gmm.sufficient_statistics(data) like[i] = N.sum(N.log(N.sum(tgd, 1))) gmm.update_em(data, g)