diff --git a/scikits/learn/pyem/Changelog b/scikits/learn/pyem/Changelog index 2768e67bcde6450dee3a9d85e90d11ba732852aa..92e5a80e3ade8b136441997c42aef7d49016e48e 100644 --- a/scikits/learn/pyem/Changelog +++ b/scikits/learn/pyem/Changelog @@ -1,3 +1,10 @@ +pyem (0.5.3) Thu, 12 Oct 2006 21:08:21 +0900 + + * Change the layout and setup.py for inclusion to scipy. + * Initial script for online em. + +-- David Cournapeau <david@ar.media.kyoto-u.ac.jp> + pyem (0.5.3) Tue, 03 Oct 2006 18:28:13 +0900 * Update tests so that they work within the numpy test framework diff --git a/scikits/learn/pyem/LICENSE.txt b/scikits/learn/pyem/LICENSE.txt index 27727c5eae9a2dd901ec2f0896fdb4cf3795f5e4..87b237be433f0e478d02daa0a10a89c407d700a6 100644 --- a/scikits/learn/pyem/LICENSE.txt +++ b/scikits/learn/pyem/LICENSE.txt @@ -1,29 +1,3 @@ -Copyright (c) 2001, 2002 Enthought, Inc. - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - a. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - b. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - c. Neither the name of the Enthought nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -DAMAGE. - +Author: David Cournapeau <david@ar.media.kyoto-u.ac.jp> +Copyright: 2006 +License: BSD-style license. See LICENSE.txt in the scipy source directory. diff --git a/scikits/learn/pyem/MANIFEST.in b/scikits/learn/pyem/MANIFEST.in deleted file mode 100644 index cc7fcfbf6d28dcb4d26b65d4accc3f4874997a9e..0000000000000000000000000000000000000000 --- a/scikits/learn/pyem/MANIFEST.in +++ /dev/null @@ -1,6 +0,0 @@ -include pyem/src/c_numpy.pxd -include pyem/src/c_python.pxd -include Changelog -include TODO -include LICENSE.txt -#exclude pyem/_c_densities.py diff --git a/scikits/learn/pyem/example.py b/scikits/learn/pyem/example.py new file mode 100644 index 0000000000000000000000000000000000000000..8b948b98ac79dba5751a00af400ccda463c57a99 --- /dev/null +++ b/scikits/learn/pyem/example.py @@ -0,0 +1,107 @@ +#! /usr/bin/env python + +# Example of use of pyem toolbox. Feel free to change parameters +# such as dimension, number of components, mode of covariance. +# +# You can also try less trivial things such as adding outliers, sampling +# a mixture with full covariance and estimating it with a mixture with diagonal +# gaussians (replace the mode of the learned model lgm) +# +# Later, I hope to add functions for number of component estimation using eg BIC + +import numpy as N +from numpy.random import seed + +from scipy.sandbox.pyem import GM, GMM, EM +import copy + +seed(1) +#+++++++++++++++++++++++++++++ +# Meta parameters of the model +# - k: Number of components +# - d: dimension of each Gaussian +# - mode: Mode of covariance matrix: full or diag (string) +# - nframes: number of frames (frame = one data point = one +# row of d elements) +k = 2 +d = 2 +mode = 'diag' +nframes = 1e3 + +#+++++++++++++++++++++++++++++++++++++++++++ +# Create an artificial GMM model, samples it +#+++++++++++++++++++++++++++++++++++++++++++ +w, mu, va = GM.gen_param(d, k, mode, spread = 1.5) +gm = GM.fromvalues(w, mu, va) + +# Sample nframes frames from the model +data = gm.sample(nframes) + +#++++++++++++++++++++++++ +# Learn the model with EM +#++++++++++++++++++++++++ + +# Init the model +lgm = GM(d, k, mode) +gmm = GMM(lgm, 'kmean') +gmm.init(data) + +# Keep a copy for drawing later +gm0 = copy.copy(lgm) + +# The actual EM, with likelihood computation. The treshold +# is compared to the (linearly appromixated) derivative of the likelihood +em = EM() +like = em.train(data, gmm, maxiter = 30, thresh = 1e-8) + +#+++++++++++++++ +# Draw the model +#+++++++++++++++ +import pylab as P +P.subplot(2, 1, 1) + +level = 0.8 +if not d == 1: + P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_') + + # h keeps the handles of the plot, so that you can modify + # its parameters like label or color + h = gm.plot(level = level) + [i.set_color('g') for i in h] + h[0].set_label('true confidence ellipsoides') + + # Initial confidence ellipses as found by kmean + h = gm0.plot(level = level) + [i.set_color('k') for i in h] + h[0].set_label('kmean confidence ellipsoides') + + # Values found by EM + h = lgm.plot(level = level) + [i.set_color('r') for i in h] + h[0].set_label('kmean confidence ellipsoides') + + P.legend(loc = 0) +else: + # The 1d plotting function is quite elaborate: the confidence + # interval are represented by filled areas, the pdf of the mixture and + # the pdf of each component is drawn (optional) + h = gm.plot1d(level = level) + [i.set_color('g') for i in h['pdf']] + h['pdf'][0].set_label('true pdf') + + h0 = gm0.plot1d(level = level) + [i.set_color('k') for i in h0['pdf']] + h0['pdf'][0].set_label('initial pdf') + + hl = lgm.plot1d(fill = 1, level = level) + [i.set_color('r') for i in hl['pdf']] + hl['pdf'][0].set_label('pdf found by EM') + + P.legend(loc = 0) + +P.subplot(2, 1, 2) +P.plot(like) +P.title('log likelihood') + +P.show() +P.save('2d diag.png') diff --git a/scikits/learn/pyem/pyem/online_em.py b/scikits/learn/pyem/pyem/online_em.py index 21d8a6f1dc799880d9f0effd95e8ebf450c04b01..83d4cb71ae19afea01e07eb0f813dbefdd72e4d8 100644 --- a/scikits/learn/pyem/pyem/online_em.py +++ b/scikits/learn/pyem/pyem/online_em.py @@ -1,5 +1,5 @@ # /usr/bin/python -# Last Change: Fri Oct 06 08:00 PM 2006 J +# Last Change: Thu Oct 12 09:00 PM 2006 J #--------------------------------------------- # This is not meant to be used yet !!!! I am @@ -211,11 +211,16 @@ if __name__ == "__main__": nu[i] = 1./(1 + lamb[i] / nu[i-1]) gamma = N.zeros((nframes, k)) + agamma = [] + apw = [] + apmu = [] + apva = [] + print "========== Online Manual ===========" for e in range(emiter): - print "online manual:" - print pw - print pmu - print pva + print "online manual: estep %d, printing p* state " % e + apw.append(pw.copy()) + apmu.append(pmu.copy()) + apva.append(pva.copy()) for t in range(nframes): gamma[t] = multiple_gauss_den(data[t:t+1, :], pmu, pva)[0] gamma[t] *= pw @@ -234,32 +239,52 @@ if __name__ == "__main__": #pw = cw.copy() #pmu = cmu.copy() #pva = cva.copy() - print gamma[-1] + print "gamma[end]: " + str(gamma[-1]) pw = cw.copy() pmu = cmu.copy() pva = cva.copy() - - print "online manual:" - print pw - print pmu - print pva + agamma.append(gamma.copy()) + + gamma2 = N.zeros((nframes, k)) + agamma2 = [] + apw2 = [] + apmu2 = [] + apva2 = [] + print "========== Online Automatic ===========" for e in range(emiter): - print "online automatic:" - print ogmm2.pw - print ogmm2.pmu - print ogmm2.pva + print "online automatic: estep %d, printing p* state " % e + apw2.append(ogmm2.pw.copy()) + apmu2.append(ogmm2.pmu.copy()) + apva2.append(ogmm2.pva.copy()) for t in range(nframes): - gamma = ogmm2.sufficient_statistics(data[t:t+1, :], nu[t]) - ogmm2.update_em(data[t, :], gamma, nu[t]) - print gamma + gamma2[t] = ogmm2.sufficient_statistics(data[t:t+1, :], nu[t]) + #gamma2[t] = multiple_gauss_den(data[t:t+1, :], ogmm2.pmu, ogmm2.pva)[0] + #gamma2[t] *= ogmm2.pw + #gamma2[t] /= N.sum(gamma2[t]) + #try: + # assert_array_equal(agamma, gamma2[t]) + #except AssertionError: + # print "error for estep %d, step %d" % (e, t) + # print ogmm2.pw + # print ogmm2.pmu + # print ogmm2.pva + # raise + ogmm2.update_em(data[t, :], gamma2[t], nu[t]) + #ogmm2.cw = (1 - nu[t]) * ogmm2.cw + nu[t] * agamma + ## loop through each component + #for i in range(k): + # ogmm2.cx[i] = (1 - nu[t]) * ogmm2.cx[i] + nu[t] * data[t, :] * agamma[i] + # ogmm2.cxx[i] = (1 - nu[t]) * ogmm2.cxx[i] + nu[t] * data[t, :] ** 2 * agamma[i] + + # ogmm2.cmu[i] = ogmm2.cx[i] / ogmm2.cw[i] + # ogmm2.cva[i] = ogmm2.cxx[i] / ogmm2.cw[i] - ogmm2.cmu[i] ** 2 + + print "gamma[end]: " + str(gamma2[-1]) + agamma2.append(gamma2.copy()) ogmm2.pw = ogmm2.cw.copy() ogmm2.pmu = ogmm2.cmu.copy() ogmm2.pva = ogmm2.cva.copy() - print "online automatic:" - print ogmm2.pw - print ogmm2.pmu - print ogmm2.pva # #ogm.set_param(pw, pmu, pva) # print "weights off vs on: \n" + str(lgm.w) + "\n " + str(cw) # print "mean off vs on: \n" + str(lgm.mu) + "\n " + str(cmu) @@ -267,12 +292,16 @@ if __name__ == "__main__": # print "weights off vs on2: \n" + str(lgm.w) + "\n " + str(ogmm2.cw) # print "mean off vs on2: \n" + str(lgm.mu) + "\n " + str(ogmm2.cmu) # print "variances off vs on2: \n" + str(lgm.va) + "\n " + str(ogmm2.cva) - #assert_array_almost_equal(cw, lgm.w) - #assert_array_almost_equal(cmu, lgm.mu) - #assert_array_almost_equal(cva, lgm.va) - assert_array_almost_equal(ogmm2.cw, lgm.w) - assert_array_almost_equal(ogmm2.cmu, lgm.mu) - assert_array_almost_equal(ogmm2.cva, lgm.va) + # assert_array_almost_equal(cw, lgm.w) + # assert_array_almost_equal(cmu, lgm.mu) + # assert_array_almost_equal(cva, lgm.va) + assert_array_equal(ogmm2.pw, pw) + assert_array_equal(ogmm2.pmu, pmu) + assert_array_equal(ogmm2.pva, pva) + assert_array_equal(agamma[0], agamma2[0]) + #assert_array_almost_equal(ogmm2.cw, lgm.w) + #assert_array_almost_equal(ogmm2.cmu, lgm.mu) + #assert_array_almost_equal(ogmm2.cva, lgm.va) # #+++++++++++++++ # # Draw the model # #+++++++++++++++ diff --git a/scikits/learn/pyem/pyem/setup.py b/scikits/learn/pyem/pyem/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..6d58b904bdaedde838f29554173892a2b71436d6 --- /dev/null +++ b/scikits/learn/pyem/pyem/setup.py @@ -0,0 +1,142 @@ +#! /usr/bin/env python +# Last Change: Fri Oct 06 09:00 PM 2006 J +# TODO: +# - check how to handle cmd line build options with distutils and use +# it in the building process + +""" pyem is a small python package to estimate Gaussian Mixtures Models +from data, using Expectation Maximization""" + +from os.path import join +from info import version as pyem_version + +DISTNAME = 'pyem' +VERSION = pyem_version +DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf', +AUTHOR ='David Cournapeau', +AUTHOR_EMAIL='david@ar.media.kyoto-u.ac.jp', +URL ='http://ar.media.kyoto-u.ac.jp/members/david', + +def configuration(parent_package='',top_path=None, package_name='pyem'): + from numpy.distutils.misc_util import Configuration + config = Configuration(package_name,parent_package,top_path, + version = VERSION) + config.add_data_dir('tests') + config.add_subpackage('profile_data') + config.add_extension('c_gden', + #define_macros=[('LIBSVM_EXPORTS', None), + # ('LIBSVM_DLL', None)], + sources=[join('src', 'c_gden.c')]) + + return config + +if __name__ == "__main__": + from numpy.distutils.core import setup + #setup(**configuration(top_path='').todict()) + setup(**configuration(top_path='', + package_name='scipy.sandbox.pyem').todict()) +# from distutils.core import setup, Extension +# from pyem import version as pyem_version +# +# # distutils does not update MANIFEST correctly, removes it +# import os +# if os.path.exists('MANIFEST'): os.remove('MANIFEST') +# from os.path import join +# +# import re +# +# from numpy.distutils.misc_util import get_numpy_include_dirs +# NUMPYINC = get_numpy_include_dirs()[0] +# +# # General variables: +# # - DISTNAME: name of the distributed package +# # - VERSION: the version reference is in pyem/__init__.py file +# # - other upper cased variables are the same than the corresponding +# # keywords in setup call +# DISTNAME = 'pyem' +# VERSION = pyem_version +# DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf', +# AUTHOR ='David Cournapeau', +# AUTHOR_EMAIL='david@ar.media.kyoto-u.ac.jp', +# URL ='http://ar.media.kyoto-u.ac.jp/members/david', +# +# # Source files for extensions +# +# # Functions used to substitute values in File. +# # Mainly use to replace config.h capabilities +# def do_subst_in_file(sourcefile, targetfile, dict): +# """Replace all instances of the keys of dict with their values. +# For example, if dict is {'%VERSION%': '1.2345', '%BASE%': 'MyProg'}, +# then all instances of %VERSION% in the file will be replaced with 1.2345 etc. +# """ +# try: +# f = open(sourcefile, 'rb') +# contents = f.read() +# f.close() +# except: +# raise IOError, "Can't read source file %s"%sourcefile +# +# for (k,v) in dict.items(): +# contents = re.sub(k, v, contents) +# try: +# f = open(targetfile, 'wb') +# f.write(contents) +# f.close() +# except: +# raise IOError, "Can't read source file %s"%sourcefile +# return 0 # success +# +# class SetupOption: +# def __init__(self): +# self.kmean = 'py' +# self.ext_modules= [Extension(join('pyem', 'c_gden'), +# sources=[join('pyem', 'src', 'c_gden.c')]) ] +# self.cmdclass = {} +# self.subsdic = {'%KMEANIMPORT%': []} +# +# def _config_kmean(self): +# # Check in this order: +# # - kmean in scipy.cluster, +# # - custom vq with pyrex +# # - custom pure python vq +# #try: +# # from scipy.cluster.vq import kmeans +# # self.kmean = 'scipy' +# # #self.subsdic['%KMEANIMPORT%'] = scipy_kmean +# #except ImportError: +# # try: +# # from Pyrex.Distutils import build_ext +# # self.kmean = 'pyrex' +# # self.ext_modules.append(Extension('pyem/c_gmm', +# # ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC])) +# # self.cmdclass['build_ext'] = build_ext +# # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean +# # except ImportError: +# # self.kmean = 'py' +# # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean +# try: +# from Pyrex.Distutils import build_ext +# self.kmean = 'pyrex' +# self.ext_modules.append(Extension('pyem/c_gmm', +# ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC])) +# self.cmdclass['build_ext'] = build_ext +# #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean +# except ImportError: +# self.kmean = 'py' +# #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean +# def setup(self): +# self._config_kmean() +# #import time +# #do_subst_in_file('pyem/kmean.py.in', 'pyem/kmean.py', self.subsdic) +# setup(name = DISTNAME, +# version = VERSION, +# description = DESCRIPTION, +# author = AUTHOR, +# author_email= AUTHOR_EMAIL, +# url = URL, +# packages = ['pyem', 'pyem.tests', 'pyem.profile_data'], +# ext_modules = self.ext_modules, +# cmdclass = self.cmdclass) +# +# stpobj = SetupOption() +# stpobj.setup() diff --git a/scikits/learn/pyem/setup.py b/scikits/learn/pyem/setup.py deleted file mode 100644 index fc8797a3b6931df9b9dc738d92f4ccdab193c75b..0000000000000000000000000000000000000000 --- a/scikits/learn/pyem/setup.py +++ /dev/null @@ -1,113 +0,0 @@ -#! /usr/bin/env python -# Last Change: Fri Sep 29 07:00 PM 2006 J -# TODO: -# - check how to handle cmd line build options with distutils and use -# it in the building process - -""" pyem is a small python package to estimate Gaussian Mixtures Models -from data, using Expectation Maximization""" -from distutils.core import setup, Extension -from pyem import version as pyem_version - -# distutils does not update MANIFEST correctly, removes it -import os -if os.path.exists('MANIFEST'): os.remove('MANIFEST') -from os.path import join - -import re - -from numpy.distutils.misc_util import get_numpy_include_dirs -NUMPYINC = get_numpy_include_dirs()[0] - -# General variables: -# - DISTNAME: name of the distributed package -# - VERSION: the version reference is in pyem/__init__.py file -# - other upper cased variables are the same than the corresponding -# keywords in setup call -DISTNAME = 'pyem' -VERSION = pyem_version -DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf', -AUTHOR ='David Cournapeau', -AUTHOR_EMAIL='david@ar.media.kyoto-u.ac.jp', -URL ='http://ar.media.kyoto-u.ac.jp/members/david', - -# Source files for extensions - -# Functions used to substitute values in File. -# Mainly use to replace config.h capabilities -def do_subst_in_file(sourcefile, targetfile, dict): - """Replace all instances of the keys of dict with their values. - For example, if dict is {'%VERSION%': '1.2345', '%BASE%': 'MyProg'}, - then all instances of %VERSION% in the file will be replaced with 1.2345 etc. - """ - try: - f = open(sourcefile, 'rb') - contents = f.read() - f.close() - except: - raise IOError, "Can't read source file %s"%sourcefile - - for (k,v) in dict.items(): - contents = re.sub(k, v, contents) - try: - f = open(targetfile, 'wb') - f.write(contents) - f.close() - except: - raise IOError, "Can't read source file %s"%sourcefile - return 0 # success - -class SetupOption: - def __init__(self): - self.kmean = 'py' - self.ext_modules= [Extension(join('pyem', 'c_gden'), - sources=[join('pyem', 'src', 'c_gden.c')]) ] - self.cmdclass = {} - self.subsdic = {'%KMEANIMPORT%': []} - - def _config_kmean(self): - # Check in this order: - # - kmean in scipy.cluster, - # - custom vq with pyrex - # - custom pure python vq - #try: - # from scipy.cluster.vq import kmeans - # self.kmean = 'scipy' - # #self.subsdic['%KMEANIMPORT%'] = scipy_kmean - #except ImportError: - # try: - # from Pyrex.Distutils import build_ext - # self.kmean = 'pyrex' - # self.ext_modules.append(Extension('pyem/c_gmm', - # ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC])) - # self.cmdclass['build_ext'] = build_ext - # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean - # except ImportError: - # self.kmean = 'py' - # #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean - try: - from Pyrex.Distutils import build_ext - self.kmean = 'pyrex' - self.ext_modules.append(Extension('pyem/c_gmm', - ['pyem/src/c_gmm.pyx'], include_dirs=[NUMPYINC])) - self.cmdclass['build_ext'] = build_ext - #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean - except ImportError: - self.kmean = 'py' - #self.subsdic['%KMEANIMPORT%'] = pyrex_kmean - def setup(self): - self._config_kmean() - #import time - #do_subst_in_file('pyem/kmean.py.in', 'pyem/kmean.py', self.subsdic) - setup(name = DISTNAME, - version = VERSION, - description = DESCRIPTION, - author = AUTHOR, - author_email= AUTHOR_EMAIL, - url = URL, - packages = ['pyem', 'pyem.tests', 'pyem.profile_data'], - ext_modules = self.ext_modules, - cmdclass = self.cmdclass) - -stpobj = SetupOption() -stpobj.setup()