diff --git a/scikits/learn/em/Changelog b/scikits/learn/em/Changelog deleted file mode 100644 index 0f23280fabdcfab39287b3ce95a79c7b4f9cb4d5..0000000000000000000000000000000000000000 --- a/scikits/learn/em/Changelog +++ /dev/null @@ -1,138 +0,0 @@ -pyem (0.5.7dev) Sun, 22 Jul 2007 11:05:58 +0900 - - * pyem is now part of the scikits.learn package - * Renamed to em - -pyem (0.5.7dev) Mon, 28 May 2007 11:31:08 +0900 - - * Put doc into its own directory - -pyem (0.5.6) Thu, 16 Nov 2006 21:02:02 +0900 - - * correct examples - * correct exceptions msg strings in gauss_mix, which - were buggy - * add examples from website to the package, so that above errors - do not appear again - -pyem (0.5.6) Thu, 16 Nov 2006 14:18:19 +0900 - - * bump to 0.5.6 - * Add __str__ and __repr__ for GM and GMM classes - * Add regularization method (but not used yet). - * Change 'f<8' to N.float64 for ctype enabled densities - * Move 'Magic numbers' into a separated python file, misc.py - -pyem (0.5.5) Tue, 24 Oct 2006 18:30:54 +0900 - - * Fix a bug inmultiple_gaussian_den which prevents - full covariance mode to work in GMM. - * Add consistency tests in tests/test_gmm_em.py to avoid the - above bug from being detected from now on. - -pyem (0.5.5) Mon, 23 Oct 2006 18:48:15 +0900 - - * Bump to 0.5.5 - * Added bic method to GMM. - * A few fixes for docstrings - * Added bibliography to the doc - -pyem (0.5.4) Fri, 20 Oct 2006 12:52:01 +0900 - - * Bump to 0.5.4 - * Online EM is basically implemented, with tests. The API - should be fixed to be choherent (lacks the Trainer Class, too). - The class is not imported by default, still (available as _OnGMM) - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.5.3) Thu, 12 Oct 2006 21:08:21 +0900 - - * Change the layout and setup.py for inclusion to scipy. - * Initial script for online em. - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.5.3) Tue, 03 Oct 2006 18:28:13 +0900 - - * Update tests so that they work within the numpy test framework - * Update version to 0.5.3 - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.5.2) Tue, 29 Aug 2006 14:53:49 +0900 - - * Add a class method to GM to create a model directly from - w, mean and var values (uses decorator: python 2.4) - * Add a plot1d method to GM to plot a GM in one dimension (where - the confidence ell kind of plot does not make sense). This draws - each Gaussian pdf, fill the area on the confidence interval - (matplotlib is really cool) - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.5.2) Mon, 28 Aug 2006 13:20:13 +0900 - - * Add a plot method to Gm class, so that it is easier - to plot a GM model interactively (needs matplotlib) - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.5.2) Thu, 24 Aug 2006 19:42:28 +0900 - - * put version to 0.5.2 - * Correct a bug with init method in GMM (change class method - to object method). - * modify the setup for a more flexible system - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.5.1) Thu, 17 Aug 2006 11:54:41 +0900 - - * put version to 0.5.1 - * Update to last numpy (change axis args between 1.0b1 and 1.0b2, - change type args of ones and zeros) - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.5) Fri, 04 Aug 2006 23:10:37 +0900 - - * put version to 0.5.0 - * implement confidence interval using chi2 - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.4) Fri, 04 Aug 2006 19:37:47 +0900 - - * put version to 0.4.2 - * adapt to new version of numpy (1.0b2SVN) - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.4) Fri, 14 Jul 2006 17:49:57 +0900 - - * put version to 0.4.1 - * change some import due to recent changes in - numpy - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.4) Fri, 14 Jul 2006 16:24:05 +0900 - - * put version to 0.4 - * Heavy refactoring of EM and GMM into classes (see below) - * add a module gauss_mix, which implements Gaussian Mixtures. - * GMM is now a class, which is initialized using a Gaussian Mixture. - a GMM can be trained. - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> - -pyem (0.3) Thu, 13 Jul 2006 19:48:54 +0900 - - * put version to 0.3 - * Refactoring kmean code into new module. - * Refactoring tests code into test module. - * Replace matrixmultiply and outerproduct calls by dot to use fast BLAS if - available. Not everything is done yet - --- David Cournapeau <david@ar.media.kyoto-u.ac.jp> diff --git a/scikits/learn/em/LICENSE.txt b/scikits/learn/em/LICENSE.txt deleted file mode 100644 index 87b237be433f0e478d02daa0a10a89c407d700a6..0000000000000000000000000000000000000000 --- a/scikits/learn/em/LICENSE.txt +++ /dev/null @@ -1,3 +0,0 @@ -Author: David Cournapeau <david@ar.media.kyoto-u.ac.jp> -Copyright: 2006 -License: BSD-style license. See LICENSE.txt in the scipy source directory. diff --git a/scikits/learn/em/TODO b/scikits/learn/em/TODO deleted file mode 100644 index 7c162d1694af5d47cfd59bb7b467651e61b56357..0000000000000000000000000000000000000000 --- a/scikits/learn/em/TODO +++ /dev/null @@ -1,15 +0,0 @@ -# Last Change: Sun Jul 01 06:00 PM 2007 J - -Things which must be implemented for a 1.0 version (in importante order) - - A classifier - - handle rank 1 for 1d data - - demo for pdf estimation, discriminant analysis and clustering - - scaling of data: maybe something to handle scaling internally ? - -Things which would be nice (after 1.0 version): - - Bayes prior (hard, suppose MCMC) - - variational Bayes (hard ? Not sure yet) - - Integrate libem (libem should be modified so - that it would be easy to package and distribute) - - Other initialization schemes - - Other mixture models diff --git a/scikits/learn/em/__init__.py b/scikits/learn/em/__init__.py deleted file mode 100644 index fd477301516ffdfbc77c8931fd3a2646ffebc0be..0000000000000000000000000000000000000000 --- a/scikits/learn/em/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -#! /usr/bin/env python -# Last Change: Sun Sep 07 04:00 PM 2008 J - -from info import __doc__ - -from gauss_mix import GmParamError, GM -from gmm_em import GmmParamError, GMM, EM -from online_em import OnGMM as _OnGMM - -__all__ = filter(lambda s:not s.startswith('_'), dir()) diff --git a/scikits/learn/em/_c_densities.py b/scikits/learn/em/_c_densities.py deleted file mode 100644 index 3e9cd2921079c4564cf75da8c808fdcd6b085ffe..0000000000000000000000000000000000000000 --- a/scikits/learn/em/_c_densities.py +++ /dev/null @@ -1,224 +0,0 @@ -#! /usr/bin/python -# -# Copyrighted David Cournapeau -# Last Change: Sat Jun 09 10:00 PM 2007 J - -"""This module implements some function of densities module in C for efficiency -reasons. gaussian, such as pdf estimation, confidence interval/ellipsoids, -etc...""" - -__docformat__ = 'restructuredtext' - -# This module uses a C implementation through ctypes, for diagonal cases -# TODO: -# - portable way to find/open the shared library -# - full cov matrice -# - test before inclusion - -import numpy as N -import numpy.linalg as lin -#from numpy.random import randn -#from scipy.stats import chi2 -#import densities as D - -import ctypes -from ctypes import c_uint, c_int -from numpy.ctypeslib import ndpointer, load_library - -ctypes_major = int(ctypes.__version__.split('.')[0]) -if ctypes_major < 1: - raise ImportError(msg = "version of ctypes is %s, expected at least %s"\ - % (ctypes.__version__, '1.0.1')) - -# Requirements for diag gden -_gden = load_library('c_gden.so', __file__) -arg1 = ndpointer(dtype=N.float64) -arg2 = c_uint -arg3 = c_uint -arg4 = ndpointer(dtype=N.float64) -arg5 = ndpointer(dtype=N.float64) -arg6 = ndpointer(dtype=N.float64) -_gden.gden_diag.argtypes = [arg1, arg2, arg3, arg4, arg5, arg6] -_gden.gden_diag.restype = c_int - -# Error classes -class DenError(Exception): - """Base class for exceptions in this module. - - Attributes: - expression -- input expression in which the error occurred - message -- explanation of the error""" - def __init__(self, message): - self.message = message - - def __str__(self): - return self.message - -# The following function do all the fancy stuff to check that parameters -# are Ok, and call the right implementation if args are OK. -def gauss_den(x, mu, va, log = False): - """ Compute multivariate Gaussian density at points x for - mean mu and variance va. - - Vector are row vectors, except va which can be a matrix - (row vector variance for diagonal variance) - - If log is True, than the log density is returned - (useful for underflow ?)""" - mu = N.atleast_2d(mu) - va = N.atleast_2d(va) - x = N.atleast_2d(x) - - #=======================# - # Checking parameters # - #=======================# - if len(N.shape(mu)) != 2: - raise DenError("mu is not rank 2") - - if len(N.shape(va)) != 2: - raise DenError("va is not rank 2") - - if len(N.shape(x)) != 2: - raise DenError("x is not rank 2") - - (n, d) = N.shape(x) - (dm0, dm1) = N.shape(mu) - (dv0, dv1) = N.shape(va) - - # Check x and mu same dimension - if dm0 != 1: - msg = "mean must be a row vector!" - raise DenError(msg) - if dm1 != d: - msg = "x and mu not same dim" - raise DenError(msg) - # Check va and mu same size - if dv1 != d: - msg = "mu and va not same dim" - raise DenError(msg) - if dv0 != 1 and dv0 != d: - msg = "va not square" - raise DenError(msg) - - #===============# - # Computation # - #===============# - if d == 1: - # scalar case - return _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log) - elif dv0 == 1: - # Diagonal matrix case - return _diag_gauss_den(x, mu, va, log) - elif dv1 == dv0: - # full case - return _full_gauss_den(x, mu, va, log) - else: - raise DenError("variance mode not recognized, this is a bug") - -# Those 3 functions do almost all the actual computation -def _scalar_gauss_den(x, mu, va, log): - """ This function is the actual implementation - of gaussian pdf in scalar case. It assumes all args - are conformant, so it should not be used directly - - ** Expect centered data (ie with mean removed) ** - - Call gauss_den instead""" - d = mu.size - inva = 1/va - fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva) - y = ((x-mu) ** 2) * -0.5 * inva - if not log: - y = fac * N.exp(y) - else: - y = y + log(fac) - - return y - -def _diag_gauss_den(x, mu, va, log): - """ This function is the actual implementation - of gaussian pdf in scalar case. It assumes all args - are conformant, so it should not be used directly - - ** Expect centered data (ie with mean removed) ** - - Call gauss_den instead""" - # Diagonal matrix case - d = mu.size - n = x.shape[0] - if not log: - y = N.zeros(n) - vat = va.copy() - # _gden.gden_diag(N.require(x, requirements = 'C'), n, d, - # N.require(mu, requirements = 'C'), - # N.require(inva, requirements = 'C'), - # N.require(y, requirements = 'C')) - x = N.require(x, requirements = 'C') - mu = N.require(mu, requirements = 'C') - vat = N.require(vat, requirements = 'C') - y = N.require(y, requirements = 'C') - _gden.gden_diag(x, n, d, mu, vat, y) - return y - # _gden.gden_diag.restype = c_int - # _gden.gden_diag.argtypes = [POINTER(c_double), c_uint, c_uint, - # POINTER(c_double), POINTER(c_double), POINTER(c_double)] - - # y = N.zeros(n) - # inva= 1/va - # _gden.gden_diag(x.ctypes.data_as(POINTER(c_double)), - # n, d, - # mu.ctypes.data_as(POINTER(c_double)), - # inva.ctypes.data_as(POINTER(c_double)), - # y.ctypes.data_as(POINTER(c_double))) - else: - y = _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log) - for i in range(1, d): - y += _scalar_gauss_den(x[:, i], mu[0, i], va[0, i], log) - return y - -def _full_gauss_den(x, mu, va, log): - """ This function is the actual implementation - of gaussian pdf in full matrix case. - - It assumes all args are conformant, so it should - not be used directly Call gauss_den instead - - ** Expect centered data (ie with mean removed) ** - - Does not check if va is definite positive (on inversible - for that matter), so the inverse computation and/or determinant - would throw an exception.""" - d = mu.size - inva = lin.inv(va) - fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va))) - - # we are using a trick with sum to "emulate" - # the matrix multiplication inva * x without any explicit loop - y = N.dot((x-mu), inva) - y = -0.5 * N.sum(y * (x-mu), 1) - - if not log: - y = fac * N.exp(y) - else: - y = y + N.log(fac) - - return y - -if __name__ == "__main__": - pass - ##========================================= - ## Test accuracy between pure and C python - ##========================================= - #mu = N.array([2.0, 3]) - #va = N.array([5.0, 3]) - - ## Generate a multivariate gaussian of mean mu and covariance va - #nframes = 1e4 - #X = randn(nframes, 2) - #Yc = N.dot(N.diag(N.sqrt(va)), X.transpose()) - #Yc = Yc.transpose() + mu - - #Y = D.gauss_den(Yc, mu, va) - #Yt = gauss_den(Yc, mu, va) - - #print "Diff is " + str(N.sqrt(N.sum((Y-Yt) ** 2))/nframes/2) diff --git a/scikits/learn/em/densities.py b/scikits/learn/em/densities.py deleted file mode 100644 index 2b800794678a1c15a493a441835e2959cb6d4d4c..0000000000000000000000000000000000000000 --- a/scikits/learn/em/densities.py +++ /dev/null @@ -1,307 +0,0 @@ -#! /usr/bin/python -# -# Copyrighted David Cournapeau -# Last Change: Thu Jul 12 04:00 PM 2007 J -"""This module implements various basic functions related to multivariate -gaussian, such as pdf estimation, confidence interval/ellipsoids, etc...""" - -__docformat__ = 'restructuredtext' - -import numpy as N -import numpy.linalg as lin -#from numpy.random import randn -from scipy.stats import chi2 -import misc - -# Error classes -class DenError(Exception): - """Base class for exceptions in this module. - - Attributes: - expression -- input expression in which the error occurred - message -- explanation of the error""" - def __init__(self, message): - self.message = message - Exception.__init__(self) - - def __str__(self): - return self.message - -# The following function do all the fancy stuff to check that parameters -# are Ok, and call the right implementation if args are OK. -def gauss_den(x, mu, va, log = False): - """Compute multivariate Gaussian density at points x for - mean mu and variance va. - - :Parameters: - x : ndarray - points where to estimate the pdf. each row of the array is one - point of d dimension - mu : ndarray - mean of the pdf. Should have same dimension d than points in x. - va : ndarray - variance of the pdf. If va has d elements, va is interpreted as the - diagonal elements of the actual covariance matrix. Otherwise, - should be a dxd matrix (and positive definite). - log : boolean - if True, returns the log-pdf instead of the pdf. - - :Returns: - pdf : ndarray - Returns a rank 1 array of the pdf at points x. - - Note - ---- - Vector are row vectors, except va which can be a matrix - (row vector variance for diagonal variance).""" - - lmu = N.atleast_2d(mu) - lva = N.atleast_2d(va) - lx = N.atleast_2d(x) - - #=======================# - # Checking parameters # - #=======================# - if len(N.shape(lmu)) != 2: - raise DenError("mu is not rank 2") - - if len(N.shape(lva)) != 2: - raise DenError("va is not rank 2") - - if len(N.shape(lx)) != 2: - raise DenError("x is not rank 2") - - d = N.shape(lx)[1] - (dm0, dm1) = N.shape(lmu) - (dv0, dv1) = N.shape(lva) - - # Check x and mu same dimension - if dm0 != 1: - msg = "mean must be a row vector!" - raise DenError(msg) - if dm1 != d: - msg = "x and mu not same dim" - raise DenError(msg) - # Check va and mu same size - if dv1 != d: - msg = "mu and va not same dim" - raise DenError(msg) - if dv0 != 1 and dv0 != d: - msg = "va not square" - raise DenError(msg) - - #===============# - # Computation # - #===============# - if d == 1: - # scalar case - return _scalar_gauss_den(lx[:, 0], lmu[0, 0], lva[0, 0], log) - elif dv0 == 1: - # Diagonal matrix case - return _diag_gauss_den(lx, lmu, lva, log) - elif dv1 == dv0: - # full case - return _full_gauss_den(lx, lmu, lva, log) - else: - raise DenError("variance mode not recognized, this is a bug") - -# Those 3 functions do almost all the actual computation -def _scalar_gauss_den(x, mu, va, log): - """ This function is the actual implementation - of gaussian pdf in scalar case. It assumes all args - are conformant, so it should not be used directly - - Call gauss_den instead""" - d = mu.size - inva = 1/va - fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva) - inva *= -0.5 - y = ((x-mu) ** 2) * inva - if not log: - y = fac * N.exp(y) - else: - y += N.log(fac) - - return y - -def _diag_gauss_den(x, mu, va, log): - """ This function is the actual implementation - of gaussian pdf in scalar case. It assumes all args - are conformant, so it should not be used directly - - Call gauss_den instead""" - # Diagonal matrix case - d = mu.size - #n = x.shape[0] - if not log: - inva = 1/va[0] - fac = (2*N.pi) ** (-d/2.0) * N.prod(N.sqrt(inva)) - inva *= -0.5 - x = x - mu - x **= 2 - y = fac * N.exp(N.dot(x, inva)) - else: - # XXX optimize log case as non log case above - y = _scalar_gauss_den(x[:, 0], mu[0, 0], va[0, 0], log) - for i in range(1, d): - y += _scalar_gauss_den(x[:, i], mu[0, i], va[0, i], log) - return y - -def _full_gauss_den(x, mu, va, log): - """ This function is the actual implementation - of gaussian pdf in full matrix case. - - It assumes all args are conformant, so it should - not be used directly Call gauss_den instead - - Does not check if va is definite positive (on inversible - for that matter), so the inverse computation and/or determinant - would throw an exception.""" - d = mu.size - inva = lin.inv(va) - fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va))) - - # we are using a trick with sum to "emulate" - # the matrix multiplication inva * x without any explicit loop - #y = -0.5 * N.sum(N.dot((x-mu), inva) * (x-mu), 1) - y = -0.5 * N.dot(N.dot((x-mu), inva) * (x-mu), - N.ones((mu.size, 1), x.dtype))[:, 0] - - if not log: - y = fac * N.exp(y) - else: - y = y + N.log(fac) - - return y - -# To get coordinatea of a confidence ellipse from multi-variate gaussian pdf -def gauss_ell(mu, va, dim = misc.DEF_VIS_DIM, npoints = misc.DEF_ELL_NP, \ - level = misc.DEF_LEVEL): - """Given a mean and covariance for multi-variate - gaussian, returns the coordinates of the confidense ellipsoid. - - Compute npoints coordinates for the ellipse of confidence of given level - (all points will be inside the ellipsoides with a probability equal to - level). - - :Parameters: - mu : ndarray - mean of the pdf - va : ndarray - variance of the pdf - dim : sequence - sequences of two integers which represent the dimensions where to - project the ellipsoid. - npoints: int - number of points to generate for the ellipse. - level : float - level of confidence (between 0 and 1). - - :Returns: - Returns the coordinate x and y of the ellipse.""" - if level >= 1 or level <= 0: - raise ValueError("level should be a scale strictly between 0 and 1.""") - - mu = N.atleast_1d(mu) - va = N.atleast_1d(va) - d = N.shape(mu)[0] - c = N.array(dim) - - if N.any(c < 0) or N.any(c >= d): - raise ValueError("dim elements should be >= 0 and < %d (dimension"\ - " of the variance)" % d) - if N.size(mu) == N.size(va): - mode = 'diag' - else: - if N.ndim(va) == 2: - if N.shape(va)[0] == N.shape(va)[1]: - mode = 'full' - else: - raise DenError("variance not square") - else: - raise DenError("mean and variance are not dim conformant") - - # When X is a sample from multivariante N(mu, sigma), (X-mu)Sigma^-1(X-mu) - # follows a Chi2(d) law. Here, we only take 2 dimension, so Chi2 with 2 - # degree of freedom (See Wasserman. This is easy to see with characteristic - # functions) - chi22d = chi2(2) - mahal = N.sqrt(chi22d.ppf(level)) - - # Generates a circle of npoints - theta = N.linspace(0, 2 * N.pi, npoints) - circle = mahal * N.array([N.cos(theta), N.sin(theta)]) - - # Get the dimension which we are interested in: - mu = mu[c] - if mode == 'diag': - va = va[c] - elps = N.outer(mu, N.ones(npoints)) - elps += N.dot(N.diag(N.sqrt(va)), circle) - elif mode == 'full': - va = va[c, :][:, c] - # Method: compute the cholesky decomp of each cov matrix, that is - # compute cova such as va = cova * cova' - # WARN: scipy is different than matlab here, as scipy computes a lower - # triangular cholesky decomp: - # - va = cova * cova' (scipy) - # - va = cova' * cova (matlab) - # So take care when comparing results with matlab ! - cova = lin.cholesky(va) - elps = N.outer(mu, N.ones(npoints)) - elps += N.dot(cova, circle) - else: - raise ValueError("var mode not recognized") - - return elps[0, :], elps[1, :] - -def logsumexp(x): - """Compute log(sum(exp(x), 1)) while avoiding underflow. - - :Parameters: - x : ndarray - data in log domain to sum""" - axis = 1 - mc = N.max(x, axis) - return mc + N.log(N.sum(N.exp(x-mc[:, N.newaxis]), axis)) - -def multiple_gauss_den(data, mu, va, log = False): - """Helper function to generate several Gaussian - pdf (different parameters) at the same points - - :Parameters: - data : ndarray - points where to estimate the pdfs (n,d). - mu : ndarray - mean of the pdf, of shape (k,d). One row of dimension d per - different component, the number of rows k being the number of - component - va : ndarray - variance of the pdf. One row per different component for diagonal - covariance (k, d), or d rows per component for full matrix pdf - (k*d,d). - log : boolean - if True, returns the log-pdf instead of the pdf. - - :Returns: - Returns a (n, k) array, each column i being the pdf of the ith mean and - ith variance.""" - mu = N.atleast_2d(mu) - va = N.atleast_2d(va) - - k = N.shape(mu)[0] - n = N.shape(data)[0] - d = N.shape(mu)[1] - - y = N.zeros((k, n)) - if N.size(mu) == N.size(va): - for i in range(k): - y[i] = gauss_den(data, mu[i, :], va[i, :], log) - return y.T - else: - for i in range(k): - y[i] = gauss_den(data, mu[i, :], va[d*i:d*i+d, :], log) - return y.T - -if __name__ == "__main__": - pass diff --git a/scikits/learn/em/densities2.py b/scikits/learn/em/densities2.py deleted file mode 100644 index 80bd72dcdba64eda991267a262af7d751353961d..0000000000000000000000000000000000000000 --- a/scikits/learn/em/densities2.py +++ /dev/null @@ -1,230 +0,0 @@ -#! /usr/bin/python -# -# Copyrighted David Cournapeau -# Last Change: Sat Jun 02 07:00 PM 2007 J - -# New version, with default numpy ordering. - -import numpy as N -import numpy.linalg as lin -from numpy.random import randn -from scipy.stats import chi2 - -# Error classes -class DenError(Exception): - """Base class for exceptions in this module. - - Attributes: - expression -- input expression in which the error occurred - message -- explanation of the error""" - def __init__(self, message): - self.message = message - - def __str__(self): - return self.message - -#============ -# Public API -#============ -# The following function do all the fancy stuff to check that parameters -# are Ok, and call the right implementation if args are OK. -def gauss_den(x, mu, va, log = False, axis = -1): - """ Compute multivariate Gaussian density at points x for - mean mu and variance va along specified axis: - - requirements: - * mean must be rank 0 (1d) or rank 1 (multi variate gaussian) - * va must be rank 0 (1d), rank 1(multi variate, diag covariance) or rank 2 - (multivariate, full covariance). - * in 1 dimension case, any rank for mean and va is ok, as long as their size - is 1 (eg they contain only 1 element) - - Caution: if x is rank 1, it is assumed you have a 1d problem. You cannot compute - the gaussian densities of only one sample of dimension d; for this, you have - to use a rank 2 ! - - If log is True, than the log density is returned - (useful for underflow ?)""" - - # If data is rank 1, then we have 1 dimension problem. - if x.ndim == 1: - d = 1 - n = x.size - if not N.size(mu) == 1: - raise DenError("for 1 dimension problem, mean must have only one element") - - if not N.size(va) == 1: - raise DenError("for 1 dimension problem, mean must have only one element") - - return _scalar_gauss_den(x, mu, va, log) - # If data is rank 2, then we may have 1 dimension or multi-variate problem - elif x.ndim == 2: - oaxis = (axis + 1) % 2 - n = x.shape[axis] - d = x.shape[oaxis] - - # Get away with 1d case now - if d == 1: - return _scalar_gauss_den(x, mu, va, log) - - # Now, d > 1 (numpy attributes should be valid on mean and va now) - if not N.size(mu) == d or not mu.ndim == 1: - raise DenError("data is %d dimension, but mean's shape is %s" \ - % (d, N.shape(mu)) + " (should be (%d,))" % d) - - isfull = (va.ndim == 2) - if not (N.size(va) == d or (isfull and va.shape[0] == va.shape[1] == d)): - raise DenError("va has an invalid shape or number of elements") - - if isfull: - # Compute along rows - if oaxis == 0: - return _full_gauss_den(x, mu[:, N.newaxis], va, log, axis) - else: - return _full_gauss_den(x, mu, va, log, axis) - else: - return _diag_gauss_den(x, mu, va, log, axis) - else: - raise RuntimeError("Sorry, only rank up to 2 supported") - -# To plot a confidence ellipse from multi-variate gaussian pdf -def gauss_ell(mu, va, dim = [0, 1], npoints = 100, level = 0.39): - """ Given a mean and covariance for multi-variate - gaussian, returns npoints points for the ellipse - of confidence given by level (all points will be inside - the ellipsoides with a probability equal to level) - - Returns the coordinate x and y of the ellipse""" - - c = N.array(dim) - - if mu.size < 2: - raise RuntimeError("this function only make sense for dimension 2 and more") - - if mu.size == va.size: - mode = 'diag' - else: - if va.ndim == 2: - if va.shape[0] == va.shape[1]: - mode = 'full' - else: - raise DenError("variance not square") - else: - raise DenError("mean and variance are not dim conformant") - - # If X ~ N(mu, va), then [X` * va^(-1/2) * X] ~ Chi2 - chi22d = chi2(2) - mahal = N.sqrt(chi22d.ppf(level)) - - # Generates a circle of npoints - theta = N.linspace(0, 2 * N.pi, npoints) - circle = mahal * N.array([N.cos(theta), N.sin(theta)]) - - # Get the dimension which we are interested in: - mu = mu[dim] - if mode == 'diag': - va = va[dim] - elps = N.outer(mu, N.ones(npoints)) - elps += N.dot(N.diag(N.sqrt(va)), circle) - elif mode == 'full': - va = va[c,:][:,c] - # Method: compute the cholesky decomp of each cov matrix, that is - # compute cova such as va = cova * cova' - # WARN: scipy is different than matlab here, as scipy computes a lower - # triangular cholesky decomp: - # - va = cova * cova' (scipy) - # - va = cova' * cova (matlab) - # So take care when comparing results with matlab ! - cova = lin.cholesky(va) - elps = N.outer(mu, N.ones(npoints)) - elps += N.dot(cova, circle) - else: - raise DenParam("var mode not recognized") - - return elps[0, :], elps[1, :] - -#============= -# Private Api -#============= -# Those 3 functions do almost all the actual computation -def _scalar_gauss_den(x, mu, va, log): - """ This function is the actual implementation - of gaussian pdf in scalar case. It assumes all args - are conformant, so it should not be used directly - - Call gauss_den instead""" - inva = 1/va - fac = (2*N.pi) ** (-1/2.0) * N.sqrt(inva) - y = ((x-mu) ** 2) * -0.5 * inva - if not log: - y = fac * N.exp(y.ravel()) - else: - y = y + log(fac) - - return y - -def _diag_gauss_den(x, mu, va, log, axis): - """ This function is the actual implementation - of gaussian pdf in scalar case. It assumes all args - are conformant, so it should not be used directly - - Call gauss_den instead""" - # Diagonal matrix case - d = mu.size - if axis % 2 == 0: - x = N.swapaxes(x, 0, 1) - - if not log: - inva = 1/va[0] - fac = (2*N.pi) ** (-d/2.0) * N.sqrt(inva) - y = (x[0] - mu[0]) ** 2 * inva * -0.5 - for i in range(1, d): - inva = 1/va[i] - fac *= N.sqrt(inva) - y += (x[i] - mu[i]) ** 2 * inva * -0.5 - y = fac * N.exp(y) - else: - y = _scalar_gauss_den(x[0], mu[0], va[0], log) - for i in range(1, d): - y += _scalar_gauss_den(x[i], mu[i], va[i], log) - - return y - -def _full_gauss_den(x, mu, va, log, axis): - """ This function is the actual implementation - of gaussian pdf in full matrix case. - - It assumes all args are conformant, so it should - not be used directly Call gauss_den instead - - Does not check if va is definite positive (on inversible - for that matter), so the inverse computation and/or determinant - would throw an exception.""" - d = mu.size - inva = lin.inv(va) - fac = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va))) - - # # Slow version (does not work since version 0.6) - # n = N.size(x, 0) - # y = N.zeros(n) - # for i in range(n): - # y[i] = N.dot(x[i,:], - # N.dot(inva, N.transpose(x[i,:]))) - # y *= -0.5 - - # we are using a trick with sum to "emulate" - # the matrix multiplication inva * x without any explicit loop - if axis % 2 == 1: - y = N.dot(inva, (x-mu)) - y = -0.5 * N.sum(y * (x-mu), 0) - else: - y = N.dot((x-mu), inva) - y = -0.5 * N.sum(y * (x-mu), 1) - - if not log: - y = fac * N.exp(y) - else: - y = y + N.log(fac) - - return y - diff --git a/scikits/learn/em/doc/Makefile b/scikits/learn/em/doc/Makefile deleted file mode 100644 index 00c460adc3d33cc2b5b8783ac0a0623d34f51ff6..0000000000000000000000000000000000000000 --- a/scikits/learn/em/doc/Makefile +++ /dev/null @@ -1,50 +0,0 @@ -# Last Change: Sun Jul 22 11:00 AM 2007 J - -# This makefile is used to build the pdf from the rest file and inlined code -# from python examples - -py2tex = pygmentize -l python -f tex -rst2tex = PYTHONPATH=/home/david/local/lib/python2.4/site-packages rst2newlatex.py \ - --stylesheet-path base.tex --user-stylesheet user.tex - -pytexfiles = pyem.tex basic_example1.tex basic_example2.tex basic_example3.tex pdfestimation.tex discriminant_analysis.tex - -SOURCEPATH = $(PWD) - -EXTTOCLEAN=.chk .dvi .log .aux .bbl .blg .blig .ilg .toc .lof .lot .idx .ind .out .bak .ps .pdf .bm - -tutorial.pdf: pyem.pdf - mv $< $@ - -pyem.pdf: $(pytexfiles) - pdflatex $< - pdflatex $< - pdflatex $< - -pyem.tex: index.txt - $(rst2tex) $< > $@ - -basic_example1.tex: ../examples/basic_example1.py - $(py2tex) $< > $@ - -basic_example2.tex: ../examples/basic_example2.py - $(py2tex) $< > $@ - -basic_example3.tex: ../examples/basic_example3.py - $(py2tex) $< > $@ - -pdfestimation.tex: ../examples/pdfestimation.py - $(py2tex) $< > $@ - -discriminant_analysis.tex: ../examples/discriminant_analysis.py - $(py2tex) $< > $@ - -clean: - for i in $(pytexfiles); do \ - rm -f `echo $$i`; \ - done; - for i in $(SOURCEPATH); do \ - for j in $(EXTTOCLEAN); do \ - rm -f `echo $$i/*$$j`; \ - done; \ - done; diff --git a/scikits/learn/em/doc/base.tex b/scikits/learn/em/doc/base.tex deleted file mode 100644 index b10ea72c7ab929797973f7598c8cba44fab516d6..0000000000000000000000000000000000000000 --- a/scikits/learn/em/doc/base.tex +++ /dev/null @@ -1,1182 +0,0 @@ -% System stylesheet for the new LaTeX writer, newlatex2e. - -% Major parts of the rendering are done in this stylesheet and not in the -% Python module. - -% For development notes, see notes.txt. - -% User documentation (in the stylesheet for now; that may change though): - -% Naming conventions: -% All uppercase letters in macro names have a specific meaning. -% \D...: All macros introduced by the Docutils LaTeX writer start with "D". -% \DS<name>: Setup function (called at the bottom of this stylesheet). -% \DN<nodename>{<contents>}: Handler for Docutils document tree node `node`; called by -% the Python module. -% \DEV<name>: External variable, set by the Python module. -% \DEC<name>: External command. It is called by the Python module and must be -% defined in this stylesheet. -% \DN<nodename>A<attribute>{<number>}{<attribute>}{<value>}{<nodename>}{<contents>}: -% Attribute handler for `attribute` set on nodes of type `nodename`. -% See below for a discussion of attribute handlers. -% \DA<attribute>{<number>}{<attribute>}{<value>}{<nodename>}{<contents>}: -% Attribute handler for all `attribute`. Called only when no specific -% \DN<nodename>A<attribute> handler is defined. -% \DN<nodename>C<class>{<contents>}: -% Handler for `class`, when set on nodes of type `nodename`. -% \DC<class>{<contents>}: -% Handler for `class`. Called only when no specific \DN<nodename>C<class> -% handler is defined. -% \D<name>: Generic variable or function. - -% Attribute handlers: -% TODO - -% --------------------------------------------------------------------------- - -% Having to intersperse code with \makeatletter-\makeatother pairs is very -% annoying, so we call \makeatletter at the top and \makeatother at the -% bottom. Just be aware that you cannot use "@" as a text character inside -% this stylesheet. -\makeatletter - -% Print-mode (as opposed to online mode e.g. with Adobe Reader). -% This causes for example blue hyperlinks. -\providecommand{\Dprinting}{false} - -% \DSearly is called right after \documentclass. -\providecommand{\DSearly}{} -% \DSlate is called at the end of the stylesheet (right before the document -% tree). -\providecommand{\DSlate}{} - -% Use the KOMA script article class. -\providecommand{\Ddocumentclass}{scrartcl} -\providecommand{\Ddocumentoptions}{a4paper} -\providecommand{\DSdocumentclass}{ - \documentclass[\Ddocumentoptions]{\Ddocumentclass} } - -% Todo: This should be movable to the bottom, but it isn't as long as -% we use \usepackage commands at the top level of this stylesheet -% (which we shouldn't). -\DSdocumentclass - -\providecommand{\DSpackages}{ - % Load miscellaneous packages. - % Note 1: Many of the packages loaded here are used throughout this stylesheet. - % If one of these packages does not work on your system or in your scenario, - % please let us know, so we can consider making the package optional. - % Note 2: It would appear cleaner to load packages where they are used. - % However, since using a wrong package loading order can lead to *very* - % subtle bugs, we centralize the loading of most packages here. - \DSfontencoding % load font encoding packages - \DSlanguage % load babel - % Using \ifthenelse conditionals. - \usepackage{ifthen} % before hyperref (really!) - % There is not support for *not* using hyperref because it's used in many - % places. If this is a problem (e.g. because hyperref doesn't work on your - % system), please let us know. - \usepackage[colorlinks=false,pdfborder={0 0 0}]{hyperref} - % Get color, e.g. for links and system messages. - \usepackage{color} - % Get \textnhtt macro (non-hyphenating type writer). - \usepackage{hyphenat} - % For sidebars. - \usepackage{picins} - % We use longtable to create tables. - \usepackage{longtable} - % Images. - \usepackage{graphicx} - % These packages might be useful (some just add magic pixie dust), so - % evaluate them: - %\usepackage{fixmath} - %\usepackage{amsmath} - % Add some missing symbols like \textonehalf. - \usepackage{textcomp} -} - -\providecommand{\DSfontencoding}{ - % Set up font encoding. Called by \DSpackages. - % AE is a T1 emulation. It provides mostly the same characters and - % features as T1-encoded fonts but doesn't use bitmap fonts (which are - % unsuitable for online reading and subtle for printers). - \usepackage{ae} - % Provide the characters not contained in AE from EC bitmap fonts. - \usepackage{aecompl} - % Guillemets ("<<", ">>") in AE. - \usepackage{aeguill} -} - -\providecommand{\DSsymbols}{% - % Fix up symbols. - % The Euro symbol in Computer Modern looks, um, funny. Let's get a - % proper Euro symbol. - \usepackage{eurosym}% - \renewcommand{\texteuro}{\euro}% -} - -% Taken from -% <http://groups.google.de/groups?selm=1i0n5tgtplti420e1omp4pctlv19jpuhbb%404ax.com> -% and modified. Used with permission. -\providecommand{\Dprovidelength}[2]{% - \begingroup% - \escapechar\m@ne% - \xdef\@gtempa{{\string#1}}% - \endgroup% - \expandafter\@ifundefined\@gtempa% - {\newlength{#1}\setlength{#1}{#2}}% - {}% -} - -\providecommand{\Dprovidecounter}[2]{% - % Like \newcounter except that it doesn't crash if the counter - % already exists. - \@ifundefined{c@#1}{\newcounter{#1}\setcounter{#1}{#2}}{} -} - -\Dprovidelength{\Dboxparindent}{\parindent} - -\providecommand{\Dmakebox}[1]{% - % Make a centered, frameless box. Useful e.g. for block quotes. - % Do not use minipages here, but create pseudo-lists to allow - % page-breaking. (Don't use KOMA-script's addmargin environment - % because it messes up bullet lists.) - \Dmakelistenvironment{}{}{% - \setlength{\parskip}{0pt}% - \setlength{\parindent}{\Dboxparindent}% - \item{#1}% - }% -} - -\providecommand{\Dmakefbox}[1]{% - % Make a centered, framed box. Useful e.g. for admonitions. - \vspace{0.4\baselineskip}% - \begin{center}% - \fbox{% - \begin{minipage}[t]{0.9\linewidth}% - \setlength{\parindent}{\Dboxparindent}% - #1% - \end{minipage}% - }% - \end{center}% - \vspace{0.4\baselineskip}% -} - -% We do not currently recognize the difference between an end-sentence and a -% mid-sentence period (". " vs. ". " in plain text). So \frenchspacing is -% appropriate. -\providecommand{\DSfrenchspacing}{\frenchspacing} - - -\Dprovidelength{\Dblocklevelvspace}{% - % Space between block-level elements other than paragraphs. - 0.7\baselineskip plus 0.3\baselineskip minus 0.2\baselineskip% -} -\providecommand{\DECauxiliaryspace}{% - \ifthenelse{\equal{\Dneedvspace}{true}}{\vspace{\Dblocklevelvspace}}{}% - \par\noindent% -} -\providecommand{\DECparagraphspace}{\par} -\providecommand{\Dneedvspace}{true} - -\providecommand{\DSlanguage}{% - % Set up babel. - \usepackage[\DEVlanguagebabel]{babel} -} - -\providecommand{\Difdefined}[3]{\@ifundefined{#1}{#3}{#2}} - -% Handler for 'classes' attribute (called for each class attribute). -\providecommand{\DAclasses}[5]{% - % Dispatch to \DN<nodename>C<class>. - \Difdefined{DN#4C#3}{% - % Pass only contents, nothing else! - \csname DN#4C#3\endcsname{#5}% - }{% - % Otherwise, dispatch to \DC<class>. - \Difdefined{DC#3}{% - \csname DC#3\endcsname{#5}% - }{% - #5% - }% - }% -} - -\providecommand{\DECattr}[5]{% - % Global attribute dispatcher, called inside the document tree. - % Parameters: - % 1. Attribute number. - % 2. Attribute name. - % 3. Attribute value. - % 4. Node name. - % 5. Node contents. - \Difdefined{DN#4A#2}{% - % Dispatch to \DN<nodename>A<attribute>. - \csname DN#4A#2\endcsname{#1}{#2}{#3}{#4}{#5}% - }{\Difdefined{DA#2}{% - % Otherwise dispatch to \DA<attribute>. - \csname DA#2\endcsname{#1}{#2}{#3}{#4}{#5}% - }{% - % Otherwise simply run the contents without calling a handler. - #5% - }}% -} - -% ---------- Link handling ---------- -% Targets and references. - -\providecommand{\Draisedlink}[1]{% - % Anchors are placed on the base line by default. This is a bad thing for - % inline context, so we raise the anchor (normally by \baselineskip). - \Hy@raisedlink{#1}% -} - -% References. -% We're assuming here that the "refid" and "refuri" attributes occur -% only in inline context (in TextElements). -\providecommand{\DArefid}[5]{% - \ifthenelse{\equal{#4}{reference}}{% - \Dexplicitreference{\##3}{#5}% - }{% - % If this is not a target node (targets with refids are - % uninteresting and should be silently dropped). - \ifthenelse{\not\equal{#4}{target}}{% - % If this is a footnote reference, call special macro. - \ifthenelse{\equal{#4}{footnotereference}}{% - \Dimplicitfootnotereference{\##3}{#5}% - }{% - \ifthenelse{\equal{#4}{citationreference}}{% - \Dimplicitcitationreference{\##3}{#5}% - }{% - \Dimplicitreference{\##3}{#5}% - }% - }% - }{}% - }% -} -\providecommand{\DArefuri}[5]{% - \ifthenelse{\equal{#4}{target}}{% - % The node name is 'target', so this is a hyperlink target, like this: - % .. _mytarget: URI - % Hyperlink targets are ignored because they are invisible. - }{% - % If a non-target node has a refuri attribute, it must be an explicit URI - % reference (i.e. node name is 'reference'). - \Durireference{#3}{#5}% - }% -} -% Targets. -\providecommand{\DAids}[5]{% - \label{#3}% - \ifthenelse{\equal{#4}{footnotereference}}{% - {% - \renewcommand{\HyperRaiseLinkDefault}{% - % Dirty hack to make backrefs to footnote references work. - % For some reason, \baselineskip is 0pt in fn references. - 0.5\Doriginalbaselineskip% - }% - \Draisedlink{\hypertarget{#3}{}}#5% - }% - }{% - \Draisedlink{\hypertarget{#3}{}}#5% - }% -} -\providecommand{\Dimplicitreference}[2]{% - % Create implicit reference to ID. Implicit references occur - % e.g. in TOC-backlinks of section titles. Parameters: - % 1. Target. - % 2. Link text. - \href{#1}{#2}% -} -\providecommand{\Dimplicitfootnotereference}[2]{% - % Ditto, but for the special case of footnotes. - % We want them to be rendered like explicit references. - \Dexplicitreference{#1}{#2}% -} -\providecommand{\Dimplicitcitationreference}[2]{% - % Ditto for citation references. - \Dimplicitfootnotereference{#1}{#2}% -} -\providecommand{\Dcolorexplicitreference}{% - \ifthenelse{\equal{\Dprinting}{true}}{\color{black}}{\color{blue}}% -} -\providecommand{\Dexplicitreference}[2]{% - % Create explicit reference to ID, e.g. created with "foo_". - % Parameters: - % 1. Target. - % 2. Link text. - \href{#1}{{\Dcolorexplicitreference#2}}% -} -\providecommand{\Dcolorurireference}{\Dcolorexplicitreference} -\providecommand{\Durireference}[2]{% - % Create reference to URI. Parameters: - % 1. Target. - % 2. Link text. - \href{#1}{{\Dcolorurireference#2}}% -} - -\Dprovidecounter{Dpdfbookmarkid}{0}% -\providecommand{\Dpdfbookmark}[1]{% - % Temporarily decrement Desctionlevel counter. - \addtocounter{Dsectionlevel}{-1}% - %\typeout{\arabic{Dsectionlevel}}% - %\typeout{#1}% - %\typeout{docutils\roman{Dpdfbookmarkid}}% - %\typeout{}% - \pdfbookmark[\arabic{Dsectionlevel}]{#1}{docutils\arabic{Dpdfbookmarkid}}% - \addtocounter{Dsectionlevel}{1}% - \addtocounter{Dpdfbookmarkid}{1}% -} -% ---------- End of Link Handling ---------- - -\providecommand{\DNparagraph}[1]{% - \ifthenelse{\equal{\DEVparagraphindented}{true}}{\indent}{\noindent}% - #1% -} -\providecommand{\Dformatboxtitle}[1]{{\Large\textbf{#1}}} -\providecommand{\Dformatboxsubtitle}[1]{{\large\textbf{#1}}} -\providecommand{\Dtopictitle}[1]{% - \Difinsidetoc{\vspace{1em}\par}{}% - \noindent\Dformatboxtitle{#1}% - \ifthenelse{\equal{\DEVhassubtitle}{false}}{\vspace{1em}}{\vspace{0.5em}}% - \par% -} -\providecommand{\Dadmonitiontitle}[1]{% - \Dtopictitle{#1}% -} -\providecommand{\Dtopicsubtitle}[1]{% - \noindent\Dformatboxsubtitle{#1}% - \vspace{1em}% - \par% -} -\providecommand{\Dsidebartitle}[1]{\Dtopictitle{#1}} -\providecommand{\Dsidebarsubtitle}[1]{\Dtopicsubtitle{#1}} -\providecommand{\Ddocumenttitle}[1]{% - \begin{center}{\Huge#1}\end{center}% - \ifthenelse{\equal{\DEVhassubtitle}{true}}{\vspace{0.1cm}}{\vspace{1cm}}% -} -\providecommand{\Ddocumentsubtitle}[1]{% - \begin{center}{\huge#1}\end{center}% - \vspace{1cm}% -} -% Can be overwritten by user stylesheet. -\providecommand{\Dformatsectiontitle}[1]{#1} -\providecommand{\Dformatsectionsubtitle}[1]{\Dformatsectiontitle{#1}} -\providecommand{\Dbookmarksectiontitle}[1]{% - % Return text suitable for use in \section*, \subsection*, etc., - % containing a PDF bookmark. Parameter: The title (as node tree). - \Draisedlink{\Dpdfbookmark{\DEVtitleastext}}% - #1% -} -\providecommand{\Dsectiontitlehook}[1]{#1} -\providecommand{\Dsectiontitle}[1]{% - \Dsectiontitlehook{% - \Ddispatchsectiontitle{\Dbookmarksectiontitle{\Dformatsectiontitle{#1}}}% - }% -} -\providecommand{\Ddispatchsectiontitle}[1]{% - \@ifundefined{Dsectiontitle\roman{Dsectionlevel}}{% - \Ddeepsectiontitle{#1}% - }{% - \csname Dsectiontitle\roman{Dsectionlevel}\endcsname{#1}% - }% -} -\providecommand{\Ddispatchsectionsubtitle}[1]{% - \Ddispatchsectiontitle{#1}% -} -\providecommand{\Dsectiontitlei}[1]{\section*{#1}} -\providecommand{\Dsectiontitleii}[1]{\subsection*{#1}} -\providecommand{\Ddeepsectiontitle}[1]{% - % Anything below \subsubsection (like \paragraph or \subparagraph) - % is useless because it uses the same font. The only way to - % (visually) distinguish such deeply nested sections is to use - % section numbering. - \subsubsection*{#1}% -} -\providecommand{\Dsectionsubtitlehook}[1]{#1} -\Dprovidelength{\Dsectionsubtitleraisedistance}{0.7em} -\providecommand{\Dsectionsubtitlescaling}{0.85} -\providecommand{\Dsectionsubtitle}[1]{% - \Dsectionsubtitlehook{% - % Move the subtitle nearer to the title. - \vspace{-\Dsectionsubtitleraisedistance}% - % Don't create a PDF bookmark. - \Ddispatchsectionsubtitle{% - \Dformatsectionsubtitle{\scalebox{\Dsectionsubtitlescaling}{#1}}% - }% - }% -} -\providecommand{\DNtitle}[1]{% - % Dispatch to \D<parent>title. - \csname D\DEVparent title\endcsname{#1}% -} -\providecommand{\DNsubtitle}[1]{% - % Dispatch to \D<parent>subtitle. - \csname D\DEVparent subtitle\endcsname{#1}% -} - -\providecommand{\DNliteralblock}[1]{% - \Dmakelistenvironment{}{% - \ifthenelse{\equal{\Dinsidetabular}{true}}{% - \setlength{\leftmargin}{0pt}% - }{}% - \setlength{\rightmargin}{0pt}% - }{% - \raggedright\item\noindent\nohyphens{\textnhtt{#1\Dfinalstrut}}% - }% -} -\providecommand{\DNdoctestblock}[1]{\DNliteralblock{#1}} -\providecommand{\DNliteral}[1]{\textnhtt{#1}} -\providecommand{\DNemphasis}[1]{\emph{#1}} -\providecommand{\DNstrong}[1]{\textbf{#1}} -\providecommand{\DECvisitdocument}{\begin{document}\noindent} -\providecommand{\DECdepartdocument}{\end{document}} -\providecommand{\DNtopic}[1]{% - \ifthenelse{\equal{\DEVcurrentNtopicAcontents}{1}}{% - \addtocounter{Dtoclevel}{1}% - \par\noindent% - #1% - \addtocounter{Dtoclevel}{-1}% - }{% - \par\noindent% - \Dmakebox{#1}% - }% -} -\providecommand{\DNadmonition}[1]{% - \DNtopic{#1}% -} -\providecommand{\Dformatrubric}[1]{\textbf{#1}} -\Dprovidelength{\Dprerubricspace}{0.3em} -\providecommand{\DNrubric}[1]{% - \vspace{\Dprerubricspace}\par\noindent\Dformatrubric{#1}\par% -} - -\providecommand{\Dbullet}{} -\providecommand{\DECsetbullet}[1]{\renewcommand{\Dbullet}{#1}} -\providecommand{\DNbulletlist}[1]{% - \Difinsidetoc{% - \Dtocbulletlist{#1}% - }{% - \Dmakelistenvironment{\Dbullet}{}{#1}% - }% -} -% Todo: So what on earth is @pnumwidth? -\renewcommand{\@pnumwidth}{2.2em} -\providecommand{\DNlistitem}[1]{% - \Difinsidetoc{% - \ifthenelse{\equal{\theDtoclevel}{1}\and\equal{\Dlocaltoc}{false}}{% - {% - \par\addvspace{1em}\noindent% - \sectfont% - #1\hfill\pageref{\DEVcurrentNlistitemAtocrefid}% - }% - }{% - \@dottedtocline{0}{\Dtocindent}{0em}{#1}{% - \pageref{\DEVcurrentNlistitemAtocrefid}% - }% - }% - }{% - \item{#1}% - }% -} -\providecommand{\DNenumeratedlist}[1]{#1} -\Dprovidecounter{Dsectionlevel}{0} -\providecommand{\Dvisitsectionhook}{} -\providecommand{\Ddepartsectionhook}{} -\providecommand{\DECvisitsection}{% - \addtocounter{Dsectionlevel}{1}% - \Dvisitsectionhook% -} -\providecommand{\DECdepartsection}{% - \Ddepartsectionhook% - \addtocounter{Dsectionlevel}{-1}% -} - -% Using \_ will cause hyphenation after _ even in \textnhtt-typewriter -% because the hyphenat package redefines \_. So we use -% \textunderscore here. -\providecommand{\DECtextunderscore}{\textunderscore} - -\providecommand{\Dtextinlineliteralfirstspace}{{ }} -\providecommand{\Dtextinlineliteralsecondspace}{{~}} - -\Dprovidelength{\Dlistspacing}{0.8\baselineskip} - -\providecommand{\Dsetlistrightmargin}{% - \ifthenelse{\lengthtest{\linewidth>12em}}{% - % Equal margins. - \setlength{\rightmargin}{\leftmargin}% - }{% - % If the line is narrower than 10em, we don't remove any further - % space from the right. - \setlength{\rightmargin}{0pt}% - }% -} -\providecommand{\Dresetlistdepth}{false} -\Dprovidelength{\Doriginallabelsep}{\labelsep} -\providecommand{\Dmakelistenvironment}[3]{% - % Make list environment with support for unlimited nesting and with - % reasonable default lengths. Parameters: - % 1. Label (same as in list environment). - % 2. Spacing (same as in list environment). - % 3. List contents (contents of list environment). - \ifthenelse{\equal{\Dinsidetabular}{true}}{% - % Unfortunately, vertical spacing doesn't work correctly when - % using lists inside tabular environments, so we use a minipage. - \begin{minipage}[t]{\linewidth}% - }{}% - {% - \renewcommand{\Dneedvspace}{false}% - % \parsep0.5\baselineskip - \renewcommand{\Dresetlistdepth}{false}% - \ifnum \@listdepth>5% - \protect\renewcommand{\Dresetlistdepth}{true}% - \@listdepth=5% - \fi% - \begin{list}{% - #1% - }{% - \setlength{\itemsep}{0pt}% - \setlength{\partopsep}{0pt}% - \setlength{\topsep}{0pt}% - % List should take 90% of total width. - \setlength{\leftmargin}{0.05\linewidth}% - \ifthenelse{\lengthtest{\leftmargin<1.8em}}{% - \setlength{\leftmargin}{1.8em}% - }{}% - \setlength{\labelsep}{\Doriginallabelsep}% - \Dsetlistrightmargin% - #2% - }{% - #3% - }% - \end{list}% - \ifthenelse{\equal{\Dresetlistdepth}{true}}{\@listdepth=5}{}% - }% - \ifthenelse{\equal{\Dinsidetabular}{true}}{\end{minipage}}{}% -} -\providecommand{\Dfinalstrut}{\@finalstrut\@arstrutbox} -\providecommand{\DAlastitem}[5]{#5\Dfinalstrut} - -\Dprovidelength{\Ditemsep}{0pt} -\providecommand{\DECmakeenumeratedlist}[6]{% - % Make enumerated list. - % Parameters: - % - prefix - % - type (\arabic, \roman, ...) - % - suffix - % - suggested counter name - % - start number - 1 - % - list contents - \newcounter{#4}% - \Dmakelistenvironment{#1#2{#4}#3}{% - % Use as much space as needed for the label. - \setlength{\labelwidth}{10em}% - % Reserve enough space so that the label doesn't go beyond the - % left margin of preceding paragraphs. Like that: - % - % A paragraph. - % - % 1. First item. - \setlength{\leftmargin}{2.5em}% - \Dsetlistrightmargin% - \setlength{\itemsep}{\Ditemsep}% - % Use counter recommended by Python module. - \usecounter{#4}% - % Set start value. - \addtocounter{#4}{#5}% - }{% - % The list contents. - #6% - }% -} - - -% Single quote in literal mode. \textquotesingle from package -% textcomp has wrong width when using package ae, so we use a normal -% single curly quote here. -\providecommand{\DECtextliteralsinglequote}{'} - - -% "Tabular lists" are field lists and options lists (not definition -% lists because there the term always appears on its own line). We'll -% use the terminology of field lists now ("field", "field name", -% "field body"), but the same is also analogously applicable to option -% lists. -% -% We want these lists to be breakable across pages. We cannot -% automatically get the narrowest possible size for the left column -% (i.e. the field names or option groups) because tabularx does not -% support multi-page tables, ltxtable needs to have the table in an -% external file and we don't want to clutter the user's directories -% with auxiliary files created by the filecontents environment, and -% ltablex is not included in teTeX. -% -% Thus we set a fixed length for the left column and use list -% environments. This also has the nice side effect that breaking is -% now possible anywhere, not just between fields. -% -% Note that we are creating a distinct list environment for each -% field. There is no macro for a whole tabular list! -\Dprovidelength{\Dtabularlistfieldnamewidth}{6em} -\Dprovidelength{\Dtabularlistfieldnamesep}{0.5em} -\providecommand{\Dinsidetabular}{false} -\providecommand{\Dsavefieldname}{} -\providecommand{\Dsavefieldbody}{} -\Dprovidelength{\Dusedfieldnamewidth}{0pt} -\Dprovidelength{\Drealfieldnamewidth}{0pt} -\providecommand{\Dtabularlistfieldname}[1]{\renewcommand{\Dsavefieldname}{#1}} -\providecommand{\Dtabularlistfieldbody}[1]{\renewcommand{\Dsavefieldbody}{#1}} -\Dprovidelength{\Dparskiptemp}{0pt} -\providecommand{\Dtabularlistfield}[1]{% - {% - % This only saves field name and field body in \Dsavefieldname and - % \Dsavefieldbody, resp. It does not insert any text into the - % document. - #1% - % Recalculate the real field name width everytime we encounter a - % tabular list field because it may have been changed using a - % "raw" node. - \setlength{\Drealfieldnamewidth}{\Dtabularlistfieldnamewidth}% - \addtolength{\Drealfieldnamewidth}{\Dtabularlistfieldnamesep}% - \Dmakelistenvironment{% - \makebox[\Drealfieldnamewidth][l]{\Dsavefieldname}% - }{% - \setlength{\labelwidth}{\Drealfieldnamewidth}% - \setlength{\leftmargin}{\Drealfieldnamewidth}% - \setlength{\rightmargin}{0pt}% - \setlength{\labelsep}{0pt}% - }{% - \item% - \settowidth{\Dusedfieldnamewidth}{\Dsavefieldname}% - \setlength{\Dparskiptemp}{\parskip}% - \ifthenelse{% - \lengthtest{\Dusedfieldnamewidth>\Dtabularlistfieldnamewidth}% - }{% - \mbox{}\par% - \setlength{\parskip}{0pt}% - }{}% - \Dsavefieldbody% - \setlength{\parskip}{\Dparskiptemp}% - %XXX Why did we need this? - %\@finalstrut\@arstrutbox% - }% - \par% - }% -} - -\providecommand{\Dformatfieldname}[1]{\textbf{#1:}} -\providecommand{\DNfieldlist}[1]{#1} -\providecommand{\DNfield}[1]{\Dtabularlistfield{#1}} -\providecommand{\DNfieldname}[1]{% - \Dtabularlistfieldname{% - \Dformatfieldname{#1}% - }% -} -\providecommand{\DNfieldbody}[1]{\Dtabularlistfieldbody{#1}} - -\providecommand{\Dformatoptiongroup}[1]{% - % Format option group, e.g. "-f file, --input file". - \texttt{#1}% -} -\providecommand{\Dformatoption}[1]{% - % Format option, e.g. "-f file". - % Put into mbox to avoid line-breaking at spaces. - \mbox{#1}% -} -\providecommand{\Dformatoptionstring}[1]{% - % Format option string, e.g. "-f". - #1% -} -\providecommand{\Dformatoptionargument}[1]{% - % Format option argument, e.g. "file". - \textsl{#1}% -} -\providecommand{\Dformatoptiondescription}[1]{% - % Format option description, e.g. - % "\DNparagraph{Read input data from file.}" - #1% -} -\providecommand{\DNoptionlist}[1]{#1} -\providecommand{\Doptiongroupjoiner}{,{ }} -\providecommand{\Disfirstoption}{% - % Auxiliary macro indicating if a given option is the first child - % of its option group (if it's not, it has to preceded by - % \Doptiongroupjoiner). - false% -} -\providecommand{\DNoptionlistitem}[1]{% - \Dtabularlistfield{#1}% -} -\providecommand{\DNoptiongroup}[1]{% - \renewcommand{\Disfirstoption}{true}% - \Dtabularlistfieldname{\Dformatoptiongroup{#1}}% -} -\providecommand{\DNoption}[1]{% - % If this is not the first option in this option group, add a - % joiner. - \ifthenelse{\equal{\Disfirstoption}{true}}{% - \renewcommand{\Disfirstoption}{false}% - }{% - \Doptiongroupjoiner% - }% - \Dformatoption{#1}% -} -\providecommand{\DNoptionstring}[1]{\Dformatoptionstring{#1}} -\providecommand{\DNoptionargument}[1]{{ }\Dformatoptionargument{#1}} -\providecommand{\DNdescription}[1]{% - \Dtabularlistfieldbody{\Dformatoptiondescription{#1}}% -} - -\providecommand{\DNdefinitionlist}[1]{% - \begin{description}% - \parskip0pt% - #1% - \end{description}% -} -\providecommand{\DNdefinitionlistitem}[1]{% - % LaTeX expects the label in square brackets; we provide an empty - % label. - \item[]#1% -} -\providecommand{\Dformatterm}[1]{#1} -\providecommand{\DNterm}[1]{\hspace{-5pt}\Dformatterm{#1}} -% I'm still not sure what's the best rendering for classifiers. The -% colon syntax is used by reStructuredText, so it's at least WYSIWYG. -% Use slanted text because italic would cause too much emphasis. -\providecommand{\Dformatclassifier}[1]{\textsl{#1}} -\providecommand{\DNclassifier}[1]{~:~\Dformatclassifier{#1}} -\providecommand{\Dformatdefinition}[1]{#1} -\providecommand{\DNdefinition}[1]{\par\Dformatdefinition{#1}} - -\providecommand{\Dlineblockindentation}{2.5em} -\providecommand{\DNlineblock}[1]{% - \Dmakelistenvironment{}{% - \ifthenelse{\equal{\DEVparent}{lineblock}}{% - % Parent is a line block, so indent. - \setlength{\leftmargin}{\Dlineblockindentation}% - }{% - % At top level; don't indent. - \setlength{\leftmargin}{0pt}% - }% - \setlength{\rightmargin}{0pt}% - \setlength{\parsep}{0pt}% - }{% - #1% - }% -} -\providecommand{\DNline}[1]{\item#1} - -\providecommand{\DNtransition}{% - \raisebox{0.25em}{\parbox{\linewidth}{\hspace*{\fill}\hrulefill\hrulefill\hspace*{\fill}}}% -} - -\providecommand{\Dformatblockquote}[1]{% - % Format contents of block quote. - % This occurs in block-level context, so we cannot use \textsl. - {\slshape#1}% -} -\providecommand{\Dformatattribution}[1]{---\textup{#1}} -\providecommand{\DNblockquote}[1]{% - \Dmakebox{% - \Dformatblockquote{#1} - }% -} -\providecommand{\DNattribution}[1]{% - \par% - \begin{flushright}\Dformatattribution{#1}\end{flushright}% -} - - -% Sidebars: -% Vertical and horizontal margins. -\Dprovidelength{\Dsidebarvmargin}{0.5em} -\Dprovidelength{\Dsidebarhmargin}{1em} -% Padding (space between contents and frame). -\Dprovidelength{\Dsidebarpadding}{1em} -% Frame width. -\Dprovidelength{\Dsidebarframewidth}{2\fboxrule} -% Position ("l" or "r"). -\providecommand{\Dsidebarposition}{r} -% Width. -\Dprovidelength{\Dsidebarwidth}{0.45\linewidth} -\providecommand{\DNsidebar}[1]{ - \parpic[\Dsidebarposition]{% - \begin{minipage}[t]{\Dsidebarwidth}% - % Doing this with nested minipages is ugly, but I haven't found - % another way to place vertical space before and after the fbox. - \vspace{\Dsidebarvmargin}% - {% - \setlength{\fboxrule}{\Dsidebarframewidth}% - \setlength{\fboxsep}{\Dsidebarpadding}% - \fbox{% - \begin{minipage}[t]{\linewidth}% - \setlength{\parindent}{\Dboxparindent}% - #1% - \end{minipage}% - }% - }% - \vspace{\Dsidebarvmargin}% - \end{minipage}% - }% -} - - -% Citations and footnotes. -\providecommand{\Dformatfootnote}[1]{% - % Format footnote. - {% - \footnotesize#1% - % \par is necessary for LaTeX to adjust baselineskip to the - % changed font size. - \par% - }% -} -\providecommand{\Dformatcitation}[1]{\Dformatfootnote{#1}} -\Dprovidelength{\Doriginalbaselineskip}{0pt} -\providecommand{\DNfootnotereference}[1]{% - {% - % \baselineskip is 0pt in \textsuperscript, so we save it here. - \setlength{\Doriginalbaselineskip}{\baselineskip}% - \textsuperscript{#1}% - }% -} -\providecommand{\DNcitationreference}[1]{{[}#1{]}} -\Dprovidelength{\Dfootnotesep}{3.5pt} -\providecommand{\Dsetfootnotespacing}{% - % Spacing commands executed at the beginning of footnotes. - \setlength{\parindent}{0pt}% - \hspace{1em}% -} -\providecommand{\DNfootnote}[1]{% - % See ltfloat.dtx for details. - {% - \insert\footins{% - % BUG: This is too small if the user adds - % \onehalfspacing or \doublespace. - \vspace{\Dfootnotesep}% - \Dsetfootnotespacing% - \Dformatfootnote{#1}% - }% - }% -} -\providecommand{\DNcitation}[1]{\DNfootnote{#1}} -\providecommand{\Dformatfootnotelabel}[1]{% - % Keep \footnotesize in footnote labels (\textsuperscript would - % reduce the font size even more). - \textsuperscript{\footnotesize#1{ }}% -} -\providecommand{\Dformatcitationlabel}[1]{{[}#1{]}{ }} -\providecommand{\Dformatmultiplebackrefs}[1]{% - % If in printing mode, do not write out multiple backrefs. - \ifthenelse{\equal{\Dprinting}{true}}{}{\textsl{#1}}% -} -\providecommand{\Dthislabel}{} -\providecommand{\DNlabel}[1]{% - % Footnote or citatation label. - \renewcommand{\Dthislabel}{#1}% - \ifthenelse{\not\equal{\DEVsinglebackref}{}}{% - \let\Doriginallabel=\Dthislabel% - \def\Dthislabel{% - \Dsinglefootnotebacklink{\DEVsinglebackref}{\Doriginallabel}% - }% - }{}% - \ifthenelse{\equal{\DEVparent}{footnote}}{% - % Footnote label. - \Dformatfootnotelabel{\Dthislabel}% - }{% - \ifthenelse{\equal{\DEVparent}{citation}}{% - % Citation label. - \Dformatcitationlabel{\Dthislabel}% - }{}% - }% - % If there are multiple backrefs, add them now. - \Dformatmultiplebackrefs{\DEVmultiplebackrefs}% -} -\providecommand{\Dsinglefootnotebacklink}[2]{% - % Create normal backlink of a footnote label. Parameters: - % 1. ID. - % 2. Link text. - % Treat like a footnote reference. - \Dimplicitfootnotereference{\##1}{#2}% -} -\providecommand{\DECmultifootnotebacklink}[2]{% - % Create generated backlink, as in (1, 2). Parameters: - % 1. ID. - % 2. Link text. - % Treat like a footnote reference. - \Dimplicitfootnotereference{\##1}{#2}% -} -\providecommand{\Dsinglecitationbacklink}[2]{\Dsinglefootnotebacklink{#1}{#2}} -\providecommand{\DECmulticitationbacklink}[2]{\DECmultifootnotebacklink{#1}{#2}} - - -\providecommand{\DECmaketable}[2]{% - % Make table. Parameters: - % 1. Table spec (like "|p|p|"). - % 2. Table contents. - {% - \ifthenelse{\equal{\Dinsidetabular}{true}}{% - % Inside longtable; we cannot have nested longtables. - \begin{tabular}{#1}% - \hline% - #2% - \end{tabular}% - }{% - \renewcommand{\Dinsidetabular}{true}% - \begin{longtable}{#1}% - \hline% - #2% - \end{longtable}% - }% - }% -} -\providecommand{\DNthead}[1]{% - #1% - \endhead% -} -\providecommand{\DNrow}[1]{% - #1\tabularnewline% - \hline% -} -\providecommand{\Dinsidemulticolumn}{false} -\providecommand{\Dcompensatingmulticol}[3]{% - \multicolumn{#1}{#2}{% - {% - \renewcommand{\Dinsidemulticolumn}{true}% - % Compensate for weird missing vertical space at top of paragraph. - \raisebox{-2.5pt}{#3}% - }% - }% -} -\providecommand{\DECcolspan}[2]{% - % Take care of the morecols attribute (but incremented by 1). - &% - \Dcompensatingmulticol{#1}{l|}{#2}% -} -\providecommand{\DECcolspanleft}[2]{% - % Like \Dmorecols, but called for the leftmost entries in a table - % row. - \Dcompensatingmulticol{#1}{|l|}{#2}% -} -\providecommand{\DECsubsequententry}[1]{% - % -} -\providecommand{\DNentry}[1]{% - % The following sequence adds minimal vertical space above the top - % lines of the first cell paragraph, so that vertical space is - % balanced at the top and bottom of table cells. - \ifthenelse{\equal{\Dinsidemulticolumn}{false}}{% - \vspace{-1em}\vspace{-\parskip}\par% - }{}% - #1% - % No need to add an ampersand ("&"); that's done by \DECsubsequententry. -} -\providecommand{\DAtableheaderentry}[5]{\Dformattableheaderentry{#5}} -\providecommand{\Dformattableheaderentry}[1]{{\bfseries#1}} - - -\providecommand{\DNsystemmessage}[1]{% - {% - \ifthenelse{\equal{\Dprinting}{false}}{\color{red}}{}% - \bfseries% - #1% - }% -} - - -\providecommand{\Dinsidehalign}{false} -\newsavebox{\Dalignedimagebox} -\Dprovidelength{\Dalignedimagewidth}{0pt} -\providecommand{\Dhalign}[2]{% - % Horizontally align the contents to the left or right so that the - % text flows around it. - % Parameters: - % 1. l or r - % 2. Contents. - \renewcommand{\Dinsidehalign}{true}% - % For some obscure reason \parpic consumes some vertical space. - \vspace{-3pt}% - % Now we do something *really* ugly, but this enables us to wrap the - % image in a minipage while still allowing tight frames when - % class=border (see \DNimageCborder). - \sbox{\Dalignedimagebox}{#2}% - \settowidth{\Dalignedimagewidth}{\usebox{\Dalignedimagebox}}% - \parpic[#1]{% - \begin{minipage}[b]{\Dalignedimagewidth}% - % Compensate for previously added space, but not entirely. - \vspace*{2.0pt}% - \vspace*{\Dfloatimagetopmargin}% - \usebox{\Dalignedimagebox}% - \vspace*{1.5pt}% - \vspace*{\Dfloatimagebottommargin}% - \end{minipage}% - }% - \renewcommand{\Dinsidehalign}{false}% -} - - -% Maximum width of an image. -\providecommand{\Dimagemaxwidth}{\linewidth} -\providecommand{\Dfloatimagemaxwidth}{0.5\linewidth} -% Auxiliary variable. -\Dprovidelength{\Dcurrentimagewidth}{0pt} -\providecommand{\DNimageAalign}[5]{% - \ifthenelse{\equal{#3}{left}}{% - \Dhalign{l}{#5}% - }{% - \ifthenelse{\equal{#3}{right}}{% - \Dhalign{r}{#5}% - }{% - \ifthenelse{\equal{#3}{center}}{% - % Text floating around centered figures is a bad idea. Thus - % we use a center environment. Note that no extra space is - % added by the writer, so the space added by the center - % environment is fine. - \begin{center}#5\end{center}% - }{% - #5% - }% - }% - }% -} -% Base path for images. -\providecommand{\Dimagebase}{} -% Auxiliary command. Current image path. -\providecommand{\Dimagepath}{} -\providecommand{\DNimageAuri}[5]{% - % Insert image. We treat the URI like a path here. - \renewcommand{\Dimagepath}{\Dimagebase#3}% - \Difdefined{DcurrentNimageAwidth}{% - \Dwidthimage{\DEVcurrentNimageAwidth}{\Dimagepath}% - }{% - \Dsimpleimage{\Dimagepath}% - }% -} -\Dprovidelength{\Dfloatimagevmargin}{0pt} -\providecommand{\Dfloatimagetopmargin}{\Dfloatimagevmargin} -\providecommand{\Dfloatimagebottommargin}{\Dfloatimagevmargin} -\providecommand{\Dwidthimage}[2]{% - % Image with specified width. - % Parameters: - % 1. Image width. - % 2. Image path. - % Need to make bottom-alignment dependent on align attribute (add - % functional test first). Need to observe height attribute. - %\begin{minipage}[b]{#1}% - \includegraphics[width=#1,height=\textheight,keepaspectratio]{#2}% - %\end{minipage}% -} -\providecommand{\Dcurrentimagemaxwidth}{} -\providecommand{\Dsimpleimage}[1]{% - % Insert image, without much parametrization. - \settowidth{\Dcurrentimagewidth}{\includegraphics{#1}}% - \ifthenelse{\equal{\Dinsidehalign}{true}}{% - \renewcommand{\Dcurrentimagemaxwidth}{\Dfloatimagemaxwidth}% - }{% - \renewcommand{\Dcurrentimagemaxwidth}{\Dimagemaxwidth}% - }% - \ifthenelse{\lengthtest{\Dcurrentimagewidth>\Dcurrentimagemaxwidth}}{% - \Dwidthimage{\Dcurrentimagemaxwidth}{#1}% - }{% - \Dwidthimage{\Dcurrentimagewidth}{#1}% - }% -} -\providecommand{\Dwidthimage}[2]{% - % Image with specified width. - % Parameters: - % 1. Image width. - % 2. Image path. - \Dwidthimage{#1}{#2}% -} - -% Figures. -\providecommand{\DNfigureAalign}[5]{% - % Hack to make it work Right Now. - %\def\DEVcurrentNimageAwidth{\DEVcurrentNfigureAwidth}% - % - %\def\DEVcurrentNimageAwidth{\linewidth}% - \DNimageAalign{#1}{#2}{#3}{#4}{% - \begin{minipage}[b]{0.4\linewidth}#5\end{minipage}}% - %\let\DEVcurrentNimageAwidth=\relax% - % - %\let\DEVcurrentNimageAwidth=\relax% -} -\providecommand{\DNcaption}[1]{\par\noindent{\slshape#1}} -\providecommand{\DNlegend}[1]{\DECauxiliaryspace#1} - -\providecommand{\DCborder}[1]{\fbox{#1}} -% No padding between image and border. -\providecommand{\DNimageCborder}[1]{\frame{#1}} - - -% Need to replace with language-specific stuff. Maybe look at -% csquotes.sty and ask the author for permission to use parts of it. -\providecommand{\DECtextleftdblquote}{``} -\providecommand{\DECtextrightdblquote}{''} - -% Table of contents: -\Dprovidelength{\Dtocininitialsectnumwidth}{2.4em} -\Dprovidelength{\Dtocadditionalsectnumwidth}{0.7em} -% Level inside a table of contents. While this is at -1, we are not -% inside a TOC. -\Dprovidecounter{Dtoclevel}{-1}% -\providecommand{\Dlocaltoc}{false}% -\providecommand{\DNtopicClocal}[1]{% - \renewcommand{\Dlocaltoc}{true}% - \addtolength{\Dtocsectnumwidth}{2\Dtocadditionalsectnumwidth}% - \addtolength{\Dtocindent}{-2\Dtocadditionalsectnumwidth}% - #1% - \addtolength{\Dtocindent}{2\Dtocadditionalsectnumwidth}% - \addtolength{\Dtocsectnumwidth}{-2\Dtocadditionalsectnumwidth}% - \renewcommand{\Dlocaltoc}{false}% -} -\Dprovidelength{\Dtocindent}{0pt}% -\Dprovidelength{\Dtocsectnumwidth}{\Dtocininitialsectnumwidth} -% Compensate for one additional TOC indentation space so that the -% top-level is unindented. -\addtolength{\Dtocsectnumwidth}{-\Dtocadditionalsectnumwidth} -\addtolength{\Dtocindent}{-\Dtocsectnumwidth} -\providecommand{\Difinsidetoc}[2]{% - \ifthenelse{\not\equal{\theDtoclevel}{-1}}{#1}{#2}% -} -\providecommand{\DNgeneratedCsectnum}[1]{% - \Difinsidetoc{% - % Section number inside TOC. - \makebox[\Dtocsectnumwidth][l]{#1}% - }{% - % Section number inside section title. - #1\quad% - }% -} -\providecommand{\Dtocbulletlist}[1]{% - \addtocounter{Dtoclevel}{1}% - \addtolength{\Dtocindent}{\Dtocsectnumwidth}% - \addtolength{\Dtocsectnumwidth}{\Dtocadditionalsectnumwidth}% - #1% - \addtolength{\Dtocsectnumwidth}{-\Dtocadditionalsectnumwidth}% - \addtolength{\Dtocindent}{-\Dtocsectnumwidth}% - \addtocounter{Dtoclevel}{-1}% -} - - -% For \DECpixelunit, the length value is pre-multiplied with 0.75, so by -% specifying "pt" we get the same notion of "pixel" as graphicx. -\providecommand{\DECpixelunit}{pt} -% Normally lengths are relative to the current linewidth. -\providecommand{\DECrelativeunit}{\linewidth} - - -% ACTION: These commands actually *do* something. -% Ultimately, everything should be done here, and no active content should be -% above (not even \usepackage). - -\DSearly -\DSpackages -\DSfrenchspacing -\DSsymbols -\DSlate - -\makeatother - - \usepackage{fancyvrb} diff --git a/scikits/learn/em/doc/examples/demo1.py b/scikits/learn/em/doc/examples/demo1.py deleted file mode 100644 index 13ed0768ab88837be415d8c3e2a1ee39ada546c0..0000000000000000000000000000000000000000 --- a/scikits/learn/em/doc/examples/demo1.py +++ /dev/null @@ -1,109 +0,0 @@ -#! /usr/bin/env python - -# Example of use of pyem toolbox. Feel free to change parameters -# such as dimension, number of components, mode of covariance. -# -# You can also try less trivial things such as adding outliers, sampling -# a mixture with full covariance and estimating it with a mixture with diagonal -# gaussians (replace the mode of the learned model lgm) -# -# Later, I hope to add functions for number of component estimation using eg BIC - -import numpy as N -from numpy.random import seed - -from scipy.sandbox.pyem import GM, GMM, EM -import copy - -seed(1) -#+++++++++++++++++++++++++++++ -# Meta parameters of the model -# - k: Number of components -# - d: dimension of each Gaussian -# - mode: Mode of covariance matrix: full or diag (string) -# - nframes: number of frames (frame = one data point = one -# row of d elements) -k = 2 -d = 2 -mode = 'diag' -nframes = 1e3 - -#+++++++++++++++++++++++++++++++++++++++++++ -# Create an artificial GM model, samples it -#+++++++++++++++++++++++++++++++++++++++++++ -w, mu, va = GM.gen_param(d, k, mode, spread = 1.5) -gm = GM.fromvalues(w, mu, va) - -# Sample nframes frames from the model -data = gm.sample(nframes) - -#++++++++++++++++++++++++ -# Learn the model with EM -#++++++++++++++++++++++++ - -# Init the model -lgm = GM(d, k, mode) -gmm = GMM(lgm, 'kmean') -gmm.init(data) - -# Keep a copy for drawing later -gm0 = copy.copy(lgm) - -# The actual EM, with likelihood computation. The threshold -# is compared to the (linearly appromixated) derivative of the likelihood -em = EM() -like = em.train(data, gmm, maxiter = 30, thresh = 1e-8) - -#+++++++++++++++ -# Draw the model -#+++++++++++++++ -import pylab as P -P.subplot(2, 1, 1) - -# Level is the confidence level for confidence ellipsoids: 1.0 means that -# all points will be (almost surely) inside the ellipsoid -level = 0.8 -if not d == 1: - P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_') - - # h keeps the handles of the plot, so that you can modify - # its parameters like label or color - h = gm.plot(level = level) - [i.set_color('g') for i in h] - h[0].set_label('true confidence ellipsoides') - - # Initial confidence ellipses as found by kmean - h = gm0.plot(level = level) - [i.set_color('k') for i in h] - h[0].set_label('kmean confidence ellipsoides') - - # Values found by EM - h = lgm.plot(level = level) - [i.set_color('r') for i in h] - h[0].set_label('EM confidence ellipsoides') - - P.legend(loc = 0) -else: - # The 1d plotting function is quite elaborate: the confidence - # interval are represented by filled areas, the pdf of the mixture and - # the pdf of each component is drawn (optional) - h = gm.plot1d(level = level) - [i.set_color('g') for i in h['pdf']] - h['pdf'][0].set_label('true pdf') - - h0 = gm0.plot1d(level = level) - [i.set_color('k') for i in h0['pdf']] - h0['pdf'][0].set_label('initial pdf') - - hl = lgm.plot1d(fill = 1, level = level) - [i.set_color('r') for i in hl['pdf']] - hl['pdf'][0].set_label('pdf found by EM') - - P.legend(loc = 0) - -P.subplot(2, 1, 2) -P.plot(like) -P.title('log likelihood') - -P.show() -# P.save('2d diag.png') diff --git a/scikits/learn/em/doc/examples/demo2.py b/scikits/learn/em/doc/examples/demo2.py deleted file mode 100644 index d87b5a39069944ac052f9cba1ea4df3ed7122384..0000000000000000000000000000000000000000 --- a/scikits/learn/em/doc/examples/demo2.py +++ /dev/null @@ -1,104 +0,0 @@ -#! /usr/bin/env python - -# Example of use of pyem toolbox. Feel free to change parameters -# such as dimension, number of components, mode of covariance. -# -# You can also try less trivial things such as adding outliers, sampling -# a mixture with full covariance and estimating it with a mixture with diagonal -# gaussians (replace the mode of the learned model lgm) -# -# Later, I hope to add functions for number of component estimation using eg BIC - -import numpy as N -from numpy.random import seed - -from scipy.sandbox.pyem import GM, GMM, EM -import copy - -seed(2) -#+++++++++++++++++++++++++++++ -# Meta parameters of the model -# - k: Number of components -# - d: dimension of each Gaussian -# - mode: Mode of covariance matrix: full or diag (string) -# - nframes: number of frames (frame = one data point = one -# row of d elements) -k = 4 -d = 2 -mode = 'diag' -nframes = 1e3 - -#+++++++++++++++++++++++++++++++++++++++++++ -# Create an artificial GMM model, samples it -#+++++++++++++++++++++++++++++++++++++++++++ -w, mu, va = GM.gen_param(d, k, mode, spread = 1.0) -gm = GM.fromvalues(w, mu, va) - -# Sample nframes frames from the model -data = gm.sample(nframes) - -#++++++++++++++++++++++++ -# Learn the model with EM -#++++++++++++++++++++++++ - -lgm = [] -kmax = 6 -bics = N.zeros(kmax) -for i in range(kmax): - # Init the model with an empty Gaussian Mixture, and create a Gaussian - # Mixture Model from it - lgm.append(GM(d, i+1, mode)) - gmm = GMM(lgm[i], 'kmean') - - # The actual EM, with likelihood computation. The threshold - # is compared to the (linearly appromixated) derivative of the likelihood - em = EM() - em.train(data, gmm, maxiter = 30, thresh = 1e-10) - bics[i] = gmm.bic(data) - -print "Original model has %d clusters, bics says %d" % (k, N.argmax(bics)+1) - -#+++++++++++++++ -# Draw the model -#+++++++++++++++ -import pylab as P -P.subplot(3, 2, 1) - -for k in range(kmax): - P.subplot(3, 2, k+1) - # Level is the confidence level for confidence ellipsoids: 1.0 means that - # all points will be (almost surely) inside the ellipsoid - level = 0.8 - if not d == 1: - P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_') - - # h keeps the handles of the plot, so that you can modify - # its parameters like label or color - h = lgm[k].plot(level = level) - [i.set_color('r') for i in h] - h[0].set_label('EM confidence ellipsoides') - - h = gm.plot(level = level) - [i.set_color('g') for i in h] - h[0].set_label('Real confidence ellipsoides') - else: - # The 1d plotting function is quite elaborate: the confidence - # interval are represented by filled areas, the pdf of the mixture and - # the pdf of each component is drawn (optional) - h = gm.plot1d(level = level) - [i.set_color('g') for i in h['pdf']] - h['pdf'][0].set_label('true pdf') - - h0 = gm0.plot1d(level = level) - [i.set_color('k') for i in h0['pdf']] - h0['pdf'][0].set_label('initial pdf') - - hl = lgm.plot1d(fill = 1, level = level) - [i.set_color('r') for i in hl['pdf']] - hl['pdf'][0].set_label('pdf found by EM') - - P.legend(loc = 0) - -P.legend(loc = 0) -P.show() -# P.save('2d diag.png') diff --git a/scikits/learn/em/doc/tutorial.pdf b/scikits/learn/em/doc/tutorial.pdf deleted file mode 100644 index 8fb02ec6867870ae77312a4367f1a55acb73f329..0000000000000000000000000000000000000000 Binary files a/scikits/learn/em/doc/tutorial.pdf and /dev/null differ diff --git a/scikits/learn/em/doc/user.tex b/scikits/learn/em/doc/user.tex deleted file mode 100644 index 6ea70994cc65b29a6d4bf8b46ec4c0825b3e8a1d..0000000000000000000000000000000000000000 --- a/scikits/learn/em/doc/user.tex +++ /dev/null @@ -1,64 +0,0 @@ -% Last Change: Wed Jan 31 08:00 PM 2007 J -% vim:syntax=tex - -\newcommand\at{@} -\newcommand\lb{[} -\newcommand\rb{]} -\newcommand\Cba[1]{\textcolor[rgb]{0.67,0.13,1.00}{\textbf{#1}}} -\newcommand\Caz[1]{\textcolor[rgb]{0.00,0.25,0.82}{#1}} -\newcommand\Cay[1]{\textcolor[rgb]{0.67,0.13,1.00}{#1}} -\newcommand\Cax[1]{\textcolor[rgb]{0.00,0.63,0.00}{#1}} -\newcommand\Cbc[1]{\textcolor[rgb]{0.40,0.40,0.40}{#1}} -\newcommand\Cas[1]{\textcolor[rgb]{0.40,0.40,0.40}{#1}} -\newcommand\Car[1]{\textcolor[rgb]{0.72,0.53,0.04}{#1}} -\newcommand\Caq[1]{\textcolor[rgb]{0.73,0.27,0.27}{\textit{#1}}} -\newcommand\Cap[1]{\textcolor[rgb]{0.72,0.53,0.04}{#1}} -\newcommand\Caw[1]{\textcolor[rgb]{0.67,0.13,1.00}{\textbf{#1}}} -\newcommand\Cav[1]{\textcolor[rgb]{0.60,0.60,0.60}{\textbf{#1}}} -\newcommand\Cau[1]{\textcolor[rgb]{0.40,0.40,0.40}{#1}} -\newcommand\Cat[1]{\textcolor[rgb]{0.67,0.13,1.00}{\textbf{#1}}} -\newcommand\Cak[1]{\textbf{#1}} -\newcommand\Caj[1]{\textcolor[rgb]{0.73,0.40,0.53}{#1}} -\newcommand\Cai[1]{\textcolor[rgb]{0.72,0.53,0.04}{#1}} -\newcommand\Cah[1]{\textcolor[rgb]{0.63,0.63,0.00}{#1}} -\newcommand\Cao[1]{\textcolor[rgb]{0.53,0.00,0.00}{#1}} -\newcommand\Can[1]{\textcolor[rgb]{0.00,0.50,0.00}{#1}} -\newcommand\Cam[1]{\textcolor[rgb]{0.73,0.40,0.13}{\textbf{#1}}} -\newcommand\Cal[1]{\textcolor[rgb]{0.67,0.13,1.00}{\textbf{#1}}} -\newcommand\Cac[1]{\textcolor[rgb]{0.73,0.27,0.27}{#1}} -\newcommand\Cab[1]{\textit{#1}} -\newcommand\Caa[1]{\textcolor[rgb]{0.50,0.50,0.50}{#1}} -\newcommand\Cag[1]{\textcolor[rgb]{0.40,0.40,0.40}{#1}} -\newcommand\Caf[1]{\textcolor[rgb]{0.00,0.53,0.00}{\textit{#1}}} -\newcommand\Cae[1]{\textcolor[rgb]{0.40,0.40,0.40}{#1}} -\newcommand\Cad[1]{\textcolor[rgb]{0.73,0.27,0.27}{#1}} -\newcommand\Cbb[1]{\textcolor[rgb]{0.73,0.27,0.27}{#1}} -\newcommand\CaZ[1]{\textcolor[rgb]{0.40,0.40,0.40}{#1}} -\newcommand\CaY[1]{\textcolor[rgb]{0.00,0.00,0.50}{\textbf{#1}}} -\newcommand\CaX[1]{\textcolor[rgb]{0.00,0.50,0.00}{\textbf{#1}}} -\newcommand\Cbd[1]{\textcolor[rgb]{0.73,0.40,0.53}{\textbf{#1}}} -\newcommand\Cbe[1]{\textcolor[rgb]{0.67,0.13,1.00}{\textbf{#1}}} -\newcommand\CaS[1]{\textcolor[rgb]{0.50,0.00,0.50}{\textbf{#1}}} -\newcommand\CaR[1]{\textcolor[rgb]{0.00,0.53,0.00}{\textit{#1}}} -\newcommand\CaQ[1]{\textcolor[rgb]{0.72,0.53,0.04}{#1}} -\newcommand\CaP[1]{\textcolor[rgb]{0.40,0.40,0.40}{#1}} -\newcommand\CaW[1]{\textcolor[rgb]{0.73,0.27,0.27}{#1}} -\newcommand\CaV[1]{\textcolor[rgb]{0.67,0.13,1.00}{#1}} -\newcommand\CaU[1]{\textcolor[rgb]{0.73,0.27,0.27}{#1}} -\newcommand\CaT[1]{\textcolor[rgb]{0.00,0.00,1.00}{\textbf{#1}}} -\newcommand\CaK[1]{\textcolor[rgb]{0.67,0.13,1.00}{#1}} -\newcommand\CaJ[1]{\textcolor[rgb]{0.00,0.63,0.00}{#1}} -\newcommand\CaI[1]{\textcolor[rgb]{0.73,0.27,0.27}{#1}} -\newcommand\CaH[1]{\textcolor[rgb]{0.67,0.13,1.00}{\textbf{#1}}} -\newcommand\CaO[1]{\textcolor[rgb]{0.73,0.27,0.27}{#1}} -\newcommand\CaN[1]{\textcolor[rgb]{0.00,0.00,0.50}{\textbf{#1}}} -\newcommand\CaM[1]{\textcolor[rgb]{0.00,0.00,1.00}{#1}} -\newcommand\CaL[1]{\textcolor[rgb]{0.00,0.53,0.00}{#1}} -\newcommand\CaC[1]{\textcolor[rgb]{0.00,0.53,0.00}{\textit{#1}}} -\newcommand\CaB[1]{\textcolor[rgb]{0.82,0.25,0.23}{\textbf{#1}}} -\newcommand\CaA[1]{\textcolor[rgb]{0.67,0.13,1.00}{#1}} -\newcommand\CaG[1]{\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{#1}} -\newcommand\CaF[1]{\textcolor[rgb]{0.72,0.53,0.04}{#1}} -\newcommand\CaE[1]{\textcolor[rgb]{1.00,0.00,0.00}{#1}} -\newcommand\CaD[1]{\textcolor[rgb]{0.63,0.00,0.00}{#1}} - diff --git a/scikits/learn/em/gauss_mix.py b/scikits/learn/em/gauss_mix.py deleted file mode 100644 index 07d97607bd1c55a71c1c18caa73d53180d30f57c..0000000000000000000000000000000000000000 --- a/scikits/learn/em/gauss_mix.py +++ /dev/null @@ -1,728 +0,0 @@ -# /usr/bin/python -# Last Change: Tue Jul 17 11:00 PM 2007 J - -"""Module implementing GM, a class which represents Gaussian mixtures. - -GM instances can be used to create, sample mixtures. They also provide -different plotting facilities, such as isodensity contour for multi dimensional -models, ellipses of confidence.""" - -__docformat__ = 'restructuredtext' - -import numpy as N -from numpy.random import randn, rand -import numpy.linalg as lin -import densities as D -import misc - -# Right now, two main usages of a Gaussian Model are possible -# - init a Gaussian Model with meta-parameters, and trains it -# - set-up a Gaussian Model to sample it, draw ellipsoides -# of confidences. In this case, we would like to init it with -# known values of parameters. This can be done with the class method -# fromval - -# TODO: -# - change bounds methods of GM class instanciations so that it cannot -# be used as long as w, mu and va are not set -# - We have to use scipy now for chisquare pdf, so there may be other -# methods to be used, ie for implementing random index. -# - there is no check on internal state of the GM, that is does w, mu and va -# values make sense (eg singular values) - plot1d is still very rhough. There -# should be a sensible way to modify the result plot (maybe returns a dic -# with global pdf, component pdf and fill matplotlib handles). Should be -# coherent with plot -class GmParamError(Exception): - """Exception raised for errors in gmm params - - Attributes: - expression -- input expression in which the error occurred - message -- explanation of the error - """ - def __init__(self, message): - Exception.__init__(self) - self.message = message - - def __str__(self): - return self.message - -class GM: - """Gaussian Mixture class. This is a simple container class - to hold Gaussian Mixture parameters (weights, mean, etc...). - It can also draw itself (confidence ellipses) and samples itself. - """ - - # I am not sure it is useful to have a spherical mode... - _cov_mod = ['diag', 'full'] - - #=============================== - # Methods to construct a mixture - #=============================== - def __init__(self, d, k, mode = 'diag'): - """Init a Gaussian Mixture. - - :Parameters: - d : int - dimension of the mixture. - k : int - number of component in the mixture. - mode : string - mode of covariance - - :Returns: - an instance of GM. - - Note - ---- - - Only full and diag mode are supported for now. - - :SeeAlso: - If you want to build a Gaussian Mixture with knowns weights, means - and variances, you can use GM.fromvalues method directly""" - if mode not in self._cov_mod: - raise GmParamError("mode %s not recognized" + str(mode)) - - self.d = d - self.k = k - self.mode = mode - - # Init to 0 all parameters, with the right dimensions. - # Not sure this is useful in python from an efficiency POV ? - self.w = N.zeros(k) - self.mu = N.zeros((k, d)) - if mode == 'diag': - self.va = N.zeros((k, d)) - elif mode == 'full': - self.va = N.zeros((k * d, d)) - - self.__is_valid = False - if d > 1: - self.__is1d = False - else: - self.__is1d = True - - def set_param(self, weights, mu, sigma): - """Set parameters of the model. - - Args should be conformant with metparameters d and k given during - initialisation. - - :Parameters: - weights : ndarray - weights of the mixture (k elements) - mu : ndarray - means of the mixture. One component's mean per row, k row for k - components. - sigma : ndarray - variances of the mixture. For diagonal models, one row contains - the diagonal elements of the covariance matrix. For full - covariance, d rows for one variance. - - Examples - -------- - Create a 3 component, 2 dimension mixture with full covariance matrices - - >>> w = numpy.array([0.2, 0.5, 0.3]) - >>> mu = numpy.array([[0., 0.], [1., 1.]]) - >>> va = numpy.array([[1., 0.], [0., 1.], [2., 0.5], [0.5, 1]]) - >>> gm = GM(2, 3, 'full') - >>> gm.set_param(w, mu, va) - - :SeeAlso: - If you know already the parameters when creating the model, you can - simply use the method class GM.fromvalues.""" - #XXX: when fromvalues is called, parameters are called twice... - k, d, mode = check_gmm_param(weights, mu, sigma) - if not k == self.k: - raise GmParamError("Number of given components is %d, expected %d" - % (k, self.k)) - if not d == self.d: - raise GmParamError("Dimension of the given model is %d, "\ - "expected %d" % (d, self.d)) - if not mode == self.mode and not d == 1: - raise GmParamError("Given covariance mode is %s, expected %s" - % (mode, self.mode)) - self.w = weights - self.mu = mu - self.va = sigma - - self.__is_valid = True - - @classmethod - def fromvalues(cls, weights, mu, sigma): - """This class method can be used to create a GM model - directly from its parameters weights, mean and variance - - :Parameters: - weights : ndarray - weights of the mixture (k elements) - mu : ndarray - means of the mixture. One component's mean per row, k row for k - components. - sigma : ndarray - variances of the mixture. For diagonal models, one row contains - the diagonal elements of the covariance matrix. For full - covariance, d rows for one variance. - - :Returns: - gm : GM - an instance of GM. - - Examples - -------- - - >>> w, mu, va = GM.gen_param(d, k) - >>> gm = GM(d, k) - >>> gm.set_param(w, mu, va) - - and - - >>> w, mu, va = GM.gen_param(d, k) - >>> gm = GM.fromvalue(w, mu, va) - - are strictly equivalent.""" - k, d, mode = check_gmm_param(weights, mu, sigma) - res = cls(d, k, mode) - res.set_param(weights, mu, sigma) - return res - - #===================================================== - # Fundamental facilities (sampling, confidence, etc..) - #===================================================== - def sample(self, nframes): - """ Sample nframes frames from the model. - - :Parameters: - nframes : int - number of samples to draw. - - :Returns: - samples : ndarray - samples in the format one sample per row (nframes, d).""" - if not self.__is_valid: - raise GmParamError("""Parameters of the model has not been - set yet, please set them using self.set_param()""") - - # State index (ie hidden var) - sti = gen_rand_index(self.w, nframes) - # standard gaussian samples - x = randn(nframes, self.d) - - if self.mode == 'diag': - x = self.mu[sti, :] + x * N.sqrt(self.va[sti, :]) - elif self.mode == 'full': - # Faster: - cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1])) - for i in range(self.k): - # Using cholesky looks more stable than sqrtm; sqrtm is not - # available in numpy anyway, only in scipy... - cho[i] = lin.cholesky(self.va[i*self.d:i*self.d+self.d, :]) - - for s in range(self.k): - tmpind = N.where(sti == s)[0] - x[tmpind] = N.dot(x[tmpind], cho[s].T) + self.mu[s] - else: - raise GmParamError("cov matrix mode not recognized, "\ - "this is a bug !") - - return x - - def conf_ellipses(self, dim = misc.DEF_VIS_DIM, npoints = misc.DEF_ELL_NP, - level = misc.DEF_LEVEL): - """Returns a list of confidence ellipsoids describing the Gmm - defined by mu and va. Check densities.gauss_ell for details - - :Parameters: - dim : sequence - sequences of two integers which represent the dimensions where to - project the ellipsoid. - npoints : int - number of points to generate for the ellipse. - level : float - level of confidence (between 0 and 1). - - :Returns: - xe : sequence - a list of x coordinates for the ellipses (Xe[i] is the array - containing x coordinates of the ith Gaussian) - ye : sequence - a list of y coordinates for the ellipses. - - Examples - -------- - Suppose we have w, mu and va as parameters for a mixture, then: - - >>> gm = GM(d, k) - >>> gm.set_param(w, mu, va) - >>> X = gm.sample(1000) - >>> Xe, Ye = gm.conf_ellipsoids() - >>> pylab.plot(X[:,0], X[:, 1], '.') - >>> for k in len(w): - ... pylab.plot(Xe[k], Ye[k], 'r') - - Will plot samples X draw from the mixture model, and - plot the ellipses of equi-probability from the mean with - default level of confidence.""" - if self.__is1d: - raise ValueError("This function does not make sense for 1d " - "mixtures.") - - if not self.__is_valid: - raise GmParamError("""Parameters of the model has not been - set yet, please set them using self.set_param()""") - - xe = [] - ye = [] - if self.mode == 'diag': - for i in range(self.k): - x, y = D.gauss_ell(self.mu[i, :], self.va[i, :], - dim, npoints, level) - xe.append(x) - ye.append(y) - elif self.mode == 'full': - for i in range(self.k): - x, y = D.gauss_ell(self.mu[i, :], - self.va[i*self.d:i*self.d+self.d, :], - dim, npoints, level) - xe.append(x) - ye.append(y) - - return xe, ye - - def check_state(self): - """Returns true if the parameters of the model are valid. - - For Gaussian mixtures, this means weights summing to 1, and variances - to be positive definite. - """ - if not self.__is_valid: - raise GmParamError("Parameters of the model has not been"\ - "set yet, please set them using self.set_param()") - - # Check condition number for cov matrix - if self.mode == 'diag': - tinfo = N.finfo(self.va.dtype) - if N.any(self.va < tinfo.eps): - raise GmParamError("variances are singular") - elif self.mode == 'full': - try: - d = self.d - for i in range(self.k): - N.linalg.cholesky(self.va[i*d:i*d+d, :]) - except N.linalg.LinAlgError: - raise GmParamError("matrix %d is singular " % i) - - else: - raise GmParamError("Unknown mode") - - return True - - @classmethod - def gen_param(cls, d, nc, mode = 'diag', spread = 1): - """Generate random, valid parameters for a gaussian mixture model. - - :Parameters: - d : int - the dimension - nc : int - the number of components - mode : string - covariance matrix mode ('full' or 'diag'). - - :Returns: - w : ndarray - weights of the mixture - mu : ndarray - means of the mixture - w : ndarray - variances of the mixture - - Notes - ----- - This is a class method. - """ - w = N.abs(randn(nc)) - w = w / sum(w, 0) - - mu = spread * N.sqrt(d) * randn(nc, d) - if mode == 'diag': - va = N.abs(randn(nc, d)) - elif mode == 'full': - # If A is invertible, A'A is positive definite - va = randn(nc * d, d) - for k in range(nc): - va[k*d:k*d+d] = N.dot( va[k*d:k*d+d], - va[k*d:k*d+d].transpose()) - else: - raise GmParamError('cov matrix mode not recognized') - - return w, mu, va - - #gen_param = classmethod(gen_param) - - def pdf(self, x, log = False): - """Computes the pdf of the model at given points. - - :Parameters: - x : ndarray - points where to estimate the pdf. One row for one - multi-dimensional sample (eg to estimate the pdf at 100 - different points in 10 dimension, data's shape should be (100, - 20)). - log : bool - If true, returns the log pdf instead of the pdf. - - :Returns: - y : ndarray - the pdf at points x.""" - if log: - return D.logsumexp( - D.multiple_gauss_den(x, self.mu, self.va, log = True) - + N.log(self.w)) - else: - return N.sum(D.multiple_gauss_den(x, self.mu, self.va) * self.w, 1) - - def pdf_comp(self, x, cid, log = False): - """Computes the pdf of the model at given points, at given component. - - :Parameters: - x : ndarray - points where to estimate the pdf. One row for one - multi-dimensional sample (eg to estimate the pdf at 100 - different points in 10 dimension, data's shape should be (100, - 20)). - cid: int - the component index. - log : bool - If true, returns the log pdf instead of the pdf. - - :Returns: - y : ndarray - the pdf at points x.""" - if self.mode == 'diag': - va = self.va[cid] - elif self.mode == 'full': - va = self.va[cid*self.d:(cid+1)*self.d] - else: - raise GmParamError("""var mode %s not supported""" % self.mode) - - if log: - return D.gauss_den(x, self.mu[cid], va, log = True) \ - + N.log(self.w[cid]) - else: - return D.multiple_gauss_den(x, self.mu[cid], va) * self.w[cid] - - #================= - # Plotting methods - #================= - def plot(self, dim = misc.DEF_VIS_DIM, npoints = misc.DEF_ELL_NP, - level = misc.DEF_LEVEL): - """Plot the ellipsoides directly for the model - - Returns a list of lines handle, so that their style can be modified. By - default, the style is red color, and nolegend for all of them. - - :Parameters: - dim : sequence - sequence of two integers, the dimensions of interest. - npoints : int - Number of points to use for the ellipsoids. - level : int - level of confidence (to use with fill argument) - - :Returns: - h : sequence - Returns a list of lines handle so that their properties - can be modified (eg color, label, etc...): - - Note - ---- - Does not work for 1d. Requires matplotlib - - :SeeAlso: - conf_ellipses is used to compute the ellipses. Use this if you want - to plot with something else than matplotlib.""" - if self.__is1d: - raise ValueError("This function does not make sense for 1d " - "mixtures.") - - if not self.__is_valid: - raise GmParamError("""Parameters of the model has not been - set yet, please set them using self.set_param()""") - - k = self.k - xe, ye = self.conf_ellipses(dim, npoints, level) - try: - import pylab as P - return [P.plot(xe[i], ye[i], 'r', label='_nolegend_')[0] for i in - range(k)] - except ImportError: - raise GmParamError("matplotlib not found, cannot plot...") - - def plot1d(self, level = misc.DEF_LEVEL, fill = False, gpdf = False): - """Plots the pdf of each component of the 1d mixture. - - :Parameters: - level : int - level of confidence (to use with fill argument) - fill : bool - if True, the area of the pdf corresponding to the given - confidence intervales is filled. - gpdf : bool - if True, the global pdf is plot. - - :Returns: - h : dict - Returns a dictionary h of plot handles so that their properties - can be modified (eg color, label, etc...): - - h['pdf'] is a list of lines, one line per component pdf - - h['gpdf'] is the line for the global pdf - - h['conf'] is a list of filling area - """ - if not self.__is1d: - raise ValueError("This function does not make sense for "\ - "mixtures which are not unidimensional") - - from scipy.stats import norm - pval = N.sqrt(self.va[:, 0]) * norm(0, 1).ppf((1+level)/2) - - # Compute reasonable min/max for the normal pdf: [-mc * std, mc * std] - # gives the range we are taking in account for each gaussian - mc = 3 - std = N.sqrt(self.va[:, 0]) - mi = N.amin(self.mu[:, 0] - mc * std) - ma = N.amax(self.mu[:, 0] + mc * std) - - np = 500 - x = N.linspace(mi, ma, np) - # Prepare the dic of plot handles to return - ks = ['pdf', 'conf', 'gpdf'] - hp = dict((i, []) for i in ks) - - # Compute the densities - y = D.multiple_gauss_den(x[:, N.newaxis], self.mu, self.va, \ - log = True) \ - + N.log(self.w) - yt = self.pdf(x[:, N.newaxis]) - - try: - import pylab as P - for c in range(self.k): - h = P.plot(x, N.exp(y[:, c]), 'r', label ='_nolegend_') - hp['pdf'].extend(h) - if fill: - # Compute x coordinates of filled area - id1 = -pval[c] + self.mu[c] - id2 = pval[c] + self.mu[c] - xc = x[:, N.where(x>id1)[0]] - xc = xc[:, N.where(xc<id2)[0]] - - # Compute the graph for filling - yf = self.pdf_comp(xc, c) - xc = N.concatenate(([xc[0]], xc, [xc[-1]])) - yf = N.concatenate(([0], yf, [0])) - h = P.fill(xc, yf, facecolor = 'b', alpha = 0.1, - label='_nolegend_') - hp['conf'].extend(h) - if gpdf: - h = P.plot(x, yt, 'r:', label='_nolegend_') - hp['gpdf'] = h - return hp - except ImportError: - raise GmParamError("matplotlib not found, cannot plot...") - - def density_on_grid(self, dim = misc.DEF_VIS_DIM, nx = 50, ny = 50, - nl = 20, maxlevel = 0.95, v = None): - """Do all the necessary computation for contour plot of mixture's - density. - - :Parameters: - dim : sequence - sequence of two integers, the dimensions of interest. - nx : int - Number of points to use for the x axis of the grid - ny : int - Number of points to use for the y axis of the grid - nl : int - Number of contour to plot. - - :Returns: - X : ndarray - points of the x axis of the grid - Y : ndarray - points of the y axis of the grid - Z : ndarray - values of the density on X and Y - V : ndarray - Contour values to display. - - Note - ---- - X, Y, Z and V are as expected by matplotlib contour function.""" - if self.__is1d: - raise ValueError("This function does not make sense for 1d " - "mixtures.") - - # Ok, it is a bit gory. Basically, we want to compute the size of the - # grid. We use conf_ellipse, which will return a couple of points for - # each component, and we can find a grid size which then is just big - # enough to contain all ellipses. This won't work well if two - # ellipsoids are crossing each other a lot (because this assumes that - # at a given point, one component is largely dominant for its - # contribution to the pdf). - - xe, ye = self.conf_ellipses(level = maxlevel, dim = dim) - ax = [N.min(xe), N.max(xe), N.min(ye), N.max(ye)] - - w = ax[1] - ax[0] - h = ax[3] - ax[2] - x, y, lden = self._densityctr(N.linspace(ax[0]-0.2*w, - ax[1]+0.2*w, nx), - N.linspace(ax[2]-0.2*h, - ax[3]+0.2*h, ny), - dim = dim) - # XXX: how to find "good" values for level ? - if v is None: - v = N.linspace(-5, N.max(lden), nl) - return x, y, lden, N.array(v) - - def _densityctr(self, rangex, rangey, dim = misc.DEF_VIS_DIM): - """Helper function to compute density contours on a grid.""" - gr = N.meshgrid(rangex, rangey) - x = gr[0].flatten() - y = gr[1].flatten() - xdata = N.concatenate((x[:, N.newaxis], y[:, N.newaxis]), axis = 1) - dmu = self.mu[:, dim] - dva = self._get_va(dim) - den = GM.fromvalues(self.w, dmu, dva).pdf(xdata, log = True) - den = den.reshape(len(rangey), len(rangex)) - - return gr[0], gr[1], den - - def _get_va(self, dim): - """Returns variance limited do 2 dimension in tuple dim.""" - assert len(dim) == 2 - dim = N.array(dim) - if dim.any() < 0 or dim.any() >= self.d: - raise ValueError("dim elements should be between 0 and dimension" - " of the mixture.") - - if self.mode == 'diag': - return self.va[:, dim] - elif self.mode == 'full': - ld = dim.size - vaselid = N.empty((ld * self.k, ld), N.int) - for i in range(self.k): - vaselid[ld*i] = dim[0] + i * self.d - vaselid[ld*i+1] = dim[1] + i * self.d - vadid = N.empty((ld * self.k, ld), N.int) - for i in range(self.k): - vadid[ld*i] = dim - vadid[ld*i+1] = dim - return self.va[vaselid, vadid] - else: - raise ValueError("Unkown mode") - - # Syntactic sugar - def __repr__(self): - msg = "" - msg += "Gaussian Mixture:\n" - msg += " -> %d dimensions\n" % self.d - msg += " -> %d components\n" % self.k - msg += " -> %s covariance \n" % self.mode - if self.__is_valid: - msg += "Has initial values""" - else: - msg += "Has no initial values yet""" - return msg - - def __str__(self): - return self.__repr__() - -# Function to generate a random index: this is kept outside any class, -# as the function can be useful for other -def gen_rand_index(p, n): - """Generate a N samples vector containing random index between 1 - and length(p), each index i with probability p(i)""" - # TODO Check args here - - # TODO: check each value of inverse distribution is - # different - invcdf = N.cumsum(p) - uni = rand(n) - index = N.zeros(n, dtype=int) - - # This one should be a bit faster - for k in range(len(p)-1, 0, -1): - blop = N.where(N.logical_and(invcdf[k-1] <= uni, - uni < invcdf[k])) - index[blop] = k - - return index - -def check_gmm_param(w, mu, va): - """Check that w, mu and va are valid parameters for - a mixture of gaussian. - - w should sum to 1, there should be the same number of component in each - param, the variances should be positive definite, etc... - - :Parameters: - w : ndarray - vector or list of weigths of the mixture (K elements) - mu : ndarray - matrix: K * d - va : ndarray - list of variances (vector K * d or square matrices Kd * d) - - :Returns: - k : int - number of components - d : int - dimension - mode : string - 'diag' if diagonal covariance, 'full' of full matrices - """ - - # Check that w is valid - if not len(w.shape) == 1: - raise GmParamError('weight should be a rank 1 array') - - if N.fabs(N.sum(w) - 1) > misc.MAX_DBL_DEV: - raise GmParamError('weight does not sum to 1') - - # Check that mean and va have the same number of components - k = len(w) - - if N.ndim(mu) < 2: - msg = "mu should be a K,d matrix, and a row vector if only 1 comp" - raise GmParamError(msg) - if N.ndim(va) < 2: - msg = """va should be a K,d / K *d, d matrix, and a row vector if - only 1 diag comp""" - raise GmParamError(msg) - - (km, d) = mu.shape - (ka, da) = va.shape - - if not k == km: - msg = "not same number of component in mean and weights" - raise GmParamError(msg) - - if not d == da: - msg = "not same number of dimensions in mean and variances" - raise GmParamError(msg) - - if km == ka: - mode = 'diag' - else: - mode = 'full' - if not ka == km*d: - msg = "not same number of dimensions in mean and variances" - raise GmParamError(msg) - - return k, d, mode - -if __name__ == '__main__': - pass diff --git a/scikits/learn/em/gmm_em.py b/scikits/learn/em/gmm_em.py deleted file mode 100644 index ac29fdeff197ed58c24279764d0c8213d99341f2..0000000000000000000000000000000000000000 --- a/scikits/learn/em/gmm_em.py +++ /dev/null @@ -1,520 +0,0 @@ -# /usr/bin/python -# Last Change: Mon Jul 02 07:00 PM 2007 J - -"""Module implementing GMM, a class to estimate Gaussian mixture models using -EM, and EM, a class which use GMM instances to estimate models parameters using -the ExpectationMaximization algorithm.""" - -__docformat__ = 'restructuredtext' - -# TODO: -# - which methods to avoid va shrinking to 0 ? There are several options, -# not sure which ones are appropriates -# - improve EM trainer -import numpy as N -from numpy.random import randn -#import _c_densities as densities -import densities -from scipy.cluster.vq import kmeans2 as kmean -from gauss_mix import GmParamError -from misc import curry - -#from misc import _DEF_ALPHA, _MIN_DBL_DELTA, _MIN_INV_COND - -_PRIOR_COUNT = 0.05 -_COV_PRIOR = 0.1 - -# Error classes -class GmmError(Exception): - """Base class for exceptions in this module.""" - def __init__(self): - Exception.__init__(self) - -class GmmParamError(GmmError): - """Exception raised for errors in gmm params - - Attributes: - expression -- input expression in which the error occurred - message -- explanation of the error - """ - def __init__(self, message): - GmmError.__init__(self) - self.message = message - - def __str__(self): - return self.message - -class MixtureModel(object): - """Class to model mixture """ - # XXX: Is this really needed ? - pass - -class ExpMixtureModel(MixtureModel): - """Class to model mixture of exponential pdf (eg Gaussian, exponential, - Laplace, etc..). This is a special case because some parts of EM are common - for those models...""" - pass - -class GMM(ExpMixtureModel): - """ A class to model a Gaussian Mixture Model (GMM). An instance of this - class is created by giving weights, mean and variances in the ctor. An - instanciated object can be sampled, trained by EM. """ - def init_kmean(self, data, niter = 5): - """ Init the model with kmean.""" - k = self.gm.k - d = self.gm.d - init = data[0:k, :] - - # XXX: This is bogus initialization should do better (in kmean with CV) - (code, label) = kmean(data, init, niter, minit = 'matrix') - - w = N.ones(k) / k - mu = code.copy() - if self.gm.mode == 'diag': - va = N.zeros((k, d)) - for i in range(k): - for j in range(d): - va[i, j] = N.cov(data[N.where(label==i), j], rowvar = 0) - elif self.gm.mode == 'full': - va = N.zeros((k*d, d)) - for i in range(k): - va[i*d:i*d+d, :] = \ - N.cov(data[N.where(label==i)], rowvar = 0) - else: - raise GmmParamError("mode " + str(self.gm.mode) + \ - " not recognized") - - self.gm.set_param(w, mu, va) - - self.isinit = True - - def init_random(self, data): - """ Init the model at random.""" - k = self.gm.k - d = self.gm.d - w = N.ones(k) / k - mu = randn(k, d) - if self.gm.mode == 'diag': - va = N.fabs(randn(k, d)) - else: - # If A is invertible, A'A is positive definite - va = randn(k * d, d) - for i in range(k): - va[i*d:i*d+d] = N.dot( va[i*d:i*d+d], - va[i*d:i*d+d].T) - - self.gm.set_param(w, mu, va) - - self.isinit = True - - def init_test(self, data): - """Use values already in the model as initialization. - - Useful for testing purpose when reproducability is necessary. This does - nothing but checking that the mixture model has valid initial - values.""" - try: - self.gm.check_state() - except GmParamError, e: - print "Model is not properly initalized, cannot init EM." - raise ValueError("Message was %s" % str(e)) - - def __init__(self, gm, init = 'kmean'): - """Initialize a mixture model. - - Initialize the model from a GM instance. This class implements all the - necessary functionalities for EM. - - :Parameters: - gm : GM - the mixture model to train. - init : string - initialization method to use.""" - self.gm = gm - - # Possible init methods - init_methods = {'kmean': self.init_kmean, 'random' : self.init_random, - 'test': self.init_test} - - if init not in init_methods: - raise GmmParamError('init method %s not recognized' + str(init)) - - self.init = init_methods[init] - self.isinit = False - self.initst = init - - def compute_responsabilities(self, data): - """Compute responsabilities. - - Return normalized and non-normalized respondabilities for the model. - - Note - ---- - Computes the latent variable distribution (a posteriori probability) - knowing the explicit data for the Gaussian model (w, mu, var): gamma(t, - i) = P[state = i | observation = data(t); w, mu, va] - - This is basically the E step of EM for finite mixtures.""" - # compute the gaussian pdf - tgd = densities.multiple_gauss_den(data, self.gm.mu, self.gm.va) - # multiply by the weight - tgd *= self.gm.w - # Normalize to get a pdf - gd = tgd / N.sum(tgd, axis=1)[:, N.newaxis] - - return gd, tgd - - def compute_log_responsabilities(self, data): - """Compute log responsabilities. - - Return normalized and non-normalized responsabilities for the model (in - the log domain) - - Note - ---- - Computes the latent variable distribution (a posteriori probability) - knowing the explicit data for the Gaussian model (w, mu, var): gamma(t, - i) = P[state = i | observation = data(t); w, mu, va] - - This is basically the E step of EM for finite mixtures.""" - # compute the gaussian pdf - tgd = densities.multiple_gauss_den(data, self.gm.mu, - self.gm.va, log = True) - # multiply by the weight - tgd += N.log(self.gm.w) - # Normalize to get a (log) pdf - gd = tgd - densities.logsumexp(tgd)[:, N.newaxis] - - return gd, tgd - - def _update_em_diag(self, data, gamma, ngamma): - """Computes update of the Gaussian Mixture Model (M step) from the - responsabilities gamma and normalized responsabilities ngamma, for - diagonal models.""" - #XXX: caching SS may decrease memory consumption, but is this possible ? - k = self.gm.k - d = self.gm.d - n = data.shape[0] - invn = 1.0/n - - mu = N.zeros((k, d)) - va = N.zeros((k, d)) - - for c in range(k): - x = N.dot(gamma.T[c:c+1, :], data)[0, :] - xx = N.dot(gamma.T[c:c+1, :], data ** 2)[0, :] - - mu[c, :] = x / ngamma[c] - va[c, :] = xx / ngamma[c] - mu[c, :] ** 2 - - w = invn * ngamma - - return w, mu, va - - def _update_em_full(self, data, gamma, ngamma): - """Computes update of the Gaussian Mixture Model (M step) from the - responsabilities gamma and normalized responsabilities ngamma, for - full models.""" - k = self.gm.k - d = self.gm.d - n = data.shape[0] - invn = 1.0/n - - # In full mode, this is the bottleneck: the triple loop - # kills performances. This is pretty straightforward - # algebra, so computing it in C should not be too difficult. The - # real problem is to have valid covariance matrices, and to keep - # them positive definite, maybe with special storage... Not sure - # it really worth the risk - mu = N.zeros((k, d)) - va = N.zeros((k*d, d)) - - #XXX: caching SS may decrease memory consumption - for c in range(k): - #x = N.sum(N.outer(gamma[:, c], - # N.ones((1, d))) * data, axis = 0) - x = N.dot(gamma.T[c:c+1, :], data)[0, :] - xx = N.zeros((d, d)) - - # This should be much faster than recursing on n... - for i in range(d): - for j in range(d): - xx[i, j] = N.sum(data[:, i] * data[:, j] * gamma.T[c, :], - axis = 0) - - mu[c, :] = x / ngamma[c] - va[c*d:c*d+d, :] = xx / ngamma[c] \ - - N.outer(mu[c, :], mu[c, :]) - w = invn * ngamma - - return w, mu, va - - def update_em(self, data, gamma): - """Computes update of the Gaussian Mixture Model (M step) - from the a posteriori pdf, computed by gmm_posterior - (E step). - """ - ngamma = N.sum(gamma, axis = 0) - - if self.gm.mode == 'diag': - w, mu, va = self._update_em_diag(data, gamma, ngamma) - elif self.gm.mode == 'full': - w, mu, va = self._update_em_full(data, gamma, ngamma) - else: - raise GmmParamError("varmode not recognized") - - self.gm.set_param(w, mu, va) - - def likelihood(self, data): - """ Returns the current log likelihood of the model given - the data """ - assert(self.isinit) - # compute the gaussian pdf - tgd = densities.multiple_gauss_den(data, self.gm.mu, - self.gm.va, log = True) - # multiply by the weight - tgd += N.log(self.gm.w) - - return N.sum(densities.logsumexp(tgd), axis = 0) - - def bic(self, data): - """ Returns the BIC (Bayesian Information Criterion), - also called Schwarz information criterion. Can be used - to choose between different models which have different - number of clusters. The BIC is defined as: - - BIC = 2 * ln(L) - k * ln(n) - - where: - * ln(L) is the log-likelihood of the estimated model - * k is the number of degrees of freedom - * n is the number of frames - - Not that depending on the literature, BIC may be defined as the opposite - of the definition given here. """ - - if self.gm.mode == 'diag': - # for a diagonal model, we have k - 1 (k weigths, but one - # constraint of normality) + k * d (means) + k * d (variances) - free_deg = self.gm.k * (self.gm.d * 2 + 1) - 1 - elif self.gm.mode == 'full': - # for a full model, we have k - 1 (k weigths, but one constraint of - # normality) + k * d (means) + k * d * d / 2 (each covariance - # matrice has d **2 params, but with positivity constraint) - if self.gm.d == 1: - free_deg = self.gm.k * 3 - 1 - else: - free_deg = self.gm.k * (self.gm.d + 1 + self.gm.d ** 2 / 2) - 1 - - lk = self.likelihood(data) - n = N.shape(data)[0] - return bic(lk, free_deg, n) - - # syntactic sugar - def __repr__(self): - repre = "" - repre += "Gaussian Mixture Model\n" - repre += " -> initialized by %s\n" % str(self.initst) - repre += self.gm.__repr__() - return repre - -class EM: - """An EM trainer. An EM trainer - trains from data, with a model - - Not really useful yet""" - def __init__(self): - pass - - def train(self, data, model, maxiter = 10, thresh = 1e-5, log = False): - """Train a model using EM. - - Train a model using data, and stops when the likelihood increase - between two consecutive iteration fails behind a threshold, or when the - number of iterations > niter, whichever comes first - - :Parameters: - data : ndarray - contains the observed features, one row is one frame, ie one - observation of dimension d - model : GMM - GMM instance. - maxiter : int - maximum number of iterations - thresh : threshold - if the slope of the likelihood falls below this value, the - algorithm stops. - - :Returns: - likelihood : ndarray - one value per iteration. - - Note - ---- - The model is trained, and its parameters updated accordingly, eg the - results are put in the GMM instance. - """ - if not isinstance(model, MixtureModel): - raise TypeError("expect a MixtureModel as a model") - - # Initialize the data (may do nothing depending on the model) - model.init(data) - - # Actual training - if log: - like = self._train_simple_em_log(data, model, maxiter, thresh) - else: - like = self._train_simple_em(data, model, maxiter, thresh) - return like - - def _train_simple_em(self, data, model, maxiter, thresh): - # Likelihood is kept - like = N.zeros(maxiter) - - # Em computation, with computation of the likelihood - g, tgd = model.compute_responsabilities(data) - # TODO: do it in log domain instead - like[0] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) - model.update_em(data, g) - for i in range(1, maxiter): - g, tgd = model.compute_responsabilities(data) - like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) - model.update_em(data, g) - if has_em_converged(like[i], like[i-1], thresh): - return like[0:i] - - def _train_simple_em_log(self, data, model, maxiter, thresh): - # Likelihood is kept - like = N.zeros(maxiter) - - # Em computation, with computation of the likelihood - g, tgd = model.compute_log_responsabilities(data) - like[0] = N.sum(densities.logsumexp(tgd), axis = 0) - model.update_em(data, N.exp(g)) - for i in range(1, maxiter): - g, tgd = model.compute_log_responsabilities(data) - like[i] = N.sum(densities.logsumexp(tgd), axis = 0) - model.update_em(data, N.exp(g)) - if has_em_converged(like[i], like[i-1], thresh): - return like[0:i] - -class RegularizedEM: - # TODO: separate regularizer from EM class ? - def __init__(self, pcnt = _PRIOR_COUNT, pval = _COV_PRIOR): - """Create a regularized EM object. - - Covariances matrices are regularized after the E step. - - :Parameters: - pcnt : float - proportion of soft counts to be count as prior counts (e.g. if - you have 1000 samples and the prior_count is 0.1, than the - prior would "weight" 100 samples). - pval : float - value of the prior. - """ - self.pcnt = pcnt - self.pval = pval - - def train(self, data, model, maxiter = 20, thresh = 1e-5): - """Train a model using EM. - - Train a model using data, and stops when the likelihood increase - between two consecutive iteration fails behind a threshold, or when the - number of iterations > niter, whichever comes first - - :Parameters: - data : ndarray - contains the observed features, one row is one frame, ie one - observation of dimension d - model : GMM - GMM instance. - maxiter : int - maximum number of iterations - thresh : threshold - if the slope of the likelihood falls below this value, the - algorithm stops. - - :Returns: - likelihood : ndarray - one value per iteration. - - Note - ---- - The model is trained, and its parameters updated accordingly, eg the - results are put in the GMM instance. - """ - mode = model.gm.mode - - # Build regularizer - if mode == 'diag': - regularize = curry(regularize_diag, np = self.pcnt, prior = - self.pval * N.ones(model.gm.d)) - elif mode == 'full': - regularize = curry(regularize_full, np = self.pcnt, prior = - self.pval * N.eye(model.gm.d)) - else: - raise ValueError("unknown variance mode") - - model.init(data) - regularize(model.gm.va) - - # Likelihood is kept - like = N.empty(maxiter, N.float) - - # Em computation, with computation of the likelihood - g, tgd = model.compute_log_responsabilities(data) - g = N.exp(g) - model.update_em(data, g) - regularize(model.gm.va) - - like[0] = N.sum(densities.logsumexp(tgd), axis = 0) - for i in range(1, maxiter): - g, tgd = model.compute_log_responsabilities(data) - g = N.exp(g) - model.update_em(data, g) - regularize(model.gm.va) - - like[i] = N.sum(densities.logsumexp(tgd), axis = 0) - if has_em_converged(like[i], like[i-1], thresh): - return like[0:i] - -# Misc functions -def bic(lk, deg, n): - """ Expects lk to be log likelihood """ - return 2 * lk - deg * N.log(n) - -def has_em_converged(like, plike, thresh): - """ given likelihood of current iteration like and previous - iteration plike, returns true is converged: based on comparison - of the slope of the likehood with thresh""" - diff = N.abs(like - plike) - avg = 0.5 * (N.abs(like) + N.abs(plike)) - if diff / avg < thresh: - return True - else: - return False - -def regularize_diag(va, np, prior): - """np * n is the number of prior counts (np is a proportion, and n is the - number of point). - - diagonal variance version""" - k = va.shape[0] - - for i in range(k): - va[i] *= 1. / (1 + np) - va[i] += np / (1. + np) * prior - -def regularize_full(va, np, prior): - """np * n is the number of prior counts (np is a proportion, and n is the - number of point).""" - d = va.shape[1] - k = va.shape[0] / d - - for i in range(k): - va[i*d:i*d+d, :] *= 1. / (1 + np) - va[i*d:i*d+d, :] += np / (1. + np) * prior - -if __name__ == "__main__": - pass diff --git a/scikits/learn/em/info.py b/scikits/learn/em/info.py deleted file mode 100644 index eaf502c1905e82fcdb4f67dbce81ae1c2f0d03b0..0000000000000000000000000000000000000000 --- a/scikits/learn/em/info.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Routines for Gaussian Mixture Models and learning with Expectation Maximization -=============================================================================== - -This module contains classes and function to compute multivariate Gaussian -densities (diagonal and full covariance matrices), Gaussian mixtures, Gaussian -mixtures models and an Em trainer. - -More specifically, the module defines the following classes, functions: - -- densities.gauss_den: function to compute multivariate Gaussian pdf -- gauss_mix.GM: defines the GM (Gaussian Mixture) class. A Gaussian Mixture can - be created from its parameters weights, mean and variances, or from its meta - parameters d (dimension of the Gaussian) and k (number of components in the - mixture). A Gaussian Model can then be sampled or plot (if d>1, plot - confidence ellipsoids projected on 2 chosen dimensions, if d == 1, plot the - pdf of each component and fill the zone of confidence for a given level) -- gmm_em.GMM: defines a class GMM (Gaussian Mixture Model). This class is - constructed from a GM model gm, and can be used to train gm. The GMM can be - initiated by kmean or at random, and can compute sufficient statistics, and - update its parameters from the sufficient statistics. -- kmean.kmean: implements a kmean algorithm. We cannot use scipy.cluster.vq - kmeans, since its does not give membership of observations. - -Example of use: ---------------- - ->>> #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ->>> # Create an artificial 2 dimension, 3 clusters GM model, samples it ->>> #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ->>> w, mu, va = GM.gen_param(2, 3, 'diag', spread = 1.5) ->>> gm = GM.fromvalues(w, mu, va) ->>> ->>> # Sample 1000 frames from the model ->>> data = gm.sample(1000) ->>> ->>> #++++++++++++++++++++++++ ->>> # Learn the model with EM ->>> #++++++++++++++++++++++++ ->>> # Init the model ->>> lgm = GM(d, k, mode) ->>> gmm = GMM(lgm, 'kmean') ->>> ->>> # The actual EM, with likelihood computation. The threshold ->>> # is compared to the (linearly appromixated) derivative of the likelihood ->>> em = EM() ->>> like = em.train(data, gmm, maxiter = 30, thresh = 1e-8) - -Files example.py and example2.py show more capabilities of the toolbox, including -plotting capabilities (using matplotlib) and model selection using Bayesian -Information Criterion (BIC). - -Bibliography: - -- Maximum likelihood from incomplete data via the EM algorithm in Journal of - the Royal Statistical Society, Series B, 39(1):1--38, 1977, by A. P. - Dempster, N. M. Laird, and D. B. Rubin -- Bayesian Approaches to Gaussian Mixture Modelling (1998) by Stephen J. - Roberts, Dirk Husmeier, Iead Rezek, William Penny in IEEE Transactions on - Pattern Analysis and Machine Intelligence - -Copyright: David Cournapeau 2006 -License: BSD-style (see LICENSE.txt in main source directory) -""" -version = '0.5.7dev' - -depends = ['linalg', 'stats'] -ignore = False diff --git a/scikits/learn/em/misc.py b/scikits/learn/em/misc.py deleted file mode 100644 index 747e1049a6ed1d99d0191041fa7aef326dd62432..0000000000000000000000000000000000000000 --- a/scikits/learn/em/misc.py +++ /dev/null @@ -1,44 +0,0 @@ -# Last Change: Mon Jul 02 06:00 PM 2007 J - -#======================================================== -# Constants used throughout the module (def args, etc...) -#======================================================== -# This is the default dimension for representing confidence ellipses -DEF_VIS_DIM = (0, 1) -DEF_ELL_NP = 100 -DEF_LEVEL = 0.39 - -#===================================================================== -# "magic number", that is number used to control regularization and co -# Change them at your risk ! -#===================================================================== - -# max deviation allowed when comparing double (this is actually stupid, -# I should actually use a number of decimals) -MAX_DBL_DEV = 1e-10 - -## # max conditional number allowed -## _MAX_COND = 1e8 -## _MIN_INV_COND = 1/_MAX_COND -## -## # Default alpha for regularization -## _DEF_ALPHA = 1e-1 -## -## # Default min delta for regularization -## _MIN_DBL_DELTA = 1e-5 -## - -class curry: - def __init__(self, fun, *args, **kwargs): - self.fun = fun - self.pending = args[:] - self.kwargs = kwargs.copy() - - def __call__(self, *args, **kwargs): - if kwargs and self.kwargs: - kw = self.kwargs.copy() - kw.update(kwargs) - else: - kw = kwargs or self.kwargs - - return self.fun(*(self.pending + args), **kw) diff --git a/scikits/learn/em/online_em.py b/scikits/learn/em/online_em.py deleted file mode 100644 index eb02a9d897fe556d60ffb74a2c03da9429e009e0..0000000000000000000000000000000000000000 --- a/scikits/learn/em/online_em.py +++ /dev/null @@ -1,466 +0,0 @@ -# /usr/bin/python -# Last Change: Sat Dec 20 04:00 PM 2008 J - -# This is not meant to be used yet !!!! I am not sure how to integrate this -# stuff inside the package yet. The cases are: -# - we have a set of data, and we want to test online EM compared to normal -# EM -# - we do not have all the data before putting them in online EM: eg current -# frame depends on previous frame in some way. - -# TODO: -# - Add biblio -# - Look back at articles for discussion for init, regularization and -# convergence rates -# - the function sufficient_statistics does not really return SS. This is not -# a big problem, but it would be better to really return them as the name -# implied. - -import numpy as N -from numpy import mean -from numpy.testing import assert_array_almost_equal, assert_array_equal - -from gmm_em import ExpMixtureModel#, GMM, EM -from gauss_mix import GM -#from gauss_mix import GM -from scipy.cluster.vq import kmeans2 as kmean -import densities2 as D - -import copy -from numpy.random import seed - -# Clamp -clamp = 1e-8 - -# Error classes -class OnGmmError(Exception): - """Base class for exceptions in this module.""" - pass - -class OnGmmParamError: - """Exception raised for errors in gmm params - - Attributes: - expression -- input expression in which the error occurred - message -- explanation of the error - """ - def __init__(self, message): - self.message = message - - def __str__(self): - return self.message - -class OnGMM(ExpMixtureModel): - """A Class for 'online' (ie recursive) EM. Instead - of running the E step on the whole data, the sufficient statistics - are updated for each new frame of data, and used in the (unchanged) - M step""" - def init_random(self, init_data): - """ Init the model at random.""" - k = self.gm.k - d = self.gm.d - if self.gm.mode == 'diag': - w = N.ones(k) / k - - # Init the internal state of EM - self.cx = N.outer(w, mean(init_data, 0)) - self.cxx = N.outer(w, mean(init_data ** 2, 0)) - - # w, mu and va init is the same that in the standard case - (code, label) = kmean(init_data, init_data[0:k, :], iter = 10, - minit = 'matrix') - mu = code.copy() - va = N.zeros((k, d)) - for i in range(k): - for j in range(d): - va [i, j] = N.cov(init_data[N.where(label==i), j], - rowvar = 0) - else: - raise OnGmmParamError("""init_online not implemented for - mode %s yet""", self.gm.mode) - - self.gm.set_param(w, mu, va) - # c* are the parameters which are computed at every step (ie - # when a new frame is taken into account - self.cw = self.gm.w - self.cmu = self.gm.mu - self.cva = self.gm.va - - # p* are the parameters used when computing gaussian densities - # they are always the same than c* in the online case - self.pw = self.cw - self.pmu = self.cmu - self.pva = self.cva - - def init_kmean(self, init_data, niter = 5): - """ Init the model using kmean.""" - k = self.gm.k - d = self.gm.d - if self.gm.mode == 'diag': - w = N.ones(k) / k - - # Init the internal state of EM - self.cx = N.outer(w, mean(init_data, 0)) - self.cxx = N.outer(w, mean(init_data ** 2, 0)) - - # w, mu and va init is the same that in the standard case - (code, label) = kmean(init_data, init_data[0:k, :], - iter = niter, minit = 'matrix') - mu = code.copy() - va = N.zeros((k, d)) - for i in range(k): - for j in range(d): - va[i, j] = N.cov(init_data[N.where(label==i), j], - rowvar = 0) - else: - raise OnGmmParamError("""init_online not implemented for - mode %s yet""", self.gm.mode) - - self.gm.set_param(w, mu, va) - # c* are the parameters which are computed at every step (ie - # when a new frame is taken into account - self.cw = self.gm.w - self.cmu = self.gm.mu - self.cva = self.gm.va - - # p* are the parameters used when computing gaussian densities - # they are the same than c* in the online case - # self.pw = self.cw.copy() - # self.pmu = self.cmu.copy() - # self.pva = self.cva.copy() - self.pw = self.cw - self.pmu = self.cmu - self.pva = self.cva - - def __init__(self, gm, init_data, init = 'kmean'): - self.gm = gm - - # Possible init methods - init_methods = {'kmean' : self.init_kmean} - - self.init = init_methods[init] - - def compute_sufficient_statistics_frame(self, frame, nu): - """ sufficient_statistics(frame, nu) for one frame of data - - frame has to be rank 1 !""" - gamma = multiple_gauss_den_frame(frame, self.pmu, self.pva) - gamma *= self.pw - gamma /= N.sum(gamma) - # <1>(t) = cw(t), self.cw = cw(t), each element is one component running weight - #self.cw = (1 - nu) * self.cw + nu * gamma - self.cw *= (1 - nu) - self.cw += nu * gamma - - for k in range(self.gm.k): - self.cx[k] = (1 - nu) * self.cx[k] + nu * frame * gamma[k] - self.cxx[k] = (1 - nu) * self.cxx[k] + nu * frame ** 2 * gamma[k] - - def update_em_frame(self): - for k in range(self.gm.k): - self.cmu[k] = self.cx[k] / self.cw[k] - self.cva[k] = self.cxx[k] / self.cw[k] - self.cmu[k] ** 2 - -import _rawden - -class OnGMM1d(ExpMixtureModel): - """Special purpose case optimized for 1d dimensional cases. - - Require each frame to be a float, which means the API is a bit - different than OnGMM. You are trading elegance for speed here !""" - def init_kmean(self, init_data, niter = 5): - """ Init the model using kmean.""" - assert init_data.ndim == 1 - k = self.gm.k - w = N.ones(k) / k - - # Init the internal state of EM - self.cx = w * mean(init_data) - self.cxx = w * mean(init_data ** 2) - - # w, mu and va init is the same that in the standard case - (code, label) = kmean(init_data[:, N.newaxis], \ - init_data[0:k, N.newaxis], iter = niter) - mu = code.copy() - va = N.zeros((k, 1)) - for i in range(k): - va[i] = N.cov(init_data[N.where(label==i)], rowvar = 0) - - self.gm.set_param(w, mu, va) - # c* are the parameters which are computed at every step (ie - # when a new frame is taken into account - self.cw = self.gm.w - self.cmu = self.gm.mu[:, 0] - self.cva = self.gm.va[:, 0] - - # p* are the parameters used when computing gaussian densities - # they are the same than c* in the online case - # self.pw = self.cw.copy() - # self.pmu = self.cmu.copy() - # self.pva = self.cva.copy() - self.pw = self.cw - self.pmu = self.cmu - self.pva = self.cva - - def __init__(self, gm, init_data, init = 'kmean'): - self.gm = gm - if self.gm.d is not 1: - raise RuntimeError("expects 1d gm only !") - - # Possible init methods - init_methods = {'kmean' : self.init_kmean} - self.init = init_methods[init] - - def compute_sufficient_statistics_frame(self, frame, nu): - """expects frame and nu to be float. Returns - cw, cxx and cxx, eg the sufficient statistics.""" - _rawden.compute_ss_frame_1d(frame, self.cw, self.cmu, self.cva, - self.cx, self.cxx, nu) - return self.cw, self.cx, self.cxx - - def update_em_frame(self, cw, cx, cxx): - """Update EM state using SS as returned by - compute_sufficient_statistics_frame. """ - self.cmu = cx / cw - self.cva = cxx / cw - self.cmu ** 2 - - def compute_em_frame(self, frame, nu): - """Run a whole em step for one frame. frame and nu should be float; - if you don't need to split E and M steps, this is faster than calling - compute_sufficient_statistics_frame and update_em_frame""" - _rawden.compute_em_frame_1d(frame, self.cw, self.cmu, self.cva, \ - self.cx, self.cxx, nu) - -def compute_factor(nframes, ku = 0.005, t0 = 200, nu0 = 0.2): - lamb = 1 - 1/(N.arange(-1, nframes - 1) * ku + t0) - nu = N.zeros((nframes, 1), N.float64) - nu[0] = nu0 - for i in range(1, nframes): - nu[i] = 1./(1 + lamb[i] / nu[i-1]) - return nu - -def online_gmm_em(data, ninit, k, mode = 'diag', step = None): - """ Emulate online_gmm_em of matlab, but uses scipy.sandbox.pyem""" - nframes = data.shape[0] - if data.ndim > 1: - d = data.shape[1] - else: - d = 1 - data = data[:, N.newaxis] - - nu = compute_factor(nframes) - - ogm = GM(d, k, mode) - ogmm = OnGMM1d(ogm, 'kmean') - init_data = data[:ninit, 0] - # We force 10 iteration for equivalence with matlab - ogmm.init(init_data, niter = 10) - # print "after init in python online_gmm_em" - # print ogmm.gm.w - # print ogmm.gm.mu - # print ogmm.gm.va - - wt = [] - mut = [] - vat = [] - - for t in range(nframes): - #ogmm.compute_sufficient_statistics(data[t:t+1, :], nu[t]) - #ogmm.update_em() - # Shit of 1 to agree exactly with matlab (index starting at 1) - # This is totally arbitrary otherwise - ogmm.compute_em_frame(data[t], nu[t]) - if ((t+1) % step) == 0: - wt.append(ogmm.cw.copy()) - mut.append(ogmm.cmu.copy()) - vat.append(ogmm.cva.copy()) - - mut = [m[:, N.newaxis] for m in mut] - vat = [v[:, N.newaxis] for v in vat] - ogmm.gm.set_param(ogmm.cw, ogmm.cmu[:, N.newaxis], ogmm.cva[:, N.newaxis]) - #ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva) - - return ogm, wt, mut, vat - -#class OnlineEM: -# def __init__(self, ogm): -# """Init Online Em algorithm with ogm, an instance of OnGMM.""" -# if not isinstance(ogm, OnGMM): -# raise TypeError("expect a OnGMM instance for the model") -# -# def init_em(self): -# pass -# -# def train(self, data, nu): -# pass -# -# def train_frame(self, frame, nu): -# pass - -def multiple_gauss_den_frame(data, mu, va): - """Helper function to generate several Gaussian - pdf (different parameters) from one frame of data. - - Semantics depending on data's rank - - rank 0: mu and va expected to have rank 0 or 1 - - rank 1: mu and va expected to have rank 2.""" - if N.ndim(data) == 0: - # scalar case - k = mu.size - inva = 1/va - fac = (2*N.pi) ** (-1/2.0) * N.sqrt(inva) - y = ((data-mu) ** 2) * -0.5 * inva - return fac * N.exp(y.ravel()) - elif N.ndim(data) == 1: - # multi variate case (general case) - k = mu.shape[0] - y = N.zeros(k, data.dtype) - if mu.size == va.size: - # diag case - for i in range(k): - #y[i] = D.gauss_den(data, mu[i], va[i]) - # This is a bit hackish: _diag_gauss_den implementation's - # changes can break this, but I don't see how to easily fix this - y[i] = D._diag_gauss_den(data, mu[i], va[i], False, -1) - return y - else: - raise RuntimeError("full not implemented yet") - #for i in range(K): - # y[i] = D.gauss_den(data, mu[i, :], - # va[d*i:d*i+d, :]) - #return y.T - else: - raise RuntimeError("frame should be rank 0 or 1 only") - - -if __name__ == '__main__': - pass - #d = 1 - #k = 2 - #mode = 'diag' - #nframes = int(5e3) - #emiter = 4 - #seed(5) - - ##+++++++++++++++++++++++++++++++++++++++++++++++++ - ## Generate a model with k components, d dimensions - ##+++++++++++++++++++++++++++++++++++++++++++++++++ - #w, mu, va = GM.gen_param(d, k, mode, spread = 1.5) - #gm = GM.fromvalues(w, mu, va) - ## Sample nframes frames from the model - #data = gm.sample(nframes) - - ##++++++++++++++++++++++++++++++++++++++++++ - ## Approximate the models with classical EM - ##++++++++++++++++++++++++++++++++++++++++++ - ## Init the model - #lgm = GM(d, k, mode) - #gmm = GMM(lgm, 'kmean') - #gmm.init(data) - - #gm0 = copy.copy(gmm.gm) - ## The actual EM, with likelihood computation - #like = N.zeros(emiter) - #for i in range(emiter): - # g, tgd = gmm.sufficient_statistics(data) - # like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) - # gmm.update_em(data, g) - - ##++++++++++++++++++++++++++++++++++++++++ - ## Approximate the models with online EM - ##++++++++++++++++++++++++++++++++++++++++ - #ogm = GM(d, k, mode) - #ogmm = OnGMM(ogm, 'kmean') - #init_data = data[0:nframes / 20, :] - #ogmm.init(init_data) - - ## Forgetting param - #ku = 0.005 - #t0 = 200 - #lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0) - #nu0 = 0.2 - #nu = N.zeros((len(lamb), 1)) - #nu[0] = nu0 - #for i in range(1, len(lamb)): - # nu[i] = 1./(1 + lamb[i] / nu[i-1]) - - #print "meth1" - ## object version of online EM - #for t in range(nframes): - # ogmm.compute_sufficient_statistics_frame(data[t], nu[t]) - # ogmm.update_em_frame() - - #ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva) - - ## 1d optimized version - #ogm2 = GM(d, k, mode) - #ogmm2 = OnGMM1d(ogm2, 'kmean') - #ogmm2.init(init_data[:, 0]) - - #print "meth2" - ## object version of online EM - #for t in range(nframes): - # ogmm2.compute_sufficient_statistics_frame(data[t, 0], nu[t]) - # ogmm2.update_em_frame() - - ##ogmm2.gm.set_param(ogmm2.cw, ogmm2.cmu, ogmm2.cva) - - #print ogmm.cw - #print ogmm2.cw - ##+++++++++++++++ - ## Draw the model - ##+++++++++++++++ - #print "drawing..." - #import pylab as P - #P.subplot(2, 1, 1) - - #if not d == 1: - # # Draw what is happening - # P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_') - - # h = gm.plot() - # [i.set_color('g') for i in h] - # h[0].set_label('true confidence ellipsoides') - - # h = gm0.plot() - # [i.set_color('k') for i in h] - # h[0].set_label('initial confidence ellipsoides') - - # h = lgm.plot() - # [i.set_color('r') for i in h] - # h[0].set_label('confidence ellipsoides found by EM') - - # h = ogmm.gm.plot() - # [i.set_color('m') for i in h] - # h[0].set_label('confidence ellipsoides found by Online EM') - - # # P.legend(loc = 0) - #else: - # # Real confidence ellipses - # h = gm.plot1d() - # [i.set_color('g') for i in h['pdf']] - # h['pdf'][0].set_label('true pdf') - - # # Initial confidence ellipses as found by kmean - # h0 = gm0.plot1d() - # [i.set_color('k') for i in h0['pdf']] - # h0['pdf'][0].set_label('initial pdf') - - # # Values found by EM - # hl = lgm.plot1d(fill = 1, level = 0.66) - # [i.set_color('r') for i in hl['pdf']] - # hl['pdf'][0].set_label('pdf found by EM') - - # P.legend(loc = 0) - - # # Values found by Online EM - # hl = ogmm.gm.plot1d(fill = 1, level = 0.66) - # [i.set_color('m') for i in hl['pdf']] - # hl['pdf'][0].set_label('pdf found by Online EM') - - # P.legend(loc = 0) - - #P.subplot(2, 1, 2) - #P.plot(nu) - #P.title('Learning rate') - #P.show() diff --git a/scikits/learn/em/profile_data/blop.c b/scikits/learn/em/profile_data/blop.c deleted file mode 100644 index e172ea4885b1e084769b85dbd9375260916ffcd5..0000000000000000000000000000000000000000 --- a/scikits/learn/em/profile_data/blop.c +++ /dev/null @@ -1,37 +0,0 @@ -#include <math.h> -#include <stddef.h> - -int compute(const double *in, size_t n, size_t d, const double* mu, double* out) -{ - size_t i, j; - double acc; - - for (i = 0; i < n; ++i) { - acc = 0; - for (j = 0; j < d; ++j) { - acc += (in[i*d+j] - mu[j]) * (in[i*d+j] - mu[j]); - } - out[i] = exp(acc); - } - - return 0; -} - -#if 0 -int main(void) -{ - const size_t n = 1e5; - const size_t d = 30; - size_t iter = 10, i; - - double *in, *out; - - in = malloc(sizeof(*in) * n * d); - out = malloc(sizeof(*out) * n * d); - - for (i = 0; i < iter; ++i) { - } - free(in); - out(in); -} -#endif diff --git a/scikits/learn/em/profile_data/gden.m b/scikits/learn/em/profile_data/gden.m deleted file mode 100644 index ae8dd0174fa0ceb1373ef7911537810e1c921b60..0000000000000000000000000000000000000000 --- a/scikits/learn/em/profile_data/gden.m +++ /dev/null @@ -1,10 +0,0 @@ -function out = gden(x, mu) - -% Last Change: Mon Jun 04 10:00 AM 2007 J -[n, d] = size(x); -[nm, dm] = size(mu); -if nm ~= n - out = sum(x-repmat(mu, n, 1), 1); -else - out = sum(x-mu, 1); -end; diff --git a/scikits/learn/em/profile_data/mat_prof.m b/scikits/learn/em/profile_data/mat_prof.m deleted file mode 100644 index eb9944479c6a99a05f1df7626917bdd6be7424c4..0000000000000000000000000000000000000000 --- a/scikits/learn/em/profile_data/mat_prof.m +++ /dev/null @@ -1,11 +0,0 @@ -% Last Change: Mon Jun 04 10:00 AM 2007 J - -n = 1e5; -d = 30; - -x = randn(n, d); -mu = randn(n, d); - -for i=1:10 - y = gden(x, mu); -end; diff --git a/scikits/learn/em/profile_data/profile_densities.py b/scikits/learn/em/profile_data/profile_densities.py deleted file mode 100644 index 85ad5cab8c9801b5ef9a8171bce5a3c5484321a6..0000000000000000000000000000000000000000 --- a/scikits/learn/em/profile_data/profile_densities.py +++ /dev/null @@ -1,71 +0,0 @@ -import numpy as N -from numpy.random import randn - -from numpy.ctypeslib import load_library, ndpointer -from ctypes import cdll, c_uint, c_int, c_double, POINTER - -lib = load_library("blop.so", "file") - -arg1 = ndpointer(dtype=N.float64) -arg2 = c_uint -arg3 = c_uint -arg4 = ndpointer(dtype=N.float64) -arg5 = ndpointer(dtype=N.float64) - -lib.compute.argtypes = [arg1, arg2, arg3, arg4, arg5] -lib.compute.restype = c_int -# Compare computing per component likelihood for frame per row vs frame per column -def component_likelihood(x, mu, va, log = False): - """expect one frame to be one row (rank 2). mu and var are rank 1 array.""" - x -= mu - x **= 2 - return N.exp(N.dot(x, N.ones((mu.size, 1), x.dtype))) - -def component_likelihood3(x, mu, va, log = False): - """expect one frame to be one row (rank 2). mu and var are rank 1 array.""" - y = N.empty(x.shape[0], x.dtype) - lib.compute(x, x.shape[0], x.shape[1], mu, y) - return y - -def bench(func, mode = 'diag'): - d = 30 - n = 1e5 - niter = 10 - - print "Compute %d times densities, %d dimension, %d frames" % (niter, d, n) - mu = 0.1 * randn(d) - va = 0.1 * abs(randn(d)) - - X = 0.1 * randn(n, d) - for i in range(niter): - Y = func(X, mu, va) - -def benchpy(): - bench(component_likelihood) - -def benchpy3(): - bench(component_likelihood3) - -def benchpy2(): - bench2(component_likelihood2) - -if __name__ == "__main__": - #import hotshot, hotshot.stats - #profile_file = 'gdenpy.prof' - #prof = hotshot.Profile(profile_file, lineevents=1) - #prof.runcall(benchpy) - #p = hotshot.stats.load(profile_file) - #print p.sort_stats('cumulative').print_stats(20) - #prof.close() - - #profile_file = 'gdenc.prof' - #prof = hotshot.Profile(profile_file, lineevents=1) - #prof.runcall(benchpy3) - #p = hotshot.stats.load(profile_file) - #print p.sort_stats('cumulative').print_stats(20) - #prof.close() - - #import cProfile as profile - #profile.run('benchpy()', 'fooprof') - benchpy() - benchpy3() diff --git a/scikits/learn/em/profile_data/profile_gmm.py b/scikits/learn/em/profile_data/profile_gmm.py deleted file mode 100644 index b9f260a5ba2795a42632aee66273e2575b2e1647..0000000000000000000000000000000000000000 --- a/scikits/learn/em/profile_data/profile_gmm.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy as N -from scipy.sandbox.pyem import GM, GMM -import copy - -def bench1(mode = 'diag'): - #=========================================== - # GMM of 20 comp, 20 dimension, 1e4 frames - #=========================================== - d = 15 - k = 30 - nframes = 1e5 - niter = 10 - mode = 'diag' - - print "=============================================================" - print "(%d dim, %d components) GMM with %d iterations, for %d frames" \ - % (d, k, niter, nframes) - - #+++++++++++++++++++++++++++++++++++++++++++ - # Create an artificial GMM model, samples it - #+++++++++++++++++++++++++++++++++++++++++++ - print "Generating the mixture" - # Generate a model with k components, d dimensions - w, mu, va = GM.gen_param(d, k, mode, spread = 3) - # gm = GM(d, k, mode) - # gm.set_param(w, mu, va) - gm = GM.fromvalues(w, mu, va) - - # Sample nframes frames from the model - data = gm.sample(nframes) - - #++++++++++++++++++++++++ - # Learn the model with EM - #++++++++++++++++++++++++ - - # Init the model - print "Init a model for learning, with kmean for initialization" - lgm = GM(d, k, mode) - gmm = GMM(lgm, 'kmean') - - gmm.init(data) - # Keep the initialized model for drawing - gm0 = copy.copy(lgm) - - # The actual EM, with likelihood computation - like = N.zeros(niter) - - print "computing..." - for i in range(niter): - print "iteration %d" % i - g, tgd = gmm.sufficient_statistics(data) - like[i] = N.sum(N.log(N.sum(tgd, 1))) - gmm.update_em(data, g) - -if __name__ == "__main__": - import hotshot, hotshot.stats - profile_file = 'gmm.prof' - prof = hotshot.Profile(profile_file, lineevents=1) - prof.runcall(bench1) - p = hotshot.stats.load(profile_file) - print p.sort_stats('cumulative').print_stats(20) - prof.close() - # import profile - # profile.run('bench1()', 'gmmprof') - # import pstats - # p = pstats.Stats('gmmprof') - # print p.sort_stats('cumulative').print_stats(20) - diff --git a/scikits/learn/em/profile_data/profile_online_em.py b/scikits/learn/em/profile_data/profile_online_em.py deleted file mode 100644 index 2d403fe256c5d402c1f8ae28c7d8d2e40a5a8a11..0000000000000000000000000000000000000000 --- a/scikits/learn/em/profile_data/profile_online_em.py +++ /dev/null @@ -1,241 +0,0 @@ -# /usr/bin/python -# Last Change: Wed Dec 06 08:00 PM 2006 J -import copy - -import numpy as N - -from gauss_mix import GM -from gmm_em import GMM - -def _generate_data(nframes, d, k, mode = 'diag'): - N.random.seed(0) - w, mu, va = GM.gen_param(d, k, mode, spread = 1.5) - gm = GM.fromvalues(w, mu, va) - # Sample nframes frames from the model - data = gm.sample(nframes) - - #++++++++++++++++++++++++++++++++++++++++++ - # Approximate the models with classical EM - #++++++++++++++++++++++++++++++++++++++++++ - emiter = 5 - # Init the model - lgm = GM(d, k, mode) - gmm = GMM(lgm, 'kmean') - gmm.init(data) - - gm0 = copy.copy(gmm.gm) - # The actual EM, with likelihood computation - like = N.zeros(emiter) - for i in range(emiter): - g, tgd = gmm.sufficient_statistics(data) - like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0) - gmm.update_em(data, g) - - return data, gm - -nframes = int(5e3) -d = 1 -k = 2 -niter = 1 - -def test_v1(): - # Generate test data - data, gm = _generate_data(nframes, d, k) - for i in range(niter): - iter_1(data, gm) - -def test_v2(): - # Generate test data - data, gm = _generate_data(nframes, d, k) - for i in range(niter): - iter_2(data, gm) - -def test_v3(): - # Generate test data - data, gm = _generate_data(nframes, d, k) - for i in range(niter): - iter_3(data, gm) - -def test_v4(): - # Generate test data - data, gm = _generate_data(nframes, d, k) - for i in range(niter): - iter_4(data, gm) - -def iter_1(data, gm): - """Online EM with original densities + original API""" - from online_em import OnGMM - - nframes = data.shape[0] - ogm = copy.copy(gm) - ogmm = OnGMM(ogm, 'kmean') - init_data = data[0:nframes / 20, :] - ogmm.init(init_data) - - # Forgetting param - ku = 0.005 - t0 = 200 - lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0) - nu0 = 0.2 - nu = N.zeros((len(lamb), 1)) - nu[0] = nu0 - for i in range(1, len(lamb)): - nu[i] = 1./(1 + lamb[i] / nu[i-1]) - - # object version of online EM - for t in range(nframes): - ogmm.compute_sufficient_statistics_frame(data[t], nu[t]) - ogmm.update_em_frame() - - ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva) - print ogmm.cw - print ogmm.cmu - print ogmm.cva - -def iter_2(data, gm): - """Online EM with densities2 + original API""" - from online_em2 import OnGMM - - nframes = data.shape[0] - ogm = copy.copy(gm) - ogmm = OnGMM(ogm, 'kmean') - init_data = data[0:nframes / 20, :] - ogmm.init(init_data) - - # Forgetting param - ku = 0.005 - t0 = 200 - lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0) - nu0 = 0.2 - nu = N.zeros((len(lamb), 1)) - nu[0] = nu0 - for i in range(1, len(lamb)): - nu[i] = 1./(1 + lamb[i] / nu[i-1]) - - # object version of online EM - for t in range(nframes): - ogmm.compute_sufficient_statistics_frame(data[t], nu[t]) - ogmm.update_em_frame() - - ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva) - print ogmm.cw - print ogmm.cmu - print ogmm.cva - -def iter_3(data, gm): - """Online EM with densities + 1d API""" - from online_em import OnGMM1d - - #def blop(self, frame, nu): - # self.compute_sufficient_statistics_frame(frame, nu) - #OnGMM.blop = blop - - nframes = data.shape[0] - ogm = copy.copy(gm) - ogmm = OnGMM1d(ogm, 'kmean') - init_data = data[0:nframes / 20, :] - ogmm.init(init_data[:, 0]) - - # Forgetting param - ku = 0.005 - t0 = 200 - lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0) - nu0 = 0.2 - nu = N.zeros((len(lamb), 1)) - nu[0] = nu0 - for i in range(1, len(lamb)): - nu[i] = 1./(1 + lamb[i] / nu[i-1]) - - # object version of online EM - for t in range(nframes): - #assert ogmm.cw is ogmm.pw - #assert ogmm.cva is ogmm.pva - #assert ogmm.cmu is ogmm.pmu - a, b, c = ogmm.compute_sufficient_statistics_frame(data[t, 0], nu[t]) - ##ogmm.blop(data[t,0], nu[t]) - ogmm.update_em_frame(a, b, c) - - #ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva) - print ogmm.cw - print ogmm.cmu - print ogmm.cva - -def iter_4(data, gm): - """Online EM with densities2 + 1d API""" - from online_em2 import OnGMM1d - - #def blop(self, frame, nu): - # self.compute_sufficient_statistics_frame(frame, nu) - #OnGMM.blop = blop - - nframes = data.shape[0] - ogm = copy.copy(gm) - ogmm = OnGMM1d(ogm, 'kmean') - init_data = data[0:nframes / 20, :] - ogmm.init(init_data[:, 0]) - - # Forgetting param - ku = 0.005 - t0 = 200 - lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0) - nu0 = 0.2 - nu = N.zeros((len(lamb), 1)) - nu[0] = nu0 - for i in range(1, len(lamb)): - nu[i] = 1./(1 + lamb[i] / nu[i-1]) - - # object version of online EM - def blop(): - #for t in range(nframes): - # #assert ogmm.cw is ogmm.pw - # #assert ogmm.cva is ogmm.pva - # #assert ogmm.cmu is ogmm.pmu - # #a, b, c = ogmm.compute_sufficient_statistics_frame(data[t, 0], nu[t]) - # ###ogmm.blop(data[t,0], nu[t]) - # #ogmm.update_em_frame(a, b, c) - # ogmm.compute_em_frame(data[t, 0], nu[t]) - [ogmm.compute_em_frame(data[t, 0], nu[t]) for t in range(nframes)] - blop() - - #ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva) - print ogmm.cw - print ogmm.cmu - print ogmm.cva - - - -if __name__ == '__main__': - #import hotshot, hotshot.stats - #profile_file = 'onem1.prof' - #prof = hotshot.Profile(profile_file, lineevents=1) - #prof.runcall(test_v1) - #p = hotshot.stats.load(profile_file) - #print p.sort_stats('cumulative').print_stats(20) - #prof.close() - - #import hotshot, hotshot.stats - #profile_file = 'onem2.prof' - #prof = hotshot.Profile(profile_file, lineevents=1) - #prof.runcall(test_v2) - #p = hotshot.stats.load(profile_file) - #print p.sort_stats('cumulative').print_stats(20) - #prof.close() - - import hotshot, hotshot.stats - profile_file = 'onem3.prof' - prof = hotshot.Profile(profile_file, lineevents=1) - prof.runcall(test_v3) - p = hotshot.stats.load(profile_file) - print p.sort_stats('cumulative').print_stats(20) - prof.close() - - import hotshot, hotshot.stats - profile_file = 'onem4.prof' - prof = hotshot.Profile(profile_file, lineevents=1) - prof.runcall(test_v4) - p = hotshot.stats.load(profile_file) - print p.sort_stats('cumulative').print_stats(20) - prof.close() - #test_v1() - #test_v2() - #test_v3() diff --git a/scikits/learn/em/setup.py b/scikits/learn/em/setup.py deleted file mode 100644 index 47ebe2b53c4f88c557bf7609be7c3a3af3b92240..0000000000000000000000000000000000000000 --- a/scikits/learn/em/setup.py +++ /dev/null @@ -1,33 +0,0 @@ -#! /usr/bin/env python -# Last Change: Mon Jul 02 02:00 PM 2007 J -# TODO: -# - check how to handle cmd line build options with distutils and use -# it in the building process - -"""This is a small python package to estimate Gaussian Mixtures Models -from data, using Expectation Maximization. - -Maximum likelihood EM for mixture of Gaussian is implemented, with BIC computation -for number of cluster assessment. - -There is also an experimental online EM version (the EM is updated for each new -sample), and I plan to add Variational Bayes and/or MCMC support for Bayesian approach -for estimating meta parameters of mixtures. """ -from os.path import join - -def configuration(parent_package='',top_path=None, package_name='em'): - from numpy.distutils.misc_util import Configuration - config = Configuration('em',parent_package, top_path) - config.add_data_dir('examples') - config.add_data_dir('tests') - config.add_data_dir('profile_data') - config.add_extension('c_gden', - sources=[join('src', 'c_gden.c')]) - config.add_extension('_rawden', - sources=[join('src', 'pure_den.c')]) - - return config - -if __name__ == "__main__": - from numpy.distutils.core import setup - setup(**configuration(top_path='').todict()) diff --git a/scikits/learn/em/src/Makefile b/scikits/learn/em/src/Makefile deleted file mode 100644 index 854b4106ee55d0e4e4158f7789fe705c940185ca..0000000000000000000000000000000000000000 --- a/scikits/learn/em/src/Makefile +++ /dev/null @@ -1,42 +0,0 @@ -CC = colorgcc -LD = gcc - -PYREX = python2.4-pyrexc - -PYTHONINC = -I/usr/include/python2.4 -NUMPYINC = -I/home/david/local/lib/python2.4/site-packages/numpy/core/include -OPTIMS = -O3 -funroll-all-loops -march=pentium4 -msse2 -WARN = -Wall -W -Winline -Wstrict-prototypes -Wmissing-prototypes \ - -Waggregate-return -Wcast-align -Wcast-qual -Wnested-externs \ - -Wshadow -Wbad-function-cast -Wwrite-strings - -CFLAGS = $(PYTHONINC) $(NUMPYINC) $(OPTIMS) $(WARN) - -targets: c_gden.so _rawden.so - -c_gden.so: c_gden.o - $(LD) -shared -o $@ $< -Wl,-soname,$@ - -_rawden.so: pure_den.o - $(LD) -shared -o $@ $< -Wl,-soname,$@ - -c_gden.o: c_gden.c - $(CC) -c $(CFLAGS) -fPIC $< - -pure_den.o: pure_den.c - $(CC) -c $(CFLAGS) -fPIC $< - -#c_gmm.so: c_gmm.o -# $(LD) -shared -o $@ $< -Wl,-soname,$@ -# -#c_gmm.o: c_gmm.c -# $(CC) -c $(CFLAGS) -fPIC $< -# -#c_gmm.c: c_gmm.pyx c_numpy.pxd c_python.pxd -# $(PYREX) $< - -clean: - rm -f c_gmm.c - rm -f *.o - rm -f *.so - rm -f *.pyc diff --git a/scikits/learn/em/src/c_gden.c b/scikits/learn/em/src/c_gden.c deleted file mode 100644 index 82d6685b229741ee18ef20d84abc9602c6fe7f9a..0000000000000000000000000000000000000000 --- a/scikits/learn/em/src/c_gden.c +++ /dev/null @@ -1,38 +0,0 @@ -#include <stddef.h> -#include <stdio.h> -#define _USE_MATH_DEFINES /* for Visual Studio */ -#include <math.h> - -/* - * use va for caching, so its content is modified - */ -int gden_diag(const double *in, const size_t n, const size_t d, - const double* mu, double* va, double* out) -{ - size_t j, i; - double fac = 1.0; - double acc, tmp; - - /* - * Cache some precomputing - */ - for(j = 0; j < d; ++j) { - va[j] = 0.5/va[j]; - fac *= sqrt(va[j] / (M_PI) ); - va[j] *= -1.0; - } - - /* - * actual computing - */ - for(i = 0; i < n; ++i) { - acc = 0; - for(j = 0; j < d; ++j) { - tmp = (in[i *d + j] - mu[j]); - acc += tmp * tmp * va[j]; - } - out[i] = exp(acc) * fac; - } - - return 0; -} diff --git a/scikits/learn/em/src/c_gmm.pyx b/scikits/learn/em/src/c_gmm.pyx deleted file mode 100644 index 2a670cbb81d42619a2e8469cec053af5ee4a8614..0000000000000000000000000000000000000000 --- a/scikits/learn/em/src/c_gmm.pyx +++ /dev/null @@ -1,66 +0,0 @@ -# Last Change: Thu Jul 13 04:00 PM 2006 J - -cimport c_numpy -cimport c_python -import numpy - -c_numpy.import_array() - -# pyrex version of _vq. Much faster in high dimension/high number of K -# (ie K > 3-4) -def _vq(data, init): - (n, d) = data.shape - label = numpy.zeros(n, int) - _imp_vq(data, init, label) - - return label - -def _imp_vq(c_numpy.ndarray data, c_numpy.ndarray init, c_numpy.ndarray label): - cdef int n - cdef int d - cdef int nc - cdef int i - cdef int j - cdef int k - cdef int *labeld - cdef double *da, *code - cdef double dist - cdef double acc - - n = data.dimensions[0] - d = data.dimensions[1] - nc = init.dimensions[0] - - if not data.dtype == numpy.dtype(numpy.float64): - print '_vq not (yet) implemented for dtype %s'%dtype.name - return - da = (<double*>data.data) - - if not init.dtype == numpy.dtype(numpy.float64): - print '_vq not (yet) implemented for dtype %s'%dtype.name - return - code = (<double*>init.data) - - if not label.dtype == numpy.dtype(numpy.int32): - print '_vq not (yet) implemented for dtype %s'%dtype.name - return - labeld = (<int*>label.data) - - for i from 0<=i<n: - acc = 0 - lab = 0 - for j from 0<=j<d: - acc = acc + (da[i * d + j] - code[j]) * \ - (da[i * d + j] - code[j]) - dist = acc - for k from 1<=k<nc: - acc = 0 - for j from 0<=j<d: - acc = acc + (da[i * d + j] - code[k * d + j]) * \ - (da[i * d + j] - code[k * d + j]) - if acc < dist: - dist = acc - lab = k - labeld[i] = lab - - return lab diff --git a/scikits/learn/em/src/pure_den.c b/scikits/learn/em/src/pure_den.c deleted file mode 100644 index 27ca134427e3cc7777ab5eff686507cb7c50d307..0000000000000000000000000000000000000000 --- a/scikits/learn/em/src/pure_den.c +++ /dev/null @@ -1,462 +0,0 @@ -/* - * Last Change: Mon May 28 01:00 PM 2007 J - * - * Pure C module because ctypes cannot be used here for performance reasons - * (function calls are the primary bottleneck) - */ -#include <Python.h> -#include <numpy/arrayobject.h> - -#define _USE_MATH_DEFINES -#include <math.h> - -PyObject* compute_ss_frame_1d_py(PyObject* dum, PyObject *arg); -PyObject* compute_em_frame_1d_py(PyObject* dum, PyObject *arg); - -/* - * Pure C methods - */ -static int compute_ss_frame_1d(double sample, int nc, double* w, double* mu, double *va, - double *cx, double *cxx, double nu); -static int update_em_frame_1d(const double* w, const double* cx, const double *cxx, - int nc, double *cmu, double *cva); - -static PyMethodDef mymethods[] = { - {"compute_ss_frame_1d", compute_ss_frame_1d_py, METH_VARARGS, ""}, - {"compute_em_frame_1d", compute_em_frame_1d_py, METH_VARARGS, ""}, - {NULL, NULL, 0, NULL} /* Sentinel */ -}; - -/* - * function table - */ -PyMODINIT_FUNC init_rawden(void) -{ - (void)Py_InitModule("_rawden", mymethods); - import_array(); -} - -PyObject* compute_ss_frame_1d_py(PyObject* self, PyObject *args) -{ - PyObject *w, *mu, *va, *cx, *cxx; - PyObject *w_a, *mu_a, *va_a, *cx_a, *cxx_a; - double nu, sample; - npy_intp ndim, *dims, len; - double *mu_ca, *va_ca, *w_ca, *cx_ca, *cxx_ca; - - const npy_intp mrank = 1; - - int st; - - /* - * Init python object holder to NULL so that we can use Py_XDECREF on all - * the objets when something is woring - */ - w = NULL; - mu = NULL; - va = NULL; - cx = NULL; - cxx = NULL; - - w_a = NULL; - mu_a = NULL; - va_a = NULL; - cx_a = NULL; - cxx_a = NULL; - /* - * Parsing of args: w, cx and cxx are inout - */ - if (!PyArg_ParseTuple(args, "dOOOOOd", &sample, &w, &mu, &va, &cx, &cxx, &nu)){ - return NULL; - } - if ( (nu > 1) | (nu <= 0) ) { - PyErr_SetString(PyExc_TypeError, "nu should be between 0 and 1"); - return NULL; - } - - /* inout entries */ - w_a = PyArray_FROM_OTF(w, NPY_DOUBLE, NPY_INOUT_ARRAY); - if (w_a == NULL) { - PyErr_SetString(PyExc_TypeError, "w array not convertible"); - return NULL; - } - - cx_a = PyArray_FROM_OTF(cx, NPY_DOUBLE, NPY_INOUT_ARRAY); - if (cx_a == NULL) { - PyErr_SetString(PyExc_TypeError, "cx array not convertible"); - goto fail; - } - - cxx_a = PyArray_FROM_OTF(cxx, NPY_DOUBLE, NPY_INOUT_ARRAY); - if (cxx_a == NULL) { - PyErr_SetString(PyExc_TypeError, "cxx array not convertible"); - goto fail; - } - - /* in only entries */ - mu_a = PyArray_FROM_OTF(mu, NPY_DOUBLE, NPY_IN_ARRAY); - if (mu_a == NULL) { - PyErr_SetString(PyExc_TypeError, "mu array not convertible"); - goto fail; - } - - va_a = PyArray_FROM_OTF(va, NPY_DOUBLE, NPY_IN_ARRAY); - if (va_a == NULL) { - PyErr_SetString(PyExc_TypeError, "va array not convertible"); - goto fail; - } - - /* - * Check that in and out have same size and same rank - */ - ndim = PyArray_NDIM(w_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "w rank should be 1"); - goto fail; - } - ndim = PyArray_NDIM(cx_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "cx rank should be 1"); - goto fail; - } - ndim = PyArray_NDIM(cxx_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "cxx rank should be 1"); - goto fail; - } - ndim = PyArray_NDIM(mu_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "mu rank should be 1"); - goto fail; - } - ndim = PyArray_NDIM(va_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "va rank should be 1"); - goto fail; - } - - dims = PyArray_DIMS(w_a); - len = dims[0]; - //fprintf(stderr, "%s:%s, len is %d\n", __FILE__, __func__, len); - dims = PyArray_DIMS(cx_a); - if(dims[0] != len) { - PyErr_SetString(PyExc_TypeError, "cx shape should match !"); - goto fail; - } - dims = PyArray_DIMS(cxx_a); - if(dims[0] != len) { - PyErr_SetString(PyExc_TypeError, "cxx shape should match !"); - goto fail; - } - dims = PyArray_DIMS(mu_a); - if(dims[0] != len) { - PyErr_SetString(PyExc_TypeError, "mu_a shape should match !"); - goto fail; - } - dims = PyArray_DIMS(va_a); - if(dims[0] != len) { - PyErr_SetString(PyExc_TypeError, "va_a shape should match !"); - goto fail; - } - - /* - * Get pointer to the data - */ - w_ca = PyArray_DATA(w_a); - if (w_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for w_ca"); - goto fail; - } - cx_ca = PyArray_DATA(cx_a); - if (cx_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for cx_ca"); - goto fail; - } - cxx_ca = PyArray_DATA(cxx_a); - if (w_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for cxx_ca"); - goto fail; - } - mu_ca = PyArray_DATA(mu_a); - if (mu_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for mu_ca"); - goto fail; - } - va_ca = PyArray_DATA(va_a); - if (va_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for va_ca"); - goto fail; - } - /* - * Call actual implementation - */ - st = compute_ss_frame_1d(sample, len, w_ca, mu_ca, va_ca, cx_ca, cxx_ca, nu); - if (st) { - PyErr_SetString(PyExc_TypeError, "Error while calling multi_gauss...."); - goto fail; - } - - Py_DECREF(w_a); - Py_DECREF(cx_a); - Py_DECREF(cxx_a); - Py_DECREF(mu_a); - Py_DECREF(va_a); - - Py_INCREF(Py_None); - return Py_None; - -fail: - Py_XDECREF(w_a); - Py_XDECREF(cx_a); - Py_XDECREF(cxx_a); - Py_XDECREF(mu_a); - Py_XDECREF(va_a); - return NULL; -} - -PyObject* compute_em_frame_1d_py(PyObject* self, PyObject *args) -{ - PyObject *w, *mu, *va, *cx, *cxx; - PyObject *w_a, *mu_a, *va_a, *cx_a, *cxx_a; - double nu, sample; - npy_intp ndim, *dims, len; - double *mu_ca, *va_ca, *w_ca, *cx_ca, *cxx_ca; - - const npy_intp mrank = 1; - - int st; - - /* - * Init python object holder to NULL so that we can use Py_XDECREF on all - * the objets when something is woring - */ - w = NULL; - mu = NULL; - va = NULL; - cx = NULL; - cxx = NULL; - - w_a = NULL; - mu_a = NULL; - va_a = NULL; - cx_a = NULL; - cxx_a = NULL; - /* - * Parsing of args: w, cx and cxx are inout - */ - if (!PyArg_ParseTuple(args, "dOOOOOd", &sample, &w, &mu, &va, &cx, &cxx, &nu)){ - return NULL; - } - if ( (nu > 1) | (nu <= 0) ) { - PyErr_SetString(PyExc_TypeError, "nu should be between 0 and 1"); - return NULL; - } - - /* inout entries */ - w_a = PyArray_FROM_OTF(w, NPY_DOUBLE, NPY_INOUT_ARRAY); - if (w_a == NULL) { - PyErr_SetString(PyExc_TypeError, "w array not convertible"); - return NULL; - } - - cx_a = PyArray_FROM_OTF(cx, NPY_DOUBLE, NPY_INOUT_ARRAY); - if (cx_a == NULL) { - PyErr_SetString(PyExc_TypeError, "cx array not convertible"); - goto fail; - } - - cxx_a = PyArray_FROM_OTF(cxx, NPY_DOUBLE, NPY_INOUT_ARRAY); - if (cxx_a == NULL) { - PyErr_SetString(PyExc_TypeError, "cxx array not convertible"); - goto fail; - } - - /* in only entries */ - mu_a = PyArray_FROM_OTF(mu, NPY_DOUBLE, NPY_IN_ARRAY); - if (mu_a == NULL) { - PyErr_SetString(PyExc_TypeError, "mu array not convertible"); - goto fail; - } - - va_a = PyArray_FROM_OTF(va, NPY_DOUBLE, NPY_IN_ARRAY); - if (va_a == NULL) { - PyErr_SetString(PyExc_TypeError, "va array not convertible"); - goto fail; - } - - /* - * Check that in and out have same size and same rank - */ - ndim = PyArray_NDIM(w_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "w rank should be 1"); - goto fail; - } - ndim = PyArray_NDIM(cx_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "cx rank should be 1"); - goto fail; - } - ndim = PyArray_NDIM(cxx_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "cxx rank should be 1"); - goto fail; - } - ndim = PyArray_NDIM(mu_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "mu rank should be 1"); - goto fail; - } - ndim = PyArray_NDIM(va_a); - if(ndim != mrank) { - PyErr_SetString(PyExc_TypeError, "va rank should be 1"); - goto fail; - } - - dims = PyArray_DIMS(w_a); - len = dims[0]; - //fprintf(stderr, "%s:%s, len is %d\n", __FILE__, __func__, len); - dims = PyArray_DIMS(cx_a); - if(dims[0] != len) { - PyErr_SetString(PyExc_TypeError, "cx shape should match !"); - goto fail; - } - dims = PyArray_DIMS(cxx_a); - if(dims[0] != len) { - PyErr_SetString(PyExc_TypeError, "cxx shape should match !"); - goto fail; - } - dims = PyArray_DIMS(mu_a); - if(dims[0] != len) { - PyErr_SetString(PyExc_TypeError, "mu_a shape should match !"); - goto fail; - } - dims = PyArray_DIMS(va_a); - if(dims[0] != len) { - PyErr_SetString(PyExc_TypeError, "va_a shape should match !"); - goto fail; - } - - /* - * Get pointer to the data - */ - w_ca = PyArray_DATA(w_a); - if (w_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for w_ca"); - goto fail; - } - cx_ca = PyArray_DATA(cx_a); - if (cx_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for cx_ca"); - goto fail; - } - cxx_ca = PyArray_DATA(cxx_a); - if (w_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for cxx_ca"); - goto fail; - } - mu_ca = PyArray_DATA(mu_a); - if (mu_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for mu_ca"); - goto fail; - } - va_ca = PyArray_DATA(va_a); - if (va_ca == NULL) { - PyErr_SetString(PyExc_TypeError, "Unknown Error for va_ca"); - goto fail; - } - /* - * Call actual implementation - */ - st = compute_ss_frame_1d(sample, len, w_ca, mu_ca, va_ca, cx_ca, cxx_ca, nu); - if (st) { - PyErr_SetString(PyExc_TypeError, "Error while calling multi_gauss...."); - goto fail; - } - st = update_em_frame_1d(w_ca, cx_ca, cxx_ca, len, mu_ca, va_ca); - if (st) { - PyErr_SetString(PyExc_TypeError, "Error while calling update_em_frame_1d...."); - goto fail; - } - - Py_DECREF(w_a); - Py_DECREF(cx_a); - Py_DECREF(cxx_a); - Py_DECREF(mu_a); - Py_DECREF(va_a); - - Py_INCREF(Py_None); - return Py_None; - -fail: - Py_XDECREF(w_a); - Py_XDECREF(cx_a); - Py_XDECREF(cxx_a); - Py_XDECREF(mu_a); - Py_XDECREF(va_a); - return NULL; -} - -int compute_ss_frame_1d(double sample, int nc, double* w, double* mu, double *va, - double *cx, double *cxx, double nu) -{ - /* - * TODO: check va division - */ - int i; - double inva, fac, *gam, acc; - - gam = malloc(sizeof(*gam) * nc); - if (gam == NULL) { - return -1; - } - - /* Compute gamma */ - acc = 0; - for (i = 0; i < nc; ++i) { - inva = 1/va[i]; - fac = 1 / sqrt(2 * M_PI * va[i]); - gam[i] = fac * exp( -0.5 * inva * (sample - mu[i]) * (sample - mu[i])); - gam[i] *= w[i]; - acc += gam[i]; - } - /* Normalize gamma */ - for (i = 0; i < nc; ++i) { - gam[i] /= acc; - } - - /* Compute new SS from EM (cx and cxx) */ - for (i = 0; i < nc; ++i) { - w[i] *= (1 - nu); - w[i] += nu * gam[i]; - cx[i] = (1 - nu) * cx[i] + nu * sample * gam[i]; - cxx[i] = (1 - nu) * cxx[i] + nu * sample * sample * gam[i]; - } - - free(gam); - - return 0; -} - -/* - * update mu and va from SS w, cx and cxx. Only mu and va are modified - * all arrays have same length nc - */ -int update_em_frame_1d(const double* cw, const double* cx, const double *cxx, - int nc, double *cmu, double *cva) -{ - /* - * TODO: check va division - */ - int i; - double invw; - - /* Compute new SS from EM (cx and cxx) */ - for (i = 0; i < nc; ++i) { - invw = 1/cw[i]; - cmu[i] = cx[i] * invw; - cva[i] = cxx[i] * invw - cmu[i] * cmu[i]; - } - - return 0; -} diff --git a/scikits/learn/em/tests/__init__.py b/scikits/learn/em/tests/__init__.py deleted file mode 100644 index e9a78bf4fc70908129b74c73c3966ee98727b0dd..0000000000000000000000000000000000000000 --- a/scikits/learn/em/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""This is here just to make local imports from this module""" diff --git a/scikits/learn/em/tests/diag_1d_3k.mat b/scikits/learn/em/tests/diag_1d_3k.mat deleted file mode 100644 index 8e5e93a770c6a4e8588399fa1fc996bb9a782a9b..0000000000000000000000000000000000000000 Binary files a/scikits/learn/em/tests/diag_1d_3k.mat and /dev/null differ diff --git a/scikits/learn/em/tests/diag_1d_4k.mat b/scikits/learn/em/tests/diag_1d_4k.mat deleted file mode 100644 index 103a598e4df25670dec482fabb0b40eae8ce1b07..0000000000000000000000000000000000000000 Binary files a/scikits/learn/em/tests/diag_1d_4k.mat and /dev/null differ diff --git a/scikits/learn/em/tests/diag_2d_3k.mat b/scikits/learn/em/tests/diag_2d_3k.mat deleted file mode 100644 index 36ded5a87dfeaf190b956145fb1d5f5f009f90e1..0000000000000000000000000000000000000000 Binary files a/scikits/learn/em/tests/diag_2d_3k.mat and /dev/null differ diff --git a/scikits/learn/em/tests/full_2d_3k.mat b/scikits/learn/em/tests/full_2d_3k.mat deleted file mode 100644 index 41d9df341815536336eb6d0f1896208030c901da..0000000000000000000000000000000000000000 Binary files a/scikits/learn/em/tests/full_2d_3k.mat and /dev/null differ diff --git a/scikits/learn/em/tests/generate_tests_data.py b/scikits/learn/em/tests/generate_tests_data.py deleted file mode 100644 index 302f4448ae41c6bf1f3350c3a498f48056a215a1..0000000000000000000000000000000000000000 --- a/scikits/learn/em/tests/generate_tests_data.py +++ /dev/null @@ -1,98 +0,0 @@ -#! /usr/bin/env python -# Last Change: Sun Sep 07 04:00 PM 2008 J - -# This script generates some random data used for testing EM implementations. -import copy -import numpy as N -from scipy.io import savemat, loadmat - -from scikits.learn.em import GM, GMM, EM - -def generate_dataset(d, k, mode, nframes): - """Generate a dataset useful for EM anf GMM testing. - - returns: - data : ndarray - data from the true model. - tgm : GM - the true model (randomly generated) - gm0 : GM - the initial model - gm : GM - the trained model - """ - # Generate a model - w, mu, va = GM.gen_param(d, k, mode, spread = 2.0) - tgm = GM.fromvalues(w, mu, va) - - # Generate data from the model - data = tgm.sample(nframes) - - # Run EM on the model, by running the initialization separetely. - gmm = GMM(GM(d, k, mode), 'test') - gmm.init_random(data) - gm0 = copy.copy(gmm.gm) - - gmm = GMM(copy.copy(gmm.gm), 'test') - em = EM() - em.train(data, gmm) - - return data, tgm, gm0, gmm.gm - -def save_dataset(filename, data, tgm, gm0, gm): - dic = {'tw': tgm.w, 'tmu': tgm.mu, 'tva': tgm.va, - 'w0': gm0.w, 'mu0' : gm0.mu, 'va0': gm0.va, - 'w': gm.w, 'mu': gm.mu, 'va': gm.va, - 'data': data} - savemat(filename, dic) - -def doall(d, k, mode): - import pylab as P - - data, tgm, gm0, gm = generate_dataset(d, k, mode, 500) - filename = mode + '_%dd' % d + '_%dk.mat' % k - save_dataset(filename, data, tgm, gm0, gm) - - if d == 1: - P.subplot(2, 1, 1) - gm0.plot1d() - h = tgm.plot1d(gpdf = True) - P.hist(data[:, 0], 20, normed = 1, fill = False) - - P.subplot(2, 1, 2) - gm.plot1d() - tgm.plot1d(gpdf = True) - P.hist(data[:, 0], 20, normed = 1, fill = False) - else: - P.subplot(2, 1, 1) - gm0.plot() - h = tgm.plot() - [i.set_color('g') for i in h] - P.plot(data[:, 0], data[:, 1], '.') - - P.subplot(2, 1, 2) - gm.plot() - h = tgm.plot() - [i.set_color('g') for i in h] - P.plot(data[:, 0], data[:, 1], '.') - - P.show() - -if __name__ == '__main__': - N.random.seed(0) - d = 2 - k = 3 - mode = 'full' - doall(d, k, mode) - - N.random.seed(0) - d = 2 - k = 3 - mode = 'diag' - doall(d, k, mode) - - N.random.seed(0) - d = 1 - k = 4 - mode = 'diag' - doall(d, k, mode) diff --git a/scikits/learn/em/tests/test_densities.py b/scikits/learn/em/tests/test_densities.py deleted file mode 100644 index c8be9c3342632a72a13b81382b23f3d948dbcc01..0000000000000000000000000000000000000000 --- a/scikits/learn/em/tests/test_densities.py +++ /dev/null @@ -1,215 +0,0 @@ -#! /usr/bin/env python -# Last Change: Sun Sep 07 04:00 PM 2008 J - -# TODO: -# - having "fake tests" to check that all mode (scalar, diag and full) are -# executables -# - having a dataset to check against - -import sys -from unittest import TestCase - -import numpy as N -from numpy.testing import assert_array_almost_equal, assert_array_equal - -from scikits.learn.em.densities import gauss_den, \ - multiple_gauss_den, logsumexp, gauss_ell - -from scikits.learn.em.tests.testcommon import DEF_DEC - -class TestDensities(TestCase): - def _generate_test_data_1d(self): - self.va = 2.0 - self.mu = 1.0 - self.X = N.linspace(-2, 2, 10)[:, N.newaxis] - - self.Yt = N.array([0.02973257230591, 0.05512079811082, - 0.09257745306945, 0.14086453882683, 0.19418015562214, - 0.24250166773127, 0.27436665745048, 0.28122547107069, - 0.26114678964743, 0.21969564473386]) - - def _generate_test_data_2d_diag(self): - #============================ - # Small test in 2d (diagonal) - #============================ - self.mu = N.atleast_2d([-1.0, 2.0]) - self.va = N.atleast_2d([2.0, 3.0]) - - self.X = N.zeros((10, 2)) - self.X[:,0] = N.linspace(-2, 2, 10) - self.X[:,1] = N.linspace(-1, 3, 10) - - self.Yt = N.array([0.01129091565384, 0.02025416837152, - 0.03081845516786, 0.03977576221540, 0.04354490552910, - 0.04043592581117, 0.03184994053539, 0.02127948225225, - 0.01205937178755, 0.00579694938623 ]) - - - def _generate_test_data_2d_full(self): - #============================ - # Small test in 2d (full mat) - #============================ - self.mu = N.array([[0.2, -1.0]]) - self.va = N.array([[1.2, 0.1], [0.1, 0.5]]) - X1 = N.linspace(-2, 2, 10)[:, N.newaxis] - X2 = N.linspace(-3, 3, 10)[:, N.newaxis] - self.X = N.concatenate(([X1, X2]), 1) - - self.Yt = N.array([0.00096157109751, 0.01368908714856, - 0.07380823191162, 0.15072050533842, - 0.11656739937861, 0.03414436965525, - 0.00378789836599, 0.00015915297541, - 0.00000253261067, 0.00000001526368]) - -#===================== -# Basic accuracy tests -#===================== -class test_py_implementation(TestDensities): - def _test(self, level, decimal = DEF_DEC): - Y = gauss_den(self.X, self.mu, self.va) - assert_array_almost_equal(Y, self.Yt, decimal) - - def _test_log(self, level, decimal = DEF_DEC): - Y = gauss_den(self.X, self.mu, self.va, log = True) - assert_array_almost_equal(N.exp(Y), self.Yt, decimal) - - def test_2d_diag(self, level = 0): - self._generate_test_data_2d_diag() - self._test(level) - - def test_2d_full(self, level = 0): - self._generate_test_data_2d_full() - self._test(level) - - def test_1d(self, level = 0): - self._generate_test_data_1d() - self._test(level) - - def test_2d_diag_log(self, level = 0): - self._generate_test_data_2d_diag() - self._test_log(level) - - def test_2d_full_log(self, level = 0): - self._generate_test_data_2d_full() - self._test_log(level) - - def test_1d_log(self, level = 0): - self._generate_test_data_1d() - self._test_log(level) - -# #===================== -# # Basic speed tests -# #===================== -# class test_speed(TestCase): -# def __init__(self, *args, **kw): -# TestCase.__init__(self, *args, **kw) -# import sys -# import re -# try: -# a = open('/proc/cpuinfo').readlines() -# b = re.compile('cpu MHz') -# c = [i for i in a if b.match(i)] -# fcpu = float(c[0].split(':')[1]) -# self.fcpu = fcpu * 1e6 -# self.hascpu = True -# except: -# print "Could not read cpu frequency" -# self.hascpu = False -# self.fcpu = 0. -# -# def _prepare(self, n, d, mode): -# niter = 10 -# x = 0.1 * N.random.randn(n, d) -# mu = 0.1 * N.random.randn(d) -# if mode == 'diag': -# va = 0.1 * N.random.randn(d) ** 2 -# elif mode == 'full': -# a = N.random.randn(d, d) -# va = 0.1 * N.dot(a.T, a) -# st = self.measure("gauss_den(x, mu, va)", niter) -# return st / niter -# -# def _bench(self, n, d, mode): -# st = self._prepare(n, d, mode) -# print "%d dimension, %d samples, %s mode: %8.2f " % (d, n, mode, st) -# if self.hascpu: -# print "Cost per frame is %f; cost per sample is %f" % \ -# (st * self.fcpu / n, st * self.fcpu / n / d) -# -# def test1(self, level = 5): -# cls = self.__class__ -# for n, d in [(1e5, 1), (1e5, 5), (1e5, 10), (1e5, 30), (1e4, 100)]: -# self._bench(n, d, 'diag') -# for n, d in [(1e4, 2), (1e4, 5), (1e4, 10), (5000, 40)]: -# self._bench(n, d, 'full') - -#================ -# Logsumexp tests -#================ -class test_py_logsumexp(TestDensities): - """Class to compare logsumexp vs naive implementation.""" - - def naive_logsumexp(self, data): - return N.log(N.sum(N.exp(data), 1)) - - def test_1d(self): - data = N.random.randn(1e1)[:, N.newaxis] - mu = N.array([[-5], [-6]]) - va = N.array([[0.1], [0.1]]) - y = multiple_gauss_den(data, mu, va, log = True) - a1 = logsumexp(y) - a2 = self.naive_logsumexp(y) - assert_array_equal(a1, a2) - - def test_2d_full(self): - data = N.random.randn(1e1, 2) - mu = N.array([[-3, -1], [3, 3]]) - va = N.array([[1.1, 0.4], [0.6, 0.8], [0.4, 0.2], [0.3, 0.9]]) - y = multiple_gauss_den(data, mu, va, log = True) - a1 = logsumexp(y) - a2 = self.naive_logsumexp(y) - assert_array_almost_equal(a1, a2, DEF_DEC) - - def test_2d_diag(self): - data = N.random.randn(1e1, 2) - mu = N.array([[-3, -1], [3, 3]]) - va = N.array([[1.1, 0.4], [0.6, 0.8]]) - y = multiple_gauss_den(data, mu, va, log = True) - a1 = logsumexp(y) - a2 = self.naive_logsumexp(y) - assert_array_almost_equal(a1, a2, DEF_DEC) - -#======================= -# Test C implementation -#======================= -class test_c_implementation(TestDensities): - def _test(self, level, decimal = DEF_DEC): - try: - from scikits.learn.em._c_densities import gauss_den as c_gauss_den - Y = c_gauss_den(self.X, self.mu, self.va) - assert_array_almost_equal(Y, self.Yt, decimal) - except Exception, inst: - print "Error while importing C implementation, not tested" - print " -> (Import error was %s)" % inst - - def test_1d(self, level = 0): - self._generate_test_data_1d() - self._test(level) - - def test_2d_diag(self, level = 0): - self._generate_test_data_2d_diag() - self._test(level) - - def test_2d_full(self, level = 0): - self._generate_test_data_2d_full() - self._test(level) - -class test_gauss_ell(TestCase): - def test_dim(self): - gauss_ell([0, 1], [1, 2.], [0, 1]) - try: - gauss_ell([0, 1], [1, 2.], [0, 2]) - raise AssertionError("this call should not succeed, bogus dim.") - except ValueError, e: - print "Call with bogus dim did not succeed, OK" - diff --git a/scikits/learn/em/tests/test_gauss_mix.py b/scikits/learn/em/tests/test_gauss_mix.py deleted file mode 100644 index 376e673b823f47153aef80c7d5b2956e9ca9c677..0000000000000000000000000000000000000000 --- a/scikits/learn/em/tests/test_gauss_mix.py +++ /dev/null @@ -1,86 +0,0 @@ -#! /usr/bin/env python -# Last Change: Sun Sep 07 04:00 PM 2008 J - -# For now, just test that all mode/dim execute correctly - -import sys -from unittest import TestCase - -import numpy as N -from numpy.testing import assert_array_almost_equal - -from scikits.learn.em import GM -from scikits.learn.em.densities import multiple_gauss_den - -class test_BasicFunc(TestCase): - """Check that basic functionalities work.""" - def test_conf_ellip(self): - """Only test whether the call succeed. To check wether the result is - OK, you have to plot the results.""" - d = 3 - k = 3 - w, mu, va = GM.gen_param(d, k) - gm = GM.fromvalues(w, mu, va) - gm.conf_ellipses() - - def test_1d_bogus(self): - """Check that functions which do not make sense for 1d fail nicely.""" - d = 1 - k = 2 - w, mu, va = GM.gen_param(d, k) - gm = GM.fromvalues(w, mu, va) - try: - gm.conf_ellipses() - raise AssertionError("This should not work !") - except ValueError, e: - self.assertEqual(str(e), - "This function does not make sense for 1d mixtures.") - - try: - gm.density_on_grid() - raise AssertionError("This should not work !") - except ValueError, e: - self.assertEqual(str(e), - "This function does not make sense for 1d mixtures.") - - def test_get_va(self): - """Test _get_va for diag and full mode.""" - d = 3 - k = 2 - ld = 2 - dim = [0, 2] - w, mu, va = GM.gen_param(d, k, 'full') - va = N.arange(d*d*k).reshape(d*k, d) - gm = GM.fromvalues(w, mu, va) - - tva = N.empty(ld * ld * k) - for i in range(k * ld * ld): - tva[i] = dim[i%ld] + (i % 4)/ ld * dim[1] * d + d*d * (i / (ld*ld)) - tva = tva.reshape(ld * k, ld) - sva = gm._get_va(dim) - assert N.all(sva == tva) - - def test_2d_diag_pdf(self): - d = 2 - w = N.array([0.4, 0.6]) - mu = N.array([[0., 2], [-1, -2]]) - va = N.array([[1, 0.5], [0.5, 1]]) - x = N.random.randn(100, 2) - gm = GM.fromvalues(w, mu, va) - y1 = N.sum(multiple_gauss_den(x, mu, va) * w, 1) - y2 = gm.pdf(x) - assert_array_almost_equal(y1, y2) - - def test_2d_diag_logpdf(self): - d = 2 - w = N.array([0.4, 0.6]) - mu = N.array([[0., 2], [-1, -2]]) - va = N.array([[1, 0.5], [0.5, 1]]) - x = N.random.randn(100, 2) - gm = GM.fromvalues(w, mu, va) - y1 = N.sum(multiple_gauss_den(x, mu, va) * w, 1) - y2 = gm.pdf(x, log = True) - assert_array_almost_equal(N.log(y1), y2) - -if __name__ == "__main__": - NumpyTest().run() diff --git a/scikits/learn/em/tests/test_gmm_em.py b/scikits/learn/em/tests/test_gmm_em.py deleted file mode 100644 index a2a935b614a445954a805ee2f6b7a0f8eea25706..0000000000000000000000000000000000000000 --- a/scikits/learn/em/tests/test_gmm_em.py +++ /dev/null @@ -1,198 +0,0 @@ -#! /usr/bin/env python -# Last Change: Sun Sep 07 04:00 PM 2008 J - -# For now, just test that all mode/dim execute correctly - -import sys -import os -from unittest import TestCase - -import numpy as N -from numpy.testing import assert_array_almost_equal - -from scikits.learn.em import GMM, GM, EM - -from scikits.learn.em.tests.testcommon import DEF_DEC - -curpath = os.path.dirname(__file__) - -def load_dataset(filename): - from scipy.io import loadmat - dic = loadmat(os.path.join(curpath, filename), squeeze_me = False, byte_order='little') - dic['w0'] = dic['w0'].squeeze() - dic['w'] = dic['w'].squeeze() - dic['tw'] = dic['tw'].squeeze() - return dic - -class EmTest(TestCase): - def _create_model_and_run_em(self, d, k, mode, nframes): - #+++++++++++++++++++++++++++++++++++++++++++++++++ - # Generate a model with k components, d dimensions - #+++++++++++++++++++++++++++++++++++++++++++++++++ - w, mu, va = GM.gen_param(d, k, mode, spread = 1.5) - gm = GM.fromvalues(w, mu, va) - # Sample nframes frames from the model - data = gm.sample(nframes) - - #++++++++++++++++++++++++++++++++++++++++++ - # Approximate the models with classical EM - #++++++++++++++++++++++++++++++++++++++++++ - # Init the model - lgm = GM(d, k, mode) - gmm = GMM(lgm, 'kmean') - - em = EM() - lk = em.train(data, gmm) - -class test_full_run(EmTest): - """This class only tests whether the algorithms runs. Do not check the - results.""" - def test_1d(self, level = 1): - d = 1 - k = 2 - mode = 'full' - nframes = int(1e2) - - #seed(1) - self._create_model_and_run_em(d, k, mode, nframes) - - def test_2d(self, level = 1): - d = 2 - k = 2 - mode = 'full' - nframes = int(1e2) - - #seed(1) - self._create_model_and_run_em(d, k, mode, nframes) - - def test_5d(self, level = 1): - d = 5 - k = 3 - mode = 'full' - nframes = int(1e2) - - #seed(1) - self._create_model_and_run_em(d, k, mode, nframes) - -class test_diag_run(EmTest): - """This class only tests whether the algorithms runs. Do not test the - results.""" - def test_1d(self, level = 1): - d = 1 - k = 2 - mode = 'diag' - nframes = int(1e2) - - #seed(1) - self._create_model_and_run_em(d, k, mode, nframes) - - def test_2d(self, level = 1): - d = 2 - k = 2 - mode = 'diag' - nframes = int(1e2) - - #seed(1) - self._create_model_and_run_em(d, k, mode, nframes) - - def test_5d(self, level = 1): - d = 5 - k = 3 - mode = 'diag' - nframes = int(1e2) - - #seed(1) - self._create_model_and_run_em(d, k, mode, nframes) - -class test_datasets(EmTest): - """This class tests whether the EM algorithms works using pre-computed - datasets.""" - def _test(self, dataset, log): - dic = load_dataset(dataset) - - gm = GM.fromvalues(dic['w0'], dic['mu0'], dic['va0']) - gmm = GMM(gm, 'test') - EM().train(dic['data'], gmm, log = log) - - assert_array_almost_equal(gmm.gm.w, dic['w'], DEF_DEC) - assert_array_almost_equal(gmm.gm.mu, dic['mu'], DEF_DEC) - assert_array_almost_equal(gmm.gm.va, dic['va'], DEF_DEC) - - def test_1d_full(self, level = 1): - d = 1 - k = 4 - mode = 'full' - # Data are exactly the same than in diagonal mode, just test that - # calling full mode works even in 1d, even if it is kind of stupid to - # do so - filename = 'diag_1d_4k.mat' - self._test(filename, log = False) - - def test_2d_full(self, level = 1): - d = 2 - k = 3 - mode = 'full' - filename = 'full_2d_3k.mat' - self._test(filename, log = False) - - def test_2d_full_log(self, level = 1): - d = 2 - k = 3 - mode = 'full' - filename = 'full_2d_3k.mat' - self._test(filename, log = True) - - def test_2d_diag(self, level = 1): - d = 2 - k = 3 - mode = 'diag' - filename = 'diag_2d_3k.mat' - self._test(filename, log = False) - - def test_2d_diag_log(self, level = 1): - d = 2 - k = 3 - mode = 'diag' - filename = 'diag_2d_3k.mat' - self._test(filename, log = True) - -class test_log_domain(EmTest): - """This class tests whether the GMM works in log domain.""" - def _test_common(self, d, k, mode): - dic = load_dataset('%s_%dd_%dk.mat' % (mode, d, k)) - - gm = GM.fromvalues(dic['w0'], dic['mu0'], dic['va0']) - gmm = GMM(gm, 'test') - - a, na = gmm.compute_responsabilities(dic['data']) - la, nla = gmm.compute_log_responsabilities(dic['data']) - - ta = N.log(a) - tna = N.log(na) - if not N.all(N.isfinite(ta)): - print "precision problem for %s, %dd, %dk, test need fixing" % (mode, d, k) - else: - assert_array_almost_equal(ta, la, DEF_DEC) - - if not N.all(N.isfinite(tna)): - print "precision problem for %s, %dd, %dk, test need fixing" % (mode, d, k) - else: - assert_array_almost_equal(tna, nla, DEF_DEC) - - def test_2d_diag(self, level = 1): - d = 2 - k = 3 - mode = 'diag' - self._test_common(d, k, mode) - - def test_1d_full(self, level = 1): - d = 1 - k = 4 - mode = 'diag' - self._test_common(d, k, mode) - - def test_2d_full(self, level = 1): - d = 2 - k = 3 - mode = 'full' - self._test_common(d, k, mode) diff --git a/scikits/learn/em/tests/test_online_em.py b/scikits/learn/em/tests/test_online_em.py deleted file mode 100644 index 770d37b57b832e4e212976a3f26776af08eb6854..0000000000000000000000000000000000000000 --- a/scikits/learn/em/tests/test_online_em.py +++ /dev/null @@ -1,230 +0,0 @@ -#! /usr/bin/env python -# Last Change: Sun Sep 07 04:00 PM 2008 J - -import copy - -import sys -from unittest import TestCase - -import numpy as N -from numpy.random import seed - -from scikits.learn.em import GM, GMM -from scikits.learn.em.online_em import OnGMM, OnGMM1d - -# #Optional: -# set_local_path() -# # import modules that are located in the same directory as this file. -# restore_path() - -# Error precision allowed (nb of decimals) -AR_AS_PREC = 12 -KM_ITER = 5 - -class OnlineEmTest(TestCase): - def _create_model(self, d, k, mode, nframes, emiter): - #+++++++++++++++++++++++++++++++++++++++++++++++++ - # Generate a model with k components, d dimensions - #+++++++++++++++++++++++++++++++++++++++++++++++++ - w, mu, va = GM.gen_param(d, k, mode, spread = 1.5) - gm = GM.fromvalues(w, mu, va) - # Sample nframes frames from the model - data = gm.sample(nframes) - - #++++++++++++++++++++++++++++++++++++++++++ - # Approximate the models with classical EM - #++++++++++++++++++++++++++++++++++++++++++ - # Init the model - lgm = GM(d, k, mode) - gmm = GMM(lgm, 'kmean') - gmm.init(data, niter = KM_ITER) - - self.gm0 = copy.copy(gmm.gm) - # The actual EM, with likelihood computation - for i in range(emiter): - g, tgd = gmm.compute_responsabilities(data) - gmm.update_em(data, g) - - self.data = data - self.gm = lgm - -class test_on_off_eq(OnlineEmTest): - def check_1d(self, level = 1): - d = 1 - k = 2 - mode = 'diag' - nframes = int(1e2) - emiter = 3 - - seed(1) - self._create_model(d, k, mode, nframes, emiter) - self._check(d, k, mode, nframes, emiter) - - def check_2d(self, level = 1): - d = 2 - k = 2 - mode = 'diag' - nframes = int(1e2) - emiter = 3 - - seed(1) - self._create_model(d, k, mode, nframes, emiter) - self._check(d, k, mode, nframes, emiter) - - def check_5d(self, level = 5): - d = 5 - k = 2 - mode = 'diag' - nframes = int(1e2) - emiter = 3 - - seed(1) - self._create_model(d, k, mode, nframes, emiter) - self._check(d, k, mode, nframes, emiter) - - def _check(self, d, k, mode, nframes, emiter): - #++++++++++++++++++++++++++++++++++++++++ - # Approximate the models with online EM - #++++++++++++++++++++++++++++++++++++++++ - # Learn the model with Online EM - ogm = GM(d, k, mode) - ogmm = OnGMM(ogm, 'kmean') - init_data = self.data - ogmm.init(init_data, niter = KM_ITER) - - # Check that online kmean init is the same than kmean offline init - ogm0 = copy.copy(ogm) - assert_array_equal(ogm0.w, self.gm0.w) - assert_array_equal(ogm0.mu, self.gm0.mu) - assert_array_equal(ogm0.va, self.gm0.va) - - # Forgetting param - lamb = N.ones((nframes, 1)) - lamb[0] = 0 - nu0 = 1.0 - nu = N.zeros((len(lamb), 1)) - nu[0] = nu0 - for i in range(1, len(lamb)): - nu[i] = 1./(1 + lamb[i] / nu[i-1]) - - # object version of online EM: the p* arguments are updated only at each - # epoch, which is equivalent to on full EM iteration on the - # classic EM algorithm - ogmm.pw = ogmm.cw.copy() - ogmm.pmu = ogmm.cmu.copy() - ogmm.pva = ogmm.cva.copy() - for e in range(emiter): - for t in range(nframes): - ogmm.compute_sufficient_statistics_frame(self.data[t], nu[t]) - ogmm.update_em_frame() - - # Change pw args only a each epoch - ogmm.pw = ogmm.cw.copy() - ogmm.pmu = ogmm.cmu.copy() - ogmm.pva = ogmm.cva.copy() - - # For equivalence between off and on, we allow a margin of error, - # because of round-off errors. - print " Checking precision of equivalence with offline EM trainer " - maxtestprec = 18 - try : - for i in range(maxtestprec): - assert_array_almost_equal(self.gm.w, ogmm.pw, decimal = i) - assert_array_almost_equal(self.gm.mu, ogmm.pmu, decimal = i) - assert_array_almost_equal(self.gm.va, ogmm.pva, decimal = i) - print "\t !! Precision up to %d decimals !! " % i - except AssertionError: - if i < AR_AS_PREC: - print """\t !!NOT OK: Precision up to %d decimals only, - outside the allowed range (%d) !! """ % (i, AR_AS_PREC) - raise AssertionError - else: - print "\t !!OK: Precision up to %d decimals !! " % i - -class test_on(OnlineEmTest): - def check_consistency(self): - d = 1 - k = 2 - mode = 'diag' - nframes = int(5e2) - emiter = 4 - - self._create_model(d, k, mode, nframes, emiter) - self._run_pure_online(d, k, mode, nframes) - - def check_1d_imp(self): - d = 1 - k = 2 - mode = 'diag' - nframes = int(5e2) - emiter = 4 - - self._create_model(d, k, mode, nframes, emiter) - gmref = self._run_pure_online(d, k, mode, nframes) - gmtest = self._run_pure_online_1d(d, k, mode, nframes) - - assert_array_almost_equal(gmref.w, gmtest.w, AR_AS_PREC) - assert_array_almost_equal(gmref.mu, gmtest.mu, AR_AS_PREC) - assert_array_almost_equal(gmref.va, gmtest.va, AR_AS_PREC) - - def _run_pure_online_1d(self, d, k, mode, nframes): - #++++++++++++++++++++++++++++++++++++++++ - # Approximate the models with online EM - #++++++++++++++++++++++++++++++++++++++++ - ogm = GM(d, k, mode) - ogmm = OnGMM1d(ogm, 'kmean') - init_data = self.data[0:nframes / 20, :] - ogmm.init(init_data[:, 0]) - - # Forgetting param - ku = 0.005 - t0 = 200 - lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0) - nu0 = 0.2 - nu = N.zeros((len(lamb), 1)) - nu[0] = nu0 - for i in range(1, len(lamb)): - nu[i] = 1./(1 + lamb[i] / nu[i-1]) - - # object version of online EM - for t in range(nframes): - # the assert are here to check we do not create copies - # unvoluntary for parameters - a, b, c = ogmm.compute_sufficient_statistics_frame(self.data[t, 0], nu[t]) - ogmm.update_em_frame(a, b, c) - - ogmm.gm.set_param(ogmm.cw, ogmm.cmu[:, N.newaxis], ogmm.cva[:, N.newaxis]) - - return ogmm.gm - def _run_pure_online(self, d, k, mode, nframes): - #++++++++++++++++++++++++++++++++++++++++ - # Approximate the models with online EM - #++++++++++++++++++++++++++++++++++++++++ - ogm = GM(d, k, mode) - ogmm = OnGMM(ogm, 'kmean') - init_data = self.data[0:nframes / 20, :] - ogmm.init(init_data) - - # Forgetting param - ku = 0.005 - t0 = 200 - lamb = 1 - 1/(N.arange(-1, nframes-1) * ku + t0) - nu0 = 0.2 - nu = N.zeros((len(lamb), 1)) - nu[0] = nu0 - for i in range(1, len(lamb)): - nu[i] = 1./(1 + lamb[i] / nu[i-1]) - - # object version of online EM - for t in range(nframes): - # the assert are here to check we do not create copies - # unvoluntary for parameters - assert ogmm.pw is ogmm.cw - assert ogmm.pmu is ogmm.cmu - assert ogmm.pva is ogmm.cva - ogmm.compute_sufficient_statistics_frame(self.data[t], nu[t]) - ogmm.update_em_frame() - - ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva) - - return ogmm.gm diff --git a/scikits/learn/em/tests/testcommon.py b/scikits/learn/em/tests/testcommon.py deleted file mode 100644 index 940b04c4e680a12d35299771b411a4990d09f509..0000000000000000000000000000000000000000 --- a/scikits/learn/em/tests/testcommon.py +++ /dev/null @@ -1 +0,0 @@ -DEF_DEC = 12 diff --git a/scikits/learn/setup.py b/scikits/learn/setup.py index 50b668f8b2aafbc68fd44ec5b803bcc20fc48649..5566137b6032fe50cf139ce3a247b974709bc704 100644 --- a/scikits/learn/setup.py +++ b/scikits/learn/setup.py @@ -11,7 +11,6 @@ def configuration(parent_package='',top_path=None): site_cfg = ConfigParser() site_cfg.read(get_standard_file('site.cfg')) - config.add_subpackage('em') config.add_subpackage('datasets') config.add_subpackage('feature_selection') config.add_subpackage('utils')