diff --git a/examples/em/README.txt b/examples/em/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c33ac9108e7da0b317db040d148072cc2a148004
--- /dev/null
+++ b/examples/em/README.txt
@@ -0,0 +1,11 @@
+
+Gaussian-mixture modeling
+--------------------------
+
+Examples concerning the `scikits.learn.em` package.
+
+.. warning::
+
+    The code in the `scikits.learn.em` submodule is not as mature as the
+    rest of the scikit and is subject to change.
+
diff --git a/examples/em/basic_example1.py b/examples/em/basic_example1.py
new file mode 100644
index 0000000000000000000000000000000000000000..96084fce3b54defd8b4281d0e4f71390332e6107
--- /dev/null
+++ b/examples/em/basic_example1.py
@@ -0,0 +1,53 @@
+"""
+Simple mixture model example
+=============================
+"""
+
+import numpy as np
+import pylab as pl
+from scikits.learn.em import GM
+
+################################################################################
+# Hyper parameters:
+# - k: number of clusters
+# - d: dimension
+k = 3
+d = 2
+
+################################################################################
+# Values for the weights, means and (diagonal) variances:
+# - the weights are an array of rank 1
+# - the means are expected to be of rank 2, with one row per component
+# - the variances are also expected to be of rank 2. For a diagonal model,
+# one row is one diagonal; for a full model, the first d rows form the
+# first covariance matrix, etc. In the full case, the variance matrix
+# should have k*d rows and d columns.
+w = np.array([0.2, 0.45, 0.35])
+mu = np.array([[4.1, 3], [1, 5], [-2, -3]])
+va = np.array([[1, 1.5], [3, 4], [2, 3.5]])
+
+################################################################################
+# First method: build the GM instance directly from the parameters
+# (both methods shown here are equivalent).
+gm = GM.fromvalues(w, mu, va)
+
+################################################################################
+# Second method to build a GM instance:
+gm = GM(d, k, mode = 'diag')
+# set_param checks that w, mu and va are consistent with k, d and the mode
+gm.set_param(w, mu, va)
+
+# Once set_param has been called, both methods are equivalent. The second
+# method is useful when using a GM object for learning (where
+# the learner class will set the parameters), whereas the first one
+# is useful when you need to quickly build a model
+# from existing values, without giving the hyper parameters explicitly.
+
+# Sample 1000 points from the Gaussian mixture defined by these
+# parameters (one row = one 2-dimensional sample)
+data = gm.sample(1000)
+
+# Plot the samples
+pl.plot(data[:, 0], data[:, 1], '.')
+# Plot the confidence ellipsoids at the 75 % level
+gm.plot(level = 0.75)
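+
+# Optional extra step (not in the original example): the same GM object can
+# also evaluate its density, so isodensity contours can be drawn on top of
+# the samples. density_on_grid() and contour() are used in exactly the same
+# way in the other examples of this directory.
+X, Y, Z, V = gm.density_on_grid()
+pl.contour(X, Y, Z, V)
+
+# When running the script standalone, an explicit show() is needed to
+# display the figure.
+pl.show()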
diff --git a/examples/em/basic_example2.py b/examples/em/basic_example2.py
new file mode 100644
index 0000000000000000000000000000000000000000..19615e7e614c117d7134a5199f60c54048236bee
--- /dev/null
+++ b/examples/em/basic_example2.py
@@ -0,0 +1,50 @@
+"""
+Another simple mixture model example
+=====================================
+"""
+
+from numpy.random import seed
+
+from scikits.learn.em import GM, GMM, EM
+
+# To reproduce the results, fix the random seed
+seed(1)
+
+################################################################################
+# Meta parameters of the model:
+# - k: number of components
+# - d: dimension of each Gaussian
+# - mode: mode of the covariance matrices: 'full' or 'diag' (string)
+# - nframes: number of frames (frame = one data point = one
+# row of d elements)
+k = 2
+d = 2
+mode = 'diag'
+nframes = 1000
+
+################################################################################
+# Create an artificial GM model and sample from it
+################################################################################
+
+w, mu, va = GM.gen_param(d, k, mode, spread = 1.5)
+gm = GM.fromvalues(w, mu, va)
+
+# Sample nframes frames from the model
+data = gm.sample(nframes)
+
+################################################################################
+# Learn the model with EM
+################################################################################
+
+# Create a model from a Gaussian mixture with kmean initialization
+lgm = GM(d, k, mode)
+gmm = GMM(lgm, 'kmean')
+
+# The actual EM, with likelihood computation. The threshold
+# is compared to the (linearly approximated) derivative of the likelihood
+em = EM()
+like = em.train(data, gmm, maxiter = 30, thresh = 1e-8)
+
+# The computed parameters are in gmm.gm, which is the same object as lgm
+# (remember, Python does not copy objects by default). You can, for example,
+# plot lgm against gm to compare them, as in the sketch below.
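+
+# One possible way to do that comparison (a sketch that simply mirrors the
+# plotting calls used in basic_example1.py and basic_example3.py):
+import pylab as pl
+pl.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+h = gm.plot(level = 0.75)
+[i.set_color('g') for i in h]
+h[0].set_label('true confidence ellipsoids')
+h = lgm.plot(level = 0.75)
+[i.set_color('r') for i in h]
+h[0].set_label('estimated confidence ellipsoids')
+pl.legend(loc = 0)
+pl.show()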
diff --git a/examples/em/basic_example3.py b/examples/em/basic_example3.py
new file mode 100644
index 0000000000000000000000000000000000000000..c953319de58888c3799f920fe5429feada7481f2
--- /dev/null
+++ b/examples/em/basic_example3.py
@@ -0,0 +1,67 @@
+"""
+Simple Gaussian-mixture model example
+======================================
+"""
+
+import numpy as np
+from numpy.random import seed
+
+from scikits.learn.em import GM, GMM, EM
+
+seed(2)
+
+k = 4
+d = 2
+mode = 'diag'
+nframes = 1000
+
+################################################################################
+# Create an artificial GM model and sample from it
+################################################################################
+w, mu, va = GM.gen_param(d, k, mode, spread = 1.0)
+gm = GM.fromvalues(w, mu, va)
+
+# Sample nframes frames from the model
+data = gm.sample(nframes)
+
+################################################################################
+# Learn the model with EM
+################################################################################
+
+# List of learned mixtures: lgm[i] is a mixture with i+1 components
+lgm = []
+kmax = 6
+bics = np.zeros(kmax)
+em = EM()
+for i in range(kmax):
+    lgm.append(GM(d, i+1, mode))
+
+    gmm = GMM(lgm[i], 'kmean')
+    em.train(data, gmm, maxiter = 30, thresh = 1e-10)
+    bics[i] = gmm.bic(data)
+
+print "Original model has %d clusters, the BIC says %d" % (k, np.argmax(bics)+1)
+
+################################################################################
+# Draw the model
+################################################################################
+import pylab as pl
+pl.subplot(3, 2, 1)
+
+for k in range(kmax):
+    pl.subplot(3, 2, k+1)
+    level = 0.9
+    pl.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+
+    # h keeps the handles of the plot, so that you can modify
+    # their parameters, e.g. label or color
+    h = lgm[k].plot(level = level)
+    [i.set_color('r') for i in h]
+    h[0].set_label('EM confidence ellipsoids')
+
+    h = gm.plot(level = level)
+    [i.set_color('g') for i in h]
+    h[0].set_label('Real confidence ellipsoids')
+
+pl.legend(loc = 0)
+pl.show()
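+
+# Note on the convention used above: gmm.bic() is defined here so that larger
+# values are better, hence the argmax. With the more usual "lower is better"
+# definition of the BIC, one would take the argmin instead.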
diff --git a/examples/em/discriminant_analysis.py b/examples/em/discriminant_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..86810386320f2633951c75a9b457bd86059e6032
--- /dev/null
+++ b/examples/em/discriminant_analysis.py
@@ -0,0 +1,128 @@
+"""
+Classification using mixtures of Gaussians
+===========================================
+
+Example of doing classification with mixtures of Gaussians. Note that this
+is really a toy example: we only use a simple learning/testing split, without
+cross validation.
+
+We use the famous iris dataset popularized by Sir R.A. Fisher. You can try to
+change the attributes used for classification, the number of components of the
+mixtures, etc..."""
+
+import numpy as N
+import pylab as P
+import matplotlib as MPL
+
+from scikits.learn.em import EM, GMM, GM
+import utils
+
+data = utils.iris.load()
+# cnames are the class names
+cnames = data.keys()
+
+#--------------------
+# Data pre processing
+#--------------------
+# We use 25 samples of each class (i.e. half of the iris data) for
+# learning, and the other half for testing. Only sepal width and petal
+# width are used.
+ln = 25
+tn = 25
+xdata = {}
+ydata = {}
+# learning data
+ldata = {}
+
+# You can change the attributes used here (sepal vs petal, width vs length)
+for i in cnames:
+    xdata[i] = data[i]['sepal width']
+    ydata[i] = data[i]['petal width']
+    ldata[i] = N.concatenate((xdata[i][:ln, N.newaxis],
+                              ydata[i][:ln, N.newaxis]),
+                             axis = 1)
+
+tx = N.concatenate([xdata[i][ln:] for i in cnames])
+ty = N.concatenate([ydata[i][ln:] for i in cnames])
+tdata = N.concatenate((tx[:, N.newaxis], ty[:, N.newaxis]), axis = 1)
+
+# tclass makes the correspondence class <-> indices in the testing data tdata
+tclass = {}
+for i in range(3):
+    tclass[cnames[i]] = N.arange(tn * i, tn * (i+1))
+
+#----------------------------
+# Learning and classification
+#----------------------------
+# Train a mixture model with k components
+def cluster(data, k, mode = 'full'):
+    d = data.shape[1]
+    gm = GM(d, k, mode)
+    gmm = GMM(gm)
+    em = EM()
+    em.train(data, gmm, maxiter = 20)
+    return gm
+
+# Estimate each class with a mixture of nc components
+nc = 2
+mode = 'diag'
+lmod = {}
+for i in cnames:
+    lmod[i] = cluster(ldata[i], nc, mode)
+
+# Classify the testing data. Of course, the testing data are not shuffled
+# here, but in this case this does not change anything.
+p = N.empty((len(tdata), 3))
+for i in range(3):
+    # For each class, compute the likelihood of the testing data
+    p[:, i] = lmod[cnames[i]].pdf(tdata)
+
+# We then take the Maximum A Posteriori class (the same as the most likely
+# model here, since each class is equiprobable)
+cid = N.argmax(p, 1)
+classification = {}
+for i in range(3):
+    classification[cnames[i]] = N.where(cid == i)[0]
+
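+# General remark (not specific to this dataset): if the classes were not
+# equiprobable, the MAP rule would weight each class-conditional likelihood
+# by its prior, i.e. take the argmax of p(x | class i) * P(class i) instead
+# of the plain likelihoods used above.
+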
+correct = {}
+incorrect = {}
+for i in cnames:
+    correct[i] = N.intersect1d(classification[i], tclass[i])
+    incorrect[i] = N.setdiff1d(classification[i], tclass[i])
+
+#-----------------
+# Plot the results
+#-----------------
+csym = {'setosa' : 's', 'versicolor' : 'x', 'virginica' : 'o'}
+r = 50.
+P.figure(figsize = [600/r, 400/r])
+
+prop = MPL.font_manager.FontProperties(size='smaller')
+
+# Plot the learning data with the mixtures
+P.subplot(2, 1, 1)
+for i in lmod.values():
+    #i.plot()
+    X, Y, Z, V = i.density_on_grid()
+    P.contourf(X, Y, Z, V)
+
+for i in cnames:
+    P.plot(ldata[i][:, 0], ldata[i][:, 1], csym[i], label = i + ' (learning)')
+P.xlabel('sepal width')
+P.ylabel('petal width')
+P.legend(loc = 'best')
+
+# Plot the results on the test dataset (green for correctly classified, red
+# for incorrectly classified)
+P.subplot(2, 1, 2)
+for i in cnames:
+    P.plot(tx[correct[i]], ty[correct[i]], 'g' + csym[i],
+           label = '%s (correctly classified)' % i)
+    if len(incorrect[i]) > 0:
+        P.plot(tx[incorrect[i]], ty[incorrect[i]], 'r' + csym[i],
+               label = '%s (incorrectly classified)' % i)
+P.legend(loc = 'best', prop = prop)
+P.xlabel('sepal width')
+P.ylabel('petal width')
+P.savefig('dclass.png', dpi = 60)
diff --git a/examples/em/expectation.py b/examples/em/expectation.py
new file mode 100644
index 0000000000000000000000000000000000000000..b33998e8b20e911bd68af5230edf0ec186c1c0b1
--- /dev/null
+++ b/examples/em/expectation.py
@@ -0,0 +1,44 @@
+"""
+Gaussian mixture expectation-maximization example
+====================================================
+
+"""
+
+import pylab
+import numpy as np
+from scikits.learn.em.densities2 import gauss_ell
+
+#=========================================
+# Test plotting a simple diag 2d variance:
+#=========================================
+va = np.array([5, 3])
+mu = np.array([2, 3])
+
+# Generate samples from a multivariate Gaussian with mean mu and covariance va
+X = np.random.randn(2, 1000)
+Yc = np.dot(np.diag(np.sqrt(va)), X)
+Yc = Yc.transpose() + mu
+
+# Plotting
+Xe, Ye = gauss_ell(mu, va, npoints = 100)
+pylab.figure()
+pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+pylab.plot(Xe, Ye, 'r')
+
+#=========================================
+# Test plotting a simple full 2d variance:
+#=========================================
+va = np.array([[0.2, 0.1], [0.1, 0.5]])
+mu = np.array([0, 3])
+
+# Generate samples from a multivariate Gaussian with mean mu and covariance va
+X = np.random.randn(1000, 2)
+Yc = np.dot(np.linalg.cholesky(va), X.transpose())
+Yc = Yc.transpose() + mu
+
+# Plotting
+Xe, Ye = gauss_ell(mu, va, npoints = 100, level=0.95)
+pylab.figure()
+pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+pylab.plot(Xe, Ye, 'r')
+pylab.show()
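+
+# Why these constructions give the requested covariance: if X has identity
+# covariance and Y = L X with L L^T equal to va (L = diag(sqrt(va)) in the
+# diagonal case, the Cholesky factor of va in the full case), then
+# cov(Y) = L cov(X) L^T = va, so the point clouds should match the plotted
+# ellipses.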
diff --git a/examples/em/pdfestimation.py b/examples/em/pdfestimation.py
new file mode 100644
index 0000000000000000000000000000000000000000..195573d20bc0e3fa8e6fe6cc11918111e629b9e3
--- /dev/null
+++ b/examples/em/pdfestimation.py
@@ -0,0 +1,53 @@
+"""
+Density estimation with mixture models
+======================================
+"""
+
+#! /usr/bin/env python
+# Last Change: Sun Jul 22 12:00 PM 2007 J
+
+# Example of doing pdf estimation with the EM algorithm. Requires matplotlib.
+import numpy as N
+import pylab as P
+
+from scikits.learn.em import EM, GM, GMM
+import utils
+
+oldfaithful = utils.get_faithful()
+
+# We want the relationship between d(t) and w(t+1), but get_faithful gives
+# d(t), w(t), so we have to shift to get the "usual" faithful data
+waiting = oldfaithful[1:, 1:]
+duration = oldfaithful[:len(waiting), :1]
+dt = N.concatenate((duration, waiting), 1)
+
+# Scale the data so that each component is in [0, 1]
+dt = utils.scale(dt)
+
+# Train a mixture model with k components; return the trained
+# model and its BIC
+def cluster(data, k, mode = 'full'):
+    d = data.shape[1]
+    gm = GM(d, k, mode)
+    gmm = GMM(gm)
+    em = EM()
+    em.train(data, gmm, maxiter = 20)
+    return gm, gmm.bic(data)
+
+# bc will contain the list of BIC values for each trained model
+bc = []
+mode = 'full'
+P.figure()
+for k in range(1, 5):
+    # Train a model with k components and plot its isodensity curves
+    P.subplot(2, 2, k)
+    gm, b = cluster(dt, k = k, mode = mode)
+    bc.append(b)
+
+    X, Y, Z, V = gm.density_on_grid()
+    P.contour(X, Y, Z, V)
+    P.plot(dt[:, 0], dt[:, 1], '.')
+    P.xlabel('duration time (scaled)')
+    P.ylabel('waiting time (scaled)')
+
+print "According to the BIC, the model with %d components is the best" % (N.argmax(bc) + 1)
diff --git a/examples/em/pdfestimation1d.py b/examples/em/pdfestimation1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..08227c2a71aa5105e8cb3936759ea34e59731017
--- /dev/null
+++ b/examples/em/pdfestimation1d.py
@@ -0,0 +1,85 @@
+"""
+
+Density estimation
+====================
+
+This example shows how to do pdf estimation for one-dimensional data. It
+estimates a Gaussian mixture model for several numbers of components, and
+determines the "best" one according to the Bayesian Information
+Criterion.
+
+It uses the Old Faithful waiting times as the one-dimensional data, and plots
+the best model as well as the BIC as a function of the number of components.
+"""
+
+# Example of doing pdf estimation with the EM algorithm. Requires matplotlib.
+import numpy as N
+
+import pylab as P
+import matplotlib as MPL
+
+from scikits.learn.em import EM, GM, GMM
+import utils
+
+oldfaithful = utils.get_faithful()
+
+duration = oldfaithful[:, :1]
+waiting = oldfaithful[:, 1:]
+
+#dt = utils.scale(duration)
+#dt = duration / 60.
+dt = waiting / 60.
+
+# Train a mixture model with k components; return the trained
+# model and its BIC
+def cluster(data, k):
+    d = data.shape[1]
+    gm = GM(d, k)
+    gmm = GMM(gm)
+    em = EM()
+    em.train(data, gmm, maxiter = 20)
+    return gm, gmm.bic(data)
+
+# bc will contain the list of BIC values for each trained model, gml the
+# corresponding mixture models
+bc = []
+gml = []
+
+for k in range(1, 8):
+    gm, b = cluster(dt, k = k)
+    bc.append(b)
+    gml.append(gm)
+
+mbic = N.argmax(bc)
+
+# Below is the code to display a figure with the histogram and the pdf of the
+# best model (in the BIC sense) on the left, and the BIC as a function of the
+# number of components on the right.
+P.figure(figsize = [12, 7])
+#---------------------------
+# histogram + pdf estimation
+#---------------------------
+P.subplot(1, 2, 1)
+h = gml[mbic].plot1d(gpdf=True)
+# You can manipulate the different parts of the plot through the returned
+# handles
+h['gpdf'][0].set_linestyle('-')
+h['gpdf'][0].set_label('pdf of the mixture')
+h['pdf'][0].set_label('pdf of the individual components')
+[l.set_linestyle('-') for l in h['pdf']]
+[l.set_color('g') for l in h['pdf']]
+
+prop = MPL.font_manager.FontProperties(size='smaller')
+P.legend(loc = 'best', prop = prop)
+
+P.hist(dt, 25, normed = 1, fill = False)
+P.xlabel('waiting time between consecutive eruptions (in min)')
+
+#------------------------------------------
+# BIC as a function of number of components
+#------------------------------------------
+P.subplot(1, 2, 2)
+P.plot(N.arange(1, 8), bc, 'o:')
+P.xlabel("number of components")
+P.ylabel("BIC")
+print "According to the BIC, the model with %d components is the best" % (N.argmax(bc) + 1)
diff --git a/examples/em/plotexamples.py b/examples/em/plotexamples.py
new file mode 100644
index 0000000000000000000000000000000000000000..a38c09b7aa3f918b342c01c014e334ed07a0a179
--- /dev/null
+++ b/examples/em/plotexamples.py
@@ -0,0 +1,42 @@
+"""
+Ellipse fitting with Gaussian mixtures
+=======================================
+
+"""
+
+# This is a simple test to check whether the plotted confidence ellipsoids
+# and isodensity contours match
+import numpy as N
+
+import pylab as P
+
+from scikits.learn.em import EM, GM, GMM
+
+# Generate a simple mixture model, then plot its confidence ellipses and
+# isodensity curves for both diagonal and full covariance matrices
+d = 3
+k = 3
+dim = [0, 2]
+# diag model
+w, mu, va = GM.gen_param(d, k)
+dgm = GM.fromvalues(w, mu, va)
+# full model
+w, mu, va = GM.gen_param(d, k, 'full', spread = 1)
+fgm = GM.fromvalues(w, mu, va)
+
+def plot_model(gm, dim):
+    X, Y, Z, V = gm.density_on_grid(dim = dim)
+    h = gm.plot(dim = dim)
+    [i.set_linestyle('-.') for i in h]
+    P.contour(X, Y, Z, V)
+    data = gm.sample(200)
+    P.plot(data[:, dim[0]], data[:, dim[1]], '.')
+
+# Plot the contours and the confidence ellipsoids
+P.subplot(2, 1, 1)
+plot_model(dgm, dim)
+
+P.subplot(2, 1, 2)
+plot_model(fgm, dim)
+
+P.show()
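+
+# Why the two views should agree: for a single Gaussian component, the region
+# of confidence level p is bounded by one of its own isodensity contours, the
+# ellipsoid (x - mu)^T Sigma^-1 (x - mu) <= c(p), where c(p) is the p-quantile
+# of a chi-squared distribution with as many degrees of freedom as plotted
+# dimensions. For the mixture, the per-component ellipsoids should therefore
+# line up with the high-density regions of the contour plot; this example
+# only checks that visually.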
diff --git a/examples/em/regularized_example.py b/examples/em/regularized_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..a16bd39235802a1ceae98c5398af0e2c142ca654
--- /dev/null
+++ b/examples/em/regularized_example.py
@@ -0,0 +1,71 @@
+"""
+
+Regularized Gaussian mixture on hand-written digits
+====================================================
+
+Example of using RegularizedEM with the pendigits data.
+
+If you want to do discriminant analysis with pendigits, you quickly run into
+problems with EM if you use the coordinates directly, because some points are
+likely to be on the border, hence the corresponding component can have a
+covariance matrix which easily becomes singular. Regularized EM avoids this
+problem by using a simple regularization of the mixture. You can play with
+pcnt and pval to see their effect on the pdf estimation.
+
+For now, regularized EM is pretty crude, but it is enough for simple cases
+where you need to avoid singular covariance matrices."""
+
+import numpy as N
+import pylab as P
+
+from scikits.learn.em import EM, GM, GMM
+# Experimental RegularizedEM
+from scikits.learn.em.gmm_em import RegularizedEM
+import utils
+
+x, y = utils.get_pendigits()
+
+# Take only the first point of each pendigits sample for pdf estimation
+dt1 = N.concatenate([x[:, N.newaxis], y[:, N.newaxis]], 1)
+dt1 = utils.scale(dt1.astype(N.float))
+
+# pcnt is the proportion of samples to use as prior count. E.g. if you have
+# 1000 samples and pcnt is 0.1, the prior count is 100, so 1100 samples are
+# used overall when regularizing the parameters.
+pcnt = 0.05
+# You should try different values of pval. If pval is 0.1, the regularization
+# will be strong. If you use something like 0.01, really sharp
+# components will appear. If the values are too small, the regularizer may not
+# work (singular covariance matrices).
+pval = 0.05
+
+# Train a mixture model with k components; return the trained
+# model and its BIC
+def cluster(data, k, mode = 'full'):
+    d = data.shape[1]
+    gm = GM(d, k, mode)
+    gmm = GMM(gm, 'random')
+    em = RegularizedEM(pcnt = pcnt, pval = pval)
+    em.train(data, gmm, maxiter = 20)
+    return gm, gmm.bic(data)
+
+# bc will contain the list of BIC values for each trained model
+N.seterr(all = 'warn')
+bc = []
+mode = 'full'
+
+P.figure()
+for k in range(1, 5):
+    # Train a model with k components and plot its isodensity curves
+    P.subplot(2, 2, k)
+    gm, b = cluster(dt1, k = k, mode = mode)
+    bc.append(b)
+
+    X, Y, Z, V = gm.density_on_grid(nl = 20)
+    P.contour(X, Y, Z, V)
+    P.plot(dt1[:, 0], dt1[:, 1], '.')
+    P.xlabel('x coordinate (scaled)')
+    P.ylabel('y coordinate (scaled)')
+
+print "According to the BIC, the model with %d components is the best" % (N.argmax(bc) + 1)
+P.show()
diff --git a/examples/em/utils.py b/examples/em/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..89f11cf19030a8e7b39cc59d2b8869b7e3e87e88
--- /dev/null
+++ b/examples/em/utils.py
@@ -0,0 +1,47 @@
+#! /usr/bin/env python
+# Last Change: Sun Jul 22 03:00 PM 2007 J
+
+# Various utilities for the examples
+
+import numpy as N
+
+# iris is imported here because discriminant_analysis.py accesses it as
+# utils.iris (assuming the iris loader is available in scikits.learn.datasets)
+from scikits.learn.datasets import oldfaithful, pendigits, iris
+
+def get_faithful():
+    """Return faithful data as a nx2 array, first column being duration, second
+    being waiting time."""
+    # Load faithful data, convert waiting into integer, remove L, M and S data
+    data = oldfaithful.load()
+    tmp1 = []
+    tmp2 = []
+    for i in data['data']:
+        if not (i[0] == 'L' or i[0] == 'M' or i[0] == 'S'):
+            tmp1.append(i[0])
+            tmp2.append(i[1])
+
+    waiting = N.array([int(i) for i in tmp1], dtype = N.float)
+    duration = N.array([i for i in tmp2], dtype = N.float)
+
+    waiting = waiting[:, N.newaxis]
+    duration = duration[:, N.newaxis]
+
+    return N.concatenate((waiting, duration), 1)
+
+def get_pendigits():
+    """Return the first point (x0, y0) of each sample of the pendigits
+    training data, as two 1d arrays."""
+    # Load the pendigits training data and keep only the first point of each
+    # digit
+    data = pendigits.training.load()
+    return data['data']['x0'], data['data']['y0']
+
+def scale(data):
+    """ Scale data so that each column is in the range [0, 1].
+
+    Note: operates in place (and also returns the scaled array)."""
+    n = N.min(data, 0)
+    m = N.max(data, 0)
+
+    data -= n
+    data /= (m-n)
+    return data
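+
+# Worked example for scale() (values computed by hand): with
+#     data = N.array([[0., 2.], [1., 4.]])
+# the column-wise minima are [0, 2] and the maxima [1, 4], so
+#     scale(data) -> [[0., 0.], [1., 1.]]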