Commit 1a17e795 authored by Fabian Pedregosa

Remove examples from deprecated modules.

parent 32f4c67c
Gaussian-mixture modeling
--------------------------

Examples concerning the `scikits.learn.em` package.

.. warning::

    The code in the `scikits.learn.em` submodule is not as mature as the
    rest of the scikit and is prone to change.
"""
Simple mixture model example
=============================
"""
import numpy as np
import pylab as pl
from scikits.learn.em import GM
################################################################################
# Hyperparameters:
# - k: number of clusters
# - d: dimension
k = 3
d = 2
################################################################################
# Values for the weights, means and (diagonal) variances
# - the weights are a rank-1 array
# - the means are expected to be rank 2, with one row per component
# - the variances are also expected to be rank 2. For a diagonal model,
#   one row is one diagonal; for a full model, the first d rows are the
#   first covariance matrix, and so on: in the full case, the variance
#   array should have k*d rows and d columns (see the commented-out
#   example after the arrays below)
w = np.array([0.2, 0.45, 0.35])
mu = np.array([[4.1, 3], [1, 5], [-2, -3]])
va = np.array([[1, 1.5], [3, 4], [2, 3.5]])
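# For comparison, a 'full' model with the same k = 3, d = 2 would stack its
# three 2x2 covariance matrices vertically into a (k*d, d) = (6, 2) array.
# The values below are made up for illustration:
#va_full = np.array([[1.0, 0.2],
#                    [0.2, 1.5],
#                    [3.0, 0.5],
#                    [0.5, 4.0],
#                    [2.0, 0.1],
#                    [0.1, 3.5]])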
################################################################################
# First method to build a GM instance: directly from the parameters
# (both methods shown here are equivalent)
gm = GM.fromvalues(w, mu, va)
################################################################################
# Second method to build a GM instance:
gm = GM(d, k, mode = 'diag')
# set_param checks that w, mu and va are consistent with k, d and the mode
gm.set_param(w, mu, va)
# Once set_param is called, both methods are equivalent. The second
# method is useful when using a GM object for learning (where the
# learner class will set the parameters), whereas the first one is
# useful when there is a need to quickly sample a model from existing
# values, without having to give the hyperparameters.
# Create a Gaussian mixture from the parameters, and sample
# 1000 items from it (one row = one 2-dimensional sample)
data = gm.sample(1000)
# Plot the samples
pl.plot(data[:, 0], data[:, 1], '.')
# Plot the ellipsoids of confidence at the 75 % level
gm.plot(level = 0.75)
pl.show()
"""
Another simple mixture model example
=====================================
"""
from numpy.random import seed
from scikits.learn.em import GM, GMM, EM
# To reproduce results, fix the random seed
seed(1)
################################################################################
# Meta parameters of the model
# - k: Number of components
# - d: dimension of each Gaussian
# - mode: Mode of covariance matrix: full or diag (string)
# - nframes: number of frames (one frame = one data point = one
#            row of d elements)
k = 2
d = 2
mode = 'diag'
nframes = 1000
################################################################################
# Create an artificial GM model and sample from it
################################################################################
w, mu, va = GM.gen_param(d, k, mode, spread = 1.5)
gm = GM.fromvalues(w, mu, va)
# Sample nframes frames from the model
data = gm.sample(nframes)
################################################################################
# Learn the model with EM
################################################################################
# Create a model from a Gaussian mixture, with k-means ('kmean') initialization
lgm = GM(d, k, mode)
gmm = GMM(lgm, 'kmean')
# The actual EM, with likelihood computation. The threshold
# is compared to the (linearly approximated) derivative of the likelihood
em = EM()
like = em.train(data, gmm, maxiter = 30, thresh = 1e-8)
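# `like` is assumed here to hold the likelihood values computed at each EM
# iteration; if so, a quick convergence check is to plot it:
#import pylab as pl
#pl.plot(like)
#pl.xlabel('EM iteration')
#pl.ylabel('log-likelihood')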
# The computed parameters are in gmm.gm, which is the same object as lgm
# (remember, Python does not copy most objects by default). You can for
# example plot lgm against gm to compare, as sketched below.
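# A minimal sketch of that comparison, following the plotting pattern of the
# other examples in this directory (it assumes, as they do, that GM.plot
# returns the list of ellipse handles):
import pylab as pl
pl.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
h = gm.plot(level = 0.9)
[i.set_color('g') for i in h]
h[0].set_label('True confidence ellipsoids')
h = lgm.plot(level = 0.9)
[i.set_color('r') for i in h]
h[0].set_label('EM confidence ellipsoids')
pl.legend(loc = 0)
pl.show()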
"""
Simple Gaussian-mixture model example
======================================
"""
import numpy as np
from numpy.random import seed
from scikits.learn.em import GM, GMM, EM
seed(2)
k = 4
d = 2
mode = 'diag'
nframes = 1000
################################################################################
# Create an artificial GM model and sample from it
################################################################################
w, mu, va = GM.gen_param(d, k, mode, spread = 1.0)
gm = GM.fromvalues(w, mu, va)
# Sample nframes frames from the model
data = gm.sample(nframes)
################################################################################
# Learn the model with EM
################################################################################
# List of learned mixtures: lgm[i] is a mixture with i+1 components
lgm = []
kmax = 6
bics = np.zeros(kmax)
em = EM()
for i in range(kmax):
    lgm.append(GM(d, i+1, mode))
    gmm = GMM(lgm[i], 'kmean')
    em.train(data, gmm, maxiter = 30, thresh = 1e-10)
    bics[i] = gmm.bic(data)
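# Note that with this BIC convention, larger is better: the best model is
# picked with argmax below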
print "Original model has %d clusters, bics says %d" % (k, np.argmax(bics)+1)
################################################################################
# Draw the model
################################################################################
import pylab as pl
pl.subplot(3, 2, 1)
for k in range(kmax):
    pl.subplot(3, 2, k+1)
    level = 0.9
    pl.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
    # h keeps the handles of the plot, so that you can modify
    # its parameters, e.g. label or color
    h = lgm[k].plot(level = level)
    [i.set_color('r') for i in h]
    h[0].set_label('EM confidence ellipsoids')
    h = gm.plot(level = level)
    [i.set_color('g') for i in h]
    h[0].set_label('Real confidence ellipsoids')
    pl.legend(loc = 0)
pl.show()
"""
Classification using a mixture of Gaussians
===========================================
Example of classification with a mixture of Gaussians. Note that this is
really a toy example: there is no cross-validation, just a single naive
train/test split.
We use the famous iris dataset studied by Sir R.A. Fisher. You can try
changing the attributes used for classification, the number of components
used for the mixtures, etc..."""
import numpy as N
import pylab as P
import matplotlib as MPL
from scikits.learn.em import EM, GMM, GM
import utils
data = utils.iris.load()
# cnames are the class names
cnames = data.keys()
#--------------------
# Data pre processing
#--------------------
# We use 25 samples of each class (i.e. half of the iris data) for
# learning, and the other half for testing. We use sepal width and
# petal width only.
ln = 25
tn = 25
xdata = {}
ydata = {}
# learning data
ldata = {}
# You can change the attributes used here (sepal vs petal, width vs length)
for i in cnames:
    xdata[i] = data[i]['sepal width']
    ydata[i] = data[i]['petal width']
    ldata[i] = N.concatenate((xdata[i][:ln, N.newaxis],
                              ydata[i][:ln, N.newaxis]),
                             axis = 1)
tx = N.concatenate([xdata[i][ln:] for i in cnames])
ty = N.concatenate([ydata[i][ln:] for i in cnames])
tdata = N.concatenate((tx[:, N.newaxis], ty[:, N.newaxis]), axis = 1)
# tclass gives the correspondence class <-> indices in the testing data tdata
tclass = {}
for i in range(3):
    tclass[cnames[i]] = N.arange(tn * i, tn * (i+1))
#----------------------------
# Learning and classification
#----------------------------
# This function trains a mixture model with k components
def cluster(data, k, mode = 'full'):
    d = data.shape[1]
    gm = GM(d, k, mode)
    gmm = GMM(gm)
    em = EM()
    em.train(data, gmm, maxiter = 20)
    return gm
# Estimate each class with a mixture of nc components
nc = 2
mode = 'diag'
lmod = {}
for i in cnames:
    lmod[i] = cluster(ldata[i], nc, mode)
# Classify the testing data. Of course, the data are not really IID, because
# we did not shuffle the testing data, but in this case this does not change
# anything.
p = N.empty((len(tdata), 3))
for i in range(3):
    # For each class, compute the likelihood of the testing data
    p[:, i] = lmod[cnames[i]].pdf(tdata)
# We then take the Maximum A Posteriori class (the same as the most likely
# model in this case, since each class is equiprobable; see the sketch below)
cid = N.argmax(p, 1)
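# If the classes were not equiprobable, the MAP rule would weight each
# class-conditional likelihood by the class prior before taking the argmax.
# A minimal sketch (these priors are made up for illustration):
#priors = N.array([0.5, 0.3, 0.2])
#cid = N.argmax(p * priors, 1)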
classification = {}
for i in range(3):
    classification[cnames[i]] = N.where(cid == i)[0]
correct = {}
incorrect = {}
for i in cnames:
    correct[i] = N.intersect1d(classification[i], tclass[i])
    incorrect[i] = N.setdiff1d(classification[i], tclass[i])
#-----------------
# Plot the results
#-----------------
csym = {'setosa' : 's', 'versicolor' : 'x', 'virginica' : 'o'}
r = 50.
P.figure(figsize = [600/r, 400/r])
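# (with r = 50, figsize is 600/50 x 400/50 = 12 x 8 inches; together with
# dpi = 60 in the savefig call below, this gives a 720 x 480 pixel image)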
prop = MPL.font_manager.FontProperties(size='smaller')
# Plot the learning data with the mixtures
P.subplot(2, 1, 1)
for i in lmod.values():
    #i.plot()
    X, Y, Z, V = i.density_on_grid()
    P.contourf(X, Y, Z, V)
for i in cnames:
    P.plot(ldata[i][:, 0], ldata[i][:, 1], csym[i], label = i + ' (learning)')
P.xlabel('sepal width')
P.ylabel('petal width')
P.legend(loc = 'best')
# Plot the results on test dataset (green for correctly classified, red for
# incorrectly classified)
P.subplot(2, 1, 2)
for i in cnames:
    P.plot(tx[correct[i]], ty[correct[i]], 'g' + csym[i],
           label = '%s (correctly classified)' % i)
    if len(incorrect[i]) > 0:
        P.plot(tx[incorrect[i]], ty[incorrect[i]], 'r' + csym[i],
               label = '%s (incorrectly classified)' % i)
P.legend(loc = 'best', prop = prop)
P.xlabel('sepal width')
P.ylabel('petal width')
P.savefig('dclass.png', dpi = 60)
"""
Density estimation with mixture models
======================================
"""
#! /usr/bin/env python
# Last Change: Sun Jul 22 12:00 PM 2007 J
# Example of doing pdf estimation with EM algorithm. Requires matplotlib.
import numpy as N
import pylab as P
from scikits.learn.em import EM, GM, GMM
import utils
oldfaithful = utils.get_faithful()
# We want the relationship between d(t) and w(t+1), but get_faithful gives
# d(t), w(t), so we have to shift to get the "usual" faithful data
waiting = oldfaithful[1:, 1:]
duration = oldfaithful[:len(waiting), :1]
dt = N.concatenate((duration, waiting), 1)
# Scale the data so that each component is in [0..1]
dt = utils.scale(dt)
# This function trains a mixture model with k components, and returns the
# trained model and its BIC
def cluster(data, k, mode = 'full'):
    d = data.shape[1]
    gm = GM(d, k, mode)
    gmm = GMM(gm)
    em = EM()
    em.train(data, gmm, maxiter = 20)
    return gm, gmm.bic(data)
# bc will contain a list of BIC values for each model trained
bc = []
mode = 'full'
P.figure()
for k in range(1, 5):
    # Train a model with k components, and plot its isodensity curves
    P.subplot(2, 2, k)
    gm, b = cluster(dt, k = k, mode = mode)
    bc.append(b)
    X, Y, Z, V = gm.density_on_grid()
    P.contour(X, Y, Z, V)
    P.plot(dt[:, 0], dt[:, 1], '.')
    P.xlabel('duration time (scaled)')
    P.ylabel('waiting time (scaled)')
print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
"""
Density estimation
====================
This example shows how to do pdf estimation for one-dimensional data. It
estimates a Gaussian mixture model for several numbers of components, and
determines the "best" one according to the Bayesian Information
Criterion.
It uses the Old Faithful waiting times as the one-dimensional data, and
plots the best model as well as the BIC as a function of the number of
components.
"""
# Example of doing pdf estimation with EM algorithm. Requires matplotlib.
import numpy as N
import pylab as P
import matplotlib as MPL
from scikits.learn.em import EM, GM, GMM
import utils
oldfaithful = utils.get_faithful()
duration = oldfaithful[:, :1]
waiting = oldfaithful[:, 1:]
#dt = utils.scale(duration)
#dt = duration / 60.
dt = waiting / 60.
# This function trains a mixture model with k components, and returns the
# trained model and its BIC
def cluster(data, k):
    d = data.shape[1]
    gm = GM(d, k)
    gmm = GMM(gm)
    em = EM()
    em.train(data, gmm, maxiter = 20)
    return gm, gmm.bic(data)
# bc will contain the BIC value of each model trained, and gml the
# corresponding mixture models
bc = []
gml = []
for k in range(1, 8):
    gm, b = cluster(dt, k = k)
    bc.append(b)
    gml.append(gm)
mbic = N.argmax(bc)
# Below is code to display a figure with the histogram and the pdf of the
# best model (in the BIC sense) on the left, and the BIC as a function of
# the number of components on the right.
P.figure(figsize = [12, 7])
#---------------------------
# histogram + pdf estimation
#---------------------------
P.subplot(1, 2, 1)
h = gml[mbic].plot1d(gpdf=True)
# You can manipulate the different parts of the plot through the returned
# handles
h['gpdf'][0].set_linestyle('-')
h['gpdf'][0].set_label('pdf of the mixture')
h['pdf'][0].set_label('pdf of individual component')
[l.set_linestyle('-') for l in h['pdf']]
[l.set_color('g') for l in h['pdf']]
prop = MPL.font_manager.FontProperties(size='smaller')
P.legend(loc = 'best', prop = prop)
P.hist(dt, 25, normed = 1, fill = False)
P.xlabel('waiting time between consecutive eruptions (scaled)')
#------------------------------------------
# BIC as a function of number of components
#------------------------------------------
P.subplot(1, 2, 2)
P.plot(N.arange(1, 8), bc, 'o:')
P.xlabel("number of components")
P.ylabel("BIC")
print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
"""
Mixture of Gaussian expectation-maximization example
====================================================
"""
import pylab
import numpy as np
from scikits.learn.em.densities2 import gauss_ell
#=========================================
# Test plotting a simple diag 2d variance:
#=========================================
va = np.array([5, 3])
mu = np.array([2, 3])
# Generate a multivariate gaussian of mean mu and covariance va
X = np.random.randn(2, 1000)
Yc = np.dot(np.diag(np.sqrt(va)), X)
Yc = Yc.transpose() + mu
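# (scaling independent N(0, 1) coordinates by sqrt(va) gives a diagonal
# covariance diag(va), since var(a * x) = a**2 * var(x))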
# Plotting
Xe, Ye = gauss_ell(mu, va, npoints = 100)
pylab.figure()
pylab.plot(Yc[:, 0], Yc[:, 1], '.')
pylab.plot(Xe, Ye, 'r')
#=========================================
# Test plotting a simple full 2d variance:
#=========================================
va = np.array([[0.2, 0.1], [0.1, 0.5]])
mu = np.array([0, 3])
# Generate a multivariate gaussian of mean mu and covariance va
X = np.random.randn(1000, 2)
Yc = np.dot(np.linalg.cholesky(va), X.transpose())
Yc = Yc.transpose() + mu
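# (here L = cholesky(va) satisfies va = L L', so cov(L x) = L cov(x) L' = va
# for white noise x: the Cholesky factor maps white noise to the target
# covariance)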
# Plotting
Xe, Ye = gauss_ell(mu, va, npoints = 100, level=0.95)
pylab.figure()
pylab.plot(Yc[:, 0], Yc[:, 1], '.')
pylab.plot(Xe, Ye, 'r')
pylab.show()
"""
Ellipse fitting with Gaussian mixtures
=======================================
"""
# This is a simple test to check that the plotted confidence ellipsoids and
# isodensity contours match
import numpy as N
import pylab as P
from scikits.learn.em import EM, GM, GMM
# Generate a simple mixture model, plot its confidence ellipses + isodensity
# curves for both diagonal and full covariance matrices
d = 3
k = 3
dim = [0, 2]
# diag model
w, mu, va = GM.gen_param(d, k)
dgm = GM.fromvalues(w, mu, va)
# full model
w, mu, va = GM.gen_param(d, k, 'full', spread = 1)
fgm = GM.fromvalues(w, mu, va)
def plot_model(gm, dim):
    X, Y, Z, V = gm.density_on_grid(dim = dim)
    h = gm.plot(dim = dim)
    [i.set_linestyle('-.') for i in h]
    P.contour(X, Y, Z, V)
    data = gm.sample(200)
    P.plot(data[:, dim[0]], data[:, dim[1]], '.')
# Plot the contours and the ellipsoids of confidence
P.subplot(2, 1, 1)
plot_model(dgm, dim)
P.subplot(2, 1, 2)
plot_model(fgm, dim)
P.show()
"""
Regularized Gaussian mixture on hand-written digits
====================================================
Example of using RegularizedEM with the pendigits data.
If you want to do discriminant analysis with pendigits, you quickly run into
problems with EM if you use the coordinates directly: some points are likely
to lie on the border, so the corresponding component can end up with a
covariance matrix which easily becomes singular. Regularized EM avoids this
problem through a simple regularization of the mixture. You can play with
pcnt and pval to see the effect on pdf estimation.
For now, regularized EM is pretty crude, but it is enough for simple cases
where you need to avoid singular covariance matrices."""
import numpy as N
import pylab as P
from scikits.learn.em import EM, GM, GMM
# Experimental RegularizedEM
from scikits.learn.em.gmm_em import RegularizedEM
import utils
x, y = utils.get_pendigits()
# Take only the first point of pendigits for pdf estimation
dt1 = N.concatenate([x[:, N.newaxis], y[:, N.newaxis]], 1)
dt1 = utils.scale(dt1.astype(N.float))
# pcnt is the proportion of samples to use as the prior count. E.g. if you
# have 1000 samples and pcnt is 0.1, the prior count would be 100, and 1100
# samples will be considered overall when regularizing the parameters.
pcnt = 0.05
# You should try different values of pval. If pval is 0.1, then the
# regularization will be strong. If you use something like 0.01, really sharp
# components will appear. If the values are too small, the regularizer may not
# work (singular covariance matrices).
pval = 0.05
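# A rough sketch of the kind of update a prior count implies -- this is an
# assumption about RegularizedEM's internals, not its actual code: with n
# data points and prior count c = pcnt * n, a maximum-likelihood covariance
# estimate va_ml would be shrunk toward a multiple of the identity,
#
#   va_reg = (n * va_ml + c * pval * I) / (n + c)
#
# which stays non-singular even when a component collapses onto few points.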
# This function trains a mixture model with k components, and returns the
# trained model and its BIC
def cluster(data, k, mode = 'full'):
    d = data.shape[1]
    gm = GM(d, k, mode)
    gmm = GMM(gm, 'random')
    em = RegularizedEM(pcnt = pcnt, pval = pval)
    em.train(data, gmm, maxiter = 20)
    return gm, gmm.bic(data)
# bc will contain a list of BIC values for each model trained
N.seterr(all = 'warn')
bc = []
mode = 'full'
P.figure()
for k in range(1, 5):
    # Train a model with k components, and plot its isodensity curves
    P.subplot(2, 2, k)
    gm, b = cluster(dt1, k = k, mode = mode)
    bc.append(b)
    X, Y, Z, V = gm.density_on_grid(nl = 20)
    P.contour(X, Y, Z, V)
    P.plot(dt1[:, 0], dt1[:, 1], '.')
    P.xlabel('x coordinate (scaled)')
    P.ylabel('y coordinate (scaled)')
print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
P.show()
#! /usr/bin/env python
# Last Change: Sun Jul 22 03:00 PM 2007 J
# Various utilities for examples
import numpy as N
from scikits.learn.datasets import oldfaithful, pendigits
def get_faithful():
    """Return faithful data as a nx2 array, first column being duration,
    second being waiting time."""
    # Load faithful data, convert waiting into integer, remove L, M and S data
    data = oldfaithful.load()
    tmp1 = []
    tmp2 = []
    for i in data['data']:
        if not (i[0] == 'L' or i[0] == 'M' or i[0] == 'S'):
            tmp1.append(i[0])
            tmp2.append(i[1])
    waiting = N.array([int(i) for i in tmp1], dtype = N.float)
    duration = N.array([i for i in tmp2], dtype = N.float)
    waiting = waiting[:, N.newaxis]
    duration = duration[:, N.newaxis]
    return N.concatenate((waiting, duration), 1)
def get_pendigits():
    """Return the first point of each pendigits sample as two arrays
    (x coordinates, y coordinates)."""
    data = pendigits.training.load()
    return data['data']['x0'], data['data']['y0']
def scale(data):
    """Scale data such that each column is in the range [0..1].
    Note: operates in place."""
    n = N.min(data, 0)
    m = N.max(data, 0)
    data -= n
    data /= (m - n)
    return data
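# Example: scale(N.array([[0., 10.], [5., 20.]])) returns
# [[0., 0.], [1., 1.]] (and modifies its argument in place): each column is
# mapped onto [0, 1].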