Commit 1a17e795 authored by Fabian Pedregosa

Remove examples from deprecated modules.

parent 32f4c67c
Gaussian-mixture modeling
--------------------------

Examples concerning the `scikits.learn.em` package.

.. warning::

    The code in the `scikits.learn.em` submodule is not as mature as the
    rest of the scikit and is prone to change.
"""
Simple mixture model example
=============================
"""
import numpy as np
import pylab as pl
from scikits.learn.em import GM
################################################################################
# Hyperparameters:
# - k: number of clusters
# - d: dimension
k = 3
d = 2
################################################################################
# Values for the weights, means and (diagonal) variances
# - the weights are a rank-1 array
# - the means are expected to be rank 2, with one row per component
# - the variances are also expected to be rank 2. For a diagonal model,
#   one row is one diagonal; for a full model, the first d rows are the
#   first covariance matrix, and so on: in the full case, the variance
#   array should have k*d rows and d columns (see the commented-out
#   example after the arrays below)
w = np.array([0.2, 0.45, 0.35])
mu = np.array([[4.1, 3], [1, 5], [-2, -3]])
va = np.array([[1, 1.5], [3, 4], [2, 3.5]])
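# For comparison, a 'full' model with the same k = 3, d = 2 would stack its
# three 2x2 covariance matrices vertically into a (k*d, d) = (6, 2) array.
# The values below are made up for illustration:
#va_full = np.array([[1.0, 0.2],
#                    [0.2, 1.5],
#                    [3.0, 0.5],
#                    [0.5, 4.0],
#                    [2.0, 0.1],
#                    [0.1, 3.5]])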
################################################################################
# First method to build a GM instance: directly from the parameters
# (both methods shown here are equivalent)
gm = GM.fromvalues(w, mu, va)
################################################################################
# Second method to build a GM instance:
gm = GM(d, k, mode = 'diag')
# set_param checks that w, mu and va are consistent with k, d and the mode
gm.set_param(w, mu, va)
# Once set_param is called, both methods are equivalent. The second
# method is useful when using a GM object for learning (where the
# learner class will set the parameters), whereas the first one is
# useful when there is a need to quickly sample a model from existing
# values, without having to give the hyperparameters.
# Create a Gaussian mixture from the parameters, and sample
# 1000 items from it (one row = one 2-dimensional sample)
data = gm.sample(1000)
# Plot the samples
pl.plot(data[:, 0], data[:, 1], '.')
# Plot the ellipsoids of confidence at the 75 % level
gm.plot(level = 0.75)
pl.show()
"""
Another simple mixture model example
=====================================
"""
from numpy.random import seed
from scikits.learn.em import GM, GMM, EM
# To reproduce results, fix the random seed
seed(1)
################################################################################
# Meta parameters of the model
# - k: Number of components
# - d: dimension of each Gaussian
# - mode: Mode of covariance matrix: full or diag (string)
# - nframes: number of frames (one frame = one data point = one
#            row of d elements)
k = 2
d = 2
mode = 'diag'
nframes = 1000
################################################################################
# Create an artificial GM model and sample from it
################################################################################
w, mu, va = GM.gen_param(d, k, mode, spread = 1.5)
gm = GM.fromvalues(w, mu, va)
# Sample nframes frames from the model
data = gm.sample(nframes)
################################################################################
# Learn the model with EM
################################################################################
# Create a model from a Gaussian mixture, with k-means ('kmean') initialization
lgm = GM(d, k, mode)
gmm = GMM(lgm, 'kmean')
# The actual EM, with likelihood computation. The threshold
# is compared to the (linearly approximated) derivative of the likelihood
em = EM()
like = em.train(data, gmm, maxiter = 30, thresh = 1e-8)
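# `like` is assumed here to hold the likelihood values computed at each EM
# iteration; if so, a quick convergence check is to plot it:
#import pylab as pl
#pl.plot(like)
#pl.xlabel('EM iteration')
#pl.ylabel('log-likelihood')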
# The computed parameters are in gmm.gm, which is the same object as lgm
# (remember, Python does not copy most objects by default). You can for
# example plot lgm against gm to compare, as sketched below.
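# A minimal sketch of that comparison, following the plotting pattern of the
# other examples in this directory (it assumes, as they do, that GM.plot
# returns the list of ellipse handles):
import pylab as pl
pl.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
h = gm.plot(level = 0.9)
[i.set_color('g') for i in h]
h[0].set_label('True confidence ellipsoids')
h = lgm.plot(level = 0.9)
[i.set_color('r') for i in h]
h[0].set_label('EM confidence ellipsoids')
pl.legend(loc = 0)
pl.show()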
"""
Simple Gaussian-mixture model example
======================================
"""
import numpy as np
from numpy.random import seed
from scikits.learn.em import GM, GMM, EM
seed(2)
k = 4
d = 2
mode = 'diag'
nframes = 1000
################################################################################
# Create an artificial GM model and sample from it
################################################################################
w, mu, va = GM.gen_param(d, k, mode, spread = 1.0)
gm = GM.fromvalues(w, mu, va)
# Sample nframes frames from the model
data = gm.sample(nframes)
################################################################################
# Learn the model with EM
################################################################################
# List of learned mixtures: lgm[i] is a mixture with i+1 components
lgm = []
kmax = 6
bics = np.zeros(kmax)
em = EM()
for i in range(kmax):
    lgm.append(GM(d, i+1, mode))
    gmm = GMM(lgm[i], 'kmean')
    em.train(data, gmm, maxiter = 30, thresh = 1e-10)
    bics[i] = gmm.bic(data)
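# Note that with this BIC convention, larger is better: the best model is
# picked with argmax below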
print "Original model has %d clusters, bics says %d" % (k, np.argmax(bics)+1)
################################################################################
# Draw the model
################################################################################
import pylab as pl
pl.subplot(3, 2, 1)
for k in range(kmax):
    pl.subplot(3, 2, k+1)
    level = 0.9
    pl.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
    # h keeps the handles of the plot, so that you can modify
    # its parameters, e.g. label or color
    h = lgm[k].plot(level = level)
    [i.set_color('r') for i in h]
    h[0].set_label('EM confidence ellipsoids')
    h = gm.plot(level = level)
    [i.set_color('g') for i in h]
    h[0].set_label('Real confidence ellipsoids')
    pl.legend(loc = 0)
pl.show()
"""
Classification using a mixture of Gaussians
===========================================
Example of classification with a mixture of Gaussians. Note that this is
really a toy example: there is no cross-validation, just a single naive
train/test split.
We use the famous iris dataset studied by Sir R.A. Fisher. You can try
changing the attributes used for classification, the number of components
used for the mixtures, etc..."""
import numpy as N
import pylab as P
import matplotlib as MPL
from scikits.learn.em import EM, GMM, GM
import utils
data = utils.iris.load()
# cnames are the class names
cnames = data.keys()
#--------------------
# Data pre processing
#--------------------
# We use 25 samples of each class (i.e. half of the iris data) for
# learning, and the other half for testing. We use sepal width and
# petal width only.
ln = 25
tn = 25
xdata = {}
ydata = {}
# learning data
ldata = {}
# You can change the attributes used here (sepal vs petal, width vs length)
for i in cnames:
    xdata[i] = data[i]['sepal width']
    ydata[i] = data[i]['petal width']
    ldata[i] = N.concatenate((xdata[i][:ln, N.newaxis],
                              ydata[i][:ln, N.newaxis]),
                             axis = 1)
tx = N.concatenate([xdata[i][ln:] for i in cnames])
ty = N.concatenate([ydata[i][ln:] for i in cnames])
tdata = N.concatenate((tx[:, N.newaxis], ty[:, N.newaxis]), axis = 1)
# tclass gives the correspondence class <-> indices in the testing data tdata
tclass = {}
for i in range(3):
    tclass[cnames[i]] = N.arange(tn * i, tn * (i+1))
#----------------------------
# Learning and classification
#----------------------------
# This function trains a mixture model with k components
def cluster(data, k, mode = 'full'):
    d = data.shape[1]
    gm = GM(d, k, mode)
    gmm = GMM(gm)
    em = EM()
    em.train(data, gmm, maxiter = 20)
    return gm
# Estimate each class with a mixture of nc components
nc = 2
mode = 'diag'
lmod = {}
for i in cnames:
    lmod[i] = cluster(ldata[i], nc, mode)
# Classify the testing data. Of course, the data are not really IID, because
# we did not shuffle the testing data, but in this case this does not change
# anything.
p = N.empty((len(tdata), 3))
for i in range(3):
    # For each class, compute the likelihood of the testing data
    p[:, i] = lmod[cnames[i]].pdf(tdata)
# We then take the Maximum A Posteriori class (the same as the most likely
# model in this case, since each class is equiprobable; see the sketch below)
cid = N.argmax(p, 1)
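# If the classes were not equiprobable, the MAP rule would weight each
# class-conditional likelihood by the class prior before taking the argmax.
# A minimal sketch (these priors are made up for illustration):
#priors = N.array([0.5, 0.3, 0.2])
#cid = N.argmax(p * priors, 1)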
classification = {}
for i in range(3):
    classification[cnames[i]] = N.where(cid == i)[0]
correct = {}
incorrect = {}
for i in cnames:
    correct[i] = N.intersect1d(classification[i], tclass[i])
    incorrect[i] = N.setdiff1d(classification[i], tclass[i])
#-----------------
# Plot the results
#-----------------
csym = {'setosa' : 's', 'versicolor' : 'x', 'virginica' : 'o'}
r = 50.
P.figure(figsize = [600/r, 400/r])
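# (with r = 50, figsize is 600/50 x 400/50 = 12 x 8 inches; together with
# dpi = 60 in the savefig call below, this gives a 720 x 480 pixel image)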
prop = MPL.font_manager.FontProperties(size='smaller')
# Plot the learning data with the mixtures
P.subplot(2, 1, 1)
for i in lmod.values():
    #i.plot()
    X, Y, Z, V = i.density_on_grid()
    P.contourf(X, Y, Z, V)
for i in cnames:
    P.plot(ldata[i][:, 0], ldata[i][:, 1], csym[i], label = i + ' (learning)')
P.xlabel('sepal width')
P.ylabel('petal width')
P.legend(loc = 'best')
# Plot the results on test dataset (green for correctly classified, red for
# incorrectly classified)
P.subplot(2, 1, 2)
for i in cnames:
    P.plot(tx[correct[i]], ty[correct[i]], 'g' + csym[i],
           label = '%s (correctly classified)' % i)
    if len(incorrect[i]) > 0:
        P.plot(tx[incorrect[i]], ty[incorrect[i]], 'r' + csym[i],
               label = '%s (incorrectly classified)' % i)
P.legend(loc = 'best', prop = prop)
P.xlabel('sepal width')
P.ylabel('petal width')
P.savefig('dclass.png', dpi = 60)
"""
Density estimation with mixture models
======================================
"""
#! /usr/bin/env python
# Last Change: Sun Jul 22 12:00 PM 2007 J
# Example of doing pdf estimation with EM algorithm. Requires matplotlib.
import numpy as N
import pylab as P
from scikits.learn.em import EM, GM, GMM
import utils
oldfaithful = utils.get_faithful()
# We want the relationship between d(t) and w(t+1), but get_faithful gives
# d(t), w(t), so we have to shift to get the "usual" faithful data
waiting = oldfaithful[1:, 1:]
duration = oldfaithful[:len(waiting), :1]
dt = N.concatenate((duration, waiting), 1)
# Scale the data so that each component is in [0..1]
dt = utils.scale(dt)
# This function trains a mixture model with k components, and returns the
# trained model and its BIC
def cluster(data, k, mode = 'full'):
    d = data.shape[1]
    gm = GM(d, k, mode)
    gmm = GMM(gm)
    em = EM()
    em.train(data, gmm, maxiter = 20)
    return gm, gmm.bic(data)
# bc will contain a list of BIC values for each model trained
bc = []
mode = 'full'
P.figure()
for k in range(1, 5):
    # Train a model with k components, and plot its isodensity curves
    P.subplot(2, 2, k)
    gm, b = cluster(dt, k = k, mode = mode)
    bc.append(b)
    X, Y, Z, V = gm.density_on_grid()
    P.contour(X, Y, Z, V)
    P.plot(dt[:, 0], dt[:, 1], '.')
    P.xlabel('duration time (scaled)')
    P.ylabel('waiting time (scaled)')
print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
"""
Density estimation
====================
This example shows how to do pdf estimation for one-dimensional data. It
estimates a Gaussian mixture model for several numbers of components, and
determines the "best" one according to the Bayesian Information
Criterion.
It uses the Old Faithful waiting times as the one-dimensional data, and
plots the best model as well as the BIC as a function of the number of
components.
"""
# Example of doing pdf estimation with EM algorithm. Requires matplotlib.
import numpy as N
import pylab as P
import matplotlib as MPL
from scikits.learn.em import EM, GM, GMM
import utils
oldfaithful = utils.get_faithful()
duration = oldfaithful[:, :1]
waiting = oldfaithful[:, 1:]
#dt = utils.scale(duration)
#dt = duration / 60.
dt = waiting / 60.
# This function trains a mixture model with k components, and returns the
# trained model and its BIC
def cluster(data, k):
    d = data.shape[1]
    gm = GM(d, k)
    gmm = GMM(gm)
    em = EM()
    em.train(data, gmm, maxiter = 20)
    return gm, gmm.bic(data)
# bc will contain the BIC value of each model trained, and gml the
# corresponding mixture models
bc = []
gml = []
for k in range(1, 8):
    gm, b = cluster(dt, k = k)
    bc.append(b)
    gml.append(gm)
mbic = N.argmax(bc)
# Below is code to display a figure with the histogram and the pdf of the
# best model (in the BIC sense) on the left, and the BIC as a function of
# the number of components on the right.
P.figure(figsize = [12, 7])
#---------------------------
# histogram + pdf estimation
#---------------------------
P.subplot(1, 2, 1)
h = gml[mbic].plot1d(gpdf=True)
# You can manipulate the different parts of the plot through the returned
# handles
h['gpdf'][0].set_linestyle('-')
h['gpdf'][0].set_label('pdf of the mixture')
h['pdf'][0].set_label('pdf of individual component')
[l.set_linestyle('-') for l in h['pdf']]
[l.set_color('g') for l in h['pdf']]
prop = MPL.font_manager.FontProperties(size='smaller')
P.legend(loc = 'best', prop = prop)
P.hist(dt, 25, normed = 1, fill = False)
P.xlabel('waiting time between consecutive eruptions (scaled)')
#------------------------------------------
# BIC as a function of number of components
#------------------------------------------
P.subplot(1, 2, 2)
P.plot(N.arange(1, 8), bc, 'o:')
P.xlabel("number of components")
P.ylabel("BIC")
print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
"""
Mixture of Gaussian expectation-maximization example
====================================================
"""
import pylab
import numpy as np
from scikits.learn.em.densities2 import gauss_ell
#=========================================
# Test plotting a simple diag 2d variance:
#=========================================
va = np.array([5, 3])
mu = np.array([2, 3])
# Generate a multivariate gaussian of mean mu and covariance va
X = np.random.randn(2, 1000)
Yc = np.dot(np.diag(np.sqrt(va)), X)
Yc = Yc.transpose() + mu
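# (scaling independent N(0, 1) coordinates by sqrt(va) gives a diagonal
# covariance diag(va), since var(a * x) = a**2 * var(x))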
# Plotting
Xe, Ye = gauss_ell(mu, va, npoints = 100)
pylab.figure()
pylab.plot(Yc[:, 0], Yc[:, 1], '.')
pylab.plot(Xe, Ye, 'r')
#=========================================
# Test plotting a simple full 2d variance:
#=========================================
va = np.array([[0.2, 0.1], [0.1, 0.5]])
mu = np.array([0, 3])
# Generate a multivariate gaussian of mean mu and covariance va
X = np.random.randn(1000, 2)
Yc = np.dot(np.linalg.cholesky(va), X.transpose())
Yc = Yc.transpose() + mu
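# (here L = cholesky(va) satisfies va = L L', so cov(L x) = L cov(x) L' = va
# for white noise x: the Cholesky factor maps white noise to the target
# covariance)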
# Plotting
Xe, Ye = gauss_ell(mu, va, npoints = 100, level=0.95)
pylab.figure()
pylab.plot(Yc[:, 0], Yc[:, 1], '.')
pylab.plot(Xe, Ye, 'r')
pylab.show()
"""
Ellipse fitting with Gaussian mixtures
=======================================
"""
# This is a simple test to check that the plotted confidence ellipsoids and
# isodensity contours match
import numpy as N
import pylab as P
from scikits.learn.em import EM, GM, GMM
# Generate a simple mixture model, plot its confidence ellipses + isodensity
# curves for both diagonal and full covariance matrices
d = 3
k = 3
dim = [0, 2]
# diag model
w, mu, va = GM.gen_param(d, k)
dgm = GM.fromvalues(w, mu, va)
# full model
w, mu, va = GM.gen_param(d, k, 'full', spread = 1)
fgm = GM.fromvalues(w, mu, va)
def plot_model(gm, dim):
    X, Y, Z, V = gm.density_on_grid(dim = dim)
    h = gm.plot(dim = dim)
    [i.set_linestyle('-.') for i in h]
    P.contour(X, Y, Z, V)
    data = gm.sample(200)
    P.plot(data[:, dim[0]], data[:, dim[1]], '.')
# Plot the contours and the ellipsoids of confidence
P.subplot(2, 1, 1)
plot_model(dgm, dim)
P.subplot(2, 1, 2)
plot_model(fgm, dim)
P.show()
"""
Regularized Gaussian mixture on hand-written digits
====================================================
Example of using RegularizedEM with the pendigits data.
If you want to do discriminant analysis with pendigits, you quickly run into
problems with EM if you use the coordinates directly: some points are likely
to lie on the border, so the corresponding component can end up with a
covariance matrix which easily becomes singular. Regularized EM avoids this
problem through a simple regularization of the mixture. You can play with
pcnt and pval to see the effect on pdf estimation.
For now, regularized EM is pretty crude, but it is enough for simple cases
where you need to avoid singular covariance matrices."""
import numpy as N
import pylab as P
from scikits.learn.em import EM, GM, GMM
# Experimental RegularizedEM
from scikits.learn.em.gmm_em import RegularizedEM
import utils
x, y = utils.get_pendigits()
# Take only the first point of pendigits for pdf estimation
dt1 = N.concatenate([x[:, N.newaxis], y[:, N.newaxis]], 1)
dt1 = utils.scale(dt1.astype(N.float))
# pcnt is the proportion of samples to use as the prior count. E.g. if you
# have 1000 samples and pcnt is 0.1, the prior count would be 100, and 1100
# samples will be considered overall when regularizing the parameters.
pcnt = 0.05
# You should try different values of pval. If pval is 0.1, then the
# regularization will be strong. If you use something like 0.01, really sharp
# components will appear. If the values are too small, the regularizer may not
# work (singular covariance matrices).
pval = 0.05
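# A rough sketch of the kind of update a prior count implies -- this is an
# assumption about RegularizedEM's internals, not its actual code: with n
# data points and prior count c = pcnt * n, a maximum-likelihood covariance
# estimate va_ml would be shrunk toward a multiple of the identity,
#
#   va_reg = (n * va_ml + c * pval * I) / (n + c)
#
# which stays non-singular even when a component collapses onto few points.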
# This function trains a mixture model with k components, and returns the
# trained model and its BIC
def cluster(data, k, mode = 'full'):
    d = data.shape[1]
    gm = GM(d, k, mode)
    gmm = GMM(gm, 'random')
    em = RegularizedEM(pcnt = pcnt, pval = pval)
    em.train(data, gmm, maxiter = 20)
    return gm, gmm.bic(data)
# bc will contain a list of BIC values for each model trained
N.seterr(all = 'warn')
bc = []
mode = 'full'
P.figure()
for k in range(1, 5):
    # Train a model with k components, and plot its isodensity curves
    P.subplot(2, 2, k)
    gm, b = cluster(dt1, k = k, mode = mode)
    bc.append(b)
    X, Y, Z, V = gm.density_on_grid(nl = 20)
    P.contour(X, Y, Z, V)
    P.plot(dt1[:, 0], dt1[:, 1], '.')
    P.xlabel('x coordinate (scaled)')
    P.ylabel('y coordinate (scaled)')
print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
P.show()
#! /usr/bin/env python
# Last Change: Sun Jul 22 03:00 PM 2007 J
# Various utilities for examples
import numpy as N
from scikits.learn.datasets import oldfaithful, pendigits
def get_faithful():
    """Return faithful data as a nx2 array, first column being duration,
    second being waiting time."""
    # Load faithful data, convert waiting into integer, remove L, M and S data
    data = oldfaithful.load()
    tmp1 = []
    tmp2 = []
    for i in data['data']:
        if not (i[0] == 'L' or i[0] == 'M' or i[0] == 'S'):
            tmp1.append(i[0])
            tmp2.append(i[1])
    waiting = N.array([int(i) for i in tmp1], dtype = N.float)
    duration = N.array([i for i in tmp2], dtype = N.float)
    waiting = waiting[:, N.newaxis]
    duration = duration[:, N.newaxis]
    return N.concatenate((waiting, duration), 1)
def get_pendigits():
    """Return the first point of each pendigits sample as two arrays
    (x coordinates, y coordinates)."""
    data = pendigits.training.load()
    return data['data']['x0'], data['data']['y0']
def scale(data):
    """Scale data such that each column is in the range [0..1].
    Note: operates in place."""
    n = N.min(data, 0)
    m = N.max(data, 0)
    data -= n
    data /= (m - n)
    return data
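# Example: scale(N.array([[0., 10.], [5., 20.]])) returns
# [[0., 0.], [1., 1.]] (and modifies its argument in place): each column is
# mapped onto [0, 1].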