diff --git a/examples/em/README.txt b/examples/em/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c33ac9108e7da0b317db040d148072cc2a148004
--- /dev/null
+++ b/examples/em/README.txt
@@ -0,0 +1,11 @@
+
+Gaussian-mixture modeling
+--------------------------
+
+Examples concerning the `scikits.learn.em` package.
+
+.. warning::
+
+  The code in the `scikits.learn.em` submodule is not as mature as the
+  rest of the scikit and its API is subject to change.
+
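+A typical usage pattern, common to the examples in this directory (an
+illustrative sketch; see the individual scripts for complete, working code)::
+
+  from scikits.learn.em import GM, GMM, EM
+
+  # build a reference model and sample training data from it
+  w, mu, va = GM.gen_param(2, 3, 'diag', spread = 1.5)
+  data = GM.fromvalues(w, mu, va).sample(500)
+
+  # learn a new mixture from the data with EM
+  lgm = GM(2, 3, 'diag')
+  gmm = GMM(lgm)
+  em = EM()
+  em.train(data, gmm, maxiter = 20)
+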
diff --git a/examples/em/basic_example1.py b/examples/em/basic_example1.py
new file mode 100644
index 0000000000000000000000000000000000000000..96084fce3b54defd8b4281d0e4f71390332e6107
--- /dev/null
+++ b/examples/em/basic_example1.py
@@ -0,0 +1,53 @@
+"""
+Simple mixture model example
+=============================
+"""
+
+import numpy as np
+import pylab as pl
+from scikits.learn.em import GM
+
+################################################################################
+# Hyper parameters:
+#   - k:    number of clusters
+#   - d:    dimension
+k   = 3
+d   = 2
+
+################################################################################
+# Values for the weights, means and (diagonal) variances
+#   - the weights are a rank 1 array of length k
+#   - the means are expected to be rank 2, with one row per component
+#   - the variances are also expected to be rank 2. For a diagonal model,
+#   each row is the diagonal of one component's covariance; for a full
+#   model, the first d rows are the covariance of the first component,
+#   and so on, so the variance array has k*d rows and d columns
+w   = np.array([0.2, 0.45, 0.35])
+mu  = np.array([[4.1, 3], [1, 5], [-2, -3]])
+va  = np.array([[1, 1.5], [3, 4], [2, 3.5]])
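+
+# For comparison, a full-covariance model of the same size would stack the k
+# covariance matrices vertically into a (k*d, d) array. The values below are
+# only an illustrative sketch; va_full is not used anywhere else:
+va_full = np.array([[1.0, 0.2], [0.2, 1.5],
+                    [3.0, 0.5], [0.5, 4.0],
+                    [2.0, 0.1], [0.1, 3.5]])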
+
+################################################################################
+# First method to build a GM instance: directly from the parameters
+# (both methods shown here are equivalent).
+gm      = GM.fromvalues(w, mu, va)
+
+################################################################################
+# Second method to build a GM instance:
+gm      = GM(d, k, mode = 'diag')
+# set_param checks that w, mu and va are consistent with d, k and the mode
+gm.set_param(w, mu, va)
+
+# Once set_param is called, both methods are equivalent. The second
+# method is useful when using a GM object for learning (where the
+# learner class will set the parameters), whereas the first one is
+# handy when you need to quickly build a model from existing values,
+# without specifying the hyper parameters explicitly.
+
+# Sample 1000 points from the mixture (one row = one 2-dimensional sample)
+data    = gm.sample(1000)
+
+# Plot the samples
+pl.plot(data[:, 0], data[:, 1], '.')
+# Plot the confidence ellipsoids at the 75 % level
+gm.plot(level = 0.75)
+pl.show()
diff --git a/examples/em/basic_example2.py b/examples/em/basic_example2.py
new file mode 100644
index 0000000000000000000000000000000000000000..19615e7e614c117d7134a5199f60c54048236bee
--- /dev/null
+++ b/examples/em/basic_example2.py
@@ -0,0 +1,50 @@
+"""
+Another simple mixture model example
+=====================================
+"""
+
+from numpy.random import seed
+
+from scikits.learn.em import GM, GMM, EM
+
+# To reproduce results, fix the random seed
+seed(1)
+
+################################################################################
+# Meta parameters of the model
+#   - k: Number of components
+#   - d: dimension of each Gaussian
+#   - mode: covariance mode, 'full' or 'diag' (string)
+#   - nframes: number of frames (frame = one data point = one
+#   row of d elements)
+k       = 2
+d       = 2
+mode    = 'diag'
+nframes = 1000
+
+################################################################################
+# Create an artificial GM model and sample from it
+################################################################################
+
+w, mu, va   = GM.gen_param(d, k, mode, spread = 1.5)
+gm          = GM.fromvalues(w, mu, va)
+
+# Sample nframes frames from the model
+data    = gm.sample(nframes)
+
+################################################################################
+# Learn the model with EM
+################################################################################
+
+# Create a GMM (the learner) from a Gaussian mixture, using k-mean initialization
+lgm = GM(d, k, mode)
+gmm = GMM(lgm, 'kmean')
+
+# The actual EM, with likelihood computation. The threshold
+# is compared to the (linearly approximated) derivative of the likelihood
+em      = EM()
+like    = em.train(data, gmm, maxiter = 30, thresh = 1e-8)
+
+# The computed parameters are in gmm.gm, which is the same object as lgm
+# (remember, Python does not copy most objects by default). You can, for
+# example, plot lgm against gm to compare the estimated and true models,
+# as sketched below.
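+
+# A minimal sketch of such a comparison, reusing GM.plot as in the other
+# examples (assumes matplotlib/pylab is available):
+import pylab as pl
+pl.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+h = gm.plot()                  # confidence ellipsoids of the true model
+[i.set_color('g') for i in h]
+h = lgm.plot()                 # confidence ellipsoids of the learned model
+[i.set_color('r') for i in h]
+pl.show()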
diff --git a/examples/em/basic_example3.py b/examples/em/basic_example3.py
new file mode 100644
index 0000000000000000000000000000000000000000..c953319de58888c3799f920fe5429feada7481f2
--- /dev/null
+++ b/examples/em/basic_example3.py
@@ -0,0 +1,67 @@
+"""
+Simple Gaussian-mixture model example
+======================================
+"""
+
+import numpy as np
+from numpy.random import seed
+
+from scikits.learn.em import GM, GMM, EM
+
+seed(2)
+
+k       = 4
+d       = 2
+mode    = 'diag'
+nframes = 1000
+
+################################################################################
+# Create an artificial GM model and sample from it
+################################################################################
+w, mu, va   = GM.gen_param(d, k, mode, spread = 1.0)
+gm          = GM.fromvalues(w, mu, va)
+
+# Sample nframes frames from the model
+data    = gm.sample(nframes)
+
+################################################################################
+# Learn the model with EM
+################################################################################
+
+# List of learned mixtures: lgm[i] is a mixture with i+1 components
+lgm     = []
+kmax    = 6
+bics    = np.zeros(kmax)
+em      = EM()
+for i in range(kmax):
+    lgm.append(GM(d, i+1, mode))
+
+    gmm = GMM(lgm[i], 'kmean')
+    em.train(data, gmm, maxiter = 30, thresh = 1e-10)
+    bics[i] = gmm.bic(data)
+
+print "Original model has %d clusters, bics says %d" % (k, np.argmax(bics)+1)
+
+################################################################################
+# Draw the model
+################################################################################
+import pylab as pl
+pl.subplot(3, 2, 1)
+
+for k in range(kmax):
+    pl.subplot(3, 2, k+1)
+    level   = 0.9
+    pl.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+
+    # h keeps the handles of the plotted ellipsoids, so that you can modify
+    # their properties, e.g. label or color
+    h   = lgm[k].plot(level = level)
+    [i.set_color('r') for i in h]
+    h[0].set_label('EM confidence ellipsoids')
+
+    h   = gm.plot(level = level)
+    [i.set_color('g') for i in h]
+    h[0].set_label('Real confidence ellipsoids')
+
+pl.legend(loc = 0)
+pl.show()
diff --git a/examples/em/discriminant_analysis.py b/examples/em/discriminant_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..86810386320f2633951c75a9b457bd86059e6032
--- /dev/null
+++ b/examples/em/discriminant_analysis.py
@@ -0,0 +1,128 @@
+"""
+Classification using Gaussian mixtures
+=======================================
+
+Example of doing classification with Gaussian mixtures. Note that this is
+really a toy example: we use a single train/test split of the data and no
+cross validation.
+
+We use the famous iris dataset of Sir R.A. Fisher. You can try to change
+the attributes used for classification, the number of components used for
+the mixtures, etc..."""
+
+import numpy as N
+import pylab as P
+import matplotlib as MPL
+
+from scikits.learn.em import EM, GMM, GM
+import utils
+
+data = utils.iris.load()
+# cnames are the class names
+cnames = data.keys()
+
+#--------------------
+# Data pre processing
+#--------------------
+# we use 25 samples of each class (i.e. half of the iris data) for
+# learning, and the other half for testing. We use sepal width and petal
+# width only.
+ln = 25
+tn = 25
+xdata = {}
+ydata = {}
+# learning data
+ldata = {}
+
+# you can change the attributes used here (sepal vs petal, width vs length)
+for i in cnames:
+    xdata[i] = data[i]['sepal width']
+    ydata[i] = data[i]['petal width']
+    ldata[i] = N.concatenate((xdata[i][:ln, N.newaxis], 
+                              ydata[i][:ln, N.newaxis]), 
+                             axis = 1)
+
+tx = N.concatenate([xdata[i][ln:] for i in cnames])
+ty = N.concatenate([ydata[i][ln:] for i in cnames])
+tdata = N.concatenate((tx[:, N.newaxis], ty[:, N.newaxis]), axis = 1)
+
+# tclass maps each class to its indices in the testing data tdata
+tclass = {}
+for i in range(3):
+    tclass[cnames[i]] = N.arange(tn * i, tn * (i+1))
+
+#----------------------------
+# Learning and classification
+#----------------------------
+# This function trains a mixture model with k components
+def cluster(data, k, mode = 'full'):
+    d = data.shape[1]
+    gm = GM(d, k, mode)
+    gmm = GMM(gm)
+    em = EM()
+    em.train(data, gmm, maxiter = 20)
+    return gm
+
+# Estimate each class with a mixture of nc components
+nc = 2
+mode = 'diag'
+lmod = {}
+for i in cnames:
+    lmod[i] = cluster(ldata[i], nc, mode)
+
+# Classify the testing data. Of course, the data are not really IID, because
+# we did not shuffle the testing data, but in this case this does not change
+# anything.
+p = N.empty((len(tdata), 3))
+for i in range(3):
+    # For each class, compute the likelihood of the testing data
+    p[:, i] = lmod[cnames[i]].pdf(tdata)
+
+# We then take the Maximum A Posteriori class (the same as the most likely
+# model in this case, since each class is equiprobable)
+cid = N.argmax(p, 1)
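+
+# If the classes were not equiprobable, we would weight each likelihood by the
+# class prior before taking the argmax. A minimal sketch (the uniform priors
+# below are purely illustrative and give exactly the same cid as above):
+priors = N.ones(3) / 3.
+cid = N.argmax(p * priors, 1)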
+classification = {}
+for i in range(3):
+    classification[cnames[i]] = N.where(cid == i)[0]
+
+correct = {}
+incorrect = {}
+for i in cnames:
+    correct[i] = N.intersect1d(classification[i], tclass[i])
+    incorrect[i] = N.setdiff1d(classification[i], tclass[i])
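+
+# Quick textual summary of the per-class accuracy (a small illustrative
+# addition; tn is the number of test samples per class)
+for i in cnames:
+    print "%s: %d / %d correctly classified" % (i, len(correct[i]), tn)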
+
+#-----------------
+# Plot the results
+#-----------------
+csym = {'setosa' : 's', 'versicolor' : 'x', 'virginica' : 'o'}
+r = 50.
+P.figure(figsize = [600/r, 400/r])
+
+prop = MPL.font_manager.FontProperties(size='smaller')
+
+# Plot the learning data with the mixtures
+P.subplot(2, 1, 1)
+for i in lmod.values():
+    #i.plot()
+    X, Y, Z, V = i.density_on_grid()
+    P.contourf(X, Y, Z, V)
+
+for i in cnames:
+    P.plot(ldata[i][:, 0], ldata[i][:, 1], csym[i], label = i + ' (learning)')
+P.xlabel('sepal width')
+P.ylabel('petal width')
+P.legend(loc = 'best')
+
+# Plot the results on the test dataset (green for correctly classified, red for
+# incorrectly classified)
+P.subplot(2, 1, 2)
+for i in cnames:
+    P.plot(tx[correct[i]], ty[correct[i]], 'g' + csym[i], 
+           label = '%s (correctly classified)' % i)
+    if len(incorrect[i]) > 0:
+        P.plot(tx[incorrect[i]], ty[incorrect[i]], 'r' + csym[i], 
+               label = '%s (incorrectly classified)' % i)
+P.legend(loc = 'best', prop = prop)
+P.xlabel('sepal width')
+P.ylabel('petal width')
+P.savefig('dclass.png', dpi = 60)
diff --git a/examples/em/expectation.py b/examples/em/expectation.py
new file mode 100644
index 0000000000000000000000000000000000000000..b33998e8b20e911bd68af5230edf0ec186c1c0b1
--- /dev/null
+++ b/examples/em/expectation.py
@@ -0,0 +1,44 @@
+"""
+Mixture of Gaussian expectation maximization example
+=====================================================
+
+"""
+
+import pylab
+import numpy as np
+from scikits.learn.em.densities2 import gauss_ell
+
+#=========================================
+# Test plotting a simple diag 2d variance:
+#=========================================
+va  = np.array([5, 3])
+mu  = np.array([2, 3])
+
+# Generate samples from a multivariate Gaussian with mean mu and diagonal
+# covariance va
+X       = np.random.randn(2, 1000)
+Yc      = np.dot(np.diag(np.sqrt(va)), X)
+Yc      = Yc.transpose() + mu
+
+# Plotting
+Xe, Ye  = gauss_ell(mu, va, npoints = 100)
+pylab.figure()
+pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+pylab.plot(Xe, Ye, 'r')
+
+#=========================================
+# Test plotting a simple full 2d variance:
+#=========================================
+va  = np.array([[0.2, 0.1],[0.1, 0.5]])
+mu  = np.array([0, 3])
+
+# Generate samples from a multivariate Gaussian with mean mu and full
+# covariance va
+X       = np.random.randn(1000, 2)
+Yc      = np.dot(np.linalg.cholesky(va), X.transpose())
+Yc      = Yc.transpose() + mu
+
+# Plotting
+Xe, Ye  = gauss_ell(mu, va, npoints = 100, level=0.95)
+pylab.figure()
+pylab.plot(Yc[:, 0], Yc[:, 1], '.')
+pylab.plot(Xe, Ye, 'r')
+pylab.show()
diff --git a/examples/em/pdfestimation.py b/examples/em/pdfestimation.py
new file mode 100644
index 0000000000000000000000000000000000000000..195573d20bc0e3fa8e6fe6cc11918111e629b9e3
--- /dev/null
+++ b/examples/em/pdfestimation.py
@@ -0,0 +1,53 @@
+"""
+Density estimation with mixture models
+======================================
+"""
+
+# Example of doing pdf estimation with EM algorithm. Requires matplotlib.
+import numpy as N
+import pylab as P
+
+from scikits.learn.em import EM, GM, GMM
+import utils
+
+oldfaithful = utils.get_faithful()
+
+# We want the relationship between d(t) and w(t+1), but get_faithful gives
+# d(t), w(t), so we have to shift to get the "usual" faithful data
+waiting = oldfaithful[1:, 1:]
+duration = oldfaithful[:len(waiting), :1]
+dt = N.concatenate((duration, waiting), 1)
+
+# Scale the data so that each component is in [0..1]
+dt = utils.scale(dt)
+
+# This function trains a mixture model with k components and returns the
+# trained model and its BIC
+def cluster(data, k, mode = 'full'):
+    d = data.shape[1]
+    gm = GM(d, k, mode)
+    gmm = GMM(gm)
+    em = EM()
+    em.train(data, gmm, maxiter = 20)
+    return gm, gmm.bic(data)
+
+# bc will contain a list of BIC values for each model trained
+bc = []
+mode = 'full'
+P.figure()
+for k in range(1, 5):
+    # Train a model with k components and plot the isodensity curves
+    P.subplot(2, 2, k)
+    gm, b = cluster(dt, k = k, mode = mode)
+    bc.append(b)
+
+    X, Y, Z, V = gm.density_on_grid()
+    P.contour(X, Y, Z, V)
+    P.plot(dt[:, 0], dt[:, 1], '.')
+    P.xlabel('duration time (scaled)')
+    P.ylabel('waiting time (scaled)')
+
+print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
diff --git a/examples/em/pdfestimation1d.py b/examples/em/pdfestimation1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..08227c2a71aa5105e8cb3936759ea34e59731017
--- /dev/null
+++ b/examples/em/pdfestimation1d.py
@@ -0,0 +1,85 @@
+"""
+
+Density estimation
+====================
+
+This example shows how to do pdf estimation for one-dimensional data. It
+estimates a Gaussian mixture model for several numbers of components, and
+determines the "best" one according to the Bayesian Information
+Criterion.
+
+It uses the Old Faithful waiting times as the one-dimensional data, and plots
+the best model as well as the BIC as a function of the number of components.
+"""
+
+# Example of doing pdf estimation with EM algorithm. Requires matplotlib.
+import numpy as N
+
+import pylab as P
+import matplotlib as MPL
+
+from scikits.learn.em import EM, GM, GMM
+import utils
+
+oldfaithful = utils.get_faithful()
+
+duration = oldfaithful[:, :1]
+waiting = oldfaithful[:, 1:]
+
+#dt = utils.scale(duration)
+#dt = duration / 60.
+dt = waiting / 60.
+
+# This function trains a mixture model with k components and returns the
+# trained model and its BIC
+def cluster(data, k):
+    d = data.shape[1]
+    gm = GM(d, k)
+    gmm = GMM(gm)
+    em = EM()
+    em.train(data, gmm, maxiter = 20)
+    return gm, gmm.bic(data)
+
+# bc will contain the BIC value for each model trained, and gml the
+# corresponding mixture models
+bc = []
+gml = []
+
+for k in range(1, 8):
+    gm, b = cluster(dt, k = k)
+    bc.append(b)
+    gml.append(gm)
+
+mbic = N.argmax(bc)
+
+# Below is the code to display a figure with the histogram and the pdf of the
+# best model (in the BIC sense), with the BIC as a function of the number of
+# components on the right.
+P.figure(figsize = [12, 7])
+#---------------------------
+# histogram + pdf estimation
+#---------------------------
+P.subplot(1, 2, 1)
+h = gml[mbic].plot1d(gpdf=True)
+# You can manipulate the different parts of the plot through the returned
+# handles
+h['gpdf'][0].set_linestyle('-')
+h['gpdf'][0].set_label('pdf of the mixture')
+h['pdf'][0].set_label('pdf of the individual components')
+[l.set_linestyle('-') for l in h['pdf']]
+[l.set_color('g') for l in h['pdf']]
+
+prop = MPL.font_manager.FontProperties(size='smaller')
+P.legend(loc = 'best', prop = prop)
+
+P.hist(dt, 25, normed = 1, fill = False)
+P.xlabel('waiting time between consecutive eruptions (in min)')
+
+#------------------------------------------
+# BIC as a function of number of components
+#------------------------------------------
+P.subplot(1, 2, 2)
+P.plot(N.arange(1, 8), bc, 'o:')
+P.xlabel("number of components")
+P.ylabel("BIC")
+print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
diff --git a/examples/em/plotexamples.py b/examples/em/plotexamples.py
new file mode 100644
index 0000000000000000000000000000000000000000..a38c09b7aa3f918b342c01c014e334ed07a0a179
--- /dev/null
+++ b/examples/em/plotexamples.py
@@ -0,0 +1,42 @@
+"""
+Ellipse fitting with Gaussian mixtures
+=======================================
+
+"""
+
+# This is a simple test to check that the plotted confidence ellipsoids and
+# the isodensity contours match
+import numpy as N
+
+import pylab as P
+
+from scikits.learn.em import EM, GM, GMM
+
+# Generate a simple mixture model, plot its confidence ellipses + isodensity
+# curves for both diagonal and full covariance matrices
+d = 3
+k = 3
+dim = [0, 2]
+# diag model
+w, mu, va = GM.gen_param(d, k)
+dgm = GM.fromvalues(w, mu, va)
+# full model
+w, mu, va = GM.gen_param(d, k, 'full', spread = 1)
+fgm = GM.fromvalues(w, mu, va)
+
+def plot_model(gm, dim):
+    X, Y, Z, V = gm.density_on_grid(dim = dim)
+    h = gm.plot(dim = dim)
+    [i.set_linestyle('-.') for i in h]
+    P.contour(X, Y, Z, V)
+    data = gm.sample(200)
+    P.plot(data[:, dim[0]], data[:,dim[1]], '.')
+
+# Plot the contours and the ellipsoids of confidence
+P.subplot(2, 1, 1)
+plot_model(dgm, dim)
+
+P.subplot(2, 1, 2)
+plot_model(fgm, dim)
+
+P.show()
diff --git a/examples/em/regularized_example.py b/examples/em/regularized_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..a16bd39235802a1ceae98c5398af0e2c142ca654
--- /dev/null
+++ b/examples/em/regularized_example.py
@@ -0,0 +1,71 @@
+"""
+
+Regularized Gaussian mixture on hand-written digits
+====================================================
+
+Example of using RegularizedEM with pendigits data.
+
+If you want to do discriminant analysis with pendigits, you quickly run into
+problems with EM if you use the coordinates directly, because some points are
+likely to be on the border, hence the corresponding component can have a
+covariance matrix which easily becomes singular. Regularized EM avoids this
+problem by applying a simple regularization to the mixture. You can play with
+pcnt and pval to see the effect on pdf estimation.
+
+For now, regularized EM is pretty crude, but it is enough for simple cases where
+you need to avoid singular covariance matrices."""
+
+import numpy as N
+import pylab as P
+
+from scikits.learn.em import EM, GM, GMM
+# Experimental RegularizedEM
+from scikits.learn.em.gmm_em import RegularizedEM
+import utils
+
+x, y = utils.get_pendigits()
+
+# Take only the first point of pendigits for pdf estimation
+dt1 = N.concatenate([x[:, N.newaxis], y[:, N.newaxis]], 1)
+dt1 = utils.scale(dt1.astype(N.float))
+
+# pcnt is the proportion of samples to use as prior count. E.g. if you have
+# 1000 samples and pcnt is 0.1, then the prior count is 100, and 1100 samples
+# are considered overall when regularizing the parameters.
+pcnt = 0.05
+# You should try different values of pval. If pval is 0.1, then the
+# regularization will be strong. If you use something like 0.01, really sharp
+# components will appear. If the values are too small, the regularizer may not
+# work (singular covariance matrices).
+pval = 0.05
+
+# This function trains a mixture model with k components and returns the
+# trained model and its BIC
+def cluster(data, k, mode = 'full'):
+    d = data.shape[1]
+    gm = GM(d, k, mode)
+    gmm = GMM(gm, 'random')
+    em = RegularizedEM(pcnt = pcnt, pval = pval)
+    em.train(data, gmm, maxiter = 20)
+    return gm, gmm.bic(data)
+
+# bc will contain a list of BIC values for each model trained
+N.seterr(all = 'warn')
+bc = []
+mode = 'full'
+
+P.figure()
+for k in range(1, 5):
+    # Train a model with k components and plot the isodensity curves
+    P.subplot(2, 2, k)
+    gm, b = cluster(dt1, k = k, mode = mode)
+    bc.append(b)
+
+    X, Y, Z, V = gm.density_on_grid(nl = 20)
+    P.contour(X, Y, Z, V)
+    P.plot(dt1[:, 0], dt1[:, 1], '.')
+    P.xlabel('x coordinate (scaled)')
+    P.ylabel('y coordinate (scaled)')
+
+print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)
+P.show()
diff --git a/examples/em/utils.py b/examples/em/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..89f11cf19030a8e7b39cc59d2b8869b7e3e87e88
--- /dev/null
+++ b/examples/em/utils.py
@@ -0,0 +1,47 @@
+#! /usr/bin/env python
+# Last Change: Sun Jul 22 03:00 PM 2007 J
+
+# Various utilities for the examples
+
+import numpy as N
+
+from scikits.learn.datasets import oldfaithful, pendigits
+
+def get_faithful():
+    """Return faithful data as a nx2 array, first column being duration, second
+    being waiting time."""
+    # Load faithful data, convert waiting into integer, remove L, M and S data
+    data = oldfaithful.load()
+    tmp1 = []
+    tmp2 = []
+    for i in data['data']:
+        if not (i[0] == 'L' or i[0] == 'M' or i[0] == 'S'):
+            tmp1.append(i[0])
+            tmp2.append(i[1])
+            
+    waiting = N.array([int(i) for i in tmp1], dtype = N.float)
+    duration = N.array([i for i in tmp2], dtype = N.float)
+
+    waiting = waiting[:, N.newaxis]
+    duration = duration[:, N.newaxis]
+
+    return N.concatenate((waiting, duration), 1)
+
+def get_pendigits():
+    """Return faithful data as a nx2 array, first column being duration, second
+    being waiting time."""
+    # Load faithful data, convert waiting into integer, remove L, M and S data
+    data = pendigits.training.load()
+    return data['data']['x0'], data['data']['y0']
+
+def scale(data):
+    """ Scale data such as each col is in the range [0..1].
+
+    Note: inplace."""
+    n = N.min(data, 0)
+    m = N.max(data, 0)
+
+    data -= n
+    data /= (m-n)
+    return data
+
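+# Example usage (an illustrative sketch): scaling a small array in place
+if __name__ == '__main__':
+    a = N.array([[1., 10.], [3., 20.], [5., 30.]])
+    print scale(a)      # each column now spans [0, 1]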