From f2fc81e4fba200caf9ef7ba3b6fa4acff3d1bf9f Mon Sep 17 00:00:00 2001
From: Alexandre Passos <alexandre.tp@gmail.com>
Date: Fri, 15 Apr 2011 09:14:24 -0300
Subject: [PATCH] Editing a single example for the GMM and DPGMM explaining the
 difference

---
 examples/mixture/plot_dpgmm.py | 58 ----------------------------------
 examples/mixture/plot_gmm.py   | 57 ++++++++++++++++++++++++---------
 2 files changed, 42 insertions(+), 73 deletions(-)
 delete mode 100644 examples/mixture/plot_dpgmm.py

diff --git a/examples/mixture/plot_dpgmm.py b/examples/mixture/plot_dpgmm.py
deleted file mode 100644
index 1e0c4fab6c..0000000000
--- a/examples/mixture/plot_dpgmm.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""
-=================================
-DP Mixture Model Ellipsoids
-=================================
-
-Plot the covariance ellipsoids of a dirichlet process mixture of two
-gaussians for varying values of the alpha parameter.
-
-Note that we generate the data from two components, which are
-correctly recovered by the dirichlet process, even though the
-approximating distribution is truncated at five components.
-"""
-
-import numpy as np
-from scikits.learn import mixture
-import itertools
-
-import pylab as pl
-import matplotlib as mpl
-
-n, m = 200, 2
-
-# generate random sample, two components
-np.random.seed(0)
-C = np.array([[0., -0.7], [3.5, .7]])
-X = np.r_[np.dot(np.random.randn(n, 2), C),
-          np.random.randn(n, 2) + np.array([3, 3])]
-
-for p, alpha in enumerate([0.01, 1.]):
-    # fit a five-component dirichlet process mixture model on the
-    # data.
-    clf = mixture.DPGMM(n_states=5, cvtype='diag', alpha=alpha)
-    clf.fit(X)
-
-    splot = pl.subplot(311 + p, aspect='equal')
-    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'y'])
-
-    Y_ = clf.predict(X)
-
-    for i, (mean, covar, color) in enumerate(zip(clf.means,
-                                                 clf.covars,
-                                                 color_iter)):
-        v, w = np.linalg.eigh(covar)
-        u = w[0] / np.linalg.norm(w[0])
-        # as the DP will not use every component it has access to
-        # unless it needs it, we shouldn't plot the redundant
-        # components.
-        if not sum(Y_ == i) > 1:
-            continue
-        pl.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
-        angle = np.arctan(u[1] / u[0])
-        angle = 180 * angle / np.pi  # convert to degrees
-        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
-        ell.set_clip_box(splot.bbox)
-        ell.set_alpha(0.5)
-        splot.add_artist(ell)
-
-pl.show()
diff --git a/examples/mixture/plot_gmm.py b/examples/mixture/plot_gmm.py
index 80e1c6fbe8..d96d0cdf3f 100644
--- a/examples/mixture/plot_gmm.py
+++ b/examples/mixture/plot_gmm.py
@@ -3,7 +3,19 @@
 Gaussian Mixture Model Ellipsoids
 =================================
 
-Plot the confidence ellipsoids of a mixture of two gaussians.
+Plot the confidence ellipsoids of a mixture of two Gaussians fitted with
+EM and with a variational Dirichlet process.
+
+Both models have access to five components with which to fit the
+data. Note that the EM model will necessarily use all five components
+while the DP model will effectively only use as many as are needed for
+a good fit. This is a property of the Dirichlet Process prior.
+
+This example doesn't show it, as we're in a low-dimensional space, but
+another advantage of the Dirichlet process model is that it can fit
+full covariance matrices effectively even when there are fewer examples
+per cluster than there are dimensions in the data, due to the
+regularization properties of the inference algorithm.
 """
 
 import numpy as np
@@ -21,24 +33,39 @@ C = np.array([[0., -0.7], [3.5, .7]])
 X = np.r_[np.dot(np.random.randn(n, 2), C),
           np.random.randn(n, 2) + np.array([3, 3])]
 
-clf = mixture.GMM(n_states=2, cvtype='full')
+
+# fit a mixture of gaussians with EM using five components
+clf = mixture.GMM(n_states=5, cvtype='diag')
 clf.fit(X)
 
-splot = pl.subplot(111, aspect='equal')
-color_iter = itertools.cycle (['r', 'g', 'b', 'c'])
+# fit a dirichlet process mixture of gaussians using five components
+dpclf = mixture.DPGMM(n_states=5, cvtype='diag')
+dpclf.fit(X)
+
+color_iter = itertools.cycle (['r', 'g', 'b', 'c', 'm'])
 
-Y_ = clf.predict(X)
 
-for i, (mean, covar, color) in enumerate(zip(clf.means, clf.covars, color_iter)):
-    v, w = np.linalg.eigh(covar)
-    u = w[0] / np.linalg.norm(w[0])
-    pl.scatter(X[Y_==i, 0], X[Y_==i, 1], .8, color=color)
-    angle = np.arctan(u[1]/u[0])
-    angle = 180 * angle / np.pi # convert to degrees
-    ell = mpl.patches.Ellipse (mean, v[0], v[1], 180 + angle, color=color)
-    ell.set_clip_box(splot.bbox)
-    ell.set_alpha(0.5)
-    splot.add_artist(ell)
+for i,c in enumerate([clf, dpclf]):
+    splot = pl.subplot(211+i, aspect='equal')
+    Y_ = c.predict(X)
+    for i, (mean, covar, color) in enumerate(zip(c.means, c.covars, color_iter)):
+        v, w = np.linalg.eigh(covar)
+        u = w[0] / np.linalg.norm(w[0])
+        # as the DP will not use every component it has access to
+        # unless it needs it, we shouldn't plot the redundant
+        # components.
+        if not sum(Y_ == i) > 1:
+            continue
+        pl.scatter(X[Y_==i, 0], X[Y_==i, 1], .8, color=color)
+        angle = np.arctan(u[1]/u[0])
+        angle = 180 * angle / np.pi # convert to degrees
+        ell = mpl.patches.Ellipse (mean, v[0], v[1], 180 + angle, color=color)
+        ell.set_clip_box(splot.bbox)
+        ell.set_alpha(0.5)
+        splot.add_artist(ell)
 
+# Note that the GMM will use all components it has access to, while
+# the Dirichlet process model will only use as many as are needed to
+# explain the data.
 pl.show()
 
-- 
GitLab