From f2fc81e4fba200caf9ef7ba3b6fa4acff3d1bf9f Mon Sep 17 00:00:00 2001
From: Alexandre Passos <alexandre.tp@gmail.com>
Date: Fri, 15 Apr 2011 09:14:24 -0300
Subject: [PATCH] Editing a single example for the GMM and DPGMM explaining
 the difference

---
 examples/mixture/plot_dpgmm.py | 58 ----------------------------------
 examples/mixture/plot_gmm.py   | 57 ++++++++++++++++++++++++----------
 2 files changed, 42 insertions(+), 73 deletions(-)
 delete mode 100644 examples/mixture/plot_dpgmm.py

diff --git a/examples/mixture/plot_dpgmm.py b/examples/mixture/plot_dpgmm.py
deleted file mode 100644
index 1e0c4fab6c..0000000000
--- a/examples/mixture/plot_dpgmm.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""
-=================================
-DP Mixture Model Ellipsoids
-=================================
-
-Plot the covariance ellipsoids of a dirichlet process mixture of two
-gaussians for varying values of the alpha parameter.
-
-Note that we generate the data from two components, which are
-correctly recovered by the dirichlet process, even though the
-approximating distribution is truncated at five components.
-"""
-
-import numpy as np
-from scikits.learn import mixture
-import itertools
-
-import pylab as pl
-import matplotlib as mpl
-
-n, m = 200, 2
-
-# generate random sample, two components
-np.random.seed(0)
-C = np.array([[0., -0.7], [3.5, .7]])
-X = np.r_[np.dot(np.random.randn(n, 2), C),
-          np.random.randn(n, 2) + np.array([3, 3])]
-
-for p, alpha in enumerate([0.01, 1.]):
-    # fit a five-component dirichlet process mixture model on the
-    # data.
-    clf = mixture.DPGMM(n_states=5, cvtype='diag', alpha=alpha)
-    clf.fit(X)
-
-    splot = pl.subplot(311 + p, aspect='equal')
-    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'y'])
-
-    Y_ = clf.predict(X)
-
-    for i, (mean, covar, color) in enumerate(zip(clf.means,
-                                                 clf.covars,
-                                                 color_iter)):
-        v, w = np.linalg.eigh(covar)
-        u = w[0] / np.linalg.norm(w[0])
-        # as the DP will not use every component it has access to
-        # unless it needs it, we shouldn't plot the redundant
-        # components.
-        if not sum(Y_ == i) > 1:
-            continue
-        pl.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
-        angle = np.arctan(u[1] / u[0])
-        angle = 180 * angle / np.pi  # convert to degrees
-        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
-        ell.set_clip_box(splot.bbox)
-        ell.set_alpha(0.5)
-        splot.add_artist(ell)
-
-pl.show()
diff --git a/examples/mixture/plot_gmm.py b/examples/mixture/plot_gmm.py
index 80e1c6fbe8..d96d0cdf3f 100644
--- a/examples/mixture/plot_gmm.py
+++ b/examples/mixture/plot_gmm.py
@@ -3,7 +3,19 @@
 Gaussian Mixture Model Ellipsoids
 =================================
 
-Plot the confidence ellipsoids of a mixture of two gaussians.
+Plot the confidence ellipsoids of a mixture of two Gaussians, fit with
+EM and with a variational Dirichlet process.
+
+Both models have access to five components with which to fit the data.
+Note that the EM model will necessarily use all five components, while
+the DP model will effectively only use as many as are needed for a
+good fit. This is a property of the Dirichlet process prior.
+
+This example doesn't show it, as we're in a low-dimensional space, but
+another advantage of the Dirichlet process model is that it can fit
+full covariance matrices effectively even when there are fewer
+examples per cluster than there are dimensions in the data, due to the
+regularization properties of the inference algorithm.
 """
 import numpy as np
 from scikits.learn import mixture
@@ -21,24 +33,39 @@ C = np.array([[0., -0.7], [3.5, .7]])
 X = np.r_[np.dot(np.random.randn(n, 2), C),
           np.random.randn(n, 2) + np.array([3, 3])]
 
-clf = mixture.GMM(n_states=2, cvtype='full')
+
+# fit a mixture of Gaussians with EM, using five components
+clf = mixture.GMM(n_states=5, cvtype='diag')
 clf.fit(X)
 
-splot = pl.subplot(111, aspect='equal')
-color_iter = itertools.cycle (['r', 'g', 'b', 'c'])
+# fit a Dirichlet process mixture of Gaussians, using five components
+dpclf = mixture.DPGMM(n_states=5, cvtype='diag')
+dpclf.fit(X)
+
+color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
 
-Y_ = clf.predict(X)
-for i, (mean, covar, color) in enumerate(zip(clf.means, clf.covars, color_iter)):
-    v, w = np.linalg.eigh(covar)
-    u = w[0] / np.linalg.norm(w[0])
-    pl.scatter(X[Y_==i, 0], X[Y_==i, 1], .8, color=color)
-    angle = np.arctan(u[1]/u[0])
-    angle = 180 * angle / np.pi # convert to degrees
-    ell = mpl.patches.Ellipse (mean, v[0], v[1], 180 + angle, color=color)
-    ell.set_clip_box(splot.bbox)
-    ell.set_alpha(0.5)
-    splot.add_artist(ell)
+for j, c in enumerate([clf, dpclf]):
+    splot = pl.subplot(211 + j, aspect='equal')
+    Y_ = c.predict(X)
+    for i, (mean, covar, color) in enumerate(zip(c.means, c.covars, color_iter)):
+        v, w = np.linalg.eigh(covar)
+        u = w[0] / np.linalg.norm(w[0])
+        # as the DP will not use every component it has access to
+        # unless it needs it, we shouldn't plot the redundant
+        # components.
+        if np.sum(Y_ == i) <= 1:
+            continue
+        pl.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
+        angle = np.arctan(u[1] / u[0])
+        angle = 180 * angle / np.pi  # convert to degrees
+        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
+        ell.set_clip_box(splot.bbox)
+        ell.set_alpha(0.5)
+        splot.add_artist(ell)
+# Note that the GMM uses all the components it has access to, while
+# the Dirichlet process model uses only as many as are needed to
+# explain the data.
 pl.show()
-- 
GitLab