Use CCA as well in multilabel example.

1382b264 · Mathieu Blondel · 512c9116 · 1382b264
Commit 1382b264 authored 13 years ago by Mathieu Blondel
--- a/examples/plot_multilabel.py
+++ b/examples/plot_multilabel.py
+# Authors: Vlad Niculae, Mathieu Blondel
+# License: BSD
 """
 =========================
 Multilabel classification
@@ -11,16 +13,16 @@ dataset is generated randomly based on the following process:
    - pick the document length: k ~ Poisson(length)
    - k times, choose a word: w ~ Multinomial(theta_c)

-In the above process, rejection sampling is used to make sure that
-n is never zero or more than 2, and that the document length
-is never zero. Likewise, we reject classes which have already been chosen.
-The documents that are assigned to both classes are plotted surrounded by
-two colored circles.
+In the above process, rejection sampling is used to make sure that n is never
+more than 2, and that the document length is never zero. Likewise, we reject
+classes which have already been chosen. The documents that are assigned to both
+classes are plotted surrounded by two colored circles.

 The classification is performed by projecting to the first two principal
-components for visualisation purposes, followed by using the
-:class:`sklearn.multiclass.OneVsRestClassifier` metaclassifier using two SVCs
-with linear kernels to learn a discriminative model for each class.
+components found by PCA and CCA for visualisation purposes, followed by using
+the :class:`sklearn.multiclass.OneVsRestClassifier` metaclassifier using two
+SVCs with linear kernels to learn a discriminative model for each class.
+Note that PCA is an unsupervised algorithm, while CCA is supervised.
 """
 print __doc__

@@ -30,7 +32,9 @@ import matplotlib.pylab as pl
 from sklearn.datasets import make_multilabel_classification
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.svm import SVC
+from sklearn.preprocessing import LabelBinarizer
 from sklearn.decomposition import PCA
+from sklearn.pls import CCA


 def plot_hyperplane(clf, min_x, max_x, linestyle, label):
@@ -42,26 +46,24 @@ def plot_hyperplane(clf, min_x, max_x, linestyle, label):
    pl.plot(xx, yy, linestyle, label=label)


-pl.figure(figsize=(13, 6))
-
-for subplot, allow_unlabeled, title in zip((1, 2),
-                                           (False, True),
-                                           ('with unlabeled samples',
-                                            'without unlabeled samples')):
-    X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
-                                          allow_unlabeled=allow_unlabeled,
-                                          random_state=42)
+def plot_subfigure(X, Y, subplot, title, transform):
+    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
+    elif transform == "cca":
+        # Convert list of tuples to a class indicator matrix first
+        Y_indicator = LabelBinarizer().fit(Y).transform(Y)
+        X = CCA(n_components=2).fit(X, Y_indicator).transform(X)
+    else:
+        raise ValueError
+
    min_x = np.min(X[:, 0])
    max_x = np.max(X[:, 0])

    classif = OneVsRestClassifier(SVC(kernel='linear'))
    classif.fit(X, Y)

-    pl.subplot(1, 2, subplot)
-    pl.title('Multilabel classification\n(%s)' % title)
-    pl.xlabel('First principal component')
-    pl.ylabel('Second principal component')
+    pl.subplot(2, 2, subplot)
+    pl.title(title)

    zero_class = np.where([0 in y for y in Y])
    one_class = np.where([1 in y for y in Y])
@@ -78,7 +80,28 @@ for subplot, allow_unlabeled, title in zip((1, 2),
                    'Boundary\nfor class 2')
    pl.xticks(())
    pl.yticks(())
-    pl.legend()
+
+    if subplot == 1:
+        pl.xlabel('First principal component')
+        pl.ylabel('Second principal component')
+        pl.legend(loc="upper right")
+
+
+pl.figure(figsize=(13, 6))
+
+X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
+                                      allow_unlabeled=True,
+                                      random_state=42)
+
+plot_subfigure(X, Y, 1, "With unlabeled samples + CCA", "cca")
+plot_subfigure(X, Y, 2, "With unlabeled samples + PCA", "pca")
+
+X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
+                                      allow_unlabeled=False,
+                                      random_state=42)
+
+plot_subfigure(X, Y, 3, "Without unlabeled samples + CCA", "cca")
+plot_subfigure(X, Y, 4, "Without unlabeled samples + PCA", "pca")

 pl.subplots_adjust(.04, .07, .97, .90, .09, .2)
 pl.show()