diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 8abb588c91a7adafa412b1c384ac2231a0548dda..a2b98a45ff19d3e02928169fcd7d1a816be9f579 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -4,8 +4,8 @@ Support Vector Classification (SVC): scaling the regularization parameter
 =========================================================================
 
 The following example illustrates the effect of scaling the
-regularization parameter when using :ref:`svm` for 
-:ref:`classification <svm_classification>`. 
+regularization parameter when using :ref:`svm` for
+:ref:`classification <svm_classification>`.
 For SVC classification, we are interested in a risk minimization for the
 equation:
 
@@ -21,35 +21,35 @@ where
       and our model parameters.
     - :math:`\Omega` is a `penalty` function of our model parameters
 
-If we consider the :math:`\mathcal{L}` function to be the individual error per 
-sample, then the data-fit term, or the sum of the error for each sample, will 
-increase as we add more samples. The penalization term, however, will not 
+If we consider the loss function to be the individual error per
+sample, then the data-fit term, or the sum of the error for each sample, will
+increase as we add more samples. The penalization term, however, will not
 increase.
 
 When using, for example, :ref:`cross validation <cross_validation>`, to
-set amount of regularization with :math:`C`, there will be a different 
-amount of samples between every problem that we are using for model 
-selection, as well as for the final problem that we want to use for 
+set the amount of regularization with `C`, there will be a different
+amount of samples between every problem that we are using for model
+selection, as well as for the final problem that we want to use for
 training.
 
 Since our loss function is dependant on the amount of samples, the latter
-will influence the selected value of :math:`C`.
+will influence the selected value of `C`.
 The question that arises is `How do we optimally adjust C to
 account for the different training samples?`
 
 The figures below are used to illustrate the effect of scaling our
-:math:`C` to compensate for the change in the amount of samples, in the
-case of using an :math:`L1` penalty, as well as the :math:`L2` penalty.
+`C` to compensate for the change in the amount of samples, in the
+case of using an `L1` penalty, as well as the `L2` penalty.
 
 L1-penalty case
 -----------------
-In the :math:`L1` case, theory says that prediction consistency
+In the `L1` case, theory says that prediction consistency
 (i.e. that under given hypothesis, the estimator
-learned predicts as well as an model knowing the true distribution) 
-is not possible because of the biasof the :math:`L1`. It does say, however, 
+learned predicts as well as a model knowing the true distribution)
+is not possible because of the bias of the `L1`. It does say, however,
 that model consistancy, in terms of finding the right set of non-zero
-parameters as well as their signs, can be achieved by scaling 
-:math:`C1`.
+parameters as well as their signs, can be achieved by scaling
+`C1`.
 
 L2-penalty case
 -----------------
@@ -59,17 +59,21 @@ as the number of samples grow, in order to keep prediction consistency.
 Simulations
 ------------
 
-The two figures below plot the values of :math:`C` on the `x-axis` and the
+The two figures below plot the values of `C` on the `x-axis` and the
 corresponding cross-validation scores on the `y-axis`, for several different
 fractions of a generated data-set.
 
-In the :math:`L1` penalty case, the results are best when scaling our :math:`C` with
+In the `L1` penalty case, the results are best when scaling our `C` with
 the amount of samples, `n`, which can be seen in the third plot of the first figure.
 
-For the :math:`L2` penalty case, the best result comes from the case where :math:`C`
+For the `L2` penalty case, the best result comes from the case where `C`
 is not scaled.
 
+.. topic:: Note:
 
+    Two separate datasets are used for the two different plots. The reason
+    behind this is that the `L1` case works better on sparse data, while `L2`
+    is better suited to the non-sparse case.
 """
 print __doc__
 
@@ -94,32 +98,29 @@ rnd = check_random_state(1)
 # set up dataset
 n_samples = 100
 n_features = 300
-    
+
 #L1 data (only 5 informative features)
 X_1, y_1 = datasets.make_classification(n_samples=n_samples, n_features=n_features,
         n_informative=5, random_state=1)
-    
+
 #L2 data: non sparse, but less features
 y_2 = np.sign(.5 - rnd.rand(n_samples))
 X_2 = rnd.randn(n_samples, n_features/5) + y_2[:, np.newaxis]
 X_2 += 5 * rnd.randn(n_samples, n_features/5)
-            
-clf_sets = [(LinearSVC(penalty='L1', loss='L2', dual=False, 
+
+clf_sets = [(LinearSVC(penalty='L1', loss='L2', dual=False,
                        tol=1e-3),
              np.logspace(-2.2, -1.2, 10), X_1, y_1),
-            (LinearSVC(penalty='L2', loss='L2', dual=True, 
+            (LinearSVC(penalty='L2', loss='L2', dual=True,
                        tol=1e-4),
              np.logspace(-4.5, -2, 10), X_2, y_2)]
-    
+
 colors = ['b', 'g', 'r', 'c']
 
 for fignum, (clf, cs, X, y) in enumerate(clf_sets):
     # set up the plot for each regressor
     pl.figure(fignum, figsize=(9, 10))
-    pl.clf
-    pl.xlabel('C')
-    pl.ylabel('CV Score')
-    
+
     for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):
         param_grid = dict(C=cs)
         # To get nice curve, we need a large number of iterations to
@@ -129,23 +130,20 @@ for fignum, (clf, cs, X, y) in enumerate(clf_sets):
                                         n_iterations=250, random_state=1))
         grid.fit(X, y)
         scores = [x[1] for x in grid.grid_scores_]
-        
-        scales = [(1, 'No scaling'), 
-                  ((n_samples * train_size), '1/n_samples'), 
+
+        scales = [(1, 'No scaling'),
+                  ((n_samples * train_size), '1/n_samples'),
                   ]
 
         for subplotnum, (scaler, name) in enumerate(scales):
             pl.subplot(2, 1, subplotnum + 1)
-            grid_cs =  cs * float(scaler) # scale the C's 
+            pl.xlabel('C')
+            pl.ylabel('CV Score')
+            grid_cs =  cs * float(scaler) # scale the C's
             pl.semilogx(grid_cs, scores, label="fraction %.2f" %
                         train_size)
             pl.title('scaling=%s, penalty=%s, loss=%s' % (name, clf.penalty, clf.loss))
 
-            #ymin, ymax = pl.ylim()
-            #pl.axvline(grid_cs[np.argmax(scores)], 0, 1,
-            #           color=colors[k])
-            #pl.ylim(ymin=ymin-0.0025, ymax=ymax+0.008) # adjust the y-axis
-
     pl.legend(loc="best")
 pl.show()
-    
+