diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index ab472c206221fcb8da83af6cc1a34634ee392586..61b17a5a12ab290ba16c8a80ccb383d9317b5e9b 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -381,7 +381,7 @@ Signal Decomposition
    decomposition.SparsePCA
    decomposition.MiniBatchSparsePCA
    decomposition.DictionaryLearning
-   decomposition.DictionaryLearningOnline
+   decomposition.MiniBatchDictionaryLearning
 
 .. autosummary::
    :toctree: generated/
diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index 059f09cd45d79583280cb43a43d4fad602b55698..4a00a88e272eade291bc8982c8549469ec0d5a14 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -235,6 +235,117 @@ factorization, while larger values shrink many coefficients to zero.
     R. Jenatton, G. Obozinski, F. Bach, 2009
 
 
+.. _DictionaryLearning:
+
+Dictionary Learning
+===================
+
+Generic dictionary learning
+---------------------------
+
+Dictionary learning (:class:`DictionaryLearning`) is a matrix factorization
+problem that amounts to finding a (usually overcomplete) dictionary that
+performs well at sparsely encoding the fitted data.
+
+Representing data as sparse combinations of atoms from an overcomplete
+dictionary is suggested to be the way the mammalian primary visual cortex
+works. Consequently, dictionary learning applied on image patches has been
+shown to give good results in image processing tasks such as image
+completion, inpainting and denoising, as well as for supervised recognition
+tasks.
+
+Dictionary learning is an optimization problem solved by alternately
+updating the sparse code, as a solution to multiple Lasso problems with the
+dictionary held fixed, and then updating the dictionary to best fit the
+sparse code.
+
+.. math::
+   (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2}
+                ||X-UV||_2^2+\alpha||U||_1 \\
+                \text{subject to\,} & ||V_k||_2 = 1 \text{ for all }
+                0 \leq k < n_{atoms}
+
+After using such a procedure to fit the dictionary, the fitted object can be
+used to transform new data. The transformation amounts to a sparse coding
+problem: finding a representation of the data as a linear combination of as
+few dictionary atoms as possible. All variations of dictionary learning
+implement the following transform methods, controllable via the
+`transform_algorithm` initialization parameter:
+
+* Orthogonal matching pursuit (:ref:`omp`)
+
+* Least-angle regression (:ref:`least_angle_regression`)
+
+* Lasso computed by least-angle regression
+
+* Lasso using coordinate descent (:ref:`lasso`)
+
+* Thresholding
+
+Thresholding is very fast but it does not yield accurate reconstructions.
+It has nevertheless been shown in the literature to be useful for
+classification tasks. For image reconstruction tasks, orthogonal matching
+pursuit yields the most accurate, unbiased reconstruction.
+
+The dictionary learning objects offer, via the `split_code` parameter, the
+possibility to separate the positive and negative values in the results of
+sparse coding. This is useful when dictionary learning is used for
+extracting features that will be used for supervised learning, because it
+allows the learning algorithm to assign different weights to the negative
+loadings of a particular atom than to the corresponding positive loadings.
+
+The split code for a single sample has length `2 * n_atoms` and is
+constructed using the following rule: first, the regular code of length
+`n_atoms` is computed. Then, the first `n_atoms` entries of the split code
+are filled with the positive part of the regular code vector, and the second
+half with the magnitude of its negative part. The split code is therefore
+non-negative.
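
To make the construction rule concrete, here is a minimal NumPy sketch of the
split-code computation for a single, made-up code vector (the estimator
performs the equivalent operation internally when `split_code` is enabled)::

    import numpy as np

    # Regular sparse code for one sample, n_atoms = 4 (values are invented).
    code = np.array([0.5, -1.2, 0.0, 0.3])

    # Split code of length 2 * n_atoms: the positive part of the code comes
    # first, followed by the magnitude of its negative part.
    split_code = np.concatenate([np.maximum(code, 0), np.maximum(-code, 0)])
    # -> array([ 0.5,  0. ,  0. ,  0.3,  0. ,  1.2,  0. ,  0. ])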
+
+The following image shows what a dictionary learned from 4x4 pixel image
+patches extracted from part of the image of Lena looks like.
+
+
+.. figure:: ../auto_examples/decomposition/images/plot_img_denoising_1.png
+    :target: ../auto_examples/decomposition/plot_img_denoising.html
+    :align: center
+    :scale: 50%
+
+
+.. figure:: ../auto_examples/decomposition/images/plot_faces_decomposition_5.png
+    :target: ../auto_examples/decomposition/plot_faces_decomposition.html
+    :scale: 50%
+
+
+.. topic:: Examples:
+
+  * :ref:`example_decomposition_plot_img_denoising.py`
+
+
+.. topic:: References:
+
+  * `"Online dictionary learning for sparse coding"
+    <http://www.di.ens.fr/sierra/pdfs/icml09.pdf>`_
+    J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009
+
+.. _MiniBatchDictionaryLearning:
+
+Mini-batch dictionary learning
+------------------------------
+
+:class:`MiniBatchDictionaryLearning` implements a faster, but less accurate
+version of the dictionary learning algorithm that is better suited for large
+datasets.
+
+By default, :class:`MiniBatchDictionaryLearning` divides the data into
+mini-batches and optimizes in an online manner by cycling over the
+mini-batches for the specified number of iterations. However, at the moment
+it does not implement a stopping condition.
+
+The estimator also implements `partial_fit`, which updates the dictionary by
+iterating only once over a mini-batch. This can be used for online learning
+when the data is not readily available from the start, or when the data does
+not fit into memory.
+
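
A minimal sketch of the `partial_fit` usage described above, with made-up
streaming data; the constructor and `partial_fit` signatures follow the calls
that appear in the example scripts and tests later in this patch::

    import numpy as np
    from sklearn.decomposition import MiniBatchDictionaryLearning

    rng = np.random.RandomState(0)
    dico = MiniBatchDictionaryLearning(n_atoms=5, n_iter=1, random_state=rng)

    # Feed the data chunk by chunk as it becomes available; each call makes
    # a single pass over the given mini-batch.
    for ii in range(10):
        batch = rng.randn(10, 8)  # stand-in for the next chunk of samples
        dico.partial_fit(batch, iter_offset=ii)

    # dico.components_ now holds the dictionary learned online, shape (5, 8).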
 
 .. _ICA:
 
 Independent component analysis (ICA)
@@ -348,104 +459,3 @@ of the data.
     <http://www.cs.rpi.edu/~boutsc/files/nndsvd.pdf>`_
     C. Boutsidis, E. Gallopoulos, 2008
-
-
-.. _DictionaryLearning:
-
-Dictionary Learning
-===================
-
-Generic dictionary learning
----------------------------
-
-Dictionary learning (:class:`DictionaryLearning`) is a matrix factorization
-problem that amounts to finding a (usually overcomplete) dictionary that will
-perform good at sparsely encoding the fitted data.
-
-Representing data as sparse combinations of atoms from an overcomplete
-dictionary is suggested to be the way the mammal primary visual cortex works.
-Consequently, dictionary learning applied on image patches has been shown to
-give good results in image processing tasks such as image completion,
-inpainting and denoising, as well as for supervised recognition tasks.
-
-Dictionary learning is an optimization problem solved by alternatively updating
-the sparse code, as a solution to multiple Lasso problems, considering the
-dictionary fixed, and then updating the dictionary to best fit the sparse code.
-
-After using such a procedure to fit the dictionary, the fitted object can be
-used to transform new data. The transformation amounts to a sparse coding
-problem: finding a representation of the data as a linear combination of as few
-dictionary atoms as possible. All variations of dictionary learning implement
-the following transform methods, controllable via the `transform_method`
-initialization parameter:
-
-
-* Orthogonal matching pursuit (:ref:`omp`)
-
-* Least-angle regression (:ref:`least_angle_regression`)
-
-* Lasso computed by least-angle regression
-
-* Lasso using coordinate descent (:ref:`lasso`)
-
-* Thresholding
-
-Thresholding is very fast but it does not yield accurate reconstructions.
-They have been shown useful in literature for classification tasks. For image
-reconstruction tasks, orthogonal matching pursuit yields the most accurate,
-unbiased reconstruction.
-
-The dictionary learning objects offer, via the `split_code` parameter, the
-possibility to separate the positive and negative values in the results of
-sparse coding. This is useful when dictionary learning is used for extracting
-features that will be used for supervised learning, because it allows the
-learning algorithm to assign different weights to negative loadings of a
-particular atom, from to the corresponding positive loading.
-
-The split code for a single sample has length `2 * n_atoms`
-and is constructed using the following rule: First, the regular code of length
-`n_atoms` is computed. Then, the first `n_atoms` entries of the split_code are
-filled with the positive part of the regular code vector. The second half of
-the split code is filled with the negative part of the code vector, only with
-a positive sign. Therefore, the split_code is non-negative.
-
-The following image shows how a dictionary learned from 4x4 pixel image patches
-extracted from part of the image of Lena looks like.
-
-
-.. figure:: ../auto_examples/decomposition/images/plot_img_denoising_1.png
-    :target: ../auto_examples/decomposition/plot_img_denoising.html
-    :align: center
-    :scale: 50%
-
-
-.. topic:: Examples:
-
-  * :ref:`example_decomposition_plot_img_denoising.py`
-
-
-.. topic:: References:
-
-  * `"Online dictionary learning for sparse coding"
-    <http://www.di.ens.fr/sierra/pdfs/icml09.pdf>`_
-    J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009
-
-.. _DictionaryLearningOnline
-
-Online dictionary learning
---------------------------
-
-:class:`DictionaryLearningOnline` implements a faster, but less accurate
-version of the dictionary learning algorithm that is better suited for large
-datasets.
-
-By default, :class:`DictionaryLearningOnline` divides the data into
-mini-batches and optimizes in an online manner by cycling over the mini-batches
-for the specified number of iterations. However, at the moment it does not
-implement a stopping condition.
-
-The estimator also implements `partial_fit`, which updates the dictionary by
-iterating only once over a mini-batch. This can be used for online learning
-when the data is not readily available from the start, or for when the data
-does not fit into the memory.
-
diff --git a/examples/decomposition/plot_faces_decomposition.py b/examples/decomposition/plot_faces_decomposition.py
index 1cd41b9e9fc9a6180b9c2177913a5fdd154d0495..fcab10be6dcdf40385b4096cb95f1cc859602142 100644
--- a/examples/decomposition/plot_faces_decomposition.py
+++ b/examples/decomposition/plot_faces_decomposition.py
@@ -70,7 +70,7 @@ estimators = [
 
     ('Non-negative components - NMF',
      decomposition.NMF(n_components=n_components, init='nndsvda', beta=5.0,
-                      tol=5e-3, sparseness='components'),
+                       tol=5e-3, sparseness='components'),
     False, False),
 
     ('Independent components - FastICA',
@@ -82,9 +82,9 @@ estimators = [
                                       n_iter=100, chunk_size=3),
      True, False),
 
-    ('Dictionary atoms - DictionaryLearningOnline',
-     decomposition.DictionaryLearningOnline(n_atoms=n_components, alpha=1e-3,
-                                            n_iter=100, chunk_size=3),
+    ('MiniBatchDictionaryLearning',
+     decomposition.MiniBatchDictionaryLearning(n_atoms=n_components, alpha=5e-3,
+                                               n_iter=100, chunk_size=3),
      True, False),
 
     ('Cluster centers - MiniBatchKMeans',
diff --git a/examples/decomposition/plot_img_denoising.py b/examples/decomposition/plot_img_denoising.py
index a89ae896ad7540979f65eb1c29c0f9e731b5718a..bc1dede7b37dd68e8bd8eda91907ac6ed94f818f 100644
--- a/examples/decomposition/plot_img_denoising.py
+++ b/examples/decomposition/plot_img_denoising.py
@@ -14,9 +14,9 @@ at the difference between the reconstruction and the original image. If the
 reconstruction is perfect this will look like gaussian noise.
 
 It can be seen from the plots that the results of :ref:`omp` with two
-non-zero coefficients is a bit less biased than when keeping only one (the
-edges look less prominent). However, it is farther from the ground truth in
-Frobenius norm.
+non-zero coefficients is a bit less biased than when keeping only one
+(the edges look less prominent). It is, in addition, closer to the ground
+truth in Frobenius norm.
 
 The result of :ref:`least_angle_regression` is much more strongly biased: the
 difference is reminiscent of the local intensity value of the original image.
@@ -35,7 +35,7 @@ import pylab as pl
 import scipy as sp
 import numpy as np
 
-from sklearn.decomposition import DictionaryLearningOnline
+from sklearn.decomposition import MiniBatchDictionaryLearning
 from sklearn.feature_extraction.image import extract_patches_2d
 from sklearn.feature_extraction.image import reconstruct_from_patches_2d
 
@@ -46,8 +46,7 @@ lena = sp.lena() / 256.0
 
 # downsample for higher speed
 lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
-lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
-lena /= 16.0
+lena /= 4.0
 height, width = lena.shape
 
 # Distort the right half of the image
@@ -70,7 +69,7 @@ print 'done in %.2fs.' % (time() - t0)
 
 print 'Learning the dictionary... '
 t0 = time()
-dico = DictionaryLearningOnline(n_atoms=100, alpha=1e-2, n_iter=500)
+dico = MiniBatchDictionaryLearning(n_atoms=100, alpha=1e-2, n_iter=500)
 V = dico.fit(data).components_
 dt = time() - t0
 print 'done in %.2fs.' % dt
@@ -123,15 +122,13 @@ data -= intercept
 print 'done in %.2fs.' % (time() - t0)
 
 transform_algorithms = [
-    ('Orthogonal Matching Pursuit\n1 atom',
-     'omp', {'transform_n_nonzero_coefs': 1}),
-    ('Orthogonal Matching Pursuit\n2 atoms',
-     'omp', {'transform_n_nonzero_coefs': 2}),
-    ('Least-angle regression\n5 atoms',
-     'lars', {'transform_n_nonzero_coefs': 5}),
-    ('Thresholding\n alpha=0.1', 'threshold',
-     {'transform_alpha': .1}),
-]
+    ('Orthogonal Matching Pursuit\n1 atom', 'omp',
+     {'transform_n_nonzero_coefs': 1}),
+    ('Orthogonal Matching Pursuit\n2 atoms', 'omp',
+     {'transform_n_nonzero_coefs': 2}),
+    ('Least-angle regression\n5 atoms', 'lars',
+     {'transform_n_nonzero_coefs': 5}),
+    ('Thresholding\n alpha=0.1', 'threshold', {'transform_alpha': .1})]
 
 reconstructions = {}
 for title, transform_algorithm, kwargs in transform_algorithms:
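
The hunk above ends at the head of the reconstruction loop, whose body is not
part of this diff. For context, a plausible (hypothetical) body would select
each coding algorithm on the fitted estimator and rebuild the patches from the
dictionary; `dico`, `data`, `V`, `intercept` and `reconstructions` are taken
from the surrounding example::

    # Hypothetical loop body, not shown in the patch: choose the sparse
    # coding algorithm, encode the noisy patches, then reconstruct them as
    # linear combinations of the dictionary atoms.
    for title, transform_algorithm, kwargs in transform_algorithms:
        dico.set_params(transform_algorithm=transform_algorithm, **kwargs)
        code = dico.transform(data)            # shape: (n_patches, n_atoms)
        patches = np.dot(code, V) + intercept  # add back the removed mean
        reconstructions[title] = patches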
diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py
index fbc9e2f33ad1d13bacfe9561452b41417b2703e8..748832041db71068999a6b3374e6bdb18593c401 100644
--- a/sklearn/decomposition/__init__.py
+++ b/sklearn/decomposition/__init__.py
@@ -8,4 +8,4 @@ from .kernel_pca import KernelPCA
 from .sparse_pca import SparsePCA, MiniBatchSparsePCA
 from .fastica_ import FastICA, fastica
 from .dict_learning import dict_learning, dict_learning_online, \
-                           DictionaryLearning, DictionaryLearningOnline
+                           DictionaryLearning, MiniBatchDictionaryLearning
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index 03365cde25078b4e587c0accabd0f456d0921a0e..b211c122e79c012ea6c0e13bfbc69e85509b1cd8 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -828,8 +828,8 @@ class DictionaryLearning(BaseDictionaryLearning):
         return self
 
 
-class DictionaryLearningOnline(BaseDictionaryLearning):
-    """ Online dictionary learning
+class MiniBatchDictionaryLearning(BaseDictionaryLearning):
+    """Mini-batch dictionary learning
 
     Finds a dictionary (a set of atoms) that can best be used to represent
     data using a sparse code.
diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py
index fcde71495e44ed6a8f2c52b31f873e7591255b59..0f6a9ae116c1cc0fe44c5ec87bd7c7f20e6b0146 100644
--- a/sklearn/decomposition/tests/test_dict_learning.py
+++ b/sklearn/decomposition/tests/test_dict_learning.py
@@ -2,7 +2,7 @@ import numpy as np
 from numpy.testing import assert_array_almost_equal, assert_array_equal, \
      assert_equal
 
-from .. import DictionaryLearning, DictionaryLearningOnline, \
+from .. import DictionaryLearning, MiniBatchDictionaryLearning, \
                dict_learning_online
 from ..dict_learning import sparse_encode, sparse_encode_parallel
 
@@ -74,20 +74,20 @@ def test_dict_learning_online_shapes():
 
 def test_dict_learning_online_estimator_shapes():
     n_atoms = 5
-    dico = DictionaryLearningOnline(n_atoms, n_iter=20).fit(X)
+    dico = MiniBatchDictionaryLearning(n_atoms, n_iter=20).fit(X)
     assert dico.components_.shape == (n_atoms, n_features)
 
 
 def test_dict_learning_online_overcomplete():
     n_atoms = 12
-    dico = DictionaryLearningOnline(n_atoms, n_iter=20).fit(X)
+    dico = MiniBatchDictionaryLearning(n_atoms, n_iter=20).fit(X)
     assert dico.components_.shape == (n_atoms, n_features)
 
 
 def test_dict_learning_online_initialization():
     n_atoms = 12
     V = rng.randn(n_atoms, n_features)
-    dico = DictionaryLearningOnline(n_atoms, n_iter=0, dict_init=V).fit(X)
+    dico = MiniBatchDictionaryLearning(n_atoms, n_iter=0, dict_init=V).fit(X)
     assert_array_equal(dico.components_, V)
 
 
@@ -96,13 +96,13 @@ def test_dict_learning_online_partial_fit():
     V = rng.randn(n_atoms, n_features)  # random init
     rng1 = np.random.RandomState(0)
     rng2 = np.random.RandomState(0)
-    dico1 = DictionaryLearningOnline(n_atoms, n_iter=10, chunk_size=1,
-                                     shuffle=False, dict_init=V,
-                                     transform_algorithm='threshold',
-                                     random_state=rng1).fit(X)
-    dico2 = DictionaryLearningOnline(n_atoms, n_iter=1, dict_init=V,
-                                     transform_algorithm='threshold',
-                                     random_state=rng2)
+    dico1 = MiniBatchDictionaryLearning(n_atoms, n_iter=10, chunk_size=1,
+                                        shuffle=False, dict_init=V,
+                                        transform_algorithm='threshold',
+                                        random_state=rng1).fit(X)
+    dico2 = MiniBatchDictionaryLearning(n_atoms, n_iter=1, dict_init=V,
+                                        transform_algorithm='threshold',
+                                        random_state=rng2)
     for ii, sample in enumerate(X):
         dico2.partial_fit(sample, iter_offset=ii * dico2.n_iter)
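
The diff ends just after the `partial_fit` loop. The equivalence this test is
building toward would presumably be checked along these lines (a hypothetical
continuation, not shown in the patch)::

    # Hypothetical closing assertion: with a shared seed, identical init,
    # chunk_size=1 and shuffling disabled, fitting should match the explicit
    # sample-by-sample partial_fit loop.
    assert_array_almost_equal(dico1.components_, dico2.components_)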