diff --git a/doc/modules/mixture.rst b/doc/modules/mixture.rst
index cf9c3ea7e7e5ad69f1d5e5c49838fbf29d7f691e..2449901e90cd24e4f40dfbd95605eae8a99bd824 100644
--- a/doc/modules/mixture.rst
+++ b/doc/modules/mixture.rst
@@ -8,7 +8,7 @@ Gaussian mixture models
 
 .. currentmodule:: sklearn.mixture
 
-`sklearn.mixture` is a package which enables one to learn
+``sklearn.mixture`` is a package which enables one to learn
 Gaussian Mixture Models (diagonal, spherical, tied and full covariance
 matrices supported), sample them, and estimate them from
 data. Facilities to help determine the appropriate number of
@@ -93,14 +93,16 @@ Cons
    or information theoretical criteria to decide how many components to use
    in the absence of external cues.
 
-Selecting the number of components in a classical GMM
-------------------------------------------------------
+Selecting the number of components in a classical Gaussian Mixture Model
+------------------------------------------------------------------------
 
-The BIC criterion can be used to select the number of components in a GMM
-in an efficient way. In theory, it recovers the true number of components
-only in the asymptotic regime (i.e. if much data is available).
-Note that using a :ref:`DPGMM <dpgmm>` avoids the specification of the
-number of components for a Gaussian mixture model.
+The BIC criterion can be used to select the number of components in a Gaussian
+Mixture in an efficient way. In theory, it recovers the true number of
+components only in the asymptotic regime (i.e. if much data is available and
+assuming that the data was actually generated i.i.d. from a mixture of Gaussian
+distributions). Note that using a :ref:`Variational Bayesian Gaussian mixture <bgmm>`
+avoids the specification of the number of components for a Gaussian mixture
+model.
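+
+As a minimal sketch of such a selection (the array ``X`` below is placeholder
+data, not the dataset of the linked example), one can fit a
+:class:`GaussianMixture` for several candidate numbers of components and keep
+the one with the lowest BIC::
+
+    import numpy as np
+    from sklearn.mixture import GaussianMixture
+
+    X = np.random.RandomState(0).randn(500, 2)  # placeholder data
+    models = [GaussianMixture(n_components=k, random_state=0).fit(X)
+              for k in range(1, 7)]
+    best = min(models, key=lambda model: model.bic(X))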
 
 .. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_selection_001.png
    :target: ../auto_examples/mixture/plot_gmm_selection.html
@@ -135,11 +137,12 @@ to a local optimum.
 
 .. _bgmm:
 
-Bayesian Gaussian Mixture
-=========================
+Variational Bayesian Gaussian Mixture
+=====================================
 
-The :class:`BayesianGaussianMixture` object implements a variant of the Gaussian
-mixture model with variational inference algorithms.
+The :class:`BayesianGaussianMixture` object implements a variant of the
+Gaussian mixture model with variational inference algorithms. The API is
+similar to the one defined by :class:`GaussianMixture`.
 
 .. _variational_inference:
 
@@ -152,171 +155,167 @@ priors) instead of data likelihood. The principle behind
 variational methods is the same as expectation-maximization (that is
 both are iterative algorithms that alternate between finding the
 probabilities for each point to be generated by each mixture and
-fitting the mixtures to these assigned points), but variational
+fitting the mixture to these assigned points), but variational
 methods add regularization by integrating information from prior
 distributions. This avoids the singularities often found in
 expectation-maximization solutions but introduces some subtle biases
 to the model. Inference is often notably slower, but not usually as
 much so as to render usage unpractical.
 
-Due to its Bayesian nature, the variational algorithm needs more
-hyper-parameters than expectation-maximization, the most
-important of these being the concentration parameter ``dirichlet_concentration_prior``. Specifying
-a high value of prior of the dirichlet concentration leads more often to uniformly-sized mixture
-components, while specifying small (between 0 and 1) values will lead
-to some mixture components getting almost all the points while most
-mixture components will be centered on just a few of the remaining
-points.
-
-.. figure:: ../auto_examples/mixture/images/sphx_glr_plot_bayesian_gaussian_mixture_001.png
-   :target: ../auto_examples/mixture/plot_bayesian_gaussian_mixture.html
-   :align: center
-   :scale: 50%
-
-.. topic:: Examples:
-
-    * See :ref:`sphx_glr_auto_examples_plot_bayesian_gaussian_mixture.py` for a comparaison of
-      the results of the ``BayesianGaussianMixture`` for different values
-      of the parameter ``dirichlet_concentration_prior``.
-
-Pros and cons of variational inference with :class:BayesianGaussianMixture
---------------------------------------------------------------------------
-
-Pros
-.....
-
-:Regularization: due to the incorporation of prior information,
-   variational solutions have less pathological special cases than
-   expectation-maximization solutions.
-
-:Automatic selection: when `dirichlet_concentration_prior` is small enough and
-   `n_components` is larger than what is found necessary by the model, the
-   Variational Bayesian mixture model has a natural tendency to set some mixture
-   weights values close to zero. This makes it possible to let the model choose a
-   suitable number of effective components automatically.
-
-Cons
-.....
-
-:Bias: to regularize a model one has to add biases. The
-   variational algorithm will bias all the means towards the origin
-   (part of the prior information adds a "ghost point" in the origin
-   to every mixture component) and it will bias the covariances to
-   be more spherical. It will also, depending on the concentration
-   parameter, bias the cluster structure either towards uniformity
-   or towards a rich-get-richer scenario.
-
-:Hyperparameters: this algorithm needs an extra hyperparameter
-   that might need experimental tuning via cross-validation.
+Due to its Bayesian nature, the variational algorithm needs more
+hyperparameters than expectation-maximization, the most important of these
+being the concentration parameter ``weight_concentration_prior``. Specifying a
+low value for the concentration prior will make the model put most of the
+weight on a few components and set the remaining components' weights very
+close to zero. High values of the concentration prior will allow a larger
+number of components to be active in the mixture.
+
+The :class:`BayesianGaussianMixture` class proposes two types of prior for the
+weights distribution: a finite mixture model with a Dirichlet distribution
+prior and an infinite mixture model with a Dirichlet process prior. In
+practice the Dirichlet process inference algorithm is approximated and uses a
+truncated distribution with a fixed maximum number of components (called the
+stick-breaking representation). The number of components actually used almost
+always depends on the data.
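+
+As a minimal sketch of how these two hyper-parameters are set (``X`` denotes
+some training data and is not defined here), both are plain constructor
+parameters of :class:`BayesianGaussianMixture`::
+
+    from sklearn.mixture import BayesianGaussianMixture
+
+    bgmm = BayesianGaussianMixture(
+        n_components=10,
+        weight_concentration_prior_type='dirichlet_process',
+        weight_concentration_prior=0.01,
+        random_state=0).fit(X)
+    # with such a low concentration prior, many entries of bgmm.weights_
+    # are expected to be very close to zero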
+
+The next figure compares the results obtained for the different types of
+weight concentration prior (parameter ``weight_concentration_prior_type``)
+for different values of ``weight_concentration_prior``.
+Here, we can see that the value of the ``weight_concentration_prior``
+parameter has a strong impact on the effective number of active components
+obtained. We can also notice that large values for the weight concentration
+prior lead to more uniform weights when the type of prior is
+'dirichlet_distribution' while this is not necessarily the case for the
+'dirichlet_process' type (used by default).
+
+.. |plot_bgmm| image:: ../auto_examples/mixture/images/sphx_glr_plot_concentration_prior_001.png
+   :target: ../auto_examples/mixture/plot_concentration_prior.html
+   :scale: 48%
 
-.. _dpgmm:
+.. |plot_dpgmm| image:: ../auto_examples/mixture/images/sphx_glr_plot_concentration_prior_002.png
+   :target: ../auto_examples/mixture/plot_concentration_prior.html
+   :scale: 48%
 
-DPGMM: Infinite Gaussian mixtures
-=================================
+.. centered:: |plot_bgmm| |plot_dpgmm|
+
+The examples below compare Gaussian mixture models with a fixed number of
+components to variational Gaussian mixture models with a Dirichlet process
+prior. Here, a classical Gaussian mixture is fitted with 5 components on a
+dataset composed of 2 clusters. We can see that the variational Gaussian
+mixture with a Dirichlet process prior is able to limit itself to only 2
+components whereas the Gaussian mixture fits the data with a fixed number of
+components that has to be set a priori by the user. In this case the user has
+selected ``n_components=5`` which does not match the true generative
+distribution of this toy dataset. Note that with very few observations, the
+variational Gaussian mixture models with a Dirichlet process prior can take a
+conservative stand, and fit only one component.
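+
+A rough sketch of this comparison on a hypothetical two-cluster dataset ``X``
+(the actual data of the figure is generated in the linked example)::
+
+    from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
+
+    gmm = GaussianMixture(n_components=5).fit(X)
+    dpgmm = BayesianGaussianMixture(n_components=5).fit(X)
+    # gmm.weights_ uses all five components, while dpgmm.weights_ typically
+    # concentrates most of its mass on only two of them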
+
+.. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_001.png
+   :target: ../auto_examples/mixture/plot_gmm.html
+   :align: center
+   :scale: 70%
 
-The :class:`DPGMM` object implements a variant of the Gaussian mixture
-model with a variable (but bounded) number of components using the
-Dirichlet Process.
-This class doesn't require the user to choose the number of
-components, and at the expense of extra computational time the user
-only needs to specify a loose upper bound on this number and a
-concentration parameter.
 
-.. |plot_gmm| image:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_001.png
-   :target: ../auto_examples/mixture/plot_gmm.html
-   :scale: 48%
+In the following figure we are fitting a dataset not well-depicted by a
+Gaussian mixture. Adjusting the ``weight_concentration_prior`` parameter of
+the :class:`BayesianGaussianMixture` controls the number of components used
+to fit this data. We also present on the last two plots a random sampling
+generated from the two resulting mixtures.
 
-.. |plot_gmm_sin| image:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_sin_001.png
+.. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_sin_001.png
    :target: ../auto_examples/mixture/plot_gmm_sin.html
-   :scale: 48%
-
-.. centered:: |plot_gmm| |plot_gmm_sin|
+   :align: center
+   :scale: 65%
 
 
-The examples above compare Gaussian mixture models with fixed number of
-components, to DPGMM models. **On the left** the GMM is fitted with 5
-components on a dataset composed of 2 clusters. We can see that the DPGMM is
-able to limit itself to only 2 components whereas the GMM fits the data fit too
-many components. Note that with very little observations, the DPGMM can take a
-conservative stand, and fit only one component. **On the right** we are fitting
-a dataset not well-depicted by a Gaussian mixture. Adjusting the `alpha`
-parameter of the DPGMM controls the number of components used to fit this
-data.
 
 .. topic:: Examples:
 
-    * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm.py` for an example on plotting the
-      confidence ellipsoids for both :class:`GaussianMixture`
-      and :class:`DPGMM`.
+    * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm.py` for an example on
+      plotting the confidence ellipsoids for both :class:`GaussianMixture`
+      and :class:`BayesianGaussianMixture`.
 
     * :ref:`sphx_glr_auto_examples_mixture_plot_gmm_sin.py` shows using
-      :class:`GaussianMixture` and :class:`DPGMM` to fit a sine wave
+      :class:`GaussianMixture` and :class:`BayesianGaussianMixture` to fit a
+      sine wave.
+
+    * See :ref:`sphx_glr_auto_examples_mixture_plot_concentration_prior.py`
+      for an example plotting the confidence ellipsoids for the
+      :class:`BayesianGaussianMixture` with different
+      ``weight_concentration_prior_type`` for different values of the parameter
+      ``weight_concentration_prior``.
 
-Pros and cons of class :class:`DPGMM`: Dirichlet process mixture model
-----------------------------------------------------------------------
+
+Pros and cons of variational inference with :class:`BayesianGaussianMixture`
+----------------------------------------------------------------------------
 
 Pros
 .....
 
-:Less sensitivity to the number of parameters: unlike finite
-   models, which will almost always use all components as much as
-   they can, and hence will produce wildly different solutions for
-   different numbers of components, the Dirichlet process solution
-   won't change much with changes to the parameters, leading to more
-   stability and less tuning.
+:Automatic selection: when ``weight_concentration_prior`` is small enough and
+   ``n_components`` is larger than what is found necessary by the model, the
+   Variational Bayesian mixture model has a natural tendency to set some mixture
+   weights values close to zero. This makes it possible to let the model choose
+   a suitable number of effective components automatically. Only an upper bound
+   of this number needs to be provided. Note however that the "ideal" number of
+   active components is very application specific and is typically ill-defined
+   in a data exploration setting.
+
+:Less sensitivity to the number of parameters: unlike finite models, which will
+   almost always use all components as much as they can, and hence will produce
+   wildly different solutions for different numbers of components, the
+   variational inference with a Dirichlet process prior
+   (``weight_concentration_prior_type='dirichlet_process'``) won't change much
+   with changes to the parameters, leading to more stability and less tuning.
+
+:Regularization: due to the incorporation of prior information,
+   variational solutions have less pathological special cases than
+   expectation-maximization solutions.
 
-:No need to specify the number of components: only an upper bound of
-   this number needs to be provided. Note however that the DPMM is not
-   a formal model selection procedure, and thus provides no guarantee
-   on the result.
 
 Cons
 .....
 
-:Speed: the extra parametrization necessary for variational
-   inference and for the structure of the Dirichlet process can and
-   will make inference slower, although not by much.
+:Speed: the extra parametrization necessary for variational inference makes
+   inference slower, although not by much.
+
+:Hyperparameters: this algorithm needs an extra hyperparameter
+   that might need experimental tuning via cross-validation.
 
-:Bias: as in variational techniques, but only more so, there are
-   many implicit biases in the Dirichlet process and the inference
-   algorithms, and whenever there is a mismatch between these biases
-   and the data it might be possible to fit better models using a
+:Bias: there are many implicit biases in the inference algorithms (and also in
+   the Dirichlet process if used), and whenever there is a mismatch between
+   these biases and the data it might be possible to fit better models using a
    finite mixture.
 
+
 .. _dirichlet_process:
 
 The Dirichlet Process
 ---------------------
 
 Here we describe variational inference algorithms on Dirichlet process
-mixtures. The Dirichlet process is a prior probability distribution on
+mixture models. The Dirichlet process is a prior probability distribution on
 *clusterings with an infinite, unbounded, number of partitions*.
 Variational techniques let us incorporate this prior structure on
 Gaussian mixture models at almost no penalty in inference time, comparing
 with a finite Gaussian mixture model.
 
-An important question is how can the Dirichlet process use an
-infinite, unbounded number of clusters and still be consistent. While
-a full explanation doesn't fit this manual, one can think of its
-`chinese restaurant process
-<https://en.wikipedia.org/wiki/Chinese_restaurant_process>`_
-analogy to help understanding it. The
-chinese restaurant process is a generative story for the Dirichlet
-process. Imagine a chinese restaurant with an infinite number of
-tables, at first all empty. When the first customer of the day
-arrives, he sits at the first table. Every following customer will
-then either sit on an occupied table with probability proportional to
-the number of customers in that table or sit in an entirely new table
-with probability proportional to the concentration parameter
-`alpha`. After a finite number of customers has sat, it is easy to see
-that only finitely many of the infinite tables will ever be used, and
-the higher the value of alpha the more total tables will be used. So
-the Dirichlet process does clustering with an unbounded number of
-mixture components by assuming a very asymmetrical prior structure
-over the assignments of points to components that is very concentrated
-(this property is known as rich-get-richer, as the full tables in the
-Chinese restaurant process only tend to get fuller as the simulation
-progresses).
+An important question is how can the Dirichlet process use an infinite,
+unbounded number of clusters and still be consistent. While a full explanation
+doesn't fit this manual, one can think of its `stick breaking process
+<https://en.wikipedia.org/wiki/Dirichlet_process#The_stick-breaking_process>`_
+analogy to help understand it. The stick breaking process is a generative
+story for the Dirichlet process. We start with a unit-length stick and in each
+step we break off a portion of the remaining stick. Each time, we associate
+the length of the piece of the stick with the proportion of points that fall
+into a group of the mixture. At the end, to represent the infinite mixture, we
+associate the last remaining piece of the stick with the proportion of points
+that do not fall into any of the other groups. The length of each piece is a
+random variable with probability proportional to the concentration parameter.
+A smaller value of the concentration parameter will divide the unit-length
+stick into larger pieces (defining a more concentrated distribution), while
+larger concentration values will create smaller pieces of the stick
+(increasing the number of components with non-zero weights).
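+
+A small NumPy sketch of this generative story (illustrative only, not part of
+the scikit-learn API; ``gamma`` plays the role of the concentration
+parameter)::
+
+    import numpy as np
+
+    rng = np.random.RandomState(0)
+    gamma, n_pieces = 1.0, 10
+    # proportion of the remaining stick broken off at each step
+    betas = rng.beta(1., gamma, size=n_pieces)
+    remaining = np.cumprod(np.hstack(([1.], 1. - betas[:-1])))
+    weights = betas * remaining  # lengths of the stick pieces
+    # the weights sum to slightly less than one because of the truncation;
+    # smaller gamma yields a few large pieces, larger gamma many small ones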
 
 Variational inference techniques for the Dirichlet process still work
 with a finite approximation to this infinite mixture model, but
@@ -325,13 +324,3 @@ use, one just specifies the concentration parameter and an upper bound
 on the number of mixture components (this upper bound, assuming it is
 higher than the "true" number of components, affects only algorithmic
 complexity, not the actual number of components used).
-
-.. topic:: Derivation:
-
-   * See `here <dp-derivation.html>`_ the full derivation of this
-     algorithm.
-
-.. toctree::
-    :hidden:
-
-    dp-derivation.rst
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index a450c175ae8fca3f49a25a4639775e8e6d0030a4..5a3e969710a8bb8fb118c743b81c64bcf6723e56 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -419,10 +419,20 @@ API changes summary
      :class:`isotonic.IsotonicRegression`. By `Jonathan Arfa`_.
 
-   - The old :class:`VBGMM` is deprecated in favor of the new
-     :class:`BayesianGaussianMixture`. The new class solves the computational
+   - The old :class:`DPGMM` is deprecated in favor of the new
+     :class:`BayesianGaussianMixture` (with the parameter
+     ``weight_concentration_prior_type='dirichlet_process'``).
+     The new class solves the computational
+     problems of the old class and computes the Gaussian mixture with a
+     Dirichlet process prior faster than before.
+     (`#7295 <https://github.com/scikit-learn/scikit-learn/pull/7295>`_) by
+     `Wei Xue`_ and `Thierry Guillemot`_.
+
+   - The old :class:`VBGMM` is deprecated in favor of the new
+     :class:`BayesianGaussianMixture` (with the parameter
+     ``weight_concentration_prior_type='dirichlet_distribution'``).
+     The new class solves the computational
      problems of the old class and computes the Variational Bayesian Gaussian
      mixture faster than before.
-     Ref :ref:`b` for more information.
      (`#6651 <https://github.com/scikit-learn/scikit-learn/pull/6651>`_) by
      `Wei Xue`_ and `Thierry Guillemot`_.
 
diff --git a/examples/mixture/plot_bayesian_gaussian_mixture.py b/examples/mixture/plot_bayesian_gaussian_mixture.py
deleted file mode 100644
index 9efea3f04c7feac84fd9855b8b92726e331c1e02..0000000000000000000000000000000000000000
--- a/examples/mixture/plot_bayesian_gaussian_mixture.py
+++ /dev/null
@@ -1,114 +0,0 @@
-"""
-======================================================
-Bayesian Gaussian Mixture Concentration Prior Analysis
-======================================================
-
-Plot the resulting ellipsoids of a mixture of three Gaussians with
-variational Bayesian Gaussian Mixture for three different values on the
-prior the dirichlet concentration.
-
-For all models, the Variationnal Bayesian Gaussian Mixture adapts its number of
-mixture automatically. The parameter `dirichlet_concentration_prior` has a
-direct link with the resulting number of components. Specifying a high value of
-`dirichlet_concentration_prior` leads more often to uniformly-sized mixture
-components, while specifying small (under 0.1) values will lead to some mixture
-components getting almost all the points while most mixture components will be
-centered on just a few of the remaining points.
-"""
-# Author: Thierry Guillemot <thierry.guillemot.work@gmail.com>
-# License: BSD 3 clause
-
-import numpy as np
-import matplotlib as mpl
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-
-from sklearn.mixture import BayesianGaussianMixture
-
-print(__doc__)
-
-
-def plot_ellipses(ax, weights, means, covars):
-    for n in range(means.shape[0]):
-        v, w = np.linalg.eigh(covars[n][:2, :2])
-        u = w[0] / np.linalg.norm(w[0])
-        angle = np.arctan2(u[1], u[0])
-        angle = 180 * angle / np.pi  # convert to degrees
-        v = 2 * np.sqrt(2) * np.sqrt(v)
-        ell = mpl.patches.Ellipse(means[n, :2], v[0], v[1], 180 + angle)
-        ell.set_clip_box(ax.bbox)
-        ell.set_alpha(weights[n])
-        ax.add_artist(ell)
-
-
-def plot_results(ax1, ax2, estimator, dirichlet_concentration_prior, X, y, plot_title=False):
-    estimator.dirichlet_concentration_prior = dirichlet_concentration_prior
-    estimator.fit(X)
-    ax1.set_title("Bayesian Gaussian Mixture for "
-                  r"$dc_0=%.1e$" % dirichlet_concentration_prior)
-    # ax1.axis('equal')
-    ax1.scatter(X[:, 0], X[:, 1], s=5, marker='o', color=colors[y], alpha=0.8)
-    ax1.set_xlim(-2., 2.)
-    ax1.set_ylim(-3., 3.)
-    ax1.set_xticks(())
-    ax1.set_yticks(())
-    plot_ellipses(ax1, estimator.weights_, estimator.means_,
-                  estimator.covariances_)
-
-    ax2.get_xaxis().set_tick_params(direction='out')
-    ax2.yaxis.grid(True, alpha=0.7)
-    for k, w in enumerate(estimator.weights_):
-        ax2.bar(k - .45, w, width=0.9, color='royalblue', zorder=3)
-        ax2.text(k, w + 0.007, "%.1f%%" % (w * 100.),
-                 horizontalalignment='center')
-    ax2.set_xlim(-.6, 2 * n_components - .4)
-    ax2.set_ylim(0., 1.1)
-    ax2.tick_params(axis='y', which='both', left='off',
-                    right='off', labelleft='off')
-    ax2.tick_params(axis='x', which='both', top='off')
-
-    if plot_title:
-        ax1.set_ylabel('Estimated Mixtures')
-        ax2.set_ylabel('Weight of each component')
-
-# Parameters
-random_state = 2
-n_components, n_features = 3, 2
-colors = np.array(['mediumseagreen', 'royalblue', 'r', 'gold',
-                   'orchid', 'indigo', 'darkcyan', 'tomato'])
-dirichlet_concentration_prior = np.logspace(-3, 3, 3)
-covars = np.array([[[.7, .0], [.0, .1]],
-                   [[.5, .0], [.0, .1]],
-                   [[.5, .0], [.0, .1]]])
-samples = np.array([200, 500, 200])
-means = np.array([[.0, -.70],
-                  [.0, .0],
-                  [.0, .70]])
-
-
-# Here we put beta_prior to 0.8 to minimize the influence of the prior for this
-# dataset
-estimator = BayesianGaussianMixture(n_components=2 * n_components,
-                                    init_params='random', max_iter=1500,
-                                    mean_precision_prior=.8, tol=1e-9,
-                                    random_state=random_state)
-
-# Generate data
-rng = np.random.RandomState(random_state)
-X = np.vstack([
-    rng.multivariate_normal(means[j], covars[j], samples[j])
-    for j in range(n_components)])
-y = np.concatenate([j * np.ones(samples[j], dtype=int)
-                    for j in range(n_components)])
-
-# Plot Results
-plt.figure(figsize=(4.7 * 3, 8))
-plt.subplots_adjust(bottom=.04, top=0.95, hspace=.05, wspace=.05,
-                    left=.03, right=.97)
-
-gs = gridspec.GridSpec(3, len(dirichlet_concentration_prior))
-for k, dc in enumerate(dirichlet_concentration_prior):
-    plot_results(plt.subplot(gs[0:2, k]), plt.subplot(gs[2, k]),
-                 estimator, dc, X, y, plot_title=k == 0)
-
-plt.show()
diff --git a/examples/mixture/plot_concentration_prior.py b/examples/mixture/plot_concentration_prior.py
new file mode 100644
index 0000000000000000000000000000000000000000..49d9e6211b351500fe67ffafe5e30bf944784c21
--- /dev/null
+++ b/examples/mixture/plot_concentration_prior.py
@@ -0,0 +1,135 @@
+"""
+==========================================================================
+Concentration Prior Type Analysis of Variational Bayesian Gaussian Mixture
+==========================================================================
+
+This example plots the ellipsoids obtained from a toy dataset (mixture of three
+Gaussians) fitted by the ``BayesianGaussianMixture`` class models with a
+Dirichlet distribution prior
+(``weight_concentration_prior_type='dirichlet_distribution'``) and a Dirichlet
+process prior (``weight_concentration_prior_type='dirichlet_process'``). On
+each figure, we plot the results for three different values of the weight
+concentration prior.
+
+The ``BayesianGaussianMixture`` class can adapt its number of mixture
+components automatically. The parameter ``weight_concentration_prior`` has a
+direct link with the resulting number of components with non-zero weights.
+Specifying a low value for the concentration prior will make the model put
+most of the weight on a few components and set the remaining components'
+weights very close to zero. High values of the concentration prior will allow
+a larger number of components to be active in the mixture.
+
+The Dirichlet process prior makes it possible to define an infinite number of
+components and automatically selects the number of active components: it
+activates a component only if it is necessary.
+
+On the contrary the classical finite mixture model with a Dirichlet
+distribution prior will favor more uniformly weighted components and therefore
+tends to divide natural clusters into unnecessary sub-components.
+"""
+# Author: Thierry Guillemot <thierry.guillemot.work@gmail.com>
+# License: BSD 3 clause
+
+import numpy as np
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+
+from sklearn.mixture import BayesianGaussianMixture
+
+print(__doc__)
+
+
+def plot_ellipses(ax, weights, means, covars):
+    for n in range(means.shape[0]):
+        eig_vals, eig_vecs = np.linalg.eigh(covars[n])
+        unit_eig_vec = eig_vecs[0] / np.linalg.norm(eig_vecs[0])
+        angle = np.arctan2(unit_eig_vec[1], unit_eig_vec[0])
+        # Ellipse needs degrees
+        angle = 180 * angle / np.pi
+        # scale eigenvalues to the ellipse axis lengths
+        eig_vals = 2 * np.sqrt(2) * np.sqrt(eig_vals)
+        ell = mpl.patches.Ellipse(means[n], eig_vals[0], eig_vals[1],
+                                  180 + angle)
+        ell.set_clip_box(ax.bbox)
+        ell.set_alpha(weights[n])
+        ell.set_facecolor('#56B4E9')
+        ax.add_artist(ell)
+
+
+def plot_results(ax1, ax2, estimator, X, y, title, plot_title=False):
+    ax1.set_title(title)
+    ax1.scatter(X[:, 0], X[:, 1], s=5, marker='o', color=colors[y], alpha=0.8)
+    ax1.set_xlim(-2., 2.)
+    ax1.set_ylim(-3., 3.)
+    ax1.set_xticks(())
+    ax1.set_yticks(())
+    plot_ellipses(ax1, estimator.weights_, estimator.means_,
+                  estimator.covariances_)
+
+    ax2.get_xaxis().set_tick_params(direction='out')
+    ax2.yaxis.grid(True, alpha=0.7)
+    for k, w in enumerate(estimator.weights_):
+        ax2.bar(k - .45, w, width=0.9, color='#56B4E9', zorder=3)
+        ax2.text(k, w + 0.007, "%.1f%%" % (w * 100.),
+                 horizontalalignment='center')
+    ax2.set_xlim(-.6, 2 * n_components - .4)
+    ax2.set_ylim(0., 1.1)
+    ax2.tick_params(axis='y', which='both', left='off',
+                    right='off', labelleft='off')
+    ax2.tick_params(axis='x', which='both', top='off')
+
+    if plot_title:
+        ax1.set_ylabel('Estimated Mixtures')
+        ax2.set_ylabel('Weight of each component')
+
+# Parameters of the dataset
+random_state, n_components, n_features = 2, 3, 2
+colors = np.array(['#0072B2', '#F0E442', '#D55E00'])
+
+covars = np.array([[[.7, .0], [.0, .1]],
+                   [[.5, .0], [.0, .1]],
+                   [[.5, .0], [.0, .1]]])
+samples = np.array([200, 500, 200])
+means = np.array([[.0, -.70],
+                  [.0, .0],
+                  [.0, .70]])
+
+# mean_precision_prior= 0.8 to minimize the influence of the prior
+estimators = [
+    ("Finite mixture with a Dirichlet distribution\nprior and "
+     r"$\gamma_0=$", BayesianGaussianMixture(
+        weight_concentration_prior_type="dirichlet_distribution",
+        n_components=2 * n_components, reg_covar=0, init_params='random',
+        max_iter=1500, mean_precision_prior=.8,
+        random_state=random_state), [0.001, 1, 1000]),
+    ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$",
+     BayesianGaussianMixture(
+        weight_concentration_prior_type="dirichlet_process",
+        n_components=2 * n_components, reg_covar=0, init_params='random',
+        max_iter=1500, mean_precision_prior=.8,
+        random_state=random_state), [1, 1000, 100000])]
+
+# Generate data
+rng = np.random.RandomState(random_state)
+X = np.vstack([
+    rng.multivariate_normal(means[j], covars[j], samples[j])
+    for j in range(n_components)])
+y = np.concatenate([j * np.ones(samples[j], dtype=int)
+                    for j in range(n_components)])
+
+# Plot results in two different figures
+for (title, estimator, concentrations_prior) in estimators:
+    plt.figure(figsize=(4.7 * 3, 8))
+    plt.subplots_adjust(bottom=.04, top=0.90, hspace=.05, wspace=.05,
+                        left=.03, right=.99)
+
+    gs = gridspec.GridSpec(3, len(concentrations_prior))
+    for k, concentration in enumerate(concentrations_prior):
+        estimator.weight_concentration_prior = concentration
+        estimator.fit(X)
+        plot_results(plt.subplot(gs[0:2, k]), plt.subplot(gs[2, k]), estimator,
+                     X, y, r"%s$%.1e$" % (title, concentration),
+                     plot_title=k == 0)
+
+plt.show()
diff --git a/examples/mixture/plot_gmm.py b/examples/mixture/plot_gmm.py
index d6b0839c193a7647475ff1489f20121eaf830eba..5f2f8596d4bbea1765c1be2c7ec4ab84a5e8c3dc 100644
--- a/examples/mixture/plot_gmm.py
+++ b/examples/mixture/plot_gmm.py
@@ -3,16 +3,18 @@
 Gaussian Mixture Model Ellipsoids
 =================================
 
-Plot the confidence ellipsoids of a mixture of two Gaussians with EM
-and variational Dirichlet process.
-
-Both models have access to five components with which to fit the
-data. Note that the EM model will necessarily use all five components
-while the DP model will effectively only use as many as are needed for
-a good fit. This is a property of the Dirichlet Process prior. Here we
-can see that the EM model splits some components arbitrarily, because it
-is trying to fit too many components, while the Dirichlet Process model
-adapts it number of state automatically.
+Plot the confidence ellipsoids of a mixture of two Gaussians
+obtained with Expectation Maximisation (``GaussianMixture`` class) and
+Variational Inference (``BayesianGaussianMixture`` class models with
+a Dirichlet process prior).
+
+Both models have access to five components with which to fit the data. Note
+that the Expectation Maximisation model will necessarily use all five
+components while the Variational Inference model will effectively only use as
+many as are needed for a good fit. Here we can see that the Expectation
+Maximisation model splits some components arbitrarily, because it is trying to
+fit too many components, while the Dirichlet Process model adapts its number
+of components automatically.
 
 This example doesn't show it, as we're in a low-dimensional space, but
 another advantage of the Dirichlet process model is that it can fit
@@ -56,7 +58,7 @@ def plot_results(X, Y_, means, covariances, index, title):
         ell.set_alpha(0.5)
         splot.add_artist(ell)
 
-    plt.xlim(-10., 10.)
+    plt.xlim(-9., 5.)
     plt.ylim(-3., 6.)
     plt.xticks(())
     plt.yticks(())
@@ -78,8 +80,9 @@ plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0,
              'Gaussian Mixture')
 
 # Fit a Dirichlet process Gaussian mixture using five components
-dpgmm = mixture.DPGMM(n_components=5, covariance_type='full').fit(X)
-plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm._get_covars(), 1,
-             'Dirichlet Process GMM')
+dpgmm = mixture.BayesianGaussianMixture(n_components=5,
+                                        covariance_type='full').fit(X)
+plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1,
+             'Bayesian Gaussian Mixture with a Dirichlet process prior')
 
 plt.show()
diff --git a/examples/mixture/plot_gmm_sin.py b/examples/mixture/plot_gmm_sin.py
index cc9217740a195e9afa4d2c492688a8912d481d82..f5fb2ded45120d51b606488b24c6ead64708c61d 100644
--- a/examples/mixture/plot_gmm_sin.py
+++ b/examples/mixture/plot_gmm_sin.py
@@ -3,15 +3,40 @@
 Gaussian Mixture Model Sine Curve
 =================================
 
-This example highlights the advantages of the Dirichlet Process:
-complexity control and dealing with sparse data. The dataset is formed
-by 100 points loosely spaced following a noisy sine curve. The fit by
-the GMM class, using the expectation-maximization algorithm to fit a
-mixture of 10 Gaussian components, finds too-small components and very
-little structure. The fits by the Dirichlet process, however, show
-that the model can either learn a global structure for the data (small
-alpha) or easily interpolate to finding relevant local structure
-(large alpha), never falling into the problems shown by the GMM class.
+This example demonstrates the behavior of Gaussian mixture models fit on data
+that was not sampled from a mixture of Gaussian random variables. The dataset
+is formed by 100 points loosely spaced following a noisy sine curve. There is
+therefore no ground truth value for the number of Gaussian components.
+
+The first model is a classical Gaussian Mixture Model with 10 components fit
+with the Expectation-Maximization algorithm.
+
+The second model is a Bayesian Gaussian Mixture Model with a Dirichlet process
+prior fit with variational inference. The low value of the concentration prior
+makes the model favor a lower number of active components. This model
+"decides" to focus its modeling power on the big picture of the structure of
+the dataset: groups of points with alternating directions modeled by
+non-diagonal covariance matrices. Those alternating directions roughly capture
+the alternating nature of the original sine signal.
+
+The third model is also a Bayesian Gaussian mixture model with a Dirichlet
+process prior but this time the value of the concentration prior is higher
+giving the model more liberty to model the fine-grained structure of the data.
+The result is a mixture with a larger number of active components that is
+similar to the first model where we arbitrarily decided to fix the number of
+components to 10.
+
+Which model is the best is a matter of subjective judgement: do we want to
+favor models that only capture the big picture to summarize and explain most of
+the structure of the data while ignoring the details or do we prefer models
+that closely follow the high density regions of the signal?
+
+The last two panels show how we can sample from the last two models. The
+resulting sample distributions do not look exactly like the original data
+distribution. The difference primarily stems from the approximation error we
+made by using a model that assumes that the data was generated by a finite
+number of Gaussian components instead of a continuous noisy sine curve.
+
 """
 
 import itertools
@@ -23,12 +48,14 @@ import matplotlib as mpl
 
 from sklearn import mixture
 
+print(__doc__)
+
 color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold',
                               'darkorange'])
 
 
-def plot_results(X, Y_, means, covariances, index, title):
-    splot = plt.subplot(3, 1, 1 + index)
+def plot_results(X, Y, means, covariances, index, title):
+    splot = plt.subplot(5, 1, 1 + index)
     for i, (mean, covar, color) in enumerate(zip(
             means, covariances, color_iter)):
         v, w = linalg.eigh(covar)
@@ -37,9 +64,9 @@ def plot_results(X, Y_, means, covariances, index, title):
         # as the DP will not use every component it has access to
         # unless it needs it, we shouldn't plot the redundant
         # components.
-        if not np.any(Y_ == i):
+        if not np.any(Y == i):
             continue
-        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
+        plt.scatter(X[Y == i, 0], X[Y == i, 1], .8, color=color)
 
         # Plot an ellipse to show the Gaussian component
         angle = np.arctan(u[1] / u[0])
@@ -56,7 +83,24 @@ def plot_results(X, Y_, means, covariances, index, title):
     plt.yticks(())
 
 
-# Number of samples per component
+def plot_samples(X, Y, n_components, index, title):
+    plt.subplot(5, 1, 4 + index)
+    for i, color in zip(range(n_components), color_iter):
+        # as the DP will not use every component it has access to
+        # unless it needs it, we shouldn't plot the redundant
+        # components.
+        if not np.any(Y == i):
+            continue
+        plt.scatter(X[Y == i, 0], X[Y == i, 1], .8, color=color)
+
+    plt.xlim(-6., 4. * np.pi - 6.)
+    plt.ylim(-5., 5.)
+    plt.title(title)
+    plt.xticks(())
+    plt.yticks(())
+
+
+# Parameters
 n_samples = 100
 
 # Generate random sample following a sine curve
@@ -69,22 +113,42 @@ for i in range(X.shape[0]):
     X[i, 0] = x + np.random.normal(0, 0.1)
     X[i, 1] = 3. * (np.sin(x) + np.random.normal(0, .2))
 
+plt.figure(figsize=(10, 10))
+plt.subplots_adjust(bottom=.04, top=0.95, hspace=.2, wspace=.05,
+                    left=.03, right=.97)
+
 # Fit a Gaussian mixture with EM using ten components
 gmm = mixture.GaussianMixture(n_components=10, covariance_type='full',
                               max_iter=100).fit(X)
 plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0,
              'Expectation-maximization')
 
-# Fit a Dirichlet process Gaussian mixture using ten components
-dpgmm = mixture.DPGMM(n_components=10, covariance_type='full', alpha=0.01,
-                      n_iter=100).fit(X)
-plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm._get_covars(), 1,
-             'Dirichlet Process,alpha=0.01')
-
+dpgmm = mixture.BayesianGaussianMixture(
+    n_components=10, covariance_type='full', weight_concentration_prior=1e-2,
+    weight_concentration_prior_type='dirichlet_process',
+    mean_precision_prior=1e-2, covariance_prior=1e0 * np.eye(2),
+    init_params="random", max_iter=100, random_state=2).fit(X)
+plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1,
+             "Bayesian Gaussian mixture models with a Dirichlet process prior "
+             r"for $\gamma_0=0.01$.")
+
+X_s, y_s = dpgmm.sample(n_samples=2000)
+plot_samples(X_s, y_s, dpgmm.n_components, 0,
+             "Gaussian mixture with a Dirichlet process prior "
+             r"for $\gamma_0=0.01$ sampled with $2000$ samples.")
+
+dpgmm = mixture.BayesianGaussianMixture(
+    n_components=10, covariance_type='full', weight_concentration_prior=1e+2,
+    weight_concentration_prior_type='dirichlet_process',
+    mean_precision_prior=1e-2, covariance_prior=1e0 * np.eye(2),
+    init_params="kmeans", max_iter=100, random_state=2).fit(X)
+plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 2,
+             "Bayesian Gaussian mixture models with a Dirichlet process prior "
+             r"for $\gamma_0=100$")
+
+X_s, y_s = dpgmm.sample(n_samples=2000)
+plot_samples(X_s, y_s, dpgmm.n_components, 1,
+             "Gaussian mixture with a Dirichlet process prior "
+             r"for $\gamma_0=100$ sampled with $2000$ samples.")
 
-# Fit a Dirichlet process Gaussian mixture using ten components
-dpgmm = mixture.DPGMM(n_components=10, covariance_type='diag', alpha=100.,
-                      n_iter=100).fit(X)
-plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm._get_covars(), 2,
-             'Dirichlet Process,alpha=100.')
 plt.show()
diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py
index 062d4032288596f05cfc1c764cfbd9f01198f70b..38ee12e9256949635096905fd07718835c783dd2 100644
--- a/sklearn/mixture/base.py
+++ b/sklearn/mixture/base.py
@@ -130,15 +130,17 @@ class BaseMixture(six.with_metaclass(ABCMeta, DensityMixin, BaseEstimator)):
         """
         pass
 
-    def _initialize_parameters(self, X):
+    def _initialize_parameters(self, X, random_state):
         """Initialize the model parameters.
 
         Parameters
         ----------
         X : array-like, shape  (n_samples, n_features)
+
+        random_state : RandomState
+            A random number generator instance.
         """
         n_samples, _ = X.shape
-        random_state = check_random_state(self.random_state)
 
         if self.init_params == 'kmeans':
             resp = np.zeros((n_samples, self.n_components))
@@ -195,12 +197,14 @@ class BaseMixture(six.with_metaclass(ABCMeta, DensityMixin, BaseEstimator)):
         max_lower_bound = -np.infty
         self.converged_ = False
 
+        random_state = check_random_state(self.random_state)
+
         n_samples, _ = X.shape
         for init in range(n_init):
             self._print_verbose_msg_init_beg(init)
 
             if do_init:
-                self._initialize_parameters(X)
+                self._initialize_parameters(X, random_state)
                 self.lower_bound_ = -np.infty
 
             for n_iter in range(self.max_iter):
@@ -228,7 +232,7 @@ class BaseMixture(six.with_metaclass(ABCMeta, DensityMixin, BaseEstimator)):
         if not self.converged_:
             warnings.warn('Initialization %d did not converged. '
                           'Try different init parameters, '
-                          'or increase n_init, tol '
+                          'or increase max_iter, tol '
                           'or check for degenerate data.'
                           % (init + 1), ConvergenceWarning)
 
@@ -355,6 +359,51 @@ class BaseMixture(six.with_metaclass(ABCMeta, DensityMixin, BaseEstimator)):
         _, log_resp = self._estimate_log_prob_resp(X)
         return np.exp(log_resp)
 
+    def sample(self, n_samples=1):
+        """Generate random samples from the fitted Gaussian distribution.
+
+        Parameters
+        ----------
+        n_samples : int, optional
+            Number of samples to generate. Defaults to 1.
+
+        Returns
+        -------
+        X : array, shape (n_samples, n_features)
+            Randomly generated sample.
+
+        y : array, shape (n_samples,)
+            Component labels of the generated samples.
+        """
+        self._check_is_fitted()
+
+        if n_samples < 1:
+            raise ValueError(
+                "Invalid value for 'n_samples': %d . The sampling requires at "
+                "least one sample." % (self.n_components))
+
+        _, n_features = self.means_.shape
+        rng = check_random_state(self.random_state)
+        n_samples_comp = np.round(self.weights_ * n_samples).astype(int)
+
+        if self.covariance_type == 'full':
+            X = np.vstack([
+                rng.multivariate_normal(mean, covariance, int(sample))
+                for (mean, covariance, sample) in zip(
+                    self.means_, self.covariances_, n_samples_comp)])
+        elif self.covariance_type == "tied":
+            X = np.vstack([
+                rng.multivariate_normal(mean, self.covariances_, int(sample))
+                for (mean, sample) in zip(
+                    self.means_, n_samples_comp)])
+        else:
+            X = np.vstack([
+                mean + rng.randn(sample, n_features) * np.sqrt(covariance)
+                for (mean, covariance, sample) in zip(
+                    self.means_, self.covariances_, n_samples_comp)])
+
+        y = np.concatenate([j * np.ones(sample, dtype=int)
+                           for j, sample in enumerate(n_samples_comp)])
+
+        return (X, y)
+
     def _estimate_weighted_log_prob(self, X):
         """Estimate the weighted log-probabilities, log P(X | Z) + log weights.
 
diff --git a/sklearn/mixture/bayesian_mixture.py b/sklearn/mixture/bayesian_mixture.py
index 2b5d4458c879ee4d7ac15b624e44b95fd8610aef..97aaacb4564c1d00c8f17524d639c8cf8b2e5bb0 100644
--- a/sklearn/mixture/bayesian_mixture.py
+++ b/sklearn/mixture/bayesian_mixture.py
@@ -5,7 +5,7 @@
 
 import math
 import numpy as np
-from scipy.special import digamma, gammaln
+from scipy.special import betaln, digamma, gammaln
 
 from .base import BaseMixture, _check_shape
 from .gaussian_mixture import _check_precision_matrix
@@ -63,19 +63,29 @@ def _log_wishart_norm(degrees_of_freedom, log_det_precisions_chol, n_features):
 
 
 class BayesianGaussianMixture(BaseMixture):
-    """Variational estimation of a Gaussian mixture.
+    """Variational Bayesian estimation of a Gaussian mixture.
 
     This class allows to infer an approximate posterior distribution over the
     parameters of a Gaussian mixture distribution. The effective number of
     components can be inferred from the data.
 
+    This class implements two types of prior for the weights distribution: a
+    finite mixture model with Dirichlet distribution and an infinite mixture
+    model with the Dirichlet Process. In practice Dirichlet Process inference
+    algorithm is approximated and uses a truncated distribution with a fixed
+    maximum number of components (called the Stick-breaking representation).
+    The number of components actually used almost always depends on the data.
+
+    .. versionadded:: 0.18
+       *BayesianGaussianMixture*.
+
     Read more in the :ref:`User Guide <bgmm>`.
 
     Parameters
     ----------
     n_components : int, defaults to 1.
         The number of mixture components. Depending on the data and the value
-        of the `dirichlet_concentration_prior` the model can decide to not use
+        of the `weight_concentration_prior` the model can decide to not use
         all the components by setting some component `weights_` to values very
         close to zero. The number of effective components is therefore smaller
         than n_components.
@@ -83,10 +93,10 @@ class BayesianGaussianMixture(BaseMixture):
     covariance_type : {'full', 'tied', 'diag', 'spherical'}, defaults to 'full'.
         String describing the type of covariance parameters to use.
         Must be one of::
-        'full' (each component has its own general covariance matrix).
+        'full' (each component has its own general covariance matrix),
         'tied' (all components share the same general covariance matrix),
         'diag' (each component has its own diagonal covariance matrix),
-        'spherical' (each component has its own single variance),
+        'spherical' (each component has its own single variance).
 
     tol : float, defaults to 1e-3.
         The convergence threshold. EM iterations will stop when the
@@ -111,7 +121,13 @@ class BayesianGaussianMixture(BaseMixture):
         'kmeans' : responsibilities are initialized using kmeans.
         'random' : responsibilities are initialized randomly.
 
-    dirichlet_concentration_prior : float | None, optional.
+    weight_concentration_prior_type : str, defaults to 'dirichlet_process'.
+        String describing the type of the weight concentration prior.
+        Must be one of::
+        'dirichlet_process' (using the Stick-breaking representation),
+        'dirichlet_distribution' (can favor more uniform weights).
+
+    weight_concentration_prior : float | None, optional.
         The dirichlet concentration of each component on the weight
         distribution (Dirichlet). The higher concentration puts more mass in
         the center and will lead to more components being active, while a lower
@@ -122,7 +138,7 @@ class BayesianGaussianMixture(BaseMixture):
     mean_precision_prior : float | None, optional.
         The precision prior on the mean distribution (Gaussian).
         Controls the extend to where means can be placed. Smaller
-        values concentrates the means of each clusters around `mean_prior`.
+        values concentrate the means of each cluster around `mean_prior`.
         The value of the parameter must be greater than 0.
         If it is None, it's set to 1.
 
@@ -157,6 +173,9 @@ class BayesianGaussianMixture(BaseMixture):
         it prints also the log probability and the time needed
         for each step.
 
+    verbose_interval : int, defaults to 10.
+        Number of iterations done before the next print.
+
     Attributes
     ----------
     weights_ : array-like, shape (`n_components`,)
@@ -210,21 +229,25 @@ class BayesianGaussianMixture(BaseMixture):
         Lower bound value on the likelihood (of the training data with
         respect to the model) of the best fit of inference.
 
-    dirichlet_concentration_prior_ : float
+    weight_concentration_prior_ : tuple or float
         The dirichlet concentration of each component on the weight
-        distribution (Dirichlet). The higher concentration puts more mass in
+        distribution (Dirichlet). The type depends on
+        `weight_concentration_prior_type`::
+            (float, float) if 'dirichlet_process' (Beta parameters),
+            float          if 'dirichlet_distribution' (Dirichlet parameters).
+        The higher concentration puts more mass in
         the center and will lead to more components being active, while a lower
         concentration parameter will lead to more mass at the edge of the
         simplex.
 
-    dirichlet_concentration_ : array-like, shape (`n_components`, )
+    weight_concentration_ : array-like, shape (`n_components`, )
         The dirichlet concentration of each component on the weight
         distribution (Dirichlet).
 
     mean_precision_prior : float
         The precision prior on the mean distribution (Gaussian).
         Controls the extend to where means can be placed.
-        Smaller values concentrates the means of each clusters around
+        Smaller values concentrate the means of each cluster around
         `mean_prior`.
 
     mean_precision_ : array-like, shape (`n_components`, )
@@ -251,15 +274,32 @@ class BayesianGaussianMixture(BaseMixture):
     See Also
     --------
     GaussianMixture : Finite Gaussian mixture fit with EM.
+
+    References
+    ----------
+
+    .. [1] `Bishop, Christopher M. (2006). "Pattern recognition and machine
+       learning". Vol. 4 No. 4. New York: Springer.
+       <http://www.springer.com/kr/book/9780387310732>`_
+
+    .. [2] `Hagai Attias. (2000). "A Variational Bayesian Framework for
+       Graphical Models". In Advances in Neural Information Processing
+       Systems 12.
+       <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.36.2841&rep=rep1&type=pdf>`_
+
+    .. [3] `Blei, David M. and Michael I. Jordan. (2006). "Variational
+       inference for Dirichlet process mixtures". Bayesian analysis 1.1
+       <http://www.cs.princeton.edu/courses/archive/fall11/cos597C/reading/BleiJordan2005.pdf>`_
     """
 
     def __init__(self, n_components=1, covariance_type='full', tol=1e-3,
                  reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans',
-                 dirichlet_concentration_prior=None,
+                 weight_concentration_prior_type='dirichlet_process',
+                 weight_concentration_prior=None,
                  mean_precision_prior=None, mean_prior=None,
                  degrees_of_freedom_prior=None, covariance_prior=None,
                  random_state=None, warm_start=False, verbose=0,
-                 verbose_interval=20):
+                 verbose_interval=10):
         super(BayesianGaussianMixture, self).__init__(
             n_components=n_components, tol=tol, reg_covar=reg_covar,
             max_iter=max_iter, n_init=n_init, init_params=init_params,
@@ -267,7 +307,8 @@ class BayesianGaussianMixture(BaseMixture):
             verbose=verbose, verbose_interval=verbose_interval)
 
         self.covariance_type = covariance_type
-        self.dirichlet_concentration_prior = dirichlet_concentration_prior
+        self.weight_concentration_prior_type = weight_concentration_prior_type
+        self.weight_concentration_prior = weight_concentration_prior
         self.mean_precision_prior = mean_precision_prior
         self.mean_prior = mean_prior
         self.degrees_of_freedom_prior = degrees_of_freedom_prior
@@ -285,6 +326,15 @@ class BayesianGaussianMixture(BaseMixture):
                              "'covariance_type' should be in "
                              "['spherical', 'tied', 'diag', 'full']"
                              % self.covariance_type)
+
+        if (self.weight_concentration_prior_type not in
+                ['dirichlet_process', 'dirichlet_distribution']):
+            raise ValueError(
+                "Invalid value for 'weight_concentration_prior_type': %s "
+                "'weight_concentration_prior_type' should be in "
+                "['dirichlet_process', 'dirichlet_distribution']"
+                % self.weight_concentration_prior_type)
+
         self._check_weights_parameters()
         self._check_means_parameters(X)
         self._check_precision_parameters(X)
@@ -292,15 +342,15 @@ class BayesianGaussianMixture(BaseMixture):
 
     def _check_weights_parameters(self):
         """Check the parameter of the Dirichlet distribution."""
-        if self.dirichlet_concentration_prior is None:
-            self.dirichlet_concentration_prior_ = 1. / self.n_components
-        elif self.dirichlet_concentration_prior > 0.:
-            self.dirichlet_concentration_prior_ = (
-                self.dirichlet_concentration_prior)
+        if self.weight_concentration_prior is None:
+            self.weight_concentration_prior_ = 1. / self.n_components
+        elif self.weight_concentration_prior > 0.:
+            self.weight_concentration_prior_ = (
+                self.weight_concentration_prior)
         else:
-            raise ValueError("The parameter 'dirichlet_concentration_prior' "
+            raise ValueError("The parameter 'weight_concentration_prior' "
                              "should be greater than 0., but got %.3f."
-                             % self.dirichlet_concentration_prior)
+                             % self.weight_concentration_prior)
 
     def _check_means_parameters(self, X):
         """Check the parameters of the Gaussian distribution.
@@ -410,8 +460,16 @@ class BayesianGaussianMixture(BaseMixture):
         ----------
         nk : array-like, shape (n_components,)
         """
-        self.dirichlet_concentration_ = (
-            self.dirichlet_concentration_prior_ + nk)
+        if self.weight_concentration_prior_type == 'dirichlet_process':
+            # For dirichlet process weight_concentration will be a tuple
+            # containing the two parameters of the beta distribution
+            self.weight_concentration_ = (
+                1. + nk,
+                (self.weight_concentration_prior_ +
+                 np.hstack((np.cumsum(nk[::-1])[-2::-1], 0))))
+        else:
+            # variational Gaussian mixture with a Dirichlet distribution prior
+            self.weight_concentration_ = self.weight_concentration_prior_ + nk
 
     def _estimate_means(self, nk, xk):
         """Estimate the parameters of the Gaussian distribution.
@@ -575,7 +633,7 @@ class BayesianGaussianMixture(BaseMixture):
         self.covariances_ /= self.degrees_of_freedom_
 
     def _check_is_fitted(self):
-        check_is_fitted(self, ['dirichlet_concentration_', 'mean_precision_',
+        check_is_fitted(self, ['weight_concentration_', 'mean_precision_',
                                'means_', 'degrees_of_freedom_',
                                'covariances_', 'precisions_',
                                'precisions_cholesky_'])
@@ -600,8 +658,17 @@ class BayesianGaussianMixture(BaseMixture):
         self._estimate_precisions(nk, xk, sk)
 
     def _estimate_log_weights(self):
-        return (digamma(self.dirichlet_concentration_) -
-                digamma(np.sum(self.dirichlet_concentration_)))
+        if self.weight_concentration_prior_type == 'dirichlet_process':
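+            # E[ln pi_k] = E[ln V_k] + sum_{j < k} E[ln (1 - V_j)] with
+            # V_k ~ Beta(a_k, b_k), E[ln V_k] = psi(a_k) - psi(a_k + b_k)
+            # and E[ln (1 - V_k)] = psi(b_k) - psi(a_k + b_k)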
+            digamma_sum = digamma(self.weight_concentration_[0] +
+                                  self.weight_concentration_[1])
+            digamma_a = digamma(self.weight_concentration_[0])
+            digamma_b = digamma(self.weight_concentration_[1])
+            return (digamma_a - digamma_sum +
+                    np.hstack((0, np.cumsum(digamma_b - digamma_sum)[:-1])))
+        else:
+            # case: variational Gaussian mixture with Dirichlet distribution
+            return (digamma(self.weight_concentration_) -
+                    digamma(np.sum(self.weight_concentration_)))
 
     def _estimate_log_prob(self, X):
         _, n_features = X.shape
@@ -657,25 +724,41 @@ class BayesianGaussianMixture(BaseMixture):
             log_wishart = np.sum(_log_wishart_norm(
                 self.degrees_of_freedom_, log_det_precisions_chol, n_features))
 
-        return (-np.sum(np.exp(log_resp) * log_resp) - log_wishart -
-                _log_dirichlet_norm(self.dirichlet_concentration_) -
+        if self.weight_concentration_prior_type == 'dirichlet_process':
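+            # log normalization term of the variational beta distributions
+            # on the stick-breaking fractions: -sum_k ln B(a_k, b_k)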
+            log_norm_weight = -np.sum(betaln(self.weight_concentration_[0],
+                                             self.weight_concentration_[1]))
+        else:
+            log_norm_weight = _log_dirichlet_norm(self.weight_concentration_)
+
+        return (-np.sum(np.exp(log_resp) * log_resp) -
+                log_wishart - log_norm_weight -
                 0.5 * n_features * np.sum(np.log(self.mean_precision_)))
 
     def _get_parameters(self):
-        return (self.dirichlet_concentration_,
+        return (self.weight_concentration_,
                 self.mean_precision_, self.means_,
                 self.degrees_of_freedom_, self.covariances_,
                 self.precisions_cholesky_)
 
     def _set_parameters(self, params):
-        (self.dirichlet_concentration_, self.mean_precision_, self.means_,
+        (self.weight_concentration_, self.mean_precision_, self.means_,
          self.degrees_of_freedom_, self.covariances_,
          self.precisions_cholesky_) = params
 
-        # Attributes computation
-        self. weights_ = (self.dirichlet_concentration_ /
-                          np.sum(self.dirichlet_concentration_))
+        # Weights computation
+        if self.weight_concentration_prior_type == "dirichlet_process":
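+            # Expected mixing proportions under the truncated stick-breaking
+            # representation: pi_k = E[V_k] * prod_{j < k} E[1 - V_j],
+            # renormalized to sum to one because of the truncation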
+            weight_dirichlet_sum = (self.weight_concentration_[0] +
+                                    self.weight_concentration_[1])
+            tmp = self.weight_concentration_[1] / weight_dirichlet_sum
+            self.weights_ = (
+                self.weight_concentration_[0] / weight_dirichlet_sum *
+                np.hstack((1, np.cumprod(tmp[:-1]))))
+            self.weights_ /= np.sum(self.weights_)
+        else:
+            self.weights_ = (self.weight_concentration_ /
+                             np.sum(self.weight_concentration_))
 
+        # Precision matrices computation
         if self.covariance_type == 'full':
             self.precisions_ = np.array([
                 np.dot(prec_chol, prec_chol.T)
diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py
index ee2bd64911165bed8fd502a38f72e40b7d2502fc..a8a1e2d928b11de127510ed18df66925b884e3d9 100644
--- a/sklearn/mixture/dpgmm.py
+++ b/sklearn/mixture/dpgmm.py
@@ -10,6 +10,13 @@ from __future__ import print_function
 #         Fabian Pedregosa <fabian.pedregosa@inria.fr>
 #
 
+# Important note for the deprecation cleaning of 0.20:
+# All the functions and classes of this file have been deprecated in 0.18.
+# When you remove this file please also remove the related files
+# - 'sklearn/mixture/gmm.py'
+# - 'sklearn/mixture/tests/test_dpgmm.py'
+# - 'sklearn/mixture/tests/test_gmm.py'
+
 import numpy as np
 from scipy.special import digamma as _digamma, gammaln as _gammaln
 from scipy import linalg
@@ -616,9 +623,11 @@ class _DPGMMBase(_GMMBase):
         return z
 
 
-@deprecated("The DPGMM class is not working correctly and it's better "
-            "to not use it. DPGMM is deprecated in 0.18 and "
-            "will be removed in 0.20.")
+@deprecated("The `DPGMM` class is not working correctly and it's better "
+            "to use `sklearn.mixture.BayesianGaussianMixture` class with "
+            "parameter `weight_concentration_prior_type='dirichlet_process'` "
+            "instead. DPGMM is deprecated in 0.18 and will be "
+            "removed in 0.20.")
 class DPGMM(_DPGMMBase):
     def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
                  random_state=None, tol=1e-3, verbose=0, min_covar=None,
@@ -630,8 +639,10 @@ class DPGMM(_DPGMMBase):
             init_params=init_params)
 
 
-@deprecated("The VBGMM class is not working correctly and it's better "
-            "to use sklearn.mixture.BayesianGaussianMixture class instead. "
+@deprecated("The `VBGMM` class is not working correctly and it's better "
+            "to use `sklearn.mixture.BayesianGaussianMixture` class with "
+            "parameter `weight_concentration_prior_type="
+            "'dirichlet_distribution'` instead. "
             "VBGMM is deprecated in 0.18 and will be removed in 0.20.")
 class VBGMM(_DPGMMBase):
     """Variational Inference for the Gaussian Mixture Model
diff --git a/sklearn/mixture/gaussian_mixture.py b/sklearn/mixture/gaussian_mixture.py
index 749af6c82fd634fa77f2a24f7ad22c32b58b705f..94c3a07d9307586505788d0a5b99d10b8d0d73b0 100644
--- a/sklearn/mixture/gaussian_mixture.py
+++ b/sklearn/mixture/gaussian_mixture.py
@@ -439,6 +439,11 @@ class GaussianMixture(BaseMixture):
     This class allows to estimate the parameters of a Gaussian mixture
     distribution.
 
+    .. versionadded:: 0.18
+       *GaussianMixture*.
+
+    Read more in the :ref:`User Guide <gmm>`.
+
     Parameters
     ----------
     n_components : int, defaults to 1.
@@ -448,10 +453,10 @@ class GaussianMixture(BaseMixture):
         defaults to 'full'.
         String describing the type of covariance parameters to use.
         Must be one of::
-        'full' (each component has its own general covariance matrix).
+        'full' (each component has its own general covariance matrix),
         'tied' (all components share the same general covariance matrix),
         'diag' (each component has its own diagonal covariance matrix),
-        'spherical' (each component has its own single variance),
+        'spherical' (each component has its own single variance).
 
     tol : float, defaults to 1e-3.
         The convergence threshold. EM iterations will stop when the
@@ -506,6 +511,9 @@ class GaussianMixture(BaseMixture):
         it prints also the log probability and the time needed
         for each step.
 
+    verbose_interval : int, defaults to 10.
+        Number of iterations done before the next print.
+
     Attributes
     ----------
     weights_ : array-like, shape (n_components,)
@@ -559,8 +567,8 @@ class GaussianMixture(BaseMixture):
 
     See Also
     --------
-    BayesianGaussianMixture : Finite gaussian mixture model fit with a
-        variational algorithm.
+    BayesianGaussianMixture : Gaussian mixture model fit with variational
+        inference.
     """
 
     def __init__(self, n_components=1, covariance_type='full', tol=1e-3,
diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py
index e5c8734c770bd0f588f71a8cfd6e735ecd25a673..d1a4eb818a6c0d29f315d859f951e99338ffbbc5 100644
--- a/sklearn/mixture/gmm.py
+++ b/sklearn/mixture/gmm.py
@@ -9,6 +9,13 @@ of Gaussian Mixture Models.
 #         Fabian Pedregosa <fabian.pedregosa@inria.fr>
 #         Bertrand Thirion <bertrand.thirion@inria.fr>
 
+# Important note for the deprecation cleaning of 0.20:
+# All the functions and classes of this file have been deprecated in 0.18.
+# When you remove this file please also remove the related files
+# - 'sklearn/mixture/dpgmm.py'
+# - 'sklearn/mixture/tests/test_dpgmm.py'
+# - 'sklearn/mixture/tests/test_gmm.py'
+
 import numpy as np
 from scipy import linalg
 from time import time
@@ -65,6 +72,9 @@ def log_multivariate_normal_density(X, means, covars, covariance_type='diag'):
         X, means, covars)
 
 
+@deprecated("The function sample_gaussian is deprecated in 0.18"
+            " and will be removed in 0.20."
+            " Use numpy.random.multivariate_normal instead.")
 def sample_gaussian(mean, covar, covariance_type='diag', n_samples=1,
                     random_state=None):
     """Generate random samples from a Gaussian distribution.
diff --git a/sklearn/mixture/tests/test_bayesian_mixture.py b/sklearn/mixture/tests/test_bayesian_mixture.py
index 3003a9444669dbaa2cad65267b640731769187a7..e678c07d9236ff21c7bd8ad9f0a62f651301dd95 100644
--- a/sklearn/mixture/tests/test_bayesian_mixture.py
+++ b/sklearn/mixture/tests/test_bayesian_mixture.py
@@ -19,15 +19,16 @@ from sklearn.utils.testing import assert_greater_equal, ignore_warnings
 
 
 COVARIANCE_TYPE = ['full', 'tied', 'diag', 'spherical']
+PRIOR_TYPE = ['dirichlet_process', 'dirichlet_distribution']
 
 
 def test_log_dirichlet_norm():
     rng = np.random.RandomState(0)
 
-    dirichlet_concentration = rng.rand(2)
-    expected_norm = (gammaln(np.sum(dirichlet_concentration)) -
-                     np.sum(gammaln(dirichlet_concentration)))
-    predected_norm = _log_dirichlet_norm(dirichlet_concentration)
+    weight_concentration = rng.rand(2)
+    expected_norm = (gammaln(np.sum(weight_concentration)) -
+                     np.sum(gammaln(weight_concentration)))
+    predected_norm = _log_dirichlet_norm(weight_concentration)
 
     assert_almost_equal(expected_norm, predected_norm)
 
@@ -64,8 +65,22 @@ def test_bayesian_mixture_covariance_type():
                          "Invalid value for 'covariance_type': %s "
                          "'covariance_type' should be in "
                          "['spherical', 'tied', 'diag', 'full']"
-                         % covariance_type,
-                         bgmm.fit, X)
+                         % covariance_type, bgmm.fit, X)
+
+
+def test_bayesian_mixture_weight_concentration_prior_type():
+    rng = np.random.RandomState(0)
+    n_samples, n_features = 10, 2
+    X = rng.rand(n_samples, n_features)
+
+    bad_prior_type = 'bad_prior_type'
+    bgmm = BayesianGaussianMixture(
+        weight_concentration_prior_type=bad_prior_type, random_state=rng)
+    assert_raise_message(ValueError,
+                         "Invalid value for 'weight_concentration_prior_type':"
+                         " %s 'weight_concentration_prior_type' should be in "
+                         "['dirichlet_process', 'dirichlet_distribution']"
+                         % bad_prior_type, bgmm.fit, X)
 
 
 def test_bayesian_mixture_weights_prior_initialisation():
@@ -73,29 +88,29 @@ def test_bayesian_mixture_weights_prior_initialisation():
     n_samples, n_components, n_features = 10, 5, 2
     X = rng.rand(n_samples, n_features)
 
-    # Check raise message for a bad value of dirichlet_concentration_prior
-    bad_dirichlet_concentration_prior_ = 0.
+    # Check raise message for a bad value of weight_concentration_prior
+    bad_weight_concentration_prior_ = 0.
     bgmm = BayesianGaussianMixture(
-        dirichlet_concentration_prior=bad_dirichlet_concentration_prior_,
+        weight_concentration_prior=bad_weight_concentration_prior_,
         random_state=0)
     assert_raise_message(ValueError,
-                         "The parameter 'dirichlet_concentration_prior' "
+                         "The parameter 'weight_concentration_prior' "
                          "should be greater than 0., but got %.3f."
-                         % bad_dirichlet_concentration_prior_,
+                         % bad_weight_concentration_prior_,
                          bgmm.fit, X)
 
-    # Check correct init for a given value of dirichlet_concentration_prior
-    dirichlet_concentration_prior = rng.rand()
+    # Check correct init for a given value of weight_concentration_prior
+    weight_concentration_prior = rng.rand()
     bgmm = BayesianGaussianMixture(
-        dirichlet_concentration_prior=dirichlet_concentration_prior,
+        weight_concentration_prior=weight_concentration_prior,
         random_state=rng).fit(X)
-    assert_almost_equal(dirichlet_concentration_prior,
-                        bgmm.dirichlet_concentration_prior_)
+    assert_almost_equal(weight_concentration_prior,
+                        bgmm.weight_concentration_prior_)
 
-    # Check correct init for the default value of dirichlet_concentration_prior
+    # Check correct init for the default value of weight_concentration_prior
     bgmm = BayesianGaussianMixture(n_components=n_components,
                                    random_state=rng).fit(X)
-    assert_almost_equal(1. / n_components, bgmm.dirichlet_concentration_prior_)
+    assert_almost_equal(1. / n_components, bgmm.weight_concentration_prior_)
 
 
 def test_bayesian_mixture_means_prior_initialisation():
@@ -237,44 +252,57 @@ def test_bayesian_mixture_weights():
     n_samples, n_features = 10, 2
 
     X = rng.rand(n_samples, n_features)
-    bgmm = BayesianGaussianMixture(random_state=rng).fit(X)
-
-    # Check the weights values
-    expected_weights = (bgmm.dirichlet_concentration_ /
-                        np.sum(bgmm.dirichlet_concentration_))
-    predected_weights = bgmm.weights_
 
-    assert_almost_equal(expected_weights, predected_weights)
+    # Case Dirichlet distribution for the weight concentration prior type
+    bgmm = BayesianGaussianMixture(
+        weight_concentration_prior_type="dirichlet_distribution",
+        n_components=3, random_state=rng).fit(X)
 
-    # Check the weights sum = 1
+    expected_weights = (bgmm.weight_concentration_ /
+                        np.sum(bgmm.weight_concentration_))
+    assert_almost_equal(expected_weights, bgmm.weights_)
     assert_almost_equal(np.sum(bgmm.weights_), 1.0)
 
+    # Case Dirichlet process for the weight concentration prior type
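+    # The expected weights mirror the truncated stick-breaking expectations
+    # computed in BayesianGaussianMixture._set_parameters.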
+    dpgmm = BayesianGaussianMixture(
+        weight_concentration_prior_type="dirichlet_process",
+        n_components=3, random_state=rng).fit(X)
+    weight_dirichlet_sum = (dpgmm.weight_concentration_[0] +
+                            dpgmm.weight_concentration_[1])
+    tmp = dpgmm.weight_concentration_[1] / weight_dirichlet_sum
+    expected_weights = (dpgmm.weight_concentration_[0] / weight_dirichlet_sum *
+                        np.hstack((1, np.cumprod(tmp[:-1]))))
+    expected_weights /= np.sum(expected_weights)
+    assert_almost_equal(expected_weights, dpgmm.weights_)
+    assert_almost_equal(np.sum(dpgmm.weights_), 1.0)
+
 
 @ignore_warnings(category=ConvergenceWarning)
 def test_monotonic_likelihood():
     # We check that each step of variational inference without regularization
     # monotonically improves the lower bound on the training data
     rng = np.random.RandomState(0)
-    rand_data = RandomData(rng, scale=7)
+    rand_data = RandomData(rng, scale=20)
     n_components = rand_data.n_components
 
-    for covar_type in COVARIANCE_TYPE:
-        X = rand_data.X[covar_type]
-        bgmm = BayesianGaussianMixture(n_components=2 * n_components,
-                                       covariance_type=covar_type,
-                                       warm_start=True, max_iter=1,
-                                       random_state=rng, tol=1e-4)
-        current_lower_bound = -np.infty
-        # Do one training iteration at a time so we can make sure that the
-        # training log likelihood increases after each iteration.
-        for _ in range(500):
-            prev_lower_bound = current_lower_bound
-            current_lower_bound = bgmm.fit(X).lower_bound_
-            assert_greater_equal(current_lower_bound, prev_lower_bound)
-
-            if bgmm.converged_:
-                break
-        assert(bgmm.converged_)
+    for prior_type in PRIOR_TYPE:
+        for covar_type in COVARIANCE_TYPE:
+            X = rand_data.X[covar_type]
+            bgmm = BayesianGaussianMixture(
+                weight_concentration_prior_type=prior_type,
+                n_components=2 * n_components, covariance_type=covar_type,
+                warm_start=True, max_iter=1, random_state=rng, tol=1e-4)
+            current_lower_bound = -np.infty
+            # Do one training iteration at a time so we can make sure that the
+            # training lower bound does not decrease after each iteration.
+            for _ in range(600):
+                prev_lower_bound = current_lower_bound
+                current_lower_bound = bgmm.fit(X).lower_bound_
+                assert_greater_equal(current_lower_bound, prev_lower_bound)
+
+                if bgmm.converged_:
+                    break
+            assert bgmm.converged_
 
 
 def test_compare_covar_type():
@@ -284,46 +312,55 @@ def test_compare_covar_type():
     rand_data = RandomData(rng, scale=7)
     X = rand_data.X['full']
     n_components = rand_data.n_components
-    # Computation of the full_covariance
-    bgmm = BayesianGaussianMixture(n_components=2 * n_components,
-                                   covariance_type='full',
-                                   max_iter=1, random_state=0, tol=1e-7)
-    bgmm._check_initial_parameters(X)
-    bgmm._initialize_parameters(X)
-    full_covariances = (bgmm.covariances_ *
-                        bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis])
-
-    # Check tied_covariance = mean(full_covariances, 0)
-    bgmm = BayesianGaussianMixture(n_components=2 * n_components,
-                                   covariance_type='tied',
-                                   max_iter=1, random_state=0, tol=1e-7)
-    bgmm._check_initial_parameters(X)
-    bgmm._initialize_parameters(X)
-
-    tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_
-    assert_almost_equal(tied_covariance, np.mean(full_covariances, 0))
-
-    # Check diag_covariance = diag(full_covariances)
-    bgmm = BayesianGaussianMixture(n_components=2 * n_components,
-                                   covariance_type='diag',
-                                   max_iter=1, random_state=0, tol=1e-7)
-    bgmm._check_initial_parameters(X)
-    bgmm._initialize_parameters(X)
-
-    diag_covariances = (bgmm.covariances_ *
-                        bgmm.degrees_of_freedom_[:, np.newaxis])
-    assert_almost_equal(diag_covariances,
-                        np.array([np.diag(cov) for cov in full_covariances]))
-
-    # Check spherical_covariance = np.mean(diag_covariances, 0)
-    bgmm = BayesianGaussianMixture(n_components=2 * n_components,
-                                   covariance_type='spherical',
-                                   max_iter=1, random_state=0, tol=1e-7)
-    bgmm._check_initial_parameters(X)
-    bgmm._initialize_parameters(X)
-
-    spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_
-    assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1))
+
+    for prior_type in PRIOR_TYPE:
+        # Computation of the full_covariance
+        bgmm = BayesianGaussianMixture(
+            weight_concentration_prior_type=prior_type,
+            n_components=2 * n_components, covariance_type='full',
+            max_iter=1, random_state=0, tol=1e-7)
+        bgmm._check_initial_parameters(X)
+        bgmm._initialize_parameters(X, np.random.RandomState(0))
+        full_covariances = (
+            bgmm.covariances_ *
+            bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis])
+
+        # Check tied_covariance = mean(full_covariances, 0)
+        bgmm = BayesianGaussianMixture(
+            weight_concentration_prior_type=prior_type,
+            n_components=2 * n_components, covariance_type='tied',
+            max_iter=1, random_state=0, tol=1e-7)
+        bgmm._check_initial_parameters(X)
+        bgmm._initialize_parameters(X, np.random.RandomState(0))
+
+        tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_
+        assert_almost_equal(tied_covariance, np.mean(full_covariances, 0))
+
+        # Check diag_covariance = diag(full_covariances)
+        bgmm = BayesianGaussianMixture(
+            weight_concentration_prior_type=prior_type,
+            n_components=2 * n_components, covariance_type='diag',
+            max_iter=1, random_state=0, tol=1e-7)
+        bgmm._check_initial_parameters(X)
+        bgmm._initialize_parameters(X, np.random.RandomState(0))
+
+        diag_covariances = (bgmm.covariances_ *
+                            bgmm.degrees_of_freedom_[:, np.newaxis])
+        assert_almost_equal(diag_covariances,
+                            np.array([np.diag(cov)
+                                     for cov in full_covariances]))
+
+        # Check spherical_covariance = np.mean(diag_covariances, 0)
+        bgmm = BayesianGaussianMixture(
+            weight_concentration_prior_type=prior_type,
+            n_components=2 * n_components, covariance_type='spherical',
+            max_iter=1, random_state=0, tol=1e-7)
+        bgmm._check_initial_parameters(X)
+        bgmm._initialize_parameters(X, np.random.RandomState(0))
+
+        spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_
+        assert_almost_equal(
+            spherical_covariances, np.mean(diag_covariances, 1))
 
 
 @ignore_warnings(category=ConvergenceWarning)
@@ -357,3 +394,28 @@ def test_check_covariance_precision():
         else:
             assert_almost_equal(bgmm.covariances_ * bgmm.precisions_,
                                 np.ones(n_components))
+
+
+@ignore_warnings(category=ConvergenceWarning)
+def test_invariant_translation():
+    # We check here that adding a constant to the data shifts the means
+    # accordingly while leaving the weights and the covariances unchanged.
+    rng = np.random.RandomState(0)
+    rand_data = RandomData(rng, scale=100)
+    n_components = 2 * rand_data.n_components
+
+    for prior_type in PRIOR_TYPE:
+        for covar_type in COVARIANCE_TYPE:
+            X = rand_data.X[covar_type]
+            bgmm1 = BayesianGaussianMixture(
+                weight_concentration_prior_type=prior_type,
+                n_components=n_components, max_iter=100, random_state=0,
+                tol=1e-3, reg_covar=0).fit(X)
+            bgmm2 = BayesianGaussianMixture(
+                weight_concentration_prior_type=prior_type,
+                n_components=n_components, max_iter=100, random_state=0,
+                tol=1e-3, reg_covar=0).fit(X + 100)
+
+            assert_almost_equal(bgmm1.means_, bgmm2.means_ - 100)
+            assert_almost_equal(bgmm1.weights_, bgmm2.weights_)
+            assert_almost_equal(bgmm1.covariances_, bgmm2.covariances_)
diff --git a/sklearn/mixture/tests/test_dpgmm.py b/sklearn/mixture/tests/test_dpgmm.py
index 363d31bc29f4fc0babbfad26ae2fa7d159ae1980..8ca38626b4cefccbb2fcd9c31a3ac0f07c474dab 100644
--- a/sklearn/mixture/tests/test_dpgmm.py
+++ b/sklearn/mixture/tests/test_dpgmm.py
@@ -1,3 +1,9 @@
+# Important note for the deprecation cleaning of 0.20:
+# All the functions and classes of this file have been deprecated in 0.18.
+# When you remove this file please also remove the related files
+# - 'sklearn/mixture/dpgmm.py'
+# - 'sklearn/mixture/gmm.py'
+# - 'sklearn/mixture/tests/test_gmm.py'
 import unittest
 import sys
 
@@ -143,10 +149,12 @@ def test_wishart_logz():
 
 @ignore_warnings(category=DeprecationWarning)
 def test_DPGMM_deprecation():
-    assert_warns_message(DeprecationWarning, "The DPGMM class is"
-                         " not working correctly and it's better "
-                         "to not use it. DPGMM is deprecated in 0.18 "
-                         "and will be removed in 0.20.", DPGMM)
+    assert_warns_message(
+      DeprecationWarning, "The `DPGMM` class is not working correctly and "
+      "it's better to use `sklearn.mixture.BayesianGaussianMixture` class "
+      "with parameter `weight_concentration_prior_type='dirichlet_process'` "
+      "instead. DPGMM is deprecated in 0.18 and will be removed in 0.20.",
+      DPGMM)
 
 
 def do_model(self, **kwds):
@@ -183,11 +191,12 @@ class TestDPGMMWithFullCovars(unittest.TestCase, DPGMMTester):
 
 
 def test_VBGMM_deprecation():
-    assert_warns_message(DeprecationWarning, "The VBGMM class is not working "
-                         "correctly and it's better to use "
-                         "sklearn.mixture.BayesianGaussianMixture class "
-                         "instead. VBGMM is deprecated in 0.18 and will be "
-                         "removed in 0.20.", VBGMM)
+    assert_warns_message(
+        DeprecationWarning, "The `VBGMM` class is not working correctly and "
+        "it's better to use `sklearn.mixture.BayesianGaussianMixture` class "
+        "with parameter `weight_concentration_prior_type="
+        "'dirichlet_distribution'` instead. VBGMM is deprecated "
+        "in 0.18 and will be removed in 0.20.", VBGMM)
 
 
 class VBGMMTester(GMMTester):
diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py
index 194248ac7a0dffd3daaad1d4574d810ba9742236..43d61e010a3a07066c990f9ebf06b2f47e02dad4 100644
--- a/sklearn/mixture/tests/test_gaussian_mixture.py
+++ b/sklearn/mixture/tests/test_gaussian_mixture.py
@@ -33,6 +33,7 @@ from sklearn.utils.testing import assert_greater_equal
 from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_warns_message
+from sklearn.utils.testing import ignore_warnings
 
 
 COVARIANCE_TYPE = ['full', 'tied', 'diag', 'spherical']
@@ -650,7 +651,7 @@ def test_gaussian_mixture_fit_convergence_warning():
         assert_warns_message(ConvergenceWarning,
                              'Initialization %d did not converged. '
                              'Try different init parameters, '
-                             'or increase n_init, tol '
+                             'or increase max_iter, tol '
                              'or check for degenerate data.'
                              % max_iter, g.fit, X)
 
@@ -913,3 +914,60 @@ def test_property():
                                       gmm.covariances_)
         else:
             assert_array_almost_equal(gmm.precisions_, 1. / gmm.covariances_)
+
+
+def test_sample():
+    rng = np.random.RandomState(0)
+    rand_data = RandomData(rng, scale=7)
+    n_features, n_components = rand_data.n_features, rand_data.n_components
+
+    for covar_type in COVARIANCE_TYPE:
+        X = rand_data.X[covar_type]
+
+        gmm = GaussianMixture(n_components=n_components,
+                              covariance_type=covar_type, random_state=rng)
+        # To sample we need that GaussianMixture is fitted
+        assert_raise_message(NotFittedError, "This GaussianMixture instance "
+                             "is not fitted", gmm.sample, 0)
+        gmm.fit(X)
+
+        assert_raise_message(ValueError, "Invalid value for 'n_samples",
+                             gmm.sample, 0)
+
+        # Just to make sure the class samples correctly
+        X_s, y_s = gmm.sample(20000)
+        for k in range(n_components):
+            if covar_type == 'full':
+                assert_array_almost_equal(gmm.covariances_[k],
+                                          np.cov(X_s[y_s == k].T), decimal=1)
+            elif covar_type == 'tied':
+                assert_array_almost_equal(gmm.covariances_,
+                                          np.cov(X_s[y_s == k].T), decimal=1)
+            elif covar_type == 'diag':
+                assert_array_almost_equal(gmm.covariances_[k],
+                                          np.diag(np.cov(X_s[y_s == k].T)),
+                                          decimal=1)
+            else:
+                assert_array_almost_equal(
+                    gmm.covariances_[k], np.var(X_s[y_s == k] - gmm.means_[k]),
+                    decimal=1)
+
+        means_s = np.array([np.mean(X_s[y_s == k], 0)
+                           for k in range(n_components)])
+        assert_array_almost_equal(gmm.means_, means_s, decimal=1)
+
+
+@ignore_warnings(category=ConvergenceWarning)
+def test_init():
+    # We check that increasing the number of initializations (n_init)
+    # yields a solution with a higher lower bound.
+    random_state = 0
+    rand_data = RandomData(np.random.RandomState(random_state), scale=1)
+    n_components = rand_data.n_components
+    X = rand_data.X['full']
+
+    gmm1 = GaussianMixture(n_components=n_components, n_init=1,
+                           max_iter=1, random_state=random_state).fit(X)
+    gmm2 = GaussianMixture(n_components=n_components, n_init=100,
+                           max_iter=1, random_state=random_state).fit(X)
+
+    assert_greater(gmm2.lower_bound_, gmm1.lower_bound_)
diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py
index 66e581a582e0d73997a4b133ff9a135dedfb87e9..55f0dfb83f225eeb53ec4d9386f06f88ed6acb40 100644
--- a/sklearn/mixture/tests/test_gmm.py
+++ b/sklearn/mixture/tests/test_gmm.py
@@ -1,5 +1,9 @@
-# These tests are those of the deprecated GMM class
-
+# Important note for the deprecation cleaning of 0.20:
+# All the functions and classes of this file have been deprecated in 0.18.
+# When you remove this file please also remove the related files
+# - 'sklearn/mixture/dpgmm.py'
+# - 'sklearn/mixture/gmm.py'
+# - 'sklearn/mixture/tests/test_dpgmm.py'
 import unittest
 import copy
 import sys