diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index 63b1b2a865e60197c81cd9bef753bb996eace769..8fbf40a1145e426b5ec269547cdf6e9e74b0afc7 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -186,6 +186,10 @@ class DBSCAN(BaseEstimator, ClusterMixin):
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square. X may be a sparse matrix, in which case only "nonzero"
         elements may be considered neighbors for DBSCAN.
+
+        .. versionadded:: 0.17
+           metric *precomputed* to accept a precomputed sparse distance matrix.
+
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         The algorithm to be used by the NearestNeighbors module
         to compute pointwise distances and find nearest neighbors.
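
# Usage sketch (illustrative, not part of the patch): with the 0.17 change
# above, metric='precomputed' accepts a sparse distance matrix whose stored
# entries are the only candidate neighbors.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import radius_neighbors_graph

X = np.random.RandomState(0).rand(30, 2)
D = radius_neighbors_graph(X, radius=0.5, mode='distance')  # sparse distances
labels = DBSCAN(eps=0.3, min_samples=3, metric='precomputed').fit_predict(D)
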
diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py
index 703a21fc5745dea9f87dd320eae5fc7ed0820e1b..322a37dbe95643fc69a058a6b7fb1c0f69ea9c14 100644
--- a/sklearn/cluster/mean_shift_.py
+++ b/sklearn/cluster/mean_shift_.py
@@ -147,6 +147,9 @@ def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False,
         (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
         are used.
 
+        .. versionadded:: 0.17
+           Parallel execution using *n_jobs*.
+
     Returns
     -------
 
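
# Usage sketch (illustrative, not part of the patch) of the parallel path
# enabled by n_jobs above.
import numpy as np
from sklearn.cluster import mean_shift

X = np.random.RandomState(0).randn(100, 2)
cluster_centers, labels = mean_shift(X, bandwidth=2.0, n_jobs=2)
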
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 1b172a2ed001475a2f4f0de0f55ad1d279feca89..1aa1dd64206cb3d9c66258ff8856230cb4d28bb4 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -351,6 +351,8 @@ class LabelKFold(_BaseKFold):
     The folds are approximately balanced in the sense that the number of
     distinct labels is approximately the same in each fold.
 
+    .. versionadded:: 0.17
+
     Parameters
     ----------
     labels : array-like with shape (n_samples, )
@@ -1098,6 +1100,8 @@ class LabelShuffleSplit(ShuffleSplit):
     Note: The parameters ``test_size`` and ``train_size`` refer to labels, and
     not to samples, as in ShuffleSplit.
 
+    .. versionadded:: 0.17
+
     Parameters
     ----------
     labels :  array, [n_samples]
@@ -1820,6 +1824,9 @@ def train_test_split(*arrays, **options):
         If not None, data is split in a stratified fashion, using this as
         the labels array.
 
+        .. versionadded:: 0.17
+           parameter *stratify* for stratified splitting.
+
     Returns
     -------
     splitting : list, length = 2 * len(arrays),
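
# Usage sketch (illustrative, not part of the patch) of the 0.17 additions
# above: LabelKFold and the new stratify option of train_test_split.
from sklearn.cross_validation import LabelKFold, train_test_split

X = [[i] for i in range(8)]
y = [0, 0, 0, 0, 1, 1, 1, 1]
# Class proportions of y are preserved in both halves of the split.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

labels = [1, 1, 2, 2, 3, 3, 4, 4]
for train_idx, test_idx in LabelKFold(labels, n_folds=2):
    pass  # no label appears in both train_idx and test_idx
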
diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py
index 5f57fb09bd8928dd4a5e3101c6c8f0a919ddd65a..4f25528ed0ff2148b826b5b2a00938ce33c8c47d 100644
--- a/sklearn/datasets/rcv1.py
+++ b/sklearn/datasets/rcv1.py
@@ -51,6 +51,8 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
 
     Read more in the :ref:`User Guide <datasets>`.
 
+    .. versionadded:: 0.17
+
     Parameters
     ----------
     data_home : string, optional
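
# Usage sketch (illustrative, not part of the patch; downloads the dataset
# on first call).
from sklearn.datasets import fetch_rcv1

rcv1 = fetch_rcv1(subset='train')
rcv1.data    # sparse CSR matrix of TF-IDF features
rcv1.target  # sparse multilabel indicator matrix
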
diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py
index 6ad641afa3201f8a5eeb707a53a10f033b5a54c0..6a1533b9b6abafb32f668180d957bdf75208fa47 100644
--- a/sklearn/datasets/samples_generator.py
+++ b/sklearn/datasets/samples_generator.py
@@ -294,6 +294,9 @@ def make_multilabel_classification(n_samples=100, n_features=20, n_classes=5,
     sparse : bool, optional (default=False)
         If ``True``, return a sparse feature matrix
 
+        .. versionadded:: 0.17
+           parameter *sparse* to return a sparse feature matrix.
+
     return_indicator : 'dense' (default) | 'sparse' | False
         If ``dense`` return ``Y`` in the dense binary indicator format. If
         ``'sparse'`` return ``Y`` in the sparse binary indicator format.
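
# Usage sketch (illustrative, not part of the patch) of the new sparse
# feature-matrix output.
from sklearn.datasets import make_multilabel_classification

X, Y = make_multilabel_classification(n_samples=10, sparse=True,
                                      return_indicator='sparse',
                                      random_state=0)
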
diff --git a/sklearn/datasets/svmlight_format.py b/sklearn/datasets/svmlight_format.py
index 892f139ff10e940dbbff40648917106822671a08..7793410d38b444507b9bfa9eceaa7933032d853b 100644
--- a/sklearn/datasets/svmlight_format.py
+++ b/sklearn/datasets/svmlight_format.py
@@ -342,7 +342,7 @@ def dump_svmlight_file(X, y, f,  zero_based=True, comment=None, query_id=None,
         n_features is the number of features.
 
     y : array-like, shape = [n_samples] or [n_samples, n_labels]
-        Target values. Class labels must be an integer or float, or array-like 
+        Target values. Class labels must be an integer or float, or array-like
         objects of integer or float for multilabel classifications.
 
     f : string or file-like in binary mode
@@ -369,6 +369,9 @@ def dump_svmlight_file(X, y, f,  zero_based=True, comment=None, query_id=None,
     multilabel: boolean, optional
         Samples may have several labels each (see
         http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)
+
+        .. versionadded:: 0.17
+           parameter *multilabel* to support multilabel datasets.
     """
     if comment is not None:
         # Convert comment string to list of lines in UTF-8.
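
# Usage sketch (illustrative, not part of the patch) of the new multilabel
# flag: y holds one tuple of labels per sample; f must be binary-mode.
from io import BytesIO
from sklearn.datasets import dump_svmlight_file

X = [[1.0, 0.0], [0.0, 2.0]]
y = [(0, 2), (1,)]
f = BytesIO()
dump_svmlight_file(X, y, f, multilabel=True)
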
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index 03a85baa67853533897e940dc3e8b7a2362f369e..856bb111163840f196461a680f5b58c31f39e59d 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -948,6 +948,9 @@ class DictionaryLearning(BaseEstimator, SparseCodingMixin):
         Lasso solution (linear_model.Lasso). Lars will be faster if
         the estimated components are sparse.
 
+        .. versionadded:: 0.17
+           *cd* coordinate descent method to improve speed.
+
     transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \
     'threshold'}
         Algorithm used to transform the data
@@ -960,6 +963,9 @@ class DictionaryLearning(BaseEstimator, SparseCodingMixin):
         threshold: squashes to zero all coefficients less than alpha from
         the projection ``dictionary * X'``
 
+        .. versionadded:: 0.17
+           *lasso_cd* coordinate descent method to improve speed.
+
     transform_n_nonzero_coefs : int, ``0.1 * n_features`` by default
         Number of nonzero coefficients to target in each column of the
         solution. This is only used by `algorithm='lars'` and `algorithm='omp'`
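
# Usage sketch (illustrative, not part of the patch) of the new
# coordinate-descent options documented above.
import numpy as np
from sklearn.decomposition import DictionaryLearning

X = np.random.RandomState(0).randn(20, 8)
dico = DictionaryLearning(n_components=5, fit_algorithm='cd',
                          transform_algorithm='lasso_cd', random_state=0)
code = dico.fit(X).transform(X)
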
diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py
index 0c27b1f7c4e4e06c27522bfb2647a305dc8a1e4c..0e212d8f32bd4844b0fae62399c30b314899551c 100644
--- a/sklearn/decomposition/nmf.py
+++ b/sklearn/decomposition/nmf.py
@@ -841,8 +841,14 @@ class NMF(BaseEstimator, TransformerMixin):
 
     solver : 'pg' | 'cd'
         Numerical solver to use:
-        'pg' is a (deprecated) Projected Gradient solver.
-        'cd' is a Coordinate Descent solver.
+        'pg' is a Projected Gradient solver (deprecated).
+        'cd' is a Coordinate Descent solver (recommended).
+
+        .. versionadded:: 0.17
+           Coordinate Descent solver.
+
+        .. versionchanged:: 0.17
+           Deprecated Projected Gradient solver.
 
     tol : double, default: 1e-4
         Tolerance value used in stopping conditions.
@@ -857,6 +863,9 @@ class NMF(BaseEstimator, TransformerMixin):
         Constant that multiplies the regularization terms. Set it to zero to
         have no regularization.
 
+        .. versionadded:: 0.17
+           *alpha* used in the Coordinate Descent solver.
+
     l1_ratio : double, default: 0.
         The regularization mixing parameter, with 0 <= l1_ratio <= 1.
         For l1_ratio = 0 the penalty is an elementwise L2 penalty
@@ -864,25 +873,47 @@ class NMF(BaseEstimator, TransformerMixin):
         For l1_ratio = 1 it is an elementwise L1 penalty.
         For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
 
+        .. versionadded:: 0.17
+           Regularization parameter *l1_ratio* used in the Coordinate
+           Descent solver.
+
     shuffle : boolean, default: False
         If true, randomize the order of coordinates in the CD solver.
 
+        .. versionadded:: 0.17
+           *shuffle* parameter used in the Coordinate Descent solver.
+
     nls_max_iter : integer, default: 2000
         Number of iterations in NLS subproblem.
         Used only in the deprecated 'pg' solver.
 
+        .. versionchanged:: 0.17
+           Deprecated Projected Gradient solver. Use Coordinate Descent solver
+           instead.
+
     sparseness : 'data' | 'components' | None, default: None
         Where to enforce sparsity in the model.
         Used only in the deprecated 'pg' solver.
 
+        .. versionchanged:: 0.17
+           Deprecated Projected Gradient solver. Use Coordinate Descent solver
+           instead.
+
     beta : double, default: 1
         Degree of sparseness, if sparseness is not None. Larger values mean
         more sparseness. Used only in the deprecated 'pg' solver.
 
+        .. versionchanged:: 0.17
+           Deprecated Projected Gradient solver. Use Coordinate Descent solver
+           instead.
+
     eta : double, default: 0.1
         Degree of correctness to maintain, if sparsity is not None. Smaller
         values mean larger error. Used only in the deprecated 'pg' solver.
 
+        .. versionchanged:: 0.17
+           Deprecated Projected Gradient solver. Use Coordinate Descent solver
+           instead.
+
     Attributes
     ----------
     components_ : array, [n_components, n_features]
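
# Usage sketch (illustrative, not part of the patch) of the 0.17 'cd' solver
# together with its new regularization parameters.
import numpy as np
from sklearn.decomposition import NMF

X = np.abs(np.random.RandomState(0).randn(10, 6))
model = NMF(n_components=3, solver='cd', alpha=0.1, l1_ratio=0.5,
            shuffle=False, random_state=0)
W = model.fit_transform(X)
H = model.components_
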
diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index d3b8bbf6d884809f5c5bd26f0dc4009e17096130..a26c35e2837b16fe883d6f59f1b07a63d942add6 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -135,6 +135,8 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior,
 class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
     """Latent Dirichlet Allocation with online variational Bayes algorithm
 
+    .. versionadded:: 0.17
+
     Parameters
     ----------
     n_topics : int, optional (default=10)
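
# Usage sketch (illustrative, not part of the patch) of the new estimator,
# using the 0.17 parameter name n_topics.
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

counts = np.random.RandomState(0).randint(0, 5, size=(20, 30))
lda = LatentDirichletAllocation(n_topics=5, learning_method='online',
                                random_state=0).fit(counts)
doc_topic = lda.transform(counts)
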
diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py
index d8dcb47a503efd65182668ee68be60d506503df3..6ee9bb9741ebd56e9ea0f5c36dd4c59a2991a218 100644
--- a/sklearn/discriminant_analysis.py
+++ b/sklearn/discriminant_analysis.py
@@ -140,6 +140,12 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin,
     The fitted model can also be used to reduce the dimensionality of the input
     by projecting it to the most discriminative directions.
 
+    .. versionadded:: 0.17
+       *LinearDiscriminantAnalysis*.
+
+    .. versionchanged:: 0.17
+       Deprecated :class:`lda.LDA` has been moved to
+       *LinearDiscriminantAnalysis*.
+
     Parameters
     ----------
     solver : string, optional
@@ -167,9 +173,13 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin,
     store_covariance : bool, optional
         Additionally compute class covariance matrix (default False).
 
+        .. versionadded:: 0.17
+
     tol : float, optional
         Threshold used for rank estimation in SVD solver.
 
+        .. versionadded:: 0.17
+
     Attributes
     ----------
     coef_ : array, shape (n_features,) or (n_classes, n_features)
@@ -399,6 +409,12 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin,
         """Fit LinearDiscriminantAnalysis model according to the given
            training data and parameters.
 
+           .. versionchanged:: 0.17
+              Deprecated *store_covariance* has been moved to the main
+              constructor.
+
+           .. versionchanged:: 0.17
+              Deprecated *tol* has been moved to the main constructor.
+
         Parameters
         ----------
         X : array-like, shape (n_samples, n_features)
@@ -532,6 +548,12 @@ class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin):
 
     The model fits a Gaussian density to each class.
 
+    .. versionadded:: 0.17
+       *QuadraticDiscriminantAnalysis*.
+
+    .. versionchanged:: 0.17
+       Deprecated :class:`qda.QDA` has been moved to
+       *QuadraticDiscriminantAnalysis*.
+
     Parameters
     ----------
     priors : array, optional, shape = [n_classes]
@@ -567,9 +589,13 @@ class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin):
         If True the covariance matrices are computed and stored in the
         `self.covariances_` attribute.
 
+        .. versionadded:: 0.17
+
     tol : float, optional, default 1.0e-4
         Threshold used for rank estimation.
 
+        .. versionadded:: 0.17
+
     Examples
     --------
     >>> from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
@@ -600,6 +626,12 @@ class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin):
     def fit(self, X, y, store_covariances=None, tol=None):
         """Fit the model according to the given training data and parameters.
 
+            .. versionchanged:: 0.17
+               Deprecated *store_covariances* has been moved to the main
+               constructor.
+
+            .. versionchanged:: 0.17
+               Deprecated *tol* has been moved to the main constructor.
+
         Parameters
         ----------
         X : array-like, shape = [n_samples, n_features]
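
# Usage sketch (illustrative, not part of the patch) of the renamed
# estimators; lda.LDA and qda.QDA now only emit deprecation warnings.
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)

X = [[-1, -1], [-2, -1], [1, 1], [2, 1]]
y = [0, 0, 1, 1]
lda = LinearDiscriminantAnalysis(store_covariance=True, tol=1e-4).fit(X, y)
qda = QuadraticDiscriminantAnalysis(store_covariances=True).fit(X, y)
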
diff --git a/sklearn/dummy.py b/sklearn/dummy.py
index f227fd5f99655966124592af15eb074629731c69..ebb83baed4e662379bc5c8c944d63daa84b08a30 100644
--- a/sklearn/dummy.py
+++ b/sklearn/dummy.py
@@ -42,6 +42,10 @@ class DummyClassifier(BaseEstimator, ClassifierMixin):
           the user. This is useful for metrics that evaluate a non-majority
           class
 
+          .. versionadded:: 0.17
+             DummyClassifier now supports the prior fitting strategy via
+             *strategy='prior'*.
+
     random_state : int seed, RandomState instance, or None (default)
         The seed of the pseudo random number generator to use.
 
diff --git a/sklearn/ensemble/bagging.py b/sklearn/ensemble/bagging.py
index ad12550350a06782b487a4c5967ab73822e3c640..59347935ff9768a35927351c38bae5a91d444cb6 100644
--- a/sklearn/ensemble/bagging.py
+++ b/sklearn/ensemble/bagging.py
@@ -414,6 +414,9 @@ class BaggingClassifier(BaseBagging, ClassifierMixin):
         and add more estimators to the ensemble, otherwise, just fit
         a whole new ensemble.
 
+        .. versionadded:: 0.17
+           *warm_start* constructor parameter.
+
     n_jobs : int, optional (default=1)
         The number of jobs to run in parallel for both `fit` and `predict`.
         If -1, then the number of jobs is set to the number of cores.
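
# Usage sketch (illustrative, not part of the patch) of warm_start: grow the
# ensemble incrementally instead of refitting from scratch.
from sklearn.ensemble import BaggingClassifier

X = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 0, 1, 1]
bag = BaggingClassifier(n_estimators=5, warm_start=True).fit(X, y)
bag.set_params(n_estimators=10)
bag.fit(X, y)  # fits 5 additional estimators and keeps the first 5
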
diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
index 8f9a3f1c98b61d70a4f88f3b81d2eea85d0ffcd8..46ec28e379b68f84e08a67e585243a66133a3ec0 100644
--- a/sklearn/ensemble/gradient_boosting.py
+++ b/sklearn/ensemble/gradient_boosting.py
@@ -1226,6 +1226,8 @@ class BaseGradientBoosting(six.with_metaclass(ABCMeta, BaseEnsemble,
     def apply(self, X):
         """Apply trees in the ensemble to X, return leaf indices.
 
+        .. versionadded:: 0.17
+
         Parameters
         ----------
         X : array-like or sparse matrix, shape = [n_samples, n_features]
@@ -1361,6 +1363,9 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
         default to normal sorting on sparse data. Setting presort to true on
         sparse data will raise an error.
 
+        .. versionadded:: 0.17
+           *presort* parameter.
+
     Attributes
     ----------
     feature_importances_ : array, shape = [n_features]
@@ -1696,6 +1701,9 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
         default to normal sorting on sparse data. Setting presort to true on
         sparse data will raise an error.
 
+        .. versionadded:: 0.17
+           *presort* parameter.
+
     Attributes
     ----------
     feature_importances_ : array, shape = [n_features]
@@ -1795,6 +1803,8 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
     def apply(self, X):
         """Apply trees in the ensemble to X, return leaf indices.
 
+        .. versionadded:: 0.17
+
         Parameters
         ----------
         X : array-like or sparse matrix, shape = [n_samples, n_features]
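
# Usage sketch (illustrative, not part of the patch) of the new apply method
# and presort option documented above.
from sklearn.ensemble import GradientBoostingClassifier

X = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 0, 1, 1]
gbm = GradientBoostingClassifier(n_estimators=3, presort='auto').fit(X, y)
leaves = gbm.apply(X)  # leaf index per sample, per boosting stage
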
diff --git a/sklearn/ensemble/voting_classifier.py b/sklearn/ensemble/voting_classifier.py
index 496a9740f3393c4087dbfad6ccea884d2b860fd9..17540f2b5e7e8725bb29e9ca2da211f8ffef57ba 100644
--- a/sklearn/ensemble/voting_classifier.py
+++ b/sklearn/ensemble/voting_classifier.py
@@ -24,6 +24,7 @@ from ..externals import six
 class VotingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
     """Soft Voting/Majority Rule classifier for unfitted estimators.
 
+    .. versionadded:: 0.17
+
     Read more in the :ref:`User Guide <voting_classifier>`.
 
     Parameters
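
# Usage sketch (illustrative, not part of the patch) of the new
# meta-estimator with soft (probability-averaged) voting.
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 0, 1, 1]
eclf = VotingClassifier(estimators=[('lr', LogisticRegression()),
                                    ('gnb', GaussianNB())],
                        voting='soft').fit(X, y)
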
diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py
index 81e35a8000adf626459d2cf8bc69baf23cbf0d42..b3a38dc94788bac821e53ee97d73d8b910b6a4d1 100644
--- a/sklearn/feature_selection/from_model.py
+++ b/sklearn/feature_selection/from_model.py
@@ -146,6 +146,8 @@ class _LearntSelectorMixin(TransformerMixin):
 class SelectFromModel(BaseEstimator, SelectorMixin):
     """Meta-transformer for selecting features based on importance weights.
 
+    .. versionadded:: 0.17
+
     Parameters
     ----------
     estimator : object
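
# Usage sketch (illustrative, not part of the patch) of the new
# meta-transformer: keep features whose importance (here, absolute
# coefficients) clears the threshold.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

X = [[0.87, -1.34, 0.31], [-2.79, -0.02, -0.85],
     [-1.34, -0.48, -2.55], [1.92, 1.48, 0.65]]
y = [0, 1, 0, 1]
selector = SelectFromModel(LogisticRegression(), threshold='mean').fit(X, y)
X_reduced = selector.transform(X)
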
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index 1f63e5f65a677c8c29ab0d3749bbda071f0b692b..f7db1d66113cc6aa34ea4a04c1991dc0b678ee9f 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -645,6 +645,9 @@ class GridSearchCV(BaseSearchCV):
     n_jobs : int, default=1
         Number of jobs to run in parallel.
 
+        .. versionchanged:: 0.17
+           Upgraded to joblib 0.9.3.
+
     pre_dispatch : int, or string, optional
         Controls the number of jobs that get dispatched during parallel
         execution. Reducing this number can be useful to avoid an
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index 53efca112232bc9fa9a38cd071e2f96627f6065d..22c66f2ba1a5b4c320d6522b097ef8c5b8a14d1a 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -414,6 +414,9 @@ class LinearRegression(LinearModel, RegressorMixin):
         sample_weight : numpy array of shape [n_samples]
             Individual weights for each sample
 
+            .. versionadded:: 0.17
+               parameter *sample_weight* support to LinearRegression.
+
         Returns
         -------
         self : returns an instance of self.
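
# Usage sketch (illustrative, not part of the patch) of the new per-sample
# weights in LinearRegression.fit.
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[1.], [2.], [3.], [4.]])
y = np.array([1., 2., 3., 10.])
w = np.array([1., 1., 1., 0.1])  # down-weight the outlier
reg = LinearRegression().fit(X, y, sample_weight=w)
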
diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py
index 8ed571619e7eb903bd99ae141eaf18303cdeee1e..70086b35543d3712beb647fe6e25425b206d0da1 100644
--- a/sklearn/linear_model/logistic.py
+++ b/sklearn/linear_model/logistic.py
@@ -987,6 +987,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
         Note that these weights will be multiplied with sample_weight (passed
         through the fit method) if sample_weight is specified.
 
+        .. versionadded:: 0.17
+           *class_weight='balanced'* instead of deprecated *class_weight='auto'*.
+
     max_iter : int
         Useful only for the newton-cg, sag and lbfgs solvers.
         Maximum number of iterations taken for the solvers to converge.
@@ -1009,6 +1012,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
         approximately the same scale. You can preprocess the data with a
         scaler from sklearn.preprocessing.
 
+        .. versionadded:: 0.17
+           Stochastic Average Gradient descent solver.
+
     tol : float, optional
         Tolerance for stopping criteria.
 
@@ -1028,6 +1034,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
         initialization, otherwise, just erase the previous solution.
         Useless for liblinear solver.
 
+        .. versionadded:: 0.17
+           *warm_start* to support *lbfgs*, *newton-cg*, *sag* solvers.
+
     n_jobs : int, optional
         Number of CPU cores used during the cross-validation loop. If given
         a value of -1, all cores are used.
@@ -1111,6 +1120,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
             Array of weights that are assigned to individual samples.
             If not provided, then each sample is given unit weight.
 
+            .. versionadded:: 0.17
+               *sample_weight* support to LogisticRegression.
+
         Returns
         -------
         self : object
@@ -1307,6 +1319,9 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
         Note that these weights will be multiplied with sample_weight (passed
         through the fit method) if sample_weight is specified.
 
+        .. versionadded:: 0.17
+           *class_weight='balanced'*
+
     cv : integer or cross-validation generator
         The default cross-validation generator used is Stratified K-Folds.
         If an integer is provided, then it is the number of folds used.
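
# Usage sketch (illustrative, not part of the patch) combining the 0.17
# additions above: the 'sag' solver, warm_start, and sample_weight in fit.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X, y = rng.randn(100, 4), rng.randint(0, 2, 100)
clf = LogisticRegression(solver='sag', warm_start=True, max_iter=200)
clf.fit(X, y, sample_weight=np.ones(100))
clf.fit(X, y)  # warm_start: optimization restarts from the previous solution
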
diff --git a/sklearn/linear_model/passive_aggressive.py b/sklearn/linear_model/passive_aggressive.py
index 5359d19f6c750555b34dc706c9522c362e0789c9..8419383efa17e693dba5b108c6f6b58f254df8a0 100644
--- a/sklearn/linear_model/passive_aggressive.py
+++ b/sklearn/linear_model/passive_aggressive.py
@@ -59,6 +59,9 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
         weights inversely proportional to class frequencies in the input data
         as ``n_samples / (n_classes * np.bincount(y))``
 
+        .. versionadded:: 0.17
+           parameter *class_weight* to automatically weight samples.
+
     Attributes
     ----------
     coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\
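
# Usage sketch (illustrative, not part of the patch) of the new
# class_weight='balanced' option on an imbalanced toy set.
from sklearn.linear_model import PassiveAggressiveClassifier

X = [[0, 0], [0, 1], [1, 0], [1, 1], [5, 5], [6, 6]]
y = [0, 0, 0, 0, 1, 1]
clf = PassiveAggressiveClassifier(class_weight='balanced').fit(X, y)
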
diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py
index 77e315fc40edc24ee6f8b8e1c161bf1b6fac13bf..1683011e5194221e9420cb279681bb57da84bb0b 100644
--- a/sklearn/linear_model/ridge.py
+++ b/sklearn/linear_model/ridge.py
@@ -222,6 +222,8 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
         Individual weights for each sample. If sample_weight is not None and
         solver='auto', the solver will be set to 'cholesky'.
 
+        .. versionadded:: 0.17
+
     solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'}
         Solver to use in the computational routines:
 
@@ -544,6 +546,9 @@ class Ridge(_BaseRidge, RegressorMixin):
         All last four solvers support both dense and sparse data. However,
         only 'sag' supports sparse input when `fit_intercept` is True.
 
+        .. versionadded:: 0.17
+           Stochastic Average Gradient descent solver.
+
     tol : float
         Precision of the solution.
 
@@ -551,6 +556,9 @@ class Ridge(_BaseRidge, RegressorMixin):
         The seed of the pseudo random number generator to use when
         shuffling the data. Used in 'sag' solver.
 
+        .. versionadded:: 0.17
+           *random_state* to support Stochastic Average Gradient.
+
     Attributes
     ----------
     coef_ : array, shape (n_features,) or (n_targets, n_features)
@@ -672,6 +680,9 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
           iterative procedure, and is faster than other solvers when both
           n_samples and n_features are large.
 
+          .. versionadded:: 0.17
+             Stochastic Average Gradient descent solver.
+
     tol : float
         Precision of the solution.
 
@@ -725,6 +736,9 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
         sample_weight : float or numpy array of shape (n_samples,)
             Sample weight.
 
+            .. versionadded:: 0.17
+               *sample_weight* support to RidgeClassifier.
+
         Returns
         -------
         self : returns an instance of self.
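
# Usage sketch (illustrative, not part of the patch) of the new 'sag' solver
# and of sample_weight in RidgeClassifier.fit.
import numpy as np
from sklearn.linear_model import Ridge, RidgeClassifier

rng = np.random.RandomState(42)
X, y = rng.randn(200, 5), rng.randn(200)
reg = Ridge(alpha=1.0, solver='sag', random_state=42).fit(X, y)

Xc, yc = rng.randn(50, 3), rng.randint(0, 2, 50)
clf = RidgeClassifier().fit(Xc, yc, sample_weight=np.ones(50))
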
diff --git a/sklearn/linear_model/sag.py b/sklearn/linear_model/sag.py
index 48104e9a610b765f0d49922aa203e933b9e34999..b4f7474f2e113520000c7e046a74bfd114958186 100644
--- a/sklearn/linear_model/sag.py
+++ b/sklearn/linear_model/sag.py
@@ -76,6 +76,8 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1.,
     The regularizer is a penalty added to the loss function that shrinks model
     parameters towards the zero vector using the squared euclidean norm L2.
 
+    .. versionadded:: 0.17
+
     Parameters
     ----------
     X : {array-like, sparse matrix}, shape (n_samples, n_features)
diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py
index 2191cc5be2d45b7659b2945b6b386ab3a748d237..dcb9622f04d07afcd2706c20cd9c4d586e0ceebd 100644
--- a/sklearn/manifold/t_sne.py
+++ b/sklearn/manifold/t_sne.py
@@ -548,6 +548,9 @@ class TSNE(BaseEstimator):
         Maximum number of iterations without progress before we abort the
         optimization.
 
+        .. versionadded:: 0.17
+           parameter *n_iter_without_progress* to control stopping criteria.
+
     min_grad_norm : float, optional (default: 1E-7)
         If the gradient norm is below this threshold, the optimization will
         be aborted.
@@ -585,6 +588,9 @@ class TSNE(BaseEstimator):
         to be better than 3%. However, the exact method cannot scale to
         millions of examples.
 
+        .. versionadded:: 0.17
+           Approximate optimization *method* via Barnes-Hut.
+
     angle : float (default: 0.5)
         Only used if method='barnes_hut'
         This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
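
# Usage sketch (illustrative, not part of the patch) of the Barnes-Hut
# method and the new n_iter_without_progress stopping knob.
import numpy as np
from sklearn.manifold import TSNE

X = np.random.RandomState(0).rand(100, 10)
emb = TSNE(method='barnes_hut', angle=0.5, n_iter_without_progress=30,
           random_state=0).fit_transform(X)
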
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index 4fbbeb478ca4e88ca404f066246b795e4cfc03e3..f3378961bff91c64b44598d73c6111d3c123880f 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -568,6 +568,9 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         labels are column indices. By default, all labels in ``y_true`` and
         ``y_pred`` are used in sorted order.
 
+        .. versionchanged:: 0.17
+           parameter *labels* improved for multiclass problems.
+
     pos_label : str or int, 1 by default
         The class to report if ``average='binary'``. Until version 0.18 it is
         necessary to set ``pos_label=None`` if seeking to use another averaging
@@ -671,6 +674,9 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
         labels are column indices. By default, all labels in ``y_true`` and
         ``y_pred`` are used in sorted order.
 
+        .. versionchanged:: 0.17
+           parameter *labels* improved for multiclass problems.
+
     pos_label : str or int, 1 by default
         The class to report if ``average='binary'``. Until version 0.18 it is
         necessary to set ``pos_label=None`` if seeking to use another averaging
@@ -1126,6 +1132,9 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
         labels are column indices. By default, all labels in ``y_true`` and
         ``y_pred`` are used in sorted order.
 
+        .. versionchanged:: 0.17
+           parameter *labels* improved for multiclass problems.
+
     pos_label : str or int, 1 by default
         The class to report if ``average='binary'``. Until version 0.18 it is
         necessary to set ``pos_label=None`` if seeking to use another averaging
@@ -1225,6 +1234,9 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
         labels are column indices. By default, all labels in ``y_true`` and
         ``y_pred`` are used in sorted order.
 
+        .. versionchanged:: 0.17
+           parameter *labels* improved for multiclass problems.
+
     pos_label : str or int, 1 by default
         The class to report if ``average='binary'``. Until version 0.18 it is
         necessary to set ``pos_label=None`` if seeking to use another averaging
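
# Usage sketch (illustrative, not part of the patch) of the improved labels
# handling: restrict a macro average to a subset of classes; the same
# pattern applies to fbeta_score, precision_score and recall_score.
from sklearn.metrics import f1_score

y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
f1_score(y_true, y_pred, labels=[0, 1], average='macro')
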
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index f76d85238ae9c9df8d3fef049b05eec53712ccb3..78753a8739b3c8ffa63f8222ffda077ca58260c2 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -822,6 +822,8 @@ def laplacian_kernel(X, Y=None, gamma=None):
     for each pair of rows x in X and y in Y.
     Read more in the :ref:`User Guide <laplacian_kernel>`.
 
+    .. versionadded:: 0.17
+
     Parameters
     ----------
     X : array of shape (n_samples_X, n_features)
@@ -866,6 +868,9 @@ def cosine_similarity(X, Y=None, dense_output=True):
         Whether to return dense output even when the input is sparse. If
         ``False``, the output is sparse if both input arrays are sparse.
 
+        .. versionadded:: 0.17
+           parameter *dense_output* to control dense or sparse output.
+
     Returns
     -------
     kernel matrix : array
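
# Usage sketch (illustrative, not part of the patch) of both additions: the
# new laplacian_kernel, K(x, y) = exp(-gamma * ||x - y||_1), and sparse
# output from cosine_similarity.
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity, laplacian_kernel

X = np.array([[0., 1.], [1., 0.]])
K = laplacian_kernel(X, gamma=0.5)
S = cosine_similarity(sparse.csr_matrix(X), dense_output=False)  # stays sparse
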
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index 51883ca681f981cfce812e5be71f0ccb68d8e681..252288808111a1bab3177ac1b9610c05c6cb8d45 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -444,6 +444,9 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
         on a plotted ROC curve. This is useful in order to create lighter
         ROC curves.
 
+        .. versionadded:: 0.17
+           parameter *drop_intermediate*.
+
     Returns
     -------
     fpr : array, shape = [>2]
@@ -678,6 +681,9 @@ def label_ranking_loss(y_true, y_score, sample_weight=None):
 
     Read more in the :ref:`User Guide <label_ranking_loss>`.
 
+    .. versionadded:: 0.17
+       A function *label_ranking_loss*.
+
     Parameters
     ----------
     y_true : array or sparse matrix, shape = [n_samples, n_labels]
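
# Usage sketch (illustrative, not part of the patch) of the new metric;
# 0 is a perfect ranking, higher is worse.
import numpy as np
from sklearn.metrics import label_ranking_loss

y_true = np.array([[1, 0, 0], [0, 0, 1]])
y_score = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
label_ranking_loss(y_true, y_score)  # 0.75 on this toy pair
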
diff --git a/sklearn/mixture/gmm.py b/sklearn/mixture/gmm.py
index de11daf147106873fcd377d55bca5e9cc4658b80..a4761d767bcef9b5d26d3ba5b2082d71535ae265 100644
--- a/sklearn/mixture/gmm.py
+++ b/sklearn/mixture/gmm.py
@@ -429,6 +429,9 @@ class GMM(BaseEstimator):
         Warning: due to the final maximization step in the EM algorithm,
         with low iterations the prediction may not be 100% accurate
 
+        .. versionadded:: 0.17
+           *fit_predict* method in Gaussian Mixture Model.
+
         Parameters
         ----------
         X : array-like, shape = [n_samples, n_features]
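
# Usage sketch (illustrative, not part of the patch) of the new fit_predict
# shortcut.
import numpy as np
from sklearn.mixture import GMM

rng = np.random.RandomState(0)
X = np.concatenate([rng.randn(50, 2), 5 + rng.randn(50, 2)])
labels = GMM(n_components=2, random_state=0).fit_predict(X)
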
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
index 51bcfef2fb0e540631450e21e9a06e6461502b98..410eb7cf6db98102f39ef3410ca7dd78fd3413ed 100644
--- a/sklearn/naive_bayes.py
+++ b/sklearn/naive_bayes.py
@@ -162,6 +162,9 @@ class GaussianNB(BaseNB):
         sample_weight : array-like, shape (n_samples,), optional
             Weights applied to individual samples (1. for unweighted).
 
+            .. versionadded:: 0.17
+               Gaussian Naive Bayes supports fitting with *sample_weight*.
+
         Returns
         -------
         self : object
@@ -279,6 +282,8 @@ class GaussianNB(BaseNB):
         sample_weight : array-like, shape (n_samples,), optional
             Weights applied to individual samples (1. for unweighted).
 
+            .. versionadded:: 0.17
+
         Returns
         -------
         self : object
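
# Usage sketch (illustrative, not part of the patch) of sample_weight in
# both fit and partial_fit.
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1.], [-2.], [1.], [2.]])
y = np.array([0, 0, 1, 1])
clf = GaussianNB().fit(X, y, sample_weight=np.array([1., 1., 2., 2.]))
clf.partial_fit(X, y, sample_weight=np.ones(4))
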
diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py
index c814b14bf377e92d0a9d6c418d9ad5109b8e38a5..25ed4936a5ca38b5bdcca180b1bf478aed66dee4 100644
--- a/sklearn/preprocessing/_function_transformer.py
+++ b/sklearn/preprocessing/_function_transformer.py
@@ -21,6 +21,8 @@ class FunctionTransformer(BaseEstimator, TransformerMixin):
     Note: If a lambda is used as the function, then the resulting
     transformer will not be pickleable.
 
+    .. versionadded:: 0.17
+
     Parameters
     ----------
     func : callable, optional default=None
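
# Usage sketch (illustrative, not part of the patch) of the new stateless
# transformer: wrap an arbitrary callable for use in a Pipeline.
import numpy as np
from sklearn.preprocessing import FunctionTransformer

transformer = FunctionTransformer(np.log1p)
X_new = transformer.transform(np.array([[0., 1.], [2., 3.]]))
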
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 54ea1ed4fc8e85a3414886bb65bffc5af8e125f0..76e7ba58eb8f89244a2d2987455d8d475ad7ee67 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -226,14 +226,26 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
     scale_ : ndarray, shape (n_features,)
         Per feature relative scaling of the data.
 
+        .. versionadded:: 0.17
+           *scale_* attribute.
+
     data_min_ : ndarray, shape (n_features,)
         Per feature minimum seen in the data
 
+        .. versionadded:: 0.17
+           *data_min_* instead of deprecated *data_min*.
+
     data_max_ : ndarray, shape (n_features,)
         Per feature maximum seen in the data
 
+        .. versionadded:: 0.17
+           *data_max_* instead of deprecated *data_max*.
+
     data_range_ : ndarray, shape (n_features,)
         Per feature range ``(data_max_ - data_min_)`` seen in the data
+
+        .. versionadded:: 0.17
+           *data_range_* instead of deprecated *data_range*.
     """
 
     def __init__(self, feature_range=(0, 1), copy=True):
@@ -388,6 +400,9 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True):
 
     Read more in the :ref:`User Guide <preprocessing_scaler>`.
 
+    .. versionadded:: 0.17
+       *minmax_scale* function interface
+       to :class:`sklearn.preprocessing.MinMaxScaler`.
+
     Parameters
     ----------
     feature_range: tuple (min, max), default=(0, 1)
@@ -478,6 +493,9 @@ class StandardScaler(BaseEstimator, TransformerMixin):
     scale_ : ndarray, shape (n_features,)
         Per feature relative scaling of the data.
 
+        .. versionadded:: 0.17
+           *scale_* is recommended instead of deprecated *std_*.
+
     mean_ : array of floats with shape [n_features]
         The mean value for each feature in the training set.
 
@@ -685,6 +703,8 @@ class MaxAbsScaler(BaseEstimator, TransformerMixin):
 
     This scaler can also be applied to sparse CSR or CSC matrices.
 
+    .. versionadded:: 0.17
+
     Parameters
     ----------
     copy : boolean, optional, default is True
@@ -696,6 +716,9 @@ class MaxAbsScaler(BaseEstimator, TransformerMixin):
     scale_ : ndarray, shape (n_features,)
         Per feature relative scaling of the data.
 
+        .. versionadded:: 0.17
+           *scale_* attribute.
+
     max_abs_ : ndarray, shape (n_features,)
         Per feature maximum absolute value.
 
@@ -879,6 +902,7 @@ class RobustScaler(BaseEstimator, TransformerMixin):
     sample mean / variance in a negative way. In such cases, the median and
     the interquartile range often give better results.
 
+    .. versionadded:: 0.17
+
     Read more in the :ref:`User Guide <preprocessing_scaler>`.
 
     Parameters
@@ -907,6 +931,9 @@ class RobustScaler(BaseEstimator, TransformerMixin):
     scale_ : array of floats
         The (scaled) interquartile range for each feature in the training set.
 
+        .. versionadded:: 0.17
+           *scale_* attribute.
+
     See also
     --------
     :class:`sklearn.preprocessing.StandardScaler` to perform centering
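
# Usage sketch (illustrative, not part of the patch) of the renamed
# MinMaxScaler attributes and the new minmax_scale / MaxAbsScaler additions.
import numpy as np
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, minmax_scale

X = np.array([[1., -1.], [2., 0.], [0., 1.]])
mm = MinMaxScaler().fit(X)
mm.scale_, mm.data_min_, mm.data_max_, mm.data_range_  # 0.17 attribute names
X_fn = minmax_scale(X)                  # function interface, same scaling
X_ma = MaxAbsScaler().fit_transform(X)  # scales by maximum absolute value
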
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py
index 2fccc0e55dff7af05eb940b5f4705d8a612841af..60ea6ad92c63ea5b1737e7cc972585bd544b33fc 100644
--- a/sklearn/svm/classes.py
+++ b/sklearn/svm/classes.py
@@ -461,6 +461,12 @@ class SVC(BaseSVC):
         compatibility and raise a deprecation warning, but will change 'ovr'
         in 0.18.
 
+        .. versionadded:: 0.17
+           *decision_function_shape='ovr'* is recommended.
+
+        .. versionchanged:: 0.17
+           Deprecated *decision_function_shape='ovo'* and *None*.
+
     random_state : int seed, RandomState instance, or None (default)
         The seed of the pseudo random number generator to use when
         shuffling the data for probability estimation.
@@ -608,6 +614,12 @@ class NuSVC(BaseSVC):
         compatibility and raise a deprecation warning, but will change 'ovr'
         in 0.18.
 
+        .. versionadded:: 0.17
+           *decision_function_shape='ovr'* is recommended.
+
+        .. versionchanged:: 0.17
+           Deprecated *decision_function_shape='ovo'* and *None*.
+
     random_state : int seed, RandomState instance, or None (default)
         The seed of the pseudo random number generator to use when
         shuffling the data for probability estimation.
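
# Usage sketch (illustrative, not part of the patch): with
# decision_function_shape='ovr' the decision function has one column per
# class instead of the n_classes * (n_classes - 1) / 2 'ovo' columns.
from sklearn.svm import SVC

X = [[0, 0], [1, 1], [2, 2], [0, 1], [1, 2], [2, 0]]
y = [0, 0, 1, 1, 2, 2]
clf = SVC(decision_function_shape='ovr').fit(X, y)
clf.decision_function([[1, 1]]).shape  # (1, 3)
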
diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py
index c20af9da9672ce72e30c1d0bd30cf292c8a6746e..2767831fc75a13191ab26022824632f5989c08f4 100644
--- a/sklearn/tree/tree.py
+++ b/sklearn/tree/tree.py
@@ -433,6 +433,8 @@ class BaseDecisionTree(six.with_metaclass(ABCMeta, BaseEstimator,
         """
         Returns the index of the leaf that each sample is predicted as.
 
+        .. versionadded:: 0.17
+
         Parameters
         ----------
         X : array_like or sparse matrix, shape = [n_samples, n_features]
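
# Usage sketch (illustrative, not part of the patch) of the new apply method
# on a single tree.
from sklearn.tree import DecisionTreeClassifier

X = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 0, 1, 1]
tree = DecisionTreeClassifier().fit(X, y)
leaf_idx = tree.apply(X)  # one leaf index per sample
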