diff --git a/.travis.yml b/.travis.yml index 6892cdbd53e51ec3aa391625ad51916fff33248b..a1f58514b0d89d85dbe7be6d29e8663cf16785f6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,6 +49,7 @@ matrix: - env: RUN_FLAKE8="true" SKIP_TESTS="true" DISTRIB="conda" PYTHON_VERSION="3.5" INSTALL_MKL="true" NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" CYTHON_VERSION="0.23.5" + TEST_DOCSTRINGS="true" # This environment tests scikit-learn against numpy and scipy master # installed from their CI wheels in a virtualenv with the Python # interpreter provided by travis. diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 257cfb17f393801a41b960f83b07a4008b2e2e33..8cd774d649338cfd5119953d9adf3d2c7370ac61 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -85,6 +85,10 @@ if [[ "$COVERAGE" == "true" ]]; then pip install coverage codecov fi +if [[ "$TEST_DOCSTRINGS" == "true" ]]; then + pip install sphinx numpydoc # numpydoc requires sphinx +fi + if [[ "$SKIP_TESTS" == "true" ]]; then echo "No need to build scikit-learn when not running the tests" else diff --git a/sklearn/base.py b/sklearn/base.py index 119696f5b37229b473aae168f605926c01228345..aa4f9f9ce17c1e78542f0126743c374cf03b892c 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -428,6 +428,11 @@ class BiclusterMixin(object): Only works if ``rows_`` and ``columns_`` attributes exist. + Parameters + ---------- + i : int + The index of the cluster. + Returns ------- row_ind : np.array, dtype=np.intp @@ -443,6 +448,11 @@ class BiclusterMixin(object): def get_shape(self, i): """Shape of the i'th bicluster. + Parameters + ---------- + i : int + The index of the cluster. + Returns ------- shape : (int, int) @@ -454,9 +464,22 @@ class BiclusterMixin(object): def get_submatrix(self, i, data): """Returns the submatrix corresponding to bicluster `i`. + Parameters + ---------- + i : int + The index of the cluster. + data : array + The data. + + Returns + ------- + submatrix : array + The submatrix corresponding to bicluster i. + + Notes + ----- Works with sparse matrices. Only works if ``rows_`` and ``columns_`` attributes exist. - """ from .utils.validation import check_array data = check_array(data, accept_sparse='csr') @@ -525,10 +548,33 @@ class MetaEstimatorMixin(object): ############################################################################### def is_classifier(estimator): - """Returns True if the given estimator is (probably) a classifier.""" + """Returns True if the given estimator is (probably) a classifier. + + Parameters + ---------- + estimator : object + Estimator object to test. + + Returns + ------- + out : bool + True if estimator is a classifier and False otherwise. + """ return getattr(estimator, "_estimator_type", None) == "classifier" def is_regressor(estimator): - """Returns True if the given estimator is (probably) a regressor.""" + """Returns True if the given estimator is (probably) a regressor. + + + Parameters + ---------- + estimator : object + Estimator object to test. + + Returns + ------- + out : bool + True if estimator is a regressor and False otherwise. 
+ """ return getattr(estimator, "_estimator_type", None) == "regressor" diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py index 53072e24c4ae2b38c1a1548a494bf6df155e4364..398529793880fae5c1105e1b1a4d62b860b1bdec 100644 --- a/sklearn/cluster/affinity_propagation_.py +++ b/sklearn/cluster/affinity_propagation_.py @@ -199,13 +199,13 @@ class AffinityPropagation(BaseEstimator, ClusterMixin): damping : float, optional, default: 0.5 Damping factor between 0.5 and 1. + max_iter : int, optional, default: 200 + Maximum number of iterations. + convergence_iter : int, optional, default: 15 Number of iterations with no change in the number of estimated clusters that stops the convergence. - max_iter : int, optional, default: 200 - Maximum number of iterations. - copy : boolean, optional, default: True Make a copy of input data. diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 2195fe8ee3d85264d6a2f6ced4674fac4929508c..29d725bd8ce5463c8b33fa41d4af420fca6ee6d5 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -312,6 +312,9 @@ def linkage_tree(X, connectivity=None, n_components=None, be symmetric and only the upper triangular half is used. Default is None, i.e, the Ward algorithm is unstructured. + n_components : int (optional) + The number of connected components in the graph. + n_clusters : int (optional) Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is @@ -596,14 +599,6 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin): n_clusters : int, default=2 The number of clusters to find. - connectivity : array-like or callable, optional - Connectivity matrix. Defines for each sample the neighboring - samples following a given structure of the data. - This can be a connectivity matrix itself or a callable that transforms - the data into a connectivity matrix, such as derived from - kneighbors_graph. Default is None, i.e, the - hierarchical clustering algorithm is unstructured. - affinity : string or callable, default: "euclidean" Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or 'precomputed'. @@ -615,6 +610,14 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin): By default, no caching is done. If a string is given, it is the path to the caching directory. + connectivity : array-like or callable, optional + Connectivity matrix. Defines for each sample the neighboring + samples following a given structure of the data. + This can be a connectivity matrix itself or a callable that transforms + the data into a connectivity matrix, such as derived from + kneighbors_graph. Default is None, i.e, the + hierarchical clustering algorithm is unstructured. + compute_full_tree : bool or 'auto' (optional) Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is @@ -766,14 +769,6 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): n_clusters : int, default 2 The number of clusters to find. - connectivity : array-like or callable, optional - Connectivity matrix. Defines for each feature the neighboring - features following a given structure of the data. - This can be a connectivity matrix itself or a callable that transforms - the data into a connectivity matrix, such as derived from - kneighbors_graph. 
Default is None, i.e, the - hierarchical clustering algorithm is unstructured. - affinity : string or callable, default "euclidean" Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or 'precomputed'. @@ -785,6 +780,14 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): By default, no caching is done. If a string is given, it is the path to the caching directory. + connectivity : array-like or callable, optional + Connectivity matrix. Defines for each feature the neighboring + features following a given structure of the data. + This can be a connectivity matrix itself or a callable that transforms + the data into a connectivity matrix, such as derived from + kneighbors_graph. Default is None, i.e, the + hierarchical clustering algorithm is unstructured. + compute_full_tree : bool or 'auto', optional, default "auto" Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 5014279946637c402189ad7a46ffc4ce04ee9e26..af2fc67e083db658b2fd1a2ab1f24fe9cb08e724 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -181,14 +181,6 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto', The number of clusters to form as well as the number of centroids to generate. - max_iter : int, optional, default 300 - Maximum number of iterations of the k-means algorithm to run. - - n_init : int, optional, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - init : {'k-means++', 'random', or ndarray, or a callable}, optional Method for initialization, default to 'k-means++': @@ -205,12 +197,6 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto', If a callable is passed, it should take arguments X, k and and a random state and return an initialization. - algorithm : "auto", "full" or "elkan", default="auto" - K-means algorithm to use. The classical EM-style algorithm is "full". - The "elkan" variation is more efficient by using the triangle - inequality, but currently doesn't support sparse data. "auto" chooses - "elkan" for dense data and "full" for sparse data. - precompute_distances : {'auto', True, False} Precompute distances (faster but takes more memory). @@ -222,12 +208,20 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto', False : never precompute distances - tol : float, optional - The relative increment in the results before declaring convergence. + n_init : int, optional, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + max_iter : int, optional, default 300 + Maximum number of iterations of the k-means algorithm to run. verbose : boolean, optional Verbosity mode. + tol : float, optional + The relative increment in the results before declaring convergence. + random_state : int, RandomState instance or None, optional, default: None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -250,6 +244,12 @@ def k_means(X, n_clusters, init='k-means++', precompute_distances='auto', (n_cpus + 1 + n_jobs) are used. 
Thus for n_jobs = -2, all CPUs but one are used. + algorithm : "auto", "full" or "elkan", default="auto" + K-means algorithm to use. The classical EM-style algorithm is "full". + The "elkan" variation is more efficient by using the triangle + inequality, but currently doesn't support sparse data. "auto" chooses + "elkan" for dense data and "full" for sparse data. + return_n_iter : bool, optional Whether or not to return the number of iterations. @@ -716,15 +716,6 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): The number of clusters to form as well as the number of centroids to generate. - max_iter : int, default: 300 - Maximum number of iterations of the k-means algorithm for a - single run. - - n_init : int, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - init : {'k-means++', 'random' or an ndarray} Method for initialization, defaults to 'k-means++': @@ -738,11 +729,17 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. - algorithm : "auto", "full" or "elkan", default="auto" - K-means algorithm to use. The classical EM-style algorithm is "full". - The "elkan" variation is more efficient by using the triangle - inequality, but currently doesn't support sparse data. "auto" chooses - "elkan" for dense data and "full" for sparse data. + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + tol : float, default: 1e-4 + Relative tolerance with regards to inertia to declare convergence precompute_distances : {'auto', True, False} Precompute distances (faster but takes more memory). @@ -755,17 +752,8 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): False : never precompute distances - tol : float, default: 1e-4 - Relative tolerance with regards to inertia to declare convergence - - n_jobs : int - The number of jobs to use for the computation. This works by computing - each of the n_init runs in parallel. - - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. + verbose : int, default 0 + Verbosity mode. random_state : int, RandomState instance or None, optional, default: None If int, random_state is the seed used by the random number generator; @@ -773,9 +761,6 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): If None, the random number generator is the RandomState instance used by `np.random`. - verbose : int, default 0 - Verbosity mode. - copy_x : boolean, default True When pre-computing distances it is more numerically accurate to center the data first. If copy_x is True, then the original data is not @@ -783,6 +768,21 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): the function returns, but small numerical differences may be introduced by subtracting and then adding the data mean. + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + + If -1 all CPUs are used. 
If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + algorithm : "auto", "full" or "elkan", default="auto" + K-means algorithm to use. The classical EM-style algorithm is "full". + The "elkan" variation is more efficient by using the triangle + inequality, but currently doesn't support sparse data. "auto" chooses + "elkan" for dense data and "full" for sparse data. + Attributes ---------- cluster_centers_ : array, [n_clusters, n_features] @@ -898,6 +898,16 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): Convenience method; equivalent to calling fit(X) followed by predict(X). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to transform. + + Returns + ------- + labels : array, shape [n_samples,] + Index of the cluster each sample belongs to. """ return self.fit(X).labels_ @@ -905,6 +915,16 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): """Compute clustering and transform X to cluster-distance space. Equivalent to fit(X).transform(X), but more efficiently implemented. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to transform. + + Returns + ------- + X_new : array, shape [n_samples, k] + X transformed in the new space. """ # Currently, this just skips a copy of the data if it is not in # np.array or CSR format already. @@ -1196,16 +1216,38 @@ class MiniBatchKMeans(KMeans): The number of clusters to form as well as the number of centroids to generate. + init : {'k-means++', 'random' or an ndarray}, default: 'k-means++' + Method for initialization, defaults to 'k-means++': + + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + + 'random': choose k observations (rows) at random from data for + the initial centroids. + + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + max_iter : int, optional Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. - max_no_improvement : int, default: 10 - Control early stopping based on the consecutive number of mini - batches that does not yield an improvement on the smoothed inertia. + batch_size : int, optional, default: 100 + Size of the mini batches. - To disable convergence detection based on inertia, set - max_no_improvement to None. + verbose : boolean, optional + Verbosity mode. + + compute_labels : boolean, default=True + Compute label assignment and inertia for the complete dataset + once the minibatch optimization has converged in fit. + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. tol : float, default: 0.0 Control early stopping based on the relative center changes as @@ -1218,8 +1260,12 @@ class MiniBatchKMeans(KMeans): To disable convergence detection based on normalized center change, set tol to 0.0 (default). - batch_size : int, optional, default: 100 - Size of the mini batches. 
+ max_no_improvement : int, default: 10 + Control early stopping based on the consecutive number of mini + batches that does not yield an improvement on the smoothed inertia. + + To disable convergence detection based on inertia, set + max_no_improvement to None. init_size : int, optional, default: 3 * batch_size Number of samples to randomly sample for speeding up the @@ -1227,34 +1273,11 @@ class MiniBatchKMeans(KMeans): only algorithm is initialized by running a batch KMeans on a random subset of the data. This needs to be larger than n_clusters. - init : {'k-means++', 'random' or an ndarray}, default: 'k-means++' - Method for initialization, defaults to 'k-means++': - - 'k-means++' : selects initial cluster centers for k-mean - clustering in a smart way to speed up convergence. See section - Notes in k_init for more details. - - 'random': choose k observations (rows) at random from data for - the initial centroids. - - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - n_init : int, default=3 Number of random initializations that are tried. In contrast to KMeans, the algorithm is only run once, using the best of the ``n_init`` initializations as measured by inertia. - compute_labels : boolean, default=True - Compute label assignment and inertia for the complete dataset - once the minibatch optimization has converged in fit. - - random_state : int, RandomState instance or None, optional, default: None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. - reassignment_ratio : float, default: 0.01 Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count @@ -1262,9 +1285,6 @@ class MiniBatchKMeans(KMeans): model will take longer to converge, but should converge in a better clustering. - verbose : boolean, optional - Verbosity mode. - Attributes ---------- diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index d6caa0ae056676e52c2f09aa1301728ddca07993..5f5f0a4e9d45209f68126565b6b7cb807226036e 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -300,30 +300,6 @@ class SpectralClustering(BaseEstimator, ClusterMixin): n_clusters : integer, optional The dimension of the projection subspace. - affinity : string, array-like or callable, default 'rbf' - If a string, this may be one of 'nearest_neighbors', 'precomputed', - 'rbf' or one of the kernels supported by - `sklearn.metrics.pairwise_kernels`. - - Only kernels that produce similarity scores (non-negative values that - increase with similarity) should be used. This property is not checked - by the clustering algorithm. - - gamma : float, default=1.0 - Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. - Ignored for ``affinity='nearest_neighbors'``. - - degree : float, default=3 - Degree of the polynomial kernel. Ignored by other kernels. - - coef0 : float, default=1 - Zero coefficient for polynomial and sigmoid kernels. - Ignored by other kernels. - - n_neighbors : integer - Number of neighbors to use when constructing the affinity matrix using - the nearest neighbors method. Ignored for ``affinity='rbf'``. - eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} The eigenvalue decomposition strategy to use. AMG requires pyamg to be installed. 
It can be faster on very large, sparse problems, @@ -342,6 +318,23 @@ class SpectralClustering(BaseEstimator, ClusterMixin): centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. + gamma : float, default=1.0 + Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. + Ignored for ``affinity='nearest_neighbors'``. + + affinity : string, array-like or callable, default 'rbf' + If a string, this may be one of 'nearest_neighbors', 'precomputed', + 'rbf' or one of the kernels supported by + `sklearn.metrics.pairwise_kernels`. + + Only kernels that produce similarity scores (non-negative values that + increase with similarity) should be used. This property is not checked + by the clustering algorithm. + + n_neighbors : integer + Number of neighbors to use when constructing the affinity matrix using + the nearest neighbors method. Ignored for ``affinity='rbf'``. + eigen_tol : float, optional, default: 0.0 Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. @@ -353,6 +346,13 @@ class SpectralClustering(BaseEstimator, ClusterMixin): also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization. + degree : float, default=3 + Degree of the polynomial kernel. Ignored by other kernels. + + coef0 : float, default=1 + Zero coefficient for polynomial and sigmoid kernels. + Ignored by other kernels. + kernel_params : dictionary of string to any, optional Parameters (keyword arguments) and values for kernel passed as callable object. Ignored by other kernels. diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py index 9292e9341208fe32eb9183a824f467531784ef1a..3345f5193e59882c9fbee4dd50697973863510d7 100644 --- a/sklearn/covariance/graph_lasso_.py +++ b/sklearn/covariance/graph_lasso_.py @@ -333,7 +333,15 @@ class GraphLasso(EmpiricalCovariance): self.verbose = verbose def fit(self, X, y=None): + """Fits the GraphLasso model to X. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Data from which to compute the covariance estimate + y : (ignored) + """ # Covariance does not make sense for a single feature X = check_array(X, ensure_min_features=2, ensure_min_samples=2, estimator=self) @@ -570,6 +577,7 @@ class GraphLassoCV(GraphLasso): ---------- X : ndarray, shape (n_samples, n_features) Data from which to compute the covariance estimate + y : (ignored) """ # Covariance does not make sense for a single feature X = check_array(X, ensure_min_features=2, estimator=self) diff --git a/sklearn/covariance/outlier_detection.py b/sklearn/covariance/outlier_detection.py index 8529cb571574f9bfb8c8ffda20dbc106fecede34..8cc81cca07b5be3c6842e7aa6d79ee745b124438 100644 --- a/sklearn/covariance/outlier_detection.py +++ b/sklearn/covariance/outlier_detection.py @@ -47,6 +47,13 @@ class EllipticEnvelope(MinCovDet): The amount of contamination of the data set, i.e. the proportion of outliers in the data set. + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`.
+ Attributes ---------- location_ : array-like, shape (n_features,) @@ -90,6 +97,14 @@ class EllipticEnvelope(MinCovDet): self.contamination = contamination def fit(self, X, y=None): + """Fit the EllipticEnvelope model with X. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples, n_features] + Training data + y : (ignored) + """ super(EllipticEnvelope, self).fit(X) self.threshold_ = sp.stats.scoreatpercentile( self.dist_, 100. * (1. - self.contamination)) diff --git a/sklearn/covariance/robust_covariance.py b/sklearn/covariance/robust_covariance.py index 2b7ebabc83db4d507761a3c4ffabf654f5653915..985dda92f990cbd1baf0d7012a8a6b06aa03c080 100644 --- a/sklearn/covariance/robust_covariance.py +++ b/sklearn/covariance/robust_covariance.py @@ -317,16 +317,16 @@ def fast_mcd(X, support_fraction=None, value of support_fraction will be used within the algorithm: `[n_sample + n_features + 1] / 2`. + cov_computation_method : callable, default empirical_covariance + The function which will be used to compute the covariance. + Must return shape (n_features, n_features) + random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - cov_computation_method : callable, default empirical_covariance - The function which will be used to compute the covariance. - Must return shape (n_features, n_features) - Notes ----- The FastMCD algorithm has been introduced by Rousseuw and Van Driessen diff --git a/sklearn/covariance/shrunk_covariance_.py b/sklearn/covariance/shrunk_covariance_.py index 700052df49c6d20df4a6ded5b229ae779a2bca7c..a99b0f41113234de5ae8cfce19a3fbbfe1fcfb9a 100644 --- a/sklearn/covariance/shrunk_covariance_.py +++ b/sklearn/covariance/shrunk_covariance_.py @@ -73,16 +73,16 @@ class ShrunkCovariance(EmpiricalCovariance): store_precision : boolean, default True Specify if the estimated precision is stored - shrinkage : float, 0 <= shrinkage <= 1, default 0.1 - Coefficient in the convex combination used for the computation - of the shrunk estimate. - assume_centered : boolean, default False If True, data are not centered before computation. Useful when working with data whose mean is almost, but not exactly zero. If False, data are centered before computation. + shrinkage : float, 0 <= shrinkage <= 1, default 0.1 + Coefficient in the convex combination used for the computation + of the shrunk estimate. + Attributes ---------- covariance_ : array-like, shape (n_features, n_features) diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 1441daf838032c0a39ead9b4ee069d35769e9751..698060ae54568cdd47f698157183227fea26609e 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -41,6 +41,11 @@ def get_data_home(data_home=None): '~' symbol is expanded to the user home folder. If the folder does not already exist, it is automatically created. + + Parameters + ---------- + data_home : str | None + The path to scikit-learn data dir. """ if data_home is None: data_home = environ.get('SCIKIT_LEARN_DATA', @@ -52,7 +57,13 @@ def get_data_home(data_home=None): def clear_data_home(data_home=None): - """Delete all the content of the data home cache.""" + """Delete all the content of the data home cache. + + Parameters + ---------- + data_home : str | None + The path to scikit-learn data dir. 
+ """ data_home = get_data_home(data_home) shutil.rmtree(data_home) @@ -118,6 +129,11 @@ def load_files(container_path, description=None, categories=None, in the data structure returned. If not, a filenames attribute gives the path to the files. + shuffle : bool, optional (default=True) + Whether or not to shuffle the data: might be important for models that + make the assumption that the samples are independent and identically + distributed (i.i.d.), such as stochastic gradient descent. + encoding : string or None (default is None) If None, do not try to decode the content of the files (e.g. for images or other non-text content). @@ -129,11 +145,6 @@ def load_files(container_path, description=None, categories=None, contains characters not of the given `encoding`. Passed as keyword argument 'errors' to bytes.decode. - shuffle : bool, optional (default=True) - Whether or not to shuffle the data: might be important for models that - make the assumption that the samples are independent and identically - distributed (i.i.d.), such as stochastic gradient descent. - random_state : int, RandomState instance or None, optional (default=0) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py index 6d52c5b6214b241efb86b62425f5e4faf14299de..56cf3c4181c7caf75408b597409c87ad09b44963 100644 --- a/sklearn/datasets/kddcup99.py +++ b/sklearn/datasets/kddcup99.py @@ -130,6 +130,9 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False, all scikit-learn data is stored in '~/scikit_learn_data' subfolders. .. versionadded:: 0.19 + shuffle : bool, default=False + Whether to shuffle dataset. + random_state : int, RandomState instance or None, optional (default=None) Random state for shuffling the dataset. If int, random_state is the seed used by the random number generator; @@ -137,9 +140,6 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False, If None, the random number generator is the RandomState instance used by `np.random`. - shuffle : bool, default=False - Whether to shuffle dataset. - percent10 : bool, default=True Whether to load only 10 percent of the data. diff --git a/sklearn/datasets/mldata.py b/sklearn/datasets/mldata.py index 9b4a8e3a1daa376bbcfbec34c377b38ec716d8ae..141620858463487b3033ec4278d603901ff24001 100644 --- a/sklearn/datasets/mldata.py +++ b/sklearn/datasets/mldata.py @@ -30,7 +30,18 @@ MLDATA_BASE_URL = "http://mldata.org/repository/data/download/matlab/%s" def mldata_filename(dataname): - """Convert a raw name for a data set in a mldata.org filename.""" + """Convert a raw name for a data set in a mldata.org filename. + + Parameters + ---------- + dataname : str + Name of dataset + + Returns + ------- + fname : str + The converted dataname. + """ dataname = dataname.lower().replace(' ', '-') return re.sub(r'[().]', '', dataname) @@ -62,7 +73,7 @@ def fetch_mldata(dataname, target_name='label', data_name='data', Parameters ---------- - dataname : + dataname : str Name of the data set on mldata.org, e.g.: "leukemia", "Whistler Daily Snowfall", etc. The raw name is automatically converted to a mldata.org URL . 
diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py index ac80d49e937d299e43a70de682355e838d88ebd7..7ff3af6921230e5cfdb534f24b8bdc1686047259 100644 --- a/sklearn/datasets/olivetti_faces.py +++ b/sklearn/datasets/olivetti_faces.py @@ -67,16 +67,16 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, If True the order of the dataset is shuffled to avoid having images of the same person grouped. - download_if_missing : optional, True by default - If False, raise a IOError if the data is not locally available - instead of trying to download the data from the source site. - random_state : int, RandomState instance or None, optional (default=0) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. + download_if_missing : optional, True by default + If False, raise a IOError if the data is not locally available + instead of trying to download the data from the source site. + Returns ------- An object with the following attributes: diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py index 82ae355a7f4f265e1ea3b8e6ee0a21c7dab38d69..c92dfcc9254efaf73423895aaa75295165795ad1 100644 --- a/sklearn/datasets/samples_generator.py +++ b/sklearn/datasets/samples_generator.py @@ -589,6 +589,12 @@ def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, noise : double or None (default=None) Standard deviation of Gaussian noise added to the data. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + factor : double < 1 (default=.8) Scale factor between inner and outer circle. @@ -643,6 +649,12 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): noise : double or None (default=None) Standard deviation of Gaussian noise added to the data. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + Returns ------- X : array of shape [n_samples, 2] @@ -1199,22 +1211,22 @@ def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False, The probability that a coefficient is zero (see notes). Larger values enforce more sparsity. + norm_diag : boolean, optional (default=False) + Whether to normalize the output matrix to make the leading diagonal + elements all 1 + + smallest_coef : float between 0 and 1, optional (default=0.1) + The value of the smallest coefficient. + + largest_coef : float between 0 and 1, optional (default=0.9) + The value of the largest coefficient. + random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - largest_coef : float between 0 and 1, optional (default=0.9) - The value of the largest coefficient. 
- - smallest_coef : float between 0 and 1, optional (default=0.1) - The value of the smallest coefficient. - - norm_diag : boolean, optional (default=False) - Whether to normalize the output matrix to make the leading diagonal - elements all 1 - Returns ------- prec : sparse matrix of shape (dim, dim) diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py index 556ad9ea45e0533df461804fe8f6fde376b9f06b..8e50ba547e8a3855396b5042b1bdbe0571f82511 100644 --- a/sklearn/datasets/species_distributions.py +++ b/sklearn/datasets/species_distributions.py @@ -176,8 +176,16 @@ def fetch_species_distributions(data_home=None, grid_size : float The spacing between points of the grid, in degrees + References + ---------- + + * `"Maximum entropy modeling of species geographic distributions" + <http://rob.schapire.net/papers/ecolmod.pdf>`_ + S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, + 190:231-259, 2006. + Notes - ------ + ----- This dataset represents the geographic distribution of species. The dataset is provided by Phillips et. al. (2006). @@ -193,16 +201,6 @@ def fetch_species_distributions(data_home=None, also known as the Forest Small Rice Rat, a rodent that lives in Peru, Colombia, Ecuador, Peru, and Venezuela. - References - ---------- - - * `"Maximum entropy modeling of species geographic distributions" - <http://rob.schapire.net/papers/ecolmod.pdf>`_ - S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, - 190:231-259, 2006. - - Notes - ----- * For an example of using this dataset with scikit-learn, see :ref:`examples/applications/plot_species_distribution_modeling.py diff --git a/sklearn/datasets/svmlight_format.py b/sklearn/datasets/svmlight_format.py index c919dc8c0a259c708d3bf91263c7b22913afef37..bf14edabea4987829b1b9cd0c9e3caa686b5199c 100644 --- a/sklearn/datasets/svmlight_format.py +++ b/sklearn/datasets/svmlight_format.py @@ -80,6 +80,10 @@ def load_svmlight_file(f, n_features=None, dtype=np.float64, n_features is only required if ``offset`` or ``length`` are passed a non-default value. + dtype : numpy data type, default np.float64 + Data type of dataset to be loaded. This will be the data type of the + output numpy arrays ``X`` and ``y``. + multilabel : boolean, optional, default False Samples may have several labels each (see http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html) @@ -99,10 +103,6 @@ def load_svmlight_file(f, n_features=None, dtype=np.float64, query_id : boolean, default False If True, will return the query_id array for each file. - dtype : numpy data type, default np.float64 - Data type of dataset to be loaded. This will be the data type of the - output numpy arrays ``X`` and ``y``. - offset : integer, optional, default 0 Ignore the offset first bytes by seeking forward, then discarding the following bytes up until the next new line @@ -224,6 +224,10 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64, in any of the input files, but setting it to a lower value will cause an exception to be raised. + dtype : numpy data type, default np.float64 + Data type of dataset to be loaded. This will be the data type of the + output numpy arrays ``X`` and ``y``. 
+ multilabel : boolean, optional Samples may have several labels each (see http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html) @@ -243,10 +247,6 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64, query_id : boolean, defaults to False If True, will return the query_id array for each file. - dtype : numpy data type, default np.float64 - Data type of dataset to be loaded. This will be the data type of the - output numpy arrays ``X`` and ``y``. - offset : integer, optional, default 0 Ignore the offset first bytes by seeking forward, then discarding the following bytes up until the next new line diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py index 47b543d8d2e16192a6f9d408a46aad6c230015bf..fe838b1be5fd0e17b397b4faa242c129f87717d8 100644 --- a/sklearn/datasets/twenty_newsgroups.py +++ b/sklearn/datasets/twenty_newsgroups.py @@ -160,14 +160,14 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, Parameters ---------- - subset : 'train' or 'test', 'all', optional - Select the dataset to load: 'train' for the training set, 'test' - for the test set, 'all' for both, with shuffled ordering. - data_home : optional, default: None Specify a download and cache folder for the datasets. If None, all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + subset : 'train' or 'test', 'all', optional + Select the dataset to load: 'train' for the training set, 'test' + for the test set, 'all' for both, with shuffled ordering. + categories : None or collection of string or unicode If None (default), load all the categories. If not None, list of category names to load (other categories @@ -181,10 +181,6 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, random_state : numpy random number generator or seed integer Used to shuffle the dataset. - download_if_missing : optional, True by default - If False, raise an IOError if the data is not locally available - instead of trying to download the data from the source site. - remove : tuple May contain any subset of ('headers', 'footers', 'quotes'). Each of these are kinds of text that will be detected and removed from the newsgroup posts, preventing classifiers from overfitting on metadata. 'headers' removes newsgroup headers, 'footers' removes blocks at the ends of posts that look like signatures, and 'quotes' removes lines that appear to be quoting another post. 'headers' follows an exact standard; the other filters are not always correct. + + download_if_missing : optional, True by default + If False, raise an IOError if the data is not locally available + instead of trying to download the data from the source site. """ data_home = get_data_home(data_home=data_home) @@ -295,15 +295,10 @@ def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None): Parameters ---------- - subset : 'train' or 'test', 'all', optional Select the dataset to load: 'train' for the training set, 'test' for the test set, 'all' for both, with shuffled ordering. - data_home : optional, default: None - Specify an download and cache folder for the datasets. If None, - all scikit-learn data is stored in '~/scikit_learn_data' subfolders. - remove : tuple May contain any subset of ('headers', 'footers', 'quotes'). Each of these are kinds of text that will be detected and removed from the newsgroup posts, preventing classifiers from overfitting on metadata. 'headers' removes newsgroup headers, 'footers' removes blocks at the ends of posts that look like signatures, and 'quotes' removes lines that appear to be quoting another post. + data_home : optional, default: None + Specify a download and cache folder for the datasets.
If None, + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + Returns ------- - bunch : Bunch object bunch.data: sparse matrix, shape [n_samples, n_features] bunch.target: array, shape [n_samples] diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 14ed2cf46730923dd967720da68b4bd7d0711e23..62cd2cd2aa1013cecb436a518119d865bc4c36ba 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -220,6 +220,10 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', the reconstruction error targeted. In this case, it overrides `n_nonzero_coefs`. + copy_cov : boolean, optional + Whether to copy the precomputed covariance matrix; if False, it may be + overwritten. + init : array of shape (n_samples, n_components) Initialization value of the sparse codes. Only used if `algorithm='lasso_cd'`. @@ -227,10 +231,6 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', max_iter : int, 1000 by default Maximum number of iterations to perform if `algorithm='lasso_cd'`. - copy_cov : boolean, optional - Whether to copy the precomputed covariance matrix; if False, it may be - overwritten. - n_jobs : int, optional Number of parallel jobs to run. @@ -434,11 +434,11 @@ def dict_learning(X, n_components, alpha, max_iter=100, tol=1e-8, code_init : array of shape (n_samples, n_components), Initial value for the sparse code for warm restart scenarios. - callback : - Callable that gets invoked every five iterations. + callback : callable or None, optional (default: None) + Callable that gets invoked every five iterations. - verbose : - Degree of output the procedure will print. + verbose : bool, optional (default: False) + To control the verbosity of the procedure. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -599,14 +599,14 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100, dict_init : array of shape (n_components, n_features), Initial value for the dictionary for warm restart scenarios. - callback : - Callable that gets invoked every five iterations. + callback : callable or None, optional (default: None) + Callable that gets invoked every five iterations. batch_size : int, The number of samples to take in each batch. - verbose : - Degree of output the procedure will print. + verbose : bool, optional (default: False) + To control the verbosity of the procedure. shuffle : boolean, Whether to shuffle the data before splitting it in batches. @@ -924,6 +924,17 @@ class SparseCoder(BaseEstimator, SparseCodingMixin): This method is just there to implement the usual API and hence work in pipelines. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training vector, where n_samples is the number of samples + and n_features is the number of features. + + Returns + ------- + self : object + Returns the object itself """ return self @@ -995,11 +1006,6 @@ class DictionaryLearning(BaseEstimator, SparseCodingMixin): the reconstruction error targeted. In this case, it overrides `n_nonzero_coefs`. - split_sign : bool, False by default - Whether to split the sparse feature vector into the concatenation of - its negative part and its positive part. This can improve the - performance of downstream classifiers.
- n_jobs : int, number of parallel jobs to run @@ -1009,8 +1015,13 @@ class DictionaryLearning(BaseEstimator, SparseCodingMixin): dict_init : array of shape (n_components, n_features), initial values for the dictionary, for warm restart - verbose : - degree of verbosity of the printed output + verbose : bool, optional (default: False) + To control the verbosity of the procedure. + + split_sign : bool, False by default + Whether to split the sparse feature vector into the concatenation of + its negative part and its positive part. This can improve the + performance of downstream classifiers. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -1129,6 +1140,18 @@ class MiniBatchDictionaryLearning(BaseEstimator, SparseCodingMixin): Lasso solution (linear_model.Lasso). Lars will be faster if the estimated components are sparse. + n_jobs : int, + number of parallel jobs to run + + batch_size : int, + number of samples in each mini-batch + + shuffle : bool, + whether to shuffle the samples before forming batches + + dict_init : array of shape (n_components, n_features), + initial value of the dictionary for warm restart scenarios + transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \ 'threshold'} Algorithm used to transform the data. @@ -1155,26 +1178,14 @@ class MiniBatchDictionaryLearning(BaseEstimator, SparseCodingMixin): the reconstruction error targeted. In this case, it overrides `n_nonzero_coefs`. + verbose : bool, optional (default: False) + To control the verbosity of the procedure. + split_sign : bool, False by default Whether to split the sparse feature vector into the concatenation of its negative part and its positive part. This can improve the performance of downstream classifiers. - n_jobs : int, - number of parallel jobs to run - - dict_init : array of shape (n_components, n_features), - initial value of the dictionary for warm restart scenarios - - verbose : - degree of verbosity of the printed output - - batch_size : int, - number of samples in each mini-batch - - shuffle : bool, - whether to shuffle the samples before forming batches - random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index af9cd582b42f9540d84ae55bfc14065f04b01516..fcc11ff643a5efceaacacc052c8bec7f0b8c316e 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -539,11 +539,11 @@ class FastICA(BaseEstimator, TransformerMixin): X : array-like, shape (n_samples, n_features) Data to transform, where n_samples is the number of samples and n_features is the number of features. - copy : bool (optional) - If False, data passed to fit are overwritten. Defaults to True. y : (ignored) .. deprecated:: 0.19 This parameter will be removed in 0.21. + copy : bool (optional) + If False, data passed to fit are overwritten. Defaults to True. Returns ------- diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index 9b23d1f16e1fddf17ab73d1a3e4ab9b8f778ed22..c7b09c93dace95308102f30c97fc71e74cc06521 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -40,16 +40,6 @@ class IncrementalPCA(_BasePCA): Number of components to keep. 
If ``n_components `` is ``None``, then ``n_components`` is set to ``min(n_samples, n_features)``. - batch_size : int or None, (default=None) - The number of samples to use for each batch. Only used when calling - ``fit``. If ``batch_size`` is ``None``, then ``batch_size`` - is inferred from the data and set to ``5 * n_features``, to provide a - balance between approximation accuracy and memory consumption. - - copy : bool, (default=True) - If False, X will be overwritten. ``copy=False`` can be used to - save memory but is unsafe for general use. - whiten : bool, optional When True (False by default) the ``components_`` vectors are divided by ``n_samples`` times ``components_`` to ensure uncorrelated outputs @@ -60,6 +50,16 @@ class IncrementalPCA(_BasePCA): improve the predictive accuracy of the downstream estimators by making data respect some hard-wired assumptions. + copy : bool, (default=True) + If False, X will be overwritten. ``copy=False`` can be used to + save memory but is unsafe for general use. + + batch_size : int or None, (default=None) + The number of samples to use for each batch. Only used when calling + ``fit``. If ``batch_size`` is ``None``, then ``batch_size`` + is inferred from the data and set to ``5 * n_features``, to provide a + balance between approximation accuracy and memory consumption. + Attributes ---------- components_ : array, shape (n_components, n_features) @@ -195,6 +195,8 @@ class IncrementalPCA(_BasePCA): X : array-like, shape (n_samples, n_features) Training data, where n_samples is the number of samples and n_features is the number of features. + check_input : bool + Run check_array on X. Returns ------- diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 385c0dd18996bf79224294aa28472348acff0714..a9a728c9dcb9721a64bfec37a1136d9ad35e709f 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -31,13 +31,13 @@ class KernelPCA(BaseEstimator, TransformerMixin): kernel : "linear" | "poly" | "rbf" | "sigmoid" | "cosine" | "precomputed" Kernel. Default="linear". - degree : int, default=3 - Degree for poly kernels. Ignored by other kernels. - gamma : float, default=1/n_features Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other kernels. + degree : int, default=3 + Degree for poly kernels. Ignored by other kernels. + coef0 : float, default=1 Independent term in poly and sigmoid kernels. Ignored by other kernels. @@ -82,12 +82,6 @@ class KernelPCA(BaseEstimator, TransformerMixin): .. versionadded:: 0.18 - n_jobs : int, default=1 - The number of parallel jobs to run. - If `-1`, then the number of jobs is set to the number of CPU cores. - - .. versionadded:: 0.18 - copy_X : boolean, default=True If True, input X is copied and stored by the model in the `X_fit_` attribute. If no further changes will be done to X, setting @@ -95,6 +89,12 @@ class KernelPCA(BaseEstimator, TransformerMixin): .. versionadded:: 0.18 + n_jobs : int, default=1 + The number of parallel jobs to run. + If `-1`, then the number of jobs is set to the number of CPU cores. + + .. 
versionadded:: 0.18 + Attributes ---------- lambdas_ : array, (n_components,) diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 47eb42496f501751d486b00003315827f5c317f1..153731cb8365198322acb2b7d29e89a0bd407376 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -1143,6 +1143,9 @@ class NMF(BaseEstimator, TransformerMixin): Regularization parameter *l1_ratio* used in the Coordinate Descent solver. + verbose : bool, default=False + Whether to be verbose. + shuffle : boolean, default: False If true, randomize the order of coordinates in the CD solver. diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 7cfcb0ae4396c99b449c1f7c75858d86394c59a4..e9743c69422fb2a5557ad79ae41d21abf787d746 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -187,9 +187,6 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin): max_iter : integer, optional (default=10) The maximum number of iterations. - total_samples : int, optional (default=1e6) - Total number of documents. Only used in the `partial_fit` method. - batch_size : int, optional (default=128) Number of documents to use in each EM iteration. Only used in online learning. @@ -202,6 +199,9 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin): Evaluating perplexity in every iteration might increase training time up to two-fold. + total_samples : int, optional (default=1e6) + Total number of documents. Only used in the `partial_fit` method. + perp_tol : float, optional (default=1e-1) Perplexity tolerance in batch learning. Only used when ``evaluate_every`` is greater than 0. @@ -795,6 +795,9 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin): .. deprecated:: 0.19 + sub_sampling : bool + Do sub-sampling or not. + Returns ------- score : float diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index f5250cac8ace5b5625c15147b034e9634481c58e..47c03a80278b94591ed0da5e716efcd9cc15b217 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -57,8 +57,8 @@ class SparsePCA(BaseEstimator, TransformerMixin): V_init : array of shape (n_components, n_features), Initial values for the components for warm restart scenarios. - verbose : - Degree of verbosity of the printed output. + verbose : int + Controls the verbosity; the higher, the more messages. Defaults to 0. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -151,7 +151,7 @@ class SparsePCA(BaseEstimator, TransformerMixin): Test data to be transformed, must have the same number of features as the data used to train the model. - ridge_alpha: float, default: 0.01 + ridge_alpha : float, default: 0.01 Amount of ridge shrinkage to apply in order to improve conditioning. @@ -209,14 +209,14 @@ class MiniBatchSparsePCA(SparsePCA): n_iter : int, number of iterations to perform for each mini batch - callback : callable, + callback : callable or None, optional (default: None) callable that gets invoked every five iterations batch_size : int, the number of features to take in each mini batch - verbose : - degree of output the procedure will print + verbose : int + Controls the verbosity; the higher, the more messages. Defaults to 0. 
shuffle : boolean, whether to shuffle the data before splitting it in batches diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 60732bf83a446a2b03b4aaa80ba092605ae965dd..51792383eb0cb40a8f103f3de18c33a84b3fcc87 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -1840,6 +1840,13 @@ class RandomTreesEmbedding(BaseForest): efficiency. Sparse matrices are also supported, use sparse ``csc_matrix`` for maximum efficiency. + sample_weight : array-like, shape = [n_samples] or None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + Returns ------- self : object @@ -1858,6 +1865,13 @@ class RandomTreesEmbedding(BaseForest): Input data used to build forests. Use ``dtype=np.float32`` for maximum efficiency. + sample_weight : array-like, shape = [n_samples] or None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + Returns ------- X_transformed : sparse matrix, shape=(n_samples, n_out) diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py index a178967d5c5f9474e3cddb9f7c23c18560a4a6f2..216d2c4f78631b4863ad7d008a26c8eb26e92f7c 100644 --- a/sklearn/ensemble/iforest.py +++ b/sklearn/ensemble/iforest.py @@ -151,6 +151,9 @@ class IsolationForest(BaseBagging): efficiency. Sparse matrices are also supported, use sparse ``csc_matrix`` for maximum efficiency. + sample_weight : array-like, shape = [n_samples] or None + Sample weights. If None, then samples are equally weighted. + Returns ------- self : object diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py index c45d5d917cb992d650ca896510cc520857831fae..d586e6302e540c8f8c1c476c8749ef0ca418d8ac 100644 --- a/sklearn/feature_extraction/hashing.py +++ b/sklearn/feature_extraction/hashing.py @@ -41,12 +41,6 @@ class FeatureHasher(BaseEstimator, TransformerMixin): The number of features (columns) in the output matrices. Small numbers of features are likely to cause hash collisions, but large numbers will cause larger coefficient dimensions in linear learners. - - dtype : numpy type, optional, default np.float64 - The type of feature values. Passed to scipy.sparse matrix constructors - as the dtype argument. Do not set this to bool, np.boolean or any - unsigned integer type. - input_type : string, optional, default "dict" Either "dict" (the default) to accept dictionaries over (feature_name, value); "pair" to accept pairs of (feature_name, value); @@ -56,7 +50,10 @@ class FeatureHasher(BaseEstimator, TransformerMixin): The feature_name is hashed to find the appropriate column for the feature. The value's sign might be flipped in the output (but see non_negative, below). - + dtype : numpy type, optional, default np.float64 + The type of feature values. Passed to scipy.sparse matrix constructors + as the dtype argument. Do not set this to bool, np.boolean or any + unsigned integer type. 
alternate_sign : boolean, optional, default True When True, an alternating sign is added to the features as to approximately conserve the inner product in the hashed space even for @@ -122,6 +119,10 @@ class FeatureHasher(BaseEstimator, TransformerMixin): This method doesn't do anything. It exists purely for compatibility with the scikit-learn transformer API. + Parameters + ---------- + X : array-like + Returns ------- self : FeatureHasher diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index e1740a6702166bec0504f354e504e1d567ec895c..a1a034cb9eb729b27ccbdf89897590b5f6c64a4c 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -185,15 +185,15 @@ def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None, verbose : bool or integer Amount of verbosity. - params : kwargs - keyword arguments passed to the coordinate descent solver. + return_n_iter : bool + whether to return the number of iterations or not. positive : bool, default False If set to True, forces coefficients to be positive. (Only allowed when ``y.ndim == 1``). - return_n_iter : bool - whether to return the number of iterations or not. + **params : kwargs + keyword arguments passed to the coordinate descent solver. Returns ------- @@ -336,9 +336,6 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, verbose : bool or integer Amount of verbosity. - params : kwargs - keyword arguments passed to the coordinate descent solver. - return_n_iter : bool whether to return the number of iterations or not. @@ -350,6 +347,9 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, Skip input validation checks, including the Gram matrix when provided assuming there are handled by the caller when check_input=False. + **params : kwargs + keyword arguments passed to the coordinate descent solver. + Returns ------- alphas : array, shape (n_alphas,) @@ -584,12 +584,6 @@ class ElasticNet(LinearModel, RegressorMixin): positive : bool, optional When set to ``True``, forces the coefficients to be positive. - selection : str, default 'cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator that selects a random feature to update. If int, random_state is the seed used by the random @@ -598,6 +592,12 @@ class ElasticNet(LinearModel, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + selection : str, default 'cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + Attributes ---------- coef_ : array, shape (n_features,) | (n_targets, n_features) @@ -809,15 +809,15 @@ class Lasso(ElasticNet): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - copy_X : boolean, optional, default True - If ``True``, X will be copied; else, it may be overwritten. - precompute : True | False | array-like, default=False Whether to use a precomputed Gram matrix to speed up calculations. 
If set to ``'auto'`` let us decide. The Gram matrix can also be passed as argument. For sparse input this option is always ``True`` to preserve sparsity. + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. + max_iter : int, optional The maximum number of iterations @@ -834,12 +834,6 @@ class Lasso(ElasticNet): positive : bool, optional When set to ``True``, forces the coefficients to be positive. - selection : str, default 'cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator that selects a random feature to update. If int, random_state is the seed used by the random @@ -848,6 +842,12 @@ class Lasso(ElasticNet): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + selection : str, default 'cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + Attributes ---------- coef_ : array, shape (n_features,) | (n_targets, n_features) @@ -1236,6 +1236,19 @@ class LassoCV(LinearModelCV, RegressorMixin): List of alphas where to compute the models. If ``None`` alphas are set automatically + fit_intercept : boolean, default True + whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (e.g. data is expected to be already centered). + + normalize : boolean, optional, default False + This parameter is ignored when ``fit_intercept`` is set to False. + If True, the regressors X will be normalized before regression by + subtracting the mean and dividing by the l2-norm. + If you wish to standardize, please use + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` + on an estimator with ``normalize=False``. + precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram @@ -1250,6 +1263,9 @@ class LassoCV(LinearModelCV, RegressorMixin): dual gap for optimality and continues until it is smaller than ``tol``. + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. + cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1274,12 +1290,6 @@ class LassoCV(LinearModelCV, RegressorMixin): positive : bool, optional If positive, restrict regression coefficients to be positive - selection : str, default 'cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator that selects a random feature to update. If int, random_state is the seed used by the random @@ -1288,21 +1298,11 @@ class LassoCV(LinearModelCV, RegressorMixin): RandomState instance used by `np.random`. 
Used when ``selection`` == 'random'. - fit_intercept : boolean, default True - whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (e.g. data is expected to be already centered). - - normalize : boolean, optional, default False - This parameter is ignored when ``fit_intercept`` is set to False. - If True, the regressors X will be normalized before regression by - subtracting the mean and dividing by the l2-norm. - If you wish to standardize, please use - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` - on an estimator with ``normalize=False``. - - copy_X : boolean, optional, default True - If ``True``, X will be copied; else, it may be overwritten. + selection : str, default 'cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. Attributes ---------- @@ -1392,6 +1392,19 @@ class ElasticNetCV(LinearModelCV, RegressorMixin): List of alphas where to compute the models. If None alphas are set automatically + fit_intercept : boolean + whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (e.g. data is expected to be already centered). + + normalize : boolean, optional, default False + This parameter is ignored when ``fit_intercept`` is set to False. + If True, the regressors X will be normalized before regression by + subtracting the mean and dividing by the l2-norm. + If you wish to standardize, please use + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` + on an estimator with ``normalize=False``. + precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram @@ -1420,6 +1433,9 @@ class ElasticNetCV(LinearModelCV, RegressorMixin): Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. + verbose : bool or integer Amount of verbosity. @@ -1430,12 +1446,6 @@ class ElasticNetCV(LinearModelCV, RegressorMixin): positive : bool, optional When set to ``True``, forces the coefficients to be positive. - selection : str, default 'cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator that selects a random feature to update. If int, random_state is the seed used by the random @@ -1444,21 +1454,11 @@ class ElasticNetCV(LinearModelCV, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. - fit_intercept : boolean - whether to calculate the intercept for this model. If set - to false, no intercept will be used in calculations - (e.g. data is expected to be already centered). - - normalize : boolean, optional, default False - This parameter is ignored when ``fit_intercept`` is set to False. - If True, the regressors X will be normalized before regression by - subtracting the mean and dividing by the l2-norm. 
- If you wish to standardize, please use - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` - on an estimator with ``normalize=False``. - - copy_X : boolean, optional, default True - If ``True``, X will be copied; else, it may be overwritten. + selection : str, default 'cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. Attributes ---------- @@ -1604,12 +1604,6 @@ class MultiTaskElasticNet(Lasso): When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. - selection : str, default 'cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator that selects a random feature to update. If int, random_state is the seed used by the random @@ -1618,6 +1612,12 @@ class MultiTaskElasticNet(Lasso): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + selection : str, default 'cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + Attributes ---------- intercept_ : array, shape (n_tasks,) @@ -1791,12 +1791,6 @@ class MultiTaskLasso(MultiTaskElasticNet): When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. - selection : str, default 'cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4 - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator that selects a random feature to update. If int, random_state is the seed used by the random @@ -1805,6 +1799,12 @@ class MultiTaskLasso(MultiTaskElasticNet): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + selection : str, default 'cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4 + Attributes ---------- coef_ : array, shape (n_tasks, n_features) @@ -1877,17 +1877,6 @@ class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin): Parameters ---------- - eps : float, optional - Length of the path. ``eps=1e-3`` means that - ``alpha_min / alpha_max = 1e-3``. - - alphas : array-like, optional - List of alphas where to compute the models. - If not provided, set automatically. - - n_alphas : int, optional - Number of alphas along the regularization path - l1_ratio : float or array of floats The ElasticNet mixing parameter, with 0 < l1_ratio <= 1. For l1_ratio = 1 the penalty is an L1/L2 penalty. 
For l1_ratio = 0 it @@ -1900,6 +1889,17 @@ class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin): (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7, .9, .95, .99, 1]`` + eps : float, optional + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, optional + Number of alphas along the regularization path + + alphas : array-like, optional + List of alphas where to compute the models. + If not provided, set automatically. + fit_intercept : boolean whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations @@ -1913,9 +1913,6 @@ class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - copy_X : boolean, optional, default True - If ``True``, X will be copied; else, it may be overwritten. - max_iter : int, optional The maximum number of iterations @@ -1939,6 +1936,9 @@ class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin): Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. + verbose : bool or integer Amount of verbosity. @@ -1947,12 +1947,6 @@ class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin): all the CPUs. Note that this is used only if multiple values for l1_ratio are given. - selection : str, default 'cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator that selects a random feature to update. If int, random_state is the seed used by the random @@ -1961,6 +1955,12 @@ class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + selection : str, default 'cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + Attributes ---------- intercept_ : array, shape (n_tasks,) @@ -2060,13 +2060,13 @@ class MultiTaskLassoCV(LinearModelCV, RegressorMixin): Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3``. + n_alphas : int, optional + Number of alphas along the regularization path + alphas : array-like, optional List of alphas where to compute the models. If not provided, set automatically. - n_alphas : int, optional - Number of alphas along the regularization path - fit_intercept : boolean whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations @@ -2080,9 +2080,6 @@ class MultiTaskLassoCV(LinearModelCV, RegressorMixin): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - copy_X : boolean, optional, default True - If ``True``, X will be copied; else, it may be overwritten. - max_iter : int, optional The maximum number of iterations. 
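The ``selection``/``random_state`` pair reordered throughout these coordinate-descent docstrings can be exercised with a small sketch (arbitrary toy regression data, for illustration only):

import numpy as np
from sklearn.linear_model import ElasticNet

rng = np.random.RandomState(0)
X = rng.randn(50, 20)
y = X[:, :3].sum(axis=1) + 0.01 * rng.randn(50)

# 'random' updates one randomly chosen coefficient per iteration and often
# converges faster than the default 'cyclic' sweep when tol is loose
model = ElasticNet(alpha=0.1, selection='random', random_state=0, tol=1e-3)
model.fit(X, y)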
@@ -2092,6 +2089,9 @@ class MultiTaskLassoCV(LinearModelCV, RegressorMixin): dual gap for optimality and continues until it is smaller than ``tol``. + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. + cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -2114,19 +2114,19 @@ class MultiTaskLassoCV(LinearModelCV, RegressorMixin): all the CPUs. Note that this is used only if multiple values for l1_ratio are given. - selection : str, default 'cyclic' - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator that selects a random feature to update. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Used when ``selection`` == - 'random'/ + 'random' + + selection : str, default 'cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. Attributes ---------- diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 2722dc5c6d26e9a4aab31139bd3bc256d397cc69..17b988b08e6c70fbf29d57d12baa76dd8fed9f91 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -54,24 +54,19 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, y : array, shape: (n_samples) Input targets. - positive : boolean (default=False) - Restrict coefficients to be >= 0. - When using this option together with method 'lasso' the model - coefficients will not converge to the ordinary-least-squares solution - for small values of alpha (neither will they when using method 'lar' - ..). Only coefficients up to the smallest alpha value (``alphas_[alphas_ > - 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso - algorithm are typically in congruence with the solution of the - coordinate descent lasso_path function. - - max_iter : integer, optional (default=500) - Maximum number of iterations to perform, set to infinity for no limit. + Xy : array-like, shape (n_samples,) or (n_samples, n_targets), \ + optional + Xy = np.dot(X.T, y) that can be precomputed. It is useful + only when the Gram matrix is precomputed. Gram : None, 'auto', array, shape: (n_features, n_features), optional Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram matrix is precomputed from the given X, if there are more samples than features. + max_iter : integer, optional (default=500) + Maximum number of iterations to perform, set to infinity for no limit. + alpha_min : float, optional (default=0) Minimum correlation along the path. It corresponds to the regularization parameter alpha parameter in the Lasso. @@ -80,14 +75,14 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, Specifies the returned model. Select ``'lar'`` for Least Angle Regression, ``'lasso'`` for the Lasso. 
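A short sketch of ``lars_path`` as documented in the reordered parameter list below, using the bundled diabetes data so the call is reproducible:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import lars_path

X, y = load_diabetes(return_X_y=True)
alphas, active, coefs = lars_path(X, y, method='lasso')
print(alphas.shape, coefs.shape)  # one column of coefficients per alpha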
+ copy_X : bool, optional (default=True) + If ``False``, ``X`` is overwritten. + eps : float, optional (default=``np.finfo(np.float).eps``) The machine-precision regularization in the computation of the Cholesky diagonal factors. Increase this for very ill-conditioned systems. - copy_X : bool, optional (default=True) - If ``False``, ``X`` is overwritten. - copy_Gram : bool, optional (default=True) If ``False``, ``Gram`` is overwritten. @@ -101,6 +96,16 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, return_n_iter : bool, optional (default=False) Whether to return the number of iterations. + positive : boolean (default=False) + Restrict coefficients to be >= 0. + When using this option together with method 'lasso' the model + coefficients will not converge to the ordinary-least-squares solution + for small values of alpha (neither will they when using method 'lar' + ..). Only coefficients up to the smallest alpha value + (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by the + stepwise Lars-Lasso algorithm are typically in congruence with the + solution of the coordinate descent lasso_path function. + Returns -------- alphas : array, shape: [n_alphas + 1] @@ -498,18 +503,11 @@ class Lars(LinearModel, RegressorMixin): Parameters ---------- - n_nonzero_coefs : int, optional - Target number of non-zero coefficients. Use ``np.inf`` for no limit. - fit_intercept : boolean Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). - positive : boolean (default=False) - Restrict coefficients to be >= 0. Be aware that you might want to - remove fit_intercept which is set True by default. - verbose : boolean or integer, optional Sets the verbosity amount @@ -526,8 +524,8 @@ class Lars(LinearModel, RegressorMixin): calculations. If set to ``'auto'`` let us decide. The Gram matrix can also be passed as argument. - copy_X : boolean, optional, default True - If ``True``, X will be copied; else, it may be overwritten. + n_nonzero_coefs : int, optional + Target number of non-zero coefficients. Use ``np.inf`` for no limit. eps : float, optional The machine-precision regularization in the computation of the @@ -536,12 +534,19 @@ class Lars(LinearModel, RegressorMixin): optimization-based algorithms, this parameter does not control the tolerance of the optimization. + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. + fit_path : boolean If True the full path is stored in the ``coef_path_`` attribute. If you compute the solution for a large problem or many targets, setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. + positive : boolean (default=False) + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Attributes ---------- alphas_ : array, shape (n_alphas + 1,) | list of n_targets such arrays @@ -731,16 +736,6 @@ class LassoLars(Lars): to false, no intercept will be used in calculations (e.g. data is expected to be already centered). - positive : boolean (default=False) - Restrict coefficients to be >= 0. Be aware that you might want to - remove fit_intercept which is set True by default. - Under the positive restriction the model coefficients will not converge - to the ordinary-least-squares solution for small values of alpha. 
- Only coefficients up to the smallest alpha value (``alphas_[alphas_ > - 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso - algorithm are typically in congruence with the solution of the - coordinate descent Lasso estimator. - verbose : boolean or integer, optional Sets the verbosity amount @@ -752,9 +747,6 @@ class LassoLars(Lars): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - copy_X : boolean, optional, default True - If True, X will be copied; else, it may be overwritten. - precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram @@ -770,12 +762,25 @@ class LassoLars(Lars): optimization-based algorithms, this parameter does not control the tolerance of the optimization. + copy_X : boolean, optional, default True + If True, X will be copied; else, it may be overwritten. + fit_path : boolean If ``True`` the full path is stored in the ``coef_path_`` attribute. If you compute the solution for a large problem or many targets, setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. + positive : boolean (default=False) + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Under the positive restriction the model coefficients will not converge + to the ordinary-least-squares solution for small values of alpha. + Only coefficients up to the smallest alpha value (``alphas_[alphas_ > + 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso + algorithm are typically in congruence with the solution of the + coordinate descent Lasso estimator. + Attributes ---------- alphas_ : array, shape (n_alphas + 1,) | list of n_targets such arrays @@ -976,13 +981,12 @@ class LarsCV(Lars): to false, no intercept will be used in calculations (e.g. data is expected to be already centered). - positive : boolean (default=False) - Restrict coefficients to be >= 0. Be aware that you might want to - remove fit_intercept which is set True by default. - verbose : boolean or integer, optional Sets the verbosity amount + max_iter : integer, optional + Maximum number of iterations to perform. + normalize : boolean, optional, default True This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by @@ -991,17 +995,11 @@ class LarsCV(Lars): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - copy_X : boolean, optional, default True - If ``True``, X will be copied; else, it may be overwritten. - - precompute : True | False | 'auto' + precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram matrix cannot be passed as argument since we will use only subsets of X. - max_iter : integer, optional - Maximum number of iterations to perform. - cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1029,6 +1027,13 @@ class LarsCV(Lars): Cholesky diagonal factors. Increase this for very ill-conditioned systems. + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. + + positive : boolean (default=False) + Restrict coefficients to be >= 0. 
Be aware that you might want to + remove fit_intercept which is set True by default. + Attributes ---------- @@ -1189,21 +1194,12 @@ class LassoLarsCV(LarsCV): to false, no intercept will be used in calculations (e.g. data is expected to be already centered). - positive : boolean (default=False) - Restrict coefficients to be >= 0. Be aware that you might want to - remove fit_intercept which is set True by default. - Under the positive restriction the model coefficients do not converge - to the ordinary-least-squares solution for small values of alpha. - Only coefficients up to the smallest alpha value (``alphas_[alphas_ > - 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso - algorithm are typically in congruence with the solution of the - coordinate descent Lasso estimator. - As a consequence using LassoLarsCV only makes sense for problems where - a sparse solution is expected and/or reached. - verbose : boolean or integer, optional Sets the verbosity amount + max_iter : integer, optional + Maximum number of iterations to perform. + normalize : boolean, optional, default True This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by @@ -1217,9 +1213,6 @@ class LassoLarsCV(LarsCV): calculations. If set to ``'auto'`` let us decide. The Gram matrix cannot be passed as argument since we will use only subsets of X. - max_iter : integer, optional - Maximum number of iterations to perform. - cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1250,6 +1243,18 @@ class LassoLarsCV(LarsCV): copy_X : boolean, optional, default True If True, X will be copied; else, it may be overwritten. + positive : boolean (default=False) + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Under the positive restriction the model coefficients do not converge + to the ordinary-least-squares solution for small values of alpha. + Only coefficients up to the smallest alpha value (``alphas_[alphas_ > + 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso + algorithm are typically in congruence with the solution of the + coordinate descent Lasso estimator. + As a consequence using LassoLarsCV only makes sense for problems where + a sparse solution is expected and/or reached. + Attributes ---------- coef_ : array, shape (n_features,) @@ -1340,18 +1345,6 @@ class LassoLarsIC(LassoLars): to false, no intercept will be used in calculations (e.g. data is expected to be already centered). - positive : boolean (default=False) - Restrict coefficients to be >= 0. Be aware that you might want to - remove fit_intercept which is set True by default. - Under the positive restriction the model coefficients do not converge - to the ordinary-least-squares solution for small values of alpha. - Only coefficients up to the smallest alpha value (``alphas_[alphas_ > - 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso - algorithm are typically in congruence with the solution of the - coordinate descent Lasso estimator. - As a consequence using LassoLarsIC only makes sense for problems where - a sparse solution is expected and/or reached. 
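For the ``LassoLarsIC`` docstring being reorganized here, a minimal usage sketch (the information criterion picks ``alpha_`` automatically; diabetes data is just for illustration):

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LassoLarsIC

X, y = load_diabetes(return_X_y=True)
model = LassoLarsIC(criterion='bic').fit(X, y)
print(model.alpha_)  # regularization strength selected by BIC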
- verbose : boolean or integer, optional Sets the verbosity amount @@ -1363,9 +1356,6 @@ class LassoLarsIC(LassoLars): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. - copy_X : boolean, optional, default True - If True, X will be copied; else, it may be overwritten. - precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram @@ -1382,6 +1372,21 @@ class LassoLarsIC(LassoLars): optimization-based algorithms, this parameter does not control the tolerance of the optimization. + copy_X : boolean, optional, default True + If True, X will be copied; else, it may be overwritten. + + positive : boolean (default=False) + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Under the positive restriction the model coefficients do not converge + to the ordinary-least-squares solution for small values of alpha. + Only coefficients up to the smallest alpha value (``alphas_[alphas_ > + 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso + algorithm are typically in congruence with the solution of the + coordinate descent Lasso estimator. + As a consequence using LassoLarsIC only makes sense for problems where + a sparse solution is expected and/or reached. + Attributes ---------- diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index 480bd80d6313014c32a72ae72dc034f4cda95f7b..8dbb1bec93d3daec93b8c1882d918b9fedc6ea65 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -425,8 +425,9 @@ def _multinomial_grad_hess(w, X, Y, alpha, sample_weight): def _check_solver_option(solver, multi_class, penalty, dual): if solver not in ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']: - raise ValueError("Logistic Regression supports only liblinear," - " newton-cg, lbfgs and sag solvers, got %s" % solver) + raise ValueError("Logistic Regression supports only liblinear, " + "newton-cg, lbfgs, sag and saga solvers, got %s" + % solver) if multi_class not in ['multinomial', 'ovr']: raise ValueError("multi_class should be either multinomial or " @@ -472,16 +473,16 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, y : array-like, shape (n_samples,) Input data, target values. + pos_class : int, None + The class with respect to which we perform a one-vs-all fit. + If None, then it is assumed that the given problem is binary. + Cs : int | array-like, shape (n_cs,) List of values for the regularization parameter or integer specifying the number of regularization parameters that should be used. In this case, the parameters will be chosen in a logarithmic scale between 1e-4 and 1e4. - pos_class : int, None - The class with respect to which we perform a one-vs-all fit. - If None, then it is assumed that the given problem is binary. - fit_intercept : bool Whether to fit an intercept for the model. In this case the shape of the returned array is (n_cs, n_features + 1). @@ -994,6 +995,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. + tol : float, default: 1e-4 + Tolerance for stopping criteria. + C : float, default: 1.0 Inverse of regularization strength; must be a positive float. 
Like in support vector machines, smaller values specify stronger @@ -1030,10 +1034,6 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionadded:: 0.17 *class_weight='balanced'* - max_iter : int, default: 100 - Useful only for the newton-cg, sag and lbfgs solvers. - Maximum number of iterations taken for the solvers to converge. - random_state : int, RandomState instance or None, optional, default: None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number @@ -1063,8 +1063,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionadded:: 0.19 SAGA solver. - tol : float, default: 1e-4 - Tolerance for stopping criteria. + max_iter : int, default: 100 + Useful only for the newton-cg, sag and lbfgs solvers. + Maximum number of iterations taken for the solvers to converge. multi_class : str, {'ovr', 'multinomial'}, default: 'ovr' Multiclass option can be either 'ovr' or 'multinomial'. If the option @@ -1391,35 +1392,21 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - class_weight : dict or 'balanced', optional - Weights associated with classes in the form ``{class_label: weight}``. - If not given, all classes are supposed to have weight one. - - The "balanced" mode uses the values of y to automatically adjust - weights inversely proportional to class frequencies in the input data - as ``n_samples / (n_classes * np.bincount(y))``. - - Note that these weights will be multiplied with sample_weight (passed - through the fit method) if sample_weight is specified. - - .. versionadded:: 0.17 - class_weight == 'balanced' - cv : integer or cross-validation generator The default cross-validation generator used is Stratified K-Folds. If an integer is provided, then it is the number of folds used. See the module :mod:`sklearn.model_selection` module for the list of possible cross-validation objects. - penalty : str, 'l1' or 'l2' - Used to specify the norm used in the penalization. The 'newton-cg', - 'sag' and 'lbfgs' solvers support only l2 penalties. - dual : bool Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. + penalty : str, 'l1' or 'l2' + Used to specify the norm used in the penalization. The 'newton-cg', + 'sag' and 'lbfgs' solvers support only l2 penalties. + scoring : string, callable, or None A string (see model evaluation documentation) or a scorer callable object / function with signature @@ -1456,6 +1443,20 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, max_iter : int, optional Maximum number of iterations of the optimization algorithm. + class_weight : dict or 'balanced', optional + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + .. versionadded:: 0.17 + class_weight == 'balanced' + n_jobs : int, optional Number of CPU cores used during the cross-validation loop. If given a value of -1, all cores are used. 
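The corrected solver list in ``_check_solver_option`` earlier in this file's diff can be observed directly; a hedged sketch (iris data only for illustration):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
try:
    LogisticRegression(solver='wrong_name').fit(X, y)
except ValueError as e:
    # "Logistic Regression supports only liblinear, newton-cg, lbfgs,
    #  sag and saga solvers, got wrong_name"
    print(e)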
@@ -1471,16 +1472,6 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, Otherwise the coefs, intercepts and C that correspond to the best scores across folds are averaged. - multi_class : str, {'ovr', 'multinomial'} - Multiclass option can be either 'ovr' or 'multinomial'. If the option - chosen is 'ovr', then a binary problem is fit for each label. Else - the loss minimised is the multinomial loss fit across - the entire probability distribution. Works only for the 'newton-cg', - 'sag' and 'lbfgs' solver. - - .. versionadded:: 0.18 - Stochastic Average Gradient descent solver for 'multinomial' case. - intercept_scaling : float, default 1. Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes @@ -1494,6 +1485,16 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. + multi_class : str, {'ovr', 'multinomial'} + Multiclass option can be either 'ovr' or 'multinomial'. If the option + chosen is 'ovr', then a binary problem is fit for each label. Else + the loss minimised is the multinomial loss fit across + the entire probability distribution. Works only for the 'newton-cg', + 'sag', 'saga' and 'lbfgs' solver. + + .. versionadded:: 0.18 + Stochastic Average Gradient descent solver for 'multinomial' case. + random_state : int, RandomState instance or None, optional, default None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index 2a03d31fee03556bb2133d2b38dbf3a2b627d337..8fcbd4e211af91249cc443e4dba280cc7e68de75 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -537,6 +537,8 @@ def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None, class OrthogonalMatchingPursuit(LinearModel, RegressorMixin): """Orthogonal Matching Pursuit model (OMP) + Read more in the :ref:`User Guide <omp>`. + Parameters ---------- n_nonzero_coefs : int, optional @@ -565,8 +567,6 @@ class OrthogonalMatchingPursuit(LinearModel, RegressorMixin): very large. Note that if you already have such matrices, you can pass them directly to the fit method. - Read more in the :ref:`User Guide <omp>`. - Attributes ---------- coef_ : array, shape (n_features,) or (n_targets, n_features) @@ -740,6 +740,8 @@ def _omp_path_residues(X_train, y_train, X_test, y_test, copy=True, class OrthogonalMatchingPursuitCV(LinearModel, RegressorMixin): """Cross-validated Orthogonal Matching Pursuit model (OMP) + Read more in the :ref:`User Guide <omp>`. + Parameters ---------- copy : bool, optional @@ -785,8 +787,6 @@ class OrthogonalMatchingPursuitCV(LinearModel, RegressorMixin): verbose : boolean or integer, optional Sets the verbosity amount - Read more in the :ref:`User Guide <omp>`. - Attributes ---------- intercept_ : float or array, shape (n_targets,) diff --git a/sklearn/linear_model/passive_aggressive.py b/sklearn/linear_model/passive_aggressive.py index ea5c37ad3d65f3bd4fb6992b02cb9b0c3d07a0a7..183049e4fdb55b882d43caad40f3bc354d3c9ce7 100644 --- a/sklearn/linear_model/passive_aggressive.py +++ b/sklearn/linear_model/passive_aggressive.py @@ -21,13 +21,6 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. 
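Since the OMP docstrings above now lead with the User Guide reference, a minimal usage sketch may help orient readers (diabetes data, illustrative only):

from sklearn.datasets import load_diabetes
from sklearn.linear_model import OrthogonalMatchingPursuit

X, y = load_diabetes(return_X_y=True)
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=5).fit(X, y)
print((omp.coef_ != 0).sum())  # at most 5 non-zero coefficients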
- n_iter : int, optional - The number of passes over the training data (aka epochs). - Defaults to None. Deprecated, will be removed in 0.21. - - .. versionchanged:: 0.19 - Deprecated - max_iter : int, optional The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the @@ -46,25 +39,25 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. - random_state : int, RandomState instance or None, optional, default=None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. - verbose : integer, optional The verbosity level + loss : string, optional + The loss function to be used: + hinge: equivalent to PA-I in the reference paper. + squared_hinge: equivalent to PA-II in the reference paper. + n_jobs : integer, optional The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. -1 means 'all CPUs'. Defaults to 1. - loss : string, optional - The loss function to be used: - hinge: equivalent to PA-I in the reference paper. - squared_hinge: equivalent to PA-II in the reference paper. + random_state : int, RandomState instance or None, optional, default=None + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. warm_start : bool, optional When set to True, reuse the solution of the previous call to fit as @@ -92,6 +85,13 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): .. versionadded:: 0.19 parameter *average* to use weights averaging in SGD + n_iter : int, optional + The number of passes over the training data (aka epochs). + Defaults to None. Deprecated, will be removed in 0.21. + + .. versionchanged:: 0.19 + Deprecated + Attributes ---------- coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\ @@ -217,21 +217,10 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): C : float Maximum step size (regularization). Defaults to 1.0. - epsilon : float - If the difference between the current prediction and the correct label - is below this threshold, the model is not updated. - fit_intercept : bool Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True. - n_iter : int, optional - The number of passes over the training data (aka epochs). - Defaults to None. Deprecated, will be removed in 0.21. - - .. versionchanged:: 0.19 - Deprecated - max_iter : int, optional The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the @@ -250,13 +239,6 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. - random_state : int, RandomState instance or None, optional, default=None - The seed of the pseudo random number generator to use when shuffling - the data. 
If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. - verbose : integer, optional The verbosity level @@ -266,6 +248,17 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): squared_epsilon_insensitive: equivalent to PA-II in the reference paper. + epsilon : float + If the difference between the current prediction and the correct label + is below this threshold, the model is not updated. + + random_state : int, RandomState instance or None, optional, default=None + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + warm_start : bool, optional When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. @@ -279,6 +272,13 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): .. versionadded:: 0.19 parameter *average* to use weights averaging in SGD + n_iter : int, optional + The number of passes over the training data (aka epochs). + Defaults to None. Deprecated, will be removed in 0.21. + + .. versionchanged:: 0.19 + Deprecated + Attributes ---------- coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\ diff --git a/sklearn/linear_model/perceptron.py b/sklearn/linear_model/perceptron.py index 0edfa28712644462015e51a6938b345ba352a293..28cb4561521f593141a3f12e182a3929b5839b4f 100644 --- a/sklearn/linear_model/perceptron.py +++ b/sklearn/linear_model/perceptron.py @@ -23,13 +23,6 @@ class Perceptron(BaseSGDClassifier): Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True. - n_iter : int, optional - The number of passes over the training data (aka epochs). - Defaults to None. Deprecated, will be removed in 0.21. - - .. versionchanged:: 0.19 - Deprecated - max_iter : int, optional The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the @@ -48,23 +41,23 @@ class Perceptron(BaseSGDClassifier): shuffle : bool, optional, default True Whether or not the training data should be shuffled after each epoch. - random_state : int, RandomState instance or None, optional, default None - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. - verbose : integer, optional The verbosity level + eta0 : double + Constant by which the updates are multiplied. Defaults to 1. + n_jobs : integer, optional The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. -1 means 'all CPUs'. Defaults to 1. - eta0 : double - Constant by which the updates are multiplied. Defaults to 1. + random_state : int, RandomState instance or None, optional, default None + The seed of the pseudo random number generator to use when shuffling + the data. 
If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. class_weight : dict, {class_label: weight} or "balanced" or None, optional Preset for the class_weight fit parameter. @@ -80,6 +73,13 @@ class Perceptron(BaseSGDClassifier): When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. + n_iter : int, optional + The number of passes over the training data (aka epochs). + Defaults to None. Deprecated, will be removed in 0.21. + + .. versionchanged:: 0.19 + Deprecated + Attributes ---------- coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\ diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py index 28a861f024bcd8cea7b89302069b7197977f000e..a84558823146e67527f11a96f49b2fdd4cac6751 100644 --- a/sklearn/linear_model/randomized_l1.py +++ b/sklearn/linear_model/randomized_l1.py @@ -253,16 +253,16 @@ class RandomizedLasso(BaseRandomizedLinearModel): optimization-based algorithms, this parameter does not control the tolerance of the optimization. - n_jobs : integer, optional - Number of CPUs to use during the resampling. If '-1', use - all the CPUs - random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. + n_jobs : integer, optional + Number of CPUs to use during the resampling. If '-1', use + all the CPUs + pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -433,6 +433,9 @@ class RandomizedLogisticRegression(BaseRandomizedLinearModel): selection_threshold : float, optional, default=0.25 The score above which features should be selected. + tol : float, optional, default=1e-3 + tolerance for stopping criteria of LogisticRegression + fit_intercept : boolean, optional, default=True whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations @@ -451,19 +454,16 @@ class RandomizedLogisticRegression(BaseRandomizedLinearModel): `preprocessing.StandardScaler` before calling `fit` on an estimator with `normalize=False`. - tol : float, optional, default=1e-3 - tolerance for stopping criteria of LogisticRegression - - n_jobs : integer, optional - Number of CPUs to use during the resampling. If '-1', use - all the CPUs - random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. + n_jobs : integer, optional + Number of CPUs to use during the resampling. If '-1', use + all the CPUs + pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. 
Reducing this number can be useful to avoid an diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index caf2f9eed64c27cf289fc9e8cf2254fe549c8b7e..3e584a78ad93afa1fb15902ed703a7207a0c09ce 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -218,11 +218,6 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', assumed to be specific to the targets. Hence they must correspond in number. - max_iter : int, optional - Maximum number of iterations for conjugate gradient solver. - For 'sparse_cg' and 'lsqr' solvers, the default value is determined - by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. - sample_weight : float or numpy array of shape [n_samples] Individual weights for each sample. If sample_weight is not None and solver='auto', the solver will be set to 'cholesky'. @@ -268,6 +263,12 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', .. versionadded:: 0.19 SAGA solver. + max_iter : int, optional + Maximum number of iterations for conjugate gradient solver. + For the 'sparse_cg' and 'lsqr' solvers, the default value is determined + by scipy.sparse.linalg. For 'sag' and saga solver, the default value is + 1000. + tol : float Precision of the solution. @@ -530,19 +531,11 @@ class Ridge(_BaseRidge, RegressorMixin): assumed to be specific to the targets. Hence they must correspond in number. - copy_X : boolean, optional, default True - If True, X will be copied; else, it may be overwritten. - fit_intercept : boolean Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). - max_iter : int, optional - Maximum number of iterations for conjugate gradient solver. - For 'sparse_cg' and 'lsqr' solvers, the default value is determined - by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. - normalize : boolean, optional, default False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by @@ -551,6 +544,17 @@ class Ridge(_BaseRidge, RegressorMixin): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + copy_X : boolean, optional, default True + If True, X will be copied; else, it may be overwritten. + + max_iter : int, optional + Maximum number of iterations for conjugate gradient solver. + For 'sparse_cg' and 'lsqr' solvers, the default value is determined + by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. + + tol : float + Precision of the solution. + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} Solver to use in the computational routines: @@ -589,9 +593,6 @@ class Ridge(_BaseRidge, RegressorMixin): .. versionadded:: 0.19 SAGA solver. - tol : float - Precision of the solution. - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number @@ -678,26 +679,11 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): Alpha corresponds to ``C^-1`` in other linear models such as LogisticRegression or LinearSVC. - class_weight : dict or 'balanced', optional - Weights associated with classes in the form ``{class_label: weight}``. - If not given, all classes are supposed to have weight one. 
- - The "balanced" mode uses the values of y to automatically adjust - weights inversely proportional to class frequencies in the input data - as ``n_samples / (n_classes * np.bincount(y))`` - - copy_X : boolean, optional, default True - If True, X will be copied; else, it may be overwritten. - fit_intercept : boolean Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). - max_iter : int, optional - Maximum number of iterations for conjugate gradient solver. - The default value is determined by scipy.sparse.linalg. - normalize : boolean, optional, default False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by @@ -706,6 +692,24 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + copy_X : boolean, optional, default True + If True, X will be copied; else, it may be overwritten. + + max_iter : int, optional + Maximum number of iterations for conjugate gradient solver. + The default value is determined by scipy.sparse.linalg. + + tol : float + Precision of the solution. + + class_weight : dict or 'balanced', optional + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} Solver to use in the computational routines: @@ -740,9 +744,6 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): .. versionadded:: 0.19 SAGA solver. - tol : float - Precision of the solution. - random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index ab8d9c69fc5c9cbea0fb1a1a8dc9af6ef3bffb4f..aba8c6c1363c0bd3e2ed508303b67a560eae4ae9 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -605,11 +605,14 @@ class SGDClassifier(BaseSGDClassifier): Parameters ---------- - loss : str, 'hinge', 'log', 'modified_huber', 'squared_hinge',\ - 'perceptron', or a regression loss: 'squared_loss', 'huber',\ - 'epsilon_insensitive', or 'squared_epsilon_insensitive' + loss : str, default: 'hinge' The loss function to be used. Defaults to 'hinge', which gives a linear SVM. + + The possible options are 'hinge', 'log', 'modified_huber', + 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + The 'log' loss gives logistic regression, a probabilistic classifier. 'modified_huber' is another smooth loss that brings tolerance to outliers as well as probability estimates. @@ -637,13 +640,6 @@ class SGDClassifier(BaseSGDClassifier): Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True. - n_iter : int, optional - The number of passes over the training data (aka epochs). - Defaults to None. Deprecated, will be removed in 0.21. - - .. 
versionchanged:: 0.19 - Deprecated - max_iter : int, optional The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the @@ -663,13 +659,6 @@ class SGDClassifier(BaseSGDClassifier): Whether or not the training data should be shuffled after each epoch. Defaults to True. - random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. - verbose : integer, optional The verbosity level @@ -686,6 +675,13 @@ class SGDClassifier(BaseSGDClassifier): multi-class problems) computation. -1 means 'all CPUs'. Defaults to 1. + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + learning_rate : string, optional The learning rate schedule: @@ -724,6 +720,13 @@ class SGDClassifier(BaseSGDClassifier): average. So ``average=10`` will begin averaging after seeing 10 samples. + n_iter : int, optional + The number of passes over the training data (aka epochs). + Defaults to None. Deprecated, will be removed in 0.21. + + .. versionchanged:: 0.19 + Deprecated + Attributes ---------- coef_ : array, shape (1, n_features) if n_classes == 2 else (n_classes,\ @@ -1172,15 +1175,17 @@ class SGDRegressor(BaseSGDRegressor): Parameters ---------- - loss : str, 'squared_loss', 'huber', 'epsilon_insensitive', \ - or 'squared_epsilon_insensitive' - The loss function to be used. Defaults to 'squared_loss' which refers - to the ordinary least squares fit. 'huber' modifies 'squared_loss' to - focus less on getting outliers correct by switching from squared to - linear loss past a distance of epsilon. 'epsilon_insensitive' ignores - errors less than epsilon and is linear past that; this is the loss - function used in SVR. 'squared_epsilon_insensitive' is the same but - becomes squared loss past a tolerance of epsilon. + loss : str, default: 'squared_loss' + The loss function to be used. The possible values are 'squared_loss', + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' + + The 'squared_loss' refers to the ordinary least squares fit. + 'huber' modifies 'squared_loss' to focus less on getting outliers + correct by switching from squared to linear loss past a distance of + epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is + linear past that; this is the loss function used in SVR. + 'squared_epsilon_insensitive' is the same but becomes squared loss past + a tolerance of epsilon. penalty : str, 'none', 'l2', 'l1', or 'elasticnet' The penalty (aka regularization term) to be used. Defaults to 'l2' @@ -1201,13 +1206,6 @@ class SGDRegressor(BaseSGDRegressor): Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True. - n_iter : int, optional - The number of passes over the training data (aka epochs). - Defaults to None. Deprecated, will be removed in 0.21. - - .. 
versionchanged:: 0.19 - Deprecated - max_iter : int, optional The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the @@ -1227,13 +1225,6 @@ class SGDRegressor(BaseSGDRegressor): Whether or not the training data should be shuffled after each epoch. Defaults to True. - random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. - verbose : integer, optional The verbosity level. @@ -1245,6 +1236,13 @@ class SGDRegressor(BaseSGDRegressor): For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold. + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + learning_rate : string, optional The learning rate schedule: @@ -1271,6 +1269,13 @@ class SGDRegressor(BaseSGDRegressor): average. So ``average=10`` will begin averaging after seeing 10 samples. + n_iter : int, optional + The number of passes over the training data (aka epochs). + Defaults to None. Deprecated, will be removed in 0.21. + + .. versionchanged:: 0.19 + Deprecated + Attributes ---------- coef_ : array, shape (n_features,) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 89f09255cad39d8d7bf533d9ce9ddb3217d7c20e..6a7f71794648142cb38fd7cb02430162af5dad57 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -145,8 +145,8 @@ def test_check_solver_option(): X, y = iris.data, iris.target for LR in [LogisticRegression, LogisticRegressionCV]: - msg = ("Logistic Regression supports only liblinear, newton-cg, lbfgs" - " and sag solvers, got wrong_name") + msg = ('Logistic Regression supports only liblinear, newton-cg, ' + 'lbfgs, sag and saga solvers, got wrong_name') lr = LR(solver="wrong_name") assert_raise_message(ValueError, msg, lr.fit, X, y) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 8d60686808e1a875bc9f97e65cae16bdf1d125a4..a330b7da7f8561bb0b0e02e03d438cbd6d958b31 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -178,15 +178,15 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. + norm_laplacian : bool, optional, default=True + If True, then compute normalized Laplacian. + drop_first : bool, optional, default=True Whether to drop the first eigenvector. For spectral embedding, this should be True as the first eigenvector should be constant vector for connected graph, but for spectral clustering, this should be kept as False to retain the first eigenvector. - norm_laplacian : bool, optional, default=True - If True, then compute normalized Laplacian. 
- Returns ------- embedding : array, shape=(n_samples, n_components) @@ -343,19 +343,6 @@ class SpectralEmbedding(BaseEstimator): n_components : integer, default: 2 The dimension of the projected subspace. - eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} - The eigenvalue decomposition strategy to use. AMG requires pyamg - to be installed. It can be faster on very large, sparse problems, - but may also lead to instabilities. - - random_state : int, RandomState instance or None, optional, default: None - A pseudo random number generator used for the initialization of the - lobpcg eigenvectors. If int, random_state is the seed used by the - random number generator; If RandomState instance, random_state is the - random number generator; If None, the random number generator is the - RandomState instance used by `np.random`. Used when ``solver`` == - 'amg'. - affinity : string or callable, default : "nearest_neighbors" How to construct the affinity matrix. - 'nearest_neighbors' : construct affinity matrix by knn graph @@ -368,6 +355,19 @@ class SpectralEmbedding(BaseEstimator): gamma : float, optional, default : 1/n_features Kernel coefficient for rbf kernel. + random_state : int, RandomState instance or None, optional, default: None + A pseudo random number generator used for the initialization of the + lobpcg eigenvectors. If int, random_state is the seed used by the + random number generator; If RandomState instance, random_state is the + random number generator; If None, the random number generator is the + RandomState instance used by `np.random`. Used when ``solver`` == + 'amg'. + + eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems, + but may also lead to instabilities. + n_neighbors : int, default : max(n_samples/10 , 1) Number of nearest neighbors for nearest_neighbors graph building. diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index a9f2932fa94ff05f285a037df99eeb6333c9a2c3..c79770de4ab8bbd8e35e04c416f921d34af66b0a 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -813,6 +813,9 @@ def fowlkes_mallows_score(labels_true, labels_pred, sparse=False): labels_pred : array, shape = (``n_samples``, ) A clustering of the data into disjoint subsets. + sparse : bool + Compute contingency matrix internally with sparse matrix. + Returns ------- score : float diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 0b63653672f51f7880a7ab8c907668efdaa8cf60..4e82328f6fc53f9831526d1fbbe1fe8fa5a8d0c8 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -273,16 +273,14 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", Parameters ---------- - X, Y : {array-like, sparse matrix} - Arrays containing points. Respective shapes (n_samples1, n_features) - and (n_samples2, n_features) + X : {array-like, sparse matrix}, shape (n_samples1, n_features) + Array containing points. - batch_size : integer - To reduce memory consumption over the naive solution, data are - processed in batches, comprising batch_size rows of X and - batch_size rows of Y. The default value is quite conservative, but - can be changed for fine-tuning. The larger the number, the larger the - memory usage. + Y : {array-like, sparse matrix}, shape (n_samples2, n_features) + Arrays containing points. 
+ + axis : int, optional, default 1 + Axis along which the argmin and distances are to be computed. metric : string or callable, default 'euclidean' metric to use for distance computation. Any metric from scikit-learn @@ -310,12 +308,16 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", See the documentation for scipy.spatial.distance for details on these metrics. + batch_size : integer + To reduce memory consumption over the naive solution, data are + processed in batches, comprising batch_size rows of X and + batch_size rows of Y. The default value is quite conservative, but + can be changed for fine-tuning. The larger the number, the larger the + memory usage. + metric_kwargs : dict, optional Keyword arguments to pass to specified metric function. - axis : int, optional, default 1 - Axis along which the argmin and distances are to be computed. - Returns ------- argmin : numpy.ndarray @@ -408,12 +410,8 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", Arrays containing points. Respective shapes (n_samples1, n_features) and (n_samples2, n_features) - batch_size : integer - To reduce memory consumption over the naive solution, data are - processed in batches, comprising batch_size rows of X and - batch_size rows of Y. The default value is quite conservative, but - can be changed for fine-tuning. The larger the number, the larger the - memory usage. + axis : int, optional, default 1 + Axis along which the argmin and distances are to be computed. metric : string or callable metric to use for distance computation. Any metric from scikit-learn @@ -441,12 +439,16 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", See the documentation for scipy.spatial.distance for details on these metrics. + batch_size : integer + To reduce memory consumption over the naive solution, data are + processed in batches, comprising batch_size rows of X and + batch_size rows of Y. The default value is quite conservative, but + can be changed for fine-tuning. The larger the number, the larger the + memory usage. + metric_kwargs : dict keyword arguments to pass to specified metric function. - axis : int, optional, default 1 - Axis along which the argmin and distances are to be computed. - Returns ------- argmin : numpy.ndarray @@ -1348,6 +1350,9 @@ def pairwise_kernels(X, Y=None, metric="linear", filter_params=False, should take two arrays from X as input and return a value indicating the distance between them. + filter_params : boolean + Whether to filter invalid parameters or not. + n_jobs : int The number of jobs to use for the computation. This works by breaking down the pairwise matrix into n_jobs even slices and computing them in @@ -1358,9 +1363,6 @@ def pairwise_kernels(X, Y=None, metric="linear", filter_params=False, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. - filter_params : boolean - Whether to filter invalid parameters or not. - **kwds : optional keyword parameters Any further parameters are passed directly to the kernel function. 
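The reordered `pairwise_distances_argmin_min` docstring above now lists `X`, `Y`, `axis`, `metric`, `batch_size` and `metric_kwargs` in signature order. A minimal usage sketch of that signature, assuming a scikit-learn of this era where `batch_size` is still a public keyword (the toy arrays are made up for illustration):

```python
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

X = np.array([[0., 0.], [2., 2.]])
Y = np.array([[0., 1.], [2., 2.], [5., 5.]])

# With the documented default axis=1, the argmin runs over the rows of Y:
# for each row of X we get the index of the closest row of Y and the
# corresponding distance.
argmin, distances = pairwise_distances_argmin_min(
    X, Y, axis=1, metric='euclidean', batch_size=500)
print(argmin)     # [0 1]
print(distances)  # [ 1.  0.]
```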
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index 938367706da10c45f84726f49a29fd7f4919b0b3..2b54896781929f4c9a8a74bf77d2d04b232baf42 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -823,8 +823,8 @@
     -------
     score : float
 
-    Example
-    -------
+    Examples
+    --------
     >>> y_true = [1, 0, 2]
     >>> y_score = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
     >>> ndcg_score(y_true, y_score, k=2)
diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py
index 1d16a9dcb01ac07cb50dffd6d6045bb0b58a3cdd..7d213ae39aaed5d8d7e6e17baca892025ef02928 100644
--- a/sklearn/metrics/scorer.py
+++ b/sklearn/metrics/scorer.py
@@ -209,6 +209,18 @@ class _ThresholdScorer(_BaseScorer):
 
 
 def get_scorer(scoring):
+    """Get a scorer from string
+
+    Parameters
+    ----------
+    scoring : str | callable
+        scoring method as string. If callable it is returned as is.
+
+    Returns
+    -------
+    scorer : callable
+        The scorer.
+    """
     valid = True
     if isinstance(scoring, six.string_types):
         try:
diff --git a/sklearn/neighbors/approximate.py b/sklearn/neighbors/approximate.py
index b4e0a60085ce5e48baa5e799ba2cb0b2817fe9c7..ac59305e123783c8c5e6988855e3ab8d464a0797 100644
--- a/sklearn/neighbors/approximate.py
+++ b/sklearn/neighbors/approximate.py
@@ -130,9 +130,9 @@ class LSHForest(BaseEstimator, KNeighborsMixin, RadiusNeighborsMixin):
     n_estimators : int (default = 10)
         Number of trees in the LSH Forest.
 
-    min_hash_match : int (default = 4)
-        lowest hash length to be searched when candidate selection is
-        performed for nearest neighbors.
+    radius : float, optional (default = 1.0)
+        Radius from the data point to its neighbors. This is the parameter
+        space to use by default for the :meth:`radius_neighbors` queries.
 
     n_candidates : int (default = 10)
         Minimum number of candidates evaluated per estimator, assuming enough
@@ -142,9 +142,9 @@
         Number of neighbors to be returned from query function when it is not
         provided to the :meth:`kneighbors` method.
 
-    radius : float, optinal (default = 1.0)
-        Radius from the data point to its neighbors. This is the parameter
-        space to use by default for the :meth`radius_neighbors` queries.
+    min_hash_match : int (default = 4)
+        lowest hash length to be searched when candidate selection is
+        performed for nearest neighbors.
 
     radius_cutoff_ratio : float, optional (default = 0.9)
         A value ranges from 0 to 1. Radius neighbors will be searched until
diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py
index 1eb5ec72c096c7ef3ef36444a543f1623b25c654..fb0dc8ad15e3fb6a51fcf38768a27bdd7fcf40c2 100644
--- a/sklearn/neighbors/classification.py
+++ b/sklearn/neighbors/classification.py
@@ -61,17 +61,17 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin,
         required to store the tree. The optimal value depends on the
         nature of the problem.
 
+    p : integer, optional (default = 2)
+        Power parameter for the Minkowski metric. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+
     metric : string or callable, default 'minkowski'
         the distance metric to use for the tree. The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
         metric. See the documentation of the DistanceMetric class for a
         list of available metrics.
-    p : integer, optional (default = 2)
-        Power parameter for the Minkowski metric. When p = 1, this is
-        equivalent to using manhattan_distance (l1), and euclidean_distance
-        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
-
     metric_params : dict, optional (default = None)
         Additional keyword arguments for the metric function.
 
@@ -268,17 +268,17 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,
         required to store the tree. The optimal value depends on the
         nature of the problem.
 
+    p : integer, optional (default = 2)
+        Power parameter for the Minkowski metric. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+
     metric : string or callable, default 'minkowski'
         the distance metric to use for the tree. The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
         metric. See the documentation of the DistanceMetric class for a
         list of available metrics.
 
-    p : integer, optional (default = 2)
-        Power parameter for the Minkowski metric. When p = 1, this is
-        equivalent to using manhattan_distance (l1), and euclidean_distance
-        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
-
     outlier_label : int, optional (default = None)
         Label, which is given for outlier samples (samples with no
         neighbors on given radius).
diff --git a/sklearn/neighbors/graph.py b/sklearn/neighbors/graph.py
index f04596584f2bf5b1377b91b344270bec7592ab73..61a4561430cad1c2acca53b1256a44395df38569 100644
--- a/sklearn/neighbors/graph.py
+++ b/sklearn/neighbors/graph.py
@@ -57,11 +57,6 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski',
         The default distance is 'euclidean' ('minkowski' metric with the p
         param equal to 2.)
 
-    include_self : bool, default=False.
-        Whether or not to mark each sample as the first nearest neighbor to
-        itself. If `None`, then True is used for mode='connectivity' and False
-        for mode='distance' as this will preserve backwards compatibilty.
-
     p : int, default 2
         Power parameter for the Minkowski metric. When p = 1, this is
         equivalent to using manhattan_distance (l1), and euclidean_distance
@@ -70,6 +65,11 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski',
     metric_params : dict, optional
         additional keyword arguments for the metric function.
 
+    include_self : bool, default=False
+        Whether or not to mark each sample as the first nearest neighbor to
+        itself. If `None`, then True is used for mode='connectivity' and False
+        for mode='distance' as this will preserve backwards compatibility.
+
     n_jobs : int, optional (default = 1)
         The number of parallel jobs to run for neighbors search.
         If ``-1``, then the number of jobs is set to the number of CPU cores.
@@ -132,11 +132,6 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski',
         gives a list of available metrics.
         The default distance is 'euclidean' ('minkowski' metric with the param
         equal to 2.)
 
-    include_self : bool, default=False
-        Whether or not to mark each sample as the first nearest neighbor to
-        itself. If `None`, then True is used for mode='connectivity' and False
-        for mode='distance' as this will preserve backwards compatibilty.
-
     p : int, default 2
         Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
@@ -145,6 +140,11 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski',
     metric_params : dict, optional
         additional keyword arguments for the metric function.
 
+    include_self : bool, default=False
+        Whether or not to mark each sample as the first nearest neighbor to
+        itself. If `None`, then True is used for mode='connectivity' and False
+        for mode='distance' as this will preserve backwards compatibility.
+
     n_jobs : int, optional (default = 1)
         The number of parallel jobs to run for neighbors search.
         If ``-1``, then the number of jobs is set to the number of CPU cores.
diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py
index 605032106abf2310962addea6211d3fbcbb1a05e..3559d76cf898a7a1d1fa810baeb8c40ba047fee2 100644
--- a/sklearn/neighbors/lof.py
+++ b/sklearn/neighbors/lof.py
@@ -55,12 +55,6 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin):
         required to store the tree. The optimal value depends on the
         nature of the problem.
 
-    p : integer, optional (default=2)
-        Parameter for the Minkowski metric from
-        :ref:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this is
-        equivalent to using manhattan_distance (l1), and euclidean_distance
-        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
-
     metric : string or callable, default 'minkowski'
         metric used for the distance computation. Any metric from scikit-learn
         or scipy.spatial.distance can be used.
@@ -89,6 +83,12 @@
         metrics:
         http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
 
+    p : integer, optional (default=2)
+        Parameter for the Minkowski metric from
+        :ref:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+
     metric_params : dict, optional (default=None)
         Additional keyword arguments for the metric function.
 
diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py
index 78ee35cae279b5f22e974f6d6da29abba463f681..1180850b8d21a1e19dc5405a7e44d26c129a25fc 100644
--- a/sklearn/neighbors/regression.py
+++ b/sklearn/neighbors/regression.py
@@ -63,17 +63,17 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin,
         required to store the tree. The optimal value depends on the
         nature of the problem.
 
+    p : integer, optional (default = 2)
+        Power parameter for the Minkowski metric. When p = 1, this is
+        equivalent to using manhattan_distance (l1), and euclidean_distance
+        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+
     metric : string or callable, default 'minkowski'
         the distance metric to use for the tree. The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
         metric. See the documentation of the DistanceMetric class for a
         list of available metrics.
 
-    p : integer, optional (default = 2)
-        Power parameter for the Minkowski metric. When p = 1, this is
-        equivalent to using manhattan_distance (l1), and euclidean_distance
-        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
-
     metric_params : dict, optional (default = None)
         Additional keyword arguments for the metric function.
 
@@ -213,17 +213,17 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin,
         required to store the tree. The optimal value depends on the
         nature of the problem.
+ p : integer, optional (default = 2) + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + metric : string or callable, default 'minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. - p : integer, optional (default = 2) - Power parameter for the Minkowski metric. When p = 1, this is - equivalent to using manhattan_distance (l1), and euclidean_distance - (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. diff --git a/sklearn/neighbors/unsupervised.py b/sklearn/neighbors/unsupervised.py index cf7bf82d17fbdd1b2a7aac4a39cfd4357db0cbac..f0a904caaca32da3e8351058cb83ee7b46eb1bc9 100644 --- a/sklearn/neighbors/unsupervised.py +++ b/sklearn/neighbors/unsupervised.py @@ -39,12 +39,6 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, required to store the tree. The optimal value depends on the nature of the problem. - p : integer, optional (default = 2) - Parameter for the Minkowski metric from - sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is - equivalent to using manhattan_distance (l1), and euclidean_distance - (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : string or callable, default 'minkowski' metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. @@ -71,6 +65,12 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, See the documentation for scipy.spatial.distance for details on these metrics. + p : integer, optional (default = 2) + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index d4adfd9107f6e6592ac18e8f01373287fa128dd9..af1eca3b201d57f1d0345ad1c038af4d30af3f79 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -751,6 +751,15 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin): Only used when ``solver='sgd'``. + learning_rate_init : double, optional, default 0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + + power_t : double, optional, default 0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + max_iter : int, optional, default 200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. For stochastic @@ -758,31 +767,22 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin): (how many times each data point will be used), not the number of gradient steps. + shuffle : bool, optional, default True + Whether to shuffle samples in each iteration. 
Only used when + solver='sgd' or 'adam'. + random_state : int, RandomState instance or None, optional, default None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - shuffle : bool, optional, default True - Whether to shuffle samples in each iteration. Only used when - solver='sgd' or 'adam'. - tol : float, optional, default 1e-4 Tolerance for the optimization. When the loss or score is not improving by at least tol for two consecutive iterations, unless `learning_rate` is set to 'adaptive', convergence is considered to be reached and training stops. - learning_rate_init : double, optional, default 0.001 - The initial learning rate used. It controls the step-size - in updating the weights. Only used when solver='sgd' or 'adam'. - - power_t : double, optional, default 0.5 - The exponent for inverse scaling learning rate. - It is used in updating effective learning rate when the learning_rate - is set to 'invscaling'. Only used when solver='sgd'. - verbose : bool, optional, default False Whether to print progress messages to stdout. @@ -1128,6 +1128,15 @@ class MLPRegressor(BaseMultilayerPerceptron, RegressorMixin): Only used when solver='sgd'. + learning_rate_init : double, optional, default 0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + + power_t : double, optional, default 0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + max_iter : int, optional, default 200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. For stochastic @@ -1135,31 +1144,22 @@ class MLPRegressor(BaseMultilayerPerceptron, RegressorMixin): (how many times each data point will be used), not the number of gradient steps. + shuffle : bool, optional, default True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. + random_state : int, RandomState instance or None, optional, default None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - shuffle : bool, optional, default True - Whether to shuffle samples in each iteration. Only used when - solver='sgd' or 'adam'. - tol : float, optional, default 1e-4 Tolerance for the optimization. When the loss or score is not improving by at least tol for two consecutive iterations, unless `learning_rate` is set to 'adaptive', convergence is considered to be reached and training stops. - learning_rate_init : double, optional, default 0.001 - The initial learning rate used. It controls the step-size - in updating the weights. Only used when solver='sgd' or 'adam'. - - power_t : double, optional, default 0.5 - The exponent for inverse scaling learning rate. - It is used in updating effective learning rate when the learning_rate - is set to 'invscaling'. Only used when solver='sgd'. - verbose : bool, optional, default False Whether to print progress messages to stdout. 
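The MLP hunks above move `learning_rate_init`, `power_t` and `shuffle` into signature order without changing their meaning. A short, hedged sketch of how these options interact (the iris dataset is illustrative only, not part of the patch):

```python
from sklearn.datasets import load_iris
from sklearn.neural_network import MLPClassifier

X, y = load_iris(return_X_y=True)

# learning_rate_init and shuffle only apply when solver='sgd' or 'adam';
# power_t is only used by the 'invscaling' learning-rate schedule.
clf = MLPClassifier(solver='sgd', learning_rate='invscaling',
                    learning_rate_init=0.001, power_t=0.5,
                    shuffle=True, max_iter=200, random_state=0)
clf.fit(X, y)
```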
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 252fae9bfcb0f477696af2a78be4ff2f302554c2..eb19494c83b756a4786c8a46c93d3ead3fa31d69 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -394,6 +394,9 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): Parameters ---------- + X : array-like, shape (n_samples, n_features) + The data. + feature_range : tuple (min, max), default=(0, 1) Desired range of transformed data. @@ -465,6 +468,12 @@ class StandardScaler(BaseEstimator, TransformerMixin): Parameters ---------- + copy : boolean, optional, default True + If False, try to avoid a copy and do inplace scaling instead. + This is not guaranteed to always work inplace; e.g. if the data is + not a NumPy array or scipy.sparse CSR matrix, a copy may still be + returned. + with_mean : boolean, True by default If True, center the data before scaling. This does not work (and will raise an exception) when attempted on @@ -476,12 +485,6 @@ class StandardScaler(BaseEstimator, TransformerMixin): If True, scale the data to unit variance (or equivalently, unit standard deviation). - copy : boolean, optional, default True - If False, try to avoid a copy and do inplace scaling instead. - This is not guaranteed to always work inplace; e.g. if the data is - not a NumPy array or scipy.sparse CSR matrix, a copy may still be - returned. - Attributes ---------- scale_ : ndarray, shape (n_features,) @@ -627,6 +630,8 @@ class StandardScaler(BaseEstimator, TransformerMixin): y : (ignored) .. deprecated:: 0.19 This parameter will be removed in 0.21. + copy : bool, optional (default: None) + Copy the input X or not. """ if not isinstance(y, string_types) or y != 'deprecated': warnings.warn("The parameter y on transform() is " @@ -660,6 +665,13 @@ class StandardScaler(BaseEstimator, TransformerMixin): ---------- X : array-like, shape [n_samples, n_features] The data used to scale along the features axis. + copy : bool, optional (default: None) + Copy the input X or not. + + Returns + ------- + X_tr : array-like, shape [n_samples, n_features] + Transformed array. """ check_is_fitted(self, 'scale_') @@ -844,6 +856,9 @@ def maxabs_scale(X, axis=0, copy=True): Parameters ---------- + X : array-like, shape (n_samples, n_features) + The data. + axis : int (0 by default) axis used to scale along. If 0, independently scale each feature, otherwise (if 1) scale each sample. @@ -1241,6 +1256,16 @@ class PolynomialFeatures(BaseEstimator, TransformerMixin): def fit(self, X, y=None): """ Compute number of output features. + + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data. + + Returns + ------- + self : instance """ n_samples, n_features = check_array(X).shape combinations = self._combinations(n_features, self.degree, @@ -1435,6 +1460,10 @@ class Normalizer(BaseEstimator, TransformerMixin): This method is just there to implement the usual API and hence work in pipelines. + + Parameters + ---------- + X : array-like """ X = check_array(X, accept_sparse='csr') return self @@ -1450,6 +1479,8 @@ class Normalizer(BaseEstimator, TransformerMixin): y : (ignored) .. deprecated:: 0.19 This parameter will be removed in 0.21. + copy : bool, optional (default: None) + Copy the input X or not. 
""" if not isinstance(y, string_types) or y != 'deprecated': warnings.warn("The parameter y on transform() is " @@ -1554,6 +1585,10 @@ class Binarizer(BaseEstimator, TransformerMixin): This method is just there to implement the usual API and hence work in pipelines. + + Parameters + ---------- + X : array-like """ check_array(X, accept_sparse='csr') return self @@ -1570,6 +1605,8 @@ class Binarizer(BaseEstimator, TransformerMixin): y : (ignored) .. deprecated:: 0.19 This parameter will be removed in 0.21. + copy : bool + Copy the input X or not. """ if not isinstance(y, string_types) or y != 'deprecated': warnings.warn("The parameter y on transform() is " @@ -1933,6 +1970,11 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): Equivalent to self.fit(X).transform(X), but more convenient and more efficient. See fit for the parameters, transform for the return value. + + Parameters + ---------- + X : array-like, shape [n_samples, n_feature] + Input array of type int. """ return _transform_selected(X, self._fit_transform, self.categorical_features, copy=True) @@ -2347,6 +2389,8 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): def inverse_transform(self, X): """Back-projection to the original space. + Parameters + ---------- X : ndarray or sparse matrix, shape (n_samples, n_features) The data used to scale along the features axis. If a sparse matrix is provided, it will be converted into a sparse diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index ab0dd64bf81ea7fde43a07ce1bd8f1a74157a727..5e35efe82f91410aec8fa6092dbb890d441e8958 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -91,6 +91,9 @@ class BaseLabelPropagation(six.with_metaclass(ABCMeta, BaseEstimator, gamma : float Parameter for rbf kernel + n_neighbors : integer > 0 + Parameter for knn kernel + alpha : float Clamping factor @@ -101,9 +104,6 @@ class BaseLabelPropagation(six.with_metaclass(ABCMeta, BaseEstimator, Convergence tolerance: threshold to consider the system at steady state - n_neighbors : integer > 0 - Parameter for knn kernel - n_jobs : int, optional (default = 1) The number of parallel jobs to run. If ``-1``, then the number of jobs is set to the number of CPU cores. @@ -331,6 +331,10 @@ class LabelPropagation(BaseLabelPropagation): Convergence tolerance: threshold to consider the system at steady state + n_jobs : int, optional (default = 1) + The number of parallel jobs to run. + If ``-1``, then the number of jobs is set to the number of CPU cores. + Attributes ---------- X_ : array, shape = [n_samples, n_features] diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index 252b1d07bb8d2ba771c93454c054e7bcd26ac709..ad71aa678a8cfaf139eabb7209c6ad7f794d43ca 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -811,7 +811,6 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, generator; If None, the random number generator is the RandomState instance used by `np.random`. - multi_class : str, {'ovr', 'crammer_singer'} `ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer` optimizes a joint objective over all classes. 
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index e73da83cbfeb312d35653bdbdd84f8b71e51d9fa..4833042827361594e2612996b00758d8ca6f83ee 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -26,19 +26,16 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, Parameters ---------- - C : float, optional (default=1.0) - Penalty parameter C of the error term. + penalty : string, 'l1' or 'l2' (default='l2') + Specifies the norm used in the penalization. The 'l2' + penalty is the standard used in SVC. The 'l1' leads to ``coef_`` + vectors that are sparse. loss : string, 'hinge' or 'squared_hinge' (default='squared_hinge') Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. - penalty : string, 'l1' or 'l2' (default='l2') - Specifies the norm used in the penalization. The 'l2' - penalty is the standard used in SVC. The 'l1' leads to ``coef_`` - vectors that are sparse. - dual : bool, (default=True) Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. @@ -46,6 +43,9 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, tol : float, optional (default=1e-4) Tolerance for stopping criteria. + C : float, optional (default=1.0) + Penalty parameter C of the error term. + multi_class : string, 'ovr' or 'crammer_singer' (default='ovr') Determines the multi-class strategy if `y` contains more than two classes. diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py new file mode 100644 index 0000000000000000000000000000000000000000..584c4f2e7ceed82bab9a3e17ff2bab1ffc648560 --- /dev/null +++ b/sklearn/tests/test_docstring_parameters.py @@ -0,0 +1,149 @@ +# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr> +# Raghav RV <rvraghav93@gmail.com> +# License: BSD 3 clause + +from __future__ import print_function + +import inspect +import sys +import warnings +import importlib + +from pkgutil import walk_packages +from inspect import getsource + +import sklearn +from sklearn.base import signature +from sklearn.utils.testing import SkipTest +from sklearn.utils.testing import check_docstring_parameters +from sklearn.utils.testing import _get_func_name +from sklearn.utils.testing import ignore_warnings +from sklearn.utils.deprecation import _is_deprecated + +PUBLIC_MODULES = set(['sklearn.' 
+                      + pckg[1]
+                      for pckg in walk_packages(sklearn.__path__)
+                      if not pckg[1].startswith('_')])
+
+# TODO Uncomment all modules and fix doc inconsistencies everywhere
+# The list of modules that are not tested for now
+PUBLIC_MODULES -= set([
+    'sklearn.cross_decomposition',
+    'sklearn.discriminant_analysis',
+    'sklearn.ensemble',
+    'sklearn.feature_selection',
+    'sklearn.kernel_approximation',
+    'sklearn.model_selection',
+    'sklearn.multioutput',
+    'sklearn.random_projection',
+    'sklearn.setup',
+    'sklearn.svm',
+    'sklearn.utils',
+    # Deprecated modules
+    'sklearn.cross_validation',
+    'sklearn.grid_search',
+    'sklearn.learning_curve',
+])
+
+# functions to ignore args / docstring of
+_DOCSTRING_IGNORES = [
+    'sklearn.utils.deprecation.load_mlcomp',
+    'sklearn.pipeline.make_pipeline',
+    'sklearn.pipeline.make_union',
+    'sklearn.utils.extmath.safe_sparse_dot',
+]
+
+# Methods where y param should be ignored if y=None by default
+_METHODS_IGNORE_NONE_Y = [
+    'fit',
+    'score',
+    'fit_predict',
+    'fit_transform',
+    'partial_fit',
+    'predict'
+]
+
+
+def test_docstring_parameters():
+    # Test module docstring formatting
+
+    # Skip test if numpydoc is not found or if python version is < 3.5
+    try:
+        import numpydoc  # noqa
+        assert sys.version_info >= (3, 5)
+    except (ImportError, AssertionError):
+        raise SkipTest(
+            "numpydoc is required to test the docstrings")
+
+    from numpydoc import docscrape
+
+    incorrect = []
+    for name in PUBLIC_MODULES:
+        with warnings.catch_warnings(record=True):
+            module = importlib.import_module(name)
+        classes = inspect.getmembers(module, inspect.isclass)
+        for cname, cls in classes:
+            this_incorrect = []
+            if cname in _DOCSTRING_IGNORES:
+                continue
+            if cname.startswith('_'):
+                continue
+            with warnings.catch_warnings(record=True) as w:
+                cdoc = docscrape.ClassDoc(cls)
+            if len(w):
+                raise RuntimeError('Error for __init__ of %s in %s:\n%s'
+                                   % (cls, name, w[0]))
+
+            cls_init = getattr(cls, '__init__', None)
+
+            if _is_deprecated(cls_init):
+                continue
+
+            elif cls_init is not None:
+                this_incorrect += check_docstring_parameters(
+                    cls.__init__, cdoc, class_name=cname)
+            for method_name in cdoc.methods:
+                method = getattr(cls, method_name)
+                if _is_deprecated(method):
+                    continue
+                param_ignore = None
+                # Now skip docstring test for y when y is None
+                # by default for API reason
+                if method_name in _METHODS_IGNORE_NONE_Y:
+                    sig = signature(method)
+                    if ('y' in sig.parameters and
+                            sig.parameters['y'].default is None):
+                        param_ignore = ['y']  # ignore y for fit and score
+                result = check_docstring_parameters(
+                    method, ignore=param_ignore, class_name=cname)
+                this_incorrect += result
+
+            incorrect += this_incorrect
+
+        functions = inspect.getmembers(module, inspect.isfunction)
+        for fname, func in functions:
+            # Don't test private methods / functions
+            if fname.startswith('_'):
+                continue
+            name_ = _get_func_name(func)
+            if (not any(d in name_ for d in _DOCSTRING_IGNORES) and
+                    not _is_deprecated(func)):
+                incorrect += check_docstring_parameters(func)
+    msg = '\n' + '\n'.join(sorted(list(set(incorrect))))
+    if len(incorrect) > 0:
+        raise AssertionError(msg)
+
+
+@ignore_warnings(category=DeprecationWarning)
+def test_tabs():
+    # Test that there are no tabs in our source files
+    for importer, modname, ispkg in walk_packages(sklearn.__path__,
+                                                  prefix='sklearn.'):
+        # import the module explicitly, since walk_packages does not
+        mod = importlib.import_module(modname)
+        try:
+            source = getsource(mod)
+        except IOError:  # user probably should have run "make clean"
+            continue
+        assert '\t' not in source, ('"%s" has tabs, 
please remove them '
+                                    'or add it to the ignore list'
+                                    % modname)
diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py
index 8d3048d32edd31f9929defaae91d404367ce800e..93db4eb98f34e04566e43e5c6f348826576ab96a 100644
--- a/sklearn/tree/tree.py
+++ b/sklearn/tree/tree.py
@@ -527,22 +527,6 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin):
         strategies are "best" to choose the best split and "random" to choose
         the best random split.
 
-    max_features : int, float, string or None, optional (default=None)
-        The number of features to consider when looking for the best split:
-
-            - If int, then consider `max_features` features at each split.
-            - If float, then `max_features` is a percentage and
-              `int(max_features * n_features)` features are considered at each
-              split.
-            - If "auto", then `max_features=sqrt(n_features)`.
-            - If "sqrt", then `max_features=sqrt(n_features)`.
-            - If "log2", then `max_features=log2(n_features)`.
-            - If None, then `max_features=n_features`.
-
-        Note: the search for a split does not stop until at least one
-        valid partition of the node samples is found, even if it requires to
-        effectively inspect more than ``max_features`` features.
-
     max_depth : int or None, optional (default=None)
         The maximum depth of the tree. If None, then nodes are expanded until
         all leaves are pure or until all leaves contain less than
@@ -575,31 +559,21 @@
         the input samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided.
 
-    max_leaf_nodes : int or None, optional (default=None)
-        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
-        Best nodes are defined as relative reduction in impurity.
-        If None then unlimited number of leaf nodes.
-
-    class_weight : dict, list of dicts, "balanced" or None, optional (default=None)
-        Weights associated with classes in the form ``{class_label: weight}``.
-        If not given, all classes are supposed to have weight one. For
-        multi-output problems, a list of dicts can be provided in the same
-        order as the columns of y.
-
-        Note that for multioutput (including multilabel) weights should be
-        defined for each class of every column in its own dict. For example,
-        for four-class multilabel classification weights should be
-        [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
-        [{1:1}, {2:5}, {3:1}, {4:1}].
-
-        The "balanced" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies in the input data
-        as ``n_samples / (n_classes * np.bincount(y))``
+    max_features : int, float, string or None, optional (default=None)
+        The number of features to consider when looking for the best split:
 
-        For multi-output, the weights of each column of y will be multiplied.
+            - If int, then consider `max_features` features at each split.
+            - If float, then `max_features` is a percentage and
+              `int(max_features * n_features)` features are considered at each
+              split.
+            - If "auto", then `max_features=sqrt(n_features)`.
+            - If "sqrt", then `max_features=sqrt(n_features)`.
+            - If "log2", then `max_features=log2(n_features)`.
+            - If None, then `max_features=n_features`.
 
-        Note that these weights will be multiplied with sample_weight (passed
-        through the fit method) if sample_weight is specified.
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires to
+        effectively inspect more than ``max_features`` features.
random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -607,14 +581,10 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): If None, the random number generator is the RandomState instance used by `np.random`. - min_impurity_split : float, - Threshold for early stopping in tree growth. A node will split - if its impurity is above the threshold, otherwise it is a leaf. - - .. deprecated:: 0.19 - ``min_impurity_split`` has been deprecated in favor of - ``min_impurity_decrease`` in 0.19 and will be removed in 0.21. - Use ``min_impurity_decrease`` instead. + max_leaf_nodes : int or None, optional (default=None) + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. min_impurity_decrease : float, optional (default=0.) A node will be split if this split induces a decrease of the impurity @@ -634,6 +604,36 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): .. versionadded:: 0.19 + min_impurity_split : float, + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + + .. deprecated:: 0.19 + ``min_impurity_split`` has been deprecated in favor of + ``min_impurity_decrease`` in 0.19 and will be removed in 0.21. + Use ``min_impurity_decrease`` instead. + + class_weight : dict, list of dicts, "balanced" or None, default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + presort : bool, optional (default=False) Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large @@ -806,6 +806,9 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. + check_input : bool + Run check_array on X. + Returns ------- p : array of shape = [n_samples, n_classes], or a list of n_outputs @@ -887,22 +890,6 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): strategies are "best" to choose the best split and "random" to choose the best random split. - max_features : int, float, string or None, optional (default=None) - The number of features to consider when looking for the best split: - - - If int, then consider `max_features` features at each split. - - If float, then `max_features` is a percentage and - `int(max_features * n_features)` features are considered at each - split. - - If "auto", then `max_features=n_features`. 
- - If "sqrt", then `max_features=sqrt(n_features)`. - - If "log2", then `max_features=log2(n_features)`. - - If None, then `max_features=n_features`. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires to - effectively inspect more than ``max_features`` features. - max_depth : int or None, optional (default=None) The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than @@ -935,10 +922,21 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. - max_leaf_nodes : int or None, optional (default=None) - Grow a tree with ``max_leaf_nodes`` in best-first fashion. - Best nodes are defined as relative reduction in impurity. - If None then unlimited number of leaf nodes. + max_features : int, float, string or None, optional (default=None) + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a percentage and + `int(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=n_features`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -946,14 +944,10 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): If None, the random number generator is the RandomState instance used by `np.random`. - min_impurity_split : float, - Threshold for early stopping in tree growth. A node will split - if its impurity is above the threshold, otherwise it is a leaf. - - .. deprecated:: 0.19 - ``min_impurity_split`` has been deprecated in favor of - ``min_impurity_decrease`` in 0.19 and will be removed in 0.21. - Use ``min_impurity_decrease`` instead. + max_leaf_nodes : int or None, optional (default=None) + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. min_impurity_decrease : float, optional (default=0.) A node will be split if this split induces a decrease of the impurity @@ -973,6 +967,15 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): .. versionadded:: 0.19 + min_impurity_split : float, + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + + .. deprecated:: 0.19 + ``min_impurity_split`` has been deprecated in favor of + ``min_impurity_decrease`` in 0.19 and will be removed in 0.21. + Use ``min_impurity_decrease`` instead. + presort : bool, optional (default=False) Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large @@ -1132,6 +1135,124 @@ class ExtraTreeClassifier(DecisionTreeClassifier): Read more in the :ref:`User Guide <tree>`. 
+ Parameters + ---------- + criterion : string, optional (default="gini") + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "entropy" for the information gain. + + splitter : string, optional (default="best") + The strategy used to choose the split at each node. Supported + strategies are "best" to choose the best split and "random" to choose + the best random split. + + max_depth : int or None, optional (default=None) + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int, float, optional (default=2) + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a percentage and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for percentages. + + min_samples_leaf : int, float, optional (default=1) + The minimum number of samples required to be at a leaf node: + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a percentage and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for percentages. + + min_weight_fraction_leaf : float, optional (default=0.) + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : int, float, string or None, optional (default=None) + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a percentage and + `int(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + max_leaf_nodes : int or None, optional (default=None) + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, optional (default=0.) + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. 
+ + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + min_impurity_split : float, + Threshold for early stopping in tree growth. A node will split + if its impurity is above the threshold, otherwise it is a leaf. + + .. deprecated:: 0.19 + ``min_impurity_split`` has been deprecated in favor of + ``min_impurity_decrease`` in 0.19 and will be removed in 0.21. + Use ``min_impurity_decrease`` instead. + + class_weight : dict, list of dicts, "balanced" or None, default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + See also -------- ExtraTreeRegressor, ExtraTreesClassifier, ExtraTreesRegressor @@ -1192,6 +1313,109 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Read more in the :ref:`User Guide <tree>`. + Parameters + ---------- + criterion : string, optional (default="mse") + The function to measure the quality of a split. Supported criteria + are "mse" for the mean squared error, which is equal to variance + reduction as feature selection criterion, and "mae" for the mean + absolute error. + + .. versionadded:: 0.18 + Mean Absolute Error (MAE) criterion. + + splitter : string, optional (default="best") + The strategy used to choose the split at each node. Supported + strategies are "best" to choose the best split and "random" to choose + the best random split. + + max_depth : int or None, optional (default=None) + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int, float, optional (default=2) + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a percentage and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for percentages. + + min_samples_leaf : int, float, optional (default=1) + The minimum number of samples required to be at a leaf node: + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a percentage and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for percentages. + + min_weight_fraction_leaf : float, optional (default=0.) + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. 
+
+    max_features : int, float, string or None, optional (default=None)
+        The number of features to consider when looking for the best split:
+
+        - If int, then consider `max_features` features at each split.
+        - If float, then `max_features` is a percentage and
+          `int(max_features * n_features)` features are considered at each
+          split.
+        - If "auto", then `max_features=n_features`.
+        - If "sqrt", then `max_features=sqrt(n_features)`.
+        - If "log2", then `max_features=log2(n_features)`.
+        - If None, then `max_features=n_features`.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires to
+        effectively inspect more than ``max_features`` features.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    min_impurity_decrease : float, optional (default=0.)
+        A node will be split if this split induces a decrease of the impurity
+        greater than or equal to this value.
+
+        The weighted impurity decrease equation is the following::
+
+            N_t / N * (impurity - N_t_R / N_t * right_impurity
+                - N_t_L / N_t * left_impurity)
+
+        where ``N`` is the total number of samples, ``N_t`` is the number of
+        samples at the current node, ``N_t_L`` is the number of samples in the
+        left child, and ``N_t_R`` is the number of samples in the right child.
+
+        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
+        if ``sample_weight`` is passed.
+
+        .. versionadded:: 0.19
+
+    min_impurity_split : float,
+        Threshold for early stopping in tree growth. A node will split
+        if its impurity is above the threshold, otherwise it is a leaf.
+
+        .. deprecated:: 0.19
+           ``min_impurity_split`` has been deprecated in favor of
+           ``min_impurity_decrease`` in 0.19 and will be removed in 0.21.
+           Use ``min_impurity_decrease`` instead.
+
+    max_leaf_nodes : int or None, optional (default=None)
+        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+
 
     See also
     --------
     ExtraTreeClassifier, ExtraTreesClassifier, ExtraTreesRegressor
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 69a1be10f089b177782bdbca62c68d2f8308f5f8..fc71c387903a3240e47c7d954104b0a9da015041 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -463,7 +463,12 @@ def _get_n_jobs(n_jobs):
 
 
 def tosequence(x):
-    """Cast iterable x to a Sequence, avoiding a copy if possible."""
+    """Cast iterable x to a Sequence, avoiding a copy if possible.
+
+    Parameters
+    ----------
+    x : iterable
+        The iterable to be converted.
+    """
     if isinstance(x, np.ndarray):
         return np.asarray(x)
     elif isinstance(x, Sequence):
diff --git a/sklearn/utils/deprecation.py b/sklearn/utils/deprecation.py
index aa0caea2ce2b8f7a3062dd40008108cb9644482f..ca305e5cb3f62d6e8ba94528632f0f2a48ef59f5 100644
--- a/sklearn/utils/deprecation.py
+++ b/sklearn/utils/deprecation.py
@@ -1,3 +1,4 @@
+import sys
 import warnings
 
 __all__ = ["deprecated", ]
@@ -19,22 +20,26 @@ class deprecated(object):
     >>> @deprecated()
     ... def some_function(): pass
+
+    Parameters
+    ----------
+    extra : string
+        To be added to the deprecation messages.
     """
 
     # Adapted from http://wiki.python.org/moin/PythonDecoratorLibrary,
     # but with many changes.
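+    #
+    # A usage sketch with a hypothetical function name, showing how the
+    # optional ``extra`` message is appended to the warning:
+    #
+    #     @deprecated("use new_func instead")
+    #     def old_func():
+    #         pass
+    #
+    # Calling ``old_func()`` then emits a DeprecationWarning whose
+    # message contains "use new_func instead".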
 
     def __init__(self, extra=''):
-        """
-        Parameters
-        ----------
-        extra : string
-          to be added to the deprecation messages
-
-        """
         self.extra = extra
 
     def __call__(self, obj):
+        """Call method
+
+        Parameters
+        ----------
+        obj : object
+            The object to decorate (a class or a function).
+        """
         if isinstance(obj, type):
             return self._decorate_class(obj)
         else:
@@ -83,3 +88,17 @@ class deprecated(object):
         if olddoc:
             newdoc = "%s\n\n%s" % (newdoc, olddoc)
         return newdoc
+
+
+def _is_deprecated(func):
+    """Helper to check if func is wrapped by our deprecated decorator"""
+    if sys.version_info < (3, 5):
+        raise NotImplementedError("This is only available for python3.5 "
+                                  "or above")
+    closures = getattr(func, '__closure__', [])
+    if closures is None:
+        closures = []
+    is_deprecated = ('deprecated' in ''.join([c.cell_contents
+                                              for c in closures
+                                              if isinstance(c.cell_contents,
+                                                            str)]))
+    return is_deprecated
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index 035b901abe952446f58eafacabe017f4a1fccfaf..cfaefc88d23081860f87f448fc5edf8b9a024199 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -773,3 +773,132 @@ class _named_check(object):
 
     def __call__(self, *args, **kwargs):
         return self.check(*args, **kwargs)
+
+# Utils to test docstrings
+
+
+def _get_args(function, varargs=False):
+    """Helper to get function arguments"""
+    # NOTE this works only in python3.5 and above
+    if sys.version_info < (3, 5):
+        raise NotImplementedError("_get_args is not available for "
+                                  "python < 3.5")
+
+    params = inspect.signature(function).parameters
+    args = [key for key, param in params.items()
+            if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)]
+    if varargs:
+        varargs = [param.name for param in params.values()
+                   if param.kind == param.VAR_POSITIONAL]
+        if len(varargs) == 0:
+            varargs = None
+        return args, varargs
+    else:
+        return args
+
+
+def _get_func_name(func, class_name=None):
+    """Get function full name
+
+    Parameters
+    ----------
+    func : callable
+        The function object.
+    class_name : string, optional (default: None)
+        If ``func`` is a class method and the class name is known, specify
+        class_name for the error message.
+
+    Returns
+    -------
+    name : str
+        The function name.
+    """
+    parts = []
+    module = inspect.getmodule(func)
+    if module:
+        parts.append(module.__name__)
+    if class_name is not None:
+        parts.append(class_name)
+    elif hasattr(func, 'im_class'):
+        parts.append(func.im_class.__name__)
+
+    parts.append(func.__name__)
+    return '.'.join(parts)
+
+
+def check_docstring_parameters(func, doc=None, ignore=None, class_name=None):
+    """Helper to check docstring
+
+    Parameters
+    ----------
+    func : callable
+        The function object to test.
+    doc : str, optional (default: None)
+        Docstring if it is passed manually to the test.
+    ignore : list, optional (default: None)
+        Parameters to ignore.
+    class_name : string, optional (default: None)
+        If ``func`` is a class method and the class name is known, specify
+        class_name for the error message.
+
+    Returns
+    -------
+    incorrect : list
+        A list of strings describing the incorrect results.
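+
+    Examples
+    --------
+    A minimal sketch of the intended use, assuming numpydoc is installed
+    and Python >= 3.5 (as required by ``_get_args``). Functions outside
+    the ``sklearn`` namespace are skipped, so the check is shown on a
+    scikit-learn utility::
+
+        from sklearn.utils import tosequence
+        from sklearn.utils.testing import check_docstring_parameters
+
+        # ``tosequence`` documents its single parameter ``x``, so no
+        # mismatches are found and an empty list is returned.
+        assert check_docstring_parameters(tosequence) == []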
+ """ + from numpydoc import docscrape + incorrect = [] + ignore = [] if ignore is None else ignore + + func_name = _get_func_name(func, class_name=class_name) + if (not func_name.startswith('sklearn.') or + func_name.startswith('sklearn.externals')): + return incorrect + # Don't check docstring for property-functions + if inspect.isdatadescriptor(func): + return incorrect + args = list(filter(lambda x: x not in ignore, _get_args(func))) + # drop self + if len(args) > 0 and args[0] == 'self': + args.remove('self') + + if doc is None: + with warnings.catch_warnings(record=True) as w: + try: + doc = docscrape.FunctionDoc(func) + except Exception as exp: + incorrect += [func_name + ' parsing error: ' + str(exp)] + return incorrect + if len(w): + raise RuntimeError('Error for %s:\n%s' % (func_name, w[0])) + + param_names = [] + for name, type_definition, param_doc in doc['Parameters']: + if (type_definition.strip() == "" or + type_definition.strip().startswith(':')): + + param_name = name.lstrip() + + # If there was no space between name and the colon + # "verbose:" -> len(["verbose", ""][0]) -> 7 + # If "verbose:"[7] == ":", then there was no space + if param_name[len(param_name.split(':')[0].strip())] == ':': + incorrect += [func_name + + ' There was no space between the param name and ' + 'colon ("%s")' % name] + else: + incorrect += [func_name + ' Incorrect type definition for ' + 'param: "%s" (type definition was "%s")' + % (name.split(':')[0], type_definition)] + if '*' not in name: + param_names.append(name.split(':')[0].strip('` ')) + + param_names = list(filter(lambda x: x not in ignore, param_names)) + + if len(param_names) != len(args): + bad = str(sorted(list(set(param_names) ^ set(args)))) + incorrect += [func_name + ' arg mismatch: ' + bad] + else: + for n1, n2 in zip(param_names, args): + if n1 != n2: + incorrect += [func_name + ' ' + n1 + ' != ' + n2] + return incorrect diff --git a/sklearn/utils/tests/test_deprecation.py b/sklearn/utils/tests/test_deprecation.py new file mode 100644 index 0000000000000000000000000000000000000000..31a92bc442cc99cd35e9591a2845aebc03e950d4 --- /dev/null +++ b/sklearn/utils/tests/test_deprecation.py @@ -0,0 +1,57 @@ +# Authors: Raghav RV <rvraghav93@gmail.com> +# License: BSD 3 clause + + +import sys + +from sklearn.utils.deprecation import _is_deprecated +from sklearn.utils.deprecation import deprecated +from sklearn.utils.testing import assert_warns_message +from sklearn.utils.testing import SkipTest + + +@deprecated('qwerty') +class MockClass1: + pass + + +class MockClass2: + @deprecated('mockclass2_method') + def method(self): + pass + + +class MockClass3: + @deprecated() + def __init__(self): + pass + + +class MockClass4: + pass + + +@deprecated() +def mock_function(): + return 10 + + +def test_deprecated(): + assert_warns_message(DeprecationWarning, 'qwerty', MockClass1) + assert_warns_message(DeprecationWarning, 'mockclass2_method', + MockClass2().method) + assert_warns_message(DeprecationWarning, 'deprecated', MockClass3) + val = assert_warns_message(DeprecationWarning, 'deprecated', mock_function) + assert val == 10 + + +def test_is_deprecated(): + if sys.version_info < (3, 5): + raise SkipTest("This test will run only on python3.5 and above") + # Test if _is_deprecated helper identifies wrapping via deprecated + # NOTE it works only for class methods and functions + assert _is_deprecated(MockClass1.__init__) + assert _is_deprecated(MockClass2().method) + assert _is_deprecated(MockClass3.__init__) + assert not 
_is_deprecated(MockClass4.__init__) + assert _is_deprecated(mock_function) diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py index 78eb10a635ece1e036990da1d90fd7289973b52e..cf18de0b35b11ddf1484e9deb7d062fc772da2ce 100644 --- a/sklearn/utils/tests/test_testing.py +++ b/sklearn/utils/tests/test_testing.py @@ -4,7 +4,10 @@ import sys import numpy as np from scipy import sparse +from sklearn.utils.deprecation import deprecated +from sklearn.utils.metaestimators import if_delegate_has_method from sklearn.utils.testing import ( + assert_true, assert_raises, assert_less, assert_greater, @@ -15,9 +18,11 @@ from sklearn.utils.testing import ( assert_equal, set_random_state, assert_raise_message, - assert_allclose_dense_sparse, - ignore_warnings) + ignore_warnings, + check_docstring_parameters, + assert_allclose_dense_sparse) +from sklearn.utils.testing import SkipTest from sklearn.tree import DecisionTreeClassifier from sklearn.discriminant_analysis import LinearDiscriminantAnalysis @@ -235,3 +240,247 @@ class TestWarns(unittest.TestCase): if failed: raise AssertionError("wrong warning caught by assert_warn") + + +# Tests for docstrings: + +def f_ok(a, b): + """Function f + + Parameters + ---------- + a : int + Parameter a + b : float + Parameter b + + Returns + ------- + c : list + Parameter c + """ + c = a + b + return c + + +def f_bad_sections(a, b): + """Function f + + Parameters + ---------- + a : int + Parameter a + b : float + Parameter b + + Results + ------- + c : list + Parameter c + """ + c = a + b + return c + + +def f_bad_order(b, a): + """Function f + + Parameters + ---------- + a : int + Parameter a + b : float + Parameter b + + Returns + ------- + c : list + Parameter c + """ + c = a + b + return c + + +def f_missing(a, b): + """Function f + + Parameters + ---------- + a : int + Parameter a + + Returns + ------- + c : list + Parameter c + """ + c = a + b + return c + + +def f_check_param_definition(a, b, c, d): + """Function f + + Parameters + ---------- + a: int + Parameter a + b: + Parameter b + c : + Parameter c + d:int + Parameter d + """ + return a + b + c + d + + +class Klass(object): + def f_missing(self, X, y): + pass + + def f_bad_sections(self, X, y): + """Function f + + Parameter + ---------- + a : int + Parameter a + b : float + Parameter b + + Results + ------- + c : list + Parameter c + """ + pass + + +class MockEst(object): + def __init__(self): + """MockEstimator""" + def fit(self, X, y): + return X + + def predict(self, X): + return X + + def predict_proba(self, X): + return X + + def score(self, X): + return 1. + + +class MockMetaEstimator(object): + def __init__(self, delegate): + """MetaEstimator to check if doctest on delegated methods work. + + Parameters + --------- + delegate : estimator + Delegated estimator. + """ + self.delegate = delegate + + @if_delegate_has_method(delegate=('delegate')) + def predict(self, X): + """This is available only if delegate has predict. + + Parameters + ---------- + y : ndarray + Parameter y + """ + return self.delegate.predict(X) + + @deprecated("Testing a deprecated delegated method") + @if_delegate_has_method(delegate=('delegate')) + def score(self, X): + """This is available only if delegate has score. + + Parameters + --------- + y : ndarray + Parameter y + """ + + @if_delegate_has_method(delegate=('delegate')) + def predict_proba(self, X): + """This is available only if delegate has predict_proba. 
+ + Parameters + --------- + X : ndarray + Parameter X + """ + return X + + @deprecated('Testing deprecated function with incorrect params') + @if_delegate_has_method(delegate=('delegate')) + def predict_log_proba(self, X): + """This is available only if delegate has predict_proba. + + Parameters + --------- + y : ndarray + Parameter X + """ + return X + + @deprecated('Testing deprecated function with wrong params') + @if_delegate_has_method(delegate=('delegate')) + def fit(self, X, y): + """Incorrect docstring but should not be tested""" + + +def test_check_docstring_parameters(): + try: + import numpydoc # noqa + assert sys.version_info >= (3, 5) + except (ImportError, AssertionError): + raise SkipTest( + "numpydoc is required to test the docstrings") + + incorrect = check_docstring_parameters(f_ok) + assert_equal(incorrect, []) + incorrect = check_docstring_parameters(f_ok, ignore=['b']) + assert_equal(incorrect, []) + incorrect = check_docstring_parameters(f_missing, ignore=['b']) + assert_equal(incorrect, []) + assert_raise_message(RuntimeError, 'Unknown section Results', + check_docstring_parameters, f_bad_sections) + assert_raise_message(RuntimeError, 'Unknown section Parameter', + check_docstring_parameters, Klass.f_bad_sections) + + messages = ["a != b", "arg mismatch: ['b']", "arg mismatch: ['X', 'y']", + "predict y != X", + "predict_proba arg mismatch: ['X']", + "predict_log_proba arg mismatch: ['X']", + "score arg mismatch: ['X']", + ".fit arg mismatch: ['X', 'y']"] + + mock_meta = MockMetaEstimator(delegate=MockEst()) + + for mess, f in zip(messages, + [f_bad_order, f_missing, Klass.f_missing, + mock_meta.predict, mock_meta.predict_proba, + mock_meta.predict_log_proba, + mock_meta.score, mock_meta.fit]): + incorrect = check_docstring_parameters(f) + assert_true(len(incorrect) >= 1) + assert_true(mess in incorrect[0], + '"%s" not in "%s"' % (mess, incorrect[0])) + + incorrect = check_docstring_parameters(f_check_param_definition) + assert_equal( + incorrect, + ['sklearn.utils.tests.test_testing.f_check_param_definition There was ' + 'no space between the param name and colon ("a: int")', + 'sklearn.utils.tests.test_testing.f_check_param_definition There was ' + 'no space between the param name and colon ("b:")', + 'sklearn.utils.tests.test_testing.f_check_param_definition Incorrect ' + 'type definition for param: "c " (type definition was "")', + 'sklearn.utils.tests.test_testing.f_check_param_definition There was ' + 'no space between the param name and colon ("d:int")']) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 490b2455a68953320cfb3663a8463ab46e663836..e6e98f45ae5d5a3cccb9c782ef9b3b56fdc38c4b 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -586,10 +586,13 @@ def column_or_1d(y, warn=False): def check_random_state(seed): """Turn seed into a np.random.RandomState instance - If seed is None, return the RandomState singleton used by np.random. - If seed is an int, return a new RandomState instance seeded with seed. - If seed is already a RandomState instance, return it. - Otherwise raise ValueError. + Parameters + ---------- + seed : None | int | instance of RandomState + If seed is None, return the RandomState singleton used by np.random. + If seed is an int, return a new RandomState instance seeded with seed. + If seed is already a RandomState instance, return it. + Otherwise raise ValueError. """ if seed is None or seed is np.random: return np.random.mtrand._rand