diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index e7925f3e949656f1cbc205371626619a8352dc2b..fb97a16fc9b8af5018782b214e4dd574f6c12ef8 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -255,6 +255,20 @@ features:: _`Faster API-compatible implementation`: https://github.com/mblondel/svmlight-loader +.. make sure everything is in a toc tree + +.. toctree:: + :maxdepth: 2 + :hidden: + + olivetti_faces + twenty_newsgroups + mldata + labeled_faces + covtype + rcv1 + + .. include:: olivetti_faces.rst .. include:: twenty_newsgroups.rst diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index a675633e192572a00ebe00f6f1b06bf1207e84ec..e2a7f97e2804df06132a06487943c7186accd646 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -4,7 +4,7 @@ Clustering ========== -`Clustering <https://en.wikipedia.org/wiki/Cluster_analysis>`_ of +`Clustering <https://en.wikipedia.org/wiki/Cluster_analysis>`__ of unlabeled data can be performed with the module :mod:`sklearn.cluster`. Each clustering algorithm comes in two variants: a class, that implements diff --git a/doc/whats_new.rst b/doc/whats_new.rst index e0b14bf20b271e2fce312bf48e975ce151151c78..83cb48baba530ecb95a4f5f3d1774834b067ad99 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -235,7 +235,7 @@ Enhancements - The "Wisconsin Breast Cancer" classical two-class classification dataset is now included in scikit-learn, available with - :fun:`sklearn.dataset.load_breast_cancer`. + :func:`sklearn.datasets.load_breast_cancer`. - Upgraded to joblib 0.9.3 to benefit from the new automatic batching of short tasks. This makes it possible for scikit-learn to benefit from diff --git a/examples/applications/face_recognition.py b/examples/applications/face_recognition.py index b23dda4749fe450ac87ed37a0ea01d568de98056..0400b9cffb5c6bce3f579ae807ae674985cb2b9c 100644 --- a/examples/applications/face_recognition.py +++ b/examples/applications/face_recognition.py @@ -10,7 +10,7 @@ The dataset used in this example is a preprocessed excerpt of the .. _LFW: http://vis-www.cs.umass.edu/lfw/ -Expected results for the top 5 most represented people in the dataset:: +Expected results for the top 5 most represented people in the dataset: ================== ============ ======= ========== ======= precision recall f1-score support diff --git a/examples/ensemble/plot_partial_dependence.py b/examples/ensemble/plot_partial_dependence.py index d4a26166944c3d7d6ae4b43d650ae11fdcafa3de..1dd08b15502c2ed202a3d52f21d77c7253876534 100644 --- a/examples/ensemble/plot_partial_dependence.py +++ b/examples/ensemble/plot_partial_dependence.py @@ -3,7 +3,7 @@ Partial Dependence Plots ======================== -Partial dependence plots show the dependence between the target function [1]_ +Partial dependence plots show the dependence between the target function [2]_ and a set of 'target' features, marginalizing over the values of all other features (the complement features). Due to the limits of human perception the size of the target feature set must be small (usually, @@ -13,7 +13,7 @@ important features This example shows how to obtain partial dependence plots from a :class:`~sklearn.ensemble.GradientBoostingRegressor` trained on the California -housing dataset. The example is taken from [HTF2009]_. +housing dataset. The example is taken from [1]_. The plot shows four one-way and one two-way partial dependence plots. The target variables for the one-way PDP are: @@ -38,10 +38,10 @@ For an avg.
occupancy greater than two, the house price is nearly independent of the house age, whereas for values less than two there is a strong dependence on age. -.. [HTF2009] T. Hastie, R. Tibshirani and J. Friedman, +.. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning Ed. 2", Springer, 2009. -.. [1] For classification you can think of it as the regression score before +.. [2] For classification you can think of it as the regression score before the link function. """ print(__doc__) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index f974977162f1c84cc755d7a468236984346a653c..388e08c775b0328cd324f3d0f5b03128ac4dd11f 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -46,20 +46,21 @@ class CalibratedClassifierCV(BaseEstimator, ClassifierMixin): to offer more accurate predict_proba outputs. If cv=prefit, the classifier must have been fit already on data. - method : 'sigmoid' | 'isotonic' + method : 'sigmoid' or 'isotonic' The method to use for calibration. Can be 'sigmoid' which corresponds to Platt's method or 'isotonic' which is a non-parametric approach. It is not advised to use isotonic calibration - with too few calibration samples (<<1000) since it tends to overfit. + with too few calibration samples ``(<<1000)`` since it tends to overfit. Use sigmoids (Platt's calibration) in this case. - cv : integer/cross-validation generator/iterable or "prefit", optional + cv : integer, cross-validation generator, iterable or "prefit", optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If ``y`` is neither binary nor diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py index 7d9568823322dec8c99a4743aeb98c1817df1000..41e50ac29f8b216254a7acf279a9aa0c0b904673 100644 --- a/sklearn/covariance/graph_lasso_.py +++ b/sklearn/covariance/graph_lasso_.py @@ -468,10 +468,11 @@ class GraphLassoCV(GraphLasso): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs :class:`KFold` is used.
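The ``cv`` parameter blocks reformatted throughout this patch all describe the same four accepted inputs. The following is a minimal sketch of passing each form to ``cross_val_score`` (whose docstring is updated just below); the data, the ``LogisticRegression`` choice and the split sizes are illustrative only, and it assumes the 0.17-era ``sklearn.cross_validation`` API::

    import numpy as np
    from sklearn.cross_validation import KFold, cross_val_score
    from sklearn.linear_model import LogisticRegression

    X = np.random.RandomState(0).rand(30, 4)
    y = np.arange(30) % 2  # binary target, so int/None cv becomes StratifiedKFold
    est = LogisticRegression()

    cross_val_score(est, X, y)                       # None: default 3-fold CV
    cross_val_score(est, X, y, cv=5)                 # integer: number of folds
    cross_val_score(est, X, y, cv=KFold(len(y), 3))  # a CV generator object
    train, test = np.arange(20), np.arange(20, 30)
    cross_val_score(est, X, y, cv=[(train, test)])   # iterable of train/test splits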
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index ea73401e4cfae14a750347ee5fe9ce0be0b283cd..d1d784333fbefe17f94b4ee63427c7c4c70ae36f 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1089,7 +1089,7 @@ class PredefinedSplit(_PartitionIterator): class LabelShuffleSplit(ShuffleSplit): - '''Shuffle-Labels-Out cross-validation iterator + """Shuffle-Labels-Out cross-validation iterator Provides randomized train/test indices to split data according to a third-party provided label. This label information can be used to encode @@ -1118,7 +1118,7 @@ class LabelShuffleSplit(ShuffleSplit): Labels of samples n_iter : int (default 5) - Number of re-shuffling & splitting iterations. + Number of re-shuffling and splitting iterations. test_size : float (default 0.2), int, or None If float, should be between 0.0 and 1.0 and represent the @@ -1134,7 +1134,8 @@ class LabelShuffleSplit(ShuffleSplit): random_state : int or RandomState Pseudo-random number generator state used for random sampling. - ''' + + """ def __init__(self, labels, n_iter=5, test_size=0.2, train_size=None, random_state=None): @@ -1208,10 +1209,11 @@ def cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1, cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is a classifier @@ -1382,10 +1384,11 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is a classifier @@ -1643,10 +1646,11 @@ def check_cv(cv, X=None, y=None, classifier=False): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. 
If the estimator is a classifier @@ -1716,10 +1720,11 @@ def permutation_test_score(estimator, X, y, cv=None, cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is a classifier diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 326d88d11791b2f7fec8290dc3024d0f1f8c5384..f8118ed73e72f53592e7a38196cb56f01b89f6ab 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -111,17 +111,23 @@ def _initialize_nmf(X, n_components, init=None, eps=1e-6, Method used to initialize the procedure. Default: 'nndsvdar' if n_components < n_features, otherwise 'random'. Valid options: - 'random': non-negative random matrices, scaled with: sqrt(X.mean() / n_components) - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) initialization (better for sparseness) - 'nndsvda': NNDSVD with zeros filled with the average of X (better when sparsity is not desired) - 'nndsvdar': NNDSVD with zeros filled with small random values (generally faster, less accurate alternative to NNDSVDa for when sparsity is not desired) - - eps: float + + - 'random': non-negative random matrices, scaled with: sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values (generally faster, less accurate alternative to NNDSVDa for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + eps : float Truncate all values less than this in output to zero. random_state : int seed, RandomState instance, or None (default) @@ -641,17 +647,22 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom' Method used to initialize the procedure. Default: 'nndsvd' if n_components < n_features, otherwise random.
- Valid options:: - 'random': non-negative random matrices, scaled with: - sqrt(X.mean() / n_components) - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) - initialization (better for sparseness) - 'nndsvda': NNDSVD with zeros filled with the average of X - (better when sparsity is not desired) - 'nndsvdar': NNDSVD with zeros filled with small random values - (generally faster, less accurate alternative to NNDSVDa - for when sparsity is not desired) - 'custom': use custom matrices W and H + Valid options: + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H update_H : boolean, default: True Set to True, both W and H will be estimated from initial guesses. @@ -828,16 +839,22 @@ class NMF(BaseEstimator, TransformerMixin): init : 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom' Method used to initialize the procedure. Default: 'nndsvdar' if n_components < n_features, otherwise random. - Valid options:: - 'random': non-negative random matrices - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) - initialization (better for sparseness) - 'nndsvda': NNDSVD with zeros filled with the average of X - (better when sparsity is not desired) - 'nndsvdar': NNDSVD with zeros filled with small random values - (generally faster, less accurate alternative to NNDSVDa - for when sparsity is not desired) - 'custom': use custom matrices W and H, given in 'fit' method. + Valid options: + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H solver : 'pg' | 'cd' Numerical solver to use: @@ -1095,6 +1112,172 @@ class NMF(BaseEstimator, TransformerMixin): @deprecated("It will be removed in release 0.19. Use NMF instead." "'pg' solver is still available until release 0.19.") class ProjectedGradientNMF(NMF): + """Non-Negative Matrix Factorization (NMF) + + Find two non-negative matrices (W, H) whose product approximates the non- + negative matrix X. This factorization can be used for example for + dimensionality reduction, source separation or topic extraction. + + The objective function is:: + + 0.5 * ||X - WH||_Fro^2 + + alpha * l1_ratio * ||vec(W)||_1 + + alpha * l1_ratio * ||vec(H)||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2 + + Where:: + + ||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm) + ||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm) + + The objective function is minimized with an alternating minimization of W + and H. + + Read more in the :ref:`User Guide <NMF>`. + + Parameters + ---------- + n_components : int or None + Number of components, if n_components is not set all features + are kept. 
+ + init : 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom' + Method used to initialize the procedure. + Default: 'nndsvdar' if n_components < n_features, otherwise random. + Valid options: + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + solver : 'pg' | 'cd' + Numerical solver to use: + 'pg' is a Projected Gradient solver (deprecated). + 'cd' is a Coordinate Descent solver (recommended). + + .. versionadded:: 0.17 + Coordinate Descent solver. + + .. versionchanged:: 0.17 + Deprecated Projected Gradient solver. + + tol : double, default: 1e-4 + Tolerance value used in stopping conditions. + + max_iter : integer, default: 200 + Number of iterations to compute. + + random_state : integer seed, RandomState instance, or None (default) + Random number generator seed control. + + alpha : double, default: 0. + Constant that multiplies the regularization terms. Set it to zero to + have no regularization. + + .. versionadded:: 0.17 + *alpha* used in the Coordinate Descent solver. + + l1_ratio : double, default: 0. + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + .. versionadded:: 0.17 + Regularization parameter *l1_ratio* used in the Coordinate Descent solver. + + shuffle : boolean, default: False + If true, randomize the order of coordinates in the CD solver. + + .. versionadded:: 0.17 + *shuffle* parameter used in the Coordinate Descent solver. + + nls_max_iter : integer, default: 2000 + Number of iterations in NLS subproblem. + Used only in the deprecated 'pg' solver. + + .. versionchanged:: 0.17 + Deprecated Projected Gradient solver. Use Coordinate Descent solver + instead. + + sparseness : 'data' | 'components' | None, default: None + Where to enforce sparsity in the model. + Used only in the deprecated 'pg' solver. + + .. versionchanged:: 0.17 + Deprecated Projected Gradient solver. Use Coordinate Descent solver + instead. + + beta : double, default: 1 + Degree of sparseness, if sparseness is not None. Larger values mean + more sparseness. Used only in the deprecated 'pg' solver. + + .. versionchanged:: 0.17 + Deprecated Projected Gradient solver. Use Coordinate Descent solver + instead. + + eta : double, default: 0.1 + Degree of correctness to maintain, if sparsity is not None. Smaller + values mean larger error. Used only in the deprecated 'pg' solver. + + .. versionchanged:: 0.17 + Deprecated Projected Gradient solver. Use Coordinate Descent solver + instead. + + Attributes + ---------- + components_ : array, [n_components, n_features] + Non-negative components of the data. + + reconstruction_err_ : number + Frobenius norm of the matrix difference between + the training data and the reconstructed data from + the fit produced by the model. ``|| X - WH ||_2`` + + n_iter_ : int + Actual number of iterations. 
+ + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import NMF + >>> model = NMF(n_components=2, init='random', random_state=0) + >>> model.fit(X) #doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + NMF(alpha=0.0, beta=1, eta=0.1, init='random', l1_ratio=0.0, max_iter=200, + n_components=2, nls_max_iter=2000, random_state=0, shuffle=False, + solver='cd', sparseness=None, tol=0.0001, verbose=0) + + >>> model.components_ + array([[ 2.09783018, 0.30560234], + [ 2.13443044, 2.13171694]]) + >>> model.reconstruction_err_ #doctest: +ELLIPSIS + 0.00115993... + + References + ---------- + C.-J. Lin. Projected gradient methods for non-negative matrix + factorization. Neural Computation, 19(2007), 2756-2779. + http://www.csie.ntu.edu.tw/~cjlin/nmf/ + + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for + large scale nonnegative matrix and tensor factorizations." + IEICE transactions on fundamentals of electronics, communications and + computer sciences 92.3: 708-721, 2009. + """ def __init__(self, n_components=None, solver='pg', init=None, tol=1e-4, max_iter=200, random_state=None, diff --git a/sklearn/ensemble/voting_classifier.py b/sklearn/ensemble/voting_classifier.py index 4cdfa892bfe4629c01a24f0b80754b3c5db3db2a..60382fa15b65c9cfc2ce300da61e23b9e519e83d 100644 --- a/sklearn/ensemble/voting_classifier.py +++ b/sklearn/ensemble/voting_classifier.py @@ -25,12 +25,13 @@ class VotingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin): """Soft Voting/Majority Rule classifier for unfitted estimators. .. versionadded:: 0.17 + Read more in the :ref:`User Guide <voting_classifier>`. Parameters ---------- estimators : list of (string, estimator) tuples - Invoking the `fit` method on the `VotingClassifier` will fit clones + Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones of those original estimators that will be stored in the class attribute `self.estimators_`. diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index d6c88403b9558f52e5525a4cd760a86983ec1f0b..89cb2f115278dfaf37910c8928ed97c3b4f80b38 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -275,10 +275,11 @@ class RFECV(RFE, MetaEstimatorMixin): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is a classifier diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 6bda0b6f6ba20af3f16d1562fdc37f06dedf60ec..86f2b693032d787b4b0ba5bc6fac65f589967f6f 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -680,10 +680,11 @@ class GridSearchCV(BaseSearchCV): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. 
- - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is a classifier @@ -890,10 +891,11 @@ class RandomizedSearchCV(BaseSearchCV): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is a classifier diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 7cdeb40cb8572bc81970140863227da3c0f33171..74f31093b796b5760b62d410e79f3f49c3ce3895 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -68,10 +68,11 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is a classifier @@ -280,10 +281,11 @@ def validation_curve(estimator, X, y, param_name, param_range, cv=None, cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is a classifier diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index 5f0e55b4fcfed271806b289bb95560ceb293dfc4..3519307b68a229f16041dce04e8cf6f5d1773946 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -1218,10 +1218,11 @@ class LassoCV(LinearModelCV, RegressorMixin): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. 
- - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, :class:`KFold` is used. @@ -1364,10 +1365,11 @@ class ElasticNetCV(LinearModelCV, RegressorMixin): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, :class:`KFold` is used. @@ -1852,10 +1854,11 @@ class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, :class:`KFold` is used. @@ -2009,10 +2012,11 @@ class MultiTaskLassoCV(LinearModelCV, RegressorMixin): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, :class:`KFold` is used. diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 699ce2b31518875ceef2b07b76df3ecf00c0d748..8f6531b7d7f5962f6e3eecd169e4b4f137251eb6 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -983,10 +983,11 @@ class LarsCV(Lars): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, :class:`KFold` is used. @@ -1182,10 +1183,11 @@ class LassoLarsCV(LarsCV): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. 
Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, :class:`KFold` is used. diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index 92f6cb69b238d494edbdf8be516029b1fba3231f..967122765c438ff51514eef35679776908c77530 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -758,10 +758,11 @@ class OrthogonalMatchingPursuitCV(LinearModel, RegressorMixin): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, :class:`KFold` is used. diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index f82ca9de4965566945b033faf163b5d67b1ee4e4..e50f1aa3de2de7253157e5a58cf6ec2fdd7865bd 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -1085,10 +1085,11 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the efficient Leave-One-Out cross-validation - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the efficient Leave-One-Out cross-validation + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used, else, :class:`KFold` is used. @@ -1178,10 +1179,11 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - - None, to use the efficient Leave-One-Out cross-validation - - integer, to specify the number of folds. - - An object to be used as a cross-validation generator. - - An iterable yielding train/test splits. + + - None, to use the efficient Leave-One-Out cross-validation + - integer, to specify the number of folds. + - An object to be used as a cross-validation generator. + - An iterable yielding train/test splits. Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 8605395c870fa6597419a478e358829b28c930e2..51405e1a212969037b33810f2bb764a4434975b0 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -616,8 +616,7 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', References ---------- - .. 
[1] `Wikipedia entry for the F1-score - <http://en.wikipedia.org/wiki/F1_score>`_ + .. [1] `Wikipedia entry for the F1-score <http://en.wikipedia.org/wiki/F1_score>`_ Examples -------- @@ -992,7 +991,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, labels = np.hstack([labels, np.setdiff1d(present_labels, labels, assume_unique=True)]) - ### Calculate tp_sum, pred_sum, true_sum ### + # Calculate tp_sum, pred_sum, true_sum if y_type.startswith('multilabel'): sum_axis = 1 if average == 'samples' else 0 @@ -1063,7 +1062,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pred_sum = np.array([pred_sum.sum()]) true_sum = np.array([true_sum.sum()]) - ### Finally, we have all our sufficient statistics. Divide! ### + # Finally, we have all our sufficient statistics. Divide! beta2 = beta ** 2 with np.errstate(divide='ignore', invalid='ignore'): @@ -1081,7 +1080,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, (beta2 * precision + recall)) f_score[tp_sum == 0] = 0.0 - ## Average the results ## + # Average the results if average == 'weighted': weights = true_sum diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index f76f3abce851e18495c27056900c225dc490edae..d6eb96057e0f3811d93c99b7b8ebfc84b73a2f2f 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -390,8 +390,9 @@ def r2_score(y_true, y_pred, sample_weight : array-like of shape = (n_samples), optional Sample weights. - multioutput : string in ['raw_values', 'uniform_average', - 'variance_weighted'] or None or array-like of shape (n_outputs) + multioutput : string in ['raw_values', 'uniform_average', \ +'variance_weighted'] or None or array-like of shape (n_outputs) + Defines aggregating of multiple output scores. Array-like value defines weights used to average scores. Default value corresponds to 'variance_weighted', this behaviour is diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 56d4b4e54ae6b20848c0272ddd37085324c605d2..0eee0cabcb608faabc8dff5ead724aba37db65bd 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -896,6 +896,7 @@ class RobustScaler(BaseEstimator, TransformerMixin): the interquartile range often give better results. .. versionadded:: 0.17 + Read more in the :ref:`User Guide <preprocessing_scaler>`. Parameters diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 15deea3dc08f1b2ee7a1cf0d426b7a035adcbd17..13f96b18041cb16ac8cd8ce6e1535131d70d9a72 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -37,7 +37,7 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, penalty : string, 'l1' or 'l2' (default='l2') Specifies the norm used in the penalization. The 'l2' - penalty is the standard used in SVC. The 'l1' leads to `coef_` + penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse. dual : bool, (default=True) Select the algorithm to either solve the dual or primal @@ -50,12 +50,12 @@ multi_class: string, 'ovr' or 'crammer_singer' (default='ovr') Determines the multi-class strategy if `y` contains more than two classes. - `ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer` + ``"ovr"`` trains n_classes one-vs-rest classifiers, while ``"crammer_singer"`` optimizes a joint objective over all classes.
While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. - If `crammer_singer` is chosen, the options loss, penalty and dual will + If ``"crammer_singer"`` is chosen, the options loss, penalty and dual will be ignored. fit_intercept : boolean, optional (default=True) @@ -65,7 +65,7 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, intercept_scaling : float, optional (default=1) When self.fit_intercept is True, instance vector x becomes - [x, self.intercept_scaling], + ``[x, self.intercept_scaling]``, i.e. a "synthetic" feature with constant value equals to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight @@ -75,7 +75,7 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, (and therefore on the intercept) intercept_scaling has to be increased. class_weight : {dict, 'balanced'}, optional - Set the parameter C of class i to class_weight[i]*C for + Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust @@ -96,12 +96,11 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, Attributes ---------- - coef_ : array, shape = [n_features] if n_classes == 2 - else [n_classes, n_features] + coef_ : array, shape = [n_features] if n_classes == 2 else [n_classes, n_features] Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. - `coef_` is a readonly property derived from `raw_coef_` that + ``coef_`` is a readonly property derived from ``raw_coef_`` that follows the internal memory layout of liblinear. intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] @@ -114,14 +113,15 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, to have slightly different results for the same input data. If that happens, try with a smaller ``tol`` parameter. - The underlying implementation (liblinear) uses a sparse internal + The underlying implementation, liblinear, uses a sparse internal representation for the data that will incur a memory copy. Predict output may not match that of standalone liblinear in certain cases. See :ref:`differences from liblinear <liblinear_differences>` in the narrative documentation. - **References:** + References + ---------- `LIBLINEAR: A Library for Large Linear Classification <http://www.csie.ntu.edu.tw/~cjlin/liblinear/>`__ @@ -239,15 +239,14 @@ class LinearSVR(LinearModel, RegressorMixin): Penalty parameter C of the error term. The penalty is a squared l2 penalty. The bigger this parameter, the less regularization is used. - loss : string, 'epsilon_insensitive' or 'squared_epsilon_insensitive' - (default='epsilon_insensitive') + loss : string, 'epsilon_insensitive' or 'squared_epsilon_insensitive' (default='epsilon_insensitive') Specifies the loss function. 'l1' is the epsilon-insensitive loss (standard SVR) while 'l2' is the squared epsilon-insensitive loss. epsilon : float, optional (default=0.1) Epsilon parameter in the epsilon-insensitive loss function. Note that the value of this parameter depends on the scale of the target - variable y. If unsure, set epsilon=0. + variable y. If unsure, set ``epsilon=0``. 
dual : bool, (default=True) Select the algorithm to either solve the dual or primal @@ -286,8 +285,7 @@ class LinearSVR(LinearModel, RegressorMixin): Attributes ---------- - coef_ : array, shape = [n_features] if n_classes == 2 - else [n_classes, n_features] + coef_ : array, shape = [n_features] if n_classes == 2 else [n_classes, n_features] Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 98692c411a95d3e490f40b53e0ffdf96afdfcb37..d33f2fbadcb80ace1a1c846795a2caa128fcd2a8 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -311,7 +311,7 @@ class BaseDecisionTree(six.with_metaclass(ABCMeta, BaseEstimator, elif self.presort == 'auto': presort = True - if presort == True and issparse(X): + if presort is True and issparse(X): raise ValueError("Presorting is not supported for sparse " "matrices.") @@ -585,8 +585,7 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. - class_weight : dict, list of dicts, "balanced" or None, optional - (default=None) + class_weight : dict, list of dicts, "balanced" or None, optional (default=None) Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same