diff --git a/sklearn/base.py b/sklearn/base.py
index d1628f39b3727fc193e1cad07e71d3c62ac10217..67a7c61c60e5817b83fbe907d97228b1d5b4af17 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -222,7 +222,7 @@ class BaseEstimator(object):
 
         Parameters
         ----------
-        deep: boolean, optional
+        deep : boolean, optional
             If True, will return the parameters for this estimator and
             contained subobjects that are estimators.
 
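The whole patch enforces the numpydoc convention that a parameter definition reads ``name : type`` with a space on both sides of the colon. A minimal sketch (not part of the patch) of why that space matters to the doc tooling:

```python
def example(deep=True):
    """Toy docstring following the convention this patch enforces.

    Parameters
    ----------
    deep : boolean, optional
        With a space around the colon, numpydoc splits this line into the
        parameter name ("deep") and its type ("boolean, optional").
        Written as ``deep: boolean``, the whole string is treated as the
        name and the type information is lost in the rendered docs.
    """
    return deep
```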
diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py
index 41e50ac29f8b216254a7acf279a9aa0c0b904673..e166cfe2072b79eeceaabdd8abb77ea7dabc0b63 100644
--- a/sklearn/covariance/graph_lasso_.py
+++ b/sklearn/covariance/graph_lasso_.py
@@ -479,7 +479,7 @@ class GraphLassoCV(GraphLasso):
         Refer :ref:`User Guide <cross_validation>` for the various
         cross-validation strategies that can be used here.
 
-    tol: positive float, optional
+    tol : positive float, optional
         The tolerance to declare convergence: if the dual gap goes below
         this value, iterations are stopped.
 
@@ -489,7 +489,7 @@ class GraphLassoCV(GraphLasso):
         for a given column update, not of the overall parameter estimate. Only
         used for mode='cd'.
 
-    max_iter: integer, optional
+    max_iter : integer, optional
         Maximum number of iterations.
 
     mode: {'cd', 'lars'}
@@ -498,10 +498,10 @@ class GraphLassoCV(GraphLasso):
         than number of samples. Elsewhere prefer cd which is more numerically
         stable.
 
-    n_jobs: int, optional
+    n_jobs : int, optional
         number of jobs to run in parallel (default 1).
 
-    verbose: boolean, optional
+    verbose : boolean, optional
         If verbose is True, the objective function and duality gap are
         printed at each iteration.
 
diff --git a/sklearn/datasets/mldata.py b/sklearn/datasets/mldata.py
index 1ab3edea91bde1a30983d56e06db05f2073d3744..c15e12cc7db6353e8b5966c75535109280661c12 100644
--- a/sklearn/datasets/mldata.py
+++ b/sklearn/datasets/mldata.py
@@ -61,21 +61,21 @@ def fetch_mldata(dataname, target_name='label', data_name='data',
     Parameters
     ----------
 
-    dataname:
+    dataname :
         Name of the data set on mldata.org,
         e.g.: "leukemia", "Whistler Daily Snowfall", etc.
         The raw name is automatically converted to a mldata.org URL .
 
-    target_name: optional, default: 'label'
+    target_name : optional, default: 'label'
         Name or index of the column containing the target values.
 
-    data_name: optional, default: 'data'
+    data_name : optional, default: 'data'
         Name or index of the column containing the data.
 
-    transpose_data: optional, default: True
+    transpose_data : optional, default: True
         If True, transpose the downloaded data array.
 
-    data_home: optional, default: None
+    data_home : optional, default: None
         Specify another download and cache folder for the data sets. By default
         all scikit learn data is stored in '~/scikit_learn_data' subfolders.
 
diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py
index ba21bba64b0edc44cb0b005edaf81a72bdf850bb..e74d65d60e18de6e971a50fa36823b8565434c4a 100644
--- a/sklearn/datasets/olivetti_faces.py
+++ b/sklearn/datasets/olivetti_faces.py
@@ -67,7 +67,7 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
         If True the order of the dataset is shuffled to avoid having
         images of the same person grouped.
 
-    download_if_missing: optional, True by default
+    download_if_missing : optional, True by default
         If False, raise a IOError if the data is not locally available
         instead of trying to download the data from the source site.
 
diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py
index 5f6d1a71b503d7f21de69773cfa16a5c99b5d174..53ee8987ba968eac760810923fb6dc593b660b29 100644
--- a/sklearn/datasets/samples_generator.py
+++ b/sklearn/datasets/samples_generator.py
@@ -631,7 +631,7 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None):
     """Make two interleaving half circles
 
     A simple toy dataset to visualize clustering and classification
-    algorithms.
+    algorithms. Read more in the :ref:`User Guide <sample_generators>`.
 
     Parameters
     ----------
@@ -644,8 +644,6 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None):
     noise : double or None (default=None)
         Standard deviation of Gaussian noise added to the data.
 
-    Read more in the :ref:`User Guide <sample_generators>`.
-
     Returns
     -------
     X : array of shape [n_samples, 2]
@@ -697,10 +695,10 @@ def make_blobs(n_samples=100, n_features=2, centers=3, cluster_std=1.0,
         (default=3)
         The number of centers to generate, or the fixed center locations.
 
-    cluster_std: float or sequence of floats, optional (default=1.0)
+    cluster_std : float or sequence of floats, optional (default=1.0)
         The standard deviation of the clusters.
 
-    center_box: pair of floats (min, max), optional (default=(-10.0, 10.0))
+    center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
         The bounding box for each cluster center when centers are
         generated at random.
 
@@ -1061,18 +1059,18 @@ def make_sparse_coded_signal(n_samples, n_components, n_features,
     n_nonzero_coefs : int
         number of active (non-zero) coefficients in each sample
 
-    random_state: int or RandomState instance, optional (default=None)
+    random_state : int or RandomState instance, optional (default=None)
         seed used by the pseudo random number generator
 
     Returns
     -------
-    data: array of shape [n_features, n_samples]
+    data : array of shape [n_features, n_samples]
         The encoded signal (Y).
 
-    dictionary: array of shape [n_features, n_components]
+    dictionary : array of shape [n_features, n_components]
         The dictionary with normalized components (D).
 
-    code: array of shape [n_components, n_samples]
+    code : array of shape [n_components, n_samples]
         The sparse code such that each column of this matrix has exactly
         n_nonzero_coefs non-zero items (X).
 
@@ -1192,10 +1190,10 @@ def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False,
 
     Parameters
     ----------
-    dim: integer, optional (default=1)
+    dim : integer, optional (default=1)
         The size of the random matrix to generate.
 
-    alpha: float between 0 and 1, optional (default=0.95)
+    alpha : float between 0 and 1, optional (default=0.95)
         The probability that a coefficient is zero (see notes). Larger values 
         enforce more sparsity.
 
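The generators touched above are public ``sklearn.datasets`` functions, so the corrected docstrings can be exercised directly. A hedged usage sketch, with keyword names taken from the docstrings in this hunk and shapes following from the documented defaults:

```python
from sklearn.datasets import make_moons, make_blobs

# Two interleaving half circles with Gaussian noise (see make_moons above).
X, y = make_moons(n_samples=100, noise=0.1, random_state=0)
print(X.shape, y.shape)   # (100, 2) (100,)

# Isotropic Gaussian blobs; cluster_std and center_box are the
# parameters whose docstrings this patch reformats.
X, y = make_blobs(n_samples=100, n_features=2, centers=3,
                  cluster_std=1.0, center_box=(-10.0, 10.0),
                  random_state=0)
print(X.shape, set(y))    # (100, 2) {0, 1, 2}
```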
diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
index 2720aab6e1dca01aa7cb963e0e8b71c2395a2669..6af36e6745d33ea6dc4a287df40bbfe69f7807cb 100644
--- a/sklearn/datasets/species_distributions.py
+++ b/sklearn/datasets/species_distributions.py
@@ -141,7 +141,7 @@ def fetch_species_distributions(data_home=None,
         Specify another download and cache folder for the datasets. By default
         all scikit learn data is stored in '~/scikit_learn_data' subfolders.
 
-    download_if_missing: optional, True by default
+    download_if_missing : optional, True by default
         If False, raise a IOError if the data is not locally available
         instead of trying to download the data from the source site.
 
diff --git a/sklearn/datasets/svmlight_format.py b/sklearn/datasets/svmlight_format.py
index 951eb00c8241cda4d10212eb40a0c6b9db7a49ee..52e81da086f627d985a15e5a34dbb1eeae6ad666 100644
--- a/sklearn/datasets/svmlight_format.py
+++ b/sklearn/datasets/svmlight_format.py
@@ -100,12 +100,12 @@ def load_svmlight_file(f, n_features=None, dtype=np.float64,
 
     Returns
     -------
-    X: scipy.sparse matrix of shape (n_samples, n_features)
+    X : scipy.sparse matrix of shape (n_samples, n_features)
 
-    y: ndarray of shape (n_samples,), or, in the multilabel a list of
+    y : ndarray of shape (n_samples,), or, in the multilabel case, a list of
         tuples of length n_samples.
 
-    query_id: array of shape (n_samples,)
+    query_id : array of shape (n_samples,)
        query_id for each sample. Only returned when query_id is set to
        True.
 
@@ -198,7 +198,7 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
         closed by this function. File-like objects must be opened in binary
         mode.
 
-    n_features: int or None
+    n_features : int or None
         The number of features to use. If None, it will be inferred from the
         maximum column index occurring in any of the files.
 
@@ -206,11 +206,11 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
         in any of the input files, but setting it to a lower value will cause
         an exception to be raised.
 
-    multilabel: boolean, optional
+    multilabel : boolean, optional
         Samples may have several labels each (see
         http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)
 
-    zero_based: boolean or "auto", optional
+    zero_based : boolean or "auto", optional
         Whether column indices in f are zero-based (True) or one-based
         (False). If column indices are one-based, they are transformed to
         zero-based to match Python/NumPy conventions.
@@ -219,7 +219,7 @@ def load_svmlight_files(files, n_features=None, dtype=np.float64,
         are unfortunately not self-identifying. Using "auto" or True should
         always be safe.
 
-    query_id: boolean, defaults to False
+    query_id : boolean, defaults to False
         If True, will return the query_id array for each file.
 
     dtype : numpy data type, default np.float64
@@ -374,7 +374,7 @@ def dump_svmlight_file(X, y, f,  zero_based=True, comment=None, query_id=None,
         Array containing pairwise preference constraints (qid in svmlight
         format).
 
-    multilabel: boolean, optional
+    multilabel : boolean, optional
         Samples may have several labels each (see
         http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)
 
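For the svmlight helpers above, a small round-trip sketch (assuming only the documented API: ``dump_svmlight_file`` writing to a binary file-like object and ``load_svmlight_file`` reading it back as a sparse matrix plus label array):

```python
from io import BytesIO

import numpy as np
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

X = np.array([[1.0, 0.0], [0.0, 2.0]])
y = np.array([0, 1])

# Write to an in-memory binary buffer; the docstring above notes that
# file-like objects must be opened in binary mode.
buf = BytesIO()
dump_svmlight_file(X, y, buf, zero_based=True)
buf.seek(0)

# X comes back as a scipy.sparse matrix of shape (n_samples, n_features).
X2, y2 = load_svmlight_file(buf, n_features=2, zero_based=True)
print(X2.toarray(), y2)
```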
diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py
index 524d38d6c1ec1a9c321ec7a594b112994abfba0b..128610fd2830f03407cbdb36de0184306612abdb 100644
--- a/sklearn/datasets/twenty_newsgroups.py
+++ b/sklearn/datasets/twenty_newsgroups.py
@@ -161,32 +161,32 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None,
 
     Parameters
     ----------
-    subset: 'train' or 'test', 'all', optional
+    subset : 'train' or 'test', 'all', optional
         Select the dataset to load: 'train' for the training set, 'test'
         for the test set, 'all' for both, with shuffled ordering.
 
-    data_home: optional, default: None
+    data_home : optional, default: None
         Specify a download and cache folder for the datasets. If None,
         all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
 
-    categories: None or collection of string or unicode
+    categories : None or collection of string or unicode
         If None (default), load all the categories.
         If not None, list of category names to load (other categories
         ignored).
 
-    shuffle: bool, optional
+    shuffle : bool, optional
         Whether or not to shuffle the data: might be important for models that
         make the assumption that the samples are independent and identically
         distributed (i.i.d.), such as stochastic gradient descent.
 
-    random_state: numpy random number generator or seed integer
+    random_state : numpy random number generator or seed integer
         Used to shuffle the dataset.
 
-    download_if_missing: optional, True by default
+    download_if_missing : optional, True by default
         If False, raise an IOError if the data is not locally available
         instead of trying to download the data from the source site.
 
-    remove: tuple
+    remove : tuple
         May contain any subset of ('headers', 'footers', 'quotes'). Each of
         these are kinds of text that will be detected and removed from the
         newsgroup posts, preventing classifiers from overfitting on
@@ -297,15 +297,15 @@ def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None):
     Parameters
     ----------
 
-    subset: 'train' or 'test', 'all', optional
+    subset : 'train' or 'test', 'all', optional
         Select the dataset to load: 'train' for the training set, 'test'
         for the test set, 'all' for both, with shuffled ordering.
 
-    data_home: optional, default: None
+    data_home : optional, default: None
         Specify an download and cache folder for the datasets. If None,
         all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
 
-    remove: tuple
+    remove : tuple
         May contain any subset of ('headers', 'footers', 'quotes'). Each of
         these are kinds of text that will be detected and removed from the
         newsgroup posts, preventing classifiers from overfitting on
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index 990baef4c8c7870b383ca94eea88771f77fc6f9d..7e6a136f3d651e995f4473a899fbd55410b392d6 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -394,44 +394,44 @@ def dict_learning(X, n_components, alpha, max_iter=100, tol=1e-8,
 
     Parameters
     ----------
-    X: array of shape (n_samples, n_features)
+    X : array of shape (n_samples, n_features)
         Data matrix.
 
-    n_components: int,
+    n_components : int,
         Number of dictionary atoms to extract.
 
-    alpha: int,
+    alpha : int,
         Sparsity controlling parameter.
 
-    max_iter: int,
+    max_iter : int,
         Maximum number of iterations to perform.
 
-    tol: float,
+    tol : float,
         Tolerance for the stopping condition.
 
-    method: {'lars', 'cd'}
+    method : {'lars', 'cd'}
         lars: uses the least angle regression method to solve the lasso problem
         (linear_model.lars_path)
         cd: uses the coordinate descent method to compute the
         Lasso solution (linear_model.Lasso). Lars will be faster if
         the estimated components are sparse.
 
-    n_jobs: int,
+    n_jobs : int,
         Number of parallel jobs to run, or -1 to autodetect.
 
-    dict_init: array of shape (n_components, n_features),
+    dict_init : array of shape (n_components, n_features),
         Initial value for the dictionary for warm restart scenarios.
 
-    code_init: array of shape (n_samples, n_components),
+    code_init : array of shape (n_samples, n_components),
         Initial value for the sparse code for warm restart scenarios.
 
-    callback:
+    callback :
         Callable that gets invoked every five iterations.
 
-    verbose:
+    verbose :
         Degree of output the procedure will print.
 
-    random_state: int or RandomState
+    random_state : int or RandomState
         Pseudo number generator state used for random sampling.
 
     return_n_iter : bool
@@ -439,13 +439,13 @@ def dict_learning(X, n_components, alpha, max_iter=100, tol=1e-8,
 
     Returns
     -------
-    code: array of shape (n_samples, n_components)
+    code : array of shape (n_samples, n_components)
         The sparse code factor in the matrix factorization.
 
-    dictionary: array of shape (n_components, n_features),
+    dictionary : array of shape (n_components, n_features),
         The dictionary factor in the matrix factorization.
 
-    errors: array
+    errors : array
         Vector of errors at each iteration.
 
     n_iter : int
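Since ``dict_learning`` is exposed from ``sklearn.decomposition``, the three documented return values can be checked with a short sketch (array shapes follow the Returns section above):

```python
import numpy as np
from sklearn.decomposition import dict_learning

rng = np.random.RandomState(0)
X = rng.randn(20, 8)

# code: (n_samples, n_components), dictionary: (n_components, n_features),
# errors: one objective value per iteration.
code, dictionary, errors = dict_learning(X, n_components=5, alpha=1.0,
                                         max_iter=50, random_state=0)
print(code.shape, dictionary.shape, len(errors))
```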
diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py
index 408024bce20733585e37997ed6c5806e0e56dee2..66390d7a2c96349d1437bdfaca9cc67e9454330c 100644
--- a/sklearn/feature_extraction/dict_vectorizer.py
+++ b/sklearn/feature_extraction/dict_vectorizer.py
@@ -52,13 +52,13 @@ class DictVectorizer(BaseEstimator, TransformerMixin):
     dtype : callable, optional
         The type of feature values. Passed to Numpy array/scipy.sparse matrix
         constructors as the dtype argument.
-    separator: string, optional
+    separator : string, optional
         Separator string used when constructing new features for one-hot
         coding.
-    sparse: boolean, optional.
+    sparse : boolean, optional.
         Whether transform should produce scipy.sparse matrices.
         True by default.
-    sort: boolean, optional.
+    sort : boolean, optional.
         Whether ``feature_names_`` and ``vocabulary_`` should be sorted when fitting.
         True by default.
 
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index b9d09443832b51b4b6af6c31723e01164f064eeb..856a2db060eff58387f00d7ff93d7c12ce58a568 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -397,12 +397,12 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     norm : 'l1', 'l2' or None, optional
         Norm used to normalize term vectors. None for no normalization.
 
-    binary: boolean, default=False.
+    binary : boolean, default=False.
         If True, all non zero counts are set to 1. This is useful for discrete
         probabilistic models that model binary events rather than integer
         counts.
 
-    dtype: type, optional
+    dtype : type, optional
         Type of the matrix returned by fit_transform() or transform().
 
     non_negative : boolean, default=False
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index f5cec7bd89d2545a79210ea1182caed6e0462868..f0e6c6c4397790e7609fa1f8aefe1b0d26f98bbc 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -483,7 +483,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
 
             'fmin_l_bfgs_b'
 
-    n_restarts_optimizer: int, optional (default: 0)
+    n_restarts_optimizer : int, optional (default: 0)
         The number of restarts of the optimizer for finding the kernel's
         parameters which maximize the log-marginal likelihood. The first run
         of the optimizer is performed from the kernel's initial parameters,
@@ -492,7 +492,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
         must be finite. Note that n_restarts_optimizer=0 implies that one
         run is performed.
 
-    max_iter_predict: int, optional (default: 100)
+    max_iter_predict : int, optional (default: 100)
         The maximum number of iterations in Newton's method for approximating
         the posterior during predict. Smaller values will reduce computation
         time at the cost of worse results.
@@ -515,7 +515,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
         given, it fixes the seed. Defaults to the global numpy random
         number generator.
 
-    multi_class: string, default: "one_vs_rest"
+    multi_class : string, default: "one_vs_rest"
         Specifies how multi-class classification problems are handled.
         Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
         one binary Gaussian process classifier is fitted for each class, which
@@ -541,7 +541,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
         classification, a CompoundKernel is returned which consists of the
         different kernels used in the one-versus-rest classifiers.
 
-    log_marginal_likelihood_value_: float
+    log_marginal_likelihood_value_ : float
         The log-marginal-likelihood of ``self.kernel_.theta``
 
     classes_ : array-like, shape = (n_classes,)
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 4cf4cb07a724383db41618c10e3955444c19a04c..4f4941fe1d706f53670f14a84135fcf196651f05 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -79,7 +79,7 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
 
             'fmin_l_bfgs_b'
 
-    n_restarts_optimizer: int, optional (default: 0)
+    n_restarts_optimizer : int, optional (default: 0)
         The number of restarts of the optimizer for finding the kernel's
         parameters which maximize the log-marginal likelihood. The first run
         of the optimizer is performed from the kernel's initial parameters,
@@ -88,7 +88,7 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
         must be finite. Note that n_restarts_optimizer == 0 implies that one
         run is performed.
 
-    normalize_y: boolean, optional (default: False)
+    normalize_y : boolean, optional (default: False)
         Whether the target values y are normalized, i.e., the mean of the
         observed target values become zero. This parameter should be set to
         True if the target values' mean is expected to differ considerable from
@@ -112,20 +112,20 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     X_train_ : array-like, shape = (n_samples, n_features)
         Feature values in training data (also required for prediction)
 
-    y_train_: array-like, shape = (n_samples, [n_output_dims])
+    y_train_ : array-like, shape = (n_samples, [n_output_dims])
         Target values in training data (also required for prediction)
 
-    kernel_: kernel object
+    kernel_ : kernel object
         The kernel used for prediction. The structure of the kernel is the
         same as the one passed as parameter but with optimized hyperparameters
 
-    L_: array-like, shape = (n_samples, n_samples)
+    L_ : array-like, shape = (n_samples, n_samples)
         Lower-triangular Cholesky decomposition of the kernel in ``X_train_``
 
-    alpha_: array-like, shape = (n_samples,)
+    alpha_ : array-like, shape = (n_samples,)
         Dual coefficients of training data points in kernel space
 
-    log_marginal_likelihood_value_: float
+    log_marginal_likelihood_value_ : float
         The log-marginal-likelihood of ``self.kernel_.theta``
 
     """
diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py
index 1e65358746570ef6cf3f3e63b3dd5659730277e2..1c29f7fa6b33f5b3f1c0bdf9d4aa509f3eb3a9a2 100644
--- a/sklearn/linear_model/ransac.py
+++ b/sklearn/linear_model/ransac.py
@@ -138,7 +138,7 @@ class RANSACRegressor(BaseEstimator, MetaEstimatorMixin, RegressorMixin):
         NOTE: residual_metric is deprecated from 0.18 and will be removed in 0.20
         Use ``loss`` instead.
 
-    loss: string, callable, optional, default "absolute_loss"
+    loss : string, callable, optional, default "absolute_loss"
         String inputs, "absolute_loss" and "squared_loss" are supported which
         find the absolute loss and squared loss per sample
         respectively.
@@ -206,7 +206,7 @@ class RANSACRegressor(BaseEstimator, MetaEstimatorMixin, RegressorMixin):
         y : array-like, shape = [n_samples] or [n_samples, n_targets]
             Target values.
 
-        sample_weight: array-like, shape = [n_samples]
+        sample_weight : array-like, shape = [n_samples]
             Individual weights for each sample
             raises error if sample_weight is passed and base_estimator
             fit method does not support it.
diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py
index 7de60140e8677629149160247f01bdec3e5c82eb..fd9e496d093d82214c55068330403f4052c16f79 100644
--- a/sklearn/manifold/locally_linear.py
+++ b/sklearn/manifold/locally_linear.py
@@ -139,7 +139,7 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
     max_iter : maximum number of iterations for 'arpack' method
         not used if eigen_solver=='dense'
 
-    random_state: numpy.RandomState or int, optional
+    random_state : numpy.RandomState or int, optional
         The generator or seed used to determine the starting vector for arpack
         iterations.  Defaults to numpy.random.
 
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index 60cdacea976f51afd18232e17f72056c2f59ed11..a0b5593b08217fc4e00d61fffdf3354790ccaa11 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -941,16 +941,16 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
 
     Returns
     -------
-    precision: float (if average is not None) or array of float, shape =\
+    precision : float (if average is not None) or array of float, shape =\
         [n_unique_labels]
 
-    recall: float (if average is not None) or array of float, , shape =\
+    recall : float (if average is not None) or array of float, shape =\
         [n_unique_labels]
 
-    fbeta_score: float (if average is not None) or array of float, shape =\
+    fbeta_score : float (if average is not None) or array of float, shape =\
         [n_unique_labels]
 
-    support: int (if average is not None) or array of int, shape =\
+    support : int (if average is not None) or array of int, shape =\
         [n_unique_labels]
         The number of occurrences of each label in ``y_true``.
 
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 010f8bb928d1a71a73a0e6f129aa82a8a7deceb9..59f048b6d854d130d0c4098a77f04967d85a3561 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -253,13 +253,13 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred):
 
     Returns
     -------
-    homogeneity: float
+    homogeneity : float
        score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling
 
-    completeness: float
+    completeness : float
        score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling
 
-    v_measure: float
+    v_measure : float
         harmonic mean of the first two
 
     See also
@@ -317,7 +317,7 @@ def homogeneity_score(labels_true, labels_pred):
 
     Returns
     -------
-    homogeneity: float
+    homogeneity : float
        score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling
 
     References
@@ -465,7 +465,7 @@ def v_measure_score(labels_true, labels_pred):
 
     Returns
     -------
-    v_measure: float
+    v_measure : float
        score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling
 
     References
@@ -573,7 +573,7 @@ def mutual_info_score(labels_true, labels_pred, contingency=None):
 
     Returns
     -------
-    mi: float
+    mi : float
        Mutual information, a non-negative value
 
     See also
@@ -741,7 +741,7 @@ def normalized_mutual_info_score(labels_true, labels_pred):
 
     Returns
     -------
-    nmi: float
+    nmi : float
        score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling
 
     See also
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index c058e20967291171d6b6940d287e62b26e52b83d..e8f5090cba64b28ed360ec1cfeda82d5304821a8 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -787,7 +787,7 @@ def sigmoid_kernel(X, Y=None, gamma=None, coef0=1):
 
     Returns
     -------
-    Gram matrix: array of shape (n_samples_1, n_samples_2)
+    Gram matrix : array of shape (n_samples_1, n_samples_2)
     """
     X, Y = check_pairwise_arrays(X, Y)
     if gamma is None:
@@ -1354,7 +1354,7 @@ def pairwise_kernels(X, Y=None, metric="linear", filter_params=False,
         (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
         are used.
 
-    filter_params: boolean
+    filter_params : boolean
         Whether to filter invalid parameters or not.
 
     `**kwds` : optional keyword parameters
diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py
index a8a1e2d928b11de127510ed18df66925b884e3d9..0b7f11affe3ca5217c8ad434a87a2937dad9c3a7 100644
--- a/sklearn/mixture/dpgmm.py
+++ b/sklearn/mixture/dpgmm.py
@@ -262,7 +262,7 @@ class _DPGMMBase(_GMMBase):
         -------
         logprob : array_like, shape (n_samples,)
             Log probabilities of each data point in X
-        responsibilities: array_like, shape (n_samples, n_components)
+        responsibilities : array_like, shape (n_samples, n_components)
             Posterior probabilities of each mixture component for each
             observation
         """
@@ -787,7 +787,7 @@ class VBGMM(_DPGMMBase):
         -------
         logprob : array_like, shape (n_samples,)
             Log probabilities of each data point in X
-        responsibilities: array_like, shape (n_samples, n_components)
+        responsibilities : array_like, shape (n_samples, n_components)
             Posterior probabilities of each mixture component for each
             observation
         """
diff --git a/sklearn/mixture/gaussian_mixture.py b/sklearn/mixture/gaussian_mixture.py
index f4a182a7c95672c42899ae26e9b9ce9431d6a013..e7c489cbb5140ba8609e0970d3f06f7ee08fe36f 100644
--- a/sklearn/mixture/gaussian_mixture.py
+++ b/sklearn/mixture/gaussian_mixture.py
@@ -500,7 +500,7 @@ class GaussianMixture(BaseMixture):
             (n_components, n_features)             if 'diag',
             (n_components, n_features, n_features) if 'full'
 
-    random_state: RandomState or an int seed, defaults to None.
+    random_state : RandomState or an int seed, defaults to None.
         A random number generator instance.
 
     warm_start : bool, default to False.
diff --git a/sklearn/neighbors/nearest_centroid.py b/sklearn/neighbors/nearest_centroid.py
index 10eb107d3c2ffd873d3d42a709a9a1de7c63c878..778605ad84e8a00099da203db9cba63ca9ed03cb 100644
--- a/sklearn/neighbors/nearest_centroid.py
+++ b/sklearn/neighbors/nearest_centroid.py
@@ -29,7 +29,7 @@ class NearestCentroid(BaseEstimator, ClassifierMixin):
 
     Parameters
     ----------
-    metric: string, or callable
+    metric : string, or callable
         The metric to use when calculating distance between instances in a
         feature array. If metric is a string or callable, it must be one of
         the options allowed by metrics.pairwise.pairwise_distances for its
diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py
index bf6199b00457773df17ada659c3fadc7d3b6dcff..19c0ac0d5bc9ddc44fc76e31049e4f55e18fe726 100644
--- a/sklearn/preprocessing/_function_transformer.py
+++ b/sklearn/preprocessing/_function_transformer.py
@@ -50,7 +50,7 @@ class FunctionTransformer(BaseEstimator, TransformerMixin):
         False, this has no effect. Otherwise, if accept_sparse is false,
         sparse matrix inputs will cause an exception to be raised.
 
-    pass_y: bool, optional default=False
+    pass_y : bool, optional default=False
         Indicate that transform should forward the y argument to the
         inner callable.
 
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 08be1c75d0f499ed3806d7e5e9962e72b48f24bb..4740d18f5b84e012f6f9f401e37797a8bcb00253 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -204,7 +204,7 @@ class MinMaxScaler(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    feature_range: tuple (min, max), default=(0, 1)
+    feature_range : tuple (min, max), default=(0, 1)
         Desired range of transformed data.
 
     copy : boolean, optional, default True
@@ -403,7 +403,7 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True):
 
     Parameters
     ----------
-    feature_range: tuple (min, max), default=(0, 1)
+    feature_range : tuple (min, max), default=(0, 1)
         Desired range of transformed data.
 
     axis : int (0 by default)
@@ -1754,7 +1754,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
                   ``X[:, i]``. Each feature value should be
                   in ``range(n_values[i])``
 
-    categorical_features: "all" or array of indices or mask
+    categorical_features : "all" or array of indices or mask
         Specify what features are treated as categorical.
 
         - 'all' (default): All features are treated as categorical.