From 67ece78da4f8cd60582a83f075b04dc32e33d6ad Mon Sep 17 00:00:00 2001
From: Stefano Lattarini <stefano.lattarini@gmail.com>
Date: Thu, 11 Apr 2013 20:51:28 +0200
Subject: [PATCH] COSMIT various typofixes

As suggested by codespell <https://github.com/lucasdemarchi/codespell>
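
For reference, the report can be reproduced with an invocation along
these lines (a sketch; exact options vary across codespell versions):

    $ pip install codespell
    $ codespell doc/ examples/ sklearn/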

Signed-off-by: Stefano Lattarini <stefano.lattarini@gmail.com>
---
 doc/datasets/index.rst                         |  2 +-
 doc/developers/index.rst                       |  2 +-
 doc/developers/performance.rst                 |  2 +-
 doc/modules/decomposition.rst                  |  2 +-
 doc/modules/dp-derivation.rst                  |  2 +-
 doc/modules/ensemble.rst                       |  2 +-
 doc/modules/feature_extraction.rst             |  4 ++--
 doc/modules/gaussian_process.rst               |  6 +++---
 doc/modules/kernel_approximation.rst           |  2 +-
 doc/modules/lda_qda.rst                        |  2 +-
 doc/modules/linear_model.rst                   |  6 +++---
 doc/modules/model_evaluation.rst               |  2 +-
 doc/modules/outlier_detection.rst              |  6 +++---
 doc/modules/sgd.rst                            |  2 +-
 .../plot_outlier_detection_housing.py          |  2 +-
 examples/cluster/plot_digits_agglomeration.py  |  2 +-
 examples/cluster/plot_kmeans_digits.py         |  2 +-
 examples/cluster/plot_segmentation_toy.py      |  2 +-
 .../covariance/plot_mahalanobis_distances.py   |  4 ++--
 examples/decomposition/plot_sparse_coding.py   |  2 +-
 examples/document_clustering.py                |  2 +-
 examples/ensemble/plot_forest_importances.py   |  2 +-
 .../ensemble/plot_random_forest_embedding.py   |  2 +-
 examples/linear_model/plot_ard.py              |  2 +-
 examples/linear_model/plot_bayesian_ridge.py   |  2 +-
 examples/linear_model/plot_iris_logistic.py    |  2 +-
 .../plot_multi_task_lasso_support.py           |  2 +-
 examples/linear_model/plot_sgd_iris.py         |  2 +-
 examples/manifold/plot_compare_methods.py      |  2 +-
 examples/manifold/plot_lle_digits.py           |  2 +-
 examples/manifold/plot_manifold_sphere.py      |  2 +-
 examples/neighbors/plot_classification.py      |  2 +-
 examples/neighbors/plot_nearest_centroid.py    |  2 +-
 examples/plot_classifier_comparison.py         |  2 +-
 examples/plot_kernel_approximation.py          |  2 +-
 ...label_propagation_digits_active_learning.py |  2 +-
 .../plot_label_propagation_versus_svm_iris.py  |  2 +-
 examples/svm/plot_custom_kernel.py             |  2 +-
 examples/svm/plot_iris.py                      |  2 +-
 examples/svm/plot_svm_iris.py                  |  2 +-
 examples/svm/plot_svm_margin.py                |  6 +++---
 examples/svm/plot_svm_scale_c.py               |  6 +++---
 examples/tree/plot_tree_regression.py          |  2 +-
 sklearn/_hmmc.c                                |  2 +-
 sklearn/_isotonic.c                            |  2 +-
 sklearn/cluster/_hierarchical.c                |  2 +-
 sklearn/cluster/_k_means.c                     |  2 +-
 sklearn/cluster/hierarchical.py                |  2 +-
 sklearn/cluster/k_means_.py                    |  6 +++---
 sklearn/cluster/mean_shift_.py                 |  4 ++--
 sklearn/covariance/graph_lasso_.py             |  2 +-
 sklearn/covariance/shrunk_covariance_.py       |  2 +-
 sklearn/cross_validation.py                    |  2 +-
 sklearn/datasets/DATASET_PROPOSAL.txt          |  2 +-
 sklearn/datasets/_svmlight_format.c            |  2 +-
 sklearn/datasets/lfw.py                        |  2 +-
 sklearn/datasets/species_distributions.py      |  4 ++--
 sklearn/datasets/tests/test_lfw.py             |  2 +-
 sklearn/ensemble/_gradient_boosting.c          |  2 +-
 sklearn/ensemble/forest.py                     |  2 +-
 sklearn/ensemble/gradient_boosting.py          |  2 +-
 sklearn/ensemble/partial_dependence.py         |  2 +-
 sklearn/externals/joblib/func_inspect.py       |  2 +-
 sklearn/externals/joblib/memory.py             | 12 ++++++------
 sklearn/externals/joblib/numpy_pickle.py       |  2 +-
 sklearn/externals/joblib/parallel.py           |  4 ++--
 sklearn/externals/joblib/test/test_hashing.py  |  8 ++++----
 sklearn/externals/joblib/test/test_memory.py   |  2 +-
 sklearn/feature_extraction/_hashing.c          |  6 +++---
 sklearn/feature_extraction/text.py             |  2 +-
 sklearn/gaussian_process/gaussian_process.py   |  2 +-
 sklearn/grid_search.py                         |  8 ++++----
 sklearn/hmm.py                                 |  4 ++--
 sklearn/linear_model/base.py                   |  2 +-
 sklearn/linear_model/cd_fast.c                 |  2 +-
 sklearn/linear_model/coordinate_descent.py     |  6 +++---
 sklearn/linear_model/least_angle.py            |  4 ++--
 sklearn/linear_model/randomized_l1.py          |  4 ++--
 sklearn/linear_model/sgd_fast.c                | 18 +++++++++---------
 sklearn/linear_model/sgd_fast.pyx              |  2 +-
 .../tests/test_coordinate_descent.py           |  2 +-
 sklearn/linear_model/tests/test_least_angle.py |  2 +-
 sklearn/manifold/spectral_embedding.py         |  2 +-
 .../cluster/expected_mutual_info_fast.c        |  2 +-
 sklearn/metrics/cluster/supervised.py          |  8 ++++----
 sklearn/metrics/metrics.py                     |  8 ++++----
 sklearn/metrics/pairwise.py                    |  8 ++++----
 sklearn/metrics/pairwise_fast.c                |  2 +-
 sklearn/neighbors/ball_tree.c                  |  2 +-
 sklearn/pls.py                                 | 10 +++++-----
 sklearn/preprocessing.py                       |  4 ++--
 sklearn/svm/classes.py                         |  2 +-
 sklearn/svm/liblinear.c                        |  2 +-
 sklearn/svm/libsvm.c                           |  2 +-
 sklearn/svm/libsvm_sparse.c                    |  2 +-
 sklearn/svm/setup.py                           |  2 +-
 sklearn/svm/src/libsvm/svm.cpp                 |  4 ++--
 sklearn/tests/test_pipeline.py                 |  2 +-
 sklearn/tree/_tree.c                           |  2 +-
 sklearn/utils/arpack.py                        |  8 ++++----
 sklearn/utils/arraybuilder.c                   |  2 +-
 sklearn/utils/arrayfuncs.c                     |  2 +-
 sklearn/utils/class_weight.py                  |  2 +-
 sklearn/utils/graph_shortest_path.c            |  2 +-
 sklearn/utils/murmurhash.c                     |  2 +-
 sklearn/utils/random.c                         |  2 +-
 sklearn/utils/seq_dataset.c                    |  2 +-
 sklearn/utils/seq_dataset.pyx                  |  4 ++--
 sklearn/utils/sparsefuncs.c                    |  2 +-
 sklearn/utils/sparsetools/csgraph_wrap.cxx     |  2 +-
 sklearn/utils/src/gamma.c                      |  2 +-
 sklearn/utils/weight_vector.c                  |  2 +-
 112 files changed, 174 insertions(+), 174 deletions(-)

diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst
index b95b65708e..b18bf8bd4e 100644
--- a/doc/datasets/index.rst
+++ b/doc/datasets/index.rst
@@ -106,7 +106,7 @@ Sample generators
 =================
 
 In addition, scikit-learn includes various random sample generators that
-can be used to build artifical datasets of controled size and complexity.
+can be used to build artificial datasets of controlled size and complexity.
 
 .. image:: ../auto_examples/datasets/images/plot_random_dataset_1.png
    :target: ../auto_examples/datasets/plot_random_dataset.html
diff --git a/doc/developers/index.rst b/doc/developers/index.rst
index 35f2593116..018070f165 100644
--- a/doc/developers/index.rst
+++ b/doc/developers/index.rst
@@ -467,7 +467,7 @@ hence the validation in ``fit``, not ``__init__``.
 Deprecation
 -----------
 
-If any publically accessible method, function, attribute or parameter
+If any publicly accessible method, function, attribute or parameter
 is renamed, we still support the old one for two releases and issue
 a deprecation warning when it is called/passed/accessed.
 E.g., if the function ``zero_one`` is renamed to ``zero_one_loss``,
diff --git a/doc/developers/performance.rst b/doc/developers/performance.rst
index 68919daeb2..ac0b102e78 100644
--- a/doc/developers/performance.rst
+++ b/doc/developers/performance.rst
@@ -132,7 +132,7 @@ Note the use of the ``-l nmf.py`` that restricts the output to lines that
 contains the "nmf.py" string. This is useful to have a quick look at the hotspot
 of the nmf Python module it-self ignoring anything else.
 
-Here is the begining of the output of the same command without the ``-l nmf.py``
+Here is the beginning of the output of the same command without the ``-l nmf.py``
 filter::
 
   In [5] %prun NMF(n_components=16, tol=1e-2).fit(X)
diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index c796dfba0c..5fc64a8417 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -510,7 +510,7 @@ instead of :class:`PCA` or its variants, in the cases where the data matrix
 does not contain negative values.
 
 Unlike :class:`PCA`, the representation of a vector is obtained in an additive
-fashion, by superimposing the components, without substracting. Such additive
+fashion, by superimposing the components, without subtracting. Such additive
 models are efficient for representing images and text.
 
 It has been observed in [Hoyer, 04] that, when carefully constrained,
diff --git a/doc/modules/dp-derivation.rst b/doc/modules/dp-derivation.rst
index 3e4d1b2d7c..f2dab2854d 100644
--- a/doc/modules/dp-derivation.rst
+++ b/doc/modules/dp-derivation.rst
@@ -287,7 +287,7 @@ distributions of :math:`\sigma` (as there are a lot more :math:`\sigma` s
 now) and :math:`X`.
 
 The bound for :math:`\sigma_{k,d}` is the same bound for :math:`\sigma_k` and can
-be safelly ommited.
+be safely omitted.
 
 **The bound for** :math:`X` :
 
diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 5747b1d7e2..b93c54a410 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -601,7 +601,7 @@ used for model selection (e.g. to determine the optimal number of iterations).
 
 Another strategy to reduce the variance is by subsampling the features
 analogous to the random splits in Random Forests. The size of the subsample
-can be controled via the ``max_features`` parameter.
+can be controlled via the ``max_features`` parameter.
 
 .. topic:: Examples:
 
diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst
index 088606c4f1..53441093c0 100644
--- a/doc/modules/feature_extraction.rst
+++ b/doc/modules/feature_extraction.rst
@@ -394,7 +394,7 @@ suitable for usage by a classifier it is very common to use the tf–idf
 transform.
 
 Tf means **term-frequency** while tf–idf means term-frequency times
-**inverse document-frequency**. This is a orginally a term weighting
+**inverse document-frequency**. This is originally a term weighting
 scheme developed for information retrieval (as a ranking function
 for search engines results), that has also found good use in document
 classification and clustering.
@@ -504,7 +504,7 @@ misspellings or word derivations.
 
 N-grams to the rescue! Instead of building a simple collection of
 unigrams (n=1), one might prefer a collection of bigrams (n=2), where
-occurences of pairs of consecutive words are counted.
+occurrences of pairs of consecutive words are counted.
 
 One might alternatively consider a collection of character n-grams, a
 representation resiliant against misspellings and derivations.
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index f8e129cd36..27a104e2bd 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -55,7 +55,7 @@ Say we want to surrogate the function :math:`g(x) = x \sin(x)`. To do so,
 the function is evaluated onto a design of experiments. Then, we define a
 GaussianProcess model whose regression and correlation models might be
 specified using additional kwargs, and ask for the model to be fitted to the
-data. Depending on the number of parameters provided at instanciation, the
+data. Depending on the number of parameters provided at instantiation, the
 fitting procedure may recourse to maximum likelihood estimation for the
 parameters or alternatively it uses the given parameters.
 
@@ -151,7 +151,7 @@ basic least squares linear regression problem:
 
         g(X) \approx f(X)^T \beta
 
-Except we additionaly assume some spatial coherence (correlation) between the
+Except we additionally assume some spatial coherence (correlation) between the
 samples dictated by the correlation function. Indeed, ordinary least squares
 assumes the correlation model :math:`R(|X - X'|)` is one when :math:`X = X'`
 and zero otherwise : a *dirac* correlation model -- sometimes referred to as a
@@ -316,7 +316,7 @@ Correlation Models
 
 Common correlation models matches some famous SVM's kernels because they are
 mostly built on equivalent assumptions. They must fulfill Mercer's conditions
-and should additionaly remain stationary. Note however, that the choice of the
+and should additionally remain stationary. Note however, that the choice of the
 correlation model should be made in agreement with the known properties of the
 original experiment from which the observations come. For instance:
 
diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst
index 43515b4b2e..65c4c767d1 100644
--- a/doc/modules/kernel_approximation.rst
+++ b/doc/modules/kernel_approximation.rst
@@ -118,7 +118,7 @@ The class :class:`AdditiveChi2Sampler` implements this component wise
 deterministic sampling. Each component is sampled `n` times, yielding
 `2n+1` dimensions per input dimension (the multiple of two stems
 from the real and complex part of the Fourier transform).
-In the literature, `n` is usually choosen to be `1` or `2`, transforming
+In the literature, `n` is usually chosen to be `1` or `2`, transforming
 the dataset to size `n_samples x 5 * n_features` (in the case of `n=2`).
 
 The approximate feature map provided by :class:`AdditiveChi2Sampler` can be combined
diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst
index 854f75f7d6..180c8a586f 100644
--- a/doc/modules/lda_qda.rst
+++ b/doc/modules/lda_qda.rst
@@ -54,7 +54,7 @@ for each class `k`. Predictions can be obtained by using Bayes' rule:
 .. math::
     P(y | X) = P(X | y) \cdot P(y) / P(X) = P(X | y) \cdot P(Y) / ( \sum_{y'} P(X | y') \cdot p(y'))
 
-In linear and quadratic discriminant analysis, `P(X|y)` is modeled as a Gaussian distribution.
+In linear and quadratic discriminant analysis, `P(X|y)` is modelled as a Gaussian distribution.
 In the case of LDA, the Gaussians for each class are assumed to share the same covariance matrix.
 This leads to a linear decision surface, as can be seen by comparing the the log-probability rations
 :math:`log[P(y=k | X) / P(y=l | X)]`.
diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index 14d3874030..1ea5cb7aad 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -540,7 +540,7 @@ The prior for the parameter :math:`w` is given by a spherical Gaussian:
 .. math:: p(w|\lambda) =
     \mathcal{N}(w|0,\lambda^{-1}\bold{I_{p}})
 
-The priors over :math:`\alpha` and :math:`\lambda` are choosen to be `gamma
+The priors over :math:`\alpha` and :math:`\lambda` are chosen to be `gamma
 distributions <http://en.wikipedia.org/wiki/Gamma_distribution>`__, the
 conjugate prior for the precision of the Gaussian.
 
@@ -548,7 +548,7 @@ The resulting model is called *Bayesian Ridge Regression*, and is similar to the
 classical :class:`Ridge`.  The parameters :math:`w`, :math:`\alpha` and
 :math:`\lambda` are estimated jointly during the fit of the model.  The
 remaining hyperparameters are the parameters of the gamma priors over
-:math:`\alpha` and :math:`\lambda`.  These are usually choosen to be
+:math:`\alpha` and :math:`\lambda`.  These are usually chosen to be
 *non-informative*.  The parameters are estimated by maximizing the *marginal
 log likelihood*.
 
@@ -673,7 +673,7 @@ Stochastic Gradient Descent - SGD
 =================================
 
 Stochastic gradient descent is a simple yet very efficient approach
-to fit linear models. It is particulary useful when the number of samples
+to fit linear models. It is particularly useful when the number of samples
 (and the number of features) is very large.
 
 
diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index e89df88d26..c0e613b783 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -202,7 +202,7 @@ Classification report
 ---------------------
 The :func:`classification_report` function builds a text report showing the
 main classification metrics. Here a small example with custom ``target_names``
-and infered labels:
+and inferred labels:
 
    >>> from sklearn.metrics import classification_report
    >>> y_true = [0, 1, 2, 2, 0]
diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst
index fb57da80e0..ed61f64edf 100644
--- a/doc/modules/outlier_detection.rst
+++ b/doc/modules/outlier_detection.rst
@@ -42,7 +42,7 @@ add one more observation to that data set. Is the new observation so
 different from the others that we can doubt it is regular? (i.e. does
 it come from the same distribution?) Or on the contrary, is it so
 similar to the other that we cannot distinguish it from the original
-observations? This is the question adressed by the novelty detection
+observations? This is the question addressed by the novelty detection
 tools and methods.
 
 In general, it is about to learn a rough, close frontier delimiting
@@ -58,7 +58,7 @@ implemented in the :ref:`svm` module in the
 :class:`svm.OneClassSVM` object. It requires the choice of a
 kernel and a scalar parameter to define a frontier.  The RBF kernel is
 usually chosen although there exist no exact formula or algorithm to
-set its bandwith parameter. This is the default in the scikit-learn
+set its bandwidth parameter. This is the default in the scikit-learn
 implementation. The :math:`\nu` parameter, also known as the margin of
 the One-Class SVM, corresponds to the probability of finding a new,
 but regular, observation outside the frontier.
@@ -90,7 +90,7 @@ Fitting an elliptic envelop
 
 One common way of performing outlier detection is to assume that the
 regular data come from a known distribution (e.g. data are Gaussian
-distributed). From this assumption, we generaly try to define the
+distributed). From this assumption, we generally try to define the
 "shape" of the data, and can define outlying observations as
 observations which stand far enough from the fit shape. 
 
diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst
index 3a2c1170d7..656ec715c9 100644
--- a/doc/modules/sgd.rst
+++ b/doc/modules/sgd.rst
@@ -354,7 +354,7 @@ For regression the default learning rate schedule is inverse scaling
 
     \eta^{(t)} = \frac{eta_0}{t^{power\_t}}
 
-where :math:`eta_0` and :math:`power\_t` are hyperparameters choosen by the
+where :math:`eta_0` and :math:`power\_t` are hyperparameters chosen by the
 user via ``eta0`` and ``power_t``, resp.
 
 For a constant learning rate use ``learning_rate='constant'`` and use ``eta0``
diff --git a/examples/applications/plot_outlier_detection_housing.py b/examples/applications/plot_outlier_detection_housing.py
index 74c058bebc..319ba4af37 100644
--- a/examples/applications/plot_outlier_detection_housing.py
+++ b/examples/applications/plot_outlier_detection_housing.py
@@ -40,7 +40,7 @@ distribution: the location seems to be well estimated, although the covariance
 is hard to estimate due to the banana-shaped distribution. Anyway, we can
 get rid of some outlying observations.
 The One-Class SVM is able to capture the real data structure, but the
-difficulty is to adjust its kernel bandwith parameter so as to obtain
+difficulty is to adjust its kernel bandwidth parameter so as to obtain
 a good compromise between the shape of the data scatter matrix and the
 risk of over-fitting the data.
 
diff --git a/examples/cluster/plot_digits_agglomeration.py b/examples/cluster/plot_digits_agglomeration.py
index 6d6f3cb11b..fd92f27d86 100644
--- a/examples/cluster/plot_digits_agglomeration.py
+++ b/examples/cluster/plot_digits_agglomeration.py
@@ -6,7 +6,7 @@
 Feature agglomeration
 =========================================================
 
-These images how similiar features are merged together using
+These images show how similar features are merged together using
 feature agglomeration.
 """
 print(__doc__)
diff --git a/examples/cluster/plot_kmeans_digits.py b/examples/cluster/plot_kmeans_digits.py
index 88a657cef6..509a0ed857 100644
--- a/examples/cluster/plot_kmeans_digits.py
+++ b/examples/cluster/plot_kmeans_digits.py
@@ -95,7 +95,7 @@ kmeans.fit(reduced_data)
 # Step size of the mesh. Decrease to increase the quality of the VQ.
 h = .02     # point in the mesh [x_min, m_max]x[y_min, y_max].
 
-# Plot the decision boundary. For that, we will asign a color to each
+# Plot the decision boundary. For that, we will assign a color to each
 x_min, x_max = reduced_data[:, 0].min() + 1, reduced_data[:, 0].max() - 1
 y_min, y_max = reduced_data[:, 1].min() + 1, reduced_data[:, 1].max() - 1
 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py
index 133359f095..c5eb01bbc9 100644
--- a/examples/cluster/plot_segmentation_toy.py
+++ b/examples/cluster/plot_segmentation_toy.py
@@ -65,7 +65,7 @@ img += 1 + 0.2 * np.random.randn(*img.shape)
 graph = image.img_to_graph(img, mask=mask)
 
 # Take a decreasing function of the gradient: we take it weakly
-# dependant from the gradient the segmentation is close to a voronoi
+# dependent on the gradient so the segmentation is close to a voronoi
 graph.data = np.exp(-graph.data / graph.data.std())
 
 # Force the solver to be arpack, since amg is numerically
diff --git a/examples/covariance/plot_mahalanobis_distances.py b/examples/covariance/plot_mahalanobis_distances.py
index 83f9434a3b..25b43ede41 100644
--- a/examples/covariance/plot_mahalanobis_distances.py
+++ b/examples/covariance/plot_mahalanobis_distances.py
@@ -3,7 +3,7 @@
 Robust covariance estimation and Mahalanobis distances relevance
 ================================================================
 
-For Gaussian ditributed data, the distance of an observation
+For Gaussian distributed data, the distance of an observation
 :math:`x_i` to the mode of the distribution can be computed using its
 Mahalanobis distance: :math:`d_{(\mu,\Sigma)}(x_i)^2 = (x_i -
 \mu)'\Sigma^{-1}(x_i - \mu)` where :math:`\mu` and :math:`\Sigma` are
@@ -33,7 +33,7 @@ by P.J.Rousseuw in [1].
 
 This example illustrates how the Mahalanobis distances are affected by
 outlying data: observations drawn from a contaminating distribution
-are not distinguishable from the observations comming from the real,
+are not distinguishable from the observations coming from the real,
 Gaussian distribution that one may want to work with. Using MCD-based
 Mahalanobis distances, the two populations become
 distinguishable. Associated applications are outliers detection,
diff --git a/examples/decomposition/plot_sparse_coding.py b/examples/decomposition/plot_sparse_coding.py
index 42dc1a73d4..8a55246751 100644
--- a/examples/decomposition/plot_sparse_coding.py
+++ b/examples/decomposition/plot_sparse_coding.py
@@ -6,7 +6,7 @@ Sparse coding with a precomputed dictionary
 Transform a signal as a sparse combination of Ricker wavelets. This example
 visually compares different sparse coding methods using the
 :class:`sklearn.decomposition.SparseCoder` estimator. The Ricker (also known
-as mexican hat or the second derivative of a gaussian) is not a particularily
+as mexican hat or the second derivative of a gaussian) is not a particularly
 good kernel to represent piecewise constant signals like this one. It can
 therefore be seen how much adding different widths of atoms matters and it
 therefore motivates learning the dictionary to best fit your type of signals.
diff --git a/examples/document_clustering.py b/examples/document_clustering.py
index b411ad5cb4..47b63a41a3 100644
--- a/examples/document_clustering.py
+++ b/examples/document_clustering.py
@@ -33,7 +33,7 @@ quality of the clustering by quite a lot as measured against the "ground truth"
 provided by the class label assignments of the 20 newsgroups dataset.
 
 This improvement is not visible in the Silhouette Coefficient which is small
-for both as this measure seem to suffer from the phenomenom called
+for both as this measure seems to suffer from the phenomenon called
 "Concentration of Measure" or "Curse of Dimensionality" for high dimensional
 datasets such as text data. Other measures such as V-measure and Adjusted Rand
 Index are information theoretic based evaluation scores: as they are only based
diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py
index bd6c3cc7da..ea27d316a0 100644
--- a/examples/ensemble/plot_forest_importances.py
+++ b/examples/ensemble/plot_forest_importances.py
@@ -4,7 +4,7 @@ Feature importances with forests of trees
 =========================================
 
 This examples shows the use of forests of trees to evaluate the importance of
-features on an artifical classification task. The red bars are the feature
+features on an artificial classification task. The red bars are the feature
 importances of the forest, along with their inter-trees variability.
 
 As expected, the plot suggests that 3 features are informative, while the
diff --git a/examples/ensemble/plot_random_forest_embedding.py b/examples/ensemble/plot_random_forest_embedding.py
index b7f37028bb..5db2d6a745 100644
--- a/examples/ensemble/plot_random_forest_embedding.py
+++ b/examples/ensemble/plot_random_forest_embedding.py
@@ -69,7 +69,7 @@ ax.set_title("PCA reduction (2d) of transformed data (%dd)" %
 ax.set_xticks(())
 ax.set_yticks(())
 
-# Plot the decision in original space. For that, we will asign a color to each
+# Plot the decision in original space. For that, we will assign a color to each
 # point in the mesh [x_min, m_max] x [y_min, y_max].
 h = .01
 x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
diff --git a/examples/linear_model/plot_ard.py b/examples/linear_model/plot_ard.py
index fb7354e5e5..dd62b70e7f 100644
--- a/examples/linear_model/plot_ard.py
+++ b/examples/linear_model/plot_ard.py
@@ -6,7 +6,7 @@ Automatic Relevance Determination Regression (ARD)
 Fit regression model with :ref:`bayesian_ridge_regression`.
 
 Compared to the OLS (ordinary least squares) estimator, the coefficient
-weights are slightly shifted toward zeros, wich stabilises them.
+weights are slightly shifted toward zeros, which stabilises them.
 
 The histogram of the estimated weights is very peaked, as a sparsity-inducing
 prior is implied on the weights.
diff --git a/examples/linear_model/plot_bayesian_ridge.py b/examples/linear_model/plot_bayesian_ridge.py
index 33971df249..59a0269654 100644
--- a/examples/linear_model/plot_bayesian_ridge.py
+++ b/examples/linear_model/plot_bayesian_ridge.py
@@ -6,7 +6,7 @@ Bayesian Ridge Regression
 Computes a :ref:`bayesian_ridge_regression` on a synthetic dataset.
 
 Compared to the OLS (ordinary least squares) estimator, the coefficient
-weights are slightly shifted toward zeros, wich stabilises them.
+weights are slightly shifted toward zeros, which stabilises them.
 
 As the prior on the weights is a Gaussian prior, the histogram of the
 estimated weights is Gaussian.
diff --git a/examples/linear_model/plot_iris_logistic.py b/examples/linear_model/plot_iris_logistic.py
index e7c7e0411c..9f6a87319d 100644
--- a/examples/linear_model/plot_iris_logistic.py
+++ b/examples/linear_model/plot_iris_logistic.py
@@ -33,7 +33,7 @@ logreg = linear_model.LogisticRegression(C=1e5)
 # we create an instance of Neighbours Classifier and fit the data.
 logreg.fit(X, Y)
 
-# Plot the decision boundary. For that, we will asign a color to each
+# Plot the decision boundary. For that, we will assign a color to each
 # point in the mesh [x_min, m_max]x[y_min, y_max].
 x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
diff --git a/examples/linear_model/plot_multi_task_lasso_support.py b/examples/linear_model/plot_multi_task_lasso_support.py
index 881ecdeec8..a5ad0b0aae 100644
--- a/examples/linear_model/plot_multi_task_lasso_support.py
+++ b/examples/linear_model/plot_multi_task_lasso_support.py
@@ -5,7 +5,7 @@ Joint feature selection with multi-task Lasso
 =============================================
 
 The multi-task lasso allows to fit multiple regression problems
-jointly enforcing the selected features to be the same accross
+jointly enforcing the selected features to be the same across
 tasks. This example simulates sequential measurements, each task
 is a time instant, and the relevant features vary in amplitude
 over time while being the same. The multi-task lasso imposes that
diff --git a/examples/linear_model/plot_sgd_iris.py b/examples/linear_model/plot_sgd_iris.py
index 4bb9930df4..ee63c99a12 100644
--- a/examples/linear_model/plot_sgd_iris.py
+++ b/examples/linear_model/plot_sgd_iris.py
@@ -44,7 +44,7 @@ y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
 xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                      np.arange(y_min, y_max, h))
 
-# Plot the decision boundary. For that, we will asign a color to each
+# Plot the decision boundary. For that, we will assign a color to each
 # point in the mesh [x_min, m_max]x[y_min, y_max].
 Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
 # Put the result into a color plot
diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py
index 6f9b4d8edd..af2abe6bfb 100644
--- a/examples/manifold/plot_compare_methods.py
+++ b/examples/manifold/plot_compare_methods.py
@@ -9,7 +9,7 @@ with various manifold learning methods.
 For a discussion and comparison of these algorithms, see the
 :ref:`manifold module page <manifold>`
 
-For a similiar example, where the methods are applied to a
+For a similar example, where the methods are applied to a
 sphere dataset, see :ref:`example_manifold_plot_manifold_sphere.py`
 
 Note that the purpose of the MDS is to find a low-dimensional
diff --git a/examples/manifold/plot_lle_digits.py b/examples/manifold/plot_lle_digits.py
index caa65d8e98..5c78579724 100644
--- a/examples/manifold/plot_lle_digits.py
+++ b/examples/manifold/plot_lle_digits.py
@@ -7,7 +7,7 @@ An illustration of various embeddings on the digits dataset.
 
 The RandomTreesEmbedding, from the :mod:`sklearn.ensemble` module, is not
 technically a manifold embedding method, as it learn a high-dimensional
-representation on wich we apply a dimensionality reduction method.
+representation on which we apply a dimensionality reduction method.
 However, it is often useful to cast a dataset into a representation in
 which the classes are linearly-seperable.
 """
diff --git a/examples/manifold/plot_manifold_sphere.py b/examples/manifold/plot_manifold_sphere.py
index f0a1dda483..d853ff468e 100644
--- a/examples/manifold/plot_manifold_sphere.py
+++ b/examples/manifold/plot_manifold_sphere.py
@@ -14,7 +14,7 @@ the poles are cut from the sphere, as well as a thin slice down its
 side. This enables the manifold learning techniques to
 'spread it open' whilst projecting it onto two dimensions.
 
-For a similiar example, where the methods are applied to the
+For a similar example, where the methods are applied to the
 S-curve dataset, see :ref:`example_manifold_plot_compare_methods.py`
 
 Note that the purpose of the :ref:`MDS <multidimensional_scaling>` is
diff --git a/examples/neighbors/plot_classification.py b/examples/neighbors/plot_classification.py
index 280b5ba1a3..e562753c62 100644
--- a/examples/neighbors/plot_classification.py
+++ b/examples/neighbors/plot_classification.py
@@ -32,7 +32,7 @@ for weights in ['uniform', 'distance']:
     clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
     clf.fit(X, y)
 
-    # Plot the decision boundary. For that, we will asign a color to each
+    # Plot the decision boundary. For that, we will assign a color to each
     # point in the mesh [x_min, m_max]x[y_min, y_max].
     x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
     y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
diff --git a/examples/neighbors/plot_nearest_centroid.py b/examples/neighbors/plot_nearest_centroid.py
index 0c6343b335..3064aa8b06 100644
--- a/examples/neighbors/plot_nearest_centroid.py
+++ b/examples/neighbors/plot_nearest_centroid.py
@@ -34,7 +34,7 @@ for shrinkage in [None, 0.1]:
     clf.fit(X, y)
     y_pred = clf.predict(X)
     print(shrinkage, np.mean(y == y_pred))
-    # Plot the decision boundary. For that, we will asign a color to each
+    # Plot the decision boundary. For that, we will assign a color to each
     # point in the mesh [x_min, m_max]x[y_min, y_max].
     x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
     y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
diff --git a/examples/plot_classifier_comparison.py b/examples/plot_classifier_comparison.py
index a322fb2bff..e2a530b828 100644
--- a/examples/plot_classifier_comparison.py
+++ b/examples/plot_classifier_comparison.py
@@ -101,7 +101,7 @@ for ds in datasets:
         clf.fit(X_train, y_train)
         score = clf.score(X_test, y_test)
 
-        # Plot the decision boundary. For that, we will asign a color to each
+        # Plot the decision boundary. For that, we will assign a color to each
         # point in the mesh [x_min, m_max]x[y_min, y_max].
         if hasattr(clf, "decision_function"):
             Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
diff --git a/examples/plot_kernel_approximation.py b/examples/plot_kernel_approximation.py
index 6ce66fece2..e427feb9b1 100644
--- a/examples/plot_kernel_approximation.py
+++ b/examples/plot_kernel_approximation.py
@@ -189,7 +189,7 @@ pl.figure(figsize=(12, 5))
 # predict and plot
 for i, clf in enumerate((kernel_svm, nystroem_approx_svm,
                          fourier_approx_svm)):
-    # Plot the decision boundary. For that, we will asign a color to each
+    # Plot the decision boundary. For that, we will assign a color to each
     # point in the mesh [x_min, m_max]x[y_min, y_max].
     pl.subplot(1, 3, i + 1)
     Z = clf.predict(flat_grid)
diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
index 081eaf45dc..01a2b127e4 100644
--- a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
+++ b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
@@ -72,7 +72,7 @@ for i in range(5):
     # select five digit examples that the classifier is most uncertain about
     uncertainty_index = uncertainty_index = np.argsort(pred_entropies)[-5:]
 
-    # keep track of indicies that we get labels for
+    # keep track of indices that we get labels for
     delete_indices = np.array([])
 
     f.text(.05, (1 - (i + 1) * .183),
diff --git a/examples/semi_supervised/plot_label_propagation_versus_svm_iris.py b/examples/semi_supervised/plot_label_propagation_versus_svm_iris.py
index 4bcbe828f2..45d6162fae 100644
--- a/examples/semi_supervised/plot_label_propagation_versus_svm_iris.py
+++ b/examples/semi_supervised/plot_label_propagation_versus_svm_iris.py
@@ -59,7 +59,7 @@ titles = ['Label Spreading 30% data',
 color_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)}
 
 for i, (clf, y_train) in enumerate((ls30, ls50, ls100, rbf_svc)):
-    # Plot the decision boundary. For that, we will asign a color to each
+    # Plot the decision boundary. For that, we will assign a color to each
     # point in the mesh [x_min, m_max]x[y_min, y_max].
     pl.subplot(2, 2, i + 1)
     Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
diff --git a/examples/svm/plot_custom_kernel.py b/examples/svm/plot_custom_kernel.py
index 401700fc85..bdc19d2338 100644
--- a/examples/svm/plot_custom_kernel.py
+++ b/examples/svm/plot_custom_kernel.py
@@ -38,7 +38,7 @@ h = .02  # step size in the mesh
 clf = svm.SVC(kernel=my_kernel)
 clf.fit(X, Y)
 
-# Plot the decision boundary. For that, we will asign a color to each
+# Plot the decision boundary. For that, we will assign a color to each
 # point in the mesh [x_min, m_max]x[y_min, y_max].
 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
diff --git a/examples/svm/plot_iris.py b/examples/svm/plot_iris.py
index 0340ac5ff2..1c33ae2810 100644
--- a/examples/svm/plot_iris.py
+++ b/examples/svm/plot_iris.py
@@ -43,7 +43,7 @@ titles = ['SVC with linear kernel',
 
 
 for i, clf in enumerate((svc, rbf_svc, poly_svc, lin_svc)):
-    # Plot the decision boundary. For that, we will asign a color to each
+    # Plot the decision boundary. For that, we will assign a color to each
     # point in the mesh [x_min, m_max]x[y_min, y_max].
     pl.subplot(2, 2, i + 1)
     Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
diff --git a/examples/svm/plot_svm_iris.py b/examples/svm/plot_svm_iris.py
index 39d4242acb..3122169183 100644
--- a/examples/svm/plot_svm_iris.py
+++ b/examples/svm/plot_svm_iris.py
@@ -35,7 +35,7 @@ clf = svm.SVC(C=1.0, kernel='linear')
 # we create an instance of SVM Classifier and fit the data.
 clf.fit(X, Y)
 
-# Plot the decision boundary. For that, we will asign a color to each
+# Plot the decision boundary. For that, we will assign a color to each
 # point in the mesh [x_min, m_max]x[y_min, y_max].
 x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
diff --git a/examples/svm/plot_svm_margin.py b/examples/svm/plot_svm_margin.py
index b02bb13207..88748a3446 100644
--- a/examples/svm/plot_svm_margin.py
+++ b/examples/svm/plot_svm_margin.py
@@ -6,10 +6,10 @@
 SVM Margins Example
 =========================================================
 The plots below illustrate the effect the parameter `C` has
-on the seperation line. A large value of `C` basically tells
+on the separation line. A large value of `C` basically tells
 our model that we do not have that much faith in our data's
-distrubution, and will only consider points close to line
-of seperation.
+distribution, and will only consider points close to the line
+of separation.
 
 A small value of `C` includes more/all the observations, allowing
 the margins to be calculated using all the data in the area.
diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 13fe094e10..41934d7672 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -29,9 +29,9 @@ increase.
 When using, for example, :ref:`cross validation <cross_validation>`, to
 set the amount of regularization with `C`, there will be a
 different amount of samples between the main problem and the smaller problems
-withing the folds of the cross validation.
+within the folds of the cross validation.
 
-Since our loss function is dependant on the amount of samples, the latter
+Since our loss function is dependent on the amount of samples, the latter
 will influence the selected value of `C`.
 The question that arises is `How do we optimally adjust C to
 account for the different amount of training samples?`
@@ -72,7 +72,7 @@ is not scaled.
 
 .. topic:: Note:
 
-    Two seperate datasets are used for the two different plots. The reason
+    Two separate datasets are used for the two different plots. The reason
     behind this is the `L1` case works better on sparse data, while `L2`
     is better suited to the non-sparse case.
 """
diff --git a/examples/tree/plot_tree_regression.py b/examples/tree/plot_tree_regression.py
index 62e22796af..349db25535 100644
--- a/examples/tree/plot_tree_regression.py
+++ b/examples/tree/plot_tree_regression.py
@@ -7,7 +7,7 @@ Decision Tree Regression
 used to fit a sine curve with addition noisy observation. As a result, it
 learns local linear regressions approximating the sine curve.
 
-We can see that if the maximum depth of the tree (controled by the
+We can see that if the maximum depth of the tree (controlled by the
 `max_depth` parameter) is set too high, the decision trees learn too fine
 details of the training data and learn from the noise, i.e. they overfit.
 """
diff --git a/sklearn/_hmmc.c b/sklearn/_hmmc.c
index b908b39155..1b10a0f18a 100644
--- a/sklearn/_hmmc.c
+++ b/sklearn/_hmmc.c
@@ -3158,7 +3158,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/_isotonic.c b/sklearn/_isotonic.c
index 31cd4ece18..ec524c8cf6 100644
--- a/sklearn/_isotonic.c
+++ b/sklearn/_isotonic.c
@@ -1955,7 +1955,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/cluster/_hierarchical.c b/sklearn/cluster/_hierarchical.c
index 3181c24ca1..f62b8ad1c1 100644
--- a/sklearn/cluster/_hierarchical.c
+++ b/sklearn/cluster/_hierarchical.c
@@ -2462,7 +2462,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/cluster/_k_means.c b/sklearn/cluster/_k_means.c
index 99fb52ac7d..4a57af9196 100644
--- a/sklearn/cluster/_k_means.c
+++ b/sklearn/cluster/_k_means.c
@@ -5154,7 +5154,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py
index ca2bdd0503..191de5c35a 100644
--- a/sklearn/cluster/hierarchical.py
+++ b/sklearn/cluster/hierarchical.py
@@ -1,6 +1,6 @@
 """Hierarchical Agglomerative Clustering
 
-These routines perform some hierachical agglomerative clustering of some
+These routines perform some hierarchical agglomerative clustering of some
 input data. Currently, only Ward's algorithm is implemented.
 
 Authors : Vincent Michel, Bertrand Thirion, Alexandre Gramfort,
diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
index 3fdc9c5e53..25002b186d 100644
--- a/sklearn/cluster/k_means_.py
+++ b/sklearn/cluster/k_means_.py
@@ -546,7 +546,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
         centers = centers.toarray()
 
     if len(centers) != k:
-        raise ValueError('The shape of the inital centers (%s) '
+        raise ValueError('The shape of the initial centers (%s) '
                          'does not match the number of clusters %i'
                          % (centers.shape, k))
 
@@ -948,7 +948,7 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
         print(progress_msg)
 
     # Early stopping based on absolute tolerance on squared change of
-    # centers postion (using EWA smoothing)
+    # centers position (using EWA smoothing)
     if tol > 0.0 and ewa_diff < tol:
         if verbose:
             print('Converged (small centers change) at iteration %d/%d'
@@ -972,7 +972,7 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
                   % (iteration_idx + 1, n_iter))
         return True
 
-    # update the convergence context to maintain state across sucessive calls:
+    # update the convergence context to maintain state across successive calls:
     context['ewa_diff'] = ewa_diff
     context['ewa_inertia'] = ewa_inertia
     context['ewa_inertia_min'] = ewa_inertia_min
diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py
index aa82346465..077ecc34e0 100644
--- a/sklearn/cluster/mean_shift_.py
+++ b/sklearn/cluster/mean_shift_.py
@@ -15,7 +15,7 @@ from ..neighbors import NearestNeighbors
 
 
 def estimate_bandwidth(X, quantile=0.3, n_samples=None, random_state=0):
-    """Estimate the bandwith to use with MeanShift algorithm
+    """Estimate the bandwidth to use with MeanShift algorithm
 
     Parameters
     ----------
@@ -206,7 +206,7 @@ class MeanShift(BaseEstimator, ClusterMixin):
     Parameters
     ----------
     bandwidth : float, optional
-        Bandwith used in the RBF kernel
+        Bandwidth used in the RBF kernel
         If not set, the bandwidth is estimated.
         See clustering.estimate_bandwidth.
 
diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py
index 9901b3cc5d..d498447911 100644
--- a/sklearn/covariance/graph_lasso_.py
+++ b/sklearn/covariance/graph_lasso_.py
@@ -486,7 +486,7 @@ class GraphLassoCV(GraphLasso):
 
             # Refine our grid
             if best_index == 0:
-                # We do not need to go back: we have choosen
+                # We do not need to go back: we have chosen
                 # the highest value of alpha for which there are
                 # non-zero coefficients
                 alpha_1 = path[0][0]
diff --git a/sklearn/covariance/shrunk_covariance_.py b/sklearn/covariance/shrunk_covariance_.py
index 23ce1e1b66..59f208eb91 100644
--- a/sklearn/covariance/shrunk_covariance_.py
+++ b/sklearn/covariance/shrunk_covariance_.py
@@ -244,7 +244,7 @@ def ledoit_wolf(X, assume_centered=False, block_size=1000):
 
     assume_centered: Boolean
       If True, data are not centered before computation.
-      Usefull to work with data whose mean is significantly equal to
+      Useful to work with data whose mean is significantly equal to
       zero but is not exactly zero.
       If False, data are centered before computation.
 
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 9a44c9f36b..7338699fd1 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -1150,7 +1150,7 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
 
 
 def _permutation_test_score(estimator, X, y, cv, scorer):
-    """Auxilary function for permutation_test_score"""
+    """Auxiliary function for permutation_test_score"""
     avg_score = []
     for train, test in cv:
         estimator.fit(X[train], y[train])
diff --git a/sklearn/datasets/DATASET_PROPOSAL.txt b/sklearn/datasets/DATASET_PROPOSAL.txt
index 7389c4d446..0fefb901fc 100644
--- a/sklearn/datasets/DATASET_PROPOSAL.txt
+++ b/sklearn/datasets/DATASET_PROPOSAL.txt
@@ -108,7 +108,7 @@ I see mainly two big problems:
           we want to avoid loading all the data in memory ? Can we use memory
           mapped arrays ?
         - Missing data: I thought about subclassing both record arrays and
-          masked arrays classes, but I don't know if this is feasable, or even
+          masked arrays classes, but I don't know if this is feasible, or even
           makes sense. I have the feeling that some Data mining software use
           Nan (for example, weka seems to use float internally), but this
           prevents them from representing integer data.
diff --git a/sklearn/datasets/_svmlight_format.c b/sklearn/datasets/_svmlight_format.c
index 94f28c12ef..b05d9bcb47 100644
--- a/sklearn/datasets/_svmlight_format.c
+++ b/sklearn/datasets/_svmlight_format.c
@@ -2416,7 +2416,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/datasets/lfw.py b/sklearn/datasets/lfw.py
index 88ec446b4a..0877018303 100644
--- a/sklearn/datasets/lfw.py
+++ b/sklearn/datasets/lfw.py
@@ -11,7 +11,7 @@ must predict whether the two images are from the same person.
 
 An alternative task, Face Recognition or Face Identification is:
 given the picture of the face of an unknown person, identify the name
-of the person by refering to a gallery of previously seen pictures of
+of the person by referring to a gallery of previously seen pictures of
 identified persons.
 
 Both Face Verification and Face Recognition are tasks that are typically
diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
index ffa513e122..541c81ee63 100644
--- a/sklearn/datasets/species_distributions.py
+++ b/sklearn/datasets/species_distributions.py
@@ -85,8 +85,8 @@ def _load_coverage(F, header_length=6,
 def _load_csv(F):
     """Load csv file.
 
-    Paramters
-    ---------
+    Parameters
+    ----------
     F : string or file object
         file object or name of file
 
diff --git a/sklearn/datasets/tests/test_lfw.py b/sklearn/datasets/tests/test_lfw.py
index 5e0b83db8e..54c4e17ded 100644
--- a/sklearn/datasets/tests/test_lfw.py
+++ b/sklearn/datasets/tests/test_lfw.py
@@ -1,6 +1,6 @@
 """This test for the LFW require medium-size data dowloading and processing
 
-If the data has not been already downloaded by runnning the examples,
+If the data has not been already downloaded by running the examples,
 the tests won't run (skipped).
 
 If the test are run, the first execution will be long (typically a bit
diff --git a/sklearn/ensemble/_gradient_boosting.c b/sklearn/ensemble/_gradient_boosting.c
index c66fb61675..2499f53a98 100644
--- a/sklearn/ensemble/_gradient_boosting.c
+++ b/sklearn/ensemble/_gradient_boosting.c
@@ -3553,7 +3553,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
index 7b6ff71545..9f5b125b2c 100644
--- a/sklearn/ensemble/forest.py
+++ b/sklearn/ensemble/forest.py
@@ -24,7 +24,7 @@ The module structure is the following:
 
 - The ``ExtraTreesClassifier`` and ``ExtraTreesRegressor`` derived
   classes provide the user with concrete implementations of the
-  forest ensemble method using the extremly randomized trees
+  forest ensemble method using the extremely randomized trees
   ``ExtraTreeClassifier`` and ``ExtraTreeRegressor`` as
   sub-estimator implementations.
 
diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
index 62be7e2089..aa83c76c7d 100644
--- a/sklearn/ensemble/gradient_boosting.py
+++ b/sklearn/ensemble/gradient_boosting.py
@@ -943,7 +943,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
     loss : {'ls', 'lad', 'huber', 'quantile'}, optional (default='ls')
         loss function to be optimized. 'ls' refers to least squares
         regression. 'lad' (least absolute deviation) is a highly robust
-        loss function soley based on order information of the input
+        loss function solely based on order information of the input
         variables. 'huber' is a combination of the two. 'quantile'
         allows quantile regression (use `alpha` to specify the quantile).
 
diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py
index 3d4e5abac8..02a3cb913b 100644
--- a/sklearn/ensemble/partial_dependence.py
+++ b/sklearn/ensemble/partial_dependence.py
@@ -173,7 +173,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None,
                             contour_kw=None, **fig_kw):
     """Partial dependence plots for ``features``.
 
-    The ``len(features)`` plots are aranged in a grid with ``n_cols``
+    The ``len(features)`` plots are arranged in a grid with ``n_cols``
     columns. Two-way partial dependence plots are plotted as contour
     plots.
 
diff --git a/sklearn/externals/joblib/func_inspect.py b/sklearn/externals/joblib/func_inspect.py
index bce7ad8d2e..b85bf56af1 100644
--- a/sklearn/externals/joblib/func_inspect.py
+++ b/sklearn/externals/joblib/func_inspect.py
@@ -58,7 +58,7 @@ def get_func_code(func):
         else:
             # Weird objects like numpy ufunc don't have __code__
             # This is fragile, as quite often the id of the object is
-            # in the repr, so it might not persist accross sessions,
+            # in the repr, so it might not persist across sessions,
             # however it will work for ufuncs.
             return repr(func), source_file, -1
 
diff --git a/sklearn/externals/joblib/memory.py b/sklearn/externals/joblib/memory.py
index 815575b8aa..85512e25c1 100644
--- a/sklearn/externals/joblib/memory.py
+++ b/sklearn/externals/joblib/memory.py
@@ -92,7 +92,7 @@ class MemorizedFunc(Logger):
             arrays cannot be read by memmapping.
         verbose: int, optional
             The verbosity flag, controls messages that are issued as
-            the function is revaluated.
+            the function is reevaluated.
     """
     #-------------------------------------------------------------------------
     # Public interface
@@ -115,7 +115,7 @@ class MemorizedFunc(Logger):
                 arguments.
             verbose: int, optional
                 Verbosity flag, controls the debug messages that are issued
-                as functions are revaluated. The higher, the more verbose
+                as functions are reevaluated. The higher, the more verbose
             timestamp: float, optional
                 The reference time from which times in tracing messages
                 are reported.
@@ -246,8 +246,8 @@ class MemorizedFunc(Logger):
         if old_func_code == func_code:
             return True
 
-        # We have differing code, is this because we are refering to
-        # differing functions, or because the function we are refering as
+        # We have differing code: is this because we are referring to
+        # differing functions, or because the function we are referring to has
         # changed?
 
         if old_first_line == first_line == -1:
@@ -464,9 +464,9 @@ class Memory(Logger):
                 compressed arrays cannot be read by memmapping.
             verbose: int, optional
                 Verbosity flag, controls the debug messages that are issued
-                as functions are revaluated.
+                as functions are reevaluated.
         """
-        # XXX: Bad explaination of the None value of cachedir
+        # XXX: Bad explanation of the None value of cachedir
         Logger.__init__(self)
         self._verbose = verbose
         self.mmap_mode = mmap_mode
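
A small usage sketch for the verbose flag documented above (the cache
directory path is an arbitrary assumption):

    from sklearn.externals.joblib import Memory

    mem = Memory(cachedir='/tmp/joblib_cache', verbose=1)

    @mem.cache
    def square(x):
        return x ** 2

    square(3)  # computed and cached; a message is printed at verbose >= 1
    square(3)  # the second call is served from the cache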
diff --git a/sklearn/externals/joblib/numpy_pickle.py b/sklearn/externals/joblib/numpy_pickle.py
index 5502665bfd..2fc4b49919 100644
--- a/sklearn/externals/joblib/numpy_pickle.py
+++ b/sklearn/externals/joblib/numpy_pickle.py
@@ -132,7 +132,7 @@ class NDArrayWrapper(object):
 class ZNDArrayWrapper(NDArrayWrapper):
     """An object to be persisted instead of numpy arrays.
 
-    This object store the Zfile filename in wich
+    This object stores the Zfile filename in which
     the data array has been persisted, and the meta information to
     retrieve it.
 
diff --git a/sklearn/externals/joblib/parallel.py b/sklearn/externals/joblib/parallel.py
index 405e956623..c81b0e6d87 100644
--- a/sklearn/externals/joblib/parallel.py
+++ b/sklearn/externals/joblib/parallel.py
@@ -58,7 +58,7 @@ def cpu_count():
 # For verbosity
 
 def _verbosity_filter(index, verbose):
-    """ Returns False for indices increasingly appart, the distance
+    """ Returns False for indices increasingly apart, the distance
         depending on the value of verbose.
 
         We use a lag increasing as the square of index
@@ -367,7 +367,7 @@ class Parallel(Logger):
 
     def print_progress(self, index):
         """Display the process of the parallel execution only a fraction
-           of time, controled by self.verbose.
+           of time, controlled by self.verbose.
         """
         if not self.verbose:
             return
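
The filtering behaviour can be observed with a sketch like the following,
where the workload is a toy assumption; only a thinning subset of the 100
tasks is reported at this verbosity:

    from math import sqrt
    from sklearn.externals.joblib import Parallel, delayed

    # Progress lines appear for increasingly spaced task indices,
    # the spacing growing with the index and shrinking with verbose.
    results = Parallel(n_jobs=2, verbose=5)(
        delayed(sqrt)(i ** 2) for i in range(100))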
diff --git a/sklearn/externals/joblib/test/test_hashing.py b/sklearn/externals/joblib/test/test_hashing.py
index 79c7dc21bc..bdb0a44628 100644
--- a/sklearn/externals/joblib/test/test_hashing.py
+++ b/sklearn/externals/joblib/test/test_hashing.py
@@ -238,8 +238,8 @@ def test_numpy_scalar():
 
 
 def test_dict_hash():
-    # Check that dictionaries hash consistently, eventhough the ordering
-    # of the keys is not garanteed
+    # Check that dictionaries hash consistently, even though the ordering
+    # of the keys is not guaranteed
     k = KlassWithCachedMethod()
 
     d = {'#s12069__c_maps.nii.gz': [33],
@@ -264,8 +264,8 @@ def test_dict_hash():
 
 
 def test_set_hash():
-    # Check that sets hash consistently, eventhough their ordering
-    # is not garanteed
+    # Check that sets hash consistently, even though their ordering
+    # is not guaranteed
     k = KlassWithCachedMethod()
 
     s = set(['#s12069__c_maps.nii.gz',
diff --git a/sklearn/externals/joblib/test/test_memory.py b/sklearn/externals/joblib/test/test_memory.py
index a98dc8c469..d70cb9f4be 100644
--- a/sklearn/externals/joblib/test/test_memory.py
+++ b/sklearn/externals/joblib/test/test_memory.py
@@ -307,7 +307,7 @@ def test_memory_eval():
 def count_and_append(x=[]):
     """ A function with a side effect in its arguments.
 
-        Return the lenght of its argument and append one element.
+        Return the length of its argument and append one element.
     """
     len_x = len(x)
     x.append(None)
diff --git a/sklearn/feature_extraction/_hashing.c b/sklearn/feature_extraction/_hashing.c
index 2d779f1a28..8e86ec3da1 100644
--- a/sklearn/feature_extraction/_hashing.c
+++ b/sklearn/feature_extraction/_hashing.c
@@ -1995,7 +1995,7 @@ static CYTHON_UNUSED int __pyx_pw_7cpython_5array_5array_1__getbuffer__(PyObject
  * 
  *         def __getbuffer__(array self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_7cpython_5array_5array___getbuffer__(arrayobject *__pyx_v_self, Py_buffer *__pyx_v_info, CYTHON_UNUSED int __pyx_v_flags) {
@@ -2331,7 +2331,7 @@ static CYTHON_INLINE arrayobject *__pyx_f_7cpython_5array_copy(arrayobject *__py
  *     return op
  * 
  * cdef inline int extend_buffer(array self, char* stuff, Py_ssize_t n):             # <<<<<<<<<<<<<<
- *     """ efficent appending of new stuff of same type
+ *     """ efficient appending of new stuff of same type
  *     (e.g. of same array type)
  */
 
@@ -2509,7 +2509,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index d4c6a1ab32..564e787955 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -339,7 +339,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
         or more letter characters (punctuation is completely ignored
         and always treated as a token separator).
 
-    n_features : interger, optional, (2 ** 20) by default
+    n_features : integer, optional, (2 ** 20) by default
         The number of features (columns) in the output matrices. Small numbers
         of features are likely to cause hash collisions, but large numbers
         will cause larger coefficient dimensions in linear learners.
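
To illustrate the trade-off above (the toy corpus is an assumption):

    from sklearn.feature_extraction.text import HashingVectorizer

    corpus = ['the quick brown fox', 'jumped over the lazy dog']

    # A small feature space keeps linear models compact but raises the
    # risk of hash collisions; the default is 2 ** 20 columns.
    X = HashingVectorizer(n_features=2 ** 10).transform(corpus)
    print(X.shape)  # (2, 1024), a scipy sparse matrix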
diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py
index 5a9476cd60..f5c17569fd 100644
--- a/sklearn/gaussian_process/gaussian_process.py
+++ b/sklearn/gaussian_process/gaussian_process.py
@@ -458,7 +458,7 @@ class GaussianProcess(BaseEstimator, RegressorMixin):
                     # Light storage mode (need to recompute C, F, Ft and G)
                     if self.verbose:
                         print("This GaussianProcess used 'light' storage mode "
-                              "at instanciation. Need to recompute "
+                              "at instantiation. Need to recompute "
                               "autocorrelation matrix...")
                     reduced_likelihood_function_value, par = \
                         self.reduced_likelihood_function()
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index fa3450f03b..3f8bf404a3 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -556,7 +556,7 @@ class GridSearchCV(BaseSearchCV):
         explosion of memory consumption when more jobs get dispatched
         than CPUs can process. This parameter can be:
 
-            - None, in which case all the jobs are immediatly
+            - None, in which case all the jobs are immediately
               created and spawned. Use this for lightweight and
               fast-running jobs, to avoid delays due to on-demand
               spawning of the jobs
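
A hedged sketch of this setting, assuming the ``sklearn.grid_search``
module path current in this release; the estimator and parameter grid are
illustrative:

    from sklearn.datasets import load_iris
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC

    iris = load_iris()

    # Cap pre-dispatched jobs at twice the worker count to bound memory
    # use; pre_dispatch=None would create and spawn everything at once.
    search = GridSearchCV(SVC(), {'C': [1, 10, 100]},
                          n_jobs=2, pre_dispatch='2*n_jobs')
    search.fit(iris.data, iris.target)
    print(search.best_estimator_)  # estimator chosen by the search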
@@ -615,7 +615,7 @@ class GridSearchCV(BaseSearchCV):
             * ``cv_validation_scores``, the list of scores for each fold
 
     `best_estimator_` : estimator
-        Estimator that was choosen by the search, i.e. estimator
+        Estimator that was chosen by the search, i.e. the estimator
         which gave the highest score (or smallest loss if specified)
         on the left out data.
 
@@ -728,7 +728,7 @@ class RandomizedSearchCV(BaseSearchCV):
         explosion of memory consumption when more jobs get dispatched
         than CPUs can process. This parameter can be:
 
-            - None, in which case all the jobs are immediatly
+            - None, in which case all the jobs are immediately
               created and spawned. Use this for lightweight and
               fast-running jobs, to avoid delays due to on-demand
               spawning of the jobs
@@ -771,7 +771,7 @@ class RandomizedSearchCV(BaseSearchCV):
             * ``cv_validation_scores``, the list of scores for each fold
 
     `best_estimator_` : estimator
-        Estimator that was choosen by the search, i.e. estimator
+        Estimator that was chosen by the search, i.e. the estimator
         which gave the highest score (or smallest loss if specified)
         on the left out data.
 
diff --git a/sklearn/hmm.py b/sklearn/hmm.py
index c4539faed9..755705caa5 100644
--- a/sklearn/hmm.py
+++ b/sklearn/hmm.py
@@ -130,7 +130,7 @@ class _BaseHMM(BaseEstimator):
     # which depend on the specific emission distribution.
     #
     # Subclasses will probably also want to implement properties for
-    # the emission distribution parameters to expose them publically.
+    # the emission distribution parameters to expose them publicly.
 
     def __init__(self, n_components=1, startprob=None, transmat=None,
                  startprob_prior=None, transmat_prior=None,
@@ -999,7 +999,7 @@ class MultinomialHMM(_BaseHMM):
             return False
 
         if np.any(symbols < 0):
-            # input containes negative intiger
+            # input contains a negative integer
             return False
 
         symbols.sort()
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index d184616dda..49b678a456 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -316,7 +316,7 @@ class LinearRegression(LinearModel, RegressorMixin):
         Estimated coefficients for the linear regression problem.
         If multiple targets are passed during the fit (y 2D), this
         is a 2D array of shape (n_targets, n_features), while if only
-        one target is passed, this is a 1D array of lenght n_features.
+        one target is passed, this is a 1D array of length n_features.
 
     `intercept_` : array
         Independent term in the linear model.
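
The shape convention spelled out above can be checked directly (random
data, purely illustrative):

    import numpy as np
    from sklearn.linear_model import LinearRegression

    X = np.random.RandomState(0).rand(10, 3)

    # Single target: coef_ is a 1D array of length n_features.
    print(LinearRegression().fit(X, X[:, 0]).coef_.shape)   # (3,)

    # Two targets (y is 2D): coef_ becomes (n_targets, n_features).
    print(LinearRegression().fit(X, X[:, :2]).coef_.shape)  # (2, 3)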
diff --git a/sklearn/linear_model/cd_fast.c b/sklearn/linear_model/cd_fast.c
index ddbbeac449..991254ce44 100644
--- a/sklearn/linear_model/cd_fast.c
+++ b/sklearn/linear_model/cd_fast.c
@@ -7463,7 +7463,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 2645b1a72c..d6b15a879f 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -893,7 +893,7 @@ class LassoCV(LinearModelCV, RegressorMixin):
     Attributes
     ----------
     ``alpha_`` : float
-        The amount of penalization choosen by cross validation
+        The amount of penalization chosen by cross validation
 
     ``coef_`` : array, shape = (n_features,) | (n_targets, n_features)
         parameter vector (w in the cost function formula)
@@ -996,10 +996,10 @@ class ElasticNetCV(LinearModelCV, RegressorMixin):
     Attributes
     ----------
     ``alpha_`` : float
-        The amount of penalization choosen by cross validation
+        The amount of penalization chosen by cross validation
 
     ``l1_ratio_`` : float
-        The compromise between l1 and l2 penalization choosen by
+        The compromise between l1 and l2 penalization chosen by
         cross validation
 
     ``coef_`` : array, shape = (n_features,) | (n_targets, n_features)
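
A minimal sketch of reading these cross-validated attributes, on an
assumed synthetic problem:

    from sklearn.datasets import make_regression
    from sklearn.linear_model import ElasticNetCV, LassoCV

    X, y = make_regression(n_samples=100, n_features=20, random_state=0)

    lasso = LassoCV(cv=3).fit(X, y)
    print(lasso.alpha_)   # penalization chosen by cross validation

    enet = ElasticNetCV(cv=3).fit(X, y)
    print(enet.alpha_, enet.l1_ratio_)   # chosen penalty and l1/l2 mix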
diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py
index e103bf7d17..f94a11b7d0 100644
--- a/sklearn/linear_model/least_angle.py
+++ b/sklearn/linear_model/least_angle.py
@@ -264,7 +264,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500,
             # regressors. Time to bail out
             warnings.warn('Early stopping the lars path, as the residues '
                           'are small and the current value of alpha is no '
-                          'longer well controled. %i iterations, alpha=%.3e, '
+                          'longer well controlled. %i iterations, alpha=%.3e, '
                           'previous alpha=%.3e, with an active set of %i '
                           'regressors.'
                           % (n_iter, alpha, prev_alpha, n_active))
@@ -1052,7 +1052,7 @@ class LassoLarsCV(LarsCV):
     -----
 
     The object solves the same problem as the LassoCV object. However,
-    unlike the LassoCV, it find the relevent alphas values by itself.
+    unlike the LassoCV, it finds the relevant alpha values by itself.
     In general, because of this property, it will be more stable.
     However, it is more fragile to heavily multicollinear datasets.
 
diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py
index 119b5c8bc8..fbc7e3905c 100644
--- a/sklearn/linear_model/randomized_l1.py
+++ b/sklearn/linear_model/randomized_l1.py
@@ -242,7 +242,7 @@ class RandomizedLasso(BaseRandomizedLinearModel):
         explosion of memory consumption when more jobs get dispatched
         than CPUs can process. This parameter can be:
 
-            - None, in which case all the jobs are immediatly
+            - None, in which case all the jobs are immediately
               created and spawned. Use this for lightweight and
               fast-running jobs, to avoid delays due to on-demand
               spawning of the jobs
@@ -404,7 +404,7 @@ class RandomizedLogisticRegression(BaseRandomizedLinearModel):
         explosion of memory consumption when more jobs get dispatched
         than CPUs can process. This parameter can be:
 
-            - None, in which case all the jobs are immediatly
+            - None, in which case all the jobs are immediately
               created and spawned. Use this for lightweight and
               fast-running jobs, to avoid delays due to on-demand
               spawning of the jobs
diff --git a/sklearn/linear_model/sgd_fast.c b/sklearn/linear_model/sgd_fast.c
index f85b95921d..e902e15fb9 100644
--- a/sklearn/linear_model/sgd_fast.c
+++ b/sklearn/linear_model/sgd_fast.c
@@ -1639,7 +1639,7 @@ static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_s
 static char __pyx_k_1[] = "-- Epoch %d";
 static char __pyx_k_2[] = "Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, Avg. loss: %.6f";
 static char __pyx_k_3[] = "Total training time: %.2f seconds.";
-static char __pyx_k_4[] = "floating-point under-/overflow occured.";
+static char __pyx_k_4[] = "floating-point under-/overflow occurred.";
 static char __pyx_k_6[] = "ndarray is not C contiguous";
 static char __pyx_k_8[] = "ndarray is not Fortran contiguous";
 static char __pyx_k_10[] = "Non-native byte order not supported";
@@ -8293,7 +8293,7 @@ static PyObject *__pyx_pf_7sklearn_12linear_model_8sgd_fast_plain_sgd(CYTHON_UNU
  *         # floating-point under-/overflow check.
  *         if np.any(np.isinf(weights)) or np.any(np.isnan(weights)) \             # <<<<<<<<<<<<<<
  *            or np.isnan(intercept) or np.isinf(intercept):
- *             raise ValueError("floating-point under-/overflow occured.")
+ *             raise ValueError("floating-point under-/overflow occurred.")
  */
     __pyx_t_15 = __Pyx_GetName(__pyx_m, __pyx_n_s__np); if (unlikely(!__pyx_t_15)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 506; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
     __Pyx_GOTREF(__pyx_t_15);
@@ -8331,7 +8331,7 @@ static PyObject *__pyx_pf_7sklearn_12linear_model_8sgd_fast_plain_sgd(CYTHON_UNU
  *         # floating-point under-/overflow check.
  *         if np.any(np.isinf(weights)) or np.any(np.isnan(weights)) \
  *            or np.isnan(intercept) or np.isinf(intercept):             # <<<<<<<<<<<<<<
- *             raise ValueError("floating-point under-/overflow occured.")
+ *             raise ValueError("floating-point under-/overflow occurred.")
  * 
  */
       __pyx_t_3 = __Pyx_GetName(__pyx_m, __pyx_n_s__np); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 506; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -8345,7 +8345,7 @@ static PyObject *__pyx_pf_7sklearn_12linear_model_8sgd_fast_plain_sgd(CYTHON_UNU
  *         # floating-point under-/overflow check.
  *         if np.any(np.isinf(weights)) or np.any(np.isnan(weights)) \             # <<<<<<<<<<<<<<
  *            or np.isnan(intercept) or np.isinf(intercept):
- *             raise ValueError("floating-point under-/overflow occured.")
+ *             raise ValueError("floating-point under-/overflow occurred.")
  */
       __pyx_t_3 = __Pyx_GetName(__pyx_m, __pyx_n_s__np); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 506; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
       __Pyx_GOTREF(__pyx_t_3);
@@ -8378,7 +8378,7 @@ static PyObject *__pyx_pf_7sklearn_12linear_model_8sgd_fast_plain_sgd(CYTHON_UNU
  *         # floating-point under-/overflow check.
  *         if np.any(np.isinf(weights)) or np.any(np.isnan(weights)) \
  *            or np.isnan(intercept) or np.isinf(intercept):             # <<<<<<<<<<<<<<
- *             raise ValueError("floating-point under-/overflow occured.")
+ *             raise ValueError("floating-point under-/overflow occurred.")
  * 
  */
         __pyx_t_2 = __Pyx_GetName(__pyx_m, __pyx_n_s__np); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 507; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
@@ -8435,7 +8435,7 @@ static PyObject *__pyx_pf_7sklearn_12linear_model_8sgd_fast_plain_sgd(CYTHON_UNU
       /* "sklearn/linear_model/sgd_fast.pyx":508
  *         if np.any(np.isinf(weights)) or np.any(np.isnan(weights)) \
  *            or np.isnan(intercept) or np.isinf(intercept):
- *             raise ValueError("floating-point under-/overflow occured.")             # <<<<<<<<<<<<<<
+ *             raise ValueError("floating-point under-/overflow occurred.")             # <<<<<<<<<<<<<<
  * 
  *     w.reset_wscale()
  */
@@ -8450,7 +8450,7 @@ static PyObject *__pyx_pf_7sklearn_12linear_model_8sgd_fast_plain_sgd(CYTHON_UNU
   }
 
   /* "sklearn/linear_model/sgd_fast.pyx":510
- *             raise ValueError("floating-point under-/overflow occured.")
+ *             raise ValueError("floating-point under-/overflow occurred.")
  * 
  *     w.reset_wscale()             # <<<<<<<<<<<<<<
  * 
@@ -8822,7 +8822,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
@@ -12776,7 +12776,7 @@ static int __Pyx_InitCachedConstants(void) {
   /* "sklearn/linear_model/sgd_fast.pyx":508
  *         if np.any(np.isinf(weights)) or np.any(np.isnan(weights)) \
  *            or np.isnan(intercept) or np.isinf(intercept):
- *             raise ValueError("floating-point under-/overflow occured.")             # <<<<<<<<<<<<<<
+ *             raise ValueError("floating-point under-/overflow occurred.")             # <<<<<<<<<<<<<<
  * 
  *     w.reset_wscale()
  */
diff --git a/sklearn/linear_model/sgd_fast.pyx b/sklearn/linear_model/sgd_fast.pyx
index 9aa5c0a7cd..888d9c8086 100644
--- a/sklearn/linear_model/sgd_fast.pyx
+++ b/sklearn/linear_model/sgd_fast.pyx
@@ -505,7 +505,7 @@ def plain_sgd(np.ndarray[DOUBLE, ndim=1, mode='c'] weights,
         # floating-point under-/overflow check.
         if np.any(np.isinf(weights)) or np.any(np.isnan(weights)) \
            or np.isnan(intercept) or np.isinf(intercept):
-            raise ValueError("floating-point under-/overflow occured.")
+            raise ValueError("floating-point under-/overflow occurred.")
 
     w.reset_wscale()
 
diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index 6cd9d0ef2f..2102291c47 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -82,7 +82,7 @@ def test_enet_toy():
     """
     Test ElasticNet for various parameters of alpha and l1_ratio.
 
-    Actualy, the parameters alpha = 0 should not be alowed. However,
+    Actually, the parameter alpha = 0 should not be allowed. However,
     we test it as a border case.
 
     ElasticNet is tested with and without precomputed Gram matrix
diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py
index 959106d89d..8ec60760df 100644
--- a/sklearn/linear_model/tests/test_least_angle.py
+++ b/sklearn/linear_model/tests/test_least_angle.py
@@ -403,7 +403,7 @@ def test_lars_cv():
     """ Test the LassoLarsCV object by checking that the optimal alpha
         increases as the number of samples increases.
 
-        This property is not actualy garantied in general and is just a
+        This property is not actually guaranteed in general and is just a
         property of the given dataset, with the given steps chosen.
     """
     old_alpha = 0
diff --git a/sklearn/manifold/spectral_embedding.py b/sklearn/manifold/spectral_embedding.py
index 93b630d0a3..fb5dbc0fdc 100644
--- a/sklearn/manifold/spectral_embedding.py
+++ b/sklearn/manifold/spectral_embedding.py
@@ -104,7 +104,7 @@ def _set_diag(laplacian, value):
         diag_idx = (laplacian.row == laplacian.col)
         laplacian.data[diag_idx] = value
         # If the matrix has a small number of diagonals (as in the
-        # case of structured matrices comming from images), the
+        # case of structured matrices coming from images), the
         # dia format might be best suited for matvec products:
         n_diags = np.unique(laplacian.row - laplacian.col).size
         if n_diags <= 7:
diff --git a/sklearn/metrics/cluster/expected_mutual_info_fast.c b/sklearn/metrics/cluster/expected_mutual_info_fast.c
index c0f43503b5..6ea6e4c32b 100644
--- a/sklearn/metrics/cluster/expected_mutual_info_fast.c
+++ b/sklearn/metrics/cluster/expected_mutual_info_fast.c
@@ -2527,7 +2527,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 9e687c3f59..6370aeb74d 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -57,7 +57,7 @@ def contingency_matrix(labels_true, labels_pred, eps=None):
 
     eps: None or float
         If a float, that value is added to all values in the contingency
-        matrix. This helps to stop NaN propogation.
+        matrix. This helps to stop NaN propagation.
         If ``None``, nothing is adjusted.
 
     Returns
@@ -309,7 +309,7 @@ def homogeneity_score(labels_true, labels_pred):
       >>> homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0])
       1.0
 
-    Non-pefect labelings that futher split classes into more clusters can be
+    Non-perfect labelings that further split classes into more clusters can be
     perfectly homogeneous::
 
       >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2]))
@@ -499,8 +499,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None):
 
     The Mutual Information is a measure of the similarity between two labels of
     the same data. Where :math:`P(i)` is the probability of a random sample
-    occuring in cluster :math:`U_i` and :math:`P'(j)` is the probability of a
-    random sample occuring in cluster :math:`V_j`, the Mutual Information
+    occurring in cluster :math:`U_i` and :math:`P'(j)` is the probability of a
+    random sample occurring in cluster :math:`V_j`, the Mutual Information
     between clusterings :math:`U` and :math:`V` is given as:
 
     .. math::
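
As a quick numerical check of the quantity defined here: two labelings
that agree up to a relabeling attain the maximum, which for two balanced
binary clusters is ln 2 (the implementation uses natural logarithms):

    from sklearn.metrics import mutual_info_score

    # Identical clusterings up to label permutation: MI = ln(2) ~ 0.693
    print(mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]))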
diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py
index b06b1c27d7..21c86b8bb9 100644
--- a/sklearn/metrics/metrics.py
+++ b/sklearn/metrics/metrics.py
@@ -547,7 +547,7 @@ def roc_curve(y_true, y_score, pos_label=None):
 
     y_true : array, shape = [n_samples]
         True binary labels in range {0, 1} or {-1, 1}.  If labels are not
-        binary, pos_label should be explictly given.
+        binary, pos_label should be explicitly given.
 
     y_score : array, shape = [n_samples]
         Target scores, can either be probability estimates of the positive
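
The pos_label handling above in a minimal sketch (the scores are made up):

    import numpy as np
    from sklearn.metrics import roc_curve

    y_true = np.array([1, 1, 2, 2])            # labels not in {0, 1},
    y_score = np.array([0.1, 0.4, 0.35, 0.8])  # so pos_label is required

    fpr, tpr, thresholds = roc_curve(y_true, y_score, pos_label=2)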
@@ -709,7 +709,7 @@ def confusion_matrix(y_true, y_pred, labels=None):
         Estimated targets as returned by a classifier.
 
     labels : array, shape = [n_classes]
-        List of all labels occuring in the dataset.
+        List of all labels occurring in the dataset.
         If none is given, those that appear at least once
         in ``y_true`` or ``y_pred`` are used.
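
A short sketch of the labels parameter (toy labels assumed); pinning the
label order keeps rows and columns stable even when a class never appears
in y_pred:

    from sklearn.metrics import confusion_matrix

    y_true = [2, 0, 2, 2, 0, 1]
    y_pred = [0, 0, 2, 2, 0, 2]
    print(confusion_matrix(y_true, y_pred, labels=[0, 1, 2]))
    # [[2 0 0]
    #  [0 0 1]
    #  [1 0 2]]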
 
@@ -1838,7 +1838,7 @@ def explained_variance_score(y_true, y_pred):
         if numerator == 0.0:
             return 1.0
         else:
-            # arbitary set to zero to avoid -inf scores, having a constant
+            # arbitrarily set to zero to avoid -inf scores, having a constant
             # y_true is not interesting for scoring a regression anyway
             return 0.0
     return 1 - numerator / denominator
@@ -1903,7 +1903,7 @@ def r2_score(y_true, y_pred):
         if numerator == 0.0:
             return 1.0
         else:
-            # arbitary set to zero to avoid -inf scores, having a constant
+            # arbitrarily set to zero to avoid -inf scores, having a constant
             # y_true is not interesting for scoring a regression anyway
             return 0.0
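
The degenerate branch above only fires for a constant y_true; the ordinary
case is the familiar coefficient of determination, e.g.:

    from sklearn.metrics import r2_score

    print(r2_score([3, -0.5, 2, 7], [2.5, 0.0, 2, 8]))  # ~0.948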
 
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index c1a3e26e14..81a5f7d63d 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -76,10 +76,10 @@ def check_pairwise_arrays(X, Y):
     Returns
     -------
     safe_X : {array-like, sparse matrix}, shape = [n_samples_a, n_features]
-        An array equal to X, guarenteed to be a numpy array.
+        An array equal to X, guaranteed to be a numpy array.
 
     safe_Y : {array-like, sparse matrix}, shape = [n_samples_b, n_features]
-        An array equal to Y if Y was not None, guarenteed to be a numpy array.
+        An array equal to Y if Y was not None, guaranteed to be a numpy array.
         If Y was None, safe_Y will be a pointer to X.
 
     """
@@ -573,7 +573,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
     computed. If the input is a distances matrix, it is returned instead.
 
     This method provides a safe way to take a distance matrix as input, while
-    preserving compatability with many other algorithms that take a vector
+    preserving compatibility with many other algorithms that take a vector
     array.
 
     If Y is given (default is None), then the returned matrix is the pairwise
@@ -743,7 +743,7 @@ def pairwise_kernels(X, Y=None, metric="linear", filter_params=False,
     computed. If the input is a kernel matrix, it is returned instead.
 
     This method provides a safe way to take a kernel matrix as input, while
-    preserving compatability with many other algorithms that take a vector
+    preserving compatibility with many other algorithms that take a vector
     array.
 
     If Y is given (default is None), then the returned matrix is the pairwise
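
A sketch of the two input pathways described above (random data assumed):

    import numpy as np
    from sklearn.metrics.pairwise import pairwise_distances

    X = np.random.RandomState(0).rand(5, 3)

    D = pairwise_distances(X, metric='euclidean')     # vector array in
    D2 = pairwise_distances(D, metric='precomputed')  # distance matrix in
    assert np.array_equal(D, D2)                      # returned as-is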
diff --git a/sklearn/metrics/pairwise_fast.c b/sklearn/metrics/pairwise_fast.c
index 5fd5eadf0e..df3bdd6563 100644
--- a/sklearn/metrics/pairwise_fast.c
+++ b/sklearn/metrics/pairwise_fast.c
@@ -4918,7 +4918,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/neighbors/ball_tree.c b/sklearn/neighbors/ball_tree.c
index 491d85c5f1..092fdc425c 100644
--- a/sklearn/neighbors/ball_tree.c
+++ b/sklearn/neighbors/ball_tree.c
@@ -9291,7 +9291,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/pls.py b/sklearn/pls.py
index 01623a0cc5..c4264d8a68 100644
--- a/sklearn/pls.py
+++ b/sklearn/pls.py
@@ -286,20 +286,20 @@ class _PLS(BaseEstimator, TransformerMixin, RegressorMixin):
             # ----------------------
             # Possible memory footprint reduction may be done here: in order to
             # avoid the allocation of a data chunk for the rank-one
-            # approximations matrix which is then substracted to Xk, we suggest
+            # approximations matrix which is then subtracted from Xk, we suggest
             # to perform a column-wise deflation.
             #
             # - regress Xk's on x_score
             x_loadings = np.dot(Xk.T, x_scores) / np.dot(x_scores.T, x_scores)
-            # - substract rank-one approximations to obtain remainder matrix
+            # - subtract rank-one approximations to obtain remainder matrix
             Xk -= np.dot(x_scores, x_loadings.T)
             if self.deflation_mode == "canonical":
-                # - regress Yk's on y_score, then substract rank-one approx.
+                # - regress Yk's on y_score, then subtract rank-one approx.
                 y_loadings = (np.dot(Yk.T, y_scores)
                               / np.dot(y_scores.T, y_scores))
                 Yk -= np.dot(y_scores, y_loadings.T)
             if self.deflation_mode == "regression":
-                # - regress Yk's on x_score, then substract rank-one approx.
+                # - regress Yk's on x_score, then subtract rank-one approx.
                 y_loadings = (np.dot(Yk.T, x_scores)
                               / np.dot(x_scores.T, x_scores))
                 Yk -= np.dot(x_scores, y_loadings.T)
@@ -548,7 +548,7 @@ class PLSRegression(_PLS):
 
 class PLSCanonical(_PLS):
     """ PLSCanonical implements the 2 blocks canonical PLS of the original Wold
-    algorithm [Tenenhaus 1998] p.204, refered as PLS-C2A in [Wegelin 2000].
+    algorithm [Tenenhaus 1998] p.204, referred to as PLS-C2A in [Wegelin 2000].
 
     This class inherits from PLS with mode="A" and deflation_mode="canonical",
     norm_y_weights=True and algorithm="nipals", but svd should provide similar
diff --git a/sklearn/preprocessing.py b/sklearn/preprocessing.py
index 1aba057ebe..214d54540e 100644
--- a/sklearn/preprocessing.py
+++ b/sklearn/preprocessing.py
@@ -575,10 +575,10 @@ class Binarizer(BaseEstimator, TransformerMixin):
 
     Binarization is a common operation on text count data where the
     analyst can decide to only consider the presence or absence of a
+    feature rather than a quantified number of occurrences, for instance.
+    feature rather than a quantified number of occurrences for instance.
 
     It can also be used as a pre-processing step for estimators that
-    consider boolean random variables (e.g. modeled using the Bernoulli
+    consider boolean random variables (e.g. modelled using the Bernoulli
     distribution in a Bayesian setting).
 
     Parameters
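
For instance, binarizing a small count matrix (the counts are made up);
any value above the threshold maps to 1, the rest to 0:

    from sklearn.preprocessing import Binarizer

    counts = [[3, 0, 1],
              [0, 2, 0]]
    print(Binarizer(threshold=0.0).transform(counts))
    # [[ 1.  0.  1.]
    #  [ 0.  1.  0.]]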
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py
index cc0c8ee607..98691538df 100644
--- a/sklearn/svm/classes.py
+++ b/sklearn/svm/classes.py
@@ -45,7 +45,7 @@ class LinearSVC(BaseLibLinear, LinearClassifierMixin, SelectorMixin,
         While `crammer_singer` is interesting from a theoretical perspective
         as it is consistent, it is seldom used in practice, rarely leads to
         better accuracy and is more expensive to compute.
-        If `crammer_singer` is choosen, the options loss, penalty and dual will
+        If `crammer_singer` is chosen, the options loss, penalty and dual will
         be ignored.
 
     fit_intercept : boolean, optional (default=True)
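
Selecting the option discussed above, in a minimal sketch on an assumed
dataset:

    from sklearn.datasets import load_iris
    from sklearn.svm import LinearSVC

    iris = load_iris()
    # With multi_class='crammer_singer', loss, penalty and dual are ignored.
    clf = LinearSVC(multi_class='crammer_singer').fit(iris.data, iris.target)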
diff --git a/sklearn/svm/liblinear.c b/sklearn/svm/liblinear.c
index 58584b0271..3c5167fc57 100644
--- a/sklearn/svm/liblinear.c
+++ b/sklearn/svm/liblinear.c
@@ -2573,7 +2573,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/svm/libsvm.c b/sklearn/svm/libsvm.c
index ec8fbe1ebe..4084080f28 100644
--- a/sklearn/svm/libsvm.c
+++ b/sklearn/svm/libsvm.c
@@ -6204,7 +6204,7 @@ static int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_bu
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/svm/libsvm_sparse.c b/sklearn/svm/libsvm_sparse.c
index 5ab414eeae..4853ef5e4e 100644
--- a/sklearn/svm/libsvm_sparse.c
+++ b/sklearn/svm/libsvm_sparse.c
@@ -4139,7 +4139,7 @@ static int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_bu
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py
index 9e7b1e705a..05ea4beccc 100644
--- a/sklearn/svm/setup.py
+++ b/sklearn/svm/setup.py
@@ -47,7 +47,7 @@ def configuration(parent_package='', top_path=None):
     liblinear_depends = [join('src', 'liblinear', '*.h'),
                          join('src', 'liblinear', 'liblinear_helper.c')]
 
-    # we try to link agains system-wide blas
+    # we try to link against system-wide blas
     blas_info = get_info('blas_opt', 0)
 
     if not blas_info:
diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp
index 7a4e265ae9..adcee012ff 100644
--- a/sklearn/svm/src/libsvm/svm.cpp
+++ b/sklearn/svm/src/libsvm/svm.cpp
@@ -36,7 +36,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
    - Support for dense data by Ming-Fang Weng
 
-   - Return indicies for support vectors, Fabian Pedregosa
+   - Return indices for support vectors, Fabian Pedregosa
      <fabian.pedregosa@inria.fr>
 
    - Fixes to avoid name collision, Fabian Pedregosa
@@ -1895,7 +1895,7 @@ static decision_function svm_train_one(
 	return f;
 }
 
-// Platt's binary SVM Probablistic Output: an improvement from Lin et al.
+// Platt's binary SVM Probabilistic Output: an improvement from Lin et al.
 static void sigmoid_train(
 	int l, const double *dec_values, const double *labels, 
 	double& A, double& B)
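
On the Python side this sigmoid fit is what backs ``probability=True``; a
sketch on an assumed dataset:

    from sklearn.datasets import load_iris
    from sklearn.svm import SVC

    iris = load_iris()
    # probability=True triggers the Platt-style sigmoid calibration above.
    clf = SVC(probability=True).fit(iris.data, iris.target)
    print(clf.predict_proba(iris.data[:2]))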
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index ee6a990823..ed2e285307 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -95,7 +95,7 @@ def test_pipeline_init():
     pipe2 = clone(pipe)
     assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])
 
-    # Check that appart from estimators, the parameters are the same
+    # Check that apart from estimators, the parameters are the same
     params = pipe.get_params()
     params2 = pipe2.get_params()
     # Remove estimators that were copied
diff --git a/sklearn/tree/_tree.c b/sklearn/tree/_tree.c
index e434dd7538..966d8f1ca3 100644
--- a/sklearn/tree/_tree.c
+++ b/sklearn/tree/_tree.c
@@ -14915,7 +14915,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/utils/arpack.py b/sklearn/utils/arpack.py
index 8aa753d081..95f6761461 100644
--- a/sklearn/utils/arpack.py
+++ b/sklearn/utils/arpack.py
@@ -81,9 +81,9 @@ DNAUPD_ERRORS = {
     -8: "Error return from LAPACK eigenvalue calculation;",
     -9: "Starting vector is zero.",
     -10: "IPARAM(7) must be 1,2,3,4.",
-    -11: "IPARAM(7) = 1 and BMAT = 'G' are incompatable.",
+    -11: "IPARAM(7) = 1 and BMAT = 'G' are incompatible.",
     -12: "IPARAM(1) must be equal to 0 or 1.",
-    -13: "NEV and WHICH = 'BE' are incompatable.",
+    -13: "NEV and WHICH = 'BE' are incompatible.",
     -9999: "Could not build an Arnoldi factorization. "
            "IPARAM(5) returns the size of the current Arnoldi "
            "factorization. The user is advised to check that "
@@ -118,9 +118,9 @@ DSAUPD_ERRORS = {
         "Informational error from LAPACK routine dsteqr .",
     -9: "Starting vector is zero.",
     -10: "IPARAM(7) must be 1,2,3,4,5.",
-    -11: "IPARAM(7) = 1 and BMAT = 'G' are incompatable.",
+    -11: "IPARAM(7) = 1 and BMAT = 'G' are incompatible.",
     -12: "IPARAM(1) must be equal to 0 or 1.",
-    -13: "NEV and WHICH = 'BE' are incompatable. ",
+    -13: "NEV and WHICH = 'BE' are incompatible. ",
     -9999: "Could not build an Arnoldi factorization. "
            "IPARAM(5) returns the size of the current Arnoldi "
            "factorization. The user is advised to check that "
diff --git a/sklearn/utils/arraybuilder.c b/sklearn/utils/arraybuilder.c
index daef5b33f8..c87a281de1 100644
--- a/sklearn/utils/arraybuilder.c
+++ b/sklearn/utils/arraybuilder.c
@@ -1499,7 +1499,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/utils/arrayfuncs.c b/sklearn/utils/arrayfuncs.c
index 03ac7ca8f4..e548e31fd7 100644
--- a/sklearn/utils/arrayfuncs.c
+++ b/sklearn/utils/arrayfuncs.c
@@ -1789,7 +1789,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py
index bbb95d6706..02d5e3300a 100644
--- a/sklearn/utils/class_weight.py
+++ b/sklearn/utils/class_weight.py
@@ -19,7 +19,7 @@ def compute_class_weight(class_weight, classes, y_ind):
         If None is given, the class weights will be uniform.
 
     classes : ndarray
-        Array of the classes occuring in the data, as given by
+        Array of the classes occurring in the data, as given by
         ``np.unique(y_org)`` with ``y_org`` the original class labels.
 
     y_ind : array-like, shape=(n_samples,), dtype=int
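
A hedged sketch following the signature shown in this hunk (y_org is a
made-up label vector):

    import numpy as np
    from sklearn.utils.class_weight import compute_class_weight

    y_org = np.array(['a', 'b', 'b', 'b'])
    classes = np.unique(y_org)               # array(['a', 'b'])
    y_ind = np.searchsorted(classes, y_org)  # array([0, 1, 1, 1])

    # class_weight=None yields uniform weights, one per entry of classes.
    print(compute_class_weight(None, classes, y_ind))  # [ 1.  1.]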
diff --git a/sklearn/utils/graph_shortest_path.c b/sklearn/utils/graph_shortest_path.c
index 4973a3c721..03ef368659 100644
--- a/sklearn/utils/graph_shortest_path.c
+++ b/sklearn/utils/graph_shortest_path.c
@@ -4557,7 +4557,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/utils/murmurhash.c b/sklearn/utils/murmurhash.c
index 08e3ca6b8d..d7aae700fa 100644
--- a/sklearn/utils/murmurhash.c
+++ b/sklearn/utils/murmurhash.c
@@ -2764,7 +2764,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/utils/random.c b/sklearn/utils/random.c
index 57e7e6c159..1e0df705a3 100644
--- a/sklearn/utils/random.c
+++ b/sklearn/utils/random.c
@@ -3070,7 +3070,7 @@ static int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_bu
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/utils/seq_dataset.c b/sklearn/utils/seq_dataset.c
index 0215824e5d..7232eff395 100644
--- a/sklearn/utils/seq_dataset.c
+++ b/sklearn/utils/seq_dataset.c
@@ -2438,7 +2438,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/utils/seq_dataset.pyx b/sklearn/utils/seq_dataset.pyx
index 0438784440..66187bf68d 100644
--- a/sklearn/utils/seq_dataset.pyx
+++ b/sklearn/utils/seq_dataset.pyx
@@ -57,8 +57,8 @@ cdef class ArrayDataset(SequentialDataset):
                   np.ndarray[DOUBLE, ndim=1, mode='c'] sample_weights):
         """A ``SequentialDataset`` backed by a two-dimensional numpy array.
 
-        Paramters
-        ---------
+        Parameters
+        ----------
         X : ndarray, dtype=np.float64, ndim=2, mode='c'
             The samples; a two-dimensional c-continuous numpy array of
             dtype np.float64.
diff --git a/sklearn/utils/sparsefuncs.c b/sklearn/utils/sparsefuncs.c
index 7f1a22cc63..fe91e04e35 100644
--- a/sklearn/utils/sparsefuncs.c
+++ b/sklearn/utils/sparsefuncs.c
@@ -4042,7 +4042,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
diff --git a/sklearn/utils/sparsetools/csgraph_wrap.cxx b/sklearn/utils/sparsetools/csgraph_wrap.cxx
index 6329d07f64..ca79cc6c7f 100644
--- a/sklearn/utils/sparsetools/csgraph_wrap.cxx
+++ b/sklearn/utils/sparsetools/csgraph_wrap.cxx
@@ -2899,7 +2899,7 @@ const char* pytype_string(PyObject* py_obj) {
   if (PyModule_Check(  py_obj)) return "module"      ;
   if (PyInstance_Check(py_obj)) return "instance"    ;
 
-  return "unkown type";
+  return "unknown type";
 }
 
 /* Given a NumPy typecode, return a string describing the type.
diff --git a/sklearn/utils/src/gamma.c b/sklearn/utils/src/gamma.c
index 3a4cd503fd..41f61de426 100644
--- a/sklearn/utils/src/gamma.c
+++ b/sklearn/utils/src/gamma.c
@@ -98,7 +98,7 @@ static double sklearn_gamma(double x)
         if (arg_was_less_than_one)
             /* Use identity gamma(z) = gamma(z+1)/z
              * The variable "result" now holds gamma of the original y + 1
-             * Thus we use y-1 to get back the orginal y. */
+             * Thus we use y-1 to get back the original y. */
             result /= (y-1.0);
         else
             /* Use the identity gamma(z+n) = z*(z+1)* ... *(z+n-1)*gamma(z) */
diff --git a/sklearn/utils/weight_vector.c b/sklearn/utils/weight_vector.c
index dee06f66af..28e820d50a 100644
--- a/sklearn/utils/weight_vector.c
+++ b/sklearn/utils/weight_vector.c
@@ -1634,7 +1634,7 @@ static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx
  *         # -- the details of this may change.
  *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
  *             # This implementation of getbuffer is geared towards Cython
- *             # requirements, and does not yet fullfill the PEP.
+ *             # requirements, and does not yet fulfill the PEP.
  */
 
 static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
-- 
GitLab