diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index e2e6122aad162981569f8f936e637251e941ff90..d8619a99e706e61858f18807ada3fbbb2dc2b8bc 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -197,7 +197,7 @@ class VectorizerMixin(object):
             return strip_accents
 
     def build_tokenizer(self):
-        """Return a function that split a string in sequence of tokens"""
+        """Return a function that splits a string into a sequence of tokens"""
         if self.tokenizer is not None:
             return self.tokenizer
         token_pattern = re.compile(self.token_pattern)
@@ -274,12 +274,19 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     ----------
 
     input: string {'filename', 'file', 'content'}
-        If filename, the sequence passed as an argument to fit is
+        If 'filename', the sequence passed as an argument to fit is
         expected to be a list of filenames that need reading to fetch
         the raw content to analyze.
 
-        If 'file', the sequence items must have 'read' method (file-like
-        object) it is called to fetch the bytes in memory.
+        If 'file', the sequence items must have a 'read' method (file-like
+        object) that is called to fetch the bytes in memory.
 
-        Otherwise the input is expected to be the sequence strings or
-        bytes items are expected to be analyzed directly.
+        Otherwise the input is expected to be a sequence of string or
+        bytes items that are analyzed directly.
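+
+        A minimal usage sketch with the default ``input='content'``,
+        shown for illustration only::
+
+            >>> from sklearn.feature_extraction.text import HashingVectorizer
+            >>> v = HashingVectorizer(input='content')
+            >>> X = v.transform(['first document', 'second document'])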
@@ -335,8 +342,15 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
 
     token_pattern: string
         Regular expression denoting what constitutes a "token", only used
-        if `analyzer == 'word'`. The default regexp select tokens of 2
-        or more letters characters (punctuation is completely ignored
+        if `analyzer == 'word'`. The default regexp selects tokens of 2
+        or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).
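+
+        For instance, a small sketch of what the default pattern matches
+        (assuming the default ``(?u)\b\w\w+\b``)::
+
+            >>> import re
+            >>> re.findall(r"(?u)\b\w\w+\b", "I ate 3 red apples!")
+            ['ate', 'red', 'apples']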
 
     n_features : integer, optional, (2 ** 20) by default
@@ -475,12 +489,12 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     Parameters
     ----------
     input : string {'filename', 'file', 'content'}
-        If filename, the sequence passed as an argument to fit is
+        If 'filename', the sequence passed as an argument to fit is
         expected to be a list of filenames that need reading to fetch
         the raw content to analyze.
 
-        If 'file', the sequence items must have 'read' method (file-like
-        object) it is called to fetch the bytes in memory.
+        If 'file', the sequence items must have a 'read' method (file-like
+        object) that is called to fetch the bytes in memory.
 
-        Otherwise the input is expected to be the sequence strings or
-        bytes items are expected to be analyzed directly.
+        Otherwise the input is expected to be a sequence of string or
+        bytes items that are analyzed directly.
@@ -535,13 +549,13 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
         in the range [0.7, 1.0) to automatically detect and filter stop
         words based on intra corpus document frequency of terms.
 
-    lowercase : boolean, default True
-        Convert all characters to lowercase befor tokenizing.
+    lowercase : boolean, True by default
+        Convert all characters to lowercase before tokenizing.
 
     token_pattern : string
         Regular expression denoting what constitutes a "token", only used
-        if `tokenize == 'word'`. The default regexp select tokens of 2
-        or more letters characters (punctuation is completely ignored
+        if `analyzer == 'word'`. The default regexp selects tokens of 2
+        or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).
 
     max_df : float in range [0.0, 1.0] or int, optional, 1.0 by default
@@ -1035,12 +1049,12 @@ class TfidfVectorizer(CountVectorizer):
     Parameters
     ----------
     input : string {'filename', 'file', 'content'}
-        If filename, the sequence passed as an argument to fit is
+        If 'filename', the sequence passed as an argument to fit is
         expected to be a list of filenames that need reading to fetch
         the raw content to analyze.
 
-        If 'file', the sequence items must have 'read' method (file-like
-        object) it is called to fetch the bytes in memory.
+        If 'file', the sequence items must have a 'read' method (file-like
+        object) that is called to fetch the bytes in memory.
 
-        Otherwise the input is expected to be the sequence strings or
-        bytes items are expected to be analyzed directly.
+        Otherwise the input is expected to be a sequence of string or
+        bytes items that are analyzed directly.
@@ -1094,12 +1108,12 @@ class TfidfVectorizer(CountVectorizer):
         words based on intra corpus document frequency of terms.
 
-    lowercase : boolean, default True
-        Convert all characters to lowercase befor tokenizing.
+    lowercase : boolean, True by default
+        Convert all characters to lowercase before tokenizing.
 
     token_pattern : string
         Regular expression denoting what constitutes a "token", only used
-        if `analyzer == 'word'`. The default regexp select tokens of 2
-        or more letters characters (punctuation is completely ignored
+        if `analyzer == 'word'`. The default regexp selects tokens of 2
+        or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).
 
     max_df : float in range [0.0, 1.0] or int, optional, 1.0 by default