diff --git a/doc/datasets/twenty_newsgroups.rst b/doc/datasets/twenty_newsgroups.rst
index 01c2a53ff77e5401068b3991fe052ff5559ddfcb..e0e845a04f53992cb59e1e49b4bc046c8f3dd6e0 100644
--- a/doc/datasets/twenty_newsgroups.rst
+++ b/doc/datasets/twenty_newsgroups.rst
@@ -132,8 +132,8 @@ which is fast to train and achieves a decent F-score::
   >>> clf = MultinomialNB(alpha=.01)
   >>> clf.fit(vectors, newsgroups_train.target)
   >>> pred = clf.predict(vectors_test)
-  >>> metrics.f1_score(newsgroups_test.target, pred, average='weighted')
-  0.88251152461278892
+  >>> metrics.f1_score(newsgroups_test.target, pred, average='macro')
+  0.88213592402729568
 
 (The example :ref:`example_text_document_classification_20newsgroups.py` shuffles
 the training and test data, instead of segmenting by time, and in that case
@@ -182,8 +182,8 @@ blocks, and quotation blocks respectively.
   ...                                      categories=categories)
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> pred = clf.predict(vectors_test)
-  >>> metrics.f1_score(pred, newsgroups_test.target, average='weighted')
-  0.78409163025839435
+  >>> metrics.f1_score(pred, newsgroups_test.target, average='macro')
+  0.77310350681274775
 
 This classifier lost a lot of its F-score, just because we removed
 metadata that has little to do with topic classification.
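
For illustration only (not applied by this patch): a minimal sketch contrasting the two averaging modes being swapped above. `average='macro'` takes the unweighted mean of the per-class F1 scores, while `average='weighted'` weights each class by its support, so the two diverge whenever classes are unbalanced. The toy labels below are made up.

```python
from sklearn.metrics import f1_score

# Class 0 dominates; the minority classes are partly misclassified.
y_true = [0, 0, 0, 0, 1, 1, 2]
y_pred = [0, 0, 0, 0, 1, 2, 1]

# Unweighted mean of per-class F1 scores.
print(f1_score(y_true, y_pred, average='macro'))
# Support-weighted mean, dominated here by the majority class.
print(f1_score(y_true, y_pred, average='weighted'))
```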
diff --git a/examples/model_selection/grid_search_digits.py b/examples/model_selection/grid_search_digits.py
index 40ed573247efd0f61ec9de000ea5c2c8d3adc160..13755b0bc8c105ac1dd5f636dee405bb8ba523b1 100644
--- a/examples/model_selection/grid_search_digits.py
+++ b/examples/model_selection/grid_search_digits.py
@@ -51,7 +51,7 @@ for score in scores:
     print()
 
     clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
-                       scoring='%s_weighted' % score)
+                       scoring='%s_macro' % score)
     clf.fit(X_train, y_train)
 
     print("Best parameters set found on development set:")
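
For context (not part of the patch): `'%s_macro' % score` expands to built-in scorer names such as `'precision_macro'` and `'recall_macro'`. Below is a self-contained sketch of how those scoring strings plug into `GridSearchCV`; the dataset, split, and parameter grid are illustrative stand-ins, not the example's own values.

```python
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

digits = load_digits()
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Illustrative grid; the real example tunes kernel, C and gamma more widely.
param_grid = {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [1e-3, 1e-4]}

for score in ['precision', 'recall']:
    # 'precision_macro' / 'recall_macro' average per-class scores equally.
    clf = GridSearchCV(SVC(), param_grid, cv=5, scoring='%s_macro' % score)
    clf.fit(X_train, y_train)
    print(score, clf.best_params_, clf.best_score_)
```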
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index f28e6cc77093b12a9db2dff23bdbc05565dd15fb..2794948ce93d5c3c4758656906af1c364f0802fc 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -469,7 +469,7 @@ def test_precision_recall_f1_score_multiclass_pos_label_none():
     # compute scores with default labels introspection
     p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                  pos_label=None,
-                                                 average='weighted')
+                                                 average='binary')
 
 
 def test_zero_precision_recall():
@@ -482,10 +482,10 @@ def test_zero_precision_recall():
         y_pred = np.array([2, 0, 1, 1, 2, 0])
 
         assert_almost_equal(precision_score(y_true, y_pred,
-                                            average='weighted'), 0.0, 2)
-        assert_almost_equal(recall_score(y_true, y_pred, average='weighted'),
+                                            average='macro'), 0.0, 2)
+        assert_almost_equal(recall_score(y_true, y_pred, average='macro'),
                             0.0, 2)
-        assert_almost_equal(f1_score(y_true, y_pred, average='weighted'),
+        assert_almost_equal(f1_score(y_true, y_pred, average='macro'),
                             0.0, 2)
 
     finally:
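
Not part of the patch: a short sketch, outside the test harness, of the situation `test_zero_precision_recall` checks. With the predictions rotated so that no label is ever correct, every per-class precision, recall, and F1 is zero, and so are their macro averages.

```python
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

y_true = np.array([0, 1, 2, 0, 1, 2])
y_pred = np.array([2, 0, 1, 1, 2, 0])  # every prediction hits a wrong class

print(precision_score(y_true, y_pred, average='macro'))  # 0.0
print(recall_score(y_true, y_pred, average='macro'))     # 0.0
print(f1_score(y_true, y_pred, average='macro'))         # 0.0
```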
diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py
index 670180695e4523d619354d1c4b35ead034be5cd5..df9f6f988c0c50080ecf1f2c9c07133704f8cba1 100644
--- a/sklearn/svm/tests/test_svm.py
+++ b/sklearn/svm/tests/test_svm.py
@@ -439,9 +439,9 @@ def test_auto_weight():
         y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)
         clf.set_params(class_weight='balanced')
         y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced],).predict(X)
-        assert_true(metrics.f1_score(y, y_pred, average='weighted')
+        assert_true(metrics.f1_score(y, y_pred, average='macro')
                     <= metrics.f1_score(y, y_pred_balanced,
-                                        average='weighted'))
+                                        average='macro'))
 
 
 def test_bad_input():
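
Not part of the patch: a rough, self-contained sketch of the comparison `test_auto_weight` makes above, using macro-averaged F1. The dataset and the way the training set is unbalanced below are illustrative approximations of the test's setup, not a copy of it.

```python
import numpy as np
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
X, y = iris.data[:, :2], iris.target   # two features, so classes overlap
# Drop every other sample of class 2 to unbalance the training set.
unbalanced = np.delete(np.arange(y.size), np.where(y > 1)[0][::2])

clf = SVC(kernel='linear')
y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X)

clf.set_params(class_weight='balanced')
y_pred_balanced = clf.fit(X[unbalanced], y[unbalanced]).predict(X)

# With 'balanced' class weights the macro F1 should not get worse.
print(metrics.f1_score(y, y_pred, average='macro')
      <= metrics.f1_score(y, y_pred_balanced, average='macro'))
```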