diff --git a/doc/datasets/twenty_newsgroups.rst b/doc/datasets/twenty_newsgroups.rst
index e0e845a04f53992cb59e1e49b4bc046c8f3dd6e0..55fe227682190547eb5944f2b84931ab14dba0f4 100644
--- a/doc/datasets/twenty_newsgroups.rst
+++ b/doc/datasets/twenty_newsgroups.rst
@@ -197,8 +197,8 @@ It loses even more if we also strip this metadata from the training data:
   >>> clf.fit(vectors, newsgroups_train.target)
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> pred = clf.predict(vectors_test)
-  >>> metrics.f1_score(newsgroups_test.target, pred, average='weighted')
-  0.73160869205141166
+  >>> metrics.f1_score(newsgroups_test.target, pred, average='macro')
+  0.65437545099490202
 
 Some other classifiers cope better with this harder version of the task. Try
 running :ref:`example_model_selection_grid_search_text_feature_extraction.py` with and without
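The hunk above swaps the reported score from a support-weighted F1 to a macro-averaged F1. As a quick aside (not part of the patch), the sketch below uses toy labels, assumed purely for illustration, to show how the two averaging modes differ: 'macro' gives every class equal weight, while 'weighted' scales each per-class F1 by its support, so the majority class dominates the result.

    # Illustration only: contrast macro vs. weighted F1 on imbalanced toy labels.
    from sklearn import metrics

    y_true = [0, 0, 0, 0, 1, 1, 2, 2]   # class 0 holds half of the samples
    y_pred = [0, 0, 0, 1, 1, 1, 2, 1]

    # 'macro': unweighted mean of the per-class F1 scores.
    print(metrics.f1_score(y_true, y_pred, average='macro'))
    # 'weighted': per-class F1 scores weighted by each class's support.
    print(metrics.f1_score(y_true, y_pred, average='weighted'))

With these labels the dominant class is also the best-predicted one, so the weighted average comes out higher than the macro average; switching the documented example to 'macro' likewise lowers the quoted figure.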
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 2794948ce93d5c3c4758656906af1c364f0802fc..5f93333e585cff750d9fe6f265a0b92dd8f0cade 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -469,7 +469,7 @@ def test_precision_recall_f1_score_multiclass_pos_label_none():
     # compute scores with default labels introspection
     p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                  pos_label=None,
-                                                 average='binary')
+                                                 average='macro')
 
 
 def test_zero_precision_recall():
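
For context on the test change (again a sketch outside the patch, relying only on the public precision_recall_fscore_support API): 'binary' averaging is only defined for binary targets, and recent scikit-learn releases reject it for multiclass input with a ValueError, so the multiclass test now requests an explicit macro average.

    # Illustration only: macro-averaging precision/recall/F1 on multiclass labels.
    import numpy as np
    from sklearn.metrics import precision_recall_fscore_support

    y_true = np.array([0, 1, 2, 0, 1, 2])
    y_pred = np.array([0, 2, 1, 0, 0, 2])

    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average='macro')
    print(p, r, f)  # one scalar per metric, averaged over the three classes
    print(s)        # support is returned as None whenever an average is requested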