From 5f4ca067349c04e68c4e1f04070dbc2d1fa36a70 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Sat, 11 Feb 2012 13:58:33 +0100
Subject: [PATCH] FIX: make LFW data shapes consistent with Olivetti faces

---
 .gitignore                                | 1 +
 doc/datasets/labeled_faces.rst            | 8 +++++++-
 doc/whats_new.rst                         | 4 ++++
 examples/applications/face_recognition.py | 9 +++++----
 sklearn/datasets/lfw.py                   | 6 ++++--
 sklearn/datasets/tests/test_lfw.py        | 9 +++++----
 6 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 69915c34be..2136b6ea2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 *~
 .#*
 *.swp
+*.swo
 .DS_Store
 build
 sklearn/datasets/__config__.py
diff --git a/doc/datasets/labeled_faces.rst b/doc/datasets/labeled_faces.rst
index 7f86da507c..829eaeed3d 100644
--- a/doc/datasets/labeled_faces.rst
+++ b/doc/datasets/labeled_faces.rst
@@ -60,6 +60,9 @@ most of the background::
   dtype('float32')
 
   >>> lfw_people.data.shape
+  (1288, 1850)
+
+  >>> lfw_people.images.shape
   (1288, 50, 37)
 
 Each of the ``1140`` faces is assigned to a single person id in the ``target``
@@ -80,9 +83,12 @@ is a pair of two picture belonging or not to the same person::
   >>> list(lfw_pairs_train.target_names)
   ['Different persons', 'Same person']
 
-  >>> lfw_pairs_train.data.shape
+  >>> lfw_pairs_train.pairs.shape
   (2200, 2, 62, 47)
 
+  >>> lfw_pairs_train.data.shape
+  (2200, 5828)
+
   >>> lfw_pairs_train.target.shape
   (2200,)
 
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index dd5243896a..37bf80aaa0 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -65,6 +65,10 @@ API changes summary
      objects are now deprecated.
      `scores_` or `pvalues_` should be used instead.
 
+   - LFW ``data`` now always has shape ``(n_samples, n_features)`` to be
+     consistent with the Olivetti faces dataset. Use the ``images`` and
+     ``pairs`` attributes to access the natural image shapes instead.
+
 .. _changes_0_10:
 
 0.10
diff --git a/examples/applications/face_recognition.py b/examples/applications/face_recognition.py
index 5e29a8dc47..8844cd3207 100644
--- a/examples/applications/face_recognition.py
+++ b/examples/applications/face_recognition.py
@@ -53,11 +53,12 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
 
 lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
 
-# reshape the data using the traditional (n_samples, n_features) shape
-faces = lfw_people.data
-n_samples, h, w = faces.shape
+# introspect the images array to find the shapes (for plotting)
+n_samples, h, w = lfw_people.images.shape
 
-X = faces.reshape((n_samples, h * w))
+# for machine learning we use the 2D data directly (as relative pixel
+# position info is ignored by this model)
+X = lfw_people.data
 n_features = X.shape[1]
 
 # the label to predict is the id of the person
diff --git a/sklearn/datasets/lfw.py b/sklearn/datasets/lfw.py
index c0469d0221..c836231a95 100644
--- a/sklearn/datasets/lfw.py
+++ b/sklearn/datasets/lfw.py
@@ -272,7 +272,8 @@ def fetch_lfw_people(data_home=None, funneled=True, resize=0.5,
         min_faces_per_person=min_faces_per_person, color=color, slice_=slice_)
 
     # pack the results as a Bunch instance
-    return Bunch(data=faces, target=target, target_names=target_names,
+    return Bunch(data=faces.reshape(len(faces), -1), images=faces,
+                 target=target, target_names=target_names,
                  DESCR="LFW faces dataset")
 
 
@@ -421,7 +422,8 @@ def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5,
         slice_=slice_)
 
     # pack the results as a Bunch instance
-    return Bunch(data=pairs, target=target, target_names=target_names,
+    return Bunch(data=pairs.reshape(len(pairs), -1), pairs=pairs,
+                 target=target, target_names=target_names,
                  DESCR="'%s' segment of the LFW pairs dataset" % subset)
 
 
diff --git a/sklearn/datasets/tests/test_lfw.py b/sklearn/datasets/tests/test_lfw.py
index 8bbb00d7c4..4c069d8513 100644
--- a/sklearn/datasets/tests/test_lfw.py
+++ b/sklearn/datasets/tests/test_lfw.py
@@ -121,7 +121,8 @@ def test_load_fake_lfw_people():
 
     # The data is croped around the center as a rectangular bounding box
     # arounthe the face. Colors are converted to gray levels:
-    assert_equal(lfw_people.data.shape, (10, 62, 47))
+    assert_equal(lfw_people.images.shape, (10, 62, 47))
+    assert_equal(lfw_people.data.shape, (10, 2914))
 
     # the target is array of person integer ids
     assert_array_equal(lfw_people.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2])
@@ -134,7 +135,7 @@ def test_load_fake_lfw_people():
     # conversion and not limit on the number of picture per person
     lfw_people = load_lfw_people(data_home=SCIKIT_LEARN_DATA,
                                  resize=None, slice_=None, color=True)
-    assert_equal(lfw_people.data.shape, (17, 250, 250, 3))
+    assert_equal(lfw_people.images.shape, (17, 250, 250, 3))
 
     # the ids and class names are the same as previously
     assert_array_equal(lfw_people.target,
@@ -159,7 +160,7 @@ def test_load_fake_lfw_pairs():
 
     # The data is croped around the center as a rectangular bounding box
     # arounthe the face. Colors are converted to gray levels:
-    assert_equal(lfw_pairs_train.data.shape, (10, 2, 62, 47))
+    assert_equal(lfw_pairs_train.pairs.shape, (10, 2, 62, 47))
 
     # the target is whether the person is the same or not
     assert_array_equal(lfw_pairs_train.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
@@ -172,7 +173,7 @@ def test_load_fake_lfw_pairs():
     # conversion
     lfw_pairs_train = load_lfw_pairs(data_home=SCIKIT_LEARN_DATA,
                                      resize=None, slice_=None, color=True)
-    assert_equal(lfw_pairs_train.data.shape, (10, 2, 250, 250, 3))
+    assert_equal(lfw_pairs_train.pairs.shape, (10, 2, 250, 250, 3))
 
     # the ids and class names are the same as previously
     assert_array_equal(lfw_pairs_train.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
-- 
GitLab