From 5f4ca067349c04e68c4e1f04070dbc2d1fa36a70 Mon Sep 17 00:00:00 2001 From: Olivier Grisel <olivier.grisel@ensta.org> Date: Sat, 11 Feb 2012 13:58:33 +0100 Subject: [PATCH] FIX: make LFW data shapes consistent with Olivetti faces --- .gitignore | 1 + doc/datasets/labeled_faces.rst | 8 +++++++- doc/whats_new.rst | 4 ++++ examples/applications/face_recognition.py | 9 +++++---- sklearn/datasets/lfw.py | 6 ++++-- sklearn/datasets/tests/test_lfw.py | 9 +++++---- 6 files changed, 26 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 69915c34be..2136b6ea2a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *~ .#* *.swp +*.swo .DS_Store build sklearn/datasets/__config__.py diff --git a/doc/datasets/labeled_faces.rst b/doc/datasets/labeled_faces.rst index 7f86da507c..829eaeed3d 100644 --- a/doc/datasets/labeled_faces.rst +++ b/doc/datasets/labeled_faces.rst @@ -60,6 +60,9 @@ most of the background:: dtype('float32') >>> lfw_people.data.shape + (1288, 1850) + + >>> lfw_people.images.shape (1288, 50, 37) Each of the ``1140`` faces is assigned to a single person id in the ``target`` @@ -80,9 +83,12 @@ is a pair of two picture belonging or not to the same person:: >>> list(lfw_pairs_train.target_names) ['Different persons', 'Same person'] - >>> lfw_pairs_train.data.shape + >>> lfw_pairs_train.pairs.shape (2200, 2, 62, 47) + >>> lfw_pairs_train.data.shape + (2200, 5828) + >>> lfw_pairs_train.target.shape (2200,) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index dd5243896a..37bf80aaa0 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -65,6 +65,10 @@ API changes summary objects are now deprecated. `scores_` or `pvalues_` should be used instead. + - LFW ``data`` is now always shape ``(n_samples, n_features)`` to be + consistent with the Olivetti faces dataset. Use the ``images`` and + ``pairs`` attributes to access the natural image shapes instead. + .. 
_changes_0_10: 0.10 diff --git a/examples/applications/face_recognition.py b/examples/applications/face_recognition.py index 5e29a8dc47..8844cd3207 100644 --- a/examples/applications/face_recognition.py +++ b/examples/applications/face_recognition.py @@ -53,11 +53,12 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4) -# reshape the data using the traditional (n_samples, n_features) shape -faces = lfw_people.data -n_samples, h, w = faces.shape +# introspect the images arrays to find the shapes (for plotting) +n_samples, h, w = lfw_people.images.shape -X = faces.reshape((n_samples, h * w)) +# for machine learning we use the 2D data directly (as relative pixel +# position info is ignored by this model) +X = lfw_people.data n_features = X.shape[1] # the label to predict is the id of the person diff --git a/sklearn/datasets/lfw.py b/sklearn/datasets/lfw.py index c0469d0221..c836231a95 100644 --- a/sklearn/datasets/lfw.py +++ b/sklearn/datasets/lfw.py @@ -272,7 +272,8 @@ def fetch_lfw_people(data_home=None, funneled=True, resize=0.5, min_faces_per_person=min_faces_per_person, color=color, slice_=slice_) # pack the results as a Bunch instance - return Bunch(data=faces, target=target, target_names=target_names, + return Bunch(data=faces.reshape(len(faces), -1), images=faces, + target=target, target_names=target_names, DESCR="LFW faces dataset") @@ -421,7 +422,8 @@ def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, slice_=slice_) # pack the results as a Bunch instance - return Bunch(data=pairs, target=target, target_names=target_names, + return Bunch(data=pairs.reshape(len(pairs), -1), pairs=pairs, + target=target, target_names=target_names, DESCR="'%s' segment of the LFW pairs dataset" % subset) diff --git a/sklearn/datasets/tests/test_lfw.py b/sklearn/datasets/tests/test_lfw.py index 8bbb00d7c4..4c069d8513 100644 --- a/sklearn/datasets/tests/test_lfw.py 
+++ b/sklearn/datasets/tests/test_lfw.py @@ -121,7 +121,8 @@ def test_load_fake_lfw_people(): # The data is croped around the center as a rectangular bounding box # arounthe the face. Colors are converted to gray levels: - assert_equal(lfw_people.data.shape, (10, 62, 47)) + assert_equal(lfw_people.images.shape, (10, 62, 47)) + assert_equal(lfw_people.data.shape, (10, 2914)) # the target is array of person integer ids assert_array_equal(lfw_people.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2]) @@ -134,7 +135,7 @@ def test_load_fake_lfw_people(): # conversion and not limit on the number of picture per person lfw_people = load_lfw_people(data_home=SCIKIT_LEARN_DATA, resize=None, slice_=None, color=True) - assert_equal(lfw_people.data.shape, (17, 250, 250, 3)) + assert_equal(lfw_people.images.shape, (17, 250, 250, 3)) # the ids and class names are the same as previously assert_array_equal(lfw_people.target, @@ -159,7 +160,7 @@ def test_load_fake_lfw_pairs(): # The data is croped around the center as a rectangular bounding box # arounthe the face. Colors are converted to gray levels: - assert_equal(lfw_pairs_train.data.shape, (10, 2, 62, 47)) + assert_equal(lfw_pairs_train.pairs.shape, (10, 2, 62, 47)) # the target is whether the person is the same or not assert_array_equal(lfw_pairs_train.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]) @@ -172,7 +173,7 @@ def test_load_fake_lfw_pairs(): # conversion lfw_pairs_train = load_lfw_pairs(data_home=SCIKIT_LEARN_DATA, resize=None, slice_=None, color=True) - assert_equal(lfw_pairs_train.data.shape, (10, 2, 250, 250, 3)) + assert_equal(lfw_pairs_train.pairs.shape, (10, 2, 250, 250, 3)) # the ids and class names are the same as previously assert_array_equal(lfw_pairs_train.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]) -- GitLab