diff --git a/.gitignore b/.gitignore
index 69915c34befaf3cebe9f4b00d8e3080894d312b4..2136b6ea2a1183bf5fd3ce18c327409533a38a71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 *~
 .#*
 *.swp
+*.swo
 .DS_Store
 build
 sklearn/datasets/__config__.py
diff --git a/doc/datasets/labeled_faces.rst b/doc/datasets/labeled_faces.rst
index 7f86da507ca9297d54ef131c0e5aafbc028e3171..829eaeed3d5ff937f334ec137d6f67750c716a7a 100644
--- a/doc/datasets/labeled_faces.rst
+++ b/doc/datasets/labeled_faces.rst
@@ -60,6 +60,9 @@ most of the background::
   dtype('float32')

   >>> lfw_people.data.shape
+  (1288, 1850)
+
+  >>> lfw_people.images.shape
   (1288, 50, 37)

 Each of the ``1140`` faces is assigned to a single person id in the ``target``
@@ -80,9 +83,12 @@ is a pair of two picture belonging or not to the same person::
   >>> list(lfw_pairs_train.target_names)
   ['Different persons', 'Same person']

-  >>> lfw_pairs_train.data.shape
+  >>> lfw_pairs_train.pairs.shape
   (2200, 2, 62, 47)

+  >>> lfw_pairs_train.data.shape
+  (2200, 5828)
+
   >>> lfw_pairs_train.target.shape
   (2200,)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index dd5243896ae2ddcb01d1739d5ec907f18e66d554..37bf80aaa0b29fe1c7390d0c3164efc4ff669d06 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -65,6 +65,10 @@ API changes summary
     objects are now deprecated. `scores_` or `pvalues_` should be used
     instead.

+  - LFW ``data`` is now always shape ``(n_samples, n_features)`` to be
+    consistent with the Olivetti faces dataset. Use the ``images`` and
+    ``pairs`` attributes to access the natural image shapes instead.
+
 .. _changes_0_10:

 0.10
diff --git a/examples/applications/face_recognition.py b/examples/applications/face_recognition.py
index 5e29a8dc479557c2e4d8779f35b20850ec32e1a9..8844cd320764fd479de58f0b7fa0c5565f183b62 100644
--- a/examples/applications/face_recognition.py
+++ b/examples/applications/face_recognition.py
@@ -53,11 +53,12 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

 lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

-# reshape the data using the traditional (n_samples, n_features) shape
-faces = lfw_people.data
-n_samples, h, w = faces.shape
+# introspect the images array to find the shapes (for plotting)
+n_samples, h, w = lfw_people.images.shape

-X = faces.reshape((n_samples, h * w))
+# for machine learning we use the 2D data directly (as relative pixel
+# position info is ignored by this model)
+X = lfw_people.data
 n_features = X.shape[1]

 # the label to predict is the id of the person
diff --git a/sklearn/datasets/lfw.py b/sklearn/datasets/lfw.py
index c0469d0221be7815c527a218e8456fbf7eb1b232..c836231a9524b5223b1d80b57b26b883ec935ac6 100644
--- a/sklearn/datasets/lfw.py
+++ b/sklearn/datasets/lfw.py
@@ -272,7 +272,8 @@ def fetch_lfw_people(data_home=None, funneled=True, resize=0.5,
         min_faces_per_person=min_faces_per_person, color=color, slice_=slice_)

     # pack the results as a Bunch instance
-    return Bunch(data=faces, target=target, target_names=target_names,
+    return Bunch(data=faces.reshape(len(faces), -1), images=faces,
+                 target=target, target_names=target_names,
                  DESCR="LFW faces dataset")


@@ -421,7 +422,8 @@ def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5,
         slice_=slice_)

     # pack the results as a Bunch instance
-    return Bunch(data=pairs, target=target, target_names=target_names,
+    return Bunch(data=pairs.reshape(len(pairs), -1), pairs=pairs,
+                 target=target, target_names=target_names,
                  DESCR="'%s' segment of the LFW pairs dataset" % subset)


diff --git a/sklearn/datasets/tests/test_lfw.py b/sklearn/datasets/tests/test_lfw.py
index 8bbb00d7c419efbf2b9b2d6fabbbb23410cf16ee..4c069d8513f3233a87fcba0d3ded3e803dc699ad 100644
--- a/sklearn/datasets/tests/test_lfw.py
+++ b/sklearn/datasets/tests/test_lfw.py
@@ -121,7 +121,8 @@ def test_load_fake_lfw_people():

     # The data is croped around the center as a rectangular bounding box
     # arounthe the face. Colors are converted to gray levels:
-    assert_equal(lfw_people.data.shape, (10, 62, 47))
+    assert_equal(lfw_people.images.shape, (10, 62, 47))
+    assert_equal(lfw_people.data.shape, (10, 2914))

     # the target is array of person integer ids
     assert_array_equal(lfw_people.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2])
@@ -134,7 +135,7 @@ def test_load_fake_lfw_people():
     # conversion and not limit on the number of picture per person
     lfw_people = load_lfw_people(data_home=SCIKIT_LEARN_DATA, resize=None,
                                  slice_=None, color=True)
-    assert_equal(lfw_people.data.shape, (17, 250, 250, 3))
+    assert_equal(lfw_people.images.shape, (17, 250, 250, 3))

     # the ids and class names are the same as previously
     assert_array_equal(lfw_people.target,
@@ -159,7 +160,7 @@ def test_load_fake_lfw_pairs():

     # The data is croped around the center as a rectangular bounding box
     # arounthe the face. Colors are converted to gray levels:
-    assert_equal(lfw_pairs_train.data.shape, (10, 2, 62, 47))
+    assert_equal(lfw_pairs_train.pairs.shape, (10, 2, 62, 47))

     # the target is whether the person is the same or not
     assert_array_equal(lfw_pairs_train.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
@@ -172,7 +173,7 @@ def test_load_fake_lfw_pairs():
     # conversion
     lfw_pairs_train = load_lfw_pairs(data_home=SCIKIT_LEARN_DATA, resize=None,
                                      slice_=None, color=True)
-    assert_equal(lfw_pairs_train.data.shape, (10, 2, 250, 250, 3))
+    assert_equal(lfw_pairs_train.pairs.shape, (10, 2, 250, 250, 3))

     # the ids and class names are the same as previously
     assert_array_equal(lfw_pairs_train.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
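For context, a minimal usage sketch of the Bunch layout introduced by this patch (attribute names and example shapes are taken from the diff above; running it downloads the LFW data on first use):

```python
from sklearn.datasets import fetch_lfw_people, fetch_lfw_pairs

# ``data`` is now always 2D (n_samples, n_features), while ``images`` keeps
# the natural (n_samples, height, width) shape that plotting code needs.
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
n_samples, h, w = lfw_people.images.shape             # e.g. (1288, 50, 37)
assert lfw_people.data.shape == (n_samples, h * w)    # e.g. (1288, 1850)

# The pairs loader follows the same convention: ``pairs`` keeps the
# (n_pairs, 2, height, width) layout, ``data`` is the flattened view.
lfw_pairs_train = fetch_lfw_pairs(subset='train')
n_pairs = lfw_pairs_train.pairs.shape[0]              # pairs: (2200, 2, 62, 47)
assert lfw_pairs_train.data.shape[0] == n_pairs       # data:  (2200, 5828)
```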