From 63583fe658886cc5eb48a2ba9a541d5f6be7194b Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Wed, 30 Nov 2016 06:43:32 +1300
Subject: [PATCH] BUG: for several datasets, ``download_if_missing`` keyword
 was ignored. (#7944)

---
 sklearn/datasets/california_housing.py    | 4 ++++
 sklearn/datasets/covtype.py               | 3 +++
 sklearn/datasets/kddcup99.py              | 3 +++
 sklearn/datasets/olivetti_faces.py        | 4 ++++
 sklearn/datasets/species_distributions.py | 3 +++
 sklearn/datasets/tests/test_covtype.py    | 6 ++----
 sklearn/datasets/tests/test_kddcup99.py   | 6 ++----
 7 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
index c109fee618..8a74ad9e60 100644
--- a/sklearn/datasets/california_housing.py
+++ b/sklearn/datasets/california_housing.py
@@ -87,8 +87,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
     data_home = get_data_home(data_home=data_home)
     if not exists(data_home):
         makedirs(data_home)
+
     filepath = _pkl_filepath(data_home, TARGET_FILENAME)
     if not exists(filepath):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
         print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
         archive_fileobj = BytesIO(urlopen(DATA_URL).read())
         fileobj = tarfile.open(
diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
index f7cb1ed03f..6e0b4d2d0d 100644
--- a/sklearn/datasets/covtype.py
+++ b/sklearn/datasets/covtype.py
@@ -99,6 +99,9 @@ def fetch_covtype(data_home=None, download_if_missing=True,
 
         joblib.dump(X, samples_path, compress=9)
         joblib.dump(y, targets_path, compress=9)
+    elif not available:
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
 
     try:
         X, y
diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py
index 824809a80e..03bf3f8d8f 100644
--- a/sklearn/datasets/kddcup99.py
+++ b/sklearn/datasets/kddcup99.py
@@ -345,6 +345,9 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
 
         joblib.dump(X, samples_path, compress=0)
         joblib.dump(y, targets_path, compress=0)
+    elif not available:
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
 
     try:
         X, y
diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py
index e74d65d60e..5f3af040dc 100644
--- a/sklearn/datasets/olivetti_faces.py
+++ b/sklearn/datasets/olivetti_faces.py
@@ -111,6 +111,9 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
         makedirs(data_home)
     filepath = _pkl_filepath(data_home, TARGET_FILENAME)
     if not exists(filepath):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
         print('downloading Olivetti faces from %s to %s'
               % (DATA_URL, data_home))
         fhandle = urlopen(DATA_URL)
@@ -121,6 +124,7 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
         del mfile
     else:
         faces = joblib.load(filepath)
+
     # We want floating point data, but float32 is enough (there is only
     # one byte of precision in the original uint8s anyway)
     faces = np.float32(faces)
diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
index 6af36e6745..330c535620 100644
--- a/sklearn/datasets/species_distributions.py
+++ b/sklearn/datasets/species_distributions.py
@@ -222,6 +222,9 @@ def fetch_species_distributions(data_home=None,
     archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)
 
     if not exists(archive_path):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
         print('Downloading species data from %s to %s' % (SAMPLES_URL,
                                                           data_home))
         X = np.load(BytesIO(urlopen(SAMPLES_URL).read()))
diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py
index f32511d7c9..c980bb86fc 100644
--- a/sklearn/datasets/tests/test_covtype.py
+++ b/sklearn/datasets/tests/test_covtype.py
@@ -3,7 +3,6 @@
 Skipped if covtype is not already downloaded to data_home.
 """
 
-import errno
 from sklearn.datasets import fetch_covtype
 from sklearn.utils.testing import assert_equal, SkipTest
 
@@ -15,9 +14,8 @@ def fetch(*args, **kwargs):
 def test_fetch():
     try:
         data1 = fetch(shuffle=True, random_state=42)
-    except IOError as e:
-        if e.errno == errno.ENOENT:
-            raise SkipTest("Covertype dataset can not be loaded.")
+    except IOError:
+        raise SkipTest("Covertype dataset can not be loaded.")
 
     data2 = fetch(shuffle=True, random_state=37)
 
diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py
index 414c89763c..498b98f4e6 100644
--- a/sklearn/datasets/tests/test_kddcup99.py
+++ b/sklearn/datasets/tests/test_kddcup99.py
@@ -5,7 +5,6 @@ The test is skipped if the data wasn't previously fetched and saved to
 scikit-learn data folder.
 """
 
-import errno
 from sklearn.datasets import fetch_kddcup99
 from sklearn.utils.testing import assert_equal, SkipTest
 
@@ -13,9 +12,8 @@ from sklearn.utils.testing import assert_equal, SkipTest
 def test_percent10():
     try:
         data = fetch_kddcup99(download_if_missing=False)
-    except IOError as e:
-        if e.errno == errno.ENOENT:
-            raise SkipTest("kddcup99 dataset can not be loaded.")
+    except IOError:
+        raise SkipTest("kddcup99 dataset can not be loaded.")
 
     assert_equal(data.data.shape, (494021, 41))
     assert_equal(data.target.shape, (494021,))
--
GitLab
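
A minimal usage sketch of the behavior this patch enforces, modeled on the
updated tests (the choice of fetcher and the printed message are illustrative,
not part of the diff). With ``download_if_missing=False`` and no cached copy
under data_home, each fetcher now raises IOError instead of ignoring the
keyword and downloading anyway, so callers can opt out of network access:

    from sklearn.datasets import fetch_covtype

    try:
        # Succeeds only if the covertype data was previously cached in
        # data_home; no download is attempted with download_if_missing=False.
        data = fetch_covtype(download_if_missing=False)
    except IOError:
        # Raised by the patched fetcher when the data is not found locally:
        # "Data not found and `download_if_missing` is False"
        print("covtype not cached; skipping without downloading")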