From 2e2eebf4ab0e9a2a5e518cc54f5c93a4e40d1550 Mon Sep 17 00:00:00 2001
From: Andreas Mueller <amueller@ais.uni-bonn.de>
Date: Thu, 13 Nov 2014 11:07:15 -0500
Subject: [PATCH] Make ParameterSampler sample without replacement if all
 parameters are given as lists.

---
 doc/whats_new.rst                    |  3 ++
 sklearn/grid_search.py               | 57 ++++++++++++++++++++++++++++++++++++++++++++++------
 sklearn/tests/test_grid_search.py    | 54 ++++++++++++++++----------
 sklearn/tests/test_metaestimators.py |  2 +-
 4 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 251870907f..09a6e38d44 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -158,6 +158,9 @@ Enhancements
      :class:`tree.DecisionTreeClassifier`, :class:`ensemble.ExtraTreesClassifier`
      and :class:`tree.ExtraTreeClassifier`. By `Trevor Stephens`_.
 
+   - :class:`grid_search.RandomizedSearchCV` now does sampling without
+     replacement if all parameters are given as lists. By `Andreas Mueller`_.
+
 Documentation improvements
 ..........................
 
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index 31eabe7f8b..c0e428f4bc 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -26,6 +26,7 @@ from .cross_validation import _fit_and_score
 from .externals.joblib import Parallel, delayed
 from .externals import six
 from .utils import check_random_state
+from .utils.random import sample_without_replacement
 from .utils.validation import _num_samples, indexable
 from .utils.metaestimators import if_delegate_has_method
 from .metrics.scorer import check_scoring
@@ -113,7 +114,11 @@ class ParameterSampler(object):
     """Generator on parameters sampled from given distributions.
 
     Non-deterministic iterable over random candidate combinations for hyper-
-    parameter search.
+    parameter search. If all parameters are presented as a list,
+    sampling without replacement is performed. If at least one parameter
+    is given as a distribution, sampling with replacement is used.
+    It is highly recommended to use continuous distributions for continuous
+    parameters.
 
     Note that as of SciPy 0.12, the ``scipy.stats.distributions`` do not
     accept a custom RNG instance and always use the singleton RNG from
@@ -165,17 +170,39 @@ class ParameterSampler(object):
         self.random_state = random_state
 
     def __iter__(self):
+        samples = []
+        # check if all distributions are given as lists
+        # in this case we want to sample without replacement
+        all_lists = np.all([not hasattr(v, "rvs")
+                            for v in self.param_distributions.values()])
         rnd = check_random_state(self.random_state)
-        # Always sort the keys of a dictionary, for reproducibility
-        items = sorted(self.param_distributions.items())
-        for _ in range(self.n_iter):
-            params = dict()
-            for k, v in items:
-                if hasattr(v, "rvs"):
-                    params[k] = v.rvs()
-                else:
-                    params[k] = v[rnd.randint(len(v))]
-            yield params
+
+        if all_lists:
+            # get complete grid and yield from it
+            param_grid = list(ParameterGrid(self.param_distributions))
+            grid_size = len(param_grid)
+
+            if grid_size < self.n_iter:
+                raise ValueError(
+                    "The total space of parameters %d is smaller "
+                    "than n_iter=%d." % (grid_size, self.n_iter)
+                    + " For exhaustive searches, use GridSearchCV.")
+            for i in sample_without_replacement(grid_size, self.n_iter,
+                                                random_state=rnd):
+                yield param_grid[i]
+
+        else:
+            # Always sort the keys of a dictionary, for reproducibility
+            items = sorted(self.param_distributions.items())
+            while len(samples) < self.n_iter:
+                params = dict()
+                for k, v in items:
+                    if hasattr(v, "rvs"):
+                        params[k] = v.rvs()
+                    else:
+                        params[k] = v[rnd.randint(len(v))]
+                samples.append(params)
+                yield params
 
     def __len__(self):
         """Number of points that will be sampled."""
@@ -249,7 +276,7 @@ def _check_param_grid(param_grid):
             raise ValueError("Parameter array should be one-dimensional.")
 
         check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
-        if not True in check:
+        if True not in check:
             raise ValueError("Parameter values should be a list.")
 
         if len(v) == 0:
@@ -717,6 +744,12 @@ class RandomizedSearchCV(BaseSearchCV):
     distributions. The number of parameter settings that are tried is
     given by n_iter.
 
+    If all parameters are presented as a list,
+    sampling without replacement is performed. If at least one parameter
+    is given as a distribution, sampling with replacement is used.
+    It is highly recommended to use continuous distributions for continuous
+    parameters.
+
     Parameters
     ----------
     estimator : object type that implements the "fit" and "predict" methods
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py
index 0d03de69a5..8fe2bdc627 100644
--- a/sklearn/tests/test_grid_search.py
+++ b/sklearn/tests/test_grid_search.py
@@ -26,7 +26,7 @@ from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import ignore_warnings
 from sklearn.utils.mocking import CheckingClassifier, MockDataFrame
 
-from scipy.stats import distributions
+from scipy.stats import bernoulli, expon, uniform
 
 from sklearn.externals.six.moves import zip
 from sklearn.base import BaseEstimator
@@ -214,7 +214,7 @@ def test_trivial_grid_scores():
     grid_search.fit(X, y)
     assert_true(hasattr(grid_search, "grid_scores_"))
 
-    random_search = RandomizedSearchCV(clf, {'foo_param': [0]})
+    random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
     random_search.fit(X, y)
     assert_true(hasattr(random_search, "grid_scores_"))
 
@@ -530,7 +530,7 @@ def test_gridsearch_no_predict():
 def test_param_sampler():
     # test basic properties of param sampler
     param_distributions = {"kernel": ["rbf", "linear"],
-                           "C": distributions.uniform(0, 1)}
+                           "C": uniform(0, 1)}
     sampler = ParameterSampler(param_distributions=param_distributions,
                                n_iter=10, random_state=0)
     samples = [x for x in sampler]
@@ -549,8 +549,8 @@ def test_randomized_search_grid_scores():
     # XXX: as of today (scipy 0.12) it's not possible to set the random seed
     # of scipy.stats distributions: the assertions in this test should thus
     # not depend on the randomization
-    params = dict(C=distributions.expon(scale=10),
-                  gamma=distributions.expon(scale=0.1))
+    params = dict(C=expon(scale=10),
+                  gamma=expon(scale=0.1))
     n_cv_iter = 3
     n_search_iter = 30
     search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
@@ -615,7 +615,7 @@ def test_pickle():
     pickle.dumps(grid_search)  # smoke test
 
     random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
-                                       refit=True)
+                                       refit=True, n_iter=3)
     random_search.fit(X, y)
     pickle.dumps(random_search)  # smoke test
 
@@ -647,20 +647,7 @@ def test_grid_search_with_multioutput_data():
 
     # Test with a randomized search
     for est in estimators:
-        random_search = RandomizedSearchCV(est, est_parameters, cv=cv)
-        random_search.fit(X, y)
-        for parameters, _, cv_validation_scores in random_search.grid_scores_:
-            est.set_params(**parameters)
-
-            for i, (train, test) in enumerate(cv):
-                est.fit(X[train], y[train])
-                correct_score = est.score(X[test], y[test])
-                assert_almost_equal(correct_score,
-                                    cv_validation_scores[i])
-
-    # Test with a randomized search
-    for est in estimators:
-        random_search = RandomizedSearchCV(est, est_parameters, cv=cv)
+        random_search = RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3)
         random_search.fit(X, y)
         for parameters, _, cv_validation_scores in random_search.grid_scores_:
             est.set_params(**parameters)
@@ -758,3 +745,30 @@ def test_grid_search_failing_classifier_raise():
 
     # FailingClassifier issues a ValueError so this is what we look for.
     assert_raises(ValueError, gs.fit, X, y)
+
+
+def test_parameters_sampler_replacement():
+    # raise error if n_iter too large
+    params = {'first': [0, 1], 'second': ['a', 'b', 'c']}
+    sampler = ParameterSampler(params, n_iter=7)
+    assert_raises(ValueError, list, sampler)
+    # degenerates to GridSearchCV if n_iter the same as grid_size
+    sampler = ParameterSampler(params, n_iter=6)
+    samples = list(sampler)
+    assert_equal(len(samples), 6)
+    for values in ParameterGrid(params):
+        assert_true(values in samples)
+
+    # test sampling without replacement in a large grid
+    params = {'a': range(10), 'b': range(10), 'c': range(10)}
+    sampler = ParameterSampler(params, n_iter=99, random_state=42)
+    samples = list(sampler)
+    assert_equal(len(samples), 99)
+    hashable_samples = ["a%db%dc%d" % (p['a'], p['b'], p['c']) for p in samples]
+    assert_equal(len(set(hashable_samples)), 99)
+
+    # doesn't go into infinite loops
+    params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']}
+    sampler = ParameterSampler(params_distribution, n_iter=7)
+    samples = list(sampler)
+    assert_equal(len(samples), 7)
diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py
index 90241694bd..9e959052cf 100644
--- a/sklearn/tests/test_metaestimators.py
+++ b/sklearn/tests/test_metaestimators.py
@@ -30,7 +30,7 @@ DELEGATING_METAESTIMATORS = [
                   skip_methods=['score']),
     DelegatorData('RandomizedSearchCV',
                   lambda est: RandomizedSearchCV(
-                      est, param_distributions={'param': [5]}, cv=2),
+                      est, param_distributions={'param': [5]}, cv=2, n_iter=1),
                   skip_methods=['score']),
     DelegatorData('RFE', RFE,
                   skip_methods=['transform', 'inverse_transform', 'score']),
-- 
GitLab
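
A minimal usage sketch of the behavior this patch introduces, assuming a
scikit-learn checkout with the patch applied (`ParameterSampler` is importable
from `sklearn.grid_search` at this point in the codebase; the parameter values
below are illustrative, not from the patch):

    from scipy.stats import expon
    from sklearn.grid_search import ParameterSampler

    # All parameters given as lists: the 2x3 grid is enumerated and sampled
    # without replacement, so the four candidates below are all distinct.
    param_lists = {"kernel": ["rbf", "linear"], "C": [1, 10, 100]}
    samples = list(ParameterSampler(param_lists, n_iter=4, random_state=0))
    assert len(samples) == 4

    # Requesting more candidates than the grid contains now raises ValueError
    # instead of silently repeating combinations:
    # list(ParameterSampler(param_lists, n_iter=7))  # ValueError

    # At least one scipy.stats distribution: sampling keeps the old
    # with-replacement behavior, so duplicates remain possible.
    param_dists = {"kernel": ["rbf", "linear"], "C": expon(scale=10)}
    samples = list(ParameterSampler(param_dists, n_iter=4, random_state=0))
    assert len(samples) == 4

With n_iter equal to the grid size, the all-lists case degenerates to an
exhaustive GridSearchCV-style enumeration, which is exactly what the new
test_parameters_sampler_replacement test asserts.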