From 9d6a0b0046e8c2cbee369a841894f04675b42b44 Mon Sep 17 00:00:00 2001
From: Gael Varoquaux <gael.varoquaux@normalesup.org>
Date: Mon, 7 May 2012 14:55:52 +0200
Subject: [PATCH] ENH: update joblib

2012-05-07 Vlad Niculae

    ENH: controlled randomness in tests and doctest fix

2012-02-21 GaelVaroquaux

    ENH: add verbosity in memory

2012-02-21 GaelVaroquaux

    BUG: non-reproducible hashing: order of kwargs

    The ordering of a dictionary is random. As a result the function
    hashing was not reproducible.
---
 sklearn/externals/joblib/__init__.py          |  3 +-
 sklearn/externals/joblib/func_inspect.py      |  2 +-
 sklearn/externals/joblib/memory.py            | 25 ++++++++++++---
 sklearn/externals/joblib/test/test_hashing.py |  6 ++--
 sklearn/externals/joblib/test/test_memory.py  |  4 ++-
 .../joblib/test/test_numpy_pickle.py          | 32 +++++++++++++++++--
 6 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/sklearn/externals/joblib/__init__.py b/sklearn/externals/joblib/__init__.py
index a14aa72321..5c2d87f90f 100755
--- a/sklearn/externals/joblib/__init__.py
+++ b/sklearn/externals/joblib/__init__.py
@@ -60,6 +60,7 @@ Main features
       inputs and outputs: Python functions. Joblib can save their
       computation to disk and rerun it only if necessary::
 
+        >>> import numpy as np
         >>> from sklearn.externals.joblib import Memory
         >>> mem = Memory(cachedir='/tmp/joblib')
         >>> import numpy as np
@@ -101,7 +102,7 @@ Main features
 
 """
 
-__version__ = '0.6.3'
+__version__ = '0.6.4'
 
 
 from .memory import Memory
diff --git a/sklearn/externals/joblib/func_inspect.py b/sklearn/externals/joblib/func_inspect.py
index 9a84cc38a2..10eebc7b2e 100755
--- a/sklearn/externals/joblib/func_inspect.py
+++ b/sklearn/externals/joblib/func_inspect.py
@@ -207,7 +207,7 @@ def filter_args(func, ignore_lst, args=(), kwargs=dict()):
                 )
 
     varkwargs = dict()
-    for arg_name, arg_value in kwargs.iteritems():
+    for arg_name, arg_value in sorted(kwargs.items()):
         if arg_name in arg_dict:
             arg_dict[arg_name] = arg_value
         elif arg_keywords is not None:
diff --git a/sklearn/externals/joblib/memory.py b/sklearn/externals/joblib/memory.py
index b8fc0c196d..f0a0252f42 100755
--- a/sklearn/externals/joblib/memory.py
+++ b/sklearn/externals/joblib/memory.py
@@ -159,10 +159,15 @@ class MemorizedFunc(Logger):
     def __call__(self, *args, **kwargs):
         # Compare the function code with the previous to see if the
         # function code has changed
-        output_dir, _ = self.get_output_dir(*args, **kwargs)
+        output_dir, argument_hash = self.get_output_dir(*args, **kwargs)
         # FIXME: The statements below should be try/excepted
         if not (self._check_previous_func_code(stacklevel=3) and
                                  os.path.exists(output_dir)):
+            if self._verbose > 10:
+                _, name = get_func_name(self.func)
+                self.warn('Computing func %s, argument hash %s in '
+                          'directory %s'
+                        % (name, argument_hash, output_dir))
             return self.call(*args, **kwargs)
         else:
             try:
@@ -287,6 +292,10 @@ class MemorizedFunc(Logger):
 
         # The function has changed, wipe the cache directory.
         # XXX: Should be using warnings, and giving stacklevel
+        if self._verbose > 10:
+            _, func_name = get_func_name(self.func, resolv_alias=False)
+            self.warn("Function %s (stored in %s) has changed." %
+                        (func_name, func_dir))
         self.clear(warn=True)
         return False
 
@@ -308,12 +317,11 @@ class MemorizedFunc(Logger):
             persist the output values.
""" start_time = time.time() + output_dir, argument_hash = self.get_output_dir(*args, **kwargs) if self._verbose: print self.format_call(*args, **kwargs) - output_dir, argument_hash = self.get_output_dir(*args, **kwargs) output = self.func(*args, **kwargs) self._persist_output(output, output_dir) - input_repr = self._persist_input(output_dir, *args, **kwargs) duration = time.time() - start_time if self._verbose: _, name = get_func_name(self.func) @@ -368,6 +376,8 @@ class MemorizedFunc(Logger): mkdirp(dir) filename = os.path.join(dir, 'output.pkl') numpy_pickle.dump(output, filename, compress=self.compress) + if self._verbose > 10: + print 'Persisting in %s' % dir except OSError: " Race condition in the creation of the directory " @@ -398,10 +408,17 @@ class MemorizedFunc(Logger): """ if self._verbose > 1: t = time.time() - self.timestamp - print '[Memory]% 16s: Loading %s...' % ( + if self._verbose < 10: + print '[Memory]% 16s: Loading %s...' % ( format_time(t), self.format_signature(self.func)[0] ) + else: + print '[Memory]% 16s: Loading %s from %s' % ( + format_time(t), + self.format_signature(self.func)[0], + output_dir + ) filename = os.path.join(output_dir, 'output.pkl') return numpy_pickle.load(filename, mmap_mode=self.mmap_mode) diff --git a/sklearn/externals/joblib/test/test_hashing.py b/sklearn/externals/joblib/test/test_hashing.py index 02ae4d9c39..7fdf822852 100755 --- a/sklearn/externals/joblib/test/test_hashing.py +++ b/sklearn/externals/joblib/test/test_hashing.py @@ -93,7 +93,8 @@ def test_hash_methods(): def test_hash_numpy(): """ Test hashing with numpy arrays. """ - arr1 = np.random.random((10, 10)) + rnd = np.random.RandomState(0) + arr1 = rnd.random_sample((10, 10)) arr2 = arr1.copy() arr3 = arr2.copy() arr3[0] += 1 @@ -160,7 +161,8 @@ def test_hash_numpy_performance(): In [26]: %timeit hash(a) 100 loops, best of 3: 20.8 ms per loop """ - a = np.random.random(1000000) + rnd = np.random.RandomState(0) + a = rnd.random_sample(1000000) md5_hash = lambda x: hashlib.md5(np.getbuffer(x)).hexdigest() relative_diff = relative_time(md5_hash, hash, a) diff --git a/sklearn/externals/joblib/test/test_memory.py b/sklearn/externals/joblib/test/test_memory.py index 028670ee00..356f366cbc 100755 --- a/sklearn/externals/joblib/test/test_memory.py +++ b/sklearn/externals/joblib/test/test_memory.py @@ -325,8 +325,10 @@ def test_memory_numpy(): verbose=0) memory.clear(warn=False) cached_n = memory.cache(n) + + rnd = np.random.RandomState(0) for i in range(3): - a = np.random.random((10, 10)) + a = rnd.random_sample((10, 10)) for _ in range(3): yield nose.tools.assert_true, np.all(cached_n(a) == a) yield nose.tools.assert_equal, len(accumulator), i + 1 diff --git a/sklearn/externals/joblib/test/test_numpy_pickle.py b/sklearn/externals/joblib/test/test_numpy_pickle.py index b8069615c9..f5d34cc136 100755 --- a/sklearn/externals/joblib/test/test_numpy_pickle.py +++ b/sklearn/externals/joblib/test/test_numpy_pickle.py @@ -134,7 +134,8 @@ def test_value_error(): @with_numpy def test_numpy_persistence(): filename = env['filename'] - a = np.random.random((10, 2)) + rnd = np.random.RandomState(0) + a = rnd.random_sample((10, 2)) for compress, cache_size in ((0, 0), (1, 0), (1, 10)): # We use 'a.T' to have a non C-contiguous array. 
         for index, obj in enumerate(((a,), (a.T,), (a, a), [a, a, a])):
@@ -183,7 +184,8 @@ def test_numpy_persistence():
 
 @with_numpy
 def test_memmap_persistence():
-    a = np.random.random(10)
+    rnd = np.random.RandomState(0)
+    a = rnd.random_sample(10)
     filename = env['filename'] + str(random.randint(0, 1000))
     numpy_pickle.dump(a, filename)
     b = numpy_pickle.load(filename, mmap_mode='r')
@@ -195,7 +197,8 @@
 def test_masked_array_persistence():
     # The special-case picker fails, because saving masked_array
     # not implemented, but it just delegates to the standard pickler.
-    a = np.random.random(10)
+    rnd = np.random.RandomState(0)
+    a = rnd.random_sample(10)
     a = np.ma.masked_greater(a, 0.5)
     filename = env['filename'] + str(random.randint(0, 1000))
     numpy_pickle.dump(a, filename)
@@ -210,3 +213,26 @@ def test_z_file():
     numpy_pickle.write_zfile(file(filename, 'wb'), data)
     data_read = numpy_pickle.read_zfile(file(filename, 'rb'))
     nose.tools.assert_equal(data, data_read)
+
+################################################################################
+# Test dumping array subclasses
+if np is not None:
+
+    class SubArray(np.ndarray):
+
+        def __reduce__(self):
+            return (_load_sub_array, (np.asarray(self), ))
+
+
+    def _load_sub_array(arr):
+        d = SubArray(arr.shape)
+        d[:] = arr
+        return d
+
+
+@with_numpy
+def test_numpy_subclass():
+    filename = env['filename']
+    a = SubArray((10,))
+    numpy_pickle.dump(a, filename)
+    c = numpy_pickle.load(filename)
+    nose.tools.assert_true(isinstance(c, SubArray))
-- 
GitLab
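The BUG entry in the commit message explains that hashing call arguments straight
from a kwargs dict was not reproducible because dict iteration order is arbitrary;
the func_inspect.py hunk fixes this by iterating over sorted(kwargs.items()). The
sketch below (placed after the signature line, so it is ignored by git am) only
illustrates that idea: args_digest and the md5-of-repr scheme are stand-ins, not
joblib's actual hashing code.

    import hashlib

    def args_digest(kwargs, sort=False):
        # Hash a canonical representation of the keyword arguments.
        items = sorted(kwargs.items()) if sort else list(kwargs.items())
        return hashlib.md5(repr(items).encode('utf-8')).hexdigest()

    # The same keyword arguments supplied in a different order can iterate
    # differently, so an unsorted digest may change between calls or runs,
    # while the sorted digest (what the patch switches to) stays stable.
    a = dict(x=1, y=2)
    b = dict(y=2, x=1)
    assert args_digest(a, sort=True) == args_digest(b, sort=True)

The test changes in the same patch apply the same reproducibility idea to data:
they draw arrays from a seeded np.random.RandomState(0) instead of the global
numpy random generator.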