diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 09b4a7090deffa41905b53f3368fda08bb3405bc..8df8604b77b26140669e361e534f02b82dfdd0fa 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -3,172 +3,9 @@ The :mod:`sklearn.utils` module includes various utilites. """ import numpy as np -import scipy.sparse as sp import warnings - -def assert_all_finite(X): - """Throw a ValueError if X contains NaN or infinity. - - Input MUST be an np.ndarray instance or a scipy.sparse matrix.""" - - # First try an O(n) time, O(1) space solution for the common case that - # there everything is finite; fall back to O(n) space np.isfinite to - # prevent false positives from overflow in sum method. - if X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) \ - and not np.isfinite(X).all(): - raise ValueError("array contains NaN or infinity") - - -def safe_asarray(X, dtype=None, order=None): - """Convert X to an array or sparse matrix. - - Prevents copying X when possible; sparse matrices are passed through.""" - if not sp.issparse(X): - X = np.asarray(X, dtype, order) - assert_all_finite(X) - return X - - -def as_float_array(X, copy=True): - """Converts an array-like to an array of floats - - The new dtype will be np.float32 or np.float64, depending on the original - type. The function can create a copy or modify the argument depending - on the argument copy. - - Parameters - ---------- - X : array - - copy : bool, optional - If True, a copy of X will be created. If False, a copy may still be - returned if X's dtype is not a floating point type. 
- - Returns - ------- - X : array - An array of type np.float - """ - if isinstance(X, np.matrix): - X = X.A - elif not isinstance(X, np.ndarray) and not sp.issparse(X): - return safe_asarray(X, dtype=np.float64) - if X.dtype in [np.float32, np.float64]: - return X.copy() if copy else X - if X.dtype == np.int32: - X = X.astype(np.float32) - else: - X = X.astype(np.float64) - return X - - -def array2d(X, dtype=None, order=None): - """Returns at least 2-d array with data from X""" - return np.asarray(np.atleast_2d(X), dtype=dtype, order=order) - - -def atleast2d_or_csr(X): - """Like numpy.atleast_2d, but converts sparse matrices to CSR format - - Also, converts np.matrix to np.ndarray. - """ - X = X.tocsr() if sp.issparse(X) else array2d(X) - assert_all_finite(X) - return X - - -def check_random_state(seed): - """Turn seed into a np.random.RandomState instance - - If seed is None, return the RandomState singleton used by np.random. - If seed is an int, return a new RandomState instance seeded with seed. - If seed is already a RandomState instance, return it. - Otherwise raise ValueError. - """ - if seed is None or seed is np.random: - return np.random.mtrand._rand - if isinstance(seed, int): - return np.random.RandomState(seed) - if isinstance(seed, np.random.RandomState): - return seed - raise ValueError('%r cannot be used to seed a numpy.random.RandomState' - ' instance' % seed) - - -def _num_samples(x): - """Return number of samples in array-like x.""" - if not hasattr(x, '__len__') and not hasattr(x, 'shape'): - raise TypeError("Expected sequence or array-like, got %r" % x) - return x.shape[0] if hasattr(x, 'shape') else len(x) - - -def check_arrays(*arrays, **options): - """Checked that all arrays have consistent first dimensions - - Parameters - ---------- - *arrays : sequence of arrays or scipy.sparse matrices with same shape[0] - Python lists or tuples occurring in arrays are converted to 1D numpy - arrays. 
- - sparse_format : 'csr' or 'csc', None by default - If not None, any scipy.sparse matrix is converted to - Compressed Sparse Rows or Compressed Sparse Columns representations. - - copy : boolean, False by default - If copy is True, ensure that returned arrays are copies of the original - (if not already converted to another format earlier in the process). - """ - sparse_format = options.pop('sparse_format', None) - if sparse_format not in (None, 'csr', 'csc'): - raise ValueError('Unexpected sparse format: %r' % sparse_format) - copy = options.pop('copy', False) - if options: - raise ValueError("Unexpected kw arguments: %r" % options.keys()) - - if len(arrays) == 0: - return None - - n_samples = _num_samples(arrays[0]) - - checked_arrays = [] - for array in arrays: - array_orig = array - if array is None: - # special case: ignore optional y=None kwarg pattern - checked_arrays.append(array) - continue - - size = _num_samples(array) - - if size != n_samples: - raise ValueError("Found array with dim %d. Expected %d" % ( - size, n_samples)) - - if sp.issparse(array): - if sparse_format == 'csr': - array = array.tocsr() - elif sparse_format == 'csc': - array = array.tocsc() - else: - array = np.asarray(array) - - if copy and array is array_orig: - array = array.copy() - checked_arrays.append(array) - - return checked_arrays - - -def warn_if_not_float(X, estimator='This algorithm'): - """Warning utility function to check that data type is floating point""" - if not isinstance(estimator, basestring): - estimator = estimator.__class__.__name__ - if X.dtype.kind != 'f': - warnings.warn("%s assumes floating point values as input, " - "got %s" % (estimator, X.dtype)) - +from validation import * class deprecated(object): """Decorator to mark a function or class as deprecated. 
"""
Utilities for input validation (``sklearn/utils/validation.py``).
"""

import numbers
import warnings

import numpy as np
import scipy.sparse as sp

try:
    # Python 2: ``basestring`` covers both ``str`` and ``unicode``.
    _string_types = basestring
except NameError:
    # Python 3: ``basestring`` no longer exists.
    _string_types = str

# Sparse formats whose ``data`` attribute is a plain numeric ndarray of the
# explicitly stored entries; other formats are converted to CSR first.
_NDARRAY_DATA_FORMATS = ('csr', 'csc', 'coo', 'bsr')


def assert_all_finite(X):
    """Throw a ValueError if X contains NaN or infinity.

    Input MUST be an np.ndarray instance or a scipy.sparse matrix.

    Parameters
    ----------
    X : np.ndarray or scipy.sparse matrix
        For sparse input only the explicitly stored values are inspected
        (implicit zeros are always finite).

    Raises
    ------
    ValueError
        If X has a floating point dtype and holds a NaN or an infinity.
    """
    if sp.issparse(X):
        # np.isfinite cannot be applied to a sparse matrix object, so the
        # check is run on the array of stored values instead.
        if X.format not in _NDARRAY_DATA_FORMATS:
            X = X.tocsr()
        X = X.data

    if X.dtype.char not in np.typecodes['AllFloat']:
        return

    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in the sum method.
    if not np.isfinite(X.sum()) and not np.isfinite(X).all():
        raise ValueError("array contains NaN or infinity")


def safe_asarray(X, dtype=None, order=None):
    """Convert X to an array or sparse matrix.

    Prevents copying X when possible; sparse matrices are passed through.
    The result is checked with ``assert_all_finite`` before being returned.
    """
    if not sp.issparse(X):
        X = np.asarray(X, dtype, order)
    assert_all_finite(X)
    return X


def as_float_array(X, copy=True):
    """Converts an array-like to an array of floats

    The new dtype will be np.float32 or np.float64, depending on the original
    type. The function can create a copy or modify the argument depending
    on the argument copy.

    Parameters
    ----------
    X : array

    copy : bool, optional
        If True, a copy of X will be created. If False, a copy may still be
        returned if X's dtype is not a floating point type.

    Returns
    -------
    X : array
        An array of dtype np.float32 (when the input is np.float32 or
        np.int32) or np.float64 (everything else).
    """
    if isinstance(X, np.matrix):
        X = X.A
    elif not isinstance(X, np.ndarray) and not sp.issparse(X):
        # Generic array-likes (lists, tuples, ...) are always converted,
        # which yields a new float64 array.
        return safe_asarray(X, dtype=np.float64)

    if X.dtype in [np.float32, np.float64]:
        return X.copy() if copy else X

    # int32 values fit in float32; any other dtype is widened to float64.
    target = np.float32 if X.dtype == np.int32 else np.float64
    return X.astype(target)


def array2d(X, dtype=None, order=None):
    """Returns at least 2-d array with data from X"""
    return np.asarray(np.atleast_2d(X), dtype=dtype, order=order)


def atleast2d_or_csr(X):
    """Like numpy.atleast_2d, but converts sparse matrices to CSR format

    Also, converts np.matrix to np.ndarray. The result is checked with
    ``assert_all_finite`` before being returned.
    """
    X = X.tocsr() if sp.issparse(X) else array2d(X)
    assert_all_finite(X)
    return X


def _num_samples(x):
    """Return number of samples in array-like x.

    Uses ``x.shape[0]`` when x has a ``shape`` attribute and ``len(x)``
    otherwise; raises TypeError when x offers neither.
    """
    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
        raise TypeError("Expected sequence or array-like, got %r" % x)
    return x.shape[0] if hasattr(x, 'shape') else len(x)


def check_arrays(*arrays, **options):
    """Check that all arrays have consistent first dimensions

    Parameters
    ----------
    *arrays : sequence of arrays or scipy.sparse matrices with same shape[0]
        Non-sparse entries (Python lists, tuples, ...) are converted with
        np.asarray. None entries are passed through untouched.

    sparse_format : 'csr' or 'csc', None by default
        If not None, any scipy.sparse matrix is converted to
        Compressed Sparse Rows or Compressed Sparse Columns representations.

    copy : boolean, False by default
        If copy is True, ensure that returned arrays are copies of the original
        (if not already converted to another format earlier in the process).

    Returns
    -------
    checked_arrays : list, or None when called without any array

    Raises
    ------
    ValueError
        On an unknown sparse_format, unexpected keyword arguments, or arrays
        whose first dimensions differ.
    """
    sparse_format = options.pop('sparse_format', None)
    if sparse_format not in (None, 'csr', 'csc'):
        raise ValueError('Unexpected sparse format: %r' % sparse_format)
    copy = options.pop('copy', False)
    if options:
        # Format a list so the message is the same on every Python version.
        raise ValueError("Unexpected kw arguments: %r" % list(options.keys()))

    if not arrays:
        return None

    n_samples = _num_samples(arrays[0])

    checked_arrays = []
    for array in arrays:
        array_orig = array
        if array is None:
            # special case: ignore optional y=None kwarg pattern
            checked_arrays.append(array)
            continue

        size = _num_samples(array)

        if size != n_samples:
            raise ValueError("Found array with dim %d. Expected %d" % (
                size, n_samples))

        if sp.issparse(array):
            if sparse_format == 'csr':
                array = array.tocsr()
            elif sparse_format == 'csc':
                array = array.tocsc()
        else:
            array = np.asarray(array)

        # Only copy when no conversion above already produced a new object.
        if copy and array is array_orig:
            array = array.copy()
        checked_arrays.append(array)

    return checked_arrays


def warn_if_not_float(X, estimator='This algorithm'):
    """Warning utility function to check that data type is floating point

    Parameters
    ----------
    X : object with a ``dtype`` attribute

    estimator : string or object, 'This algorithm' by default
        Name used in the warning message; for a non-string object its class
        name is used.
    """
    if not isinstance(estimator, _string_types):
        estimator = estimator.__class__.__name__
    if X.dtype.kind != 'f':
        warnings.warn("%s assumes floating point values as input, "
                      "got %s" % (estimator, X.dtype))


def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance

    If seed is None, return the RandomState singleton used by np.random.
    If seed is an int (including NumPy integer scalars), return a new
    RandomState instance seeded with seed.
    If seed is already a RandomState instance, return it.
    Otherwise raise ValueError.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    # np.integer is listed explicitly so NumPy integer scalars are accepted
    # as seeds alongside built-in integers.
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % seed)