From 5a5d2fc6b16b0be176ab9132a6badf884de2b4ae Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Sun, 28 Nov 2010 23:40:58 +0100
Subject: [PATCH] PEP8 + various cosmits in sample generators

---
 scikits/learn/datasets/samples_generator.py | 159 +++++++++++---------
 1 file changed, 88 insertions(+), 71 deletions(-)

diff --git a/scikits/learn/datasets/samples_generator.py b/scikits/learn/datasets/samples_generator.py
index dcac19f7af..be11dff5d8 100644
--- a/scikits/learn/datasets/samples_generator.py
+++ b/scikits/learn/datasets/samples_generator.py
@@ -2,38 +2,36 @@
 Generate samples of synthetic data sets.
 """
 
-# Author: B. Thirion, G. Varoquaux, A. Gramfort, V. Michel
+# Author: B. Thirion, G. Varoquaux, A. Gramfort, V. Michel, O. Grisel
 # License: BSD 3 clause
 
 import numpy as np
 import numpy.random as nr
 
 
-
-def samples_classif():
-    pass
-
-######################################################################
-# Generate Dataset for test
-######################################################################
-
 def test_dataset_classif(n_samples=100, n_features=100, param=[1,1],
-                             k=0, seed=None):
-    """
-    Generate an snp matrix
+                         n_informative=0, k=0, seed=None):
+    """Generate an snp matrix
 
     Parameters
     ----------
     n_samples : 100, int,
-        the number of subjects
+        the number of observations
+
     n_features : 100, int,
-        the number of features
-    param : [1,1], list,
+        the number of features for each observation
+
+    param : [1, 1], list,
         parameter of a dirichlet density
         that is used to generate multinomial densities
-        from which the n_featuress will be samples
-    k : 0, int,
+        from which the n_features will be sampled
+
+    n_informative : 0, int
         number of informative features
+
+    k : 0, int
+        deprecated: use n_informative instead
+
     seed : None, int or np.random.RandomState
         if seed is an instance of np.random.RandomState,
         it is used to initialize the random generator
@@ -42,12 +40,18 @@ def test_dataset_classif(n_samples=100, n_features=100, param=[1,1],
     -------
     x : array of shape(n_samples, n_features),
         the design matrix
+
     y : array of shape (n_samples),
         the subject labels
 
     """
-    assert k<=n_features, ValueError('cannot have %d informative features and'
-                                   ' %d features' % (k, n_features))
+    if k > 0 and n_informative == 0:
+        n_informative = k
+
+    if n_informative > n_features:
+        raise ValueError('cannot have %d informative features and'
+                         ' %d features' % (n_informative, n_features))
+
     if isinstance(seed, np.random.RandomState):
         random = seed
     elif seed is not None:
@@ -59,22 +63,29 @@ def test_dataset_classif(n_samples=100, n_features=100, param=[1,1],
     y = np.zeros(n_samples)
     param = np.ravel(np.array(param)).astype(np.float)
     for n in range(n_samples):
-        y[n] = np.nonzero(random.multinomial(1, param/param.sum()))[0]
-    x[:,:k] += 3*y[:,np.newaxis]
+        y[n] = np.nonzero(random.multinomial(1, param / param.sum()))[0]
+    x[:, :n_informative] += 3 * y[:, np.newaxis]  # k already folded in
     return x, y
 
-def test_dataset_reg(n_samples=100, n_features=100, k=0, seed=None):
-    """
-    Generate an snp matrix
+
+def test_dataset_reg(n_samples=100, n_features=100, n_informative=0, k=0,
+                     seed=None):
+    """Generate an snp matrix
 
     Parameters
     ----------
-    n_samples : 100, int,
+    n_samples : 100, int
         the number of subjects
-    n_features : 100, int,
+
+    n_features : 100, int
         the number of features
-    k : 0, int,
+
+    n_informative : 0, int
         number of informative features
+
+    k : 0, int
+        deprecated: use n_informative instead
+
     seed : None, int or np.random.RandomState
         if seed is an instance of np.random.RandomState,
         it is used to initialize the random generator
@@ -83,12 +94,17 @@ def test_dataset_reg(n_samples=100, n_features=100, k=0, seed=None):
     -------
     x : array of shape(n_samples, n_features),
         the design matrix
+
     y : array of shape (n_samples),
         the subject data
-
     """
-    assert k<n_features, ValueError('cannot have %d informative fetaures and'
-                                   ' %d features' % (k, n_features))
+    if k > 0 and n_informative == 0:
+        n_informative = k
+
+    if n_informative > n_features:
+        raise ValueError('cannot have %d informative features and'
+                         ' %d features' % (n_informative, n_features))
+
     if isinstance(seed, np.random.RandomState):
         random = seed
     elif seed is not None:
@@ -98,67 +114,68 @@ def test_dataset_reg(n_samples=100, n_features=100, k=0, seed=None):
 
     x = random.randn(n_samples, n_features)
     y = random.randn(n_samples)
-    x[:,:k] += y[:, np.newaxis]
+    x[:, :n_informative] += y[:, np.newaxis]  # k already folded in
     return x, y
 
 
+def sparse_uncorrelated(n_samples=100, n_features=10):
+    """Function creating simulated data with sparse uncorrelated design
 
+    (cf. Celeux et al. 2009, Bayesian regularization in regression)
 
-
-######################################################################
-# Generate Dataset for regression
-######################################################################
-
-
-def sparse_uncorrelated(nb_samples=100, nb_features=10):
-    """
-    Function creating simulated data with sparse uncorrelated design.
-    (cf.Celeux et al. 2009,  Bayesian regularization in regression)
-    X = NR.normal(0,1)
-    Y = NR.normal(X[:,0]+2*X[:,1]-2*X[:,2]-1.5*X[:,3])
+    X = NR.normal(0, 1)
+    Y = NR.normal(X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3])
     The number of features is at least 10.
 
     Parameters
     ----------
-    nb_samples : int
-                 number of samples (default is 100).
-    nb_features : int
-                  number of features (default is 5).
+    n_samples : int
+        number of samples (default is 100).
+    n_features : int
+        number of features (default is 10).
 
     Returns
     -------
-    X : numpy array of shape (nb_samples, nb_features) for input samples
-    Y : numpy array of shape (nb_samples) for labels
+    X : numpy array of shape (n_samples, n_features) for input samples
+    y : numpy array of shape (n_samples) for labels
     """
-    X = nr.normal(loc=0, scale=1, size=(nb_samples, nb_features))
-    Y = nr.normal(loc=X[:, 0] + 2 * X[:, 1] - 2 * X[:,2] - 1.5 * X[:, 3],
-                  scale = np.ones(nb_samples))
-    return X, Y
+    X = nr.normal(loc=0, scale=1, size=(n_samples, n_features))
+    y = nr.normal(loc=X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3],
+                  scale=np.ones(n_samples))
+    return X, y
 
 
-def friedman(nb_samples=100, nb_features=10,noise_std=1):
-    """
-    Function creating simulated data with non linearities
-    (cf.Friedman 1993)
-    X = NR.normal(0,1)
-    Y = 10*sin(X[:,0]*X[:,1]) + 20*(X[:,2]-0.5)**2 + 10*X[:,3] + 5*X[:,4]
+def friedman(n_samples=100, n_features=10, noise_std=1):
+    """Function creating simulated data with non linearities
+
+    cf. Friedman 1993
+
+    X = np.random.normal(0, 1)
+
+    y = 10 * sin(X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 \
+            + 10 * X[:, 3] + 5 * X[:, 4]
+
     The number of features is at least 5.
 
     Parameters
     ----------
-    nb_samples : int
-                 number of samples (default is 100).
-    nb_features : int
-                  number of features (default is 10).
+    n_samples : int
+        number of samples (default is 100).
+
+    n_features : int
+        number of features (default is 10).
+
     noise_std : float
-		std of the noise, which is added as noise_std*NR.normal(0,1)
+        std of the noise, which is added as noise_std*NR.normal(0,1)
+
     Returns
     -------
-    X : numpy array of shape (nb_samples, nb_features) for input samples
-    Y : numpy array of shape (nb_samples) for labels
-
+    X : numpy array of shape (n_samples, n_features) for input samples
+    y : numpy array of shape (n_samples,) for labels
     """
-    X = nr.normal(loc=0, scale=1, size=(nb_samples, nb_features))
-    Y = 10*np.sin(X[:,0]*X[:,1]) + 20*(X[:,2]-0.5)**2 + 10*X[:,3] + 5*X[:,4]
-    Y += noise_std*nr.normal(loc=0,scale=1,size=(nb_samples))
-    return X,Y
+    X = nr.normal(loc=0, scale=1, size=(n_samples, n_features))
+    y = 10 * np.sin(X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 \
+            + 10 * X[:, 3] + 5 * X[:, 4]
+    y += noise_std * nr.normal(loc=0, scale=1, size=n_samples)
+    return X, y
+
-- 
GitLab