"""
Least Angle Regression algorithm. See the documentation on the
Generalized Linear Model for a complete discussion.
"""
# Author: Fabian Pedregosa <fabian.pedregosa@inria.fr>
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
#
# License: BSD Style.
import numpy as np
from scipy import linalg
from scipy.linalg.lapack import get_lapack_funcs
from .base import LinearModel
from ..utils import arrayfuncs
def lars_path(X, y, Xy=None, Gram=None, max_features=None,
alpha_min=0, method='lar', overwrite_X=False,
overwrite_Gram=False, verbose=False):
""" Compute Least Angle Regression and LASSO path
Parameters
-----------
X: array, shape: (n_samples, n_features)
Input data
y: array, shape: (n_samples)
Input targets
    Xy: array, shape: (n_features,), optional
        Precomputed product X' * y, useful when the Gram matrix is
        also precomputed.
    Gram: array, shape: (n_features, n_features), optional
        Precomputed Gram matrix (X' * X)
    max_features: integer, optional
        Maximum number of selected features.
    alpha_min: float, optional
        Minimum correlation along the path. It corresponds to the
        regularization parameter alpha in the Lasso.
    method: 'lar' | 'lasso'
        Specifies the returned model. Select 'lar' for Least Angle
        Regression, 'lasso' for the Lasso.
    overwrite_X: boolean, optional
        If True, X may be overwritten (no internal copy is made).
    overwrite_Gram: boolean, optional
        If True, the Gram matrix may be overwritten (no internal copy
        is made).
    verbose: boolean, optional
        If True, print the progress of the algorithm.
Returns
--------
alphas: array, shape: (max_features + 1,)
Maximum of covariances (in absolute value) at each
iteration.
active: array, shape (max_features,)
Indices of active variables at the end of the path.
coefs: array, shape (n_features, max_features+1)
Coefficients along the path
See also
--------
:ref:`LassoLARS`, :ref:`LARS`
Notes
------
* http://en.wikipedia.org/wiki/Least-angle_regression
* http://en.wikipedia.org/wiki/Lasso_(statistics)#LASSO_method
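
    Examples
    --------
    A minimal usage sketch (illustrative only; the values on the
    returned path depend on the input data):

    >>> import numpy as np
    >>> X = np.array([[-1., 1.], [0., 0.], [1., 1.]])
    >>> y = np.array([-1., 0., -1.])
    >>> alphas, active, coefs = lars_path(X, y, method='lasso')
    >>> coefs.shape == (X.shape[1], len(alphas))
    True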
"""
n_features = X.shape[1]
n_samples = y.size
if max_features is None:
max_features = min(n_samples, n_features)
coefs = np.zeros((max_features + 1, n_features))
alphas = np.zeros(max_features + 1)
n_iter, n_active = 0, 0
active, indices = list(), np.arange(n_features)
# holds the sign of covariance
sign_active = np.empty(max_features, dtype=np.int8)
drop = False
eps = np.finfo(X.dtype).eps
    # will hold the Cholesky factorization. Only the lower part is
    # referenced.
L = np.empty((max_features, max_features), dtype=X.dtype)
swap, nrm2 = linalg.get_blas_funcs(('swap', 'nrm2'), (X,))
potrs, = get_lapack_funcs(('potrs',), (X,))
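    # swap and nrm2 are BLAS routines (swap two vectors in place /
    # Euclidean norm); potrs is the LAPACK solver for A x = b given
    # the Cholesky factor of A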
if Gram is None:
if not overwrite_X:
            # force a copy. Making the array Fortran-ordered speeds up
            # the computation of the (partial) Gram matrix and makes
            # it easy to swap columns
X = X.copy('F')
else:
if not overwrite_Gram:
Gram = Gram.copy()
if Xy is None:
Cov = np.dot(X.T, y)
else:
Cov = Xy.copy()
if verbose:
print "Step\t\tAdded\t\tDropped\t\tActive set size\t\tC"
while 1:
if Cov.size:
C_idx = np.argmax(np.abs(Cov))
C_ = Cov[C_idx]
C = np.fabs(C_)
            # C is the largest absolute covariance; it is used below,
            # together with AA, to compute the step length gamma_
        else:
            # all variables are in the active set: update C directly
            # from the previous step instead of recomputing it
            if Gram is None:
                C -= gamma_ * np.abs(np.dot(X.T[0], eq_dir))
            else:
                C -= gamma_ * np.abs(np.dot(Gram[0], least_squares))
alphas[n_iter] = C / n_samples
        # Check for early stopping: the path is piecewise linear, so
        # the solution at alpha_min can be obtained by interpolating
        # between the last two points of the path
        if alphas[n_iter] < alpha_min:
            # interpolation factor 0 <= ss < 1
ss = (alphas[n_iter-1] - alpha_min) / (alphas[n_iter-1] -
alphas[n_iter])
coefs[n_iter] = coefs[n_iter-1] + ss*(coefs[n_iter] -
coefs[n_iter-1])
alphas[n_iter] = alpha_min
break
if n_active == max_features:
break
if not drop:
            # Update the Cholesky factorization of (Xa' * Xa)         #
            #                                                         #
            #          ( L   0 )                                      #
            #   L  ->  (       ) ,  where  L * w = Xa' * x_j          #
            #          ( w   z )    and  z**2 = x_j'x_j - ||w||**2    #
            #                                                         #
            # with x_j the variable last added to the active set      #
sign_active[n_active] = np.sign(C_)
m, n = n_active, C_idx+n_active
Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0])
indices[n], indices[m] = indices[m], indices[n]
Cov = Cov[1:] # remove Cov[0]
if Gram is None:
X.T[n], X.T[m] = swap(X.T[n], X.T[m])
c = nrm2(X.T[n_active])**2
L[n_active, :n_active] = \
np.dot(X.T[n_active], X.T[:n_active].T)
else:
                # swap only works in place if the matrix is
                # Fortran-contiguous ...
Gram[m], Gram[n] = swap(Gram[m], Gram[n])
Gram[:, m], Gram[:, n] = swap(Gram[:, m], Gram[:, n])
c = Gram[n_active, n_active]
L[n_active, :n_active] = Gram[n_active, :n_active]
            # compute the new row w of the Cholesky factor by solving
            # the triangular system L * w = b in place
            arrayfuncs.solve_triangular(L[:n_active, :n_active],
                                        L[n_active, :n_active])
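            # v = ||w||**2 and the new diagonal entry is
            # z = sqrt(x_j'x_j - ||w||**2), clamped at eps so the
            # factorization stays numerically positive definite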
v = np.dot(L[n_active, :n_active], L[n_active, :n_active])
diag = max(np.sqrt(np.abs(c - v)), eps)
L[n_active, n_active] = diag
active.append(indices[n_active])
n_active += 1
if verbose:
print "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, active[-1], '',
n_active, C)
# least squares solution
least_squares, info = potrs(L[:n_active, :n_active],
sign_active[:n_active], lower=True)
        # AA (A_A in Efron et al., 2004) normalizes the direction so
        # that the equiangular vector Xa * least_squares has unit norm
        AA = 1. / np.sqrt(np.sum(least_squares * sign_active[:n_active]))
        least_squares *= AA
if Gram is None:
# equiangular direction of variables in the active set
eq_dir = np.dot(X.T[:n_active].T, least_squares)
            # correlation between each inactive variable and the
            # equiangular vector
            corr_eq_dir = np.dot(X.T[n_active:], eq_dir)
else:
            # with a huge number of features this dot product can take
            # about 50% of the run time; it could probably be avoided
            # by maintaining a QR decomposition of X instead
corr_eq_dir = np.dot(Gram[:n_active, n_active:].T,
least_squares)
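        # step length along the equiangular direction: g1 and g2 give
        # the smallest positive step at which an inactive variable
        # reaches the same absolute correlation as the active set
        # (cf. Efron et al., 2004); C / AA caps the step at the point
        # where all correlations vanish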
g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir))
g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir))
gamma_ = min(g1, g2, C/AA)
# TODO: better names for these variables: z
drop = False
z = - coefs[n_iter, active] / least_squares
z_pos = arrayfuncs.min_pos(z)
if z_pos < gamma_:
# some coefficients have changed sign
idx = np.where(z == z_pos)[0]
# update the sign, important for LAR
sign_active[idx] = -sign_active[idx]
            if method == 'lasso':
                gamma_ = z_pos
drop = True
n_iter += 1
if n_iter >= coefs.shape[0]:
# resize the coefs and alphas array
add_features = 2 * (max_features - n_active)
coefs.resize((n_iter + add_features, n_features))
alphas.resize(n_iter + add_features)
coefs[n_iter, active] = coefs[n_iter-1, active] + \
gamma_ * least_squares
# update correlations
Cov -= gamma_ * corr_eq_dir
if n_active > n_features:
break
# See if any coefficient has changed sign
if drop and method == 'lasso':
arrayfuncs.cholesky_delete(L[:n_active, :n_active], idx)
n_active -= 1
m, n = idx, n_active
drop_idx = active.pop(idx)
if Gram is None:
# propagate dropped variable
for i in range(idx, n_active):
X.T[i], X.T[i+1] = swap(X.T[i], X.T[i+1])
                    indices[i], indices[i+1] = \
                        indices[i+1], indices[i]
                # TODO: the residual could be updated incrementally
                residual = y - np.dot(X[:, :n_active],
                                      coefs[n_iter, active])
temp = np.dot(X.T[n_active], residual)
Cov = np.r_[temp, Cov]
else:
for i in range(idx, n_active):
indices[i], indices[i+1] = \
indices[i+1], indices[i]
Gram[i], Gram[i+1] = swap(Gram[i], Gram[i+1])
Gram[:, i], Gram[:, i+1] = swap(Gram[:, i], Gram[:, i+1])
                # recompute the covariance of the dropped variable.
                # TODO: this could be updated incrementally
                # (Cov_n = Cov_j + x_j * X + increment(betas)); check
                # whether this still works with multiple drops. Note
                # that it may be wrong, as Xy is not swapped along
                # with the rest of the variables
                residual = y - np.dot(X, coefs[n_iter])
                temp = np.dot(X.T[drop_idx], residual)
                Cov = np.r_[temp, Cov]
sign_active = np.delete(sign_active, idx)
sign_active = np.append(sign_active, 0.) # just to maintain size
if verbose:
print "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, '', drop_idx,
n_active, abs(temp))
    # trim the coefs and alphas arrays in case of early stopping
alphas = alphas[:n_iter+1]
coefs = coefs[:n_iter+1]
return alphas, active, coefs.T
class LARS(LinearModel):
"""Least Angle Regression model a.k.a. LAR
Parameters
----------
    fit_intercept : boolean
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    verbose : boolean, optional
        If True, print the progress of the algorithm.
Attributes
----------
    `coef_` : array, shape = [n_features]
        parameter vector (w in the problem formulation)
`intercept_` : float
independent term in decision function.
Examples
--------
>>> from scikits.learn import linear_model
>>> clf = linear_model.LARS()
>>> clf.fit([[-1,1], [0, 0], [1, 1]], [-1, 0, -1], max_features=1)
LARS(verbose=False, fit_intercept=True)
>>> print clf.coef_
    [ 0.         -0.81649658]
References
----------
http://en.wikipedia.org/wiki/Least_angle_regression
See also
--------
lars_path, LassoLARS
"""
def __init__(self, fit_intercept=True, verbose=False):
self.fit_intercept = fit_intercept
self.verbose = verbose
self.method = 'lar'
    def fit(self, X, y, normalize=True, max_features=None,
            precompute='auto', overwrite_X=False, **params):
"""
Fit the model using X, y as training data.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training data.
y : array-like, shape = [n_samples]
Target values.
        normalize : boolean, optional
            If True, the regressors X are normalized to unit norm
            before fitting.
        max_features : integer, optional
            Maximum number of selected features.
        precompute : True | False | 'auto' | array-like
            Whether to use a precomputed Gram matrix to speed up
            calculations. If set to 'auto' let us decide. The Gram
            matrix can also be passed as argument.
        overwrite_X : boolean, optional
            If True, X may be overwritten (no internal copy is made).
Returns
-------
self : object
returns an instance of self.
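
        Examples
        --------
        An illustrative sketch with arbitrary data; a precomputed Gram
        matrix could equally be passed as the `precompute` argument:

        >>> import numpy as np
        >>> X = np.array([[-1., 1.], [0., 0.], [1., 1.]])
        >>> y = np.array([-1., 0., -1.])
        >>> clf = LARS().fit(X, y, max_features=1, precompute=True)
        >>> clf.coef_.shape
        (2,)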
"""
self._set_params(**params)
X = np.atleast_2d(X)
y = np.atleast_1d(y)
X, y, Xmean, ymean = LinearModel._center_data(X, y, self.fit_intercept)
n_samples = X.shape[0]
if self.method == 'lasso':
alpha = self.alpha * n_samples # scale alpha with number of samples
else:
alpha = 0.
if normalize:
norms = np.sqrt(np.sum(X**2, axis=0))
nonzeros = np.flatnonzero(norms)
X[:, nonzeros] /= norms[nonzeros]
# precompute if n_samples > n_features
if hasattr(precompute, '__array__'):
# copy as it's going to be modified
Gram = precompute.copy()
elif precompute == True or \
(precompute == 'auto' and X.shape[0] > X.shape[1]):
Gram = np.dot(X.T, X)
else:
Gram = None
self.alphas_, self.active_, self.coef_path_ = lars_path(X, y,
Gram=Gram, overwrite_X=overwrite_X,
overwrite_Gram=True, alpha_min=alpha,
method=self.method, verbose=self.verbose,
max_features=max_features)
        self.coef_ = self.coef_path_[:, -1]
self._set_intercept(Xmean, ymean)
return self
class LassoLARS(LARS):
""" Lasso model fit with Least Angle Regression a.k.a. LARS
    It is a Linear Model trained with an L1 prior as regularizer
    (a.k.a. the Lasso).
Parameters
----------
alpha : float, optional
Constant that multiplies the L1 term. Defaults to 1.0
    fit_intercept : boolean
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    verbose : boolean, optional
        If True, print the progress of the algorithm.
Attributes
----------
    `coef_` : array, shape = [n_features]
        parameter vector (w in the problem formulation)
`intercept_` : float
independent term in decision function.
Examples
--------
>>> from scikits.learn import linear_model
>>> clf = linear_model.LassoLARS(alpha=0.01)
>>> clf.fit([[-1,1], [0, 0], [1, 1]], [-1, 0, -1])
LassoLARS(alpha=0.01, verbose=False, fit_intercept=True)
>>> print clf.coef_
    [ 0.         -0.72649658]
References
----------
http://en.wikipedia.org/wiki/Least_angle_regression
See also
--------
lars_path, Lasso
"""
def __init__(self, alpha=1.0, fit_intercept=True, verbose=False):
self.alpha = alpha
self.fit_intercept = fit_intercept
self.verbose = verbose
self.method = 'lasso'