Commit 7f19dbeb authored by Albert Thomas, committed by Olivier Grisel

[MRG+1] Fix LOF and Isolation benchmarks (#9798)

parent f84581bf
benchmarks/bench_isolation_forest.py
@@ -3,6 +3,17 @@
 IsolationForest benchmark
 ==========================================

 A test of IsolationForest on classical anomaly detection datasets.
+
+The benchmark is run as follows:
+1. The dataset is randomly split into a training set and a test set, both
+assumed to contain outliers.
+2. Isolation Forest is trained on the training set.
+3. The ROC curve is computed on the test set using the knowledge of the labels.
+
+Note that the smtp dataset contains a very small proportion of outliers.
+Therefore, depending on the seed of the random number generator, randomly
+splitting the data set might lead to a test set containing no outliers. In this
+case a warning is raised when computing the ROC curve.
 """

 from time import time
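For illustration, the three steps described in the new docstring reduce to the following minimal sketch; the synthetic data and parameter values are stand-ins for the benchmark's real datasets, not part of the commit:

# Minimal sketch of the benchmark procedure on synthetic stand-in data.
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(1)
X = np.r_[rng.randn(500, 2), rng.uniform(-6, 6, size=(25, 2))]  # inliers + outliers
y = np.r_[np.zeros(500), np.ones(25)].astype(int)               # 1 = outlier

# 1. random split; outliers may land in both halves
idx = rng.permutation(len(X))
half = len(X) // 2
train, test = idx[:half], idx[half:]

# 2. train on the training set
model = IsolationForest(random_state=1).fit(X[train])

# 3. ROC on the test set; higher score = more abnormal
scoring = -model.decision_function(X[test])
fpr, tpr, _ = roc_curve(y[test], scoring)
print('AUC: %0.3f' % auc(fpr, tpr))

This split also explains the smtp caveat: with very few outliers, a random half can end up with no positive samples, and roc_curve then warns about a single-class y_true.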
@@ -12,7 +23,7 @@ import matplotlib.pyplot as plt
 from sklearn.ensemble import IsolationForest
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils import shuffle as sh

 print(__doc__)
@@ -30,15 +41,14 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))

-np.random.seed(1)
+random_state = 1
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))

 # Set this to true for plotting score histograms for each dataset:
 with_decision_function_histograms = False

-# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
-# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
+# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

 # Loop over all datasets for fitting and scoring the estimator:
 for dat in datasets:
@@ -47,7 +57,8 @@ for dat in datasets:
     print('====== %s ======' % dat)
     print('--- Fetching data...')
     if dat in ['http', 'smtp', 'SF', 'SA']:
-        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
+        dataset = fetch_kddcup99(subset=dat, shuffle=True,
+                                 percent10=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
@@ -55,7 +66,7 @@ for dat in datasets:
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
+        X, y = sh(X, y, random_state=random_state)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -65,7 +76,7 @@ for dat in datasets:
         print('----- ')

     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype(shuffle=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -79,17 +90,17 @@ for dat in datasets:
     print('--- Vectorizing data...')

     if dat == 'SF':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)

     if dat == 'SA':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
-        x2 = lb.fit_transform(X[:, 2])
-        x3 = lb.fit_transform(X[:, 3])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
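A note on why LabelBinarizer replaces MultiLabelBinarizer here: these KDD Cup 99 columns hold exactly one categorical value per row, which is the input LabelBinarizer expects, while MultiLabelBinarizer treats each entry as an iterable collection of labels, so a string like 'tcp' would be binarized character by character, which is presumably the bug this change fixes. The .astype(str) cast converts the byte strings returned by fetch_kddcup99 into plain strings before encoding. A small example with illustrative protocol values:

# One-hot encoding a column of single categorical values, as done for X[:, 1].
from sklearn.preprocessing import LabelBinarizer

protocols = ['tcp', 'udp', 'icmp', 'tcp']
lb = LabelBinarizer()
print(lb.fit_transform(protocols))
# [[0 1 0]
#  [0 0 1]
#  [1 0 0]
#  [0 1 0]]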
@@ -108,7 +119,7 @@ for dat in datasets:
     y_test = y[n_samples_train:]

     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1)
+    model = IsolationForest(n_jobs=-1, random_state=random_state)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart
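With a single random_state threaded through fetch_kddcup99, sh, fetch_covtype, and IsolationForest, each randomized step is reproducible on its own, unlike the previous global np.random.seed(1). A quick sanity check, assuming the dataset has already been downloaded (the subset choice is just an example):

# Two fetches with the same random_state should shuffle identically.
from sklearn.datasets import fetch_kddcup99

a = fetch_kddcup99(subset='smtp', shuffle=True, percent10=True, random_state=1)
b = fetch_kddcup99(subset='smtp', shuffle=True, percent10=True, random_state=1)
assert (a.data == b.data).all()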
benchmarks/bench_lof.py
@@ -5,6 +5,16 @@ LocalOutlierFactor benchmark
 A test of LocalOutlierFactor on classical anomaly detection datasets.
+
+Note that LocalOutlierFactor is not meant to predict on a test set and its
+performance is assessed in an outlier detection context:
+1. The model is trained on the whole dataset which is assumed to contain
+outliers.
+2. The ROC curve is computed on the same dataset using the knowledge of the
+labels.
+In this context there is no need to shuffle the dataset because the model
+is trained and tested on the whole dataset. The randomness of this benchmark
+is only caused by the random selection of anomalies in the SA dataset.
 """

 from time import time
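A minimal sketch of the evaluation scheme this docstring describes, on synthetic stand-in data: fit on the full dataset, then read the fitted negative_outlier_factor_ attribute instead of scoring a held-out test set.

# LocalOutlierFactor assessed in an outlier detection context.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(2)
X = np.r_[rng.randn(200, 2), rng.uniform(-6, 6, size=(10, 2))]
y = np.r_[np.zeros(200), np.ones(10)].astype(int)  # 1 = outlier

model = LocalOutlierFactor(n_neighbors=20)
model.fit(X)                               # train on the whole dataset
scoring = -model.negative_outlier_factor_  # higher = more abnormal
fpr, tpr, _ = roc_curve(y, scoring)        # ROC on the same dataset
print('AUC: %0.3f' % auc(fpr, tpr))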
@@ -14,23 +24,21 @@ from sklearn.neighbors import LocalOutlierFactor
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
 from sklearn.preprocessing import LabelBinarizer
-from sklearn.utils import shuffle as sh

 print(__doc__)

-np.random.seed(2)
+random_state = 2  # to control the random selection of anomalies in SA

 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['shuttle']
-
-novelty_detection = True  # if False, training set polluted by outliers
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

 plt.figure()
 for dataset_name in datasets:
     # loading and vectorization
     print('loading data')
     if dataset_name in ['http', 'smtp', 'SA', 'SF']:
-        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
-                                 percent10=False)
+        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
+                                 random_state=random_state)
         X = dataset.data
         y = dataset.target
@@ -38,7 +46,6 @@ for dataset_name in datasets:
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -47,7 +54,7 @@ for dataset_name in datasets:
         y = (y != 1).astype(int)

     if dataset_name == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype()
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -61,54 +68,34 @@ for dataset_name in datasets:
     if dataset_name == 'SF':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)

     if dataset_name == 'SA':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
-        lb.fit(X[:, 2])
-        x2 = lb.transform(X[:, 2])
-        lb.fit(X[:, 3])
-        x3 = lb.transform(X[:, 3])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)

     if dataset_name == 'http' or dataset_name == 'smtp':
-        y = (y != 'normal.').astype(int)
-
-    n_samples, n_features = np.shape(X)
-    n_samples_train = n_samples // 2
-    n_samples_test = n_samples - n_samples_train
+        y = (y != b'normal.').astype(int)

     X = X.astype(float)
-    X_train = X[:n_samples_train, :]
-    X_test = X[n_samples_train:, :]
-    y_train = y[:n_samples_train]
-    y_test = y[n_samples_train:]
-
-    if novelty_detection:
-        X_train = X_train[y_train == 0]
-        y_train = y_train[y_train == 0]

     print('LocalOutlierFactor processing...')
     model = LocalOutlierFactor(n_neighbors=20)
     tstart = time()
-    model.fit(X_train)
+    model.fit(X)
     fit_time = time() - tstart
-    tstart = time()
-    scoring = -model.decision_function(X_test)  # the lower, the more normal
-    predict_time = time() - tstart
-    fpr, tpr, thresholds = roc_curve(y_test, scoring)
+    scoring = -model.negative_outlier_factor_  # the lower, the more normal
+    fpr, tpr, thresholds = roc_curve(y, scoring)
     AUC = auc(fpr, tpr)
     plt.plot(fpr, tpr, lw=1,
-             label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
-                    'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
-                                            predict_time)))
+             label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
                    % (dataset_name, AUC, fit_time)))

 plt.xlim([-0.05, 1.05])
 plt.ylim([-0.05, 1.05])
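For reference, roc_curve expects y with 1 marking the outliers and a score where higher means more abnormal, which is why both benchmarks negate their raw scores. A toy example with invented values:

# ROC/AUC semantics used by both benchmarks.
import numpy as np
from sklearn.metrics import roc_curve, auc

y = np.array([0, 0, 1, 1])                  # 1 = outlier
scoring = np.array([0.1, 0.4, 0.35, 0.8])   # higher = more abnormal
fpr, tpr, thresholds = roc_curve(y, scoring)
print(auc(fpr, tpr))  # 0.75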