Ian Johnson / scikit-learn / Commits / 7f19dbeb

Commit 7f19dbeb authored 7 years ago by Albert Thomas, committed by Olivier Grisel 7 years ago

[MRG+1] Fix LOF and Isolation benchmarks (#9798)
parent f84581bf
Showing 2 changed files with 54 additions and 56 deletions:

benchmarks/bench_isolation_forest.py: 26 additions, 15 deletions
benchmarks/bench_lof.py: 28 additions, 41 deletions
benchmarks/bench_isolation_forest.py (+26, -15) @ 7f19dbeb
@@ -3,6 +3,17 @@
 IsolationForest benchmark
 ==========================================

 A test of IsolationForest on classical anomaly detection datasets.
+
+The benchmark is run as follows:
+1. The dataset is randomly split into a training set and a test set, both
+assumed to contain outliers.
+2. Isolation Forest is trained on the training set.
+3. The ROC curve is computed on the test set using the knowledge of the labels.
+
+Note that the smtp dataset contains a very small proportion of outliers.
+Therefore, depending on the seed of the random number generator, randomly
+splitting the data set might lead to a test set containing no outliers. In this
+case a warning is raised when computing the ROC curve.
 """
 from time import time
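Note (illustrative): the added docstring spells out the benchmark procedure (a random train/test split where both halves contain outliers, a fit on the train half, a ROC curve on the test half). Below is a minimal self-contained sketch of that procedure using synthetic data instead of the KDD Cup / covertype fetchers; the variable names are made up for this sketch and are not part of the benchmark itself.

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(1)

# 1. Build a dataset containing outliers and split it into train/test halves,
#    both assumed to contain outliers.
X_inliers = rng.randn(1000, 2)
X_outliers = rng.uniform(low=-6, high=6, size=(50, 2))
X = np.r_[X_inliers, X_outliers]
y = np.r_[np.zeros(1000, dtype=int), np.ones(50, dtype=int)]
idx = rng.permutation(len(X))
X, y = X[idx], y[idx]
n_train = len(X) // 2
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

# 2. Fit Isolation Forest on the (polluted) training set.
model = IsolationForest(random_state=1).fit(X_train)

# 3. Score the test set and compute the ROC curve from the known labels.
#    decision_function is high for inliers, so negate it to rank outliers first.
scores = -model.decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test, scores)
print("AUC: %0.3f" % auc(fpr, tpr))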
@@ -12,7 +23,7 @@ import matplotlib.pyplot as plt
 from sklearn.ensemble import IsolationForest
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils import shuffle as sh

 print(__doc__)
@@ -30,15 +41,14 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))


-np.random.seed(1)
+random_state = 1
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))

 # Set this to true for plotting score histograms for each dataset:
 with_decision_function_histograms = False

-# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
-# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
+# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

 # Loop over all datasets for fitting and scoring the estimator:
 for dat in datasets:
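Note (illustrative): the hunk above drops the global np.random.seed(1) in favour of an explicit random_state that is threaded through the data fetchers and the estimator, so reruns are reproducible even if other code touches the global NumPy RNG. A short sketch of that pattern, not the benchmark code itself (fetch_covtype downloads the covertype dataset on first use):

from sklearn.datasets import fetch_covtype
from sklearn.ensemble import IsolationForest

random_state = 1

# Every randomized component receives the same explicit seed.
dataset = fetch_covtype(shuffle=True, random_state=random_state)
model = IsolationForest(n_jobs=-1, random_state=random_state)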
@@ -47,7 +57,8 @@ for dat in datasets:
     print('====== %s ======' % dat)
     print('--- Fetching data...')

     if dat in ['http', 'smtp', 'SF', 'SA']:
-        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
+        dataset = fetch_kddcup99(subset=dat, shuffle=True,
+                                 percent10=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
@@ -55,7 +66,7 @@ for dat in datasets:
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
+        X, y = sh(X, y, random_state=random_state)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -65,7 +76,7 @@ for dat in datasets:
         print('-----')

     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype(shuffle=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -79,17 +90,17 @@ for dat in datasets:
     print('--- Vectorizing data...')

     if dat == 'SF':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)

     if dat == 'SA':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
-        x2 = lb.fit_transform(X[:, 2])
-        x3 = lb.fit_transform(X[:, 3])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
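Note (illustrative): the SF and SA branches now one-hot encode each categorical column with LabelBinarizer on the column cast to str. MultiLabelBinarizer treats each entry as an iterable of labels, so a column of plain strings gets split into individual characters, which is what this change fixes. A tiny standalone illustration of the LabelBinarizer pattern, using a toy column rather than the KDD Cup data:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

# Toy protocol column (the benchmark feeds X[:, 1].astype(str) here).
column = np.array(['tcp', 'udp', 'icmp', 'tcp'])

lb = LabelBinarizer()
one_hot = lb.fit_transform(column)

print(lb.classes_)   # ['icmp' 'tcp' 'udp']
print(one_hot)
# [[0 1 0]
#  [0 0 1]
#  [1 0 0]
#  [0 1 0]]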
@@ -108,7 +119,7 @@ for dat in datasets:
     y_test = y[n_samples_train:]

     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1)
+    model = IsolationForest(n_jobs=-1, random_state=random_state)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart
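Note (illustrative): the new docstring also warns that a random split of the smtp data can leave the test set without any outliers; in that case roc_curve has no positive samples and emits a warning. A synthetic illustration of that failure mode, with made-up labels and scores:

import numpy as np
from sklearn.metrics import roc_curve

# Synthetic stand-in for a test split that happens to contain no outliers.
y_test = np.zeros(100, dtype=int)                 # only the inlier class
scores = np.random.RandomState(0).rand(100)       # arbitrary anomaly scores

# With no positive samples, roc_curve raises an UndefinedMetricWarning and
# the true-positive rate is undefined (NaN).
fpr, tpr, thresholds = roc_curve(y_test, scores)
print(tpr)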
benchmarks/bench_lof.py (+28, -41) @ 7f19dbeb
@@ -5,6 +5,16 @@ LocalOutlierFactor benchmark
 A test of LocalOutlierFactor on classical anomaly detection datasets.

 Note that LocalOutlierFactor is not meant to predict on a test set and its
 performance is assessed in an outlier detection context:
+1. The model is trained on the whole dataset which is assumed to contain
+outliers.
+2. The ROC curve is computed on the same dataset using the knowledge of the
+labels.
+
+In this context there is no need to shuffle the dataset because the model
+is trained and tested on the whole dataset. The randomness of this benchmark
+is only caused by the random selection of anomalies in the SA dataset.
 """
 from time import time
@@ -14,23 +24,21 @@ from sklearn.neighbors import LocalOutlierFactor
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
 from sklearn.preprocessing import LabelBinarizer
-from sklearn.utils import shuffle as sh

 print(__doc__)

-np.random.seed(2)
+random_state = 2  # to control the random selection of anomalies in SA

 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['shuttle']
-
-novelty_detection = True  # if False, training set polluted by outliers
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

 plt.figure()
 for dataset_name in datasets:
     # loading and vectorization
     print('loading data')
     if dataset_name in ['http', 'smtp', 'SA', 'SF']:
-        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
-                                 percent10=False)
+        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
+                                 random_state=random_state)
         X = dataset.data
         y = dataset.target
@@ -38,7 +46,6 @@ for dataset_name in datasets:
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -47,7 +54,7 @@ for dataset_name in datasets:
         y = (y != 1).astype(int)

     if dataset_name == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype()
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -61,54 +68,34 @@ for dataset_name in datasets:
     if dataset_name == 'SF':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)

     if dataset_name == 'SA':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
-        lb.fit(X[:, 2])
-        x2 = lb.transform(X[:, 2])
-        lb.fit(X[:, 3])
-        x3 = lb.transform(X[:, 3])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)

     if dataset_name == 'http' or dataset_name == 'smtp':
-        y = (y != 'normal.').astype(int)
-
-    n_samples, n_features = np.shape(X)
-    n_samples_train = n_samples // 2
-    n_samples_test = n_samples - n_samples_train
+        y = (y != b'normal.').astype(int)

     X = X.astype(float)
-    X_train = X[:n_samples_train, :]
-    X_test = X[n_samples_train:, :]
-    y_train = y[:n_samples_train]
-    y_test = y[n_samples_train:]
-
-    if novelty_detection:
-        X_train = X_train[y_train == 0]
-        y_train = y_train[y_train == 0]

     print('LocalOutlierFactor processing...')
     model = LocalOutlierFactor(n_neighbors=20)
     tstart = time()
-    model.fit(X_train)
+    model.fit(X)
     fit_time = time() - tstart
-    tstart = time()
-
-    scoring = -model.decision_function(X_test)  # the lower, the more normal
-    predict_time = time() - tstart
-    fpr, tpr, thresholds = roc_curve(y_test, scoring)
+    scoring = -model.negative_outlier_factor_  # the lower, the more normal
+    fpr, tpr, thresholds = roc_curve(y, scoring)
     AUC = auc(fpr, tpr)
     plt.plot(fpr, tpr, lw=1,
-             label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
-                    ' test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
-                                             predict_time)))
+             label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
                    % (dataset_name, AUC, fit_time)))
     plt.xlim([-0.05, 1.05])
     plt.ylim([-0.05, 1.05])
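Note (illustrative): with this change the LOF benchmark fits on the full dataset and ranks samples with the fitted negative_outlier_factor_ attribute instead of calling decision_function on a held-out split. A sketch of that scoring pattern on synthetic data (not the benchmark datasets):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(2)

# Synthetic data: a dense blob of inliers plus a few scattered outliers.
X = np.r_[rng.randn(500, 2), rng.uniform(-6, 6, size=(25, 2))]
y = np.r_[np.zeros(500, dtype=int), np.ones(25, dtype=int)]

# Fit on the full (polluted) dataset; LOF scores the training samples only.
model = LocalOutlierFactor(n_neighbors=20)
model.fit(X)

# negative_outlier_factor_ is close to -1 for inliers, so negate it to get a
# score that increases with abnormality, then compute the ROC curve.
scoring = -model.negative_outlier_factor_
fpr, tpr, _ = roc_curve(y, scoring)
print("AUC: %0.3f" % auc(fpr, tpr))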