Commit 7f19dbeb authored by Albert Thomas, committed by Olivier Grisel

[MRG+1] Fix LOF and Isolation benchmarks (#9798)

parent f84581bf
benchmarks/bench_isolation_forest.py
@@ -3,6 +3,17 @@
 IsolationForest benchmark
 ==========================================

 A test of IsolationForest on classical anomaly detection datasets.
+
+The benchmark is run as follows:
+1. The dataset is randomly split into a training set and a test set, both
+assumed to contain outliers.
+2. Isolation Forest is trained on the training set.
+3. The ROC curve is computed on the test set using the knowledge of the labels.
+
+Note that the smtp dataset contains a very small proportion of outliers.
+Therefore, depending on the seed of the random number generator, randomly
+splitting the data set might lead to a test set containing no outliers. In this
+case a warning is raised when computing the ROC curve.
 """

 from time import time
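For illustration, the three steps described in the new docstring reduce to the following minimal sketch; the synthetic data and parameter values are stand-ins for the benchmark's real datasets, not part of the commit:

# Minimal sketch of the benchmark procedure on synthetic stand-in data.
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(1)
X = np.r_[rng.randn(500, 2), rng.uniform(-6, 6, size=(25, 2))]  # inliers + outliers
y = np.r_[np.zeros(500), np.ones(25)].astype(int)               # 1 = outlier

# 1. random split; outliers may land in both halves
idx = rng.permutation(len(X))
half = len(X) // 2
train, test = idx[:half], idx[half:]

# 2. train on the training set
model = IsolationForest(random_state=1).fit(X[train])

# 3. ROC on the test set; higher score = more abnormal
scoring = -model.decision_function(X[test])
fpr, tpr, _ = roc_curve(y[test], scoring)
print('AUC: %0.3f' % auc(fpr, tpr))

This split also explains the smtp caveat: with very few outliers, a random half can end up with no positive samples, and roc_curve then warns about a single-class y_true.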
@@ -12,7 +23,7 @@ import matplotlib.pyplot as plt
 from sklearn.ensemble import IsolationForest
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import LabelBinarizer
 from sklearn.utils import shuffle as sh

 print(__doc__)
@@ -30,15 +41,14 @@ def print_outlier_ratio(y):
     print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y)))

-np.random.seed(1)
+random_state = 1
 fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))

 # Set this to true for plotting score histograms for each dataset:
 with_decision_function_histograms = False

-# Removed the shuttle dataset because as of 2017-03-23 mldata.org is down:
-# datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
+# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

 # Loop over all datasets for fitting and scoring the estimator:
 for dat in datasets:
@@ -47,7 +57,8 @@ for dat in datasets:
     print('====== %s ======' % dat)
     print('--- Fetching data...')
     if dat in ['http', 'smtp', 'SF', 'SA']:
-        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
+        dataset = fetch_kddcup99(subset=dat, shuffle=True,
+                                 percent10=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
@@ -55,7 +66,7 @@ for dat in datasets:
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
+        X, y = sh(X, y, random_state=random_state)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -65,7 +76,7 @@ for dat in datasets:
         print('----- ')

     if dat == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype(shuffle=True, random_state=random_state)
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -79,17 +90,17 @@ for dat in datasets:
     print('--- Vectorizing data...')

     if dat == 'SF':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)

     if dat == 'SA':
-        lb = MultiLabelBinarizer()
-        x1 = lb.fit_transform(X[:, 1])
-        x2 = lb.fit_transform(X[:, 2])
-        x3 = lb.fit_transform(X[:, 3])
+        lb = LabelBinarizer()
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
         y = (y != b'normal.').astype(int)
         print_outlier_ratio(y)
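A note on why LabelBinarizer replaces MultiLabelBinarizer here: these KDD Cup 99 columns hold exactly one categorical value per row, which is the input LabelBinarizer expects, while MultiLabelBinarizer treats each entry as an iterable collection of labels, so a string like 'tcp' would be binarized character by character, which is presumably the bug this change fixes. The .astype(str) cast converts the byte strings returned by fetch_kddcup99 into plain strings before encoding. A small example with illustrative protocol values:

# One-hot encoding a column of single categorical values, as done for X[:, 1].
from sklearn.preprocessing import LabelBinarizer

protocols = ['tcp', 'udp', 'icmp', 'tcp']
lb = LabelBinarizer()
print(lb.fit_transform(protocols))
# [[0 1 0]
#  [0 0 1]
#  [1 0 0]
#  [0 1 0]]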
@@ -108,7 +119,7 @@ for dat in datasets:
     y_test = y[n_samples_train:]

     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1)
+    model = IsolationForest(n_jobs=-1, random_state=random_state)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart
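With a single random_state threaded through fetch_kddcup99, sh, fetch_covtype, and IsolationForest, each randomized step is reproducible on its own, unlike the previous global np.random.seed(1). A quick sanity check, assuming the dataset has already been downloaded (the subset choice is just an example):

# Two fetches with the same random_state should shuffle identically.
from sklearn.datasets import fetch_kddcup99

a = fetch_kddcup99(subset='smtp', shuffle=True, percent10=True, random_state=1)
b = fetch_kddcup99(subset='smtp', shuffle=True, percent10=True, random_state=1)
assert (a.data == b.data).all()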
benchmarks/bench_lof.py
@@ -5,6 +5,16 @@ LocalOutlierFactor benchmark
 A test of LocalOutlierFactor on classical anomaly detection datasets.
+
+Note that LocalOutlierFactor is not meant to predict on a test set and its
+performance is assessed in an outlier detection context:
+1. The model is trained on the whole dataset which is assumed to contain
+outliers.
+2. The ROC curve is computed on the same dataset using the knowledge of the
+labels.
+In this context there is no need to shuffle the dataset because the model
+is trained and tested on the whole dataset. The randomness of this benchmark
+is only caused by the random selection of anomalies in the SA dataset.
 """

 from time import time
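A minimal sketch of the evaluation scheme this docstring describes, on synthetic stand-in data: fit on the full dataset, then read the fitted negative_outlier_factor_ attribute instead of scoring a held-out test set.

# LocalOutlierFactor assessed in an outlier detection context.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(2)
X = np.r_[rng.randn(200, 2), rng.uniform(-6, 6, size=(10, 2))]
y = np.r_[np.zeros(200), np.ones(10)].astype(int)  # 1 = outlier

model = LocalOutlierFactor(n_neighbors=20)
model.fit(X)                               # train on the whole dataset
scoring = -model.negative_outlier_factor_  # higher = more abnormal
fpr, tpr, _ = roc_curve(y, scoring)        # ROC on the same dataset
print('AUC: %0.3f' % auc(fpr, tpr))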
@@ -14,23 +24,21 @@ from sklearn.neighbors import LocalOutlierFactor
 from sklearn.metrics import roc_curve, auc
 from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
 from sklearn.preprocessing import LabelBinarizer
-from sklearn.utils import shuffle as sh

 print(__doc__)

-np.random.seed(2)
+random_state = 2  # to control the random selection of anomalies in SA

 # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
-datasets = ['shuttle']
-
-novelty_detection = True  # if False, training set polluted by outliers
+datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

 plt.figure()
 for dataset_name in datasets:
     # loading and vectorization
     print('loading data')
     if dataset_name in ['http', 'smtp', 'SA', 'SF']:
-        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
-                                 percent10=False)
+        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
+                                 random_state=random_state)
         X = dataset.data
         y = dataset.target
@@ -38,7 +46,6 @@ for dataset_name in datasets:
         dataset = fetch_mldata('shuttle')
         X = dataset.data
         y = dataset.target
-        X, y = sh(X, y)
         # we remove data with label 4
         # normal data are then those of class 1
         s = (y != 4)
@@ -47,7 +54,7 @@ for dataset_name in datasets:
         y = (y != 1).astype(int)

     if dataset_name == 'forestcover':
-        dataset = fetch_covtype(shuffle=True)
+        dataset = fetch_covtype()
         X = dataset.data
         y = dataset.target
         # normal data are those with attribute 2
@@ -61,54 +68,34 @@ for dataset_name in datasets:
     if dataset_name == 'SF':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
         X = np.c_[X[:, :1], x1, X[:, 2:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)

     if dataset_name == 'SA':
         lb = LabelBinarizer()
-        lb.fit(X[:, 1])
-        x1 = lb.transform(X[:, 1])
-        lb.fit(X[:, 2])
-        x2 = lb.transform(X[:, 2])
-        lb.fit(X[:, 3])
-        x3 = lb.transform(X[:, 3])
+        x1 = lb.fit_transform(X[:, 1].astype(str))
+        x2 = lb.fit_transform(X[:, 2].astype(str))
+        x3 = lb.fit_transform(X[:, 3].astype(str))
         X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
-        y = (y != 'normal.').astype(int)
+        y = (y != b'normal.').astype(int)

     if dataset_name == 'http' or dataset_name == 'smtp':
-        y = (y != 'normal.').astype(int)
-
-    n_samples, n_features = np.shape(X)
-    n_samples_train = n_samples // 2
-    n_samples_test = n_samples - n_samples_train
+        y = (y != b'normal.').astype(int)

     X = X.astype(float)
-    X_train = X[:n_samples_train, :]
-    X_test = X[n_samples_train:, :]
-    y_train = y[:n_samples_train]
-    y_test = y[n_samples_train:]
-
-    if novelty_detection:
-        X_train = X_train[y_train == 0]
-        y_train = y_train[y_train == 0]

     print('LocalOutlierFactor processing...')
     model = LocalOutlierFactor(n_neighbors=20)
     tstart = time()
-    model.fit(X_train)
+    model.fit(X)
     fit_time = time() - tstart
-    tstart = time()
-    scoring = -model.decision_function(X_test)  # the lower, the more normal
-    predict_time = time() - tstart
-    fpr, tpr, thresholds = roc_curve(y_test, scoring)
+    scoring = -model.negative_outlier_factor_  # the lower, the more normal
+    fpr, tpr, thresholds = roc_curve(y, scoring)
     AUC = auc(fpr, tpr)
     plt.plot(fpr, tpr, lw=1,
-             label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
-                    'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
-                                            predict_time)))
+             label=('ROC for %s (area = %0.3f, train-time: %0.2fs)'
                    % (dataset_name, AUC, fit_time)))

 plt.xlim([-0.05, 1.05])
 plt.ylim([-0.05, 1.05])
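For reference, roc_curve expects y with 1 marking the outliers and a score where higher means more abnormal, which is why both benchmarks negate their raw scores. A toy example with invented values:

# ROC/AUC semantics used by both benchmarks.
import numpy as np
from sklearn.metrics import roc_curve, auc

y = np.array([0, 0, 1, 1])                  # 1 = outlier
scoring = np.array([0.1, 0.4, 0.35, 0.8])   # higher = more abnormal
fpr, tpr, thresholds = roc_curve(y, scoring)
print(auc(fpr, tpr))  # 0.75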