forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bench_lof.py
111 lines (94 loc) · 3.41 KB
/
bench_lof.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
============================
LocalOutlierFactor benchmark
============================
A test of LocalOutlierFactor on classical anomaly detection datasets.
Note that LocalOutlierFactor is not meant to predict on a test set and its
performance is assessed in an outlier detection context:
1. The model is trained on the whole dataset which is assumed to contain
outliers.
2. The ROC curve is computed on the same dataset using the knowledge of the
labels.
In this context there is no need to shuffle the dataset because the model
is trained and tested on the whole dataset. The randomness of this benchmark
is only caused by the random selection of anomalies in the SA dataset.
"""
from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
from sklearn.preprocessing import LabelBinarizer
print(__doc__)
# Fixed seed: fetch_kddcup99 subsamples anomalies for the SA subset, and this
# seed makes that selection (the only randomness in the benchmark) reproducible.
random_state = 2  # to control the random selection of anomalies in SA
# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ["http", "smtp", "SA", "SF", "shuttle", "forestcover"]
# One shared figure: every dataset's ROC curve is plotted onto the same axes.
plt.figure()
for dataset_name in datasets:
    # --- loading and vectorization -------------------------------------
    print("loading data")
    if dataset_name in ["http", "smtp", "SA", "SF"]:
        dataset = fetch_kddcup99(
            subset=dataset_name, percent10=True, random_state=random_state
        )
        X = dataset.data
        y = dataset.target
    if dataset_name == "shuttle":
        # as_frame=False forces a numpy array: the default ('auto') returns a
        # pandas DataFrame, which breaks the X[s, :] boolean indexing below.
        dataset = fetch_openml("shuttle", as_frame=False)
        X = dataset.data
        # OpenML serves the target as strings ("1".."7"); cast to integers so
        # the comparisons against 4 and 1 below actually match.
        y = dataset.target.astype(np.int64)
        # we remove data with label 4
        # normal data are then those of class 1
        s = y != 4
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)  # 0 = normal (class 1), 1 = anomaly
    if dataset_name == "forestcover":
        dataset = fetch_covtype()
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) | (y == 4)  # element-wise boolean or on the masks
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)  # 0 = normal (class 2), 1 = anomaly
    print("vectorizing data")
    if dataset_name == "SF":
        # One-hot encode the single categorical column (protocol type).
        lb = LabelBinarizer()
        x1 = lb.fit_transform(X[:, 1].astype(str))
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != b"normal.").astype(int)  # KDD labels are bytes
    if dataset_name == "SA":
        # One-hot encode the three categorical columns (protocol, service,
        # flag); fit_transform refits the binarizer for each column.
        lb = LabelBinarizer()
        x1 = lb.fit_transform(X[:, 1].astype(str))
        x2 = lb.fit_transform(X[:, 2].astype(str))
        x3 = lb.fit_transform(X[:, 3].astype(str))
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != b"normal.").astype(int)
    if dataset_name == "http" or dataset_name == "smtp":
        y = (y != b"normal.").astype(int)
    X = X.astype(float)
    # --- fit and score ---------------------------------------------------
    print("LocalOutlierFactor processing...")
    model = LocalOutlierFactor(n_neighbors=20)
    tstart = time()
    # Fit on the whole (contaminated) dataset: LOF is evaluated here as an
    # outlier detector, not on a held-out test set (see module docstring).
    model.fit(X)
    fit_time = time() - tstart
    # negative_outlier_factor_ is high for inliers; negate so that a higher
    # score means "more abnormal", as roc_curve expects for positive labels.
    scoring = -model.negative_outlier_factor_  # the lower, the more normal
    fpr, tpr, thresholds = roc_curve(y, scoring)
    AUC = auc(fpr, tpr)
    plt.plot(
        fpr,
        tpr,
        lw=1,
        label="ROC for %s (area = %0.3f, train-time: %0.2fs)"
        % (dataset_name, AUC, fit_time),
    )
# Finalize the shared ROC figure. Axis labels and title first, then pad the
# axes a little beyond [0, 1] so curves that hug the borders remain visible,
# and place the legend where ROC curves rarely reach.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.legend(loc="lower right")
plt.show()