-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathplot_anomalies.py
91 lines (77 loc) · 3.59 KB
/
plot_anomalies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, r2_score, mean_absolute_error,
confusion_matrix, f1_score, average_precision_score)
import matplotlib.pyplot as plt
import sys
import time
from iforest import IsolationTreeEnsemble, find_TPR_threshold
def _figure_filename(prefix, n_trees, desired_TPR, percentile):
    """Build the SVG filename that encodes the run's settings."""
    if desired_TPR is not None:
        # ':g' formatting avoids the truncation int() suffers from on values
        # such as 0.29 * 100 == 28.999999999999996 (which int() turns into 28).
        tag = f"{desired_TPR * 100:g}"
    else:
        tag = f"p{percentile:g}"
    return f"{prefix}-{n_trees}-{tag}.svg"


def plot_anomalies(X, y, sample_size=256, n_trees=100, desired_TPR=None,
                   percentile=None, normal_ymax=None, bins=20,
                   output_prefix=None):
    """Fit an IsolationTreeEnsemble on X, score X, and plot score histograms.

    The function times the fit and scoring steps, picks a score threshold,
    prints the confusion matrix, F1 and average-precision figures, draws two
    stacked histograms (scores where ``y == 0`` on top, scores where
    ``y == 1`` below) with the threshold marked, saves the figure as SVG and
    finally shows it.

    Parameters
    ----------
    X : passed to ``IsolationTreeEnsemble.fit`` and ``.anomaly_score``.
    y : labels aligned with X; entries equal to 0 are treated as normal and
        entries equal to 1 as anomalies.
    sample_size, n_trees : forwarded to ``IsolationTreeEnsemble``.
    desired_TPR : if given, the threshold comes from
        ``find_TPR_threshold(y, scores, desired_TPR)``.
    percentile : used only when ``desired_TPR`` is None; the threshold is
        ``np.percentile(scores, percentile)``.
    normal_ymax : optional upper y-limit for the top (normal) histogram.
    bins : number of histogram bins for both plots.
    output_prefix : filename prefix for the saved SVG. When None, the
        module-level ``datafile`` (set by the command-line entry point) is
        used with its extension removed, or ``"anomalies"`` if it is not set.

    Raises
    ------
    ValueError
        If neither ``desired_TPR`` nor ``percentile`` is supplied.
    """
    import os  # local import: only needed to derive the output filename

    if desired_TPR is None and percentile is None:
        raise ValueError("plot_anomalies needs either desired_TPR or percentile")

    if output_prefix is None:
        # Keep the script's historical behaviour (prefix derived from the
        # global `datafile`) without crashing when the function is imported
        # and that global does not exist.
        source_name = globals().get("datafile")
        output_prefix = os.path.splitext(source_name)[0] if source_name else "anomalies"

    N = len(X)

    it = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

    fit_start = time.time()
    it.fit(X)
    fit_time = time.time() - fit_start
    print(f"fit time {fit_time:3.2f}s")

    score_start = time.time()
    scores = it.anomaly_score(X)
    score_time = time.time() - score_start
    print(f"score time {score_time:3.2f}s")

    if desired_TPR is not None:
        threshold, FPR = find_TPR_threshold(y, scores, desired_TPR)
        print(f"Computed {desired_TPR:.4f} TPR threshold {threshold:.4f} with FPR {FPR:.4f}")
    else:
        threshold = np.percentile(scores, percentile)

    y_pred = it.predict_from_anomaly_scores(scores, threshold=threshold)
    confusion = confusion_matrix(y, y_pred)
    print(confusion)
    TN, FP, FN, TP = confusion.flat
    # Rates are recomputed from the confusion matrix so they are available on
    # the percentile path as well.
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)

    normal = scores[y == 0]
    anomalies = scores[y == 1]
    # Guard the ratio so a dataset without normal samples does not abort the
    # plot with ZeroDivisionError.
    anomaly_ratio = len(anomalies) / len(normal) if len(normal) else float("nan")

    F1 = f1_score(y, y_pred)
    PR = average_precision_score(y, scores)
    print(f"Proportion anomalies/normal = {len(anomalies)}/{len(normal)} = {anomaly_ratio*100:.1f}%")
    print(f"F1 score {F1:.4f}, avg PR {PR:.4f}")

    fig, axes = plt.subplots(2, 1, sharex=True)
    counts0, binlocs0, _ = axes[0].hist(normal, color='#c7e9b4', bins=bins)
    counts1, binlocs1, _ = axes[1].hist(anomalies, color='#fee090', bins=bins)
    axes[1].set_xlabel("Anomaly score")
    axes[0].set_ylabel("Normal sample count")
    axes[1].set_ylabel("Anomalous sample count")

    # Dashed vertical line marking the decision threshold on both histograms.
    top0 = max(counts0)
    top1 = max(counts1)
    axes[0].plot([threshold, threshold], [0, top0], '--', color='grey')
    axes[1].plot([threshold, threshold], [0, top1], '--', color='grey')

    # Summary text is right-aligned just inside the right edge of the axes.
    text_xr = 0.97 * axes[0].get_xlim()[1]
    axes[0].text(text_xr, .85 * top0, f"N {N}, {n_trees} trees", horizontalalignment='right')
    axes[0].text(text_xr, .75 * top0, f"F1 score {F1:.4f}, avg PR {PR:.4f}", horizontalalignment='right')
    axes[0].text(text_xr, .65 * top0, f"TPR {TPR:.4f}, FPR {FPR:.4f}", horizontalalignment='right')
    axes[0].text(threshold + .005, .20 * top0, f"score threshold {threshold:.3f}")
    axes[0].text(threshold + .005, .10 * top0, f"True anomaly rate {anomaly_ratio:.4f}")

    if normal_ymax is not None:
        axes[0].set_ylim(0, normal_ymax)

    plt.tight_layout()
    plt.savefig(_figure_filename(output_prefix, n_trees, desired_TPR, percentile),
                bbox_inches='tight',
                pad_inches=0)
    plt.show()
if __name__ == '__main__':
    # launch with "python plot_anomalies.py http.csv attack 20000 256 100 99"
    # or, "python plot_anomalies.py cancer.csv diagnosis all 5 1000 80"
    #
    # Arguments: <datafile> <targetcol> <N|all> <sample_size> <n_trees>
    #            <desired TPR as a percentage>
    if len(sys.argv) < 7:
        # Fail with a usage message instead of a bare IndexError.
        sys.exit(f"usage: python {sys.argv[0]} <datafile.csv> <targetcol> "
                 "<N|all> <sample_size> <n_trees> <desired_TPR_percent>")

    # NOTE: `datafile` must remain a module-level name; plot_anomalies uses
    # it when building the name of the saved figure.
    datafile = sys.argv[1]
    targetcol = sys.argv[2]
    sample_size = int(sys.argv[4])
    n_trees = int(sys.argv[5])
    # Parsed as float so fractional percentages such as 99.5 are accepted;
    # whole numbers give the same value as before.
    desired_TPR = float(sys.argv[6]) / 100.0

    df = pd.read_csv(datafile)
    if sys.argv[3] != 'all':
        # Grab a random subset (too slow otherwise); cap at the number of
        # rows because DataFrame.sample raises when asked for more rows than
        # exist (sampling without replacement).
        N = min(int(sys.argv[3]), len(df))
        df = df.sample(N)

    X, y = df.drop(targetcol, axis=1), df[targetcol]
    plot_anomalies(X, y, sample_size=sample_size, n_trees=n_trees, desired_TPR=desired_TPR, bins=15)