forked from logpai/loglizer
HDFS_bechmark.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
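"""Benchmark loglizer's anomaly detection models (PCA, InvariantsMiner,
LogClustering, IsolationForest, LR, SVM, DecisionTree) on the HDFS log
dataset and write precision/recall/F1 results to benchmark_result.csv.
"""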
import sys
sys.path.append('../')
import pandas as pd
from loglizer.models import *
from loglizer import dataloader, preprocessing

run_models = ['PCA', 'InvariantsMiner', 'LogClustering', 'IsolationForest', 'LR',
              'SVM', 'DecisionTree']
struct_log = '../data/HDFS/HDFS.npz'  # The benchmark dataset

if __name__ == '__main__':
    (x_tr, y_train), (x_te, y_test) = dataloader.load_HDFS(struct_log,
                                                           window='session',
                                                           train_ratio=0.5,
                                                           split_type='uniform')
    benchmark_results = []  # Rows of [model name, precision, recall, F1]
    for _model in run_models:
        print('Evaluating {} on HDFS:'.format(_model))
        if _model == 'PCA':
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf',
                                                      normalization='zero-mean')
            model = PCA()
            model.fit(x_train)
        elif _model == 'InvariantsMiner':
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr)
            model = InvariantsMiner(epsilon=0.5)
            model.fit(x_train)
        elif _model == 'LogClustering':
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf')
            model = LogClustering(max_dist=0.3, anomaly_threshold=0.3)
            model.fit(x_train[y_train == 0, :])  # Use only normal samples for training
        elif _model == 'IsolationForest':
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr)
            model = IsolationForest(random_state=2019, max_samples=0.9999, contamination=0.03,
                                    n_jobs=4)
            model.fit(x_train)
        elif _model == 'LR':
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf')
            model = LR()
            model.fit(x_train, y_train)
        elif _model == 'SVM':
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf')
            model = SVM()
            model.fit(x_train, y_train)
        elif _model == 'DecisionTree':
            feature_extractor = preprocessing.FeatureExtractor()
            x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf')
            model = DecisionTree()
            model.fit(x_train, y_train)

        # Transform the test split with the extractor fitted for the current model,
        # then record precision/recall/F1 on both splits.
        x_test = feature_extractor.transform(x_te)
        print('Train accuracy:')
        precision, recall, f1 = model.evaluate(x_train, y_train)
        benchmark_results.append([_model + '-train', precision, recall, f1])
        print('Test accuracy:')
        precision, recall, f1 = model.evaluate(x_test, y_test)
        benchmark_results.append([_model + '-test', precision, recall, f1])

    pd.DataFrame(benchmark_results, columns=['Model', 'Precision', 'Recall', 'F1']) \
      .to_csv('benchmark_result.csv', index=False)
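
After a run, the script leaves benchmark_result.csv in the working directory. As a minimal follow-up sketch (assuming that file was produced by the loop above and that pandas is available), the held-out results can be loaded and ranked by F1:

import pandas as pd

results = pd.read_csv('benchmark_result.csv')
# Keep the '-test' rows (held-out split) and sort by F1, best model first.
test_rows = results[results['Model'].str.endswith('-test')]
print(test_rows.sort_values('F1', ascending=False).to_string(index=False))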