forked from NetManAIOps/OmniAnomaly
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
220 lines (187 loc) · 8.04 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# -*- coding: utf-8 -*-
import logging
import os
import pickle
import sys
import time
import warnings
from argparse import ArgumentParser
from pprint import pformat, pprint
import numpy as np
import tensorflow as tf
from tfsnippet.examples.utils import MLResults, print_with_title
from tfsnippet.scaffold import VariableSaver
from tfsnippet.utils import get_variables_as_dict, register_config_arguments, Config
from omni_anomaly.eval_methods import pot_eval, bf_search
from omni_anomaly.model import OmniAnomaly
from omni_anomaly.prediction import Predictor
from omni_anomaly.training import Trainer
from omni_anomaly.utils import get_data_dim, get_data, save_z
class ExpConfig(Config):
    """Experiment settings for one OmniAnomaly run.

    Every attribute is a default value. The ``__main__`` section of this
    file registers the attributes as command-line arguments and recomputes
    ``x_dim`` after parsing.
    """

    # ---- dataset ----
    dataset = "machine-1-1"
    # Derived from the default dataset when the class body is executed.
    x_dim = get_data_dim(dataset)

    # ---- model architecture ----
    use_connected_z_q = True
    use_connected_z_p = True

    # ---- model parameters ----
    z_dim = 3
    rnn_cell = 'GRU'  # one of 'GRU', 'LSTM' or 'Basic'
    rnn_num_hidden = 500
    window_length = 100
    dense_dim = 500
    posterior_flow_type = 'nf'  # 'nf' or None
    nf_layers = 20  # only meaningful for 'nf'
    max_epoch = 10  # main() skips training when this is not positive
    train_start = 0
    max_train_size = None  # `None` means full train set
    batch_size = 50
    l2_reg = 0.0001
    initial_lr = 0.001
    lr_anneal_factor = 0.5
    lr_anneal_epoch_freq = 40
    lr_anneal_step_freq = None  # not referenced anywhere in this file
    std_epsilon = 1e-4

    # ---- evaluation parameters ----
    test_n_z = 1
    test_batch_size = 50  # handed to the Trainer as `valid_batch_size`
    test_start = 0
    max_test_size = None  # `None` means full test set

    # Score range and step size scanned by the best-F1 search.
    # Suitable values may vary between datasets.
    bf_search_min = -400.0
    bf_search_max = 400.0
    bf_search_step_size = 1.0

    valid_step_freq = 100
    gradient_clip_norm = 10.0

    early_stop = True  # whether to apply early stop method

    # ---- POT parameters ----
    # Recommended values for `level`:
    #   SMAP:        0.07
    #   MSL:         0.01
    #   SMD group 1: 0.0050
    #   SMD group 2: 0.0075
    #   SMD group 3: 0.0001
    level = 0.01

    # ---- outputs ----
    save_z = False  # whether to save sampled z in hidden space
    # If `True`, the score is a 2-dim ndarray (one score per dimension).
    get_score_on_dim = False
    save_dir = 'model'  # where main() saves variables; `None` disables saving
    restore_dir = None  # if not None, variables are restored from this dir
    result_dir = 'result'  # where to save the result files
    train_score_filename = 'train_score.pkl'  # `None` skips dumping train scores
    test_score_filename = 'test_score.pkl'  # `None` skips dumping test scores
def main():
    """Train (optionally), score and evaluate OmniAnomaly on one dataset.

    Relies on the module-level ``config`` and ``results`` objects that the
    ``__main__`` section creates before calling this function.

    Steps, in order:

    1. Load the train / test arrays with ``get_data``.
    2. Build the model, a ``Trainer`` and a ``Predictor`` inside the
       ``'model'`` variable scope.
    3. Restore variables from ``config.restore_dir`` when it is set.
    4. Fit on the train set when ``config.max_epoch > 0``.
    5. Score the train set and, if present, the test set; each score array
       is pickled under ``config.result_dir`` when its filename is set.
    6. When labels cover the test scores, run the best-F1 search and the
       POT evaluation and merge both into the metrics.
    7. Save variables to ``config.save_dir`` when it is set, then print the
       collected metrics.
    """
    logging.basicConfig(
        level='INFO',
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s'
    )

    # prepare the data (train labels are not used)
    (x_train, _), (x_test, y_test) = get_data(
        config.dataset,
        config.max_train_size,
        config.max_test_size,
        train_start=config.train_start,
        test_start=config.test_start
    )

    # construct the model under `variable_scope` named 'model'
    with tf.variable_scope('model') as model_vs:
        model = OmniAnomaly(config=config, name="model")

        # construct the trainer
        trainer = Trainer(model=model,
                          model_vs=model_vs,
                          max_epoch=config.max_epoch,
                          batch_size=config.batch_size,
                          valid_batch_size=config.test_batch_size,
                          initial_lr=config.initial_lr,
                          lr_anneal_epochs=config.lr_anneal_epoch_freq,
                          lr_anneal_factor=config.lr_anneal_factor,
                          grad_clip_norm=config.gradient_clip_norm,
                          valid_step_freq=config.valid_step_freq)

        # construct the predictor
        # Scoring is an evaluation step, so it is driven by
        # `test_batch_size` (as the trainer's validation already is) rather
        # than by the training batch size.
        predictor = Predictor(model, batch_size=config.test_batch_size,
                              n_z=config.test_n_z, last_point_only=True)

        # Entering the session itself makes it the default session AND
        # closes it on exit; `tf.Session().as_default()` would leave the
        # session open after the block.
        with tf.Session():

            if config.restore_dir is not None:
                # Restore variables from `restore_dir`.
                saver = VariableSaver(get_variables_as_dict(model_vs),
                                      config.restore_dir)
                saver.restore()

            if config.max_epoch > 0:
                # train the model; `train_time` is the mean wall time per
                # configured epoch
                fit_began = time.time()
                best_valid_metrics = trainer.fit(x_train)
                train_time = (time.time() - fit_began) / config.max_epoch
                best_valid_metrics.update({
                    'train_time': train_time
                })
            else:
                best_valid_metrics = {}

            # get score of train set for POT algorithm
            # (the third return value, the prediction speed, is not needed)
            train_score, train_z, _ = predictor.get_score(x_train)
            if config.train_score_filename is not None:
                train_score_path = os.path.join(config.result_dir,
                                                config.train_score_filename)
                with open(train_score_path, 'wb') as fout:
                    pickle.dump(train_score, fout)
            if config.save_z:
                save_z(train_z, 'train_z')

            if x_test is not None:
                # get score of test set
                scoring_began = time.time()
                test_score, test_z, pred_speed = predictor.get_score(x_test)
                test_time = time.time() - scoring_began
                if config.save_z:
                    save_z(test_z, 'test_z')
                best_valid_metrics.update({
                    'pred_time': pred_speed,
                    'pred_total_time': test_time
                })
                if config.test_score_filename is not None:
                    test_score_path = os.path.join(config.result_dir,
                                                   config.test_score_filename)
                    with open(test_score_path, 'wb') as fout:
                        pickle.dump(test_score, fout)

                if y_test is not None and len(y_test) >= len(test_score):
                    if config.get_score_on_dim:
                        # get the joint score by summing over the last axis
                        test_score = np.sum(test_score, axis=-1)
                        train_score = np.sum(train_score, axis=-1)

                    # There may be fewer scores than labels; evaluate
                    # against the trailing labels only.
                    labels = y_test[-len(test_score):]

                    # get best f1
                    search_span = abs(config.bf_search_max - config.bf_search_min)
                    t, th = bf_search(test_score, labels,
                                      start=config.bf_search_min,
                                      end=config.bf_search_max,
                                      step_num=int(search_span /
                                                   config.bf_search_step_size),
                                      display_freq=50)
                    # get pot results
                    pot_result = pot_eval(train_score, test_score, labels,
                                          level=config.level)

                    # output the results
                    best_valid_metrics.update({
                        'best-f1': t[0],
                        'precision': t[1],
                        'recall': t[2],
                        'TP': t[3],
                        'TN': t[4],
                        'FP': t[5],
                        'FN': t[6],
                        'latency': t[-1],
                        'threshold': th
                    })
                    best_valid_metrics.update(pot_result)
                results.update_metrics(best_valid_metrics)

            if config.save_dir is not None:
                # save the variables
                var_dict = get_variables_as_dict(model_vs)
                saver = VariableSaver(var_dict, config.save_dir)
                saver.save()
            print('=' * 30 + 'result' + '=' * 30)
            pprint(best_valid_metrics)
if __name__ == '__main__':
    # Script entry point: build the config, let the command line override
    # it, prepare the result store, then run `main()`.

    # get config obj
    config = ExpConfig()

    # parse the arguments
    arg_parser = ArgumentParser()
    register_config_arguments(config, arg_parser)
    # The parsed namespace is discarded on purpose.
    # NOTE(review): this presumes `register_config_arguments` writes parsed
    # values straight into `config` -- confirm against tfsnippet.
    arg_parser.parse_args(sys.argv[1:])
    # `x_dim` was computed for the default dataset when ExpConfig was
    # defined; recompute it for the dataset actually selected.
    config.x_dim = get_data_dim(config.dataset)

    print_with_title('Configurations', pformat(config.to_dict()), after='\n')

    # open the result object and prepare for result directories if specified
    results = MLResults(config.result_dir)
    results.save_config(config)  # save experiment settings for review
    # main() treats `save_dir is None` as "do not save variables", so only
    # create the directory when one is actually configured.
    if config.save_dir is not None:
        results.make_dirs(config.save_dir, exist_ok=True)

    with warnings.catch_warnings():
        # suppress DeprecationWarning from NumPy caused by codes in TensorFlow-Probability
        warnings.filterwarnings("ignore", category=DeprecationWarning, module='numpy')
        main()