# evalIDS.py
# pip3 install git+https://github.com/KIZI/pyIDS.git@76ad9d9b2bc12630ae1a36cb8f96103a5e5bbfce
from pyids.algorithms.ids_classifier import mine_CARs
from pyids.algorithms.ids import IDS
from pyids.model_selection import mode
import time
import os
from pyarc.qcba.transformation import QCBATransformation
from pyarc.algorithms.rule_generation import generateCARs
from pyarc.data_structures import TransactionDB
import pandas as pd
from pyarc.qcba.data_structures import (
IntervalReader,
Interval,
QuantitativeDataFrame,
QuantitativeCAR
)
from pyarc.qcba.classifier import QuantitativeClassifier
from sklearn.metrics import accuracy_score
interval_reader = IntervalReader()
interval_reader.closed_bracket = "", "NULL"
interval_reader.open_bracket = "NULL", ""
interval_reader.infinity_symbol = "inf", "inf"
interval_reader.members_separator = "_to_"
interval_reader.compile_reader()
QuantitativeCAR.interval_reader = interval_reader
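# A rough illustration of the interval format these settings target (assumed,
# not taken from the data): with "_to_" as the member separator and "inf" as
# the infinity symbol, a discretized value such as "-inf_to_2.45" denotes the
# interval from -inf to 2.45.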
min_support = 0.01
min_confidence = 50
max_rule_len = 5
rule_cutoff = 50
basepath = "./"
unique_transactions = True
lambda_array = [1, 1, 1, 1, 1, 1, 1]
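# The seven lambdas weight the seven sub-objectives of the IDS objective
# function (Lakkaraju et al., 2016): roughly, penalties on rule count, rule
# length and rule overlap, plus rewards for class coverage and correct cover
# (see the paper for exact definitions). All ones weights every term equally.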
import logging
logging.basicConfig(level=logging.DEBUG)
def run1fold(basepath, datasetname, unique_transactions=True, runQCBA=False, saveIDSRules=True, useConfidenceForCandidateGeneration=True):
df_stat = pd.DataFrame(columns=['ids','idsqcba'], index=["accuracy","rulecount","rulelength","buildtime"])
    if runQCBA:
        # the Python QCBA implementation uses a custom discretization format
data_train_disc = pd.read_csv(basepath+"data/folds_discr/train/{}.csv".format(datasetname))
data_test_disc = pd.read_csv(basepath+"data/folds_discr/test/{}.csv".format(datasetname))
data_test_undisc = pd.read_csv(basepath+"data/folds_nodiscr/test/{}.csv".format(datasetname))
data_train_undisc = pd.read_csv(basepath+"data/folds_nodiscr/train/{}.csv".format(datasetname))
quant_dataframe_test_undisc = QuantitativeDataFrame(data_test_undisc)
quant_dataframe_train_undisc = QuantitativeDataFrame(data_train_undisc)
    else:
        # the R QCBA implementation uses a different discretization format;
        # folds are generated with preprocess_for_ids.R
data_train_disc = pd.read_csv(basepath+"data/folds_discr2/train/{}.csv".format(datasetname))
data_test_disc = pd.read_csv(basepath+"data/folds_discr2/test/{}.csv".format(datasetname))
quant_dataframe_train_disc = QuantitativeDataFrame(data_train_disc)
quant_dataframe_test_disc = QuantitativeDataFrame(data_test_disc)
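    # The script assumes the class label is in the last column of each DataFrame;
    # the ground-truth labels for evaluation are read from there below.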
actual = quant_dataframe_test_disc.dataframe.iloc[:,-1].values
print("Starting rule learning for candidate generation")
max_rule_len_adjusted = max_rule_len
if useConfidenceForCandidateGeneration:
        # mine_CARs learns the initial candidate rules with a CBA-like approach.
        # It uses unsupervised parameter tuning to determine the confidence, support
        # and rule-length thresholds, as described in Kliegr & Kuchar, 2019.
        # Because the subsequent optimization is slow, not all initial candidate rules
        # can be passed to IDS. The sample parameter controls how the subset of N rules
        # is selected from the initial candidates:
        #   sample=False: take the top N rules according to the CBA criteria
        #                 (according to our experiments, this gives better results)
        #   sample=True:  take N rules at random
if "spambase" in datasetname:
print("reducing maxlen")
max_rule_len_adjusted = 4
txns = TransactionDB.from_DataFrame(data_train_disc)
cars = generateCARs(txns, support=min_support, confidence=min_confidence, maxlen=max_rule_len_adjusted)
cars = cars[:rule_cutoff]
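        # Note: the slice assumes generateCARs returns the rules already ordered
        # by the CBA criteria (confidence, support, length), so it keeps the top
        # rule_cutoff candidates rather than an arbitrary subset.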
else:
        # learn candidate rules using the approach without a minimum confidence
        # threshold described in Lakkaraju et al., 2016
        print("WARNING save any unsaved work")
        print("WARNING candidate generation without minimum confidence and sampling may be too slow or memory intensive")
        cars = mine_CARs(data_train_disc, rule_cutoff=rule_cutoff)
print("rule mining finished")
print("rules found:" + str(len(cars)))
#train IDS model
ids = IDS(algorithm="SLS")
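    # "SLS" selects the smooth local search optimizer used in the original IDS
    # paper; pyIDS also implements alternative optimization algorithms.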
start = time.time()
# all lambdas are set to the same value
print("Fittings IDS")
ids.fit(class_association_rules=cars, lambda_array=lambda_array, quant_dataframe=quant_dataframe_train_disc,random_seed=1)
print("Fittings IDS - Finished")
end = time.time()
df_stat.loc["buildtime","ids"] = end-start
#apply IDS model
print("Applying IDS on test data")
df_stat.loc["accuracy","ids"] = ids.score(quant_dataframe_test_disc)
print("Applying IDS on test data - Finished")
print("Acc IDS:",df_stat.loc["accuracy","ids"] )
df_stat.loc["rulecount","ids"] = len(ids.clf.rules)
    antLengths = list(map(lambda r: len(r.car.antecedent.itemset.items()), ids.clf.rules))
    df_stat.loc["rulelength","ids"] = sum(antLengths)/len(antLengths)
print("Rule Count IDS:", df_stat.loc["rulecount","ids"])
    if saveIDSRules:
idsRulesPath = basepath+modelFolder+"/{}.csv".format(datasetname)
file = open(idsRulesPath,"w")
txtexport="rules,suppport,confidence,lift\n"
        # Before export, IDS sorts the rules by the harmonic mean of support and
        # confidence (st.hmean([self.car.support, self.car.confidence])).
        # Rules are also applied for prediction in this order.
for r in ids.clf.rules:
args = [r.car.antecedent.string(), "{" + r.car.consequent.string() + "}", r.car.support, r.car.confidence,0]
txtexport = txtexport+ "\"{} => {}\",{:.2f},{:.2f},{:.2f} \n".format(*args)
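            # each exported line then looks like (values illustrative):
            # "{attr1=v1,attr2=v2} => {class=c}",0.15,0.95,0.00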
#add default rule
classname = data_train_disc.columns.values[-1]
txtexport = txtexport+ "\"{ } => " + "{"+classname+"="+ mode(data_train_disc[data_train_disc.columns[-1]])+ "}\", 0,0,0"
print(txtexport)
file.write(txtexport)
file.close()
    if runQCBA:
        # postprocess the IDS model with QCBA (Python implementation in pyarc; this can be slow)
rules_to_optimize = ids.clf.rules
start = time.time()
quant_rules = [ QuantitativeCAR(r.car) for r in rules_to_optimize ]
qcba_transformation = QCBATransformation(quant_dataframe_train_undisc)
transformed_rules = qcba_transformation.transform(quant_rules)
end = time.time()
df_stat.loc["buildtime","idsqcba"] = end-start
rules, default_class = transformed_rules
        # note: antecedent lengths are taken from the IDS rules before the QCBA transformation
        antLengths = list(map(lambda r: len(r.car.antecedent.itemset.items()), ids.clf.rules))
        # the +1 in the denominator accounts for the default rule, which has an empty antecedent
        df_stat.loc["rulelength","idsqcba"] = sum(antLengths)/(len(antLengths)+1)
#apply QCBA model
qclf = QuantitativeClassifier(rules, default_class)
pred = qclf.predict(quant_dataframe_test_undisc)
#evaluate model - QCBA
df_stat.loc["accuracy","idsqcba"] = accuracy_score(actual, pred)
df_stat.loc["rulecount","idsqcba"] = len(rules)
print("Acc IDS-QCBA:", df_stat.loc["accuracy","idsqcba"] )
print("Rule Count IDS-QCBA:", df_stat.loc["rulecount","idsqcba"])
return df_stat
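# A minimal single-fold invocation (assumes fold 0 of the iris dataset exists
# under basepath/data/, as produced by the preprocessing scripts):
#   df_stat = run1fold("./", "iris0", runQCBA=False)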
def mean_allfolds(dataset_name, start=0, end=10, unique_transactions=True, runQCBA=False, useConfidenceForCandidateGeneration=True, saveIDSRules=True):
files = [ dataset_name + repr(i) for i in range(start, end) ]
df_agg = None
emptyDF = True
for file in files:
        df_stats = run1fold(basepath, file, unique_transactions=unique_transactions, runQCBA=runQCBA, useConfidenceForCandidateGeneration=useConfidenceForCandidateGeneration, saveIDSRules=saveIDSRules)
print("done", file)
if emptyDF:
df_agg = df_stats
emptyDF = False
else:
df_agg = df_agg + df_stats
        print(df_agg)
    foldcount = end - start
    # average the accumulated per-fold statistics
    df_agg = df_agg / foldcount
    print(df_agg)
return df_agg
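# Example: average the statistics over the ten folds of one dataset; this
# mirrors the call in the main loop below:
#   df_agg = mean_allfolds("iris", start=0, end=10, runQCBA=False)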
datasets = ["iris","australian","anneal","autos","breast-w","colic","credit-a","credit-g","diabetes","glass","heart-statlog","ionosphere","labor","letter","lymph","segment","sonar","vehicle","vowel","hepatitis","spambase","hypothyroid","kdd1000_","kdd10000_","kdd20000_","kdd30000_","kdd40000_"]
runQCBA = False
saveIDSRules = True
useConfidenceForCandidateGeneration = True
separateFilePerDataset = False
modelFolder = "IDS_Models"
if runQCBA:
resultFolder="IDSQCBA_results"
else:
resultFolder="IDS_results"
# the model folder is only needed when IDS rules are exported;
# the result folder is always written to
if saveIDSRules and not os.path.exists(modelFolder):
    os.makedirs(modelFolder)
if not os.path.exists(resultFolder):
    os.makedirs(resultFolder)
for dataset in datasets:
print("Processing dataset " + dataset)
# Check if already computed
if separateFilePerDataset:
resultFile=resultFolder+"/"+dataset+".csv"
if os.path.exists(resultFile):
print("skipping already computed")
continue
else:
resultFileIDS=resultFolder+"/IDS.csv"
# TODO: refactor this into functions
if runQCBA:
resultFileIDSQCBA=resultFolder+"/IDSQCBAPy.csv"
if not os.path.exists(resultFileIDSQCBA):
print("initializing result file")
file = open(resultFileIDSQCBA,"w+")
file.write("dataset,accuracy,rules,antlength,buildtime\n")
file.close()
else:
file = open(resultFileIDSQCBA,"r+")
strings = file.read()
file.close()
if "\n" + dataset +"," in strings:
print("skipping QCBA already computed")
continue
if not os.path.exists(resultFileIDS):
print("initializing result file")
file = open(resultFileIDS,"w+")
file.write("dataset,accuracy,rules,antlength,buildtime\n")
file.close()
else:
file = open(resultFileIDS,"r+")
strings = file.read()
file.close()
if "\n" + dataset +"," in strings:
print("skipping IDS model already computed")
continue
# Compute
df_stats_per_dataset = mean_allfolds(dataset, start=0,end=10, runQCBA=runQCBA,useConfidenceForCandidateGeneration=useConfidenceForCandidateGeneration,saveIDSRules=saveIDSRules)
print("*****")
print(df_stats_per_dataset)
print("******")
# Save
if separateFilePerDataset:
df_stats_per_dataset.to_csv(resultFile)
else:
file = open(resultFileIDS,"a")
file.write(dataset + "," + str(df_stats_per_dataset.loc["accuracy","ids"]) + "," + str(df_stats_per_dataset.loc["rulecount","ids"]) + "," + str(df_stats_per_dataset.loc["rulelength","ids"]) +"," + str(df_stats_per_dataset.loc["buildtime","ids"]) + "\n")
file.close()
if runQCBA:
file = open(resultFileIDSQCBA,"a")
file.write(dataset + "," + str(df_stats_per_dataset.loc["accuracy","idsqcba"]) + "," + str(df_stats_per_dataset.loc["rulecount","idsqcba"]) + "," + str(df_stats_per_dataset.loc["rulelength","idsqcba"])+"," + str(df_stats_per_dataset.loc["buildtime","idsqcba"]) + "\n")
file.close()