# generate_baseline_log.py
import sys
import csv
import datetime
from scipy.stats import wasserstein_distance
class excel_semicolon(csv.excel):
    """Variant of the ``csv.excel`` dialect that separates fields with ';'."""

    delimiter = ";"
def violatesTCloseness(distributionActivity, distributionEquivalenceClass, t):
    """Tell whether an equivalence class breaks the t-closeness bound.

    The Wasserstein distance between the two value samples is divided by the
    value range (max - min) of ``distributionActivity`` and compared with
    ``t``.

    Args:
        distributionActivity: Values observed for the activity overall.
        distributionEquivalenceClass: Values observed for the activity within
            one equivalence class.
        t: Threshold for the normalised distance.

    Returns:
        True when the normalised distance is greater than or equal to ``t``;
        False otherwise, including when all values in
        ``distributionActivity`` are identical.
    """
    valueRange = max(distributionActivity) - min(distributionActivity)
    if valueRange == 0.0:
        # Every annotation has the same value (most likely 0.0), so there is
        # no spread to normalise by and nothing to violate.
        return False
    distance = wasserstein_distance(distributionActivity, distributionEquivalenceClass)
    normalisedDistance = distance / valueRange
    return bool(normalisedDistance >= t)
# Baseline sanitisation of a semicolon-separated event log.
#
# Usage: python generate_baseline_log.py <event_log.csv> <k> <t>
#
# Pass 1 counts the cases per variant and collects the "Duration" values per
# activity (over the whole log and per variant). Pass 2 copies to the output
# file only the events whose variant occurs in at least k cases and does not
# violate t-closeness for any of its activities.
if len(sys.argv) < 4:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("Usage: python generate_baseline_log.py <event_log.csv> <k> <t>")

filePath = sys.argv[1]
kString = sys.argv[2]
tString = sys.argv[3]
k = int(kString)
t = float(tString)

caseIdColName = "Case ID"
variantColName = "Variant"
activityColName = "Activity"
timeStampColName = "Complete Timestamp"  # defined for reference; not read below

# Swap only a trailing ".csv" for the output suffix. A plain str.replace would
# rewrite every ".csv" occurrence in the path and, when the path has no ".csv"
# at all, return the input path itself, which the write below would truncate.
outputSuffix = "_pretsa_baseline_k%s_t%s.csv" % (kString, tString)
if filePath.endswith(".csv"):
    writeFilePath = filePath[:-len(".csv")] + outputSuffix
else:
    writeFilePath = filePath + outputSuffix

# ---- Pass 1: gather statistics -------------------------------------------
variantsDict = {}            # variant -> number of cases with that variant
variantsAnnotationDict = {}  # variant -> {activity -> [durations]}
activityDistributions = {}   # activity -> [durations] over the whole log
currentCase = ""
eventsBefore = 0

# newline="" is what the csv module asks for when handing it a file object.
with open(filePath, newline="") as csvfile:
    reader = csv.DictReader(csvfile, delimiter=";")
    for row in reader:
        eventsBefore += 1
        currentVariant = row[variantColName]
        # A change of case ID marks the start of a new case; this relies on
        # the events of one case being stored contiguously. The first row
        # always starts a case, even if its case ID is the empty string.
        if eventsBefore == 1 or currentCase != row[caseIdColName]:
            currentCase = row[caseIdColName]
            variantsDict[currentVariant] = variantsDict.get(currentVariant, 0) + 1
        currentActivity = row[activityColName]
        duration = float(row["Duration"])
        activityDistributions.setdefault(currentActivity, []).append(duration)
        variantDistributions = variantsAnnotationDict.setdefault(currentVariant, {})
        variantDistributions.setdefault(currentActivity, []).append(duration)

# ---- Variants violating t-closeness --------------------------------------
variantsViolatingTcloseness = set()
for variant, variantDistributions in variantsAnnotationDict.items():
    for activity, distribution in variantDistributions.items():
        if violatesTCloseness(activityDistributions[activity], distribution, t):
            variantsViolatingTcloseness.add(variant)
            break  # one violating activity is enough to flag the variant

# ---- Pass 2: write the filtered log --------------------------------------
eventsAfter = 0
with open(filePath, newline="") as csvfile, \
        open(writeFilePath, "w", newline="") as writeFile:
    reader = csv.DictReader(csvfile, delimiter=";")
    # Reading .fieldnames consumes the header line, so iteration below starts
    # at the first event; no extra next(reader) is needed (it would silently
    # drop that event).
    fieldNamesWrite = reader.fieldnames or []
    writer = csv.DictWriter(writeFile, fieldnames=fieldNamesWrite, dialect=excel_semicolon)
    writer.writeheader()
    for row in reader:
        variant = row[variantColName]
        # Keep the event only if its variant is frequent enough (k) and was
        # not flagged above (t); previously the flagged set was never used.
        if variantsDict.get(variant, 0) >= k and variant not in variantsViolatingTcloseness:
            eventsAfter += 1
            writer.writerow(row)

print("Events before " + str(eventsBefore))
print("Events after " + str(eventsAfter))
# Guard against an event log that contains no events.
remainingShare = eventsAfter / eventsBefore if eventsBefore else 0.0
print("Remaining " + str(remainingShare))