-
Notifications
You must be signed in to change notification settings - Fork 1
/
transformClassificationSheet.py
117 lines (106 loc) · 4.71 KB
/
transformClassificationSheet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import sys
import re
def transform(source):
    """Read a tab-separated classification sheet into a list of annotations.

    The first line of ``source`` is assumed to hold the column names and is
    skipped (a notice is written to stderr).  Every other line is stripped
    and split on runs of tab characters, so empty columns are not preserved.
    A line must yield either five fields (study, dataset, relation, doi,
    annotation) or four fields, in which case the annotation defaults to
    "correct".  Lines with any other field count are reported on stderr and
    skipped.

    Rows whose study is exactly "Eurobarometer" and whose dataset name starts
    with "Flash" are always labelled "incorrect", whatever the sheet says.

    Args:
        source: path of the sheet to read.

    Returns:
        A list of ``[study, dataset, relation, doi, annotation]`` lists in
        file order.
    """
    annotations = []
    with open(source, "r") as f:
        for index, line in enumerate(f):
            # Drop the line terminator before echoing the line, otherwise
            # each diagnostic is broken across two lines on stderr.
            shown = line.rstrip("\r\n")
            if index == 0:
                sys.stderr.write(
                    "Ignoring line '%s', assuming it contains the definitions of the fields\n"
                    % shown)
                continue
            fields = re.split("\t+", line.strip())
            if len(fields) == 5:
                study, dataset, relation, doi, annotation = fields
            elif len(fields) == 4:
                study, dataset, relation, doi = fields
                annotation = "correct"
            else:
                sys.stderr.write("warning: cannot process line %s; ignoring.\n" % shown)
                continue
            if study == "Eurobarometer" and dataset.startswith("Flash"):
                annotation = "incorrect"
            annotations.append([study, dataset, relation, doi, annotation])
    return annotations
def toDictionary(annotations):
    """Group annotations by dataset and DOI.

    Each annotation is indexed under the key ``<dataset>@<doi>``, built from
    its elements at positions 1 and 3.  Annotations sharing a key are kept in
    the order in which they were supplied.

    Args:
        annotations: iterable of annotation lists.

    Returns:
        A dict mapping each key to the list of annotations carrying it.
    """
    grouped = {}
    for record in annotations:
        bucket_key = record[1] + "@" + record[3]
        grouped.setdefault(bucket_key, []).append(record)
    return grouped
# Patterns are anchored at the start of the dataset name (re.match) and are
# written as raw strings so that "\(" reaches the regex engine without
# triggering an invalid-escape warning.
_EUROBAROMETER_SUPPLEMENT = re.compile(r"Eurobarometer.*?((OVR)|(LAN))")
_EUROBAROMETER_PART = re.compile(r"(Candidate Countries )?Eurobarometer.*?:.*")
_CENTRAL_EASTERN_PART = re.compile(r"Central and Eastern Eurobarometer.*?\(.*\)")
_ISSP_PART = re.compile(r"International Social Survey Programme:.*")


def _impliedRelation(name):
    """Return the relation implied by a dataset name, or None if no rule fits.

    The rules are tried in a fixed order and the first match wins.
    """
    if name.startswith(("Candidate Countries Eurobarometer",
                        "Applicant Countries Eurobarometer")):
        return "parts_of_spatial"
    if _EUROBAROMETER_SUPPLEMENT.match(name):
        return "part_of_supplement"
    if _EUROBAROMETER_PART.match(name) or _CENTRAL_EASTERN_PART.match(name):
        return "part_of"
    if name.startswith(("German Social Survey (ALLBUS)", "German Election Study")):
        return "part_of_translation"
    if name.startswith("Politbarometer (Kumulierter Datensatz, inkl. Kurzbarometer)"):
        return "part_of_supplement"
    if name.startswith("Wahlstudie (Politbarometer)"):
        return "version_of"
    if _ISSP_PART.match(name):
        return "part_of"
    return None


def addRelations(dictionary):
    """Add one name-derived relation to every group whose dataset matches a rule.

    The dataset name is the part of each key before the first "@".  When a
    rule applies, the first annotation of the group is copied, the copy's
    relation (index 2) is replaced by the derived one, and the copy is
    appended to the group.  All other fields of the copy, including its
    label, are inherited from that first annotation.

    Args:
        dictionary: dict mapping ``<dataset>@<doi>`` keys to lists of
            annotation lists; it is modified in place.

    Returns:
        The same ``dictionary`` object.
    """
    for key, records in dictionary.items():
        relation = _impliedRelation(key.split("@")[0])
        if relation is None:
            continue
        derived = records[0][:]
        derived[2] = relation
        records.append(derived)
    return dictionary
def removeIncorrectPartOfs(dictionary):
    """Flag singular "part_of*" records that clash with a plural "parts_of*" one.

    Within each group, the last record seen for every tracked relation is
    remembered.  If a group holds both the plural and the singular form of a
    pair ("parts_of"/"part_of", and likewise for the "_spatial" and
    "_temporal" variants), the label (index 4) of the remembered singular
    record is set to "incorrect".  Nothing is removed.

    Args:
        dictionary: dict mapping keys to lists of annotation lists; the
            records are modified in place.

    Returns:
        The same ``dictionary`` object.
    """
    relation_pairs = (
        ("parts_of", "part_of"),
        ("parts_of_spatial", "part_of_spatial"),
        ("parts_of_temporal", "part_of_temporal"),
    )
    tracked = tuple(name for pair in relation_pairs for name in pair)
    for records in dictionary.values():
        # Later records overwrite earlier ones, so only the last match of
        # each relation is kept.
        latest = {}
        for record in records:
            relation = record[2]
            if relation in tracked:
                latest[relation] = record
        for plural, singular in relation_pairs:
            if latest.get(plural) and latest.get(singular):
                latest[singular][4] = "incorrect"
    return dictionary
def flattenDictionary(dictionary):
    """Turn the grouped annotations into a flat list of tab-joined lines.

    Each dictionary value is first joined with tabs as a whole.  If that
    raises ``TypeError`` (as it does when the value is a list of lists
    rather than a list of strings), every item of the value is joined on its
    own instead and contributes one line.

    Args:
        dictionary: dict whose values are lists of strings or lists of
            lists of strings.

    Returns:
        A list of tab-separated strings, without line terminators.
    """
    rows = []
    for group in dictionary.values():
        try:
            joined = "\t".join(group)
        except TypeError:
            rows.extend("\t".join(record) for record in group)
        else:
            rows.append(joined)
    return rows
if __name__ == "__main__":
    # Usage: transformClassificationSheet.py <classification sheet> <output file>
    if len(sys.argv) < 3:
        sys.exit("usage: %s <classification sheet> <output file>" % sys.argv[0])
    # Run the whole pipeline before opening the output, so a failure while
    # reading or transforming the sheet does not leave a truncated file.
    rows = flattenDictionary(
        removeIncorrectPartOfs(addRelations(toDictionary(transform(sys.argv[1])))))
    with open(sys.argv[2], "w") as f:
        # The output starts with an empty line, as it always has.
        # NOTE(review): presumably a placeholder for a header row - confirm.
        f.write("\n")
        for entry in rows:
            # Write the text as-is: the former decode/encode round trip was a
            # no-op on valid UTF-8 under Python 2 and fails on Python 3,
            # where str has no decode().
            f.write(entry + "\n")