-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathconvert_art.py
60 lines (46 loc) · 1.37 KB
/
convert_art.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import sys, glob
import xml.etree.ElementTree as et
"""
This script creates a clean version of the art corpus.
Clean version looks like this (label \t sentence):
Background Some sentence.
Background This is another sentence.
Result Last sentence of this document
Method First sentence of the next document.
Method Another sentence.
Result Last sentence again
...
"""
# Read the input corpus directory and the output directory from the command
# line. Missing or empty arguments end the script with a usage message instead
# of an opaque IndexError from sys.argv[...] or from indexing [-1] on "".
if len(sys.argv) < 3 or not sys.argv[1] or not sys.argv[2]:
    sys.exit("usage: " + sys.argv[0] + " <input_dir> <output_dir>")
in_path = sys.argv[1]
out_path = sys.argv[2]

# Both paths are used below by plain string concatenation, so make sure each
# one ends with a path separator.
if not in_path.endswith('/'):
    in_path += '/'
if not out_path.endswith('/'):
    out_path += '/'

# Every XML file exactly one directory level below the input directory.
# glob.glob() returns matches in arbitrary order; sorting makes the order of
# documents in the output file reproducible between runs.
files = sorted(glob.glob(in_path + '*/*.xml'))

# Single output file that receives the whole cleaned corpus; it is closed at
# the end of the script, after all input files have been written.
f_output = open(out_path + 'full_clean.txt', 'w', encoding="utf-8")
# Three-letter annotation codes mapped to the full category names written to
# the output. Built once here instead of on every map_labels() call.
_LABEL_NAMES = {
    "Obj": "Object",
    "Met": "Method",
    "Bac": "Background",
    "Con": "Conclusion",
    "Res": "Result",
    "Goa": "Goal",
    "Mot": "Motivation",
    "Hyp": "Hypothesis",
    "Mod": "Model",
    "Exp": "Experiment",
    "Obs": "Observation",
}


def map_labels(label: str) -> str:
    """Return the full category name for a three-letter annotation code.

    Args:
        label: One of the codes "Obj", "Met", "Bac", "Con", "Res", "Goa",
            "Mot", "Hyp", "Mod", "Exp" or "Obs".

    Returns:
        The spelled-out category name, e.g. "Background" for "Bac".

    Raises:
        KeyError: If ``label`` is not one of the known codes.
    """
    return _LABEL_NAMES[label]
# Convert every input document into "label<TAB>sentence" lines in the output
# file. The try/finally guarantees the output file is closed (and its buffer
# flushed) even if one of the XML files fails to parse.
try:
    for filename in files:
        root = et.parse(filename).getroot()
        # Each <annotationART> element holds one sentence; its 'type'
        # attribute carries the three-letter category code.
        for sentence in root.iter('annotationART'):
            # .get() rather than ['type']: a missing attribute then yields
            # None and is skipped by the check below, instead of raising
            # KeyError before the check can run.
            label = sentence.attrib.get('type')
            # Concatenate all text inside the element, including the text of
            # any nested child elements.
            text = ''.join(sentence.itertext())
            # Skip sentences without a label or without any text.
            if label and text:
                f_output.write(map_labels(label) + "\t" + text + "\n")
        # A blank line marks the end of the current document.
        f_output.write("\n")
finally:
    f_output.close()