-
Notifications
You must be signed in to change notification settings - Fork 0
/
corpus_explore_5.py
36 lines (26 loc) · 1.01 KB
/
corpus_explore_5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
""" Checking EDU->ADU match """
import xml.etree.ElementTree as ET
import os
CORPUS_PATH = "corpus/en/"


def check_file(xml_path):
    """Scan one corpus XML file and report irregular segmentation edges.

    Iterates over the direct children of the document root:

    * ``<edge type="seg">``: prints ``mismatch <src> <trg>`` when the target
      id is not ``"a"`` followed by the source id minus its first character
      (e.g. ``e3`` is expected to point at ``a3``).
      NOTE(review): presumably these edges link an EDU to its ADU, per the
      module docstring -- confirm against the corpus schema.
    * any other ``<edge>``: recorded as a relation, nothing is printed.
    * ``<joint>``: the element is printed so its presence is visible.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        A ``(seg_edges, rel_edges)`` tuple. ``seg_edges`` maps ``src`` to
        ``trg`` for segmentation edges; ``rel_edges`` maps ``src`` to
        ``{'trg': ..., 'type': ...}`` for every other edge. If a ``src``
        occurs more than once, the last edge wins.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        KeyError: If an ``<edge>`` lacks a ``type``, ``src`` or ``trg``
            attribute.
    """
    root = ET.parse(xml_path).getroot()
    seg_edges = {}
    rel_edges = {}
    for child in root:
        if child.tag == 'edge':
            # Read each attribute once instead of repeating the lookups.
            edge_type = child.attrib['type']
            src_id = child.attrib['src']
            trg_id = child.attrib['trg']
            if edge_type == 'seg':
                seg_edges[src_id] = trg_id
                if trg_id != f"a{src_id[1:]}":
                    print("mismatch", src_id, trg_id)
            else:
                rel_edges[src_id] = {'trg': trg_id, 'type': edge_type}
        elif child.tag == 'joint':
            print(child)
    return seg_edges, rel_edges


def main(corpus_path=CORPUS_PATH):
    """Run :func:`check_file` on every ``.xml`` file directly in a directory.

    Args:
        corpus_path: Directory to scan (not recursive). Defaults to
            ``CORPUS_PATH``. A trailing path separator is optional because
            the file path is built with ``os.path.join``.

    Raises:
        FileNotFoundError: If ``corpus_path`` does not exist.
    """
    for name in os.listdir(corpus_path):
        if name.endswith(".xml"):
            check_file(os.path.join(corpus_path, name))


if __name__ == "__main__":
    main()