"""===========================================================
   CoNLL-U i/o for BabyLemmatizer 2.0

   asahala 2023
   https://github.com/asahala

   University of Helsinki
   Origins of Emesal Project
   Centre of Excellence for Ancient Near-Eastern Empires
==========================================================="""

import os
import preprocessing

# Standard CoNLL-U column indices, followed by project-specific
# extra columns.
ID, FORM, LEMMA, UPOS, XPOS = 0, 1, 2, 3, 4
FEATS, HEAD, DEPREL, DEPS, MISC = 5, 6, 7, 8, 9
ENG, NORM, CONTEXT, SCORE = 10, 11, 12, 13

""" End-of-unit symbol """
EOU = ('<EOU>', '<EOU>', '<EOU>')

def read_conllu(filename, only_data=False):
    """ Yield lines from a CoNLL-U file; with only_data=True,
    comment and empty lines are skipped """
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not only_data:
                yield line
            elif line and not line.startswith('#'):
                yield line

def write_conllu(filename, content):
    """ Write an iterable of CoNLL-U lines to a file """
    with open(filename, 'w', encoding='utf-8') as f:
        for c in content:
            f.write(c + '\n')
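
# Usage sketch (file names are hypothetical): round-trip a file, or
# keep only the word lines and split them into their columns.
#
#   write_conllu('copy.conllu', read_conllu('corpus.conllu'))
#   rows = [line.split('\t')
#           for line in read_conllu('corpus.conllu', only_data=True)]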

def get_contexts(data, context=1):
    """ Return POS contexts for each word in
    the CoNLL-U file

    :param data                 CoNLL-U file name
    :param context              context window size
    :type data                  str
    :type context               int

    Note: the P/T/N (previous/this/next) output format
    assumes context=1. """

    tag_sequence = ['<'] * context
    first = True
    window = context + 1 + context

    for fields in read_conllu(data, only_data=True):
        fields = fields.split('\t')
        if fields[ID] == '1' and not first:
            # Sentence boundary: close the previous sentence
            # and pad the start of the next one.
            tag_sequence.extend(['>'] * context)
            tag_sequence.extend(['<'] * context)
        tag_sequence.append(fields[UPOS])
        first = False

    tag_sequence.extend(['>'] * context)

    for e, tag in enumerate(tag_sequence):
        if tag not in '><':
            yield 'P={}|T={}|N={}'.format(
                *tag_sequence[e-context:e-context+window])
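
# For example, for a two-sentence file tagged N V and N N V with
# context=1, the padded tag sequence is < N V > < N N V > and the
# yielded contexts are:
#
#   P=<|T=N|N=V,  P=N|T=V|N=>,  P=<|T=N|N=N,  P=N|T=N|N=V,  P=N|T=V|N=>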

def add_fields(source_file, values, index):
    """ Overwrite column `index` of each word line with the next
    value from the `values` iterator; comment and empty lines pass
    through unchanged. Missing columns are padded with '_'. """
    for line in read_conllu(source_file):
        if line.startswith('#') or not line:
            yield line
        else:
            data = line.split('\t')
            if index >= len(data):
                data.extend(['_'] * (1 + index - len(data)))
            data[index] = next(values)
            yield '\t'.join(data)
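
# Usage sketch (file names are hypothetical): write POS contexts
# from get_contexts() into the project-specific CONTEXT column of
# a copy of the file.
#
#   ctx = get_contexts('corpus.conllu')
#   write_conllu('with-contexts.conllu',
#                add_fields('corpus.conllu', ctx, CONTEXT))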

def get_lexicon(filename):
    """ Parse override data from CoNLL-U file """
    yield EOU
    for line in read_conllu(filename):
        if line.startswith('#'):
            continue
        if line:
            data = line.split('\t')
            yield data[FORM], data[LEMMA], data[UPOS]
        else:
            yield EOU
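
# The generator yields (FORM, LEMMA, UPOS) triplets with <EOU>
# markers at unit boundaries, e.g. (lemma and tag here are
# illustrative only):
#
#   ('<EOU>', '<EOU>', '<EOU>'), ('šum-ma', 'šumma', 'PRP'), ...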

def get_training_data2(filename, preprocess=None):
    """ Parse training data from CoNLL-U file
    and preprocess it

    :param filename             CoNLL-U file to parse
    :param preprocess           preprocessing pipeline: a method that
                                takes one word's transliteration as
                                its argument
    :type filename              str
    :type preprocess            method

    Note: the file is expected to end in an empty line; otherwise
    the last word of the final unit is never yielded. """
    stack = [EOU]
    for line in read_conllu(filename):
        if line.startswith('#'):
            continue
        if line:
            data = line.split('\t')
            if preprocess is not None:
                data[FORM] = preprocess(data[FORM])
            data[FORM] = preprocessing.get_chars(data[FORM])
            stack.append((data[FORM], data[LEMMA], data[UPOS]))
        else:
            stack.append(EOU)
        if len(stack) == 3:
            if stack[1] != EOU:
                yield tuple(stack)
            else:
                yield EOU
            stack.pop(0)
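
# The window is three items wide: each word is yielded with its
# neighbours, and a bare EOU separates units. E.g. for two
# single-word units w1 and w2 the generator yields:
#
#   (EOU, w1, EOU), EOU, (EOU, w2, EOU)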

def make_conllu(final_results, source_conllu, output_conllu):
    """ Merge annotations with existing CoNLL-U file

    :param final_results        lemmatizer's final output file name,
                                or an iterable of result rows
    :param source_conllu        original input CoNLL-U file
    :param output_conllu        output CoNLL-U file for annotations """
    only_data = False
    if isinstance(final_results, str):
        with open(final_results, 'r', encoding='utf-8') as f:
            results = f.read().splitlines()
    else:
        only_data = True
        results = [f'{line[2]}\t{line[3]}' for line in final_results]

    with open(source_conllu, 'r', encoding='utf-8') as f,\
         open(output_conllu, 'w', encoding='utf-8') as output:
        for line in f.read().splitlines():
            if not line:
                output.write(line + '\n')
                if not only_data:
                    # File-based results contain the blank separator
                    # lines as well; keep the two in sync.
                    results.pop(0)
            elif line.startswith('#'):
                output.write(line + '\n')
            else:
                line = line.split('\t')
                lemma, pos = results.pop(0).split('\t')
                line[LEMMA] = lemma
                line[UPOS] = pos
                line[XPOS] = pos
                output.write('\t'.join(line) + '\n')
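
# Usage sketch (file names are hypothetical): the results file is
# expected to hold lemma<TAB>pos rows aligned one-to-one with the
# word lines (and empty lines) of the source file.
#
#   make_conllu('final_results.txt', 'input.conllu', 'output.conllu')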

def upl_to_conllu(upl_file, output):
    """ Convert unit-per-line format into CoNLL-U

    :param upl_file             upl file name
    :param output               CoNLL-U file name

    Example of the input format (line-by-line):

       šum-ma a-wi-lum
       in DUMU a-wi-lim uh₂-ta-ap-pi-id
       in-šu u-hap-pa-du """

    head = {1: '0'}
    deprel = {1: 'root'}
    with open(upl_file, 'r', encoding='utf-8') as f,\
         open(output, 'w', encoding='utf-8') as o:
        for line in f.read().splitlines():
            if not line.strip():
                # Skip empty input lines instead of emitting
                # bogus one-token units.
                continue
            i = 1
            for word in line.strip().split(' '):
                hh = head.get(i, '1')
                rr = deprel.get(i, 'child')
                o.write(f'{i}\t{word}\t_\t_\t_\t_\t{hh}\t{rr}\t_\t_\n')
                i += 1
            o.write('\n')
    print(f'> File converted to CoNLL-U and saved as {output}')
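
# For the input line 'šum-ma a-wi-lum' the written unit is
# (columns tab-separated in the actual output):
#
#   1  šum-ma    _  _  _  _  0  root   _  _
#   2  a-wi-lum  _  _  _  _  1  child  _  _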

def normalize_conllu(filename, output_filename):
    """ Normalize CoNLL-U file transliteration and lemmatization """
    # Materialize the input first so the output may safely
    # overwrite the input file.
    content = list(read_conllu(filename))
    with open(output_filename, 'w', encoding='utf-8') as f:
        for line in content:
            if line:
                line = line.split('\t')
                line[FORM] = line[FORM].replace('sz', 'š')
                line[FORM] = line[FORM].replace('SZ', 'Š')
                line[FORM] = line[FORM].replace('s,', 'ṣ')
                line[FORM] = line[FORM].replace('t,', 'ṭ')
                line[FORM] = preprocessing.lowercase_determinatives(line[FORM])
                line[FORM] = preprocessing.unify_h(line[FORM])
                line[FORM] = preprocessing.subscribe_indices(line[FORM])
                line[LEMMA] = preprocessing.unify_h(line[LEMMA])
                f.write('\t'.join(line) + '\n')
            else:
                f.write('\n')
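
# For example, ASCII substitutes in the transliteration are replaced
# with Unicode, so 'sza-at,-ru' becomes 'ša-aṭ-ru'; determinative
# lowercasing, ḫ-unification and index subscripts are delegated to
# the preprocessing module.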

def normalize_all(path):
    """ Normalize every .conllu file in `path`: lowercase
    determinatives and unify the special ḫ with plain h.
    Results are written to a `normalized` subdirectory. """
    try:
        os.mkdir(os.path.join(path, 'normalized'))
    except FileExistsError:
        pass

    files = (x for x in os.listdir(path) if x.endswith('.conllu'))
    for file in files:
        print(f'> normalizing {file}')
        fn = os.path.join(path, file)
        ofn = os.path.join(path, 'normalized', file)
        normalize_conllu(fn, ofn)
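
# Usage sketch (path is hypothetical): normalize_all('corpora/')
# writes normalized copies of corpora/*.conllu into
# corpora/normalized/.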

if __name__ == "__main__":
    pass