-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathconll2tei.py
92 lines (80 loc) · 2.36 KB
/
conll2tei.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os,sys,re,traceback
def help():
    """Write the command-line synopsis to stderr and flush the stream."""
    usage_lines = (
        "synopsis: conll2tei.py [-h|-?|-help] [-] [FILE1..n]",
        "-h print this message",
        "- read CDLI-CoNLL from stdin",
        "FILEi read CDLI-CoNLL from file",
        "Read CDLI-CoNLL files from stdin or args, write one TEI/XML file to stdout.",
        "If no arguments or - are provided, read from stdin.",
    )
    err = sys.stderr
    # Usage goes to stderr so it never mixes with the XML written to stdout.
    err.write("\n".join(usage_lines) + "\n")
    err.flush()
# Command-line arguments after the script name; when none are given, fall back
# to the help flag followed by "-" (stdin).
files = sys.argv[1:] or ["-h", "-"]
# Opening of the output document: XML declaration, outer <TEI> element, a
# header recording this script as the converter, and the opening <teiCorpus>.
header = "\n".join((
    '<?xml version="1.0"?>',
    "<TEI>",
    "<teiHeader>",
    "<revisionDesc>",
    '<change who="conll2tei.py" when="">Converted from CDLI-CoNLL</change>',
    "</revisionDesc>",
    "</teiHeader>",
    "<teiCorpus>",
))

# Closing counterpart of `header`: ends <teiCorpus> and the outer <TEI>.
footer = " </teiCorpus>\n</TEI>\n"
def convert_conll(lines, source_name):
    """Yield the TEI/XML output lines for one tab-separated CoNLL input.

    Args:
        lines: iterable of text lines (an open file, sys.stdin, or a list).
        source_name: name recorded in the ``<note n="orgfile">`` element.

    Yields:
        One output line at a time, without trailing newline: the per-input
        ``<TEI>`` preamble, then ``<s>`` / ``<tok>`` elements, then the
        closing tags.

    Input handling:
        * lines starting with ``#`` are skipped;
        * an empty line closes the currently open sentence, if any;
        * a line with at least 7 tab-separated fields becomes one ``<tok>``:
          field 0 -> id, 1 -> element text, 2 -> lemma, 4 -> xpos,
          5 -> head, 6 -> deprel;
        * any other line is ignored.

    Sentence ids are consecutive (``s-1``, ``s-2``, ...). All inserted values
    are XML-escaped so the result stays well-formed.
    """
    # Local import: the file-level import line does not provide this name.
    from xml.sax.saxutils import escape

    def quoted(value):
        # Attribute values are always wrapped in double quotes, so a literal
        # double quote inside the value must be escaped as well.
        return '"' + escape(value, {'"': "&quot;"}) + '"'

    yield " <TEI>"
    yield "<teiHeader>"
    yield "<notesStmt>"
    yield f'<note n="orgfile">{escape(source_name)}</note>'
    yield "</notesStmt>"
    yield "</teiHeader>"
    yield "<text>"

    sentence_no = 0
    in_sentence = False
    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith("#"):
            continue
        if line == "":
            if in_sentence:
                yield " </s>"
                in_sentence = False
            continue
        if "\t" not in line:
            continue
        fields = line.split("\t")
        if len(fields) < 7:
            continue
        if not in_sentence:
            # Count a sentence only when it is opened, so ids do not skip.
            sentence_no += 1
            in_sentence = True
            yield f' <s id="s-{sentence_no}">'
        attributes = {
            "id": "w-" + fields[0],
            "lemma": fields[2],
            "xpos": fields[4],
            "head": "w-" + fields[5],
            "deprel": fields[6],
        }
        rendered = " ".join(
            f"{key}={quoted(val)}" for key, val in attributes.items()
        )
        yield f" <tok {rendered}>{escape(fields[1])}</tok>"

    # Input may end without a trailing blank line; close the last sentence.
    if in_sentence:
        yield " </s>"
    yield " </text>"
    yield "</TEI>"


def main():
    """Convert every argument in ``files`` and print one TEI/XML document.

    Each argument is handled as follows:
        * an existing path is opened and converted;
        * an argument made only of dashes reads from stdin;
        * a help flag (dashes followed by ``?``, ``h`` or ``help``) prints the
          usage text and is otherwise skipped;
        * anything else reports an error on stderr and exits with status 1.

    The shared ``header`` is printed before the first converted input and
    ``footer`` after the last one; if nothing was converted, nothing is
    written to stdout.
    """
    wrote_header = False
    for arg in files:
        if os.path.exists(arg):
            sys.stderr.write(f'reading from "{arg}"\n')
            stream = open(arg, "rt", errors="ignore")
        elif re.match(r"^[\-]+$", arg):
            sys.stderr.write("reading from stdin\n")
            stream = sys.stdin
        elif re.match(r"^[\-]+(\?|h|help)$", arg):
            # A help flag has no input to convert; move on to the next argument.
            help()
            continue
        else:
            sys.stderr.write(f'could not open file "{arg}"\n')
            sys.exit(1)
        sys.stderr.flush()

        try:
            if not wrote_header:
                print(header)
                wrote_header = True
            for out_line in convert_conll(stream, arg):
                print(out_line)
        finally:
            # Close files opened here, even on error; never close stdin.
            if stream is not sys.stdin:
                stream.close()

    if wrote_header:
        print(footer)


if __name__ == "__main__":
    main()