-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTranslationToMMS.py
129 lines (92 loc) · 3.83 KB
/
TranslationToMMS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import re
import pandas
import xml.etree.ElementTree as ET
# Default transition time between two consecutive glosses, in seconds
# (an average value). Used as the default `transition_duration` of `format_mms`.
DEFAULT_TRANSITION = 0.35
def _extract_text_from_xml(xml_tree) -> str:
    """Return the text enclosed in the first <value><string> tags of an XML tree.

    The first <value> element found below the root is taken, and the text of
    its first direct <string> child is returned.

    Args:
        xml_tree: A parsed XML tree exposing ``getroot()``, e.g. the result of
            ``xml.etree.ElementTree.parse``.

    Returns:
        The text of the <string> element. An empty element (``<string/>``)
        yields ``""`` rather than ``None``, so the result is always a ``str``.

    Raises:
        ValueError: If there is no <value> element, or the first <value>
            element has no <string> child. ``ValueError`` is a subclass of
            ``Exception``, so callers catching ``Exception`` still work.
    """
    root = xml_tree.getroot()
    value_element = root.find('.//value')
    string_element = None if value_element is None else value_element.find('string')
    if string_element is None:
        raise ValueError("Elements <value><string> not found")
    # Element.text is None for an empty element; normalise it so the declared
    # return type holds and callers can safely call str methods on the result.
    return string_element.text or ""
def _recapitalize_gloss(gloss: str) -> str:
    """Bring the case of a lower-case gloss up again where needed.

    The translator outputs everything in lower case. Strategy:
      1) if there is a colon (category specifier), everything up to and
         including the first colon is left untouched and only the part
         after it is capitalized;
      2) if that part contains a parenthesised group preceded by at least
         one character, the group is left as it is, with two exceptions:
         "(-ort)" becomes "(-Ort)" and "(-objekt)" becomes "(-Objekt)".
         Text before and after the group is upper-cased.

    Args:
        gloss: A single gloss, as produced by the translator.

    Returns:
        The gloss with its case restored.
    """
    # Parenthesised suffixes that do get a capital letter.
    special_groups = {
        "(-ort)": "(-Ort)",
        "(-objekt)": "(-Objekt)",
    }

    category, colon, content = gloss.partition(':')
    if colon:
        category += colon
    else:
        # No category specifier: the whole gloss is content.
        category, content = "", gloss

    # Raw string: "\(" is an invalid escape sequence in a plain literal.
    par_match_res = re.match(r"(.+)(\(.+\))", content)
    if par_match_res is None:
        return category + content.upper()

    out_par, in_par = par_match_res.groups()
    in_par = special_groups.get(in_par, in_par)
    # re.match is not anchored at the end, so anything following the closing
    # parenthesis is not part of the match; keep it instead of dropping it.
    tail = content[par_match_res.end():]
    return category + out_par.upper() + in_par + tail.upper()
def format_mms(in_text: str, transition_duration: float = DEFAULT_TRANSITION) -> pandas.DataFrame:
    """Compose an MMS instance from a sequence of glosses given as one string.

    The input is split on whitespace; each gloss is re-capitalized with
    `_recapitalize_gloss` and the conversion is printed to stdout. Every row
    of the result gets the same default values: frame start and end 0,
    duration '100%', the given transition time, and empty dominant /
    non-dominant glosses.

    Args:
        in_text: The glosses, separated by whitespace.
        transition_duration: Transition value written into every row.

    Returns:
        A dataframe with one row per gloss and the columns maingloss,
        framestart, frameend, duration, transition, domgloss, ndomgloss.
    """
    tokens = in_text.split()
    count = len(tokens)

    recapitalized = []
    for token in tokens:
        fixed = _recapitalize_gloss(token)
        print(f"{token} --> {fixed}")
        recapitalized.append(fixed)
    assert len(recapitalized) == count

    def repeated(value):
        # One copy of the same default value per gloss.
        return [value] * count

    columns = {
        'maingloss': recapitalized,
        'framestart': repeated(0),
        'frameend': repeated(0),
        'duration': repeated('100%'),
        'transition': repeated(transition_duration),
        'domgloss': repeated(''),
        'ndomgloss': repeated(''),
    }
    return pandas.DataFrame(data=columns)
#
#
if __name__ == "__main__":
    # Command-line entry point: read the translator output from an XML file,
    # format the contained gloss sequence as an MMS instance, save it as CSV.
    import argparse
    from pathlib import Path

    cli = argparse.ArgumentParser(
        description='Converts the lo-case output of the translator into an MMS instance.')
    cli.add_argument(
        '--in-xml', '-i',
        type=str,
        required=True,
        help='The filename with the text to format into an MMS, as outputted by the translator.')
    cli.add_argument(
        '--out-mms', '-o',
        type=str,
        required=True,
        help='The name of the file that will contain the MMS instance.')
    options = cli.parse_args()

    source_path = Path(options.in_xml)
    target_path = Path(options.out_mms)

    # Stop early, with a readable message, when the input file is missing.
    if not source_path.exists():
        raise Exception(f"File {str(source_path)} doesn't exist.")

    print(f"Parsing XML file '{source_path}'...")
    parsed_tree = ET.parse(source_path)
    gloss_text = _extract_text_from_xml(xml_tree=parsed_tree)
    print(f"Input text: '{gloss_text}'")

    mms_table: pandas.DataFrame = format_mms(in_text=gloss_text)
    print(f"Output MMS size: {len(mms_table)}")

    print(f"Saving output MMS to {target_path}...")
    mms_table.to_csv(path_or_buf=target_path, sep=",", header=True, index=False)

    print("All Done.")