-
Notifications
You must be signed in to change notification settings - Fork 0
/
S_08_mark_etym.py
197 lines (140 loc) · 9.86 KB
/
S_08_mark_etym.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python3
'''
SCRIPT 08:
Script for tagging information in the etymological section (grammatical information and word forms).
Used package:
re (see: https://docs.python.org/3/library/re.html)
'''
# === Imports ===
import re
import S_01_helpers as helpers
# === Functions ===
def seperate_hi_italics(tei):
'''Splits the contant of <hi rend="italics"> when separated by commas.'''
n = 0
for n in range (0,5):
tei = re.sub(r'(<hi rend="italics">[^<^,]+)(,)([^<]+</hi>)', r'\1</hi>\2<hi rend="italics">\3', tei)
n =+ 1
return tei
def mark_pos_etym(tei, pos_csv):
''' Annotates grammatical information in the etymological section.'''
data_wortarten = helpers.read_csv(pos_csv, 'abbr')
pattern = '(<hi rend="italics">)([^<]+)(</hi>)'
tei = helpers.mark_pattern_df(pattern, data_wortarten, " ", tei)
# moving <hi> in front of <gramGrp>
tei = re.sub(r'(?P<begin><hi rend="italics">)(?P<keep>[^<]*)(?P<gram><gramGrp>(<gram[^<]+</gram>\/?)+</gramGrp>[^<]*)(?P<end></hi>)', r'\g<begin>\g<keep>\g<end>\g<gram>', tei)
# deleting empty <hi>-elements
tei = re.sub(r'<hi rend="italics">\s*</hi>', r'', tei)
# annotating grammatical person and number and moving into <gramGrp>
tei = re.sub(r'(\d\.\s)(<hi rend="italics">)(<gramGrp>)(<gram [^>]+>(Sg.|Pl.)</gram>)', r'\2\3<gram type="person">\1</gram>\4', tei)
# merging two following <gramGrp>s
tei = re.sub(r'</gramGrp>\s{1,2}<gramGrp>', r'', tei)
return tei
def move_hi_from_gram(tei):
'''Moves <gramGrp> out of <hi rend=italics>.'''
### different cases according to content of <hi>:
# case 1: word form + gramGrp
tei = re.sub(r'(<hi rend="italics">[^<]+)(<gramGrp>(<gram[^<]+</gram>\/?){1,3}</gramGrp>\s{0,2})(</hi>)', r'\1\4\2', tei)
# case 2: word form + gramGrp + 'u.ä.' or 'Pl.'
tei = re.sub(r'(<hi rend="italics">[^<]+)(<gramGrp>(<gram[^<]+</gram>\/?){1,3}</gramGrp>\s{1,2}(u\.ä|Pl\.))(</hi>)', r'\1\5\2', tei)
# case 3: gramGrp + word form
tei = re.sub(r'(<hi rend="italics">)\s?(<gramGrp>(<gram[^<]+</gram>){1,3}</gramGrp>)([^<]+</hi>)', r'\2\1\4', tei)
# case 4: only gramGroup in <hi>
tei = re.sub(r'(<hi rend="italics">)(<gramGrp>(<gram[^<]+</gram>){1,3}</gramGrp>)(</hi>)', r'\2', tei)
return tei
def mark_orth(tei):
'''Marks word forms (remaining content of <hi rend="italics">) with <orth>.'''
tei = re.sub(r'((?<!<orth>)<hi rend="italics">[^<]+</hi>)', r'<orth>\1</orth>', tei)
# exception: no <orth> in case of references
tei = re.sub(r'(<ref type="entry">)<orth>(<hi rend="italics">[^>]+</hi>)</orth>(</ref>)', r'\1\2\3', tei)
return tei
def mark_form(tei):
''' Inserting '<form xml:lang="xxx">' around <orth> or <orth> + following <gramGrp>.'''
# <orth> + <gramGrp>
tei = re.sub(r'(<orth><hi rend="italics">[^<]+</hi></orth><gramGrp>(<gram[^<]+</gram>\/?)*</gramGrp>)', r'<form xml:lang="xxx">\1</form>', tei)
# <lang> in front of <orth>
tei = re.sub(r'(<lang[^<]+</lang>[^<]*)(<orth><hi rend="italics">[^<]+</hi></orth>)', r'\1<form xml:lang="xxx">\2</form>', tei)
# text in front of <orth>
tei = re.sub(r'([^<^>]+)(<orth><hi rend="italics">[^<]+</hi></orth>)', r'\1<form xml:lang="xxx">\2</form>', tei)
# </hi> oder </sense> in front of <orth>
tei = re.sub(r'(</hi>|</sense>)(<orth><hi rend="italics">[^<]+</hi></orth>)', r'\1<form xml:lang="xxx">\2</form>', tei)
# <lang> or ',' + <gramGrp> + <orth>
tei = re.sub(r'(</lang>|,)(<gramGrp>(<gram [^<]+</gram>)*</gramGrp><orth><hi rend="italics">[^<]+</hi></orth>)', r'\1<form xml:lang="xxx">\2</form>', tei)
# deleting empty <orth>-elements
tei = re.sub(r'<orth><hi rend="italics">\s?</hi></orth>', r'', tei)
return tei
def mark_cit(tei):
'''Inserts '<cit type="etymologicalForm">' around <form> + possibly preceding <lang>.'''
### deleting whitespace between elements
tei = re.sub(r'</lang>\s{1,2}<form', r'</lang><form', tei)
tei = re.sub(r'</form>\s{1,2}<def>', r'</form><def>', tei)
### inserting <cit> + <mark/> (in order to prevent the future finding of already tagged cases)
# case 1a: lang + form (1. orth, 2. gramGrp) + def
tei = re.sub(r'(<lang[^<]+</lang>([^<]+)?)(<form[^>]+><orth><hi[^<]+</hi></orth>(<gramGrp>(<gram [^<]+</gram>)*</gramGrp>)?</form>\s?<def>[^<]+</def>(,\s<gramGrp>(<gram [^<]+</gram>)*</gramGrp>\s<def>[^<]+</def>)?)', r'<cit type="etymologicalForm">\1<mark/>\3</cit>', tei)
# case 1b: lang + form (1. gramGrp, 2. orth) + def
tei = re.sub(r'(<lang[^<]+</lang>([^<]+)?)(<form[^>]+><gramGrp>(<gram [^<]+</gram>)*</gramGrp><orth><hi[^<]+</hi></orth></form>\s?<def>[^<]+</def>)', r'<cit type="etymologicalForm">\1<mark/>\3</cit>', tei)
# case 2a: lang + form (1. orth, 2. gramGrp)
tei = re.sub(r'((?<!>)(<lang[^<]+</lang>)+)(<form[^>]+><orth><hi[^<]+</hi></orth>(<gramGrp>(<gram [^<]+</gram>)*</gramGrp>)?</form>)', r'<cit type="etymologicalForm">\1<mark/>\3</cit>', tei)
# case 2b: </hi> between lang and <form>
tei = re.sub(r'(?<!>)(<lang[^<]+</lang>([^<]+)?)(</hi>)(<form[^>]+><orth><hi[^<]+</hi></orth>(<gramGrp>(<gram [^<]+</gram>)*</gramGrp>)?</form>)', r'\3<cit type="etymologicalForm">\1<mark/>\4</cit>', tei)
# case 2c: lang + form (1. gramGrp, 2. orth)
tei = re.sub(r'((?<!>)<lang[^<]+</lang>([^<]+)?)(<form[^>]+><gramGrp>(<gram [^<]+</gram>)*</gramGrp><orth><hi[^<]+</hi></orth></form>)', r'<cit type="etymologicalForm">\1<mark/>\3</cit>', tei)
# case 3: lang + form (gram seperated by "/")
tei = re.sub(r'(<lang[^<]+</lang>[^<]*<form[^>]+><orth([^<]|<hi[^<]+</hi>)+</orth><gramGrp><gram [^<]+</gram>(/<gram[^<]+</gram>)+</gramGrp></form>)', r'<cit type="etymologicalForm">\1<mark/></cit>', tei)
### move variants (several belonging forms) in one single <cit>
# cit is followed by <form> (and possibly) <def>
tei = re.sub(r'(</cit>)((\s?,<form[^>]+><orth><hi[^<]+</hi></orth>(<gramGrp>(<gram [^<]+</gram>\/?)*</gramGrp>)?</form>)+(\s?<def>[^<]+</def>)?)', r'\2\1',tei)
# case: order: 1. gramGrp, 2. orth
tei = re.sub(r'(</cit>)((\s?,<form[^>]+><gramGrp>(<gram [^<]+</gram>)*</gramGrp><orth><hi[^<]+</hi></orth></form>)+(\s?<def>[^<]+</def>)?)', r'\2\1',tei)
# special case: variant in '()' + gramGrp (optional) + def (optional)
tei = re.sub(r'(</cit>\s?)(\(oder\s)(<form[^>]+><orth><hi[^<]+</hi></orth></form>\)(\s?<gramGrp>(<gram [^<]+</gram>)+</gramGrp>)?(\s?<def>[^<]+</def>)?)', r'\2<mark/>\3\1', tei)
### inserting <cit> - remaining cases
# case 4a: text + form + def (optional)
tei = re.sub(r'((?<![,>])<form xml:lang[^>]+><orth><hi[^<]+</hi></orth>(\s?<gramGrp>(<gram [^<]+</gram>)+</gramGrp>)?</form>(<def>[^<]+</def>)?)', r'<cit type="etymologicalForm">\1</cit>', tei)
# case 4b: </hi> in front of form
tei = re.sub(r'(</hi>)(<form xml:lang[^>]+><orth><hi[^<]+</hi></orth>(\s?<gramGrp>(<gram [^<]+</gram>)+</gramGrp>)?</form>(<def>[^<]+</def>)?)', r'\1<cit type="etymologicalForm">\2</cit>', tei)
# case 5: enumeration of <form>s (which aren't variants)
n = 0
for n in range(0,5):
tei = re.sub(r'(</cit>\s?,)((<form[^>]+><orth><hi[^<]+</hi></orth>(<gramGrp>(<gram [^<]+</gram>)*</gramGrp>)?</form>)+(\s?<def>[^<]+</def>)?)', r'\1<cit type="etymologicalForm">\2</cit>', tei)
n =+1
# delete <mark/>
tei = tei.replace('<mark/>', '')
return tei
def mark_variants(tei):
'''Adds attribute type="variant" to <form>-elements that are variants (several <form>-elements in one <cit>).'''
matches = re.finditer('(?P<begin><cit type="etymologicalForm"><lang[^<]+</lang>)(?P<form>(<form[^<]+\s?(<orth><hi[^<]+</hi></orth>)?(<gramGrp>(<gram [^<]+</gram>)+</gramGrp>)?(<orth><hi[^<]+</hi></orth>)?</form>\,?)+)(?P<end>(<def>[^<]+</def>)?</cit>)', tei)
for match in matches:
if match[0].count('<form') > 1:
repl = re.sub(r'(<form)', r'\1 type="variant"', match.group('form'))
replaceStr = match.group('begin') + repl + match.group('end')
tei = tei.replace(match[0], replaceStr)
return tei
def mark_cit_relatedForm(tei):
'''Inserts '<cit type="related"> around sublemmata in the etymological section.'''
### step 1: mask sublemmta in entry head by replacing the attribute value
matches = re.finditer('<form type="lemmaGroup"><form type="lemma"><orth>[^<]+(<hi[^<]+</hi>)?</orth></form>(\([^\)]+\)\s?)?\s?<gramGrp>(<gram [^<]+</gram>)*</gramGrp></form>(\s\([^\)]+\))?(,\s<form type="sublemma">)?', tei)
for match in matches:
repl = re.sub('sublemma', 'placeholder', match[0])
tei = tei.replace(match[0], repl)
### step 2: insert <cit>
tei = re.sub(r'(<form type="sublemma"><orth>[^<]+(<hi[^<]+</hi>)?</orth></form>(\s?<gramGrp>(<gram[^<]+</gram>)+</gramGrp>)?(\s?<def[^<]+</def>)?)', r'<cit type="relatedForm">\1</cit>', tei)
# sublemma is reference:
tei = re.sub(r'(<form type="sublemma"><orth><xr><lbl>↗</lbl><ref type="entry">[^<]+(<hi[^<]+</hi>)?</ref></xr></orth></form>)', r'<cit type="relatedForm">\1</cit>', tei)
tei = re.sub(r'(<form type="sublemma"><orth><xr><lbl>↗</lbl><ref type="entry"><hi[^<]+(<hi[^<]+</hi>)?</hi></ref></xr></orth></form>)', r'<cit type="relatedForm">\1</cit>', tei)
### step 3: undo masking
tei = tei.replace('placeholder', 'sublemma')
return tei
# === Coordinating function ===
def main(tei, pos_csv):
print("--- 08_mark_etym.py running")
tei = seperate_hi_italics(tei)
tei = mark_pos_etym(tei, pos_csv)
tei = move_hi_from_gram(tei)
tei = mark_orth(tei)
tei = mark_form(tei)
tei = mark_cit(tei)
tei = mark_variants(tei)
tei = mark_cit_relatedForm(tei)
print("... done!")
return tei