-
Notifications
You must be signed in to change notification settings - Fork 17
/
tupdate.py
executable file
·133 lines (109 loc) · 4.76 KB
/
tupdate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
"""
Apply updates made manually to a TSV file generated by tquery.py,
producing an updated JSON for the corpus.
Usage:
./tupdate.py streusle.json updates.tsv > streusle.new.json
Note that conllulex2csv.py and csv2conllulex.py should be used
if working with a spreadsheet editor rather than editing updates.tsv directly.
@author: Nathan Schneider (@nschneid)
@since: 2019-09-08
"""
import sys, json, fileinput, re
import shlex, subprocess
from itertools import chain
from conllulex2json import print_json
from mwerender import makelabel
from tquery import ALL_FIELDS, LEX_LEVEL_FIELDS
def tupdate(jsonPath, fields, updates_table):
    """
    Modify data associated with particular tokens in the corpus loaded from jsonPath,
    and return the updated dict to be serialized to JSON.

    `updates_table` must contain "_sentid" and "_tokoffset" for all records.
    At present, only the fields "ss", "ss2", and "lexcat" are eligible for modification;
    other fields present in `updates` will be checked to ensure there is no change
    (unknown fields will trigger a warning).

    :param jsonPath: path to the corpus JSON file (a list of sentence dicts)
    :param fields: column names from the updates TSV header, in order
    :param updates_table: list of dicts, one per TSV row, mapping field name -> value
    :return: the corpus data (list of sentence dicts) with updates applied
    :raises ValueError: if a non-lexical-level known field is supplied for updating
    """
    assert '_sentid' in fields
    assert '_tokoffset' in fields

    # Decide which columns to skip: unknown fields (with a warning) and the
    # display-only _context column. Known fields that are not lexical-level
    # cannot be updated by this implementation and are rejected outright.
    ignores = []
    for fld in fields:
        if fld not in ('_sentid', '_tokoffset', '_context'):
            if fld not in ALL_FIELDS:
                print(f'Warning: unknown field {fld!r} will be ignored', file=sys.stderr)
                ignores.append(fld)
            elif fld not in LEX_LEVEL_FIELDS:
                raise ValueError(f'''In the present implementation, only fields of the lexical expression,
such as lexlemma, ss, ss2, and lexcat, may be provided to tupdate(): {fld!r}''')
    if '_context' in fields:
        ignores.append('_context')

    # Index the update records for O(1) lookup: sentId -> tokOffset -> field -> value
    updates = {}
    for record in updates_table:
        sentId = record.pop('_sentid')
        tokOffset = record.pop('_tokoffset')
        assert sentId
        assert tokOffset
        if sentId not in updates:
            updates[sentId] = {}
        assert tokOffset not in updates[sentId],f'Cannot have multiple updates to the same token: {sentId}:{tokOffset}'
        updates[sentId][tokOffset] = {**record}

    with open(jsonPath, encoding='utf-8') as inF:
        data = json.load(inF)

    for sent in data:
        if sent["sent_id"] not in updates:
            continue
        # Visit every lexical expression (single-word and multiword) in the sentence
        for lexe in chain(sent["swes"].values(), sent["smwes"].values()):
            toknums = lexe["toknums"]
            # Render the token span the same way tquery.py does: a contiguous
            # multi-token span as "min-max", anything else comma-separated
            if 1 < len(toknums) == max(toknums)-min(toknums)+1:
                _tokoffset = f'{min(toknums)}-{max(toknums)}'
            else:
                _tokoffset = ','.join(map(str,lexe["toknums"]))
            changed = False
            for k,v in updates[sent["sent_id"]].get(_tokoffset, {}).items():
                if v == '':
                    v = None    # empty TSV cell means "no value"
                if k in ignores:
                    continue
                elif k in ('ss', 'ss2', 'lexcat'):
                    # update value!
                    if lexe[LEX_LEVEL_FIELDS[k]] != v:
                        changed = True
                        lexe[LEX_LEVEL_FIELDS[k]] = v
                else:   # locked field: confirm value hasn't changed
                    # BUG FIX: report the sentence/token actually being checked;
                    # previously this interpolated the stale sentId/tokOffset left
                    # over from the record-indexing loop above (always the last row)
                    assert lexe[LEX_LEVEL_FIELDS[k]]==v,f'Changes to this field not currently supported: {k!r}={v!r} in {sent["sent_id"]}:{_tokoffset}'
            if changed:
                # now we need to update the lextag(s) for consistency
                # we assume the MWE part hasn't changed
                lexcat_ss = makelabel(lexe).replace(':', '|')
                # get the first token of the lexical expression, whose lextag
                # encodes supersense and lexcat info
                tok1 = sent["toks"][toknums[0]-1]
                fulllextag = tok1['lextag']
                mwepart = fulllextag[:fulllextag.index('-')]
                fulllextag = f'{mwepart}-{lexcat_ss}'
                tok1['lextag'] = fulllextag
    return data
if __name__=='__main__':
    # Usage: ./tupdate.py streusle.json updates.tsv > streusle.new.json
    jsonFP, updatesFP = sys.argv[1:]

    rows = []
    with open(updatesFP, encoding='utf-8') as inF:
        # ignore all but last header row, which contains column headers (field names)
        lastHeader = next(inF)
        assert lastHeader.startswith('#'),lastHeader
        for ln in inF:
            # BUG FIX: strip only a trailing newline. The previous ln[:-1]
            # chopped the last character unconditionally, corrupting the final
            # row of a file that lacks a trailing newline.
            ln = ln.rstrip('\n')
            if ln.startswith('#'):
                lastHeader = ln
            else:
                rows.append(ln.split('\t'))
    fields = lastHeader[1:].strip().split('\t')

    # Convert each raw row into a field-name -> value record
    table = []
    for row in rows:
        record = {f: row[i] for i,f in enumerate(fields)}
        if '' in record:
            del record['']  # ignore header-less columns
        table.append(record)

    updated = tupdate(jsonFP, fields, table)
    print_json(updated)
    print(f'{len(rows)} update rows processed', file=sys.stderr)