-
Notifications
You must be signed in to change notification settings - Fork 17
/
tupdate.py
executable file
·133 lines (109 loc) · 4.76 KB
/
tupdate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
"""
Apply updates made manually to a TSV file generated by tquery.py,
producing an updated JSON for the corpus.
Usage:
./tupdate.py streusle.json updates.tsv > streusle.new.json
Note that conllulex2csv.py and csv2conllulex.py should be used
if working with a spreadsheet editor rather than editing updates.tsv directly.
@author: Nathan Schneider (@nschneid)
@since: 2019-09-08
"""
import sys, json, fileinput, re
import shlex, subprocess
from itertools import chain
from conllulex2json import print_json
from mwerender import makelabel
from tquery import ALL_FIELDS, LEX_LEVEL_FIELDS
def tupdate(jsonPath, fields, updates_table):
    """
    Modify data associated with particular tokens in the corpus loaded from jsonPath,
    and return the updated dict to be serialized to JSON.

    `updates_table` must contain "_sentid" and "_tokoffset" for all records.
    At present, only the fields "ss", "ss2", and "lexcat" are eligible for modification;
    other fields present in `updates` will be checked to ensure there is no change
    (unknown fields will trigger a warning).

    :param jsonPath: path to the corpus JSON file (a list of sentence dicts)
    :param fields: column names from the updates TSV header, in order
    :param updates_table: list of dicts, one per TSV row, mapping field name -> value
    :return: the corpus data (list of sentence dicts) with updates applied
    :raises ValueError: if a non-lexical-level known field is supplied for updating
    """
    assert '_sentid' in fields
    assert '_tokoffset' in fields

    # Decide which columns to skip: unknown fields (with a warning) and the
    # display-only _context column. Known fields that are not lexical-level
    # cannot be updated by this implementation and are rejected outright.
    ignores = []
    for fld in fields:
        if fld not in ('_sentid', '_tokoffset', '_context'):
            if fld not in ALL_FIELDS:
                print(f'Warning: unknown field {fld!r} will be ignored', file=sys.stderr)
                ignores.append(fld)
            elif fld not in LEX_LEVEL_FIELDS:
                raise ValueError(f'''In the present implementation, only fields of the lexical expression,
such as lexlemma, ss, ss2, and lexcat, may be provided to tupdate(): {fld!r}''')
    if '_context' in fields:
        ignores.append('_context')

    # Index the update records for O(1) lookup: sentId -> tokOffset -> field -> value
    updates = {}
    for record in updates_table:
        sentId = record.pop('_sentid')
        tokOffset = record.pop('_tokoffset')
        assert sentId
        assert tokOffset
        if sentId not in updates:
            updates[sentId] = {}
        assert tokOffset not in updates[sentId],f'Cannot have multiple updates to the same token: {sentId}:{tokOffset}'
        updates[sentId][tokOffset] = {**record}

    with open(jsonPath, encoding='utf-8') as inF:
        data = json.load(inF)

    for sent in data:
        if sent["sent_id"] not in updates:
            continue
        # Visit every lexical expression (single-word and multiword) in the sentence
        for lexe in chain(sent["swes"].values(), sent["smwes"].values()):
            toknums = lexe["toknums"]
            # Render the token span the same way tquery.py does: a contiguous
            # multi-token span as "min-max", anything else comma-separated
            if 1 < len(toknums) == max(toknums)-min(toknums)+1:
                _tokoffset = f'{min(toknums)}-{max(toknums)}'
            else:
                _tokoffset = ','.join(map(str,lexe["toknums"]))
            changed = False
            for k,v in updates[sent["sent_id"]].get(_tokoffset, {}).items():
                if v == '':
                    v = None    # empty TSV cell means "no value"
                if k in ignores:
                    continue
                elif k in ('ss', 'ss2', 'lexcat'):
                    # update value!
                    if lexe[LEX_LEVEL_FIELDS[k]] != v:
                        changed = True
                        lexe[LEX_LEVEL_FIELDS[k]] = v
                else:   # locked field: confirm value hasn't changed
                    # BUG FIX: report the sentence/token actually being checked;
                    # previously this interpolated the stale sentId/tokOffset left
                    # over from the record-indexing loop above (always the last row)
                    assert lexe[LEX_LEVEL_FIELDS[k]]==v,f'Changes to this field not currently supported: {k!r}={v!r} in {sent["sent_id"]}:{_tokoffset}'
            if changed:
                # now we need to update the lextag(s) for consistency
                # we assume the MWE part hasn't changed
                lexcat_ss = makelabel(lexe).replace(':', '|')
                # get the first token of the lexical expression, whose lextag
                # encodes supersense and lexcat info
                tok1 = sent["toks"][toknums[0]-1]
                fulllextag = tok1['lextag']
                mwepart = fulllextag[:fulllextag.index('-')]
                fulllextag = f'{mwepart}-{lexcat_ss}'
                tok1['lextag'] = fulllextag
    return data
if __name__=='__main__':
    # Usage: ./tupdate.py streusle.json updates.tsv > streusle.new.json
    jsonFP, updatesFP = sys.argv[1:]

    rows = []
    with open(updatesFP, encoding='utf-8') as inF:
        # ignore all but last header row, which contains column headers (field names)
        lastHeader = next(inF)
        assert lastHeader.startswith('#'),lastHeader
        for ln in inF:
            # BUG FIX: strip only a trailing newline. The previous ln[:-1]
            # chopped the last character unconditionally, corrupting the final
            # row of a file that lacks a trailing newline.
            ln = ln.rstrip('\n')
            if ln.startswith('#'):
                lastHeader = ln
            else:
                rows.append(ln.split('\t'))
    fields = lastHeader[1:].strip().split('\t')

    # Convert each raw row into a field-name -> value record
    table = []
    for row in rows:
        record = {f: row[i] for i,f in enumerate(fields)}
        if '' in record:
            del record['']  # ignore header-less columns
        table.append(record)

    updated = tupdate(jsonFP, fields, table)
    print_json(updated)
    print(f'{len(rows)} update rows processed', file=sys.stderr)