# correct_xml.py

import xml.etree.ElementTree as ET
import argparse
import logging
import subprocess
_log = logging.getLogger(__name__)


def add_element(tree, name, text):
    """
    Append an <att name="..."> child with the given text to an xml element
    :param tree: xml element to append to (modified in place)
    :param name: value of the name attribute of the new att element
    :param text: text of the new att element
    :return: tree, the same element that was passed in, with the item appended
    """
    new = ET.Element("att", {"name": name})
    new.text = text
    tree.append(new)
    # Return the parent so the function matches its documented contract;
    # callers that ignore the return value are unaffected.
    return tree


def correct_charset(root):
    """
    Make sure the dataset element declares a UTF-8 charset
    If root already has a direct charset child, the text of the first one is
    set to UTF-8. Otherwise, if root has an addAttributes child, a new charset
    element is inserted at child index 4. If root has neither, it is left
    untouched.
    :param root: dataset xml element, modified in place
    :return: None
    """
    # Look the element up instead of walking the children: the previous loop
    # inserted a second charset when addAttributes came before an existing
    # charset, and it mutated root while iterating over it.
    existing = root.find('charset')
    if existing is not None:
        existing.text = 'UTF-8'
        return
    if root.find('addAttributes') is not None:
        new = ET.Element("charset")
        new.text = 'UTF-8'
        # NOTE(review): index 4 is kept from the original code; presumably it
        # matches where ERDDAP expects charset among the leading settings.
        root.insert(4, new)


def sort_children_by(parent):
    """
    Reorder the items of parent in place, descending by datasetID attribute
    :param parent: xml element or list of xml elements to reorder
    :return: None
    """
    def dataset_id(item):
        return item.get("datasetID")

    ordered = list(parent)
    ordered.sort(key=dataset_id, reverse=True)
    # Slice assignment replaces the contents of parent without rebinding it
    parent[:] = ordered


def sort_by_datasetid(root):
    """
    Move the dataset children of root to the end, ordered by sort_children_by
    :param root: xml element whose direct dataset children are reordered
    :return: root, modified in place
    """
    # findall returns a list, so detaching the matches below is safe
    datasets = root.findall('dataset')
    for dataset in datasets:
        root.remove(dataset)
    sort_children_by(datasets)
    # Re-attach after any remaining non-dataset children
    root.extend(datasets)
    return root


def update_adcp(glider, mission):
    """
    Edit the generated ADCP dataset xml and recombine datasets.xml
    Reads the adcp xml for this glider and mission, corrects its charset,
    sets its datasetID, writes it to the ERDDAP parts directory and runs
    make_datasets.sh.
    :param glider: glider number
    :param mission: mission number
    :return: None
    """
    # The input file name uses the glider number as given, while the dataset
    # ID (and so the output file name) zero-pads it to three digits.
    padded_glider = str(glider).zfill(3)
    ds_name = f"adcp_SEA{padded_glider}_M{mission}"
    source_xml = f"/home/usrerddap/erddap/xml_edit/xml/adcp_SEA{glider}_M{mission}.xml"
    target_xml = f"/home/usrerddap/erddap/content/parts/{ds_name}.xml"

    tree = ET.parse(source_xml)
    dataset = tree.getroot()
    correct_charset(dataset)
    dataset.set("datasetID", ds_name)

    # fix indentation and write xml
    ET.indent(tree, '  ')
    tree.write(target_xml, encoding="utf-8", xml_declaration=True)

    _log.info("Recombining datasets.xml")
    rebuild_cmd = ['/usr/bin/bash', "/home/usrerddap/erddap/xml_edit/make_datasets.sh"]
    subprocess.check_call(rebuild_cmd)


def update_doc(glider, mission, kind):
    """
    Edit the xml generated by GenerateDatasetsXml.sh
    Corrects the charset, sets the datasetID, rewrites the global and
    per-variable addAttributes, reorders the dataVariable elements, writes the
    result to the ERDDAP parts directory and runs make_datasets.sh.
    :param glider: glider number
    :param mission: mission number
    :param kind: kind of dataset, used as the file name prefix (e.g. nrt or
        delayed); adcp is handed off to update_adcp
    :return: None
    :raises ValueError: if the glider or mission number parsed from the
        fileDir element does not match the arguments
    """
    if kind == "adcp":
        update_adcp(glider, mission)
        return
    document_loc = f"/home/usrerddap/erddap/xml_edit/xml/{kind}_SEA{glider}_M{mission}.xml"
    tree = ET.parse(document_loc)
    root = tree.getroot()
    correct_charset(root)
    # Update dataset name
    ds_name = f"{kind}_SEA{str(glider).zfill(3)}_M{mission}"
    root.attrib["datasetID"] = ds_name
    first_vars = []
    data_vars = []
    special_vars = ["longitude", "latitude", "time", "depth"]
    for child in root:
        if child.tag == "fileDir":
            # Check that data dir matches dataset name. The glider number is
            # read from the 4th-from-last path component (first three
            # characters dropped) and the mission number from the
            # 3rd-from-last (first character dropped).
            data_dir = child.text
            dir_parts = data_dir.split("/")
            glider_num = int(dir_parts[-4][3:])
            mission_num = int(dir_parts[-3][1:])
            # Raise explicitly: assert statements are skipped under python -O
            if glider_num != glider or mission_num != mission:
                raise ValueError(
                    f"fileDir {data_dir!r} does not match glider {glider} mission {mission}"
                )
        elif child.tag == "addAttributes":
            # fix for the dataset-level addAttributes
            edit_add_attrs(child)
        elif child.tag == "dataVariable":
            profile_index = False
            # Iterate over a snapshot because the profile index fix below
            # removes and appends children of this dataVariable.
            for grand_child in list(child):
                if grand_child.tag == "sourceName":
                    if grand_child.text == "profile_index":
                        profile_index = True
                    # Take the common selection variables and put them at the top
                    if grand_child.text in special_vars:
                        first_vars.append(child)
                    else:
                        data_vars.append(child)
                elif grand_child.tag == "addAttributes":
                    if profile_index:
                        # Fix for the profile index: replace its attributes
                        # wholesale. This relies on sourceName appearing before
                        # addAttributes inside the dataVariable.
                        child.remove(grand_child)
                        new_add = ET.Element("addAttributes")
                        add_element(new_add, "ioos_category", "Identifier")
                        add_element(new_add, "long_name", "Profile Index")
                        add_element(new_add, "cf_role", "timeseries_id")
                        child.append(new_add)
                    else:
                        # Correct addAttributes
                        _log.debug("Remove units from %s", child[0].text)
                        edit_datavar_add_attrs(grand_child)
    # remove data variables
    for child in root.findall('dataVariable'):
        root.remove(child)
    # re-append data variables in desired order: selection variables first, in
    # document order, then the rest sorted by sourceName
    for var in first_vars:
        root.append(var)
    vars_by_source = {}
    for var in data_vars:
        for child in var:
            if child.tag == "sourceName":
                vars_by_source[child.text] = var
    for source_name in sorted(vars_by_source):
        root.append(vars_by_source[source_name])

    # fix indentation and write xml
    ET.indent(tree, '  ')
    out = f"/home/usrerddap/erddap/content/parts/{ds_name}.xml"
    tree.write(out, encoding="utf-8", xml_declaration=True)
    _log.info("Recombining datasets.xml")
    subprocess.check_call(['/usr/bin/bash', "/home/usrerddap/erddap/xml_edit/make_datasets.sh"])


def edit_add_attrs(adds):
    """
    Rewrite the dataset-level addAttributes to describe a TimeSeries dataset
    Removes any cdm_trajectory_variables and subsetVariables items, then
    appends cdm_data_type, featureType, cdm_timeseries_variables and
    subsetVariables items.
    :param adds: addAttributes xml element, modified in place
    :return: None
    """
    stale_names = ("cdm_trajectory_variables", "subsetVariables")
    # Iterate over a snapshot: removing from the element while looping over it
    # directly skips the sibling after each removed item, which left an
    # adjacent subsetVariables in place and then duplicated it below.
    # .get is used so a child without a name attribute is simply kept.
    for child in list(adds):
        if child.get("name") in stale_names:
            adds.remove(child)

    add_element(adds, "cdm_data_type", "TimeSeries")
    add_element(adds, "featureType", "TimeSeries")
    add_element(adds, "cdm_timeseries_variables", "profile_index")
    add_element(adds, "subsetVariables", "profile_index")


def edit_datavar_add_attrs(adds):
    """
    Remove the units tags that ERDDAP adds.
    :param adds: addAttributes xml element of a data variable, modified in place
    :return: adds, the same element that was passed in
    """
    # Iterate over a snapshot: removing from the element while looping over it
    # directly skips the sibling after each removed item.
    # .get is used so a child without a name attribute is simply kept.
    for child in list(adds):
        if child.get("name") == "units":
            # The logging format string needs one placeholder per argument
            _log.debug("remove %s %s", child.get("name"), child.text)
            adds.remove(child)
    return adds


if __name__ == '__main__':
    # Command-line entry point: parse glider, mission and kind, log to
    # /data/log/<kind>.log and run update_doc.
    valid_kinds = ['nrt', 'delayed', 'adcp']
    parser = argparse.ArgumentParser(description='Add dataset to ERDDAP')
    parser.add_argument('glider', type=int, help='glider number, e.g. 70')
    parser.add_argument('mission', type=int, help='Mission number, e.g. 23')
    parser.add_argument('kind', type=str, help='Kind of dataset, must be nrt, delayed or adcp')
    args = parser.parse_args()
    if args.kind not in valid_kinds:
        # The message lists every accepted kind; it previously omitted adcp
        raise ValueError('kind must be nrt, delayed or adcp')
    logf = f'/data/log/{args.kind}.log'
    logging.basicConfig(filename=logf,
                        filemode='a',
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=logging.INFO,
                        datefmt='%Y-%m-%d %H:%M:%S')
    _log.info("Start add dataset SEA%s M%s to xml", args.glider, args.mission)
    update_doc(args.glider, args.mission, args.kind)
    _log.info("Complete add dataset SEA%s M%s to xml", args.glider, args.mission)