-
Notifications
You must be signed in to change notification settings - Fork 0
/
correct_xml.py
189 lines (168 loc) · 6.84 KB
/
correct_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import xml.etree.ElementTree as ET
import argparse
import logging
import subprocess
_log = logging.getLogger(__name__)
def add_element(tree, name, text):
    """
    Add an ``att`` element to an xml object

    :param tree: xml object to append to (modified in place)
    :param name: name attribute of the item
    :param text: text of the item to add
    :return: tree with appended item (the same object that was passed in)
    """
    new = ET.Element("att")
    new.attrib["name"] = name
    new.text = text
    tree.append(new)
    # The documented contract is to hand the parent back; callers that
    # ignore the return value are unaffected.
    return tree
def correct_charset(root):
    """
    Make sure the dataset declares a UTF-8 charset.

    The children of ``root`` are scanned in document order and only the first
    ``charset`` or ``addAttributes`` child is acted on: an existing ``charset``
    has its text overwritten with ``UTF-8``; if ``addAttributes`` is reached
    first, a new ``charset`` element is inserted at index 4 of ``root``.
    Nothing happens when neither tag is present.

    :param root: root element of the dataset xml (modified in place)
    :return: None
    """
    for node in root:
        if node.tag == 'charset':
            node.text = 'UTF-8'
            return
        if node.tag == 'addAttributes':
            charset = ET.Element("charset")
            charset.text = 'UTF-8'
            root.insert(4, charset)
            return
def sort_children_by(parent):
    """
    Reorder the items of ``parent`` in place by descending ``datasetID``.

    :param parent: mutable sequence (a list or an xml element) whose items
        provide ``get("datasetID")``
    :return: None
    """
    def dataset_id(item):
        return item.get("datasetID")

    ordered = sorted(parent, key=dataset_id, reverse=True)
    # Slice assignment replaces the content while keeping the same container.
    parent[:] = ordered
def sort_by_datasetid(root):
    """
    Move every ``dataset`` child of ``root`` to the end, sorted by datasetID.

    The direct ``dataset`` children are detached, ordered with
    ``sort_children_by`` and re-attached after whatever other children
    ``root`` has.

    :param root: xml element holding the ``dataset`` children (modified in place)
    :return: the same ``root`` element
    """
    datasets = root.findall('dataset')
    for dataset in datasets:
        root.remove(dataset)
    sort_children_by(datasets)
    root.extend(datasets)
    return root
def update_adcp(glider, mission):
    """
    Edit the adcp xml of one glider mission and recombine datasets.xml

    Parses ``adcp_SEA{glider}_M{mission}.xml`` from the xml_edit directory,
    corrects its charset, sets the ``datasetID`` attribute of the root element,
    writes the re-indented document to the ERDDAP ``content/parts`` directory
    and finally runs ``make_datasets.sh``.

    :param glider: glider number
    :param mission: mission number
    :return: None
    """
    source_xml = f"/home/usrerddap/erddap/xml_edit/xml/adcp_SEA{glider}_M{mission}.xml"
    document = ET.parse(source_xml)
    dataset = document.getroot()
    correct_charset(dataset)

    # The dataset id carries the glider number zero-padded to three digits,
    # unlike the input file name above.
    dataset_id = f"adcp_SEA{str(glider).zfill(3)}_M{mission}"
    dataset.set("datasetID", dataset_id)

    # Fix indentation, then write the xml part
    ET.indent(document, ' ')
    target_xml = f"/home/usrerddap/erddap/content/parts/{dataset_id}.xml"
    document.write(target_xml, encoding="utf-8", xml_declaration=True)

    _log.info("Recombining datasets.xml")
    recombine_cmd = ['/usr/bin/bash', "/home/usrerddap/erddap/xml_edit/make_datasets.sh"]
    subprocess.check_call(recombine_cmd)
def update_doc(glider, mission, kind):
    """
    Edit the xml generated by GenerateDatasetsXml.sh

    For ``kind == "adcp"`` the work is handed to ``update_adcp``. Otherwise the
    file ``{kind}_SEA{glider}_M{mission}.xml`` is parsed and:

    * the charset is corrected and the ``datasetID`` attribute is set,
    * ``fileDir`` is checked against the requested glider and mission,
    * the dataset level ``addAttributes`` are edited,
    * the ``addAttributes`` of every ``dataVariable`` are fixed,
    * the ``dataVariable`` elements are re-appended with longitude, latitude,
      time and depth first (document order) followed by the remaining
      variables sorted by ``sourceName``. A ``dataVariable`` without a
      ``sourceName`` child is not re-appended.

    The result is written to the ERDDAP ``content/parts`` directory and
    ``make_datasets.sh`` is run.

    :param glider: glider number
    :param mission: mission number
    :param kind: kind of dataset used in the file names (e.g. nrt or delayed);
        adcp is delegated to ``update_adcp``
    :return: None
    :raises AssertionError: if ``fileDir`` names another glider or mission
    """
    if kind == "adcp":
        update_adcp(glider, mission)
        return
    document_loc = f"/home/usrerddap/erddap/xml_edit/xml/{kind}_SEA{glider}_M{mission}.xml"
    tree = ET.parse(document_loc)
    root = tree.getroot()
    correct_charset(root)
    # Update dataset name
    ds_name = f"{kind}_SEA{str(glider).zfill(3)}_M{mission}"
    root.attrib["datasetID"] = ds_name

    special_vars = ("longitude", "latitude", "time", "depth")
    first_vars = []
    # Keyed by sourceName: gives the alphabetical ordering below, and a
    # repeated sourceName keeps only the last variable seen.
    data_vars = {}
    for child in root:
        if child.tag == "fileDir":
            # Check that data dir matches dataset name
            _check_file_dir(child.text, glider, mission)
        elif child.tag == "addAttributes":
            edit_add_attrs(child)
        elif child.tag == "dataVariable":
            _fix_data_variable(child)
            # Take the common selection variables and put them at the top
            for source in child.findall("sourceName"):
                if source.text in special_vars:
                    first_vars.append(child)
                else:
                    data_vars[source.text] = child

    # remove data variables
    for child in root.findall('dataVariable'):
        root.remove(child)
    # re-append data variables in desired order
    for var in first_vars:
        root.append(var)
    for source_name in sorted(data_vars):
        root.append(data_vars[source_name])

    # fix indentation and write xml
    ET.indent(tree, ' ')
    out = f"/home/usrerddap/erddap/content/parts/{ds_name}.xml"
    tree.write(out, encoding="utf-8", xml_declaration=True)
    _log.info("Recombining datasets.xml")
    subprocess.check_call(['/usr/bin/bash', "/home/usrerddap/erddap/xml_edit/make_datasets.sh"])


def _check_file_dir(data_dir, glider, mission):
    """Raise AssertionError unless the fileDir path names this glider and mission."""
    dir_parts = data_dir.split("/")
    # The 4th and 3rd path parts from the end are read as a 3-character
    # prefix + glider number and a 1-character prefix + mission number.
    glider_num = int(dir_parts[-4][3:])
    mission_num = int(dir_parts[-3][1:])
    # Raised explicitly (not via ``assert``) so the check survives ``python -O``
    # while callers still see the same exception type as before.
    if glider_num != glider or mission_num != mission:
        raise AssertionError(
            f"fileDir {data_dir} is for SEA{glider_num} M{mission_num}, "
            f"expected SEA{glider} M{mission}"
        )


def _fix_data_variable(variable):
    """Fix the addAttributes of one dataVariable element in place."""
    is_profile_index = any(
        source.text == "profile_index" for source in variable.findall("sourceName")
    )
    # Iterate over a snapshot: the profile_index branch removes and appends
    # children, which would otherwise disturb the running iteration.
    for grand_child in list(variable):
        if grand_child.tag != "addAttributes":
            continue
        if is_profile_index:
            # Fix for the profile index: replace its attributes wholesale
            variable.remove(grand_child)
            new_add = ET.Element("addAttributes")
            add_element(new_add, "ioos_category", "Identifier")
            add_element(new_add, "long_name", "Profile Index")
            add_element(new_add, "cf_role", "timeseries_id")
            variable.append(new_add)
        else:
            # Correct addAttributes
            _log.debug("Remove units from %s", variable[0].text)
            edit_datavar_add_attrs(grand_child)
def edit_add_attrs(adds):
    """
    Replace the trajectory/subset attributes with profile_index TimeSeries ones

    Every ``cdm_trajectory_variables`` and ``subsetVariables`` item is removed,
    then ``cdm_data_type``, ``featureType``, ``cdm_timeseries_variables`` and
    ``subsetVariables`` items are appended.

    :param adds: ``addAttributes`` xml element of the dataset (modified in place)
    :return: None
    """
    stale_names = ("cdm_trajectory_variables", "subsetVariables")
    # Iterate over a snapshot: removing from ``adds`` while looping over it
    # directly makes the loop skip the item that follows each removed one, so
    # two adjacent stale items would leave the second one behind.
    for child in list(adds):
        if child.get("name") in stale_names:
            adds.remove(child)
    add_element(adds, "cdm_data_type", "TimeSeries")
    add_element(adds, "featureType", "TimeSeries")
    add_element(adds, "cdm_timeseries_variables", "profile_index")
    add_element(adds, "subsetVariables", "profile_index")
def edit_datavar_add_attrs(adds):
    """
    Remove the units tags that ERDDAP adds.

    :param adds: ``addAttributes`` xml element of a data variable (modified in place)
    :return: the same ``adds`` element
    """
    # Iterate over a snapshot so that removing an item cannot make the loop
    # skip the one that follows it.
    for child in list(adds):
        if child.attrib["name"] == "units":
            # Placeholders are required: extra positional args without them
            # make the logging module report a formatting error.
            _log.debug("remove %s %s", child.attrib["name"], child.text)
            adds.remove(child)
    return adds
if __name__ == '__main__':
    # Command line entry point: parse glider, mission and kind, log to a
    # per-kind file under /data/log and update the dataset xml.
    parser = argparse.ArgumentParser(description='Add dataset to ERDDAP')
    parser.add_argument('glider', type=int, help='glider number, e.g. 70')
    parser.add_argument('mission', type=int, help='Mission number, e.g. 23')
    # ``choices`` lets argparse reject anything else with a usage message that
    # lists all three accepted kinds (the old text left out adcp).
    parser.add_argument('kind', type=str, choices=['nrt', 'delayed', 'adcp'],
                        help='Kind of dataset, must be nrt, delayed or adcp')
    args = parser.parse_args()
    logf = f'/data/log/{args.kind}.log'
    logging.basicConfig(filename=logf,
                        filemode='a',
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=logging.INFO,
                        datefmt='%Y-%m-%d %H:%M:%S')
    _log.info("Start add dataset SEA%s M%s to xml", args.glider, args.mission)
    update_doc(args.glider, args.mission, args.kind)
    _log.info("Complete add dataset SEA%s M%s to xml", args.glider, args.mission)