diff --git a/ci/environment-3.11.yml b/ci/environment-3.11.yml index a1aecacf..cf5717bf 100644 --- a/ci/environment-3.11.yml +++ b/ci/environment-3.11.yml @@ -1,4 +1,4 @@ -name: access-nri-intake-test +name: access-nri channels: - conda-forge - accessnri diff --git a/config/access-mopper.yaml b/config/access-mopper.yaml new file mode 100644 index 00000000..88d36ddb --- /dev/null +++ b/config/access-mopper.yaml @@ -0,0 +1,18 @@ +builder: MopperBuilder + +translator: DefaultTranslator + +sources: + + - path: + - /g/data/ua8/AUS2200/mjo-elnino + metadata_yaml: /g/data/ua8/Working/packages/access-nri-intake-catalog/config/metadata.yaml + ensemble: true +# These are added attributes + fpattern: "{version}/{frequency}/{variable}/{variable}_{model}_{member}_{frequency}" + toselect: + - variable + - frequency + - version + - member + diff --git a/config/metadata.yaml b/config/metadata.yaml new file mode 100644 index 00000000..4d51e3cc --- /dev/null +++ b/config/metadata.yaml @@ -0,0 +1,29 @@ +name: AUS2200 +experiment_uuid: thisdoesntexistsIdoIknowwhattoputifIdontknowhowtogenerateonewhichisdifferentfromwhateverisalreadyhere +description: | + AUS2200 simulations collections. Each simulation is a limited area model study of the entire Australian continent at 2.2 km resolution, using the UM atmospheric model. + This dataset is part of the AUS2200 simulations collection at NCI. +long_description: | + "There's a total of 9 simulations: + ashwed80 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1980-02-18 to 1980-02-21, including the 1980 Ash Wednesday bushfires. + ashwed83 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1983-02-15 to 1983-02-17, including the 1983 Ash Wednesday bushfires." 
+model: "AUS2200 (UM v12.2)" +frequency: + - subhr + - 1hr +nominal_resolution: + - atmos = 2.2 km +version: 1 +contact: Dale Roberts +email: cws_help@nci.org.au +created: null +reference: null +license: https://creativecommons.org/licenses/by/4.0/ +url: null +parent_experiment: null +related_experiments: + - null +notes: | + null +keywords: + - null diff --git a/src/access_nri_intake/catalog/manager.py b/src/access_nri_intake/catalog/manager.py index 37b6cf01..249b326d 100644 --- a/src/access_nri_intake/catalog/manager.py +++ b/src/access_nri_intake/catalog/manager.py @@ -107,7 +107,6 @@ def build_esm( f"An Intake-ESM datastore already exists for {name}. To overwrite, " "pass `overwrite=True` to CatalogBuilder.build_esm" ) - builder = builder(path, **kwargs).build() builder.save(name=name, description=description, directory=directory) diff --git a/src/access_nri_intake/cli.py b/src/access_nri_intake/cli.py index c2073105..08ccf766 100644 --- a/src/access_nri_intake/cli.py +++ b/src/access_nri_intake/cli.py @@ -68,7 +68,7 @@ def _parse_build_inputs(config_yamls, build_path): if translator: source_args["translator"] = getattr(translators, translator) - + args.append((method, source_args | kwargs)) return args @@ -208,6 +208,7 @@ def _get_project(path): storage_flags = "+".join(sorted([f"gdata/{proj}" for proj in project])) # Build the catalog + #cm = CatalogManager(path=metacatalog_path, extra=args['extra']) cm = CatalogManager(path=metacatalog_path) for method, args in parsed_sources: logger.info(f"Adding '{args['name']}' to metacatalog '{metacatalog_path}'") diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 70c7cf9b..8ec56d6b 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -11,9 +11,13 @@ import xarray as xr from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder +from pathlib import Path +from textwrap import wrap +from datetime import datetime from 
class MopperBuilder(BaseBuilder):
    """Intake-ESM datastore builder for ACCESS-MOPPeR processed data"""

    def __init__(self, path, ensemble, fpattern, toselect):
        """
        Initialise a MopperBuilder

        Parameters
        ----------
        path: str or list of str
            Path or list of paths to crawl for assets/files.
        ensemble: boolean
            Whether to treat each path as a separate member of an ensemble to join
            along a new member dimension
        fpattern: str
            Template describing how mopper encodes information in the directory
            structure (relative to `path`) and in the file name. Fields are written
            as `{name}` and directories are separated by "/", e.g.
            "{version}/{frequency}/{variable}/{variable}_{model}_{member}_{frequency}"
        toselect: list of str
            Names of the `fpattern` fields to add to each datastore entry
        """

        aggregations = [
            {
                "type": "join_existing",
                "attribute_name": "start_date",
                "options": {
                    "dim": "time",
                    "combine": "by_coords",
                },
            },
        ]
        if ensemble:
            aggregations.append(
                {
                    "type": "join_new",
                    "attribute_name": "member",
                }
            )

        super().__init__(
            path=path,
            depth=5,
            exclude_patterns=[],
            include_patterns=["*.nc*"],
            data_format="netcdf",
            groupby_attrs=["file_id", "frequency"],
            aggregations=aggregations,
        )
        self.fpattern = fpattern
        self.toselect = toselect

    @staticmethod
    def _template_to_regex(template, separator):
        """
        Convert a `{field}` template into a regular expression string.

        Every `{field}` becomes a named group matching one or more characters
        other than `separator`; all remaining text is matched literally.
        """
        # re.split with a single capturing group alternates literal text (even
        # indices) with the captured field names (odd indices)
        pieces = re.split(r"\{(\w+)\}", template)
        return "".join(
            f"(?P<{piece}>[^{separator}]+)" if idx % 2 else re.escape(piece)
            for idx, piece in enumerate(pieces)
        )

    def parser(self, fpath):
        """
        Parse a mopper output file into an Intake-ESM datastore entry

        The directory of the file (relative to the crawled path containing it)
        and the file name are matched against `self.fpattern`; the fields listed
        in `self.toselect` are taken from the file name first and from the
        directory otherwise, defaulting to "unknown".

        Parameters
        ----------
        fpath: str
            The path to the netcdf file

        Returns
        -------
        dict
            The datastore entry, or a dictionary with the INVALID_ASSET and
            TRACEBACK keys if the file could not be parsed
        """
        # Local import: only needed to report a failed parse
        import traceback

        try:
            dir_template, _, file_template = self.fpattern.rpartition("/")

            # Directory part, anchored at the crawled root. A template without
            # directories only has to match the root itself ("/").
            dir_regex = "^/"
            if dir_template:
                dir_regex += self._template_to_regex(dir_template, "/") + "/"
            # File name part, optionally followed by "_<date_range>"
            file_regex = (
                "^"
                + self._template_to_regex(file_template, "_")
                + r"_?(?P<date_range>.*)?\.nc"
            )

            # Find the crawled root that contains this file, so that every path
            # of a multi-path (ensemble) builder is handled, and strip it as a
            # prefix only
            parent = Path(fpath).parent
            for root in self.paths:
                try:
                    relative = parent.relative_to(root)
                except ValueError:
                    continue
                break
            else:
                raise ValueError(f"{fpath} is not located under any of {self.paths}")
            rel_dir = "/".join(relative.parts)
            fbase = f"/{rel_dir}/" if rel_dir else "/"

            dir_match = re.match(dir_regex, fbase)
            file_match = re.match(file_regex, Path(fpath).name)
            if dir_match is None or file_match is None:
                raise ValueError(
                    f"{fpath} does not match the file pattern {self.fpattern!r}"
                )
            dir_fields = dir_match.groupdict()
            file_fields = file_match.groupdict()

            exargs = {"date_range": file_fields.get("date_range") or ""}
            for field in self.toselect:
                value = file_fields.get(field, "unknown")
                if value == "unknown":
                    value = dir_fields.get(field, "unknown")
                exargs[field] = value
            if "frequency" in exargs:
                # Drop the "Pt" suffix from the frequency, e.g. "subhrPt" -> "subhr"
                exargs["frequency"] = exargs["frequency"].replace("Pt", "")
            exargs.setdefault("realm", "unknown")

            # NB all files coming out of mopper have a single variable, whereas
            # the catalogue is designed around multi-variable files and requires
            # a file_id to group on.
            # NOTE(review): an earlier comment here said the variable is used as
            # file_id, but parse_ncfile sets file_id from the "tracking_id"
            # global attribute - confirm which one is intended.
            nc_info, exargs = self.parse_ncfile(fpath, exargs)
            entry = nc_info.to_dict()
            entry.update(exargs)
            return entry
        except Exception:
            # Report the failure in the form validate_parser checks for
            # (INVALID_ASSET in the returned dictionary) instead of raising
            return {INVALID_ASSET: fpath, TRACEBACK: traceback.format_exc()}

    @classmethod
    def parse_ncfile(cls, fpath, exargs):
        """
        Get Intake-ESM datastore entry info from an ACCESS netcdf file
        CMOR has its own base date format, length depends on frequency

        Parameters
        ----------
        fpath: str
            The path to the netcdf file
        exargs: dict
            Extra arguments derived from fpattern. Must contain the keys
            "date_range", "variable" and "frequency". The dictionary passed in
            is not modified.

        Returns
        -------
        output_nc_info: _AccessNCFileInfo
            A dataclass containing the information parsed from the file
        exargs: dict
            A copy of the input with "date_range", "variable" and "frequency"
            removed

        Raises
        ------
        KeyError
            If "date_range", "variable" or "frequency" is missing from exargs
        ValueError
            If the date range is not of the form "<start>-<end>" or the dates
            cannot be parsed
        """
        time_format = "%Y-%m-%d, %H:%M:%S"
        # Longest possible datetime format for cmor; the actual format is
        # derived from the length of the dates
        longest_format = "%Y%m%d%H%M%S"

        # Work on a copy so the caller's dictionary is left untouched
        exargs = dict(exargs)
        date_range = exargs.pop("date_range")
        if date_range == "":
            start_date = "none"
            end_date = "none"
        else:
            first, last = date_range.split("-")
            # Every directive is 2 characters long and consumes 2 digits, except
            # %Y which consumes 4: the format is 2 characters shorter than the date
            cmor_format = longest_format[: len(first) - 2]
            start_date = datetime.strptime(first, cmor_format).strftime(time_format)
            end_date = datetime.strptime(last, cmor_format).strftime(time_format)

        variable = exargs.pop("variable")
        frequency = exargs.pop("frequency")
        with xr.open_dataset(
            fpath,
            chunks={},
            decode_cf=False,
            decode_times=False,
            decode_coords=False,
        ) as ds:
            attrs = ds[variable].attrs
            variable_long_name = attrs.get("long_name", "unknown")
            variable_standard_name = attrs.get("standard_name", "unknown")
            variable_cell_methods = attrs.get("cell_methods", "unknown")
            variable_units = attrs.get("units", "unknown")
            tracking_id = ds.attrs.get("tracking_id", "unknown")

        output_nc_info = _AccessNCFileInfo(
            filename=Path(fpath).name,
            path=fpath,
            file_id=tracking_id,
            filename_timestamp=date_range,
            frequency=frequency,
            start_date=start_date,
            end_date=end_date,
            variable=[variable],
            variable_long_name=[variable_long_name],
            variable_standard_name=[variable_standard_name],
            variable_units=[variable_units],
            variable_cell_methods=[variable_cell_methods],
        )

        return output_nc_info, exargs
b/tests/data/access-mopper/metadata.yaml @@ -0,0 +1,30 @@ +name: AUS2200 +experiment_uuid: thisdoesntexistsIdoIknowwhattoputifIdontknowhowtogenerateonewhichisdifferentfromwhateverisalreadyhere +description: | + AUS2200 simulations collections. Each simulation is a limited area model study of the entire Australian continent at 2.2 km resolution, using the UM atmospheric model. + This dataset is part of the AUS2200 simulations collection at NCI. +long_description: | + "There's a total of 9 simulations: + ashwed80 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1980-02-18 to 1980-02-21, including the 1980 Ash Wednesday bushfires. + ashwed83 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1983-02-15 to 1983-02-17, including the 1983 Ash Wednesday bushfires." +model: + - AUS2200 (UM v12.2) +frequency: + - subhr + - 1hr +nominal_resolution: + - atmos = 2.2 km +version: 1 +contact: Dale Roberts +email: cws_help@nci.org.au +created: null +reference: null +license: https://creativecommons.org/licenses/by/4.0/ +url: null +parent_experiment: null +related_experiments: + - null +notes: | + null +keywords: + - null diff --git a/tests/test_builders.py b/tests/test_builders.py index 29bc4e2c..fc63d495 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -28,6 +28,7 @@ ), (["access-esm1-5"], "AccessEsm15Builder", {"ensemble": False}, 11, 11, 11), (["access-om3"], "AccessOm3Builder", {}, 12, 12, 6), + (["access-mopper"], "MopperBuilder", {"ensemble": True}, 2, 2, 1), ], ) def test_builder_build(