Skip to content

Commit

Permalink
Bring @paolap AUS2200 builder work into main repository (#253)
Browse files Browse the repository at this point in the history
* added draft of mopper builder

* first attempt to introduce a mopper builder

* updated mopper builder and added test and test data

* maybe solved date format issue

* fixed typo and datetime import

* Mopper Builder class is now working

* added config files

---------

Co-authored-by: Paola Petrelli <[email protected]>
Co-authored-by: Paola Petrelli <[email protected]>
  • Loading branch information
3 people authored Nov 14, 2024
1 parent a0dcb66 commit 4390267
Show file tree
Hide file tree
Showing 12 changed files with 254 additions and 4 deletions.
2 changes: 1 addition & 1 deletion ci/environment-3.11.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: access-nri-intake-test
name: access-nri
channels:
- conda-forge
- accessnri
Expand Down
18 changes: 18 additions & 0 deletions config/access-mopper.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
builder: MopperBuilder

translator: DefaultTranslator

sources:

- path:
- /g/data/ua8/AUS2200/mjo-elnino
metadata_yaml: /g/data/ua8/Working/packages/access-nri-intake-catalog/config/metadata.yaml
ensemble: true
# These are added attributes
fpattern: "{version}/{frequency}/{variable}/{variable}_{model}_{member}_{frequency}"
toselect:
- variable
- frequency
- version
- member

29 changes: 29 additions & 0 deletions config/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: AUS2200
experiment_uuid: thisdoesntexistsIdoIknowwhattoputifIdontknowhowtogenerateonewhichisdifferentfromwhateverisalreadyhere
description: |
AUS2200 simulations collections. Each simulation is a limited area model study of the entire Australian continent at 2.2 km resolution, using the UM atmospheric model.
This dataset is part of the AUS2200 simulations collection at NCI.
long_description: |
"There's a total of 9 simulations:
ashwed80 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1980-02-18 to 1980-02-21, including the 1980 Ash Wednesday bushfires.
ashwed83 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1983-02-15 to 1983-02-17, including the 1983 Ash Wednesday bushfires."
model: "AUS2200 (UM v12.2)"
frequency:
- subhr
- 1hr
nominal_resolution:
- atmos = 2.2 km
version: 1
contact: Dale Roberts
email: [email protected]
created: null
reference: null
license: https://creativecommons.org/licenses/by/4.0/
url: null
parent_experiment: null
related_experiments:
- null
notes: |
null
keywords:
- null
1 change: 0 additions & 1 deletion src/access_nri_intake/catalog/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ def build_esm(
f"An Intake-ESM datastore already exists for {name}. To overwrite, "
"pass `overwrite=True` to CatalogBuilder.build_esm"
)

builder = builder(path, **kwargs).build()
builder.save(name=name, description=description, directory=directory)

Expand Down
3 changes: 2 additions & 1 deletion src/access_nri_intake/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _parse_build_inputs(config_yamls, build_path):

if translator:
source_args["translator"] = getattr(translators, translator)

args.append((method, source_args | kwargs))

return args
Expand Down Expand Up @@ -208,6 +208,7 @@ def _get_project(path):
storage_flags = "+".join(sorted([f"gdata/{proj}" for proj in project]))

# Build the catalog
#cm = CatalogManager(path=metacatalog_path, extra=args['extra'])
cm = CatalogManager(path=metacatalog_path)
for method, args in parsed_sources:
logger.info(f"Adding '{args['name']}' to metacatalog '{metacatalog_path}'")
Expand Down
173 changes: 172 additions & 1 deletion src/access_nri_intake/source/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@

import xarray as xr
from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder
from pathlib import Path
from textwrap import wrap
from datetime import datetime

from ..utils import validate_against_schema
from . import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN
#PP check if I still need this?
from .utils import (
EmptyFileError,
_AccessNCFileInfo,
Expand Down Expand Up @@ -167,7 +171,7 @@ def validate_parser(self):
raise ValueError(
"asset list provided is None. Please run `.get_assets()` first"
)

for asset in self.assets:
info = self.parser(asset)
if INVALID_ASSET not in info:
Expand Down Expand Up @@ -556,3 +560,170 @@ class AccessCm2Builder(AccessEsm15Builder):
rf"^iceh.*\.({PATTERNS_HELPERS['ym']})-{PATTERNS_HELPERS['not_multi_digit']}.*", # ACCESS-CM2 ice
r"^.*\.p.(\d{6})_.*", # ACCESS-CM2 atmosphere
]

class MopperBuilder(BaseBuilder):
    """Intake-ESM datastore builder for ACCESS-MOPPeR processed data."""

    def __init__(self, path, ensemble, fpattern, toselect):
        """
        Initialise a MopperBuilder

        Parameters
        ----------
        path: str or list of str
            Path or list of paths to crawl for assets/files.
        ensemble: boolean
            Whether to treat each path as a separate member of an ensemble to join
            along a new member dimension
        fpattern: str
            The pattern used by mopper to encode info in the filename and path,
            e.g. "{version}/{frequency}/{variable}/{variable}_{model}_{member}_{frequency}"
        toselect: list
            List of attribute names (fpattern placeholders) to extract from each
            asset's filename/path and add to the datastore columns
        """

        kwargs = dict(
            path=path,
            depth=5,
            exclude_patterns=[],
            include_patterns=["*.nc*"],
            data_format="netcdf",
            groupby_attrs=["file_id", "frequency"],
            aggregations=[
                {
                    "type": "join_existing",
                    "attribute_name": "start_date",
                    "options": {
                        "dim": "time",
                        "combine": "by_coords",
                    },
                },
            ],
        )

        if ensemble:
            kwargs["aggregations"] += [
                {
                    "type": "join_new",
                    "attribute_name": "member",
                },
            ]

        super().__init__(**kwargs)
        self.fpattern = fpattern
        self.toselect = toselect

    def parser(self, fpath):
        """
        Parse info from a MOPPeR-produced netcdf file path into a datastore entry.

        The directory and filename components of ``self.fpattern`` are converted
        into regexes with named capture groups, matched against ``fpath``, and
        the attributes listed in ``self.toselect`` are extracted (filename match
        takes precedence over the directory match).

        Parameters
        ----------
        fpath: str
            The path to the netcdf file

        Returns
        -------
        dict
            Datastore entry info, or a dict containing ``INVALID_ASSET`` and
            ``TRACEBACK`` keys if the asset could not be parsed (the convention
            checked by ``validate_parser``).
        """
        try:
            basedir = self.paths[0]
            # Split fpattern into directory and filename parts and turn each
            # "{name}" placeholder into a named regex capture group. Path
            # components may not contain "/", filename fields may not contain "_".
            filepat = self.fpattern.split("/")[-1]
            dirpat = "/".join(self.fpattern.split("/")[:-1])
            dirpat = dirpat.replace("{", "(?P<").replace("}", ">[^/]+)")
            filepat = filepat.replace("{", "(?P<").replace("}", ">[^_]+)")

            dir_re = re.compile("^/" + dirpat + "/", re.VERBOSE)
            # Raw string: "\." must not be interpreted as a (invalid) string escape
            file_re = re.compile(
                "^" + filepat + r"_?(?P<date_range>.*)?\.nc", re.VERBOSE
            )

            fname = str(Path(fpath).name)
            fbase = str(Path(fpath).parent)
            fbase = fbase.replace(basedir, "") + "/"
            dir_match = dir_re.match(fbase)
            file_match = file_re.match(fname)
            if dir_match is None or file_match is None:
                raise ValueError(
                    f"Path '{fpath}' does not match fpattern '{self.fpattern}'"
                )
            dir_groups = dir_match.groupdict()
            file_groups = file_match.groupdict()

            exargs = {}
            # The date_range group is optional: normalise a missing/None match to ""
            exargs["date_range"] = file_groups.get("date_range") or ""
            for attr in self.toselect:
                # Prefer the value encoded in the filename, fall back to the path
                exargs[attr] = file_groups.get(attr, "unknown")
                if exargs[attr] == "unknown":
                    exargs[attr] = dir_groups.get(attr, "unknown")
            if "frequency" in exargs:
                # Drop the CMOR instantaneous suffix, e.g. "1hrPt" -> "1hr"
                exargs["frequency"] = exargs["frequency"].replace("Pt", "")
            exargs.setdefault("realm", "unknown")

            # NB all files coming out of mopper have a single variable,
            # however the catalogue seems to expect a multi-variable file.
            # In my opinion this is limiting the usefulness of catalogues for
            # the many collections that have 1 variable per file!
            # Also file_id is required, but we're not using it as these aren't
            # multi-variable files, so we can/should aggregate on variable not
            # file_id; hence variable is used as file_id instead.
            nc_info, exargs = self.parse_ncfile(fpath, exargs)
            ncinfo_dict = nc_info.to_dict()
            ncinfo_dict.update(exargs)
            return ncinfo_dict
        except Exception:
            import traceback

            # Follow the ecgtools convention so validate_parser can flag this asset
            return {INVALID_ASSET: fpath, TRACEBACK: traceback.format_exc()}

    @classmethod
    def parse_ncfile(cls, fpath, exargs):
        """
        Get Intake-ESM datastore entry info from an ACCESS netcdf file.

        CMOR has its own base date format whose length depends on frequency.

        Parameters
        ----------
        fpath: str
            The path to the netcdf file
        exargs: dict
            Stores extra arguments such as frequency, date_range, variable etc,
            derived from fpattern. The 'date_range', 'variable' and 'frequency'
            keys are consumed (popped) here.

        Returns
        -------
        output_nc_info: _AccessNCFileInfo
            A dataclass containing the information parsed from the file
        exargs: dict
            The remaining extra arguments, to be merged into the datastore entry
        """
        time_format = "%Y-%m-%d, %H:%M:%S"
        # Get the strptime format for dates based on the date string length:
        # dformat is the longest possible CMOR datetime format; each 2-char
        # directive consumes 2 digits except %Y which consumes 4, so the
        # needed format length is always len(date_string) - 2.
        dformat = "%Y%m%d%H%M%S"
        date_range = exargs.pop("date_range")
        if date_range == "":
            start_date = "none"
            end_date = "none"
        else:
            ts, te = date_range.split("-")
            cmor_format = dformat[: (len(ts) - 2)]
            ts = datetime.strptime(ts, cmor_format)
            start_date = ts.strftime(time_format)
            te = datetime.strptime(te, cmor_format)
            end_date = te.strftime(time_format)

        variable = exargs.pop("variable")
        with xr.open_dataset(
            fpath,
            chunks={},
            decode_cf=False,
            decode_times=False,
            decode_coords=False,
        ) as ds:
            attrs = ds[variable].attrs
            variable_long_name = attrs.get("long_name", "unknown")
            variable_standard_name = attrs.get("standard_name", "unknown")
            variable_cell_methods = attrs.get("cell_methods", "unknown")
            variable_units = attrs.get("units", "unknown")
            tracking_id = ds.attrs.get("tracking_id", "unknown")

        output_nc_info = _AccessNCFileInfo(
            filename=Path(fpath).name,
            path=fpath,
            file_id=tracking_id,
            filename_timestamp=date_range,
            frequency=exargs.pop("frequency"),
            start_date=start_date,
            end_date=end_date,
            variable=[variable],
            variable_long_name=[variable_long_name],
            variable_standard_name=[variable_standard_name],
            variable_units=[variable_units],
            variable_cell_methods=[variable_cell_methods],
        )

        return output_nc_info, exargs

1 change: 1 addition & 0 deletions tests/data/access-mopper/README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
These files were copied from /g/data/ua8/AUS2200/ashwed83/ and then cdo was used to select only a small region using sellatlonbox
Binary file not shown.
Binary file not shown.
Binary file not shown.
30 changes: 30 additions & 0 deletions tests/data/access-mopper/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: AUS2200
experiment_uuid: thisdoesntexistsIdoIknowwhattoputifIdontknowhowtogenerateonewhichisdifferentfromwhateverisalreadyhere
description: |
AUS2200 simulations collections. Each simulation is a limited area model study of the entire Australian continent at 2.2 km resolution, using the UM atmospheric model.
This dataset is part of the AUS2200 simulations collection at NCI.
long_description: |
"There's a total of 9 simulations:
ashwed80 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1980-02-18 to 1980-02-21, including the 1980 Ash Wednesday bushfires.
ashwed83 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1983-02-15 to 1983-02-17, including the 1983 Ash Wednesday bushfires."
model:
- AUS2200 (UM v12.2)
frequency:
- subhr
- 1hr
nominal_resolution:
- atmos = 2.2 km
version: 1
contact: Dale Roberts
email: [email protected]
created: null
reference: null
license: https://creativecommons.org/licenses/by/4.0/
url: null
parent_experiment: null
related_experiments:
- null
notes: |
null
keywords:
- null
1 change: 1 addition & 0 deletions tests/test_builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
),
(["access-esm1-5"], "AccessEsm15Builder", {"ensemble": False}, 11, 11, 11),
(["access-om3"], "AccessOm3Builder", {}, 12, 12, 6),
(["access-mopper"], "MopperBuilder", {"ensemble": True}, 2, 2, 1),
],
)
def test_builder_build(
Expand Down

0 comments on commit 4390267

Please sign in to comment.