Skip to content

Commit

Permalink
Bring @paolap AUS2200 builder work into main repository (#253)
Browse files Browse the repository at this point in the history
* added draft of mopper builder

* first attempt to introduce a mopper builder

* updated mopper builder and added test and test data

* maybe solved date format issue

* fixed typo and datetime import

* Mopper Builder class is now working

* added config files

---------

Co-authored-by: Paola Petrelli <[email protected]>
Co-authored-by: Paola Petrelli <[email protected]>
  • Loading branch information
3 people authored Nov 14, 2024
1 parent a0dcb66 commit 4390267
Show file tree
Hide file tree
Showing 12 changed files with 254 additions and 4 deletions.
2 changes: 1 addition & 1 deletion ci/environment-3.11.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: access-nri-intake-test
name: access-nri
channels:
- conda-forge
- accessnri
Expand Down
18 changes: 18 additions & 0 deletions config/access-mopper.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
builder: MopperBuilder

translator: DefaultTranslator

sources:

- path:
- /g/data/ua8/AUS2200/mjo-elnino
metadata_yaml: /g/data/ua8/Working/packages/access-nri-intake-catalog/config/metadata.yaml
ensemble: true
# These are added attributes
fpattern: "{version}/{frequency}/{variable}/{variable}_{model}_{member}_{frequency}"
toselect:
- variable
- frequency
- version
- member

29 changes: 29 additions & 0 deletions config/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: AUS2200
experiment_uuid: thisdoesntexistsIdoIknowwhattoputifIdontknowhowtogenerateonewhichisdifferentfromwhateverisalreadyhere
description: |
AUS2200 simulations collections. Each simulation is a limited area model study of the entire Australian continent at 2.2 km resolution, using the UM atmospheric model.
This dataset is part of the AUS2200 simulations collection at NCI.
long_description: |
"There's a total of 9 simulations:
ashwed80 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1980-02-18 to 1980-02-21, including the 1980 Ash Wednesday bushfires.
ashwed83 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1983-02-15 to 1983-02-17, including the 1983 Ash Wednesday bushfires."
model: "AUS2200 (UM v12.2)"
frequency:
- subhr
- 1hr
nominal_resolution:
- atmos = 2.2 km
version: 1
contact: Dale Roberts
email: [email protected]
created: null
reference: null
license: https://creativecommons.org/licenses/by/4.0/
url: null
parent_experiment: null
related_experiments:
- null
notes: |
null
keywords:
- null
1 change: 0 additions & 1 deletion src/access_nri_intake/catalog/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ def build_esm(
f"An Intake-ESM datastore already exists for {name}. To overwrite, "
"pass `overwrite=True` to CatalogBuilder.build_esm"
)

builder = builder(path, **kwargs).build()
builder.save(name=name, description=description, directory=directory)

Expand Down
3 changes: 2 additions & 1 deletion src/access_nri_intake/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _parse_build_inputs(config_yamls, build_path):

if translator:
source_args["translator"] = getattr(translators, translator)

args.append((method, source_args | kwargs))

return args
Expand Down Expand Up @@ -208,6 +208,7 @@ def _get_project(path):
storage_flags = "+".join(sorted([f"gdata/{proj}" for proj in project]))

# Build the catalog
#cm = CatalogManager(path=metacatalog_path, extra=args['extra'])
cm = CatalogManager(path=metacatalog_path)
for method, args in parsed_sources:
logger.info(f"Adding '{args['name']}' to metacatalog '{metacatalog_path}'")
Expand Down
173 changes: 172 additions & 1 deletion src/access_nri_intake/source/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@

import xarray as xr
from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder
from pathlib import Path
from textwrap import wrap
from datetime import datetime

from ..utils import validate_against_schema
from . import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN
#PP check if I still need this?
from .utils import (
EmptyFileError,
_AccessNCFileInfo,
Expand Down Expand Up @@ -167,7 +171,7 @@ def validate_parser(self):
raise ValueError(
"asset list provided is None. Please run `.get_assets()` first"
)

for asset in self.assets:
info = self.parser(asset)
if INVALID_ASSET not in info:
Expand Down Expand Up @@ -556,3 +560,170 @@ class AccessCm2Builder(AccessEsm15Builder):
rf"^iceh.*\.({PATTERNS_HELPERS['ym']})-{PATTERNS_HELPERS['not_multi_digit']}.*", # ACCESS-CM2 ice
r"^.*\.p.(\d{6})_.*", # ACCESS-CM2 atmosphere
]

class MopperBuilder(BaseBuilder):
    """Intake-ESM datastore builder for ACCESS-MOPPeR processed data."""

    def __init__(self, path, ensemble, fpattern, toselect):
        """
        Initialise a MopperBuilder

        Parameters
        ----------
        path: str or list of str
            Path or list of paths to crawl for assets/files.
        ensemble: boolean
            Whether to treat each path as a separate member of an ensemble to join
            along a new member dimension
        fpattern: str
            The pattern used by mopper to encode info in the filename and path,
            e.g. "{version}/{frequency}/{variable}/{variable}_{model}_{member}_{frequency}"
        toselect: list
            List of attribute names (fpattern placeholders) to extract from each
            asset's filename/path and add to the datastore columns
        """

        kwargs = dict(
            path=path,
            depth=5,
            exclude_patterns=[],
            include_patterns=["*.nc*"],
            data_format="netcdf",
            groupby_attrs=["file_id", "frequency"],
            aggregations=[
                {
                    "type": "join_existing",
                    "attribute_name": "start_date",
                    "options": {
                        "dim": "time",
                        "combine": "by_coords",
                    },
                },
            ],
        )

        if ensemble:
            kwargs["aggregations"] += [
                {
                    "type": "join_new",
                    "attribute_name": "member",
                },
            ]

        super().__init__(**kwargs)
        self.fpattern = fpattern
        self.toselect = toselect

    def parser(self, fpath):
        """
        Parse info from a MOPPeR-produced netcdf file path into a datastore entry.

        The directory and filename components of ``self.fpattern`` are converted
        into regexes with named capture groups, matched against ``fpath``, and
        the attributes listed in ``self.toselect`` are extracted (filename match
        takes precedence over the directory match).

        Parameters
        ----------
        fpath: str
            The path to the netcdf file

        Returns
        -------
        dict
            Datastore entry info, or a dict containing ``INVALID_ASSET`` and
            ``TRACEBACK`` keys if the asset could not be parsed (the convention
            checked by ``validate_parser``).
        """
        try:
            basedir = self.paths[0]
            # Split fpattern into directory and filename parts and turn each
            # "{name}" placeholder into a named regex capture group. Path
            # components may not contain "/", filename fields may not contain "_".
            filepat = self.fpattern.split("/")[-1]
            dirpat = "/".join(self.fpattern.split("/")[:-1])
            dirpat = dirpat.replace("{", "(?P<").replace("}", ">[^/]+)")
            filepat = filepat.replace("{", "(?P<").replace("}", ">[^_]+)")

            dir_re = re.compile("^/" + dirpat + "/", re.VERBOSE)
            # Raw string: "\." must not be interpreted as a (invalid) string escape
            file_re = re.compile(
                "^" + filepat + r"_?(?P<date_range>.*)?\.nc", re.VERBOSE
            )

            fname = str(Path(fpath).name)
            fbase = str(Path(fpath).parent)
            fbase = fbase.replace(basedir, "") + "/"
            dir_match = dir_re.match(fbase)
            file_match = file_re.match(fname)
            if dir_match is None or file_match is None:
                raise ValueError(
                    f"Path '{fpath}' does not match fpattern '{self.fpattern}'"
                )
            dir_groups = dir_match.groupdict()
            file_groups = file_match.groupdict()

            exargs = {}
            # The date_range group is optional: normalise a missing/None match to ""
            exargs["date_range"] = file_groups.get("date_range") or ""
            for attr in self.toselect:
                # Prefer the value encoded in the filename, fall back to the path
                exargs[attr] = file_groups.get(attr, "unknown")
                if exargs[attr] == "unknown":
                    exargs[attr] = dir_groups.get(attr, "unknown")
            if "frequency" in exargs:
                # Drop the CMOR instantaneous suffix, e.g. "1hrPt" -> "1hr"
                exargs["frequency"] = exargs["frequency"].replace("Pt", "")
            exargs.setdefault("realm", "unknown")

            # NB all files coming out of mopper have a single variable,
            # however the catalogue seems to expect a multi-variable file.
            # In my opinion this is limiting the usefulness of catalogues for
            # the many collections that have 1 variable per file!
            # Also file_id is required, but we're not using it as these aren't
            # multi-variable files, so we can/should aggregate on variable not
            # file_id; hence variable is used as file_id instead.
            nc_info, exargs = self.parse_ncfile(fpath, exargs)
            ncinfo_dict = nc_info.to_dict()
            ncinfo_dict.update(exargs)
            return ncinfo_dict
        except Exception:
            import traceback

            # Follow the ecgtools convention so validate_parser can flag this asset
            return {INVALID_ASSET: fpath, TRACEBACK: traceback.format_exc()}

    @classmethod
    def parse_ncfile(cls, fpath, exargs):
        """
        Get Intake-ESM datastore entry info from an ACCESS netcdf file.

        CMOR has its own base date format whose length depends on frequency.

        Parameters
        ----------
        fpath: str
            The path to the netcdf file
        exargs: dict
            Stores extra arguments such as frequency, date_range, variable etc,
            derived from fpattern. The 'date_range', 'variable' and 'frequency'
            keys are consumed (popped) here.

        Returns
        -------
        output_nc_info: _AccessNCFileInfo
            A dataclass containing the information parsed from the file
        exargs: dict
            The remaining extra arguments, to be merged into the datastore entry
        """
        time_format = "%Y-%m-%d, %H:%M:%S"
        # Get the strptime format for dates based on the date string length:
        # dformat is the longest possible CMOR datetime format; each 2-char
        # directive consumes 2 digits except %Y which consumes 4, so the
        # needed format length is always len(date_string) - 2.
        dformat = "%Y%m%d%H%M%S"
        date_range = exargs.pop("date_range")
        if date_range == "":
            start_date = "none"
            end_date = "none"
        else:
            ts, te = date_range.split("-")
            cmor_format = dformat[: (len(ts) - 2)]
            ts = datetime.strptime(ts, cmor_format)
            start_date = ts.strftime(time_format)
            te = datetime.strptime(te, cmor_format)
            end_date = te.strftime(time_format)

        variable = exargs.pop("variable")
        with xr.open_dataset(
            fpath,
            chunks={},
            decode_cf=False,
            decode_times=False,
            decode_coords=False,
        ) as ds:
            attrs = ds[variable].attrs
            variable_long_name = attrs.get("long_name", "unknown")
            variable_standard_name = attrs.get("standard_name", "unknown")
            variable_cell_methods = attrs.get("cell_methods", "unknown")
            variable_units = attrs.get("units", "unknown")
            tracking_id = ds.attrs.get("tracking_id", "unknown")

        output_nc_info = _AccessNCFileInfo(
            filename=Path(fpath).name,
            path=fpath,
            file_id=tracking_id,
            filename_timestamp=date_range,
            frequency=exargs.pop("frequency"),
            start_date=start_date,
            end_date=end_date,
            variable=[variable],
            variable_long_name=[variable_long_name],
            variable_standard_name=[variable_standard_name],
            variable_units=[variable_units],
            variable_cell_methods=[variable_cell_methods],
        )

        return output_nc_info, exargs

1 change: 1 addition & 0 deletions tests/data/access-mopper/README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
These files were copied from /g/data/ua8/AUS2200/ashwed83/ and then cdo was used to select only a small region using sellatlonbox
Binary file not shown.
Binary file not shown.
Binary file not shown.
30 changes: 30 additions & 0 deletions tests/data/access-mopper/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: AUS2200
experiment_uuid: thisdoesntexistsIdoIknowwhattoputifIdontknowhowtogenerateonewhichisdifferentfromwhateverisalreadyhere
description: |
AUS2200 simulations collections. Each simulation is a limited area model study of the entire Australian continent at 2.2 km resolution, using the UM atmospheric model.
This dataset is part of the AUS2200 simulations collection at NCI.
long_description: |
"There's a total of 9 simulations:
ashwed80 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1980-02-18 to 1980-02-21, including the 1980 Ash Wednesday bushfires.
ashwed83 - ERA5+ERA5Land reanalysis data was used to provide initial and boundary conditions. The study covers the time period of 1983-02-15 to 1983-02-17, including the 1983 Ash Wednesday bushfires."
model:
- AUS2200 (UM v12.2)
frequency:
- subhr
- 1hr
nominal_resolution:
- atmos = 2.2 km
version: 1
contact: Dale Roberts
email: [email protected]
created: null
reference: null
license: https://creativecommons.org/licenses/by/4.0/
url: null
parent_experiment: null
related_experiments:
- null
notes: |
null
keywords:
- null
1 change: 1 addition & 0 deletions tests/test_builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
),
(["access-esm1-5"], "AccessEsm15Builder", {"ensemble": False}, 11, 11, 11),
(["access-om3"], "AccessOm3Builder", {}, 12, 12, 6),
(["access-mopper"], "MopperBuilder", {"ensemble": True}, 2, 2, 1),
],
)
def test_builder_build(
Expand Down

0 comments on commit 4390267

Please sign in to comment.