diff --git a/.gitignore b/.gitignore index e2f19f1..57f1c4e 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,9 @@ build *.jsonl *.json +## Keep JSON schemas tracked despite the *.json rule above (any `schemas` directory, at any depth) +!**/schemas/**/*.json + # Old Submodule Path # Could be used locally pyessv-archive/ diff --git a/CHANGES.md b/CHANGES.md index 7b2d707..2b6e110 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -16,6 +16,8 @@ * Add tests for CLI and implementations when invoked through the CLI * Refactored code dealing with requests and authentication to the `STACpopulator/requests.py` file * Add `--log-file` command line option to specify a non-default location to write log files to +* Add `cordex6` extension and `CORDEXCMIP6_Ouranos` implementation. This includes a refactoring of base extension classes. +* Add an `xscen` extension demonstrating how to add properties to a STAC Item. ## [0.6.0](https://github.com/crim-ca/stac-populator/tree/0.6.0) (2024-02-22) diff --git a/Makefile b/Makefile index 1e31d6d..76072bc 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,7 @@ APP_NAME := STACpopulator APP_VERSION ?= 0.6.0 DOCKER_COMPOSE_FILES := -f "$(APP_ROOT)/docker/docker-compose.yml" +COMPOSE_PROJECT_NAME := stac-populator DOCKER_TAG := ghcr.io/crim-ca/stac-populator:$(APP_VERSION) IMP_DIR := $(APP_NAME)/implementations @@ -35,6 +36,7 @@ del-cmip6: curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6_UofT' @echo "" + docker-start: docker compose $(DOCKER_COMPOSE_FILES) up starthost: docker-start @@ -43,12 +45,12 @@ docker-stop: docker compose $(DOCKER_COMPOSE_FILES) down stophost: docker-stop +del_docker_volume: stophost + docker volume rm stac-db + docker-build: docker build "$(APP_ROOT)" -f "$(APP_ROOT)/docker/Dockerfile" -t "$(DOCKER_TAG)" -del_docker_volume: stophost - docker volume rm stac-populator_stac-db - resethost: del_docker_volume starthost install: diff --git a/README.md b/README.md index 8cad8a9..5c36dc8 100644 --- a/README.md +++ b/README.md @@ -19,13 +19,15 @@ contain all the logic for constructing the STAC 
representation for an item in th Provided implementations of `STACpopulatorBase`: -| Implementation | Description | -|--------------------------------|-------------------------------------------------------------------------------------------------------------------------| -| [CMIP6_UofT][CMIP6_UofT] | Crawls a THREDDS Catalog for CMIP6 NCML-annotated NetCDF references to publish corresponding STAC Collection and Items. | -| [DirectoryLoader][DirLoader] | Crawls a subdirectory hierarchy of pre-generated STAC Collections and Items to publish to a STAC API endpoint. | +| Implementation | Description | +|----------------------------------------------|-------------------------------------------------------------------------------------------------------------------------| +| [CMIP6_UofT][CMIP6_UofT] | Crawls a THREDDS Catalog for CMIP6 NCML-annotated NetCDF references to publish corresponding STAC Collection and Items. | +| [DirectoryLoader][DirLoader] | Crawls a subdirectory hierarchy of pre-generated STAC Collections and Items to publish to a STAC API endpoint. | +| [CORDEX-CMIP6_Ouranos][CORDEX-CMIP6_Ouranos] | Crawls a THREDDS Catalog for CORDEX-CMIP6 NetCDF references to publish corresponding STAC Collection and Items. 
| [CMIP6_UofT]: STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py [DirLoader]: STACpopulator/implementations/DirectoryLoader/crawl_directory.py +[CORDEX-CMIP6_Ouranos]: STACpopulator/implementations/CORDEXCMIP6_Ouranos/add_CORDEX6.py ## Installation and Execution diff --git a/STACpopulator/cli.py b/STACpopulator/cli.py index 11ed00f..5cf9f48 100644 --- a/STACpopulator/cli.py +++ b/STACpopulator/cli.py @@ -14,6 +14,7 @@ def add_parser_args(parser: argparse.ArgumentParser) -> dict[str, Callable]: + """Common CLI arguments for all implementations.""" parser.add_argument( "--version", "-V", diff --git a/STACpopulator/extensions/__init__.py b/STACpopulator/extensions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/STACpopulator/extensions/base.py b/STACpopulator/extensions/base.py new file mode 100644 index 0000000..3bd0db1 --- /dev/null +++ b/STACpopulator/extensions/base.py @@ -0,0 +1,331 @@ +""" +# Base classes for STAC extensions + +What we have: + - `Loader`, which returns attributes. + - An external json schema describing a subset of the attributes returned by the Loader. This schema might preclude + additional properties, so it cannot be applied wholesale to the Loader's output. (maybe overkill since not a lot of schemas can be found in the wild...) + - `data model` describing the content we want included in the catalog. It includes a subset of the schema properties, + as well as additional attributes desired by the catalog admins. 
+ +Desiderata: + - Not having to replicate existing validation logic in the schema + - Not having to create a modified schema + - Being able to supplement the schema validation by pydantic validation logic + - Streamline the creation of new data models (reduce boilerplate, allow subclassing) + - Developer-friendly validation error messages + + +How-to: + - Instructions to create basic datamodel from schema (codegen) + + + +""" +from __future__ import annotations + +from datetime import datetime +import json +import jsonschema +import logging +from typing import Any, Dict, Generic, TypeVar, Union, cast, Optional +from pydantic import (BaseModel, create_model, Field, FilePath, field_validator, model_validator, HttpUrl, ConfigDict, + PrivateAttr) +import pystac +from pystac.extensions import item_assets +from pystac.extensions.base import ( + ExtensionManagementMixin, + PropertiesExtension, + SummariesExtension, +) +from pystac import STACValidationError +from pystac.extensions.base import S # generic pystac.STACObject +from STACpopulator.models import AnyGeometry, GeoJSONPolygon +from STACpopulator.stac_utils import ServiceType +import types + +T = TypeVar("T", pystac.Collection, pystac.Item, pystac.Asset, item_assets.AssetDefinition) + +LOGGER = logging.getLogger(__name__) + +class Helper: + """Class to be subclassed by extension helpers.""" + + +class ExtensionHelper(BaseModel, Helper): + """Base class for dataset properties going into the catalog. + + Subclass this with attributes. + + Attributes + ---------- + _prefix : str + If not None, this prefix is added to ingested data before the jsonschema validation. + _schema_uri : str + URI of the json schema to validate against. Note this is not a STAC schema, but a schema for the dataset properties only. + _schema_exclude : list[str] + Properties not meant to be validated by json schema, but still included in the data model. 
+ """ + _prefix: str = PrivateAttr() + _schema_uri: FilePath = PrivateAttr(None) + _schema_exclude: list[str] = PrivateAttr([]) + + model_config = ConfigDict(populate_by_name=True, extra="ignore") + + @classmethod + def __init_subclass__(cls, **kwargs): + """Automatically set an alias generator from the `_prefix`.""" + prefix = cls._prefix.default + + if prefix is not None: + cls.model_config["alias_generator"] = lambda key: f"{prefix}:{key}" + + @model_validator(mode="before") + @classmethod + def validate_jsonschema(cls, data): + """Validate the data model against the json schema, if given.""" + # Load schema (the context manager closes the file handle once it is parsed) + uri = cls._schema_uri.default + if uri is not None: + with open(uri) as fh: + schema = json.load(fh) + validator_cls = jsonschema.validators.validator_for(schema) + validator_cls.check_schema(schema) + validator = validator_cls(schema) + attrs = {f"{cls._prefix.default}:{k}": v for (k,v) in data.items() if k not in cls._schema_exclude.default} + errors = list(validator.iter_errors(attrs)) + if errors: + raise ValueError(errors) + + return data + + def apply(self, item, add_if_missing=True): + """Add extension for the properties of the dataset to the STAC item. + The extension class is created dynamically from the properties. 
+ """ + schema_uri = self.write_stac_schema() if self._schema_uri else None + ExtSubCls = metacls_extension(self._prefix, schema_uri=schema_uri) + item_ext = ExtSubCls.ext(item, add_if_missing=add_if_missing) + item_ext.apply(self.model_dump(mode="json", by_alias=True)) + return item + + def to_stac_schema(self) -> dict: + """Return the STAC schema for the extension.""" + return {'type': 'object', + 'required': ['type', 'properties'], + 'properties': {'type': {'const': 'Feature'}, + 'properties': {'$ref': str(self._schema_uri)} + } + } + def write_stac_schema(self) -> str: + path = f"/tmp/{self._prefix}-schema.json" + with open(path, "w") as fh: + json.dump(self.to_stac_schema(), fh) + return path + + +class BaseSTAC(BaseModel): + """Base class for STAC item data models. + + Attributes + ---------- + geometry : AnyGeometry + The geometry of the dataset. + bbox : list[float] + The bounding box of the dataset. + start_datetime : datetime + The start datetime of the dataset. + end_datetime : datetime + The end datetime of the dataset. + extensions : list[str] + Name of the class attributes that point to STAC extension helper classes. Those extension classes should have an `apply` method. + """ + # STAC item properties + geometry: AnyGeometry | None + bbox: list[float] + start_datetime: datetime + end_datetime: datetime + + model_config = ConfigDict(populate_by_name=True, extra="ignore", arbitrary_types_allowed=True) + + # Helpers are automatically detected by being Helper subclasses + _helpers: list[str] = PrivateAttr([]) + + @property + def uid(self) -> str: + """Return a unique ID. When subclassing, use a combination of properties uniquely identifying a dataset.""" + # TODO: Should this be an abstract method? 
+ import uuid + return str(uuid.uuid4()) + + @model_validator(mode="after") + def find_helpers(self): + """Populate the list of extensions and return the validated instance.""" + for key, field in self.model_fields.items(): + if isinstance(field.annotation, type) and issubclass(field.annotation, Helper): + self._helpers.append(key) + return self + + def stac_item(self) -> dict[str, Any]: + """Create a STAC item, add extensions, validate it and return it as a JSON-compatible dict.""" + item = pystac.Item( + id=self.uid, + geometry=None if self.geometry is None else self.geometry.model_dump(), + bbox=self.bbox, + start_datetime=self.start_datetime, + end_datetime=self.end_datetime, + datetime=None, + properties={}, + ) + # Add extensions + for ext in self._helpers: + getattr(self, ext).apply(item) + + try: + item.validate() + except STACValidationError as e: + raise Exception("Failed to validate STAC item") from e + + return json.loads(json.dumps(item.to_dict())) + + +def metacls_extension(name, schema_uri): + """Create an extension class dynamically from the properties.""" + cls_name = f"{name.upper()}Extension" + + bases = (MetaExtension, + Generic[T], + PropertiesExtension, + ExtensionManagementMixin[Union[pystac.Asset, pystac.Item, pystac.Collection]] + ) + + attrs = {"name": name, "schema_uri": schema_uri} + return types.new_class(name=cls_name, bases=bases, kwds=None, exec_body=lambda ns: ns.update(attrs)) + + +class MetaExtension: + name: str + schema_uri: str + + def apply(self, properties: dict[str, Any]) -> None: + """Applies the extension properties to the extended + :class:`~pystac.Item` or :class:`~pystac.Asset`. 
+ """ + for prop, val in properties.items(): + self._set_property(prop, val) + + @classmethod + def get_schema_uri(cls) -> str: + """We have already validated the JSON schema.""" + return cls.schema_uri + + @classmethod + def has_extension(cls, obj: S): + # FIXME: this override should be removed once an official and versioned schema is released + # ignore the original implementation logic for a version regex + # since in our case, the VERSION_REGEX is not fulfilled (ie: using 'main' branch, no tag available...) + ext_uri = cls.get_schema_uri() + return obj.stac_extensions is not None and any(uri == ext_uri for uri in obj.stac_extensions) + + @classmethod + def ext(cls, obj: T, add_if_missing: bool = False) -> "Extension[T]": + """Extends the given STAC Object with properties from the + :stac-ext:`Extension`. + + This extension can be applied to instances of :class:`~pystac.Item` or + :class:`~pystac.Asset`. + + Raises: + + pystac.ExtensionTypeError : If an invalid object type is passed. + """ + cls_map = {pystac.Item: MetaItemExtension} + + for key, meta in cls_map.items(): + if isinstance(obj, key): + # cls.ensure_has_extension(obj, add_if_missing) + kls = extend_type(key, meta, cls[key]) + return cast(cls[T], kls(obj)) + else: + raise pystac.ExtensionTypeError(cls._ext_error_message(obj)) + + +def extend_type(stac, cls, ext): + """Create an extension subclass for different STAC objects. + + Note: This is super confusing... we should come up with some better nomenclature. + + Parameters + ---------- + stac: pystac.Item, pystac.Asset, pystac.Collection + The STAC object. + cls: MetaItemExtension + The generic extension class for the STAC object. + ext: MetaExtension[T] + The meta extension class. 
+ """ + cls_name = f"{stac.__name__ }{ext.__name__}" + return types.new_class(cls_name, (cls, ext), {}, lambda ns: ns) + + +class MetaItemExtension: + """A concrete implementation of :class:`Extension` on an :class:`~pystac.Item` + that extends the properties of the Item to include properties defined in the + :stac-ext:`Extension`. + + This class should generally not be instantiated directly. Instead, call + :meth:`Extension.ext` on an :class:`~pystac.Item` to extend it. + """ + def __init__(self, item: pystac.Item): + self.item = item + self.properties = item.properties + + def get_assets( + self, + service_type: Optional[ServiceType] = None, + ) -> dict[str, pystac.Asset]: + """Get the item's assets where service types are defined. + + Args: + service_type: If set, filter the assets such that only those with a + matching :class:`~STACpopulator.stac_utils.ServiceType` are returned. + + Returns: + Dict[str, Asset]: A dictionary of assets that match ``service_type`` + if set or else all of this item's assets where service types are defined. 
+ """ + return { + key: asset + for key, asset in self.item.get_assets().items() + if (isinstance(service_type, ServiceType) and service_type.value in asset.extra_fields) + or (service_type is None and any(isinstance(ServiceType.from_value(field, default=None), ServiceType) for field in asset.extra_fields)) + } + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} Item id={self.item.id}>" + + +# TODO: Add the other STAC item meta extensions + +def schema_properties(schema: dict) -> list[str]: + """Return the list of properties described by schema.""" + out = [] + for key, val in schema["properties"].items(): + prefix, name = key.split(":") if ":" in key else (None, key) + out.append(name) + return out + + +def model_from_schema(model_name, schema: dict): + """Create pydantic BaseModel from JSON schema.""" + type_map = {"string": str, "number": float, "integer": int, "boolean": bool, "array": list, "object": dict, + None: Any} + + fields = {} + for key, val in schema["properties"].items(): + prefix, name = key.split(":") if ":" in key else (None, key) + typ = type_map[val.get("type")] + default = ... 
if key in schema.get("required", ()) else None + fields[name] = (typ, Field(default, alias=key)) + return create_model(model_name, **fields) + diff --git a/STACpopulator/extensions/cmip6.py b/STACpopulator/extensions/cmip6.py index 4f848c4..6953bcb 100644 --- a/STACpopulator/extensions/cmip6.py +++ b/STACpopulator/extensions/cmip6.py @@ -51,7 +51,7 @@ SchemaName = Literal["cmip6"] # FIXME: below reference (used as ID in the schema itself) should be updated once the extension is officially released -# SCHEMA_URI: str = "https://stac-extensions.github.io/cmip6/v1.0.0/schema.json" +# SCHEMA_URI: str = "https://raw.githubusercontent.com/stac-extensions/cmip6/refs/heads/main/json-schema/schema.json" # below is the temporary resolvable URI SCHEMA_URI: str = "https://raw.githubusercontent.com/dchandan/stac-extension-cmip6/main/json-schema/schema.json" PREFIX = f"{get_args(SchemaName)[0]}:" diff --git a/STACpopulator/extensions/cordex6.py b/STACpopulator/extensions/cordex6.py new file mode 100644 index 0000000..05c3fc0 --- /dev/null +++ b/STACpopulator/extensions/cordex6.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from pathlib import Path +from pydantic import BaseModel, Field, FilePath, model_validator +from datetime import datetime + +from STACpopulator.extensions.base import ExtensionHelper +from STACpopulator.extensions.thredds import THREDDSCatalogDataModel +from STACpopulator.extensions.xscen import Xscen + + +# This is generated using datamodel-codegen + manual edits +class CordexCmip6(ExtensionHelper): + # Fields from schema + activity_id: str + contact: str + # Conventions: str = Field(..., alias='cordex6:Conventions') + creation_date: datetime + domain_id: str + domain: str + driving_experiment_id: str + driving_experiment: str + driving_institution_id: str + driving_source_id: str + driving_variant_label: str + frequency: str + grid: str + institution: str + institution_id: str + license: str + mip_era: str + product: str + project_id: str + source: str + 
source_id: str + source_type: str + tracking_id: str + variable_id: str + version_realization: str + + # Extra fields + external_variables: str | list[str] + + _prefix = "cordex6" + # Note that this is not a STAC item schema, but a schema for the global attributes of the CMIP6 data. + _schema_uri: FilePath = Path(__file__).parent / "schemas" / "cordex6" / "cmip6-cordex-global-attrs-schema.json" + + + +# Customize the THREDDSCatalogDataModel +class Cordex6DataModel(THREDDSCatalogDataModel): + """Data model for CORDEX-CMIP6 NetCDF datasets.""" + cordex6: CordexCmip6 + + @property + def uid(self) -> str: + """Return a unique ID for CMIP6 data item.""" + keys = [ + "activity_id", + "driving_institution_id", + "driving_source_id", + "institution_id", + "source_id", + "driving_experiment_id", + "driving_variant_label", + "variable_id", + "domain_id", + ] + values = [getattr(self.cordex6, k) for k in keys] + values.append(self.start_datetime.strftime("%Y%m%d")) + values.append(self.end_datetime.strftime("%Y%m%d")) + return "_".join(values) + + @model_validator(mode="before") + @classmethod + def properties_helper(cls, data): + """Instantiate the properties helper.""" + data["cordex6"] = data['data']['attributes'] + return data + + +# Customize the THREDDSCatalogDataModel +class Cordex6DataModelNcML(Cordex6DataModel): + """Data model for CORDEX-CMIP6 NcML aggregations.""" + xscen: Xscen + + @model_validator(mode="before") + @classmethod + def xscen_helper(cls, data): + data['xscen'] = data['data']['attributes'] + return data diff --git a/STACpopulator/extensions/datacube.py b/STACpopulator/extensions/datacube.py index b394416..5d550b3 100644 --- a/STACpopulator/extensions/datacube.py +++ b/STACpopulator/extensions/datacube.py @@ -1,12 +1,12 @@ import functools from typing import Any, MutableMapping, MutableSequence -from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType +from pystac.extensions.datacube import Dimension, DimensionType, 
Variable, VariableType, DatacubeExtension from STACpopulator.stac_utils import ncattrs_to_bbox +from STACpopulator.extensions.base import Helper - -class DataCubeHelper: +class DataCubeHelper(Helper): """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" axis = {"X": "x", "Y": "y", "Z": "z", "T": None, "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} @@ -248,3 +248,10 @@ def temporal_extent(self) -> MutableSequence[str]: start_datetime = cfmeta["time_coverage_start"] end_datetime = cfmeta["time_coverage_end"] return [start_datetime, end_datetime] + + def apply(self, item, add_if_missing:bool = True): + """Apply the Datacube extension to an item.""" + ext = DatacubeExtension.ext(item, add_if_missing=add_if_missing) + ext.apply(dimensions=self.dimensions, variables=self.variables) + return item + diff --git a/STACpopulator/extensions/schemas/cordex6/cmip6-cordex-global-attrs-schema.json b/STACpopulator/extensions/schemas/cordex6/cmip6-cordex-global-attrs-schema.json new file mode 100644 index 0000000..47ee1f5 --- /dev/null +++ b/STACpopulator/extensions/schemas/cordex6/cmip6-cordex-global-attrs-schema.json @@ -0,0 +1,415 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "cmip6-cordex-global-attrs-schema.json#", + "title": "CORDEX-CMIP6 metadata schema for global attributes", + "description": "JSON schema for global attributes of CORDEX-CMIP6 datasets. This schema is automatically generated from the CVs. 
Manual edits will be overwritten.", + "type": "object", + "properties": { + "cordex6:activity_id": { + "enum": [ + "DD", + "ESD" + ] + }, + "cordex6:contact": { + "type": "string" + }, + "cordex6:Conventions": { + "type": "string" + }, + "cordex6:creation_date": { + "type": "string", + "format": "date-time" + }, + "cordex6:domain_id": { + "enum": [ + "SAM-50", + "CAM-50", + "NAM-50", + "EUR-50", + "AFR-50", + "WAS-50", + "EAS-50", + "CAS-50", + "AUS-50", + "ANT-50", + "ARC-50", + "MED-50", + "MNA-50", + "MNA-25", + "SAM-12", + "CAM-12", + "NAM-12", + "EUR-12", + "AFR-12", + "WAS-12", + "EAS-12", + "CAS-12", + "AUS-12", + "ANT-12", + "ARC-12", + "MED-12", + "MNA-12", + "SAM-25", + "CAM-25", + "NAM-25", + "EUR-25", + "AFR-25", + "WAS-25", + "EAS-25", + "CAS-25", + "AUS-25", + "SEA-25", + "SAM-50i", + "CAM-50i", + "NAM-50i", + "EUR-50i", + "AFR-50i", + "WAS-50i", + "EAS-50i", + "CAS-50i", + "AUS-50i", + "ANT-50i", + "ARC-50i", + "MED-50i", + "MNA-50i", + "MNA-25i", + "EUR-12i", + "SEA-25i" + ] + }, + "cordex6:domain": { + "type": "string" + }, + "cordex6:driving_experiment_id": { + "enum": [ + "evaluation", + "historical", + "ssp119", + "ssp126", + "ssp245", + "ssp370", + "ssp585" + ] + }, + "cordex6:driving_experiment": { + "type": "string" + }, + "cordex6:driving_institution_id": { + "enum": [ + "AER", + "AS-RCEC", + "AWI", + "BCC", + "BNU", + "CAMS", + "CAS", + "CCCR-IITM", + "CCCma", + "CMCC", + "CNRM-CERFACS", + "CSIR-Wits-CSIRO", + "CSIRO", + "CSIRO-ARCCSS", + "CSIRO-COSIMA", + "DKRZ", + "DWD", + "E3SM-Project", + "EC-Earth-Consortium", + "ECMWF", + "FIO-QLNM", + "HAMMOZ-Consortium", + "INM", + "INPE", + "IPSL", + "KIOST", + "LLNL", + "MESSy-Consortium", + "MIROC", + "MOHC", + "MPI-M", + "MRI", + "NASA-GISS", + "NASA-GSFC", + "NCAR", + "NCC", + "NERC", + "NIMS-KMA", + "NIWA", + "NOAA-GFDL", + "NTU", + "NUIST", + "PCMDI", + "PNNL-WACCEM", + "RTE-RRTMGP-Consortium", + "RUBISCO", + "SNU", + "THU", + "UA", + "UCI", + "UHH", + "UTAS", + "UofT" + ] + }, + 
"cordex6:driving_source_id": { + "enum": [ + "4AOP-v1-5", + "ACCESS-CM2", + "ACCESS-ESM1-5", + "ACCESS-OM2", + "ACCESS-OM2-025", + "ARTS-2-3", + "AWI-CM-1-1-HR", + "AWI-CM-1-1-LR", + "AWI-CM-1-1-MR", + "AWI-ESM-1-1-LR", + "AWI-ESM-2-1-LR", + "BCC-CSM2-HR", + "BCC-CSM2-MR", + "BCC-ESM1", + "CAM-MPAS-HR", + "CAM-MPAS-LR", + "CAMS-CSM1-0", + "CAS-ESM2-0", + "CESM1-1-CAM5-CMIP5", + "CESM1-CAM5-SE-HR", + "CESM1-CAM5-SE-LR", + "CESM1-WACCM-SC", + "CESM2", + "CESM2-FV2", + "CESM2-WACCM", + "CESM2-WACCM-FV2", + "CIESM", + "CMCC-CM2-HR4", + "CMCC-CM2-SR5", + "CMCC-CM2-VHR4", + "CMCC-ESM2", + "CNRM-CM6-1", + "CNRM-CM6-1-HR", + "CNRM-ESM2-1", + "CanESM5", + "CanESM5-1", + "CanESM5-CanOE", + "E3SM-1-0", + "E3SM-1-1", + "E3SM-1-1-ECA", + "E3SM-2-0", + "E3SM-2-0-NARRM", + "EC-Earth3", + "EC-Earth3-AerChem", + "EC-Earth3-CC", + "EC-Earth3-GrIS", + "EC-Earth3-HR", + "EC-Earth3-LR", + "EC-Earth3-Veg", + "EC-Earth3-Veg-LR", + "EC-Earth3P", + "EC-Earth3P-HR", + "EC-Earth3P-VHR", + "ECMWF-IFS-HR", + "ECMWF-IFS-LR", + "ECMWF-IFS-MR", + "FGOALS-f3-H", + "FGOALS-f3-L", + "FGOALS-g3", + "FIO-ESM-2-0", + "GFDL-AM4", + "GFDL-CM4", + "GFDL-CM4C192", + "GFDL-ESM2M", + "GFDL-ESM4", + "GFDL-GRTCODE", + "GFDL-OM4p5B", + "GFDL-RFM-DISORT", + "GISS-E2-1-G", + "GISS-E2-1-G-CC", + "GISS-E2-1-H", + "GISS-E2-2-G", + "GISS-E2-2-H", + "GISS-E3-G", + "HadGEM3-GC31-HH", + "HadGEM3-GC31-HM", + "HadGEM3-GC31-LL", + "HadGEM3-GC31-LM", + "HadGEM3-GC31-MH", + "HadGEM3-GC31-MM", + "HiRAM-SIT-HR", + "HiRAM-SIT-LR", + "ICON-ESM-LR", + "IITM-ESM", + "INM-CM4-8", + "INM-CM5-0", + "INM-CM5-H", + "IPSL-CM5A2-INCA", + "IPSL-CM6A-ATM-HR", + "IPSL-CM6A-ATM-ICO-HR", + "IPSL-CM6A-ATM-ICO-LR", + "IPSL-CM6A-ATM-ICO-MR", + "IPSL-CM6A-ATM-ICO-VHR", + "IPSL-CM6A-ATM-LR-REPROBUS", + "IPSL-CM6A-LR", + "IPSL-CM6A-LR-INCA", + "IPSL-CM6A-MR1", + "KACE-1-0-G", + "KIOST-ESM", + "LBLRTM-12-8", + "MCM-UA-1-0", + "MIROC-ES2H", + "MIROC-ES2H-NB", + "MIROC-ES2L", + "MIROC6", + "MPI-ESM-1-2-HAM", + "MPI-ESM1-2-HR", + "MPI-ESM1-2-LR", + 
"MPI-ESM1-2-XR", + "MRI-AGCM3-2-H", + "MRI-AGCM3-2-S", + "MRI-ESM2-0", + "NESM3", + "NICAM16-7S", + "NICAM16-8S", + "NICAM16-9S", + "NorCPM1", + "NorESM1-F", + "NorESM2-LM", + "NorESM2-MM", + "PCMDI-test-1-0", + "RRTMG-LW-4-91", + "RRTMG-SW-4-02", + "RTE-RRTMGP-181204", + "SAM0-UNICON", + "TaiESM1", + "TaiESM1-TIMCOM", + "TaiESM1-TIMCOM2", + "UKESM1-0-LL", + "UKESM1-1-LL", + "UKESM1-ice-LL", + "ERA5" + ] + }, + "cordex6:driving_variant_label": { + "type": "string" + }, + "cordex6:frequency": { + "enum": [ + "1hr", + "3hr", + "6hr", + "day", + "fx", + "mon", + "yr" + ] + }, + "cordex6:grid": { + "type": "string" + }, + "cordex6:institution": { + "type": "string" + }, + "cordex6:institution_id": { + "enum": [ + "BCCR-UCAN", + "BOM", + "CCCma", + "CLMcom-CMCC", + "CLMcom-DWD", + "CLMcom-GERICS", + "CLMcom-KIT", + "CNRM-MF", + "GERICS", + "HCLIMcom-DMI", + "HCLIMcom-METNo", + "HCLIMcom-SMHI", + "ICTP", + "IRD-MF", + "KNMI", + "MOHC", + "OURANOS", + "UBA-CIMA-IFAECI", + "UQ-DEC" + ] + }, + "cordex6:license": { + "enum": [ + "https://cordex.org/data-access/cordex-cmip6-data/cordex-cmip6-terms-of-use" + ] + }, + "cordex6:mip_era": { + "type": "string" + }, + "cordex6:product": { + "type": "string" + }, + "cordex6:project_id": { + "enum": [ + "CORDEX" + ] + }, + "cordex6:source": { + "type": "string" + }, + "cordex6:source_id": { + "enum": [ + "CCAM-v2105", + "CCAM-v2112", + "CCAMoc-v2112", + "CNRM-ALADIN64E1", + "CRCM5-SN", + "CanRCM5-SN", + "HCLIM43-ALADIN", + "HadREM3-GA7-05", + "ICON-CLM-202407-1-1", + "RACMO23E", + "REMO2020", + "RegCM5-0", + "WRF451Q" + ] + }, + "cordex6:source_type": { + "enum": [ + "ARCM", + "AORCM", + "AGCM", + "AOGCM" + ] + }, + "cordex6:tracking_id": { + "type": "string" + }, + "cordex6:variable_id": { + "type": "string" + }, + "cordex6:version_realization": { + "type": "string" + } + }, + "required": [ + "cordex6:activity_id", + "cordex6:contact", + "cordex6:Conventions", + "cordex6:creation_date", + "cordex6:domain_id", + "cordex6:domain", + 
"cordex6:driving_experiment_id", + "cordex6:driving_experiment", + "cordex6:driving_institution_id", + "cordex6:driving_source_id", + "cordex6:driving_variant_label", + "cordex6:frequency", + "cordex6:grid", + "cordex6:institution", + "cordex6:institution_id", + "cordex6:license", + "cordex6:mip_era", + "cordex6:product", + "cordex6:project_id", + "cordex6:source", + "cordex6:source_id", + "cordex6:source_type", + "cordex6:tracking_id", + "cordex6:variable_id", + "cordex6:version_realization" + ] +} \ No newline at end of file diff --git a/STACpopulator/extensions/thredds.py b/STACpopulator/extensions/thredds.py index 19e2f44..ce6fc7f 100644 --- a/STACpopulator/extensions/thredds.py +++ b/STACpopulator/extensions/thredds.py @@ -5,8 +5,10 @@ ExtensionManagementMixin, PropertiesExtension, ) - -from STACpopulator.stac_utils import ServiceType, magpie_resource_link +from pydantic import ConfigDict, field_validator, model_validator +from STACpopulator.stac_utils import ServiceType, magpie_resource_link, ncattrs_to_bbox, ncattrs_to_geometry +from STACpopulator.extensions.base import Helper, BaseSTAC +from STACpopulator.extensions.datacube import DataCubeHelper T = TypeVar("T", pystac.Collection, pystac.Item) @@ -110,7 +112,7 @@ def __repr__(self) -> str: return f"" -class THREDDSHelper: +class THREDDSHelper(Helper): def __init__(self, access_urls: dict[str, str]): self.access_urls = { ServiceType.from_value(svc): url @@ -132,3 +134,64 @@ def links(self) -> list[pystac.Link]: url = self.access_urls[ServiceType.httpserver] link = magpie_resource_link(url) return [link] + + def apply(self, item, add_if_missing:bool = False): + """Apply the THREDDS extension to an item.""" + ext = THREDDSExtension.ext(item, add_if_missing=add_if_missing) + ext.apply(services=self.services, links=self.links) + return item + + +class THREDDSCatalogDataModel(BaseSTAC): + """Base class ingesting attributes loaded by `THREDDSLoader` and creating a STAC item. 
+ + This is meant to be subclassed for each extension. + + It includes two validation mechanisms: + - pydantic validation using type hints, and + - json schema validation. + """ + # Data from loader + data: dict + + # Extensions classes + datacube: DataCubeHelper + thredds: THREDDSHelper + + model_config = ConfigDict(populate_by_name=True, extra="ignore", arbitrary_types_allowed=True) + + @classmethod + def from_data(cls, data): + """Instantiate class from data provided by THREDDS Loader. + """ + # This is where we match the Loader's output to the STAC item and extensions inputs. If we had multiple + # loaders, that's probably the only thing that would be different between them. + return cls(data=data, + start_datetime=data["groups"]["CFMetadata"]["attributes"]["time_coverage_start"], + end_datetime=data["groups"]["CFMetadata"]["attributes"]["time_coverage_end"], + geometry=ncattrs_to_geometry(data), + bbox=ncattrs_to_bbox(data), + ) + + @model_validator(mode="before") + @classmethod + def datacube_helper(cls, data): + """Instantiate the DataCubeHelper.""" + data["datacube"] = DataCubeHelper(data['data']) + return data + + @model_validator(mode="before") + @classmethod + def thredds_helper(cls, data): + """Instantiate the THREDDSHelper.""" + data["thredds"] = THREDDSHelper(data['data']["access_urls"]) + return data + + +# TODO: Validate services links exist ? 
+# @field_validator("access_urls") +# @classmethod +# def validate_access_urls(cls, value): +# assert len(set(["HTTPServer", "OPENDAP"]).intersection(value.keys())) >= 1, ( +# "Access URLs must include HTTPServer or OPENDAP keys.") +# return value diff --git a/STACpopulator/extensions/xscen.py b/STACpopulator/extensions/xscen.py new file mode 100644 index 0000000..edb8b93 --- /dev/null +++ b/STACpopulator/extensions/xscen.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from typing import Literal +from STACpopulator.extensions.base import ExtensionHelper + + +class Xscen(ExtensionHelper): + type: Literal["forecast", "station-obs", "gridded-obs", "reconstruction", "simulation"] + processing_level: Literal["raw", "extracted", "regridded", "biasadjusted"] + license_type: Literal["permissive", "permissive non-commercial"] + _prefix: str = "xscen" diff --git a/STACpopulator/implementations/CORDEXCMIP6_Ouranos/__init__.py b/STACpopulator/implementations/CORDEXCMIP6_Ouranos/__init__.py new file mode 100644 index 0000000..a324acc --- /dev/null +++ b/STACpopulator/implementations/CORDEXCMIP6_Ouranos/__init__.py @@ -0,0 +1,4 @@ +from STACpopulator.implementations.CORDEXCMIP6_Ouranos .add_CORDEX6 import add_parser_args, runner + +__all__ = ["add_parser_args", "runner"] + diff --git a/STACpopulator/implementations/CORDEXCMIP6_Ouranos/add_CORDEX6.py b/STACpopulator/implementations/CORDEXCMIP6_Ouranos/add_CORDEX6.py new file mode 100644 index 0000000..c72ea91 --- /dev/null +++ b/STACpopulator/implementations/CORDEXCMIP6_Ouranos/add_CORDEX6.py @@ -0,0 +1,63 @@ +import logging +from requests.sessions import Session + +from STACpopulator.requests import add_request_options, apply_request_options +from STACpopulator.input import ErrorLoader, THREDDSLoader + +import argparse +from typing import Any +from STACpopulator.populator_base import STACpopulatorBase +from STACpopulator.extensions.cordex6 import Cordex6DataModel + +LOGGER = logging.getLogger(__name__) + + +class 
class CORDEX_STAC_Populator(STACpopulatorBase):
    """STAC populator whose items are built with the ``Cordex6DataModel``."""

    data_model = Cordex6DataModel
    item_geometry_model = None  # Unnecessary, but kept for consistency

    def create_stac_item(self, item_name: str, item_data: dict[str, Any]) -> dict[str, Any]:
        """Turn the attributes of one dataset into a STAC item.

        :param item_name: name of the item; not used by this implementation.
        :param item_data: attribute dictionary handed to ``data_model.from_data``.
        :return: the result of calling ``stac_item()`` on the instantiated data model.
        """
        return self.data_model.from_data(item_data).stac_item()


# TODO: This probably doesn't need to be copied for every implementation, right ?
def add_parser_args(parser: argparse.ArgumentParser) -> None:
    """Register the command line arguments of this implementation on ``parser``.

    Sets the parser description, adds the ``stac_host`` and ``href`` positionals,
    the ``--update``, ``--mode`` and ``--config`` options, and finally delegates to
    ``add_request_options`` for the request-related options.
    """
    parser.description = "CMIP6-CORDEX STAC populator from a THREDDS catalog or NCML XML."

    # Positional arguments; registration order defines the order on the command line.
    positionals = (
        ("stac_host", "STAC API URL"),
        ("href", "URL to a THREDDS catalog or a NCML XML with CMIP6 metadata."),
    )
    for arg_name, arg_help in positionals:
        parser.add_argument(arg_name, help=arg_help)

    parser.add_argument("--update", action="store_true", help="Update collection and its items")

    mode_help = "Operation mode, processing the full dataset or only the single reference."
    parser.add_argument("--mode", choices=["full", "single"], default="full", help=mode_help)

    config_help = (
        "Override configuration file for the populator. "
        "By default, uses the adjacent configuration to the implementation class."
    )
    parser.add_argument("--config", type=str, help=config_help)

    add_request_options(parser)


def runner(ns: argparse.Namespace) -> int:
    """Run the CORDEX populator from parsed command line arguments.

    :param ns: namespace holding the arguments registered by ``add_parser_args``;
        ``ns.debug`` is also read, so it must be provided by the caller's parser.
    :return: always ``0`` once ingestion has completed.
    """
    LOGGER.info(f"Arguments to call: {vars(ns)}")

    with Session() as session:
        # Request options are applied before the session is handed to any loader.
        apply_request_options(session, ns)

        # Only the "full" mode is implemented; any other mode gets the ErrorLoader
        # placeholder (to be implemented).
        data_loader = THREDDSLoader(ns.href, session=session) if ns.mode == "full" else ErrorLoader()

        populator = CORDEX_STAC_Populator(
            ns.stac_host,
            data_loader,
            update=ns.update,
            session=session,
            config_file=ns.config,
            log_debug=ns.debug,
        )
        populator.ingest()

    return 0
+# All modules in this list must contain two functions: # - add_parser_args(parser: argparse.ArgumentParser) -> None # - adds additional arguments to the given parser needed to run this implementation # - def runner(ns: argparse.Namespace) -> int: # - runs the implementation given a namespace constructed from the parser arguments supplied -__all__ = ["CMIP6_UofT", "DirectoryLoader"] +__all__ = ["CMIP6_UofT", "DirectoryLoader", "CORDEXCMIP6_Ouranos"] diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 44341f3..0b7fe2f 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,3 +1,4 @@ +import argparse import functools import inspect import json @@ -7,6 +8,7 @@ from datetime import datetime from typing import Any, Dict, List, MutableMapping, Optional, Type, Union + import pystac from requests.sessions import Session @@ -18,6 +20,9 @@ from STACpopulator.input import GenericLoader from STACpopulator.models import AnyGeometry from STACpopulator.stac_utils import load_config, url_validate +from STACpopulator.requests import add_request_options, apply_request_options +from STACpopulator.input import ErrorLoader, GenericLoader, THREDDSLoader + LOGGER = logging.getLogger(__name__) @@ -221,3 +226,5 @@ def ingest(self) -> None: counter += 1 LOGGER.info(f"Processed {counter} data items. {failures} failures") + + diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 515507a..d608e0e 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -105,7 +105,10 @@ def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list[float]: def numpy_to_python_datatypes(data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - # Converting numpy datatypes to python standard datatypes + """Convert numpy datatypes to python standard datatypes. + + This is useful when validating against a JSON schema that does not recognize an int32 as an integer. 
+ """ for key, value in data.items(): if isinstance(value, list): newlist = [] @@ -123,6 +126,37 @@ def numpy_to_python_datatypes(data: MutableMapping[str, Any]) -> MutableMapping[ return data +def np2py(data): + """Convert numpy datatypes to python standard datatypes. + + This is useful when validating against a JSON schema that does not recognize an int32 as an integer. + + Parameters + ---------- + data : dict, list, tuple, int, float, np.integer, np.floating, str + Object to convert. + """ + import numpy as np + + if isinstance(data, dict): + return {key: np2py(value) for key, value in data.items()} + + elif isinstance(data, (list, tuple)): + out = [np2py(item) for item in data] + if isinstance(data, tuple): + return tuple(out) + return out + + elif issubclass(type(data), np.integer): + return int(data) + + elif issubclass(type(data), np.floating): + return float(data) + + else: + return data + + def magpie_resource_link(url: str) -> pystac.Link: """Creates a link that will be used by Cowbird to create a resource in Magpie associated with the STAC item. 
diff --git a/pyproject.toml b/pyproject.toml index e3f5b34..ce0447f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ exclude = [ ] [tool.setuptools.package-data] -STACpopulator = ["**/collection_config.yml"] +STACpopulator = ["**/collection_config.yml", "extensions/schemas/**/*.json"] [tool.pytest.ini_options] @@ -150,7 +150,8 @@ keywords = [ "SpatioTemporal Asset Catalog", "Data Ingestion", "THREDDS", - "CMIP6" + "CMIP6", + "CORDEX" ] [project.scripts] diff --git a/tests/data/cordex6_ncml.json b/tests/data/cordex6_ncml.json new file mode 100644 index 0000000..9290ebf --- /dev/null +++ b/tests/data/cordex6_ncml.json @@ -0,0 +1,418 @@ +{ + "@location": "Not provided because of security concerns.", + "@xmlns": { + "ncml": "http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2" + }, + "attributes": { + "Conventions": "CF-1.11", + "activity_id": "DD", + "comment": "CRCM5 v3331 0.11 deg AMNO11d1 L56 S17-15m MPI-ESM1-2-LR membre 1 PILSPEC PS3", + "contact": "simulations_ouranos@ouranos.ca", + "creation_date": "2023-12-08T19:52:05Z", + "domain": "North America", + "domain_id": "NAM-12", + "driving_experiment": "gap-filling scenario reaching 7.0 based on SSP3", + "driving_experiment_id": "ssp370", + "driving_institution_id": "MPI-M", + "driving_source_id": "MPI-ESM1-2-LR", + "driving_variant_label": "r1i1p1f1", + "institution_id": "OURANOS", + "mip_era": "CMIP6", + "ouranos_experiment_name": "cau", + "product": "model-output", + "project_id": "CORDEX", + "source_type": "ARCM", + "title": "CRCM5-CMIP6 : Canadian Regional Climate Model v.5 - CMIP6 : daily", + "external_variables": "areacella", + "frequency": "day", + "variable_id": "tas", + "history": "Thu Aug 29 22:44:08 2024: ncks --cmp=dfl,6 CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20910101-20951231.nc 
CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20910101-20951231.nc.comp\nWed Dec 13 04:21:34 2023: Metadata converted to CORDEX specifications.Monthly files merged by chunks of 5 years. Data resampled from H to D.\nSat Sep 10 12:19:11 2022: ncks -O --chunk_policy g3d --cnk_dmn plev,1 --cnk_dmn rlon,50 --cnk_dmn rlat,50 --cnk_dmn time,250 /home/dpaquin1/scratch/arch/cau/209101/nc4c_tas_cau_209101_se.nc /home/dpaquin1/scratch/arch/cau/209101/tas_cau_209101_se.nc\nSat Sep 10 12:17:28 2022: ncks -O --fl_fmt=netcdf4_classic -L 6 /home/dpaquin1/scratch/arch/cau/209101/trim_tas_cau_209101_se.nc /home/dpaquin1/scratch/arch/cau/209101/nc4c_tas_cau_209101_se.nc\nSat Sep 10 12:17:23 2022: ncks -O -d time,2091-01-01 00:00:00,2091-01-31 23:59:59 /home/dpaquin1/postprod/cau/transit2/209101/tas_cau_209101_se.nc /home/dpaquin1/scratch/arch/cau/209101/trim_tas_cau_209101_se.nc", + "tracking_id": "hdl:21.14100/85e73a84-8318-4652-915c-52a9d6bc0de8", + "coordinates": "vertices_latitude vertices_longitude crs", + "license": "https://cordex.org/data-access/cordex-cmip6-data/cordex-cmip6-terms-of-use", + "source": "Canadian Regional Climate Model version 5", + "source_id": "CRCM5-SN", + "source_name": "CRCM5", + "doi": "https://zenodo.org/doi/10.5281/zenodo.11061924", + "further_info_url": "https://zenodo.org/doi/10.5281/zenodo.11061924", + "grid": "Rotated-pole latitude-longitude with 0.11 degree grid spacing (AMNO11d1)", + "version_realization": "v1-r1", + "institution": "Ouranos Consortium on Regional Climatology and Adaptation to Climate Change", + "NCO": "netCDF Operators version 5.1.8 (Homepage = http://nco.sf.net, Code = http://github.com/nco/nco, Citation = 10.1016/j.envsoft.2008.03.004)", + "abstract": "Ouranos produces operational regional climate simulations over the Cordex North American domain, at 0.11\u00b0 resolution. 
The current ensemble uses the fifth version of the CRCM, developed at UQAM's ESCER center in collaboration with ECCC. Pilot data for the simulations come from the CMIP6 ensemble, except for those in hindcast mode, which use ERA5.", + "dataset_id": "CRCM5-CMIP6", + "license_type": "permissive", + "type": "simulation", + "processing_level": "raw", + "modeling_realm": "atmos", + "_CoordSysBuilder": "ucar.nc2.dataset.conv.CF1Convention" + }, + "dimensions": { + "rlat": 628, + "rlon": 655, + "bounds": 4, + "time": 31411, + "bnds": 2 + }, + "groups": { + "CFMetadata": { + "attributes": { + "geospatial_lon_min": [ + -179.9917755126953 + ], + "geospatial_lat_min": [ + 6.3356499671936035 + ], + "geospatial_lon_max": [ + 179.9958038330078 + ], + "geospatial_lat_max": [ + 82.84487915039062 + ], + "geospatial_lon_units": "degrees_east", + "geospatial_lat_units": "degrees_north", + "geospatial_lon_resolution": "8.751603406088485E-4", + "geospatial_lat_resolution": "1.8600042124828522E-4", + "geospatial_vertical_min": "2.0", + "geospatial_vertical_max": "2.0", + "geospatial_vertical_units": "m", + "geospatial_vertical_resolution": "0.0", + "geospatial_vertical_positive": "up", + "time_coverage_start": "2015-01-01T12:00:00Z", + "time_coverage_end": "2100-12-31T12:00:00Z", + "time_coverage_units": "seconds", + "time_coverage_resolution": "86400.0", + "time_coverage_duration": "P0Y0M31410DT0H0M0.000S" + } + }, + "NCISOMetadata": { + "attributes": { + "metadata_creation": "2024-10-18", + "nciso_version": "2.2.3" + } + } + }, + "variables": { + "rlat": { + "shape": [ + "rlat" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "actual_range": [ + -33.625, + 35.345 + ], + "units": "degrees", + "axis": "Y", + "long_name": "latitude in rotated pole grid", + "standard_name": "grid_latitude", + "bounds": "rlat_bounds", + "_ChunkSizes": [ + 628 + ], + "_CoordinateAxisType": "GeoY" + } + }, + "rlon": { + "shape": [ + "rlon" + ], + "type": "double", + "attributes": { + 
"long_name": "longitude in rotated pole grid", + "actual_range": [ + -34.045, + 37.895 + ], + "_FillValue": [ + NaN + ], + "axis": "X", + "standard_name": "grid_longitude", + "bounds": "rlon_bounds", + "units": "degrees", + "_ChunkSizes": [ + 655 + ], + "_CoordinateAxisType": "GeoX" + } + }, + "time": { + "shape": [ + "time" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "units": "days since 1950-01-01", + "calendar": "standard", + "_ChunkSizes": [ + 512 + ], + "_CoordinateAxisType": "Time" + } + }, + "crs": { + "shape": [ + "" + ], + "type": "char", + "attributes": { + "grid_mapping_name": "rotated_latitude_longitude", + "grid_north_pole_latitude": [ + 42.5 + ], + "grid_north_pole_longitude": [ + 83.0 + ], + "north_pole_grid_longitude": [ + 0.0 + ], + "_CoordinateTransformType": "Projection", + "_CoordinateAxisTypes": "GeoX GeoY" + } + }, + "vertices_latitude": { + "shape": [ + "rlat", + "rlon", + "bounds" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "coordinates": "lat lon", + "_ChunkSizes": [ + 628, + 655, + 2 + ] + } + }, + "vertices_longitude": { + "shape": [ + "rlat", + "rlon", + "bounds" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "coordinates": "lat lon", + "_ChunkSizes": [ + 628, + 655, + 2 + ] + } + }, + "tas": { + "shape": [ + "time", + "rlat", + "rlon" + ], + "type": "float", + "attributes": { + "long_name": "Near-Surface Air Temperature", + "_FillValue": [ + 1.0000000200408773e+20 + ], + "standard_name": "air_temperature", + "cell_measures": "area: areacella", + "cell_methods": "area: mean time: point", + "missing_value": [ + 1.0000000200408773e+20 + ], + "units": "K", + "grid_mapping": "crs", + "coordinates": "height lat lon", + "_ChunkSizes": [ + 250, + 50, + 50 + ] + } + }, + "pr": { + "shape": [ + "time", + "rlat", + "rlon" + ], + "type": "float", + "attributes": { + "long_name": "Precipitation", + "_FillValue": [ + 1.0000000200408773e+20 + ], + "standard_name": 
"precipitation_flux", + "cell_measures": "area: areacella", + "cell_methods": "area: mean time: mean", + "missing_value": [ + 1.0000000200408773e+20 + ], + "units": "kg m-2 s-1", + "grid_mapping": "crs", + "coordinates": "lat lon", + "_ChunkSizes": [ + 250, + 50, + 50 + ] + } + }, + "time_bnds": { + "shape": [ + "time", + "bnds" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "_ChunkSizes": [ + 1, + 2 + ] + } + }, + "tasmax": { + "shape": [ + "time", + "rlat", + "rlon" + ], + "type": "float", + "attributes": { + "_FillValue": [ + 1.0000000200408773e+20 + ], + "cell_measures": "area: areacella", + "cell_methods": "area: mean time: maximum", + "grid_mapping": "crs", + "long_name": "Daily Maximum Near-Surface Air Temperature", + "standard_name": "air_temperature", + "units": "K", + "coordinates": "height lat lon", + "missing_value": [ + 1.0000000200408773e+20 + ], + "_ChunkSizes": [ + 250, + 50, + 50 + ] + } + }, + "tasmin": { + "shape": [ + "time", + "rlat", + "rlon" + ], + "type": "float", + "attributes": { + "_FillValue": [ + 1.0000000200408773e+20 + ], + "cell_measures": "area: areacella", + "cell_methods": "area: mean time: minimum", + "grid_mapping": "crs", + "long_name": "Daily Minimum Near-Surface Air Temperature", + "standard_name": "air_temperature", + "units": "K", + "coordinates": "height lat lon", + "missing_value": [ + 1.0000000200408773e+20 + ], + "_ChunkSizes": [ + 250, + 50, + 50 + ] + } + }, + "height": { + "shape": [ + "" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "units": "m", + "standard_name": "height", + "axis": "Z", + "long_name": "height", + "positive": "up", + "_CoordinateAxisType": "Height", + "_CoordinateZisPositive": "up" + } + }, + "lat": { + "shape": [ + "rlat", + "rlon" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "standard_name": "latitude", + "long_name": "latitude", + "units": "degrees_north", + "bounds": "vertices_latitude", + "_ChunkSizes": [ + 
628, + 655 + ], + "_CoordinateAxisType": "Lat" + } + }, + "lon": { + "shape": [ + "rlat", + "rlon" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "standard_name": "longitude", + "long_name": "longitude", + "units": "degrees_east", + "bounds": "vertices_longitude", + "_ChunkSizes": [ + 628, + 655 + ], + "_CoordinateAxisType": "Lon" + } + } + }, + "access_urls": { + "HTTPServer": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/datasets/simulations/RCM-CMIP6/CORDEX/NAM-12/day/NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20150101-21001231.ncml", + "OPENDAP": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/datasets/simulations/RCM-CMIP6/CORDEX/NAM-12/day/NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20150101-21001231.ncml", + "NCML": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncml/datasets/simulations/RCM-CMIP6/CORDEX/NAM-12/day/NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20150101-21001231.ncml", + "UDDC": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/datasets/simulations/RCM-CMIP6/CORDEX/NAM-12/day/NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20150101-21001231.ncml", + "ISO": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/iso/datasets/simulations/RCM-CMIP6/CORDEX/NAM-12/day/NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20150101-21001231.ncml", + "WCS": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/datasets/simulations/RCM-CMIP6/CORDEX/NAM-12/day/NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20150101-21001231.ncml", + "WMS": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/datasets/simulations/RCM-CMIP6/CORDEX/NAM-12/day/NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20150101-21001231.ncml", + "NetcdfSubset": 
"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/datasets/simulations/RCM-CMIP6/CORDEX/NAM-12/day/NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20150101-21001231.ncml" + } +} diff --git a/tests/data/cordex6_raw.json b/tests/data/cordex6_raw.json new file mode 100644 index 0000000..e75170a --- /dev/null +++ b/tests/data/cordex6_raw.json @@ -0,0 +1,311 @@ +{ + "@location": "Not provided because of security concerns.", + "@xmlns": { + "ncml": "http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2" + }, + "attributes": { + "Conventions": "CF-1.11", + "activity_id": "DD", + "comment": "CRCM5 v3331 0.11 deg AMNO11d1 L56 S17-15m MPI-ESM1-2-LR membre 1 PILSPEC PS3", + "contact": "simulations_ouranos@ouranos.ca", + "creation_date": "2023-12-08T19:52:05Z", + "domain": "North America", + "domain_id": "NAM-12", + "driving_experiment": "gap-filling scenario reaching 7.0 based on SSP3", + "driving_experiment_id": "ssp370", + "driving_institution_id": "MPI-M", + "driving_source_id": "MPI-ESM1-2-LR", + "driving_variant_label": "r1i1p1f1", + "institution_id": "OURANOS", + "mip_era": "CMIP6", + "ouranos_experiment_name": "cau", + "product": "model-output", + "project_id": "CORDEX", + "source_type": "ARCM", + "title": "CRMC5 output, model ran by Ouranos, prepared for CORDEX-CMIP6", + "external_variables": "areacella", + "frequency": "day", + "variable_id": "tas", + "history": "Thu Aug 29 22:41:40 2024: ncks --cmp=dfl,6 CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc.comp\nWed Dec 13 09:09:00 2023: Metadata converted to CORDEX specifications.Monthly files merged by chunks of 5 years. 
Data resampled from H to D.\nFri Sep 16 01:38:18 2022: ncks -O --chunk_policy g3d --cnk_dmn plev,1 --cnk_dmn rlon,50 --cnk_dmn rlat,50 --cnk_dmn time,250 /home/dpaquin1/scratch/arch/cau/209601/nc4c_tas_cau_209601_se.nc /home/dpaquin1/scratch/arch/cau/209601/tas_cau_209601_se.nc\nFri Sep 16 01:36:33 2022: ncks -O --fl_fmt=netcdf4_classic -L 6 /home/dpaquin1/scratch/arch/cau/209601/trim_tas_cau_209601_se.nc /home/dpaquin1/scratch/arch/cau/209601/nc4c_tas_cau_209601_se.nc\nFri Sep 16 01:36:28 2022: ncks -O -d time,2096-01-01 00:00:00,2096-01-31 23:59:59 /home/dpaquin1/postprod/cau/transit2/209601/tas_cau_209601_se.nc /home/dpaquin1/scratch/arch/cau/209601/trim_tas_cau_209601_se.nc", + "tracking_id": "hdl:21.14100/74a3dd5a-c6a8-4e3b-b6e2-5d89289770c9", + "coordinates": "vertices_latitude vertices_longitude crs", + "license": "https://cordex.org/data-access/cordex-cmip6-data/cordex-cmip6-terms-of-use", + "source": "Canadian Regional Climate Model version 5", + "source_id": "CRCM5-SN", + "source_name": "CRCM5", + "doi": "https://zenodo.org/doi/10.5281/zenodo.11061924", + "further_info_url": "https://zenodo.org/doi/10.5281/zenodo.11061924", + "grid": "Rotated-pole latitude-longitude with 0.11 degree grid spacing (AMNO11d1)", + "version_realization": "v1-r1", + "institution": "Ouranos Consortium, Montreal (Quebec), Canada", + "NCO": "netCDF Operators version 5.1.8 (Homepage = http://nco.sf.net, Code = http://github.com/nco/nco, Citation = 10.1016/j.envsoft.2008.03.004)", + "_CoordSysBuilder": "ucar.nc2.dataset.conv.CF1Convention" + }, + "dimensions": { + "rlat": 628, + "rlon": 655, + "time": 1826, + "bounds": 4 + }, + "groups": { + "CFMetadata": { + "attributes": { + "geospatial_lon_min": [ + -179.9917755126953 + ], + "geospatial_lat_min": [ + 6.3356499671936035 + ], + "geospatial_lon_max": [ + 179.9958038330078 + ], + "geospatial_lat_max": [ + 82.84487915039062 + ], + "geospatial_lon_units": "degrees_east", + "geospatial_lat_units": "degrees_north", + 
"geospatial_lon_resolution": "8.751603406088485E-4", + "geospatial_lat_resolution": "1.8600042124828522E-4", + "geospatial_vertical_min": "2.0", + "geospatial_vertical_max": "2.0", + "geospatial_vertical_units": "m", + "geospatial_vertical_resolution": "0.0", + "geospatial_vertical_positive": "up", + "time_coverage_start": "2096-01-01T12:00:00Z", + "time_coverage_end": "2100-12-31T12:00:00Z", + "time_coverage_units": "seconds", + "time_coverage_resolution": "86400.0", + "time_coverage_duration": "P0Y0M1825DT0H0M0.000S" + } + }, + "NCISOMetadata": { + "attributes": { + "metadata_creation": "2024-10-18", + "nciso_version": "2.2.3" + } + } + }, + "variables": { + "rlat": { + "shape": [ + "rlat" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "actual_range": [ + -33.625, + 35.345 + ], + "units": "degrees", + "axis": "Y", + "long_name": "latitude in rotated pole grid", + "standard_name": "grid_latitude", + "bounds": "rlat_bounds", + "_ChunkSizes": [ + 628 + ], + "_CoordinateAxisType": "GeoY" + } + }, + "rlon": { + "shape": [ + "rlon" + ], + "type": "double", + "attributes": { + "long_name": "longitude in rotated pole grid", + "actual_range": [ + -34.045, + 37.895 + ], + "_FillValue": [ + NaN + ], + "axis": "X", + "standard_name": "grid_longitude", + "bounds": "rlon_bounds", + "units": "degrees", + "_ChunkSizes": [ + 655 + ], + "_CoordinateAxisType": "GeoX" + } + }, + "time": { + "shape": [ + "time" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "units": "days since 1950-01-01", + "calendar": "standard", + "_ChunkSizes": [ + 512 + ], + "_CoordinateAxisType": "Time" + } + }, + "crs": { + "shape": [ + "" + ], + "type": "char", + "attributes": { + "grid_mapping_name": "rotated_latitude_longitude", + "grid_north_pole_latitude": [ + 42.5 + ], + "grid_north_pole_longitude": [ + 83.0 + ], + "north_pole_grid_longitude": [ + 0.0 + ], + "_CoordinateTransformType": "Projection", + "_CoordinateAxisTypes": "GeoX GeoY" + } + }, + 
"tas": { + "shape": [ + "time", + "rlat", + "rlon" + ], + "type": "float", + "attributes": { + "long_name": "Near-Surface Air Temperature", + "_FillValue": [ + 1.0000000200408773e+20 + ], + "standard_name": "air_temperature", + "cell_measures": "area: areacella", + "cell_methods": "area: mean time: point", + "missing_value": [ + 1.0000000200408773e+20 + ], + "units": "K", + "grid_mapping": "crs", + "coordinates": "height lat lon", + "_ChunkSizes": [ + 250, + 50, + 50 + ] + } + }, + "vertices_latitude": { + "shape": [ + "rlat", + "rlon", + "bounds" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "coordinates": "lat lon", + "_ChunkSizes": [ + 628, + 655, + 2 + ] + } + }, + "vertices_longitude": { + "shape": [ + "rlat", + "rlon", + "bounds" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "coordinates": "lat lon", + "_ChunkSizes": [ + 628, + 655, + 2 + ] + } + }, + "height": { + "shape": [ + "" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "units": "m", + "standard_name": "height", + "axis": "Z", + "long_name": "height", + "positive": "up", + "_CoordinateAxisType": "Height", + "_CoordinateZisPositive": "up" + } + }, + "lat": { + "shape": [ + "rlat", + "rlon" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "standard_name": "latitude", + "long_name": "latitude", + "units": "degrees_north", + "bounds": "vertices_latitude", + "_ChunkSizes": [ + 628, + 655 + ], + "_CoordinateAxisType": "Lat" + } + }, + "lon": { + "shape": [ + "rlat", + "rlon" + ], + "type": "double", + "attributes": { + "_FillValue": [ + NaN + ], + "standard_name": "longitude", + "long_name": "longitude", + "units": "degrees_east", + "bounds": "vertices_longitude", + "_ChunkSizes": [ + 628, + 655 + ], + "_CoordinateAxisType": "Lon" + } + } + }, + "access_urls": { + "HTTPServer": 
"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc", + "OPENDAP": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc", + "NCML": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncml/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc", + "UDDC": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc", + "ISO": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/iso/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc", + "WCS": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc", + "WMS": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc", + "NetcdfSubset": 
"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/tas_NAM-12_MPI-ESM1-2-LR_ssp370_r1i1p1f1_OURANOS_CRCM5_v1-r1_day_20960101-21001231.nc" + } +} \ No newline at end of file diff --git a/tests/test_cordex.py b/tests/test_cordex.py new file mode 100644 index 0000000..59de5e9 --- /dev/null +++ b/tests/test_cordex.py @@ -0,0 +1,55 @@ +import json +from STACpopulator.extensions.cordex6 import Cordex6DataModel, Cordex6DataModelNcML + + +def get_first_item_attrs(url): + import requests + from siphon.catalog import TDSCatalog + import xncml + from STACpopulator.stac_utils import np2py + + cat = TDSCatalog(url) + + if cat.datasets.items(): + for item_name, ds in cat.datasets.items(): + r = requests.get(ds.access_urls["NCML"]) + attrs = xncml.Dataset.from_text(r.text).to_cf_dict() + attrs["access_urls"] = ds.access_urls + return np2py(attrs) + + +def make_test_data(): + """Fetches attribute data from the PAVICS THREDDS catalog and stores it in the data/ directory as a json.""" + # Raw CORDEX data + url = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/catalog.html" + attrs = get_first_item_attrs(url) + with open("data/cordex6_raw.json", "w") as f: + json.dump(attrs, f, indent=2) + + # NcML CORDEX data + url = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/datasets/simulations/RCM-CMIP6/CORDEX/NAM-12/day/catalog.html" + attrs = get_first_item_attrs(url) + with open("data/cordex6_ncml.json", "w") as f: + json.dump(attrs, f, indent=2) + + + +def test_item_raw(): + attrs = json.load(open("tests/data/cordex6_raw.json")) + model = Cordex6DataModel.from_data(attrs) + item = model.stac_item() + assert set(model._helpers) == {"cordex6", "thredds", "datacube"} + + assert item["properties"]["cordex6:activity_id"] == "DD" + 
assert item["properties"]["cordex6:project_id"] == "CORDEX" + + +def test_item_ncml(): + attrs = json.load(open("tests/data/cordex6_ncml.json")) + model = Cordex6DataModelNcML.from_data(attrs) + item = model.stac_item() + assert set(model._helpers) == {"cordex6", "thredds", "datacube", "xscen"} + + assert item["properties"]["cordex6:activity_id"] == "DD" + assert item["properties"]["cordex6:project_id"] == "CORDEX" + assert item["properties"]["xscen:type"] == "simulation"