From 71ce3e076185bb413c580060e36ee27990648d59 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:03:00 -0400 Subject: [PATCH 01/36] adding numpy types to python types conversion for metadata --- STACpopulator/input.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 088051f..3b2c11c 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from typing import Any, Iterator, MutableMapping, Optional, Tuple +import numpy as np import requests import siphon import xncml @@ -37,10 +38,6 @@ def reset(self): pass - - - - class THREDDSLoader(GenericLoader): def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> None: """Constructor @@ -89,6 +86,21 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() + # Converting numpy datatypes to python standard datatypes + for key, value in attrs["attributes"].items(): + if isinstance(value, list): + newlist = [] + for item in value: + if issubclass(type(item), np.integer): + newlist.append(int(item)) + elif issubclass(type(item), np.floating): + newlist.append(float(item)) + else: + newlist.append(item) + attrs["attributes"][key] = newlist + elif isinstance(type(value), np.integer): + attrs["attributes"][key] = int(value) + attrs["access_urls"] = ds.access_urls return attrs From 350b4f4cb26f064fe83e25eb436417ccc89c5312 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:04:37 -0400 Subject: [PATCH 02/36] removing collection2enum --- STACpopulator/stac_utils.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 24efb07..cb424c2 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -47,23 +47,6 @@ def url_validate(target: str) -> bool: return True if re.match(url_regex, target) else False -def collection2enum(collection: pyessv.model.collection.Collection) -> enumtype: - """Create Enum based on terms from pyessv collection. - - Parameters - ---------- - collection : pyessv.model.collection.Collection - pyessv collection of terms. - - Returns - ------- - Enum - Enum storing terms and their labels from collection. - """ - mp = {term.name: term.label for term in collection} - return Enum(collection.raw_name.capitalize(), mp, module="base") - - def collection2literal(collection): import typing terms = tuple(term.label for term in collection) From 2a445f04b7f4c12dd50401baf8ce4502aa696aff Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:08:24 -0400 Subject: [PATCH 03/36] black --- STACpopulator/stac_utils.py | 318 +++++++++++++++++++----------------- 1 file changed, 170 insertions(+), 148 deletions(-) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index cb424c2..6c4f67a 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -135,6 +135,7 @@ class Item(BaseModel): class CFJsonItem: """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" + axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} def __init__(self, iid: str, attrs: dict, datamodel=None): @@ -154,10 +155,11 @@ def __init__(self, iid: str, attrs: dict, datamodel=None): cfmeta = attrs["groups"]["CFMetadata"]["attributes"] # Global attributes - gattrs = {"start_datetime": cfmeta["time_coverage_start"], - "end_datetime": cfmeta["time_coverage_end"], - **attrs["attributes"], - } + gattrs = { + "start_datetime": cfmeta["time_coverage_start"], + "end_datetime": cfmeta["time_coverage_end"], + **attrs["attributes"], + } # Validate using pydantic data model if given datamodel = datamodel or dict @@ -179,8 +181,8 @@ class MySTACItem(Item): # Add assets if "access_urls" in attrs: root = attrs["access_urls"] - elif 'THREDDSMetadata' in attrs["groups"]: - root = attrs["groups"]['THREDDSMetadata']['groups']['services']['attributes'] + elif "THREDDSMetadata" in attrs["groups"]: + root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] else: root = {} @@ -244,31 +246,32 @@ def dimensions(self) -> dict: bbox = self.obj.ncattrs_to_bbox() for key, criteria in coordinate_criteria.items(): for criterion, expected in criteria.items(): - if v['attributes'].get(criterion, None) in expected: + if v["attributes"].get(criterion, None) in expected: axis = self.axis[key] - type_ = DimensionType.SPATIAL if axis in ['x', 'y', 'z'] else DimensionType.TEMPORAL + type_ = DimensionType.SPATIAL if axis in ["x", "y", "z"] else DimensionType.TEMPORAL - if v['type'] == 'int': + if v["type"] == "int": extent = [0, int(length)] else: # Not clear the logic is sound - if key == 'X': + if key == "X": extent = bbox[0], bbox[2] elif key == "Y": extent = bbox[1], bbox[3] else: extent = None - dims[name] = Dimension(properties=dict( - axis = axis, - type = type_, - extent = extent, - description=v.get("description", v.get("long_name", criteria["standard_name"])) + dims[name] = Dimension( + properties=dict( + axis=axis, + type=type_, + extent=extent, + description=v.get("description", v.get("long_name", criteria["standard_name"])), ) ) return dims - def variables(self)->dict: + def variables(self) -> dict: """Return Variable objects required for Datacube extension.""" variables = {} @@ -276,17 +279,18 @@ def variables(self)->dict: if name in self.attrs["dimensions"]: continue - attrs = meta['attributes'] - variables[name] = Variable(properties=dict( + attrs = meta["attributes"] + variables[name] = Variable( + properties=dict( dimensions=meta["shape"], - type = VariableType.AUXILIARY.value if self.is_coordinate(attrs) else - VariableType.DATA.value, + type=VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, description=attrs.get("description", attrs.get("long_name")), - unit=attrs.get("units", None) - )) + unit=attrs.get("units", None), + ) + ) return variables - def is_coordinate(self, attrs: dict)-> bool: + def is_coordinate(self, attrs: dict) -> bool: """Return whether variable is a coordinate.""" for key, criteria in coordinate_criteria.items(): for criterion, expected in criteria.items(): @@ -297,128 +301,146 @@ def is_coordinate(self, attrs: dict)-> bool: # From CF-Xarray coordinate_criteria = { - 'latitude': {'standard_name': ('latitude',), - 'units': ('degree_north', - 'degree_N', - 'degreeN', - 'degrees_north', - 'degrees_N', - 'degreesN'), - '_CoordinateAxisType': ('Lat',), - 'long_name': ('latitude',)}, - 'longitude': {'standard_name': ('longitude',), - 'units': ('degree_east', - 'degree_E', - 'degreeE', - 'degrees_east', - 'degrees_E', - 'degreesE'), - '_CoordinateAxisType': ('Lon',), - 'long_name': ('longitude',)}, - 'Z': {'standard_name': ('model_level_number', - 'atmosphere_ln_pressure_coordinate', - 'atmosphere_sigma_coordinate', - 'atmosphere_hybrid_sigma_pressure_coordinate', - 'atmosphere_hybrid_height_coordinate', - 'atmosphere_sleve_coordinate', - 'ocean_sigma_coordinate', - 'ocean_s_coordinate', - 'ocean_s_coordinate_g1', - 'ocean_s_coordinate_g2', - 'ocean_sigma_z_coordinate', - 'ocean_double_sigma_coordinate'), - '_CoordinateAxisType': ('GeoZ', 'Height', 'Pressure'), - 'axis': ('Z',), - 'cartesian_axis': ('Z',), - 'grads_dim': ('z',), - 'long_name': ('model_level_number', - 'atmosphere_ln_pressure_coordinate', - 'atmosphere_sigma_coordinate', - 'atmosphere_hybrid_sigma_pressure_coordinate', - 'atmosphere_hybrid_height_coordinate', - 'atmosphere_sleve_coordinate', - 'ocean_sigma_coordinate', - 'ocean_s_coordinate', - 'ocean_s_coordinate_g1', - 'ocean_s_coordinate_g2', - 'ocean_sigma_z_coordinate', - 'ocean_double_sigma_coordinate')}, - 'vertical': {'standard_name': ('air_pressure', - 'height', - 'depth', - 'geopotential_height', - 'altitude', - 'height_above_geopotential_datum', - 'height_above_reference_ellipsoid', - 'height_above_mean_sea_level'), - 'positive': ('up', 'down'), - 'long_name': ('air_pressure', - 'height', - 'depth', - 'geopotential_height', - 'altitude', - 'height_above_geopotential_datum', - 'height_above_reference_ellipsoid', - 'height_above_mean_sea_level')}, - 'X': {'standard_name': ('projection_x_coordinate', - 'grid_longitude', - 'projection_x_angular_coordinate'), - '_CoordinateAxisType': ('GeoX',), - 'axis': ('X',), - 'cartesian_axis': ('X',), - 'grads_dim': ('x',), - 'long_name': ('projection_x_coordinate', - 'grid_longitude', - 'projection_x_angular_coordinate', - 'cell index along first dimension')}, - 'Y': {'standard_name': ('projection_y_coordinate', - 'grid_latitude', - 'projection_y_angular_coordinate'), - '_CoordinateAxisType': ('GeoY',), - 'axis': ('Y',), - 'cartesian_axis': ('Y',), - 'grads_dim': ('y',), - 'long_name': ('projection_y_coordinate', - 'grid_latitude', - 'projection_y_angular_coordinate', - 'cell index along second dimension')}, - 'T': {'standard_name': ('time',), - '_CoordinateAxisType': ('Time',), - 'axis': ('T',), - 'cartesian_axis': ('T',), - 'grads_dim': ('t',), - 'long_name': ('time',)}, - 'time': {'standard_name': ('time',), - '_CoordinateAxisType': ('Time',), - 'axis': ('T',), - 'cartesian_axis': ('T',), - 'grads_dim': ('t',), - 'long_name': ('time',)}} - - -media_types = {"httpserver_service": "application/x-netcdf", - "opendap_service": pystac.MediaType.HTML, - "wcs_service": pystac.MediaType.XML, - "wms_service": pystac.MediaType.XML, - "nccs_service": "application/x-netcdf", - "HTTPServer": "application/x-netcdf", - "OPENDAP": pystac.MediaType.HTML, - "NCML": pystac.MediaType.XML, - "WCS": pystac.MediaType.XML, - "ISO": pystac.MediaType.XML, - "WMS": pystac.MediaType.XML, - "NetcdfSubset": "application/x-netcdf", - } - -asset_roles = {"httpserver_service": ["data"], - "opendap_service": ["data"], - "wcs_service": ["data"], - "wms_service": ["visual"], - "nccs_service": ["data"], - "HTTPServer": ["data"], - "OPENDAP": ["data"], - "NCML": ["metadata"], - "WCS": ["data"], - "ISO": ["metadata"], - "WMS": ["visual"], - "NetcdfSubset": ["data"],} + "latitude": { + "standard_name": ("latitude",), + "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), + "_CoordinateAxisType": ("Lat",), + "long_name": ("latitude",), + }, + "longitude": { + "standard_name": ("longitude",), + "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), + "_CoordinateAxisType": ("Lon",), + "long_name": ("longitude",), + }, + "Z": { + "standard_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), + "axis": ("Z",), + "cartesian_axis": ("Z",), + "grads_dim": ("z",), + "long_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + }, + "vertical": { + "standard_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + "positive": ("up", "down"), + "long_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + }, + "X": { + "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), + "_CoordinateAxisType": ("GeoX",), + "axis": ("X",), + "cartesian_axis": ("X",), + "grads_dim": ("x",), + "long_name": ( + "projection_x_coordinate", + "grid_longitude", + "projection_x_angular_coordinate", + "cell index along first dimension", + ), + }, + "Y": { + "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), + "_CoordinateAxisType": ("GeoY",), + "axis": ("Y",), + "cartesian_axis": ("Y",), + "grads_dim": ("y",), + "long_name": ( + "projection_y_coordinate", + "grid_latitude", + "projection_y_angular_coordinate", + "cell index along second dimension", + ), + }, + "T": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + "time": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, +} + + +media_types = { + "httpserver_service": "application/x-netcdf", + "opendap_service": pystac.MediaType.HTML, + "wcs_service": pystac.MediaType.XML, + "wms_service": pystac.MediaType.XML, + "nccs_service": "application/x-netcdf", + "HTTPServer": "application/x-netcdf", + "OPENDAP": pystac.MediaType.HTML, + "NCML": pystac.MediaType.XML, + "WCS": pystac.MediaType.XML, + "ISO": pystac.MediaType.XML, + "WMS": pystac.MediaType.XML, + "NetcdfSubset": "application/x-netcdf", +} + +asset_roles = { + "httpserver_service": ["data"], + "opendap_service": ["data"], + "wcs_service": ["data"], + "wms_service": ["visual"], + "nccs_service": ["data"], + "HTTPServer": ["data"], + "OPENDAP": ["data"], + "NCML": ["metadata"], + "WCS": ["data"], + "ISO": ["metadata"], + "WMS": ["visual"], + "NetcdfSubset": ["data"], +} From 2728ce664e83ab212eb0209841fbec2ff29462fc Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:13:51 -0400 Subject: [PATCH 04/36] extracting pydantic base models to models.py --- STACpopulator/models.py | 74 ++++++++++++++++ STACpopulator/stac_utils.py | 112 ++---------------------- implementations/CMIP6-UofT/add_CMIP6.py | 43 ++++----- 3 files changed, 99 insertions(+), 130 deletions(-) create mode 100644 STACpopulator/models.py diff --git a/STACpopulator/models.py b/STACpopulator/models.py new file mode 100644 index 0000000..2b617b1 --- /dev/null +++ b/STACpopulator/models.py @@ -0,0 +1,74 @@ +import datetime as dt +from typing import Any, Dict, List, Optional, Union + +from pydantic import AnyHttpUrl, AnyUrl, BaseModel, Field, field_validator +from typing_extensions import TypedDict + + +class Geometry(TypedDict): + type: str + coordinates: List[List[List[float]]] + + +class Asset(BaseModel): + href: AnyHttpUrl + media_type: Optional[str] = None + title: Optional[str] = None + description: Optional[str] = None + roles: Optional[List[str]] = None + + +class STACItemProperties(BaseModel): + start_datetime: Optional[dt.datetime] = None + end_datetime: Optional[dt.datetime] = None + datetime: Optional[dt.datetime] = None + + @field_validator("datetime", mode="before") + @classmethod + def validate_datetime(cls, v: Union[dt.datetime, str], values: Dict[str, Any]) -> dt: + if v == "null": + if not values["start_datetime"] and not values["end_datetime"]: + raise ValueError("start_datetime and end_datetime must be specified when datetime is null") + + +# class Link(BaseModel): +# """ +# https://github.com/radiantearth/stac-spec/blob/v1.0.0/collection-spec/collection-spec.md#link-object +# """ + +# href: str = Field(..., alias="href", min_length=1) +# rel: str = Field(..., alias="rel", min_length=1) +# type: Optional[str] = None +# title: Optional[str] = None +# # Label extension +# label: Optional[str] = Field(None, alias="label:assets") +# model_config = ConfigDict(use_enum_values=True) + +# def resolve(self, base_url: str) -> None: +# """resolve a link to the given base URL""" +# self.href = urljoin(base_url, self.href) + + +# class PaginationLink(Link): +# """ +# https://github.com/radiantearth/stac-api-spec/blob/master/api-spec.md#paging-extension +# """ + +# rel: Literal["next", "previous"] +# method: Literal["GET", "POST"] +# body: Optional[Dict[Any, Any]] = None +# merge: bool = False + + +# Links = RootModel[List[Union[PaginationLink, Link]]] + + +class STACItem(BaseModel): + id: str = Field(..., alias="id", min_length=1) + geometry: Optional[Geometry] = None + bbox: Optional[List[float]] = None + properties: Optional[STACItemProperties] = None + assets: Dict[str, Asset] = None + stac_extensions: Optional[List[AnyUrl]] = [] + collection: Optional[str] = None + datetime: Optional[dt.datetime] = None # Not in the spec, but needed by pystac.Item. diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 6c4f67a..df84f10 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,27 +1,12 @@ -import re import json -import datetime as dt -from enum import Enum, auto -from typing import Any, Iterator, MutableMapping, Optional, Tuple, Union -from typing import Any, Dict, List, Literal, MutableMapping -from typing_extensions import TypedDict -import pystac -from pystac.extensions.datacube import Dimension, DimensionType, VariableType, Variable, DatacubeExtension -from pydantic import AnyHttpUrl, BaseModel, field_validator, Field, ConfigDict, RootModel, AnyUrl -from urllib.parse import urljoin - - -import pyessv - +import re +from typing import Any, Literal, MutableMapping -try: - from enum import EnumType as enumtype -except ImportError: - # < Python 3.11 - from enum import EnumMeta as enumtype +import pystac +from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType +from STACpopulator.models import STACItem, STACItemProperties -STAC_VERSION = "1.0.0" def url_validate(target: str) -> bool: """Validate whether a supplied URL is reliably written. @@ -48,89 +33,8 @@ def url_validate(target: str) -> bool: def collection2literal(collection): - import typing terms = tuple(term.label for term in collection) - return typing.Literal[terms] - - -class AutoValueEnum(Enum): - def _generate_next_value_( # type: ignore - name: str, start: int, count: int, last_values: List[Any] - ) -> Any: - return name - - -# DH: There is a question here whether we want to use pystac.Item or not. -# pystac.Item takes datetime, start_datetime and end_datetime as optional parameters, and then copies them into -# properties. -# If we use pystac.Item, we don't have to put start_datetime and end_datetime into Properties, we can let pystac do -# that. -class ItemProperties(BaseModel): - start_datetime: Optional[dt.datetime] = None - end_datetime: Optional[dt.datetime] = None - datetime: Optional[dt.datetime] = None - - @field_validator("datetime", mode="before") - def validate_datetime(cls, v: Union[dt.datetime, str], values: Dict[str, Any]) -> dt: - if v == "null": - if not values["start_datetime"] and not values["end_datetime"]: - raise ValueError( - "start_datetime and end_datetime must be specified when datetime is null" - ) - - -class Geometry(TypedDict): - type: str - coordinates: List[List[List[float]]] - -class Asset(BaseModel): - href: AnyHttpUrl - media_type: Optional[str] = None - title: Optional[str] = None - description: Optional[str] = None - roles: Optional[List[str]] = None - -class Link(BaseModel): - """ - https://github.com/radiantearth/stac-spec/blob/v1.0.0/collection-spec/collection-spec.md#link-object - """ - - href: str = Field(..., alias="href", min_length=1) - rel: str = Field(..., alias="rel", min_length=1) - type: Optional[str] = None - title: Optional[str] = None - # Label extension - label: Optional[str] = Field(None, alias="label:assets") - model_config = ConfigDict(use_enum_values=True) - - def resolve(self, base_url: str) -> None: - """resolve a link to the given base URL""" - self.href = urljoin(base_url, self.href) - - -class PaginationLink(Link): - """ - https://github.com/radiantearth/stac-api-spec/blob/master/api-spec.md#paging-extension - """ - - rel: Literal["next", "previous"] - method: Literal["GET", "POST"] - body: Optional[Dict[Any, Any]] = None - merge: bool = False - -Links = RootModel[List[Union[PaginationLink, Link]]] - - -class Item(BaseModel): - id: str = Field(..., alias="id", min_length=1) - geometry: Optional[Geometry] = None - bbox: Optional[List[float]] = None - properties: Optional[ItemProperties] = None - assets: Dict[str, Asset] = None - stac_extensions: Optional[List[AnyUrl]] = [] - collection: Optional[str] = None - datetime: Optional[dt.datetime] = None # Not in the spec, but needed by pystac.Item. - + return Literal[terms] class CFJsonItem: @@ -162,9 +66,9 @@ def __init__(self, iid: str, attrs: dict, datamodel=None): } # Validate using pydantic data model if given - datamodel = datamodel or dict + datamodel = datamodel or STACItemProperties - class MySTACItem(Item): + class MySTACItem(STACItem): properties: datamodel # Create STAC item diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 4fdea40..55162f6 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,19 +1,18 @@ -import logging +import argparse import hashlib +import logging from datetime import datetime from typing import Any, Dict, List, Literal, MutableMapping -from colorlog import ColoredFormatter -import argparse + import pyessv -from pydantic import AnyHttpUrl, BaseModel, Field, FieldValidationInfo, field_validator -from pystac.extensions.datacube import DatacubeExtension +from colorlog import ColoredFormatter +from pydantic import AnyHttpUrl, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase from STACpopulator.extensions import cmip6 from STACpopulator.input import THREDDSLoader -from STACpopulator.stac_utils import ItemProperties -from STACpopulator.stac_utils import collection2literal, CFJsonItem - +from STACpopulator.models import STACItemProperties +from STACpopulator.stac_utils import CFJsonItem, collection2literal LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -33,17 +32,15 @@ Frequency = collection2literal(CV.frequency) GridLabel = collection2literal(CV.grid_label) InstitutionID = collection2literal(CV.institution_id) -# Member = collection2literal(CV.member_id) # This is empty NominalResolution = collection2literal(CV.nominal_resolution) Realm = collection2literal(CV.realm) SourceID = collection2literal(CV.source_id) SourceType = collection2literal(CV.source_type) SubExperimentID = collection2literal(CV.sub_experiment_id) TableID = collection2literal(CV.table_id) -# Variable = collection2literal(CV.variable_id) # This is empty -class Properties(ItemProperties, validate_assignment=True): +class CMIP6ItemProperties(STACItemProperties, validate_assignment=True): """Data model for CMIP6 Controlled Vocabulary.""" Conventions: str = Field(..., serialization_alias="cmip6:Conventions") @@ -129,7 +126,7 @@ def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: st """ data_loader = THREDDSLoader(thredds_catalog_url) - self.props_model = Properties + self.item_properties_model = CMIP6ItemProperties super().__init__(stac_host, data_loader, config_filename) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): @@ -147,22 +144,16 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) """ iid = make_cmip6_item_id(item_data["attributes"]) - obj = CFJsonItem(iid, item_data, self.props_model) - - # Add CMIP6 extension - try: - cmip6_ext = cmip6.CMIP6Extension.ext(obj.item, add_if_missing=True) - cmip6_ext.apply(item_data["attributes"]) - except: - LOGGER.warning(f"Failed to add CMIP6 extension to item {item_name}") + obj = CFJsonItem(iid, item_data, self.item_properties_model) - # Add datacube extension - try: - dc_ext = DatacubeExtension.ext(obj.item, add_if_missing=True) - dc_ext.apply(dimensions=obj.dimensions(), variables=obj.variables()) - except: - LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") + # # Add datacube extension + # try: + # dc_ext = DatacubeExtension.ext(obj.item, add_if_missing=True) + # dc_ext.apply(dimensions=obj.dimensions(), variables=obj.variables()) + # except: + # LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") + print(obj.item.to_dict()) return obj.item.to_dict() def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: From b47d613e659767149807fed8984bfbf709f94388 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:17:11 -0400 Subject: [PATCH 05/36] removing cmip6 extension code --- STACpopulator/extensions/__init__.py | 0 STACpopulator/extensions/cmip6.py | 179 ------------------------ implementations/CMIP6-UofT/add_CMIP6.py | 1 - 3 files changed, 180 deletions(-) delete mode 100644 STACpopulator/extensions/__init__.py delete mode 100644 STACpopulator/extensions/cmip6.py diff --git a/STACpopulator/extensions/__init__.py b/STACpopulator/extensions/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/STACpopulator/extensions/cmip6.py b/STACpopulator/extensions/cmip6.py deleted file mode 100644 index 2e75020..0000000 --- a/STACpopulator/extensions/cmip6.py +++ /dev/null @@ -1,179 +0,0 @@ -"""CMIP6 extension based on https://stac-extensions.github.io/cmip6/v1.0.0/schema.json""" - -import json -from typing import Generic, TypeVar, Union, cast - -import pystac -from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension -from pystac.extensions.hooks import ExtensionHooks - -from datetime import date, datetime -from typing import Any, Dict, List, Literal -import pyessv -from pydantic import ( - AnyHttpUrl, - FieldValidationInfo, - field_validator, - model_serializer, -) -from pydantic.networks import Url - - -from STACpopulator.stac_utils import ItemProperties -from STACpopulator.stac_utils import collection2literal - -T = TypeVar("T", pystac.Collection, pystac.Item, pystac.Asset) - -SCHEMA_URI = "https://stac-extensions.github.io/cmip6/v1.0.0/schema.json" - - -# CMIP6 controlled vocabulary (CV) -CV = pyessv.WCRP.CMIP6 - -# Enum classes built from the pyessv' CV -ActivityID = collection2literal(CV.activity_id) -ExperimentID = collection2literal(CV.experiment_id) -Frequency = collection2literal(CV.frequency) -GridLabel = collection2literal(CV.grid_label) -InstitutionID = collection2literal(CV.institution_id) -NominalResolution = collection2literal(CV.nominal_resolution) -Realm = collection2literal(CV.realm) -SourceID = collection2literal(CV.source_id) -SourceType = collection2literal(CV.source_type) -SubExperimentID = collection2literal(CV.sub_experiment_id) -TableID = collection2literal(CV.table_id) - - -class Properties(ItemProperties, validate_assignment=True): - """Data model for CMIP6 Controlled Vocabulary.""" - - Conventions: str - activity_id: ActivityID - creation_date: datetime - data_specs_version: str - experiment: str - experiment_id: ExperimentID - frequency: Frequency - further_info_url: AnyHttpUrl - grid_label: GridLabel - institution: str - institution_id: InstitutionID - nominal_resolution: NominalResolution - realm: List[Realm] - source: str - source_id: SourceID - source_type: List[SourceType] - sub_experiment: Union[str, Literal["none"]] - sub_experiment_id: Union[SubExperimentID, Literal["none"]] - table_id: TableID - variable_id: str - variant_label: str - initialization_index: int - physics_index: int - realization_index: int - forcing_index: int - tracking_id: str - version: str - product: str - license: str - grid: str - mip_era: str - - - @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") - @classmethod - def first_item(cls, v: list, info: FieldValidationInfo): - """Pick single item from list.""" - assert len(v) == 1, f"{info.field_name} must have one item only." - return v[0] - - @field_validator("realm", "source_type", mode="before") - @classmethod - def split(cls, v: str, info: FieldValidationInfo): - """Split string into list.""" - return v.split(" ") - - @field_validator("version") - @classmethod - def validate_version(cls, v: str, info: FieldValidationInfo): - assert v[0] == "v", "Version string should begin with a lower case 'v'" - assert v[1:].isdigit(), "All characters in version string, except first, should be digits" - return v - - -class CMIP6Extension(Generic[T], ExtensionManagementMixin[pystac.Item], PropertiesExtension): - """An abstract class that can be used to extend the properties of a - :class:`~pystac.Item` with properties from the :stac-ext:`CMIP6 Extension `. - - To create an instance of :class:`CMIP6Extension`, use the :meth:`CMIP6Extension.ext` method. - """ - prefix: str = "cmip6:" - - def apply(self, attrs: Dict[str, Any]) -> None: - """Applies Datacube Extension properties to the extended - :class:`~pystac.Collection`, :class:`~pystac.Item` or :class:`~pystac.Asset`. - - Args: - dimensions : Dictionary mapping dimension name to :class:`Dimension` - objects. - variables : Dictionary mapping variable name to a :class:`Variable` - object. - """ - import json - - p = Properties(**attrs) - - # Add prefix - objs = {self.prefix + k: v for (k, v) in json.loads(p.model_dump_json()).items()} - - # Update item properties - self.properties.update(**objs) - - @classmethod - def get_schema_uri(cls) -> str: - return SCHEMA_URI - - @classmethod - def ext(cls, obj: T, add_if_missing: bool = False): - """Extends the given STAC Object with properties from the :stac-ext:`CMIP6 - Extension `. - - This extension can be applied to instances of :class:`~pystac.Item`. - - Raises: - pystac.ExtensionTypeError : If an invalid object type is passed. - """ - if isinstance(obj, pystac.Item): - cls.validate_has_extension(obj, add_if_missing) - return cast(CMIP6Extension[T], ItemCMIP6Extension(obj)) - else: - raise pystac.ExtensionTypeError(cls._ext_error_message(obj)) - - -class ItemCMIP6Extension(CMIP6Extension[pystac.Item]): - """A concrete implementation of :class:`DatacubeExtension` on an - :class:`~pystac.Item` that extends the properties of the Item to include properties - defined in the :stac-ext:`Datacube Extension `. - - This class should generally not be instantiated directly. Instead, call - :meth:`DatacubeExtension.ext` on an :class:`~pystac.Item` to extend it. - """ - - item: pystac.Item - properties: Dict[str, Any] - - def __init__(self, item: pystac.Item): - self.item = item - self.properties = item.properties - - def __repr__(self) -> str: - return "".format(self.item.id) - - -class CMIP6ExtensionHooks(ExtensionHooks): - schema_uri: str = SCHEMA_URI - prev_extension_ids = {"cmip6"} - stac_object_types = {pystac.STACObjectType.ITEM} - - -CMIP6_EXTENSION_HOOKS: ExtensionHooks = CMIP6ExtensionHooks() diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 55162f6..8cf0298 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -9,7 +9,6 @@ from pydantic import AnyHttpUrl, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase -from STACpopulator.extensions import cmip6 from STACpopulator.input import THREDDSLoader from STACpopulator.models import STACItemProperties from STACpopulator.stac_utils import CFJsonItem, collection2literal From 2f5dc39598f13b8f028a31723030306f287da5ce Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 6 Oct 2023 14:34:15 -0400 Subject: [PATCH 06/36] Breaking CFJsonItem part 1: extracting STAC item creation --- STACpopulator/models.py | 17 ++++- STACpopulator/populator_base.py | 21 ++++-- STACpopulator/stac_utils.py | 90 +++++++++++++++++++++++++ implementations/CMIP6-UofT/add_CMIP6.py | 19 ++++-- 4 files changed, 131 insertions(+), 16 deletions(-) diff --git a/STACpopulator/models.py b/STACpopulator/models.py index 2b617b1..625efc2 100644 --- a/STACpopulator/models.py +++ b/STACpopulator/models.py @@ -1,7 +1,14 @@ import datetime as dt from typing import Any, Dict, List, Optional, Union -from pydantic import AnyHttpUrl, AnyUrl, BaseModel, Field, field_validator +from pydantic import ( + AnyHttpUrl, + AnyUrl, + BaseModel, + Field, + SerializeAsAny, + field_validator, +) from typing_extensions import TypedDict @@ -19,6 +26,10 @@ class Asset(BaseModel): class STACItemProperties(BaseModel): + """Base STAC Item properties data model. In concrete implementations, users would want to define a new + data model that inherits from this base model and extends it with properties tailored to the data they are + ingesting.""" + start_datetime: Optional[dt.datetime] = None end_datetime: Optional[dt.datetime] = None datetime: Optional[dt.datetime] = None @@ -64,10 +75,12 @@ def validate_datetime(cls, v: Union[dt.datetime, str], values: Dict[str, Any]) - class STACItem(BaseModel): + """STAC Item data model.""" + id: str = Field(..., alias="id", min_length=1) geometry: Optional[Geometry] = None bbox: Optional[List[float]] = None - properties: Optional[STACItemProperties] = None + properties: Optional[SerializeAsAny[STACItemProperties]] = None assets: Dict[str, Asset] = None stac_extensions: Optional[List[AnyUrl]] = [] collection: Optional[str] = None diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 2541fe7..3d8f50c 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -58,7 +58,7 @@ def __init__( self._ingest_pipeline = data_loader self._stac_host = self.validate_host(stac_host) - #self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() + # self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() self._collection_id = self.collection_name LOGGER.info("Initialization complete") LOGGER.info(f"Collection {self.collection_name} is assigned id {self._collection_id}") @@ -76,6 +76,13 @@ def stac_host(self) -> str: def collection_id(self) -> str: return self._collection_id + @property + @abstractmethod + def item_properties_model(self): + """In derived classes, this property should be defined as a pydantic data model that derives from + models.STACItemProperties.""" + pass + def validate_host(self, stac_host: str) -> str: if not url_validate(stac_host): raise ValueError("stac_host URL is not appropriately formatted") @@ -115,12 +122,12 @@ def ingest(self) -> None: for item_name, item_data in self._ingest_pipeline: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) - post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) - try: - pass - except Exception: - LOGGER.error(f"Failed adding STAC item {item_name}") - self.handle_ingestion_error("Posting Error", item_name, item_data) + # post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) + # try: + # pass + # except Exception: + # LOGGER.error(f"Failed adding STAC item {item_name}") + # self.handle_ingestion_error("Posting Error", item_name, item_data) @abstractmethod def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index df84f10..361eead 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -37,6 +37,96 @@ def collection2literal(collection): return Literal[terms] +def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + """Create Polygon geometry from CFMetadata.""" + attrs = attrs["groups"]["CFMetadata"]["attributes"] + return { + "type": "Polygon", + "coordinates": [ + [ + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_max"][0]), + ], + [ + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_max"][0]), + ], + [ + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + ] + ], + } + + +def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: + """Create BBOX from CFMetadata.""" + attrs = attrs["groups"]["CFMetadata"]["attributes"] + return [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_max"][0]), + ] + + +def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel): + """ + Create STAC Item from CF JSON metadata. + + Parameters + ---------- + iid : str + Unique item ID. + attrs: dict + CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. + datamodel : pydantic.BaseModel, optional + Data model for validating global attributes. + """ + + cfmeta = attrs["groups"]["CFMetadata"]["attributes"] + + # Create pydantic STAC item + item = STACItem( + id=iid, + geometry=ncattrs_to_geometry(attrs), + bbox=ncattrs_to_bbox(attrs), + properties=item_props_datamodel( + start_datetime=cfmeta["time_coverage_start"], + end_datetime=cfmeta["time_coverage_end"], + **attrs["attributes"], + ), + datetime=None, + ) + + # Convert pydantic STAC item to a PySTAC Item + item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) + + # Add assets + if "access_urls" in attrs: + root = attrs["access_urls"] + elif "THREDDSMetadata" in attrs["groups"]: + root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] + else: + root = {} + + for name, url in root.items(): + asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) + item.add_asset(name, asset) + + return item + + class CFJsonItem: """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 8cf0298..bfb5d53 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -11,7 +11,11 @@ from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader from STACpopulator.models import STACItemProperties -from STACpopulator.stac_utils import CFJsonItem, collection2literal +from STACpopulator.stac_utils import ( + CFJsonItem, + STAC_item_from_metadata, + collection2literal, +) LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -109,10 +113,11 @@ def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: ] name = "_".join(attrs[k] for k in keys) return name - return hashlib.md5(name.encode("utf-8")).hexdigest() class CMIP6populator(STACpopulatorBase): + item_properties_model = CMIP6ItemProperties + def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: str) -> None: """Constructor @@ -125,7 +130,6 @@ def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: st """ data_loader = THREDDSLoader(thredds_catalog_url) - self.item_properties_model = CMIP6ItemProperties super().__init__(stac_host, data_loader, config_filename) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): @@ -143,17 +147,18 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) """ iid = make_cmip6_item_id(item_data["attributes"]) - obj = CFJsonItem(iid, item_data, self.item_properties_model) + item = STAC_item_from_metadata(iid, item_data, self.item_properties_model) - # # Add datacube extension + # Add datacube extension # try: # dc_ext = DatacubeExtension.ext(obj.item, add_if_missing=True) # dc_ext.apply(dimensions=obj.dimensions(), variables=obj.variables()) # except: # LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") - print(obj.item.to_dict()) - return obj.item.to_dict() + # print(obj.item.to_dict()) + # return obj.item.to_dict() + print(item.to_dict()) def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. From 3f821ce73439062ae568b857680b4dbedb6c05fd Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 6 Oct 2023 14:50:24 -0400 Subject: [PATCH 07/36] Breaking CFJsonItem part 2: extracting datacube extension code --- STACpopulator/stac_utils.py | 285 +---------------------- implementations/CMIP6-UofT/add_CMIP6.py | 19 +- implementations/CMIP6-UofT/extensions.py | 201 ++++++++++++++++ 3 files changed, 210 insertions(+), 295 deletions(-) create mode 100644 implementations/CMIP6-UofT/extensions.py diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 361eead..7cf3ed9 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -3,9 +3,8 @@ from typing import Any, Literal, MutableMapping import pystac -from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType -from STACpopulator.models import STACItem, STACItemProperties +from STACpopulator.models import STACItem def url_validate(target: str) -> bool: @@ -127,288 +126,6 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop return item -class CFJsonItem: - """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" - - axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} - - def __init__(self, iid: str, attrs: dict, datamodel=None): - """ - Create STAC Item from CF JSON metadata. - - Parameters - ---------- - iid : str - Unique item ID. - attrs: dict - CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. - datamodel : pydantic.BaseModel, optional - Data model for validating global attributes. - """ - self.attrs = attrs - cfmeta = attrs["groups"]["CFMetadata"]["attributes"] - - # Global attributes - gattrs = { - "start_datetime": cfmeta["time_coverage_start"], - "end_datetime": cfmeta["time_coverage_end"], - **attrs["attributes"], - } - - # Validate using pydantic data model if given - datamodel = datamodel or STACItemProperties - - class MySTACItem(STACItem): - properties: datamodel - - # Create STAC item - item = MySTACItem( - id=iid, - geometry=self.ncattrs_to_geometry(), - bbox=self.ncattrs_to_bbox(), - properties=gattrs, - datetime=None, - ) - - item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) - - # Add assets - if "access_urls" in attrs: - root = attrs["access_urls"] - elif "THREDDSMetadata" in attrs["groups"]: - root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] - else: - root = {} - - for name, url in root.items(): - asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) - item.add_asset(name, asset) - - self.item = item - - def to_json(self) -> str: - self.item.model_dump_json() - - def ncattrs_to_geometry(self) -> MutableMapping[str, Any]: - """Create Polygon geometry from CFMetadata.""" - attrs = self.attrs["groups"]["CFMetadata"]["attributes"] - return { - "type": "Polygon", - "coordinates": [ - [ - [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_min"][0]), - ], - [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_max"][0]), - ], - [ - float(attrs["geospatial_lon_max"][0]), - float(attrs["geospatial_lat_max"][0]), - ], - [ - float(attrs["geospatial_lon_max"][0]), - float(attrs["geospatial_lat_min"][0]), - ], - [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_min"][0]), - ], - ] - ], - } - - def ncattrs_to_bbox(self) -> list: - """Create BBOX from CFMetadata.""" - attrs = self.attrs["groups"]["CFMetadata"]["attributes"] - return [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_min"][0]), - float(attrs["geospatial_lon_max"][0]), - float(attrs["geospatial_lat_max"][0]), - ] - - def dimensions(self) -> dict: - """Return Dimension objects required for Datacube extension.""" - - dims = {} - for name, length in self.attrs["dimensions"].items(): - v = self.attrs["variables"].get(name) - if v: - bbox = self.obj.ncattrs_to_bbox() - for key, criteria in coordinate_criteria.items(): - for criterion, expected in criteria.items(): - if v["attributes"].get(criterion, None) in expected: - axis = self.axis[key] - type_ = DimensionType.SPATIAL if axis in ["x", "y", "z"] else DimensionType.TEMPORAL - - if v["type"] == "int": - extent = [0, int(length)] - else: # Not clear the logic is sound - if key == "X": - extent = bbox[0], bbox[2] - elif key == "Y": - extent = bbox[1], bbox[3] - else: - extent = None - - dims[name] = Dimension( - properties=dict( - axis=axis, - type=type_, - extent=extent, - description=v.get("description", v.get("long_name", criteria["standard_name"])), - ) - ) - - return dims - - def variables(self) -> dict: - """Return Variable objects required for Datacube extension.""" - variables = {} - - for name, meta in self.attrs["variables"].items(): - if name in self.attrs["dimensions"]: - continue - - attrs = meta["attributes"] - variables[name] = Variable( - properties=dict( - dimensions=meta["shape"], - type=VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, - description=attrs.get("description", attrs.get("long_name")), - unit=attrs.get("units", None), - ) - ) - return variables - - def is_coordinate(self, attrs: dict) -> bool: - """Return whether variable is a coordinate.""" - for key, criteria in coordinate_criteria.items(): - for criterion, expected in criteria.items(): - if attrs.get(criterion, None) in expected: - return True - return False - - -# From CF-Xarray -coordinate_criteria = { - "latitude": { - "standard_name": ("latitude",), - "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), - "_CoordinateAxisType": ("Lat",), - "long_name": ("latitude",), - }, - "longitude": { - "standard_name": ("longitude",), - "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), - "_CoordinateAxisType": ("Lon",), - "long_name": ("longitude",), - }, - "Z": { - "standard_name": ( - "model_level_number", - "atmosphere_ln_pressure_coordinate", - "atmosphere_sigma_coordinate", - "atmosphere_hybrid_sigma_pressure_coordinate", - "atmosphere_hybrid_height_coordinate", - "atmosphere_sleve_coordinate", - "ocean_sigma_coordinate", - "ocean_s_coordinate", - "ocean_s_coordinate_g1", - "ocean_s_coordinate_g2", - "ocean_sigma_z_coordinate", - "ocean_double_sigma_coordinate", - ), - "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), - "axis": ("Z",), - "cartesian_axis": ("Z",), - "grads_dim": ("z",), - "long_name": ( - "model_level_number", - "atmosphere_ln_pressure_coordinate", - "atmosphere_sigma_coordinate", - "atmosphere_hybrid_sigma_pressure_coordinate", - "atmosphere_hybrid_height_coordinate", - "atmosphere_sleve_coordinate", - "ocean_sigma_coordinate", - "ocean_s_coordinate", - "ocean_s_coordinate_g1", - "ocean_s_coordinate_g2", - "ocean_sigma_z_coordinate", - "ocean_double_sigma_coordinate", - ), - }, - "vertical": { - "standard_name": ( - "air_pressure", - "height", - "depth", - "geopotential_height", - "altitude", - "height_above_geopotential_datum", - "height_above_reference_ellipsoid", - "height_above_mean_sea_level", - ), - "positive": ("up", "down"), - "long_name": ( - "air_pressure", - "height", - "depth", - "geopotential_height", - "altitude", - "height_above_geopotential_datum", - "height_above_reference_ellipsoid", - "height_above_mean_sea_level", - ), - }, - "X": { - "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), - "_CoordinateAxisType": ("GeoX",), - "axis": ("X",), - "cartesian_axis": ("X",), - "grads_dim": ("x",), - "long_name": ( - "projection_x_coordinate", - "grid_longitude", - "projection_x_angular_coordinate", - "cell index along first dimension", - ), - }, - "Y": { - "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), - "_CoordinateAxisType": ("GeoY",), - "axis": ("Y",), - "cartesian_axis": ("Y",), - "grads_dim": ("y",), - "long_name": ( - "projection_y_coordinate", - "grid_latitude", - "projection_y_angular_coordinate", - "cell index along second dimension", - ), - }, - "T": { - "standard_name": ("time",), - "_CoordinateAxisType": ("Time",), - "axis": ("T",), - "cartesian_axis": ("T",), - "grads_dim": ("t",), - "long_name": ("time",), - }, - "time": { - "standard_name": ("time",), - "_CoordinateAxisType": ("Time",), - "axis": ("T",), - "cartesian_axis": ("T",), - "grads_dim": ("t",), - "long_name": ("time",), - }, -} - - media_types = { "httpserver_service": "application/x-netcdf", "opendap_service": pystac.MediaType.HTML, diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index bfb5d53..93d6072 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,21 +1,17 @@ import argparse -import hashlib import logging from datetime import datetime from typing import Any, Dict, List, Literal, MutableMapping import pyessv from colorlog import ColoredFormatter +from extensions import DataCubeHelper from pydantic import AnyHttpUrl, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader from STACpopulator.models import STACItemProperties -from STACpopulator.stac_utils import ( - CFJsonItem, - STAC_item_from_metadata, - collection2literal, -) +from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -150,11 +146,12 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) item = STAC_item_from_metadata(iid, item_data, self.item_properties_model) # Add datacube extension - # try: - # dc_ext = DatacubeExtension.ext(obj.item, add_if_missing=True) - # dc_ext.apply(dimensions=obj.dimensions(), variables=obj.variables()) - # except: - # LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") + try: + dchelper = DataCubeHelper(item_data) + dc_ext = DatacubeExtension.ext(item, add_if_missing=True) + dc_ext.apply(dimensions=dchelper.dimensions(), variables=dchelper.variables()) + except: + LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") # print(obj.item.to_dict()) # return obj.item.to_dict() diff --git a/implementations/CMIP6-UofT/extensions.py b/implementations/CMIP6-UofT/extensions.py new file mode 100644 index 0000000..e09f9b2 --- /dev/null +++ b/implementations/CMIP6-UofT/extensions.py @@ -0,0 +1,201 @@ +import pystac +from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType + + +class DataCubeHelper: + """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" + + axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} + + def __init__(self, attrs: dict): + """ + Create STAC Item from CF JSON metadata. + + Parameters + ---------- + iid : str + Unique item ID. + attrs: dict + CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. + datamodel : pydantic.BaseModel, optional + Data model for validating global attributes. + """ + self.attrs = attrs + + def dimensions(self) -> dict: + """Return Dimension objects required for Datacube extension.""" + + dims = {} + for name, length in self.attrs["dimensions"].items(): + v = self.attrs["variables"].get(name) + if v: + bbox = self.obj.ncattrs_to_bbox() + for key, criteria in coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if v["attributes"].get(criterion, None) in expected: + axis = self.axis[key] + type_ = DimensionType.SPATIAL if axis in ["x", "y", "z"] else DimensionType.TEMPORAL + + if v["type"] == "int": + extent = [0, int(length)] + else: # Not clear the logic is sound + if key == "X": + extent = bbox[0], bbox[2] + elif key == "Y": + extent = bbox[1], bbox[3] + else: + extent = None + + dims[name] = Dimension( + properties=dict( + axis=axis, + type=type_, + extent=extent, + description=v.get("description", v.get("long_name", criteria["standard_name"])), + ) + ) + + return dims + + def variables(self) -> dict: + """Return Variable objects required for Datacube extension.""" + variables = {} + + for name, meta in self.attrs["variables"].items(): + if name in self.attrs["dimensions"]: + continue + + attrs = meta["attributes"] + variables[name] = Variable( + properties=dict( + dimensions=meta["shape"], + type=VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, + description=attrs.get("description", attrs.get("long_name")), + unit=attrs.get("units", None), + ) + ) + return variables + + def is_coordinate(self, attrs: dict) -> bool: + """Return whether variable is a coordinate.""" + for key, criteria in coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if attrs.get(criterion, None) in expected: + return True + return False + + +# From CF-Xarray +coordinate_criteria = { + "latitude": { + "standard_name": ("latitude",), + "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), + "_CoordinateAxisType": ("Lat",), + "long_name": ("latitude",), + }, + "longitude": { + "standard_name": ("longitude",), + "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), + "_CoordinateAxisType": ("Lon",), + "long_name": ("longitude",), + }, + "Z": { + "standard_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), + "axis": ("Z",), + "cartesian_axis": ("Z",), + "grads_dim": ("z",), + "long_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + }, + "vertical": { + "standard_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + "positive": ("up", "down"), + "long_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + }, + "X": { + "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), + "_CoordinateAxisType": ("GeoX",), + "axis": ("X",), + "cartesian_axis": ("X",), + "grads_dim": ("x",), + "long_name": ( + "projection_x_coordinate", + "grid_longitude", + "projection_x_angular_coordinate", + "cell index along first dimension", + ), + }, + "Y": { + "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), + "_CoordinateAxisType": ("GeoY",), + "axis": ("Y",), + "cartesian_axis": ("Y",), + "grads_dim": ("y",), + "long_name": ( + "projection_y_coordinate", + "grid_latitude", + "projection_y_angular_coordinate", + "cell index along second dimension", + ), + }, + "T": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + "time": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, +} From 3c584cc424c5276c8cad6f28487525785e8de19e Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 11:03:49 -0400 Subject: [PATCH 08/36] updating geometry structure --- STACpopulator/models.py | 29 +++++++++++++++++++++---- STACpopulator/stac_utils.py | 10 +++++---- implementations/CMIP6-UofT/add_CMIP6.py | 5 +++-- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/STACpopulator/models.py b/STACpopulator/models.py index 625efc2..3e93802 100644 --- a/STACpopulator/models.py +++ b/STACpopulator/models.py @@ -1,6 +1,7 @@ import datetime as dt -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union +from annotated_types import Ge from pydantic import ( AnyHttpUrl, AnyUrl, @@ -9,14 +10,34 @@ SerializeAsAny, field_validator, ) -from typing_extensions import TypedDict +from xarray import Coordinates -class Geometry(TypedDict): +class Geometry(BaseModel): type: str + coordinates: List + + +class GeoJSONPoint(Geometry): + type: Literal["Point"] + coordinates: List[float] + + +class GeoJSONMultiPoint(Geometry): + type: Literal["MultiPoint"] + coordinates: List[List[float]] + + +class GeoJSONPolygon(Geometry): + type: Literal["Polygon"] coordinates: List[List[List[float]]] +class GeoJSONMultiPolygon(Geometry): + type: Literal["MultiPolygon"] + coordinates: List[List[List[List[float]]]] + + class Asset(BaseModel): href: AnyHttpUrl media_type: Optional[str] = None @@ -78,7 +99,7 @@ class STACItem(BaseModel): """STAC Item data model.""" id: str = Field(..., alias="id", min_length=1) - geometry: Optional[Geometry] = None + geometry: Optional[SerializeAsAny[Geometry]] = None bbox: Optional[List[float]] = None properties: Optional[SerializeAsAny[STACItemProperties]] = None assets: Dict[str, Asset] = None diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 7cf3ed9..9f0198b 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -79,7 +79,7 @@ def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: ] -def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel): +def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel, item_geometry_model): """ Create STAC Item from CF JSON metadata. @@ -89,8 +89,10 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop Unique item ID. attrs: dict CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. - datamodel : pydantic.BaseModel, optional - Data model for validating global attributes. + item_props_datamodel : pydantic.BaseModel + Data model describing the properties of the STAC item. + item_geometry_model : pydantic.BaseModel + Data model describing the geometry of the STAC item. """ cfmeta = attrs["groups"]["CFMetadata"]["attributes"] @@ -98,7 +100,7 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop # Create pydantic STAC item item = STACItem( id=iid, - geometry=ncattrs_to_geometry(attrs), + geometry=item_geometry_model(**ncattrs_to_geometry(attrs)), bbox=ncattrs_to_bbox(attrs), properties=item_props_datamodel( start_datetime=cfmeta["time_coverage_start"], diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 93d6072..fee24ce 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -10,7 +10,7 @@ from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader -from STACpopulator.models import STACItemProperties +from STACpopulator.models import GeoJSONPolygon, STACItemProperties from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal LOGGER = logging.getLogger(__name__) @@ -113,6 +113,7 @@ def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties + item_geometry_model = GeoJSONPolygon def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: str) -> None: """Constructor @@ -143,7 +144,7 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) """ iid = make_cmip6_item_id(item_data["attributes"]) - item = STAC_item_from_metadata(iid, item_data, self.item_properties_model) + item = STAC_item_from_metadata(iid, item_data, self.item_properties_model, self.item_geometry_model) # Add datacube extension try: From b7a7ed94ea846505d541bd01ec5c729f67c5f3ce Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 11:25:06 -0400 Subject: [PATCH 09/36] moving np datatype conversion to a separate function --- STACpopulator/input.py | 19 +++---------------- STACpopulator/stac_utils.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 3b2c11c..2e61257 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -2,14 +2,14 @@ from abc import ABC, abstractmethod from typing import Any, Iterator, MutableMapping, Optional, Tuple -import numpy as np import requests import siphon import xncml from colorlog import ColoredFormatter -from numpy import extract from siphon.catalog import TDSCatalog +from STACpopulator.stac_utils import numpy_to_python_datatypes + LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" formatter = ColoredFormatter(LOGFORMAT) @@ -86,20 +86,7 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() - # Converting numpy datatypes to python standard datatypes - for key, value in attrs["attributes"].items(): - if isinstance(value, list): - newlist = [] - for item in value: - if issubclass(type(item), np.integer): - newlist.append(int(item)) - elif issubclass(type(item), np.floating): - newlist.append(float(item)) - else: - newlist.append(item) - attrs["attributes"][key] = newlist - elif isinstance(type(value), np.integer): - attrs["attributes"][key] = int(value) + attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) attrs["access_urls"] = ds.access_urls diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 9f0198b..50871c6 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -2,6 +2,7 @@ import re from typing import Any, Literal, MutableMapping +import numpy as np import pystac from STACpopulator.models import STACItem @@ -79,6 +80,25 @@ def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: ] +def numpy_to_python_datatypes(data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + # Converting numpy datatypes to python standard datatypes + for key, value in data.items(): + if isinstance(value, list): + newlist = [] + for item in value: + if issubclass(type(item), np.integer): + newlist.append(int(item)) + elif issubclass(type(item), np.floating): + newlist.append(float(item)) + else: + newlist.append(item) + data[key] = newlist + elif isinstance(type(value), np.integer): + data[key] = int(value) + + return data + + def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel, item_geometry_model): """ Create STAC Item from CF JSON metadata. From 48598ae49a31ede1e5803ecbbb434962b474e9a2 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 11:26:10 -0400 Subject: [PATCH 10/36] modifications to datacube extension helper functions as per Francis's comments --- implementations/CMIP6-UofT/extensions.py | 244 ++++++++++++----------- 1 file changed, 125 insertions(+), 119 deletions(-) diff --git a/implementations/CMIP6-UofT/extensions.py b/implementations/CMIP6-UofT/extensions.py index e09f9b2..9f77b0f 100644 --- a/implementations/CMIP6-UofT/extensions.py +++ b/implementations/CMIP6-UofT/extensions.py @@ -1,4 +1,5 @@ -import pystac +import functools + from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType @@ -22,6 +23,123 @@ def __init__(self, attrs: dict): """ self.attrs = attrs + # From CF-Xarray + self.coordinate_criteria = { + "latitude": { + "standard_name": ("latitude",), + "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), + "_CoordinateAxisType": ("Lat",), + "long_name": ("latitude",), + }, + "longitude": { + "standard_name": ("longitude",), + "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), + "_CoordinateAxisType": ("Lon",), + "long_name": ("longitude",), + }, + "Z": { + "standard_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), + "axis": ("Z",), + "cartesian_axis": ("Z",), + "grads_dim": ("z",), + "long_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + }, + "vertical": { + "standard_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + "positive": ("up", "down"), + "long_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + }, + "X": { + "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), + "_CoordinateAxisType": ("GeoX",), + "axis": ("X",), + "cartesian_axis": ("X",), + "grads_dim": ("x",), + "long_name": ( + "projection_x_coordinate", + "grid_longitude", + "projection_x_angular_coordinate", + "cell index along first dimension", + ), + }, + "Y": { + "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), + "_CoordinateAxisType": ("GeoY",), + "axis": ("Y",), + "cartesian_axis": ("Y",), + "grads_dim": ("y",), + "long_name": ( + "projection_y_coordinate", + "grid_latitude", + "projection_y_angular_coordinate", + "cell index along second dimension", + ), + }, + "T": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + "time": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + } + + @property + @functools.cache def dimensions(self) -> dict: """Return Dimension objects required for Datacube extension.""" @@ -30,7 +148,7 @@ def dimensions(self) -> dict: v = self.attrs["variables"].get(name) if v: bbox = self.obj.ncattrs_to_bbox() - for key, criteria in coordinate_criteria.items(): + for key, criteria in self.coordinate_criteria.items(): for criterion, expected in criteria.items(): if v["attributes"].get(criterion, None) in expected: axis = self.axis[key] @@ -57,6 +175,8 @@ def dimensions(self) -> dict: return dims + @property + @functools.cache def variables(self) -> dict: """Return Variable objects required for Datacube extension.""" variables = {} @@ -76,126 +196,12 @@ def variables(self) -> dict: ) return variables + @property + @functools.cache def is_coordinate(self, attrs: dict) -> bool: """Return whether variable is a coordinate.""" - for key, criteria in coordinate_criteria.items(): + for key, criteria in self.coordinate_criteria.items(): for criterion, expected in criteria.items(): if attrs.get(criterion, None) in expected: return True return False - - -# From CF-Xarray -coordinate_criteria = { - "latitude": { - "standard_name": ("latitude",), - "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), - "_CoordinateAxisType": ("Lat",), - "long_name": ("latitude",), - }, - "longitude": { - "standard_name": ("longitude",), - "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), - "_CoordinateAxisType": ("Lon",), - "long_name": ("longitude",), - }, - "Z": { - "standard_name": ( - "model_level_number", - "atmosphere_ln_pressure_coordinate", - "atmosphere_sigma_coordinate", - "atmosphere_hybrid_sigma_pressure_coordinate", - "atmosphere_hybrid_height_coordinate", - "atmosphere_sleve_coordinate", - "ocean_sigma_coordinate", - "ocean_s_coordinate", - "ocean_s_coordinate_g1", - "ocean_s_coordinate_g2", - "ocean_sigma_z_coordinate", - "ocean_double_sigma_coordinate", - ), - "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), - "axis": ("Z",), - "cartesian_axis": ("Z",), - "grads_dim": ("z",), - "long_name": ( - "model_level_number", - "atmosphere_ln_pressure_coordinate", - "atmosphere_sigma_coordinate", - "atmosphere_hybrid_sigma_pressure_coordinate", - "atmosphere_hybrid_height_coordinate", - "atmosphere_sleve_coordinate", - "ocean_sigma_coordinate", - "ocean_s_coordinate", - "ocean_s_coordinate_g1", - "ocean_s_coordinate_g2", - "ocean_sigma_z_coordinate", - "ocean_double_sigma_coordinate", - ), - }, - "vertical": { - "standard_name": ( - "air_pressure", - "height", - "depth", - "geopotential_height", - "altitude", - "height_above_geopotential_datum", - "height_above_reference_ellipsoid", - "height_above_mean_sea_level", - ), - "positive": ("up", "down"), - "long_name": ( - "air_pressure", - "height", - "depth", - "geopotential_height", - "altitude", - "height_above_geopotential_datum", - "height_above_reference_ellipsoid", - "height_above_mean_sea_level", - ), - }, - "X": { - "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), - "_CoordinateAxisType": ("GeoX",), - "axis": ("X",), - "cartesian_axis": ("X",), - "grads_dim": ("x",), - "long_name": ( - "projection_x_coordinate", - "grid_longitude", - "projection_x_angular_coordinate", - "cell index along first dimension", - ), - }, - "Y": { - "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), - "_CoordinateAxisType": ("GeoY",), - "axis": ("Y",), - "cartesian_axis": ("Y",), - "grads_dim": ("y",), - "long_name": ( - "projection_y_coordinate", - "grid_latitude", - "projection_y_angular_coordinate", - "cell index along second dimension", - ), - }, - "T": { - "standard_name": ("time",), - "_CoordinateAxisType": ("Time",), - "axis": ("T",), - "cartesian_axis": ("T",), - "grads_dim": ("t",), - "long_name": ("time",), - }, - "time": { - "standard_name": ("time",), - "_CoordinateAxisType": ("Time",), - "axis": ("T",), - "cartesian_axis": ("T",), - "grads_dim": ("t",), - "long_name": ("time",), - }, -} From 94eb521e783d9c67a0e293c391b1ab98fbffc710 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 11:30:43 -0400 Subject: [PATCH 11/36] code cleanup --- STACpopulator/metadata_parsers.py | 61 ------------------------------- STACpopulator/models.py | 2 - STACpopulator/populator_base.py | 1 - 3 files changed, 64 deletions(-) delete mode 100644 STACpopulator/metadata_parsers.py diff --git a/STACpopulator/metadata_parsers.py b/STACpopulator/metadata_parsers.py deleted file mode 100644 index 84636f8..0000000 --- a/STACpopulator/metadata_parsers.py +++ /dev/null @@ -1,61 +0,0 @@ -import lxml.etree -import requests - - -def nc_attrs_from_ncml(url): - """Extract attributes from NcML file. - - Parameters - ---------- - url : str - Link to NcML service of THREDDS server for a dataset. - - Returns - ------- - dict - Global attribute values keyed by facet names, with variable attributes in `__variable__` nested dict, and - additional specialized attributes in `__group__` nested dict. - """ - parser = lxml.etree.XMLParser(encoding="UTF-8") - - ns = {"ncml": "http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2"} - - # Parse XML content - UTF-8 encoded documents need to be read as bytes - xml = requests.get(url).content - doc = lxml.etree.fromstring(xml, parser=parser) - nc = doc.xpath("/ncml:netcdf", namespaces=ns)[0] - - # Extract global attributes - out = _attrib_to_dict(nc.xpath("ncml:attribute", namespaces=ns)) - - # Extract group attributes - gr = {} - for group in nc.xpath("ncml:group", namespaces=ns): - gr[group.attrib["name"]] = _attrib_to_dict(group.xpath("ncml:attribute", namespaces=ns)) - - # Extract variable attributes - va = {} - for variable in nc.xpath("ncml:variable", namespaces=ns): - if "_CoordinateAxisType" in variable.xpath("ncml:attribute/@name", namespaces=ns): - continue - va[variable.attrib["name"]] = _attrib_to_dict(variable.xpath("ncml:attribute", namespaces=ns)) - - out["__group__"] = gr - out["__variable__"] = va - - return out - - -def _attrib_to_dict(elems): - """Convert element attributes to dictionary. - - Ignore attributes with names starting with _ - """ - hidden_prefix = "_" - out = {} - for e in elems: - a = e.attrib - if a["name"].startswith(hidden_prefix): - continue - out[a["name"]] = a["value"] - return out diff --git a/STACpopulator/models.py b/STACpopulator/models.py index 3e93802..f91dab5 100644 --- a/STACpopulator/models.py +++ b/STACpopulator/models.py @@ -1,7 +1,6 @@ import datetime as dt from typing import Any, Dict, List, Literal, Optional, Union -from annotated_types import Ge from pydantic import ( AnyHttpUrl, AnyUrl, @@ -10,7 +9,6 @@ SerializeAsAny, field_validator, ) -from xarray import Coordinates class Geometry(BaseModel): diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 3d8f50c..404d610 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,4 +1,3 @@ -import hashlib import logging from abc import ABC, abstractmethod from datetime import datetime From a64a2265a42726aaa824a2364b09397ea3a83647 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 16:23:04 -0400 Subject: [PATCH 12/36] change how prefix is applied --- implementations/CMIP6-UofT/add_CMIP6.py | 70 ++++++++++++++----------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index fee24ce..1a5dbd8 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -6,7 +6,7 @@ import pyessv from colorlog import ColoredFormatter from extensions import DataCubeHelper -from pydantic import AnyHttpUrl, Field, FieldValidationInfo, field_validator +from pydantic import AnyHttpUrl, ConfigDict, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader @@ -39,40 +39,46 @@ TableID = collection2literal(CV.table_id) +def add_cmip6_prefix(name: str) -> str: + return "cmip6:" + name if "datetime" not in name else name + + class CMIP6ItemProperties(STACItemProperties, validate_assignment=True): """Data model for CMIP6 Controlled Vocabulary.""" - Conventions: str = Field(..., serialization_alias="cmip6:Conventions") - activity_id: ActivityID = Field(..., serialization_alias="cmip6:activity_id") - creation_date: datetime = Field(..., serialization_alias="cmip6:creation_date") - data_specs_version: str = Field(..., serialization_alias="cmip6:data_specs_version") - experiment: str = Field(..., serialization_alias="cmip6:experiment") - experiment_id: ExperimentID = Field(..., serialization_alias="cmip6:experiment_id") - frequency: Frequency = Field(..., serialization_alias="cmip6:frequency") - further_info_url: AnyHttpUrl = Field(..., serialization_alias="cmip6:further_info_url") - grid_label: GridLabel = Field(..., serialization_alias="cmip6:grid_label") - institution: str = Field(..., serialization_alias="cmip6:institution") - institution_id: InstitutionID = Field(..., serialization_alias="cmip6:institution_id") - nominal_resolution: NominalResolution = Field(..., serialization_alias="cmip6:nominal_resolution") - realm: List[Realm] = Field(..., serialization_alias="cmip6:realm") - source: str = Field(..., serialization_alias="cmip6:source") - source_id: SourceID = Field(..., serialization_alias="cmip6:source_id") - source_type: List[SourceType] = Field(..., serialization_alias="cmip6:source_type") - sub_experiment: str | Literal["none"] = Field(..., serialization_alias="cmip6:sub_experiment") - sub_experiment_id: SubExperimentID | Literal["none"] = Field(..., serialization_alias="cmip6:sub_experiment_id") - table_id: TableID = Field(..., serialization_alias="cmip6:table_id") - variable_id: str = Field(..., serialization_alias="cmip6:variable_id") - variant_label: str = Field(..., serialization_alias="cmip6:variant_label") - initialization_index: int = Field(..., serialization_alias="cmip6:initialization_index") - physics_index: int = Field(..., serialization_alias="cmip6:physics_index") - realization_index: int = Field(..., serialization_alias="cmip6:realization_index") - forcing_index: int = Field(..., serialization_alias="cmip6:forcing_index") - tracking_id: str = Field(..., serialization_alias="cmip6:tracking_id") - version: str = Field("", serialization_alias="cmip6:version") - product: str = Field(..., serialization_alias="cmip6:product") - license: str = Field(..., serialization_alias="cmip6:license") - grid: str = Field(..., serialization_alias="cmip6:grid") - mip_era: str = Field(..., serialization_alias="cmip6:mip_era") + Conventions: str + activity_id: ActivityID + creation_date: datetime + data_specs_version: str + experiment: str + experiment_id: ExperimentID + frequency: Frequency + further_info_url: AnyHttpUrl + grid_label: GridLabel + institution: str + institution_id: InstitutionID + nominal_resolution: NominalResolution + realm: List[Realm] + source: str + source_id: SourceID + source_type: List[SourceType] + sub_experiment: str | Literal["none"] + sub_experiment_id: SubExperimentID | Literal["none"] + table_id: TableID + variable_id: str + variant_label: str + initialization_index: int + physics_index: int + realization_index: int + forcing_index: int + tracking_id: str + version: str = Field("") + product: str + license: str + grid: str + mip_era: str + + model_config = ConfigDict(alias_generator=add_cmip6_prefix, populate_by_name=True) @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") @classmethod From f22c1a20b10eb6dd371e33e24271f0e6c0f2d122 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 13 Oct 2023 10:17:56 -0400 Subject: [PATCH 13/36] PR changes --- Makefile | 2 +- .../implementations}/CMIP6-UofT/CMIP6.yml | 0 .../implementations}/CMIP6-UofT/add_CMIP6.py | 2 +- .../implementations}/CMIP6-UofT/extensions.py | 0 .../implementations}/NEX-GDDP-UofT/add_NEX-GDDP.py | 0 STACpopulator/stac_utils.py | 2 +- 6 files changed, 3 insertions(+), 3 deletions(-) rename {implementations => STACpopulator/implementations}/CMIP6-UofT/CMIP6.yml (100%) rename {implementations => STACpopulator/implementations}/CMIP6-UofT/add_CMIP6.py (99%) rename {implementations => STACpopulator/implementations}/CMIP6-UofT/extensions.py (100%) rename {implementations => STACpopulator/implementations}/NEX-GDDP-UofT/add_NEX-GDDP.py (100%) diff --git a/Makefile b/Makefile index e9e1f6f..ca5cf52 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -IMP_DIR = /Users/dchandan/DACCS/Codes/stac-populator/implementations +IMP_DIR = STACpopulator/implementations STAC_HOST = http://localhost:8880/stac testcmip6: diff --git a/implementations/CMIP6-UofT/CMIP6.yml b/STACpopulator/implementations/CMIP6-UofT/CMIP6.yml similarity index 100% rename from implementations/CMIP6-UofT/CMIP6.yml rename to STACpopulator/implementations/CMIP6-UofT/CMIP6.yml diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py similarity index 99% rename from implementations/CMIP6-UofT/add_CMIP6.py rename to STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 1a5dbd8..8a8f171 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -82,7 +82,7 @@ class CMIP6ItemProperties(STACItemProperties, validate_assignment=True): @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") @classmethod - def first_item(cls, v: list, info: FieldValidationInfo): + def only_item(cls, v: list[int], info: FieldValidationInfo): """Pick single item from list.""" assert len(v) == 1, f"{info.field_name} must have one item only." return v[0] diff --git a/implementations/CMIP6-UofT/extensions.py b/STACpopulator/implementations/CMIP6-UofT/extensions.py similarity index 100% rename from implementations/CMIP6-UofT/extensions.py rename to STACpopulator/implementations/CMIP6-UofT/extensions.py diff --git a/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py b/STACpopulator/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py similarity index 100% rename from implementations/NEX-GDDP-UofT/add_NEX-GDDP.py rename to STACpopulator/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 50871c6..cf3a8c2 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -69,7 +69,7 @@ def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, } -def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: +def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list[float]: """Create BBOX from CFMetadata.""" attrs = attrs["groups"]["CFMetadata"]["attributes"] return [ From efd9230823314450e3ed47777cbd5f363b0c7723 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Tue, 17 Oct 2023 12:07:28 -0400 Subject: [PATCH 14/36] fixing output media type and roles output for assets --- STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py | 6 +++--- STACpopulator/stac_utils.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 8a8f171..a137a0b 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,4 +1,5 @@ import argparse +import json import logging from datetime import datetime from typing import Any, Dict, List, Literal, MutableMapping @@ -160,9 +161,8 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) except: LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") - # print(obj.item.to_dict()) - # return obj.item.to_dict() - print(item.to_dict()) + # return json.dumps(item.to_dict()) + print(json.dumps(item.to_dict())) def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index cf3a8c2..76b6c9f 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -142,6 +142,7 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop root = {} for name, url in root.items(): + name = str(name) # converting name from siphon.catalog.CaseInsensitiveStr to str asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) item.add_asset(name, asset) From 3e88591f26cd9ce278d28f8da0e55a13890b0476 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Tue, 17 Oct 2023 17:42:22 -0400 Subject: [PATCH 15/36] adding magpie resource link --- STACpopulator/stac_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 76b6c9f..fe8c650 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -99,6 +99,22 @@ def numpy_to_python_datatypes(data: MutableMapping[str, Any]) -> MutableMapping[ return data +def magpie_resource_link(url: str) -> pystac.Link: + """Creates a link that will be used by Cowbird to create a resource in Magpie + associated with the STAC item. + + :param url: HTTPServer access URL for a STAC item + :type url: str + :return: A PySTAC Link + :rtype: pystac.Link + """ + url_ = url.replace("fileServer", "*") + i = url_.find("*") + title = url_[i + 2 :] + link = pystac.Link(rel="source", title=title, target=url, media_type="application/x-netcdf") + return link + + def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel, item_geometry_model): """ Create STAC Item from CF JSON metadata. @@ -146,6 +162,9 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) item.add_asset(name, asset) + if root: + item.add_link(magpie_resource_link(root["HTTPServer"])) + return item From 8d66fba8877ecf8c166379b91a16b4bc49039b86 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 18 Oct 2023 13:18:18 -0400 Subject: [PATCH 16/36] adding collection resource link for Magpie --- Makefile | 3 +++ STACpopulator/input.py | 16 ++++++++++++++-- STACpopulator/populator_base.py | 3 +++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ca5cf52..588e160 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,9 @@ STAC_HOST = http://localhost:8880/stac testcmip6: python $(IMP_DIR)/CMIP6-UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html $(IMP_DIR)/CMIP6-UofT/CMIP6.yml +delcmip6: + curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6' + @echo "" starthost: docker compose up diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 2e61257..54f0d68 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from typing import Any, Iterator, MutableMapping, Optional, Tuple +import pystac import requests import siphon import xncml @@ -22,7 +23,7 @@ class GenericLoader(ABC): def __init__(self) -> None: - pass + self.links = [] @abstractmethod def __iter__(self): @@ -58,6 +59,16 @@ def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> Non self.thredds_catalog_URL = thredds_catalog_url self.catalog = TDSCatalog(self.thredds_catalog_URL) self.catalog_head = self.catalog + self.links.append(self.magpie_collection_link()) + + def magpie_collection_link(self): + """Return Link to THREDDS catalog.""" + url = self.thredds_catalog_URL + parts = url.split("/") + i = parts.index("catalog") + service = parts[i - 1] + path = "/".join(parts[i + 1 : -1]) + return pystac.Link(rel="source", target=url, media_type="text/xml", title=f"{service}:{path}") def reset(self): """Reset the generator.""" @@ -81,7 +92,8 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An url = ds.access_urls["NCML"] LOGGER.info("Requesting NcML dataset description") - r = requests.get(url) + # r = requests.get(url) + r = requests.get(url, params={"catalog": self.catalog_head, "dataset": ds}) # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 404d610..07841bc 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -114,6 +114,9 @@ def create_stac_collection(self): self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) collection = pystac.Collection(id=self.collection_id, **self._collection_info) + + collection.add_links(self._ingest_pipeline.links) + post_stac_collection(self.stac_host, collection.to_dict()) def ingest(self) -> None: From 00a968a33e14400f07c7d53f0cf02ff993346b57 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 18 Oct 2023 23:43:10 -0400 Subject: [PATCH 17/36] posting items fixes --- STACpopulator/api_requests.py | 6 +++--- STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py | 4 ++-- STACpopulator/populator_base.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/STACpopulator/api_requests.py b/STACpopulator/api_requests.py index fdd9a65..23a7371 100644 --- a/STACpopulator/api_requests.py +++ b/STACpopulator/api_requests.py @@ -2,6 +2,7 @@ import os from typing import Any, Optional from urllib.parse import urljoin + import requests from colorlog import ColoredFormatter @@ -79,17 +80,16 @@ def post_stac_item( """ item_id = json_data["id"] - r = requests.post(urljoin(stac_host, f"collections/{collection_id}/items"), json=json_data) + r = requests.post(os.path.join(stac_host, f"collections/{collection_id}/items"), json=json_data) if r.status_code == 200: LOGGER.info(f"Item {item_name} successfully added") elif r.status_code == 409: if update: LOGGER.info(f"Item {item_id} already exists. Updating.") - r = requests.put(urljoin(stac_host, f"collections/{collection_id}/items/{item_id}"), json=json_data) + r = requests.put(os.path.join(stac_host, f"collections/{collection_id}/items/{item_id}"), json=json_data) r.raise_for_status() else: LOGGER.info(f"Item {item_id} already exists.") else: r.raise_for_status() - diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index a137a0b..3f2f115 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -161,8 +161,8 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) except: LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") - # return json.dumps(item.to_dict()) - print(json.dumps(item.to_dict())) + # print(json.dumps(item.to_dict())) + return json.loads(json.dumps(item.to_dict())) def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 07841bc..beb1541 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -124,7 +124,7 @@ def ingest(self) -> None: for item_name, item_data in self._ingest_pipeline: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) - # post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) + post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) # try: # pass # except Exception: From 2c3b49de9808ba1e842fd5a93231e458e7374767 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 11:05:06 -0400 Subject: [PATCH 18/36] removing function no longer in use --- STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py | 4 ---- STACpopulator/populator_base.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 3f2f115..31495cc 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -164,10 +164,6 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) # print(json.dumps(item.to_dict())) return json.loads(json.dumps(item.to_dict())) - def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: - # Validation is done at the item creating stage, using the Properties class. - return True - if __name__ == "__main__": parser = argparse.ArgumentParser(prog="CMIP6 STAC populator") diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index beb1541..cc84403 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -138,7 +138,3 @@ def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableM @abstractmethod def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: pass - - @abstractmethod - def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: - pass From 6908d5548da395359a9a8f394be6a2ca4566257a Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 11:23:12 -0400 Subject: [PATCH 19/36] implemented updating stac collection and items --- .../implementations/CMIP6-UofT/add_CMIP6.py | 12 ++++-- STACpopulator/populator_base.py | 37 +++++++++---------- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 31495cc..25e3ac5 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -2,7 +2,7 @@ import json import logging from datetime import datetime -from typing import Any, Dict, List, Literal, MutableMapping +from typing import Any, Dict, List, Literal, MutableMapping, Optional import pyessv from colorlog import ColoredFormatter @@ -122,7 +122,9 @@ class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties item_geometry_model = GeoJSONPolygon - def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: str) -> None: + def __init__( + self, stac_host: str, thredds_catalog_url: str, config_filename: str, update: Optional[bool] = False + ) -> None: """Constructor :param stac_host: URL to the STAC API @@ -134,7 +136,8 @@ def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: st """ data_loader = THREDDSLoader(thredds_catalog_url) - super().__init__(stac_host, data_loader, config_filename) + + super().__init__(stac_host, data_loader, config_filename, update) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): pass @@ -170,8 +173,9 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) parser.add_argument("stac_host", type=str, help="STAC API address") parser.add_argument("thredds_catalog_URL", type=str, help="URL to the CMIP6 THREDDS catalog") parser.add_argument("config_file", type=str, help="Name of the configuration file") + parser.add_argument("--update", action="store_true", help="Update collection and its items") args = parser.parse_args() LOGGER.info(f"Arguments to call: {args}") - c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.config_file) + c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.config_file, args.update) c.ingest() diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index cc84403..a1fdc85 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,7 +1,7 @@ import logging from abc import ABC, abstractmethod from datetime import datetime -from typing import Any, MutableMapping +from typing import Any, MutableMapping, Optional import pystac import yaml @@ -32,6 +32,7 @@ def __init__( stac_host: str, data_loader: GenericLoader, collection_info_filename: str, + update: Optional[bool] = False, ) -> None: """Constructor @@ -56,6 +57,7 @@ def __init__( self._ingest_pipeline = data_loader self._stac_host = self.validate_host(stac_host) + self.update = update # self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() self._collection_id = self.collection_name @@ -96,35 +98,32 @@ def create_stac_collection(self): Returns the collection. """ - if stac_collection_exists(self.stac_host, self.collection_id): - LOGGER.info(f"Collection '{self.collection_name}' already exists") - else: - LOGGER.info(f"Creating collection '{self.collection_name}'") - sp_extent = pystac.SpatialExtent([self._collection_info.pop("spatialextent")]) - tmp = self._collection_info.pop("temporalextent") - tmp_extent = pystac.TemporalExtent( + LOGGER.info(f"Creating collection '{self.collection_name}'") + sp_extent = pystac.SpatialExtent([self._collection_info.pop("spatialextent")]) + tmp = self._collection_info.pop("temporalextent") + tmp_extent = pystac.TemporalExtent( + [ [ - [ - datetime.strptime(tmp[0], "%Y-%m-%d") if tmp[0] is not None else None, - datetime.strptime(tmp[1], "%Y-%m-%d") if tmp[1] is not None else None, - ] + datetime.strptime(tmp[0], "%Y-%m-%d") if tmp[0] is not None else None, + datetime.strptime(tmp[1], "%Y-%m-%d") if tmp[1] is not None else None, ] - ) - self._collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) - self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) + ] + ) + self._collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) + self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) - collection = pystac.Collection(id=self.collection_id, **self._collection_info) + collection = pystac.Collection(id=self.collection_id, **self._collection_info) - collection.add_links(self._ingest_pipeline.links) + collection.add_links(self._ingest_pipeline.links) - post_stac_collection(self.stac_host, collection.to_dict()) + post_stac_collection(self.stac_host, collection.to_dict(), self.update) def ingest(self) -> None: LOGGER.info("Data ingestion") for item_name, item_data in self._ingest_pipeline: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) - post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) + post_stac_item(self.stac_host, self.collection_id, item_name, stac_item, self.update) # try: # pass # except Exception: From 0c959ea6c20718217ff3e0a1f16f29548c38272a Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 13:27:55 -0400 Subject: [PATCH 20/36] removing need to pass yml file to app on command line --- Makefile | 2 +- .../implementations/CMIP6-UofT/add_CMIP6.py | 13 ++++--------- .../CMIP6-UofT/{CMIP6.yml => collection_config.yml} | 0 STACpopulator/populator_base.py | 13 +++++++++---- 4 files changed, 14 insertions(+), 14 deletions(-) rename STACpopulator/implementations/CMIP6-UofT/{CMIP6.yml => collection_config.yml} (100%) diff --git a/Makefile b/Makefile index 588e160..914e513 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ IMP_DIR = STACpopulator/implementations STAC_HOST = http://localhost:8880/stac testcmip6: - python $(IMP_DIR)/CMIP6-UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html $(IMP_DIR)/CMIP6-UofT/CMIP6.yml + python $(IMP_DIR)/CMIP6-UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html delcmip6: curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6' diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 25e3ac5..532cfb4 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -122,22 +122,17 @@ class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties item_geometry_model = GeoJSONPolygon - def __init__( - self, stac_host: str, thredds_catalog_url: str, config_filename: str, update: Optional[bool] = False - ) -> None: + def __init__(self, stac_host: str, thredds_catalog_url: str, update: Optional[bool] = False) -> None: """Constructor :param stac_host: URL to the STAC API :type stac_host: str :param thredds_catalog_url: the URL to the THREDDS catalog to ingest :type thredds_catalog_url: str - :param config_filename: Yaml file containing the information about the collection to populate - :type config_filename: str """ - data_loader = THREDDSLoader(thredds_catalog_url) - super().__init__(stac_host, data_loader, config_filename, update) + super().__init__(stac_host, data_loader, update) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): pass @@ -172,10 +167,10 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) parser = argparse.ArgumentParser(prog="CMIP6 STAC populator") parser.add_argument("stac_host", type=str, help="STAC API address") parser.add_argument("thredds_catalog_URL", type=str, help="URL to the CMIP6 THREDDS catalog") - parser.add_argument("config_file", type=str, help="Name of the configuration file") parser.add_argument("--update", action="store_true", help="Update collection and its items") args = parser.parse_args() + LOGGER.info(f"Arguments to call: {args}") - c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.config_file, args.update) + c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.update) c.ingest() diff --git a/STACpopulator/implementations/CMIP6-UofT/CMIP6.yml b/STACpopulator/implementations/CMIP6-UofT/collection_config.yml similarity index 100% rename from STACpopulator/implementations/CMIP6-UofT/CMIP6.yml rename to STACpopulator/implementations/CMIP6-UofT/collection_config.yml diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index a1fdc85..38ef8be 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,4 +1,6 @@ import logging +import os +import sys from abc import ABC, abstractmethod from datetime import datetime from typing import Any, MutableMapping, Optional @@ -31,7 +33,6 @@ def __init__( self, stac_host: str, data_loader: GenericLoader, - collection_info_filename: str, update: Optional[bool] = False, ) -> None: """Constructor @@ -40,13 +41,17 @@ def __init__( :type stac_host: str :param data_loader: A concrete implementation of the GenericLoader abstract base class :type data_loader: GenericLoader - :param collection_info_filename: Yaml file containing the information about the collection to populate - :type collection_info_filename: str :raises RuntimeError: Raised if one of the required definitions is not found in the collection info filename """ super().__init__() - with open(collection_info_filename) as f: + self._collection_info_filename = "collection_config.yml" + self._app_directory = os.path.dirname(sys.argv[0]) + + if not os.path.exists(os.path.join(self._app_directory, self._collection_info_filename)): + raise RuntimeError(f"Missing {self._collection_info_filename} file for this implementation") + + with open(os.path.join(self._app_directory, self._collection_info_filename)) as f: self._collection_info = yaml.load(f, yaml.Loader) req_definitions = ["title", "description", "keywords", "license"] From 73b277337cc820e0333297e4d9be2442e503b8c3 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 13:29:34 -0400 Subject: [PATCH 21/36] code cleanup --- STACpopulator/api_requests.py | 1 - STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py | 2 +- STACpopulator/populator_base.py | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/STACpopulator/api_requests.py b/STACpopulator/api_requests.py index 23a7371..35b0dc2 100644 --- a/STACpopulator/api_requests.py +++ b/STACpopulator/api_requests.py @@ -1,7 +1,6 @@ import logging import os from typing import Any, Optional -from urllib.parse import urljoin import requests from colorlog import ColoredFormatter diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 532cfb4..207add8 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -2,7 +2,7 @@ import json import logging from datetime import datetime -from typing import Any, Dict, List, Literal, MutableMapping, Optional +from typing import Any, List, Literal, MutableMapping, Optional import pyessv from colorlog import ColoredFormatter diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 38ef8be..4e75cb1 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -12,7 +12,6 @@ from STACpopulator.api_requests import ( post_stac_collection, post_stac_item, - stac_collection_exists, stac_host_reachable, ) from STACpopulator.input import GenericLoader @@ -64,7 +63,6 @@ def __init__( self._stac_host = self.validate_host(stac_host) self.update = update - # self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() self._collection_id = self.collection_name LOGGER.info("Initialization complete") LOGGER.info(f"Collection {self.collection_name} is assigned id {self._collection_id}") From 9e919c25df954306cedcfa813bc65e1d4ee2c06b Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 14:01:56 -0400 Subject: [PATCH 22/36] adding __init__ files --- STACpopulator/implementations/CMIP6-UofT/__init__.py | 0 STACpopulator/implementations/NEX-GDDP-UofT/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 STACpopulator/implementations/CMIP6-UofT/__init__.py create mode 100644 STACpopulator/implementations/NEX-GDDP-UofT/__init__.py diff --git a/STACpopulator/implementations/CMIP6-UofT/__init__.py b/STACpopulator/implementations/CMIP6-UofT/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/STACpopulator/implementations/NEX-GDDP-UofT/__init__.py b/STACpopulator/implementations/NEX-GDDP-UofT/__init__.py new file mode 100644 index 0000000..e69de29 From c62fb801439b2d4900550e157df431984122654e Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 14:07:47 -0400 Subject: [PATCH 23/36] fix --- STACpopulator/implementations/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 STACpopulator/implementations/__init__.py diff --git a/STACpopulator/implementations/__init__.py b/STACpopulator/implementations/__init__.py new file mode 100644 index 0000000..e69de29 From 10db1281b46298828727293159e7a5a52e71cb89 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 16:20:58 -0400 Subject: [PATCH 24/36] more fixes --- Makefile | 2 +- .../implementations/{CMIP6-UofT => CMIP6_UofT}/__init__.py | 0 .../implementations/{CMIP6-UofT => CMIP6_UofT}/add_CMIP6.py | 2 +- .../{CMIP6-UofT => CMIP6_UofT}/collection_config.yml | 0 .../implementations/{CMIP6-UofT => CMIP6_UofT}/extensions.py | 0 .../{NEX-GDDP-UofT => NEX_GDDP_UofT}/__init__.py | 0 .../{NEX-GDDP-UofT => NEX_GDDP_UofT}/add_NEX-GDDP.py | 0 7 files changed, 2 insertions(+), 2 deletions(-) rename STACpopulator/implementations/{CMIP6-UofT => CMIP6_UofT}/__init__.py (100%) rename STACpopulator/implementations/{CMIP6-UofT => CMIP6_UofT}/add_CMIP6.py (98%) rename STACpopulator/implementations/{CMIP6-UofT => CMIP6_UofT}/collection_config.yml (100%) rename STACpopulator/implementations/{CMIP6-UofT => CMIP6_UofT}/extensions.py (100%) rename STACpopulator/implementations/{NEX-GDDP-UofT => NEX_GDDP_UofT}/__init__.py (100%) rename STACpopulator/implementations/{NEX-GDDP-UofT => NEX_GDDP_UofT}/add_NEX-GDDP.py (100%) diff --git a/Makefile b/Makefile index 914e513..b08e1b8 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ IMP_DIR = STACpopulator/implementations STAC_HOST = http://localhost:8880/stac testcmip6: - python $(IMP_DIR)/CMIP6-UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html + python $(IMP_DIR)/CMIP6_UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html delcmip6: curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6' diff --git a/STACpopulator/implementations/CMIP6-UofT/__init__.py b/STACpopulator/implementations/CMIP6_UofT/__init__.py similarity index 100% rename from STACpopulator/implementations/CMIP6-UofT/__init__.py rename to STACpopulator/implementations/CMIP6_UofT/__init__.py diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py similarity index 98% rename from STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py rename to STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 207add8..56bf4e6 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -6,10 +6,10 @@ import pyessv from colorlog import ColoredFormatter -from extensions import DataCubeHelper from pydantic import AnyHttpUrl, ConfigDict, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase +from STACpopulator.implementations.CMIP6_UofT.extensions import DataCubeHelper from STACpopulator.input import THREDDSLoader from STACpopulator.models import GeoJSONPolygon, STACItemProperties from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal diff --git a/STACpopulator/implementations/CMIP6-UofT/collection_config.yml b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml similarity index 100% rename from STACpopulator/implementations/CMIP6-UofT/collection_config.yml rename to STACpopulator/implementations/CMIP6_UofT/collection_config.yml diff --git a/STACpopulator/implementations/CMIP6-UofT/extensions.py b/STACpopulator/implementations/CMIP6_UofT/extensions.py similarity index 100% rename from STACpopulator/implementations/CMIP6-UofT/extensions.py rename to STACpopulator/implementations/CMIP6_UofT/extensions.py diff --git a/STACpopulator/implementations/NEX-GDDP-UofT/__init__.py b/STACpopulator/implementations/NEX_GDDP_UofT/__init__.py similarity index 100% rename from STACpopulator/implementations/NEX-GDDP-UofT/__init__.py rename to STACpopulator/implementations/NEX_GDDP_UofT/__init__.py diff --git a/STACpopulator/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py b/STACpopulator/implementations/NEX_GDDP_UofT/add_NEX-GDDP.py similarity index 100% rename from STACpopulator/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py rename to STACpopulator/implementations/NEX_GDDP_UofT/add_NEX-GDDP.py From 25985dbbf27aaeeae1c53d8f82ac648e6d31a379 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Mon, 23 Oct 2023 12:00:06 -0400 Subject: [PATCH 25/36] diagnostics --- STACpopulator/input.py | 8 ++++++++ STACpopulator/stac_utils.py | 6 ++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 54f0d68..be67ede 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -76,6 +76,8 @@ def reset(self): def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: """Return a generator walking a THREDDS data catalog for datasets.""" + # print(f"At START catalog head is: {self.catalog_head}") + print(self.catalog_head.__dict__) if self.catalog_head.datasets.items(): for item_name, ds in self.catalog_head.datasets.items(): attrs = self.extract_metadata(ds) @@ -84,6 +86,7 @@ def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: if self._depth > 0: for name, ref in self.catalog_head.catalog_refs.items(): self.catalog_head = ref.follow() + print(f"catalog head is: {self.catalog_head}") self._depth -= 1 yield from self @@ -91,6 +94,11 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An # Get URL for NCML service url = ds.access_urls["NCML"] + print(url) + # print(self.catalog_head) + print(f"ds = {ds}") + print(ds.__dict__) + print(self.catalog_head.catalog_url) LOGGER.info("Requesting NcML dataset description") # r = requests.get(url) r = requests.get(url, params={"catalog": self.catalog_head, "dataset": ds}) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index fe8c650..62b795f 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -151,8 +151,10 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop # Add assets if "access_urls" in attrs: + print("access_urls") root = attrs["access_urls"] elif "THREDDSMetadata" in attrs["groups"]: + print("THREDDSMetadata") root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] else: root = {} @@ -162,8 +164,8 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) item.add_asset(name, asset) - if root: - item.add_link(magpie_resource_link(root["HTTPServer"])) + # if root: + # item.add_link(magpie_resource_link(root["HTTPServer"])) return item From 6d675bcfe156bd633e665bb514052a245596156f Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Mon, 23 Oct 2023 15:59:35 -0400 Subject: [PATCH 26/36] removing unused code --- tests/test_client.py | 5 ----- tests/test_cmip6_extension.py | 20 -------------------- 2 files changed, 25 deletions(-) delete mode 100644 tests/test_client.py delete mode 100644 tests/test_cmip6_extension.py diff --git a/tests/test_client.py b/tests/test_client.py deleted file mode 100644 index b35f9ac..0000000 --- a/tests/test_client.py +++ /dev/null @@ -1,5 +0,0 @@ -from pystac_client import Client - -def test_cmip6(): - """Assume some CMIP6 has been ingested.""" - c = Client.open("http://localhost:8880/stac") diff --git a/tests/test_cmip6_extension.py b/tests/test_cmip6_extension.py deleted file mode 100644 index f899a33..0000000 --- a/tests/test_cmip6_extension.py +++ /dev/null @@ -1,20 +0,0 @@ -from STACpopulator.extensions import cmip6 -from STACpopulator.stac_utils import CFJsonItem -import xncml -from pathlib import Path -from pystac import Item, validation - -TEST_DATA = Path(__file__).parent / "data" - -def test_extension(): - ds = xncml.Dataset(TEST_DATA / "o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml") - attrs = ds.to_cf_dict() - - item = CFJsonItem("test", attrs).item - validation.validate(item) - - ext = cmip6.CMIP6Extension.ext(item, add_if_missing=True) - ext.apply(attrs["attributes"]) - assert "cmip6:realm" in item.properties - - From 65bd5bbaac751e535c08fb1441cdef1087e85db2 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Mon, 23 Oct 2023 17:43:44 -0400 Subject: [PATCH 27/36] refactoring to allow more flexibility --- Makefile | 2 +- .../implementations/CMIP6_UofT/add_CMIP6.py | 20 +-- .../CMIP6_UofT/collection_config.yml | 1 + STACpopulator/input.py | 60 ++++----- STACpopulator/populator_base.py | 53 +++----- STACpopulator/stac_utils.py | 79 +++++++---- tests/ref.txt | 124 ++++++++++++++++++ tests/test_standalone_stac_item.py | 30 +++++ 8 files changed, 269 insertions(+), 100 deletions(-) create mode 100644 tests/ref.txt create mode 100644 tests/test_standalone_stac_item.py diff --git a/Makefile b/Makefile index b08e1b8..439f93e 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ testcmip6: python $(IMP_DIR)/CMIP6_UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html delcmip6: - curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6' + curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6_UofT' @echo "" starthost: diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 56bf4e6..fc39baf 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -10,7 +10,7 @@ from STACpopulator import STACpopulatorBase from STACpopulator.implementations.CMIP6_UofT.extensions import DataCubeHelper -from STACpopulator.input import THREDDSLoader +from STACpopulator.input import GenericLoader, THREDDSLoader from STACpopulator.models import GeoJSONPolygon, STACItemProperties from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal @@ -122,7 +122,7 @@ class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties item_geometry_model = GeoJSONPolygon - def __init__(self, stac_host: str, thredds_catalog_url: str, update: Optional[bool] = False) -> None: + def __init__(self, stac_host: str, data_loader: GenericLoader, update: Optional[bool] = False) -> None: """Constructor :param stac_host: URL to the STAC API @@ -130,13 +130,8 @@ def __init__(self, stac_host: str, thredds_catalog_url: str, update: Optional[bo :param thredds_catalog_url: the URL to the THREDDS catalog to ingest :type thredds_catalog_url: str """ - data_loader = THREDDSLoader(thredds_catalog_url) - super().__init__(stac_host, data_loader, update) - def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): - pass - def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """Creates the STAC item. @@ -172,5 +167,14 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) args = parser.parse_args() LOGGER.info(f"Arguments to call: {args}") - c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.update) + + mode = "full" + + if mode == "full": + data_loader = THREDDSLoader(args.thredds_catalog_URL) + else: + # To be implemented + data_loader = ErrorLoader(args.error_file) + + c = CMIP6populator(args.stac_host, data_loader, args.update) c.ingest() diff --git a/STACpopulator/implementations/CMIP6_UofT/collection_config.yml b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml index a57875b..0f43c78 100644 --- a/STACpopulator/implementations/CMIP6_UofT/collection_config.yml +++ b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml @@ -1,4 +1,5 @@ title: CMIP6 +id: CMIP6_UofT description: Coupled Model Intercomparison Project phase 6 keywords: ['CMIP', 'CMIP6', 'WCRP', 'Climate Change'] license: "CC-BY-4.0" diff --git a/STACpopulator/input.py b/STACpopulator/input.py index be67ede..272f9ad 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -4,12 +4,11 @@ import pystac import requests -import siphon import xncml from colorlog import ColoredFormatter from siphon.catalog import TDSCatalog -from STACpopulator.stac_utils import numpy_to_python_datatypes +from STACpopulator.stac_utils import numpy_to_python_datatypes, url_validate LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -52,23 +51,41 @@ def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> Non super().__init__() self._depth = depth if depth is not None else 1000 - if thredds_catalog_url.endswith(".html"): - thredds_catalog_url = thredds_catalog_url.replace(".html", ".xml") - LOGGER.info("Converting catalog URL from html to xml") + self.thredds_catalog_URL = self.validate_catalog_url(thredds_catalog_url) - self.thredds_catalog_URL = thredds_catalog_url self.catalog = TDSCatalog(self.thredds_catalog_URL) self.catalog_head = self.catalog self.links.append(self.magpie_collection_link()) - def magpie_collection_link(self): - """Return Link to THREDDS catalog.""" + def validate_catalog_url(self, url: str) -> str: + """Validate the user-provided catalog URL. + + :param url: URL to the THREDDS catalog + :type url: str + :raises RuntimeError: if URL is invalid or contains query parameters. + :return: a valid URL + :rtype: str + """ + if url_validate(url): + if "?" in url: + raise RuntimeError("THREDDS catalog URL should not contain query parameter") + else: + raise RuntimeError("Invalid URL") + + return url.replace(".html", ".xml") if url.endswith(".html") else url + + def magpie_collection_link(self) -> pystac.Link: + """Creates a PySTAC Link for the collection that is used by Cowbird and Magpie. + + :return: A PySTAC Link + :rtype: pystac.Link + """ url = self.thredds_catalog_URL parts = url.split("/") i = parts.index("catalog") - service = parts[i - 1] + # service = parts[i - 1] path = "/".join(parts[i + 1 : -1]) - return pystac.Link(rel="source", target=url, media_type="text/xml", title=f"{service}:{path}") + return pystac.Link(rel="source", target=url, media_type="text/xml", title=path) def reset(self): """Reset the generator.""" @@ -76,40 +93,23 @@ def reset(self): def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: """Return a generator walking a THREDDS data catalog for datasets.""" - # print(f"At START catalog head is: {self.catalog_head}") - print(self.catalog_head.__dict__) if self.catalog_head.datasets.items(): for item_name, ds in self.catalog_head.datasets.items(): - attrs = self.extract_metadata(ds) + attrs = self.extract_metadata(ds.access_urls["NCML"], self.catalog_head.catalog_url, ds.url_path) yield item_name, attrs if self._depth > 0: for name, ref in self.catalog_head.catalog_refs.items(): self.catalog_head = ref.follow() - print(f"catalog head is: {self.catalog_head}") self._depth -= 1 yield from self - def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, Any]: - # Get URL for NCML service - url = ds.access_urls["NCML"] - - print(url) - # print(self.catalog_head) - print(f"ds = {ds}") - print(ds.__dict__) - print(self.catalog_head.catalog_url) + def extract_metadata(self, ncml_url: str, catalog_url: str, dataset_path: str) -> MutableMapping[str, Any]: LOGGER.info("Requesting NcML dataset description") - # r = requests.get(url) - r = requests.get(url, params={"catalog": self.catalog_head, "dataset": ds}) - + r = requests.get(ncml_url, params={"catalog": catalog_url, "dataset": dataset_path}) # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() - attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) - - attrs["access_urls"] = ds.access_urls - return attrs diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 4e75cb1..e6b795d 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,12 +1,9 @@ import logging -import os -import sys from abc import ABC, abstractmethod from datetime import datetime from typing import Any, MutableMapping, Optional import pystac -import yaml from colorlog import ColoredFormatter from STACpopulator.api_requests import ( @@ -15,7 +12,7 @@ stac_host_reachable, ) from STACpopulator.input import GenericLoader -from STACpopulator.stac_utils import url_validate +from STACpopulator.stac_utils import load_collection_configuration, url_validate LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -44,20 +41,7 @@ def __init__( """ super().__init__() - self._collection_info_filename = "collection_config.yml" - self._app_directory = os.path.dirname(sys.argv[0]) - - if not os.path.exists(os.path.join(self._app_directory, self._collection_info_filename)): - raise RuntimeError(f"Missing {self._collection_info_filename} file for this implementation") - - with open(os.path.join(self._app_directory, self._collection_info_filename)) as f: - self._collection_info = yaml.load(f, yaml.Loader) - - req_definitions = ["title", "description", "keywords", "license"] - for req in req_definitions: - if req not in self._collection_info.keys(): - LOGGER.error(f"'{req}' is required in the configuration file") - raise RuntimeError(f"'{req}' is required in the configuration file") + self._collection_info = load_collection_configuration() self._ingest_pipeline = data_loader self._stac_host = self.validate_host(stac_host) @@ -78,7 +62,7 @@ def stac_host(self) -> str: @property def collection_id(self) -> str: - return self._collection_id + return self._collection_info["id"] @property @abstractmethod @@ -87,15 +71,26 @@ def item_properties_model(self): models.STACItemProperties.""" pass + @property + @abstractmethod + def item_geometry_model(self): + """In derived classes, this property should be defined as a pydantic data model that derives from + models.STACItemProperties.""" + pass + + @abstractmethod + def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + pass + def validate_host(self, stac_host: str) -> str: if not url_validate(stac_host): raise ValueError("stac_host URL is not appropriately formatted") if not stac_host_reachable(stac_host): - raise ValueError("stac_host is not reachable") + raise RuntimeError("stac_host is not reachable") return stac_host - def create_stac_collection(self): + def create_stac_collection(self) -> None: """ Create a basic STAC collection. @@ -114,8 +109,7 @@ def create_stac_collection(self): ) self._collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) - - collection = pystac.Collection(id=self.collection_id, **self._collection_info) + collection = pystac.Collection(**self._collection_info) collection.add_links(self._ingest_pipeline.links) @@ -127,16 +121,3 @@ def ingest(self) -> None: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) post_stac_item(self.stac_host, self.collection_id, item_name, stac_item, self.update) - # try: - # pass - # except Exception: - # LOGGER.error(f"Failed adding STAC item {item_name}") - # self.handle_ingestion_error("Posting Error", item_name, item_data) - - @abstractmethod - def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): - pass - - @abstractmethod - def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - pass diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 62b795f..d3786e1 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,12 +1,27 @@ +import datetime import json +import logging +import os import re +import sys from typing import Any, Literal, MutableMapping import numpy as np import pystac +import yaml +from colorlog import ColoredFormatter from STACpopulator.models import STACItem +LOGGER = logging.getLogger(__name__) +LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" +formatter = ColoredFormatter(LOGFORMAT) +stream = logging.StreamHandler() +stream.setFormatter(formatter) +LOGGER.addHandler(stream) +LOGGER.setLevel(logging.INFO) +LOGGER.propagate = False + def url_validate(target: str) -> bool: """Validate whether a supplied URL is reliably written. @@ -32,6 +47,33 @@ def url_validate(target: str) -> bool: return True if re.match(url_regex, target) else False +def load_collection_configuration() -> MutableMapping[str, Any]: + """Reads details of the STAC Collection to be created from a configuration file. the + code expects a "collection_config.yml" file to be present in the app directory. + + :raises RuntimeError: If the configuration file is not present + :raises RuntimeError: If required values are not present in the configuration file + :return: A python dictionary describing the details of the Collection + :rtype: MutableMapping[str, Any] + """ + collection_info_filename = "collection_config.yml" + app_directory = os.path.dirname(sys.argv[0]) + + if not os.path.exists(os.path.join(app_directory, collection_info_filename)): + raise RuntimeError(f"Missing {collection_info_filename} file for this implementation") + + with open(os.path.join(app_directory, collection_info_filename)) as f: + collection_info = yaml.load(f, yaml.Loader) + + req_definitions = ["title", "id", "description", "keywords", "license"] + for req in req_definitions: + if req not in collection_info.keys(): + LOGGER.error(f"'{req}' is required in the configuration file") + raise RuntimeError(f"'{req}' is required in the configuration file") + + return collection_info + + def collection2literal(collection): terms = tuple(term.label for term in collection) return Literal[terms] @@ -149,40 +191,34 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop # Convert pydantic STAC item to a PySTAC Item item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) - # Add assets - if "access_urls" in attrs: - print("access_urls") - root = attrs["access_urls"] - elif "THREDDSMetadata" in attrs["groups"]: - print("THREDDSMetadata") - root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] - else: - root = {} + root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] for name, url in root.items(): name = str(name) # converting name from siphon.catalog.CaseInsensitiveStr to str asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) + + name = asset_name_remaps[name] if name in asset_name_remaps.keys() else name item.add_asset(name, asset) - # if root: - # item.add_link(magpie_resource_link(root["HTTPServer"])) + item.add_link(magpie_resource_link(root["httpserver_service"])) return item +asset_name_remaps = { + "httpserver_service": "HTTPServer", + "opendap_service": "OPENDAP", + "wcs_service": "WCS", + "wms_service": "WMS", + "nccs_service": "NetcdfSubset", +} + media_types = { "httpserver_service": "application/x-netcdf", "opendap_service": pystac.MediaType.HTML, "wcs_service": pystac.MediaType.XML, "wms_service": pystac.MediaType.XML, "nccs_service": "application/x-netcdf", - "HTTPServer": "application/x-netcdf", - "OPENDAP": pystac.MediaType.HTML, - "NCML": pystac.MediaType.XML, - "WCS": pystac.MediaType.XML, - "ISO": pystac.MediaType.XML, - "WMS": pystac.MediaType.XML, - "NetcdfSubset": "application/x-netcdf", } asset_roles = { @@ -191,11 +227,4 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop "wcs_service": ["data"], "wms_service": ["visual"], "nccs_service": ["data"], - "HTTPServer": ["data"], - "OPENDAP": ["data"], - "NCML": ["metadata"], - "WCS": ["data"], - "ISO": ["metadata"], - "WMS": ["visual"], - "NetcdfSubset": ["data"], } diff --git a/tests/ref.txt b/tests/ref.txt new file mode 100644 index 0000000..f3b8c23 --- /dev/null +++ b/tests/ref.txt @@ -0,0 +1,124 @@ +{ + "type": "Feature", + "stac_version": "1.0.0", + "id": "ScenarioMIP_CCCma_CanESM5_ssp245_r13i1p2f1_SImon_siconc_gn", + "properties": { + "start_datetime": "2019-12-06T12:00:00Z", + "end_datetime": "2020-11-04T12:00:00Z", + "datetime": null, + "cmip6:Conventions": "CF-1.7 CMIP-6.2", + "cmip6:activity_id": "ScenarioMIP", + "cmip6:creation_date": "2019-09-25T23:01:33Z", + "cmip6:data_specs_version": "01.00.30", + "cmip6:experiment": "update of RCP4.5 based on SSP2", + "cmip6:experiment_id": "ssp245", + "cmip6:frequency": "mon", + "cmip6:further_info_url": "https://furtherinfo.es-doc.org/CMIP6.CCCma.CanESM5.ssp245.none.r13i1p2f1", + "cmip6:grid_label": "gn", + "cmip6:institution": "Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada", + "cmip6:institution_id": "CCCma", + "cmip6:nominal_resolution": "100 km", + "cmip6:realm": [ + "seaIce" + ], + "cmip6:source": "CanESM5 (2019): \naerosol: interactive\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude/latitude; 49 levels; top level 1 hPa)\natmosChem: specified oxidants for aerosols\nland: CLASS3.6/CTEM1.2\nlandIce: specified ice sheets\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m)\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP prescribed carbonate chemistry\nseaIce: LIM2", + "cmip6:source_id": "CanESM5", + "cmip6:source_type": [ + "AOGCM" + ], + "cmip6:sub_experiment": "none", + "cmip6:sub_experiment_id": "none", + "cmip6:table_id": "SImon", + "cmip6:variable_id": "siconc", + "cmip6:variant_label": "r13i1p2f1", + "cmip6:initialization_index": 1, + "cmip6:physics_index": 2, + "cmip6:realization_index": 13, + "cmip6:forcing_index": 1, + "cmip6:tracking_id": "hdl:21.14100/9e4f804b-c161-44fa-acd1-c2e94e220c95", + "cmip6:version": "v20190429", + "cmip6:product": "model-output", + "cmip6:license": "CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.", + "cmip6:grid": "ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m", + "cmip6:mip_era": "CMIP6" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + 0.049800001084804535, + -78.39350128173828 + ], + [ + 0.049800001084804535, + 89.74176788330078 + ], + [ + 359.99493408203125, + 89.74176788330078 + ], + [ + 359.99493408203125, + -78.39350128173828 + ], + [ + 0.049800001084804535, + -78.39350128173828 + ] + ] + ] + }, + "links": [ + { + "rel": "source", + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "application/x-netcdf", + "title": "birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + } + ], + "assets": { + "HTTPServer": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "application/x-netcdf", + "roles": [ + "data" + ] + }, + "OPENDAP": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "text/html", + "roles": [ + "data" + ] + }, + "WCS": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WCS&version=1.0.0&request=GetCapabilities", + "type": "application/xml", + "roles": [ + "data" + ] + }, + "WMS": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WMS&version=1.3.0&request=GetCapabilities", + "type": "application/xml", + "roles": [ + "visual" + ] + }, + "NetcdfSubset": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc/dataset.html", + "type": "application/x-netcdf", + "roles": [ + "data" + ] + } + }, + "bbox": [ + 0.049800001084804535, + -78.39350128173828, + 359.99493408203125, + 89.74176788330078 + ], + "stac_extensions": [] +} \ No newline at end of file diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py new file mode 100644 index 0000000..f0dc3c8 --- /dev/null +++ b/tests/test_standalone_stac_item.py @@ -0,0 +1,30 @@ +import json + +import requests +import xncml + +from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import ( + CMIP6ItemProperties, + make_cmip6_item_id, +) +from STACpopulator.models import GeoJSONPolygon +from STACpopulator.stac_utils import STAC_item_from_metadata + + +def test_standalone_stac_item(): + url = ( + "https://pavics.ouranos.ca/twitcher/ows/proxy/" + "thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + "?catalog=https%3A%2F%2Fpavics.ouranos.ca%2Ftwitcher%2Fows%2Fproxy%2F" + "thredds%2Fcatalog%2Fbirdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fcatalog.html" + "&dataset=birdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fsic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + ) + + attrs = xncml.Dataset.from_text(requests.get(url).content).to_cf_dict() + stac_item_id = make_cmip6_item_id(attrs["attributes"]) + stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) + + with open("tests/ref.txt", "r") as ff: + reference = json.load(ff) + + assert stac_item.to_dict() == reference From f540dbe1ef7f4e17ad736e743bfb77c184616fd3 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 26 Oct 2023 13:53:22 -0400 Subject: [PATCH 28/36] fix datacube extension --- STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py | 3 ++- STACpopulator/implementations/CMIP6_UofT/extensions.py | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index fc39baf..32f8577 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -7,6 +7,7 @@ import pyessv from colorlog import ColoredFormatter from pydantic import AnyHttpUrl, ConfigDict, Field, FieldValidationInfo, field_validator +from pystac.extensions.datacube import DatacubeExtension from STACpopulator import STACpopulatorBase from STACpopulator.implementations.CMIP6_UofT.extensions import DataCubeHelper @@ -150,7 +151,7 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) try: dchelper = DataCubeHelper(item_data) dc_ext = DatacubeExtension.ext(item, add_if_missing=True) - dc_ext.apply(dimensions=dchelper.dimensions(), variables=dchelper.variables()) + dc_ext.apply(dimensions=dchelper.dimensions, variables=dchelper.variables) except: LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") diff --git a/STACpopulator/implementations/CMIP6_UofT/extensions.py b/STACpopulator/implementations/CMIP6_UofT/extensions.py index 9f77b0f..31450a6 100644 --- a/STACpopulator/implementations/CMIP6_UofT/extensions.py +++ b/STACpopulator/implementations/CMIP6_UofT/extensions.py @@ -2,6 +2,8 @@ from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType +from STACpopulator.stac_utils import ncattrs_to_bbox + class DataCubeHelper: """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" @@ -147,7 +149,7 @@ def dimensions(self) -> dict: for name, length in self.attrs["dimensions"].items(): v = self.attrs["variables"].get(name) if v: - bbox = self.obj.ncattrs_to_bbox() + bbox = ncattrs_to_bbox(self.attrs) for key, criteria in self.coordinate_criteria.items(): for criterion, expected in criteria.items(): if v["attributes"].get(criterion, None) in expected: @@ -196,8 +198,8 @@ def variables(self) -> dict: ) return variables - @property - @functools.cache + # @property + # @functools.cache def is_coordinate(self, attrs: dict) -> bool: """Return whether variable is a coordinate.""" for key, criteria in self.coordinate_criteria.items(): From 323c9453afb78a67461a6d8a2a992a7b02108b98 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 26 Oct 2023 20:12:01 -0400 Subject: [PATCH 29/36] pr changes --- STACpopulator/populator_base.py | 6 +++--- tests/{ref.txt => ref.json} | 0 tests/test_standalone_stac_item.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename tests/{ref.txt => ref.json} (100%) diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index e6b795d..f8ccb1c 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -69,18 +69,18 @@ def collection_id(self) -> str: def item_properties_model(self): """In derived classes, this property should be defined as a pydantic data model that derives from models.STACItemProperties.""" - pass + raise NotImplementedError @property @abstractmethod def item_geometry_model(self): """In derived classes, this property should be defined as a pydantic data model that derives from models.STACItemProperties.""" - pass + raise NotImplementedError @abstractmethod def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - pass + raise NotImplementedError def validate_host(self, stac_host: str) -> str: if not url_validate(stac_host): diff --git a/tests/ref.txt b/tests/ref.json similarity index 100% rename from tests/ref.txt rename to tests/ref.json diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py index f0dc3c8..d7239a8 100644 --- a/tests/test_standalone_stac_item.py +++ b/tests/test_standalone_stac_item.py @@ -24,7 +24,7 @@ def test_standalone_stac_item(): stac_item_id = make_cmip6_item_id(attrs["attributes"]) stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) - with open("tests/ref.txt", "r") as ff: + with open("tests/ref.json", "r") as ff: reference = json.load(ff) assert stac_item.to_dict() == reference From 0581c615c9959bc0d1f898b275d4f8f93c66c15d Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 26 Oct 2023 20:13:50 -0400 Subject: [PATCH 30/36] reverting to old way to read thredds access links --- STACpopulator/input.py | 9 ++++++--- STACpopulator/stac_utils.py | 25 ++++++++++++------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 272f9ad..2522f15 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -4,6 +4,7 @@ import pystac import requests +import siphon import xncml from colorlog import ColoredFormatter from siphon.catalog import TDSCatalog @@ -95,7 +96,7 @@ def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: """Return a generator walking a THREDDS data catalog for datasets.""" if self.catalog_head.datasets.items(): for item_name, ds in self.catalog_head.datasets.items(): - attrs = self.extract_metadata(ds.access_urls["NCML"], self.catalog_head.catalog_url, ds.url_path) + attrs = self.extract_metadata(ds) yield item_name, attrs if self._depth > 0: @@ -104,12 +105,14 @@ def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: self._depth -= 1 yield from self - def extract_metadata(self, ncml_url: str, catalog_url: str, dataset_path: str) -> MutableMapping[str, Any]: + def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, Any]: LOGGER.info("Requesting NcML dataset description") - r = requests.get(ncml_url, params={"catalog": catalog_url, "dataset": dataset_path}) + url = ds.access_urls["NCML"] + r = requests.get(url) # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) + attrs["access_urls"] = ds.access_urls return attrs diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index d3786e1..c245ed1 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -191,16 +191,15 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop # Convert pydantic STAC item to a PySTAC Item item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) - root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] + root = attrs["access_urls"] for name, url in root.items(): name = str(name) # converting name from siphon.catalog.CaseInsensitiveStr to str asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) - name = asset_name_remaps[name] if name in asset_name_remaps.keys() else name item.add_asset(name, asset) - item.add_link(magpie_resource_link(root["httpserver_service"])) + item.add_link(magpie_resource_link(root["HTTPServer"])) return item @@ -214,17 +213,17 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop } media_types = { - "httpserver_service": "application/x-netcdf", - "opendap_service": pystac.MediaType.HTML, - "wcs_service": pystac.MediaType.XML, - "wms_service": pystac.MediaType.XML, - "nccs_service": "application/x-netcdf", + "HTTPServer": "application/x-netcdf", + "OPENDAP": pystac.MediaType.HTML, + "WCS": pystac.MediaType.XML, + "WMS": pystac.MediaType.XML, + "NetcdfSubset": "application/x-netcdf", } asset_roles = { - "httpserver_service": ["data"], - "opendap_service": ["data"], - "wcs_service": ["data"], - "wms_service": ["visual"], - "nccs_service": ["data"], + "HTTPServer": ["data"], + "OPENDAP": ["data"], + "WCS": ["data"], + "WMS": ["visual"], + "NetcdfSubset": ["data"], } From 37a26e19d50c8c129f78dce995344b400ca1ee8a Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:11:13 -0500 Subject: [PATCH 31/36] adding ability to get single file from THREDDS loader --- STACpopulator/input.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 2522f15..25750c0 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -105,6 +105,9 @@ def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: self._depth -= 1 yield from self + def __getitem__(self, dataset): + return self.catalog.datasets[dataset] + def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, Any]: LOGGER.info("Requesting NcML dataset description") url = ds.access_urls["NCML"] From e55591dd0b7f7db6cd4ee7256512d5693d282145 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:12:15 -0500 Subject: [PATCH 32/36] making make_cmip6_item_id a staticmethod --- .../implementations/CMIP6_UofT/add_CMIP6.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 32f8577..a4285bf 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -133,6 +133,22 @@ def __init__(self, stac_host: str, data_loader: GenericLoader, update: Optional[ """ super().__init__(stac_host, data_loader, update) + @staticmethod + def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: + """Return a unique ID for CMIP6 data item.""" + keys = [ + "activity_id", + "institution_id", + "source_id", + "experiment_id", + "variant_label", + "table_id", + "variable_id", + "grid_label", + ] + name = "_".join(attrs[k] for k in keys) + return name + def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """Creates the STAC item. @@ -143,7 +159,7 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) :return: _description_ :rtype: MutableMapping[str, Any] """ - iid = make_cmip6_item_id(item_data["attributes"]) + iid = self.make_cmip6_item_id(item_data["attributes"]) item = STAC_item_from_metadata(iid, item_data, self.item_properties_model, self.item_geometry_model) From f1e28db4c22a6ba6082ba11a9acf896cb550accf Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:13:34 -0500 Subject: [PATCH 33/36] wrapping call to make STAC item with a try-exepcet block --- STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index a4285bf..65c1457 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -161,7 +161,12 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) """ iid = self.make_cmip6_item_id(item_data["attributes"]) - item = STAC_item_from_metadata(iid, item_data, self.item_properties_model, self.item_geometry_model) + try: + item = STAC_item_from_metadata(iid, item_data, self.item_properties_model, self.item_geometry_model) + except pydantic_core._pydantic_core.ValidationError: + print(f"ERROR: ValidationError for {iid}") + return -1 + # Add datacube extension try: From 8bb21e11b6d7642cce11c6f3a1f0aeda684716ce Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:15:21 -0500 Subject: [PATCH 34/36] fixing commit e55591dd0b7f7db6cd4ee7256512d5693d282145 --- .../implementations/CMIP6_UofT/add_CMIP6.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 65c1457..3de2435 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -103,22 +103,6 @@ def validate_version(cls, v: str, info: FieldValidationInfo): return v -def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: - """Return a unique ID for CMIP6 data item.""" - keys = [ - "activity_id", - "institution_id", - "source_id", - "experiment_id", - "variant_label", - "table_id", - "variable_id", - "grid_label", - ] - name = "_".join(attrs[k] for k in keys) - return name - - class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties item_geometry_model = GeoJSONPolygon From 3055afc382bcb27a34abd0a63d0428371a84a034 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:16:51 -0500 Subject: [PATCH 35/36] more fixes to previous commits --- STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 3de2435..eedecc9 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -4,6 +4,7 @@ from datetime import datetime from typing import Any, List, Literal, MutableMapping, Optional +import pydantic_core import pyessv from colorlog import ColoredFormatter from pydantic import AnyHttpUrl, ConfigDict, Field, FieldValidationInfo, field_validator @@ -151,6 +152,10 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) print(f"ERROR: ValidationError for {iid}") return -1 + # Add the CMIP6 STAC extension + item.stac_extensions.append( + "https://raw.githubusercontent.com/TomAugspurger/cmip6/main/json-schema/schema.json" + ) # Add datacube extension try: From 3f1d2843fc0a887f5ab74ae06af034d70fda01a6 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:18:02 -0500 Subject: [PATCH 36/36] making tracking_id optional in CMIP6ItemProperties --- STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index eedecc9..6d6fedb 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -74,7 +74,7 @@ class CMIP6ItemProperties(STACItemProperties, validate_assignment=True): physics_index: int realization_index: int forcing_index: int - tracking_id: str + tracking_id: str = "" version: str = Field("") product: str license: str