Skip to content

Commit

Permalink
suggestions from review
Browse files Browse the repository at this point in the history
  • Loading branch information
huard committed Oct 15, 2024
1 parent 8cceeab commit 8b583bb
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 112 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ build

## Logs
*.jsonl
*.json

## Exclude schemas
!schemas/**/*.json

# Old Submodule Path
# Could be used locally
Expand Down
72 changes: 34 additions & 38 deletions STACpopulator/extensions/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,28 @@
"""
# Base classes for STAC extensions
What we have:
- `Loader`, which returns attributes.
- An external json schema describing a subset of the attributes returned by the Loader. This schema might preclude
additional properties, so it cannot be applied wholesale to the Loader's output. (maybe overkill since not a lot of schemas can be found in the wild...)
- `data model` describing the content we want included in the catalog. It includes a subset of the schema properties,
as well as additional attributes desired by the catalog admins.
Desiderata:
- Not having to replicate existing validation logic in the schema
- Not having to create a modified schema
- Being able to supplement the schema validation by pydantic validation logic
- Streamline the creation of new data models (reduce boilerplate, allow subclassing)
- Developer-friendly validation error messages
How-to:
- Instructions to create a basic data model from a schema (codegen)
"""

from datetime import datetime
import json
import jsonschema
Expand Down Expand Up @@ -25,47 +50,27 @@
from STACpopulator.extensions.datacube import DataCubeHelper
from STACpopulator.extensions.thredds import THREDDSExtension, THREDDSHelper



T = TypeVar("T", pystac.Collection, pystac.Item, pystac.Asset, item_assets.AssetDefinition)

LOGGER = logging.getLogger(__name__)
"""
# Context

What we have:
- `Loader`, which returns attributes.
- An external json schema describing a subset of the attributes returned by the Loader. This schema might preclude
additional properties, so it cannot be applied wholesale to the Loader's output. (maybe overkill since not a lot of schemas can be found in the wild...)
- `data model` describing the content we want included in the catalog. It includes a subset of the schema properties,
as well as additional attributes desired by the catalog admins.

Desiderata:
- Not having to replicate existing validation logic in the schema
- Not having to create a modified schema
- Being able to supplement the schema validation by pydantic validation logic
- Streamline the creation of new data models (reduce boilerplate, allow subclassing)
- Developer-friendly validation error messages
How-to:
- Instructions to create a basic data model from a schema (codegen)
"""
class DataModel(BaseModel):
"""Base class for dataset properties going into the catalog.
Subclass this with attributes.
Attributes
----------
_prefix : str
If not None, a prefix for the properties in the catalog will be added.
_schema_uri : str
URI of the json schema to validate against.
_schema_exclude : list[str]
Properties not meant to be validated by json schema, but still included in the data model.
"""
# Ideally, the catalog properties would be described by a jsonschema.
_prefix: str = PrivateAttr()

# URI of the json schema to validate against.
_schema_uri: FilePath = PrivateAttr(None)

# List of properties not meant to be validated by json schema.
_schema_exclude: list[str] = PrivateAttr([])

model_config = ConfigDict(populate_by_name=True, extra="ignore")
Expand Down Expand Up @@ -135,14 +140,6 @@ def uid(self) -> str:
import uuid
return str(uuid.uuid4())

# TODO: Move this into the THREDDS extension?
# @field_validator("access_urls")
# @classmethod
# def validate_access_urls(cls, value):
# assert len(set(["HTTPServer", "OPENDAP"]).intersection(value.keys())) >= 1, (
# "Access URLs must include HTTPServer or OPENDAP keys.")
# return value

def stac_item(self) -> "pystac.Item":
"""Create a STAC item and add extensions."""
item = pystac.Item(
Expand All @@ -167,7 +164,6 @@ def stac_item(self) -> "pystac.Item":

return json.loads(json.dumps(item.to_dict()))


def metadata_extension(self, item):
"""Add extension for the properties of the dataset to the STAC item.
The extension class is created dynamically from the properties.
Expand Down
23 changes: 0 additions & 23 deletions STACpopulator/extensions/cordex6.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,26 +70,3 @@ def uid(self) -> str:



# TODO: Remove before merging
def get_test_data():
    """Fetch test attributes for the first dataset of a hard-coded THREDDS catalog.

    Downloads the NcML description of the first dataset found in the catalog,
    converts it to a CF attribute dict, and attaches the dataset's access URLs.
    Returns the attribute dict, or None if the catalog has no datasets.
    Requires network access.
    """
    import requests
    from siphon.catalog import TDSCatalog
    import xncml
    from STACpopulator.stac_utils import numpy_to_python_datatypes

    cat = TDSCatalog("https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/catalog.html")

    if cat.datasets.items():
        for item_name, ds in cat.datasets.items():
            url = ds.access_urls["NCML"]
            r = requests.get(url)
            attrs = xncml.Dataset.from_text(r.text).to_cf_dict()
            # Convert numpy scalar attribute values to plain Python types.
            attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"])
            attrs["access_urls"] = ds.access_urls
            # Only the first dataset is needed for a test fixture.
            return attrs

def test_item():
    """Smoke test: build a Cordex6 data model from live catalog data and render a STAC item."""
    attrs = get_test_data()
    model = Cordex6DataModel.from_data(attrs)
    model.stac_item()

9 changes: 9 additions & 0 deletions STACpopulator/extensions/thredds.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,12 @@ def links(self) -> list[pystac.Link]:
url = self.access_urls[ServiceType.httpserver]
link = magpie_resource_link(url)
return [link]


# TODO: Validate services links exist ?
# @field_validator("access_urls")
# @classmethod
# def validate_access_urls(cls, value):
# assert len(set(["HTTPServer", "OPENDAP"]).intersection(value.keys())) >= 1, (
# "Access URLs must include HTTPServer or OPENDAP keys.")
# return value
52 changes: 1 addition & 51 deletions STACpopulator/populator_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,55 +226,5 @@ def ingest(self) -> None:

counter += 1
LOGGER.info(f"Processed {counter} data items. {failures} failures")



class THREDDSRunner:
    """CLI driver wiring an argument parser to a THREDDS-based populator class."""

    def __init__(self, populator):
        # `populator` is a populator class (not an instance); it is
        # instantiated in `runner()` with the parsed CLI options.
        self.populator = populator
        self.parser = argparse.ArgumentParser()
        self.add_parser_args(self.parser)

    @staticmethod
    def add_parser_args(parser: argparse.ArgumentParser) -> None:
        """Register this runner's CLI options on `parser`."""
        parser.description = "STAC populator from a THREDDS catalog or NCML XML."
        parser.add_argument("stac_host", help="STAC API URL")
        parser.add_argument("href", help="URL to a THREDDS catalog or a NCML XML with CMIP6 metadata.")
        parser.add_argument("--update", action="store_true", help="Update collection and its items")
        mode_help = "Operation mode, processing the full dataset or only the single reference."
        parser.add_argument("--mode", choices=["full", "single"], default="full", help=mode_help)
        config_help = (
            "Override configuration file for the populator. "
            "By default, uses the adjacent configuration to the implementation class."
        )
        parser.add_argument("--config", type=str, help=config_help)
        add_request_options(parser)

    def runner(self, ns: argparse.Namespace) -> int:
        """Build the data loader and populator from parsed options, then ingest.

        Returns 0 on completion.
        """
        LOGGER.info(f"Arguments to call: {vars(ns)}")

        with Session() as session:
            apply_request_options(session, ns)
            if ns.mode == "full":
                loader = THREDDSLoader(ns.href, session=session)
            else:
                loader = ErrorLoader()  # "single" mode: to be implemented

            instance = self.populator(
                ns.stac_host,
                loader,
                update=ns.update,
                session=session,
                config_file=ns.config,
                log_debug=ns.debug,
            )
            instance.ingest()
            return 0

    def main(self, *args: str) -> int:
        """Parse `args` (or sys.argv when none given) and run the populator."""
        ns = self.parser.parse_args(args or None)
        return self.runner(ns)


24 changes: 24 additions & 0 deletions tests/test_cordex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from STACpopulator.extensions.cordex6 import Cordex6DataModel


def get_test_data(
    catalog_url: str = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/catalog.html",
):
    """Fetch test attributes for the first dataset of a THREDDS catalog.

    Downloads the NcML description of the first dataset in `catalog_url`,
    converts it to a CF attribute dict, and attaches the dataset's access URLs.

    Parameters
    ----------
    catalog_url : str
        THREDDS catalog to read. Defaults to a CORDEX/CMIP6 catalog on PAVICS.

    Returns
    -------
    dict or None
        Attribute dict for the first dataset, or None if the catalog is empty.
        Requires network access.
    """
    import requests
    from siphon.catalog import TDSCatalog
    import xncml
    from STACpopulator.stac_utils import numpy_to_python_datatypes

    cat = TDSCatalog(catalog_url)

    # Iterating directly makes the previous `if cat.datasets.items():` guard
    # redundant: an empty catalog simply skips the loop and returns None.
    for _name, ds in cat.datasets.items():
        r = requests.get(ds.access_urls["NCML"])
        # Fail loudly on HTTP errors instead of parsing an error page as NcML.
        r.raise_for_status()
        attrs = xncml.Dataset.from_text(r.text).to_cf_dict()
        # Convert numpy scalar attribute values to plain Python types.
        attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"])
        attrs["access_urls"] = ds.access_urls
        # Only the first dataset is needed for a test fixture.
        return attrs
    return None

def test_item():
    """Build a Cordex6 data model from live catalog data and check the STAC item.

    Requires network access (via `get_test_data`).
    """
    attrs = get_test_data()
    model = Cordex6DataModel.from_data(attrs)
    item = model.stac_item()
    # stac_item() round-trips through json.dumps/json.loads, so the result is
    # a plain dict holding the serialized STAC item.
    assert isinstance(item, dict)
    assert "id" in item
    assert "properties" in item

0 comments on commit 8b583bb

Please sign in to comment.