diff --git a/stacbuilder/builder.py b/stacbuilder/builder.py index 544098b..44b2524 100644 --- a/stacbuilder/builder.py +++ b/stacbuilder/builder.py @@ -146,7 +146,6 @@ def from_config(cls, config: AlternateHrefConfig) -> "AlternateHrefGenerator": class MapMetadataToSTACItem: """Converts AssetMetadata objects to STAC Items. - TODO: class name could be better TODO: find better name for item_assets_configs, maybe asset_definition_configs. """ diff --git a/stacbuilder/collector.py b/stacbuilder/collector.py index 13e0bc3..22eb9bd 100644 --- a/stacbuilder/collector.py +++ b/stacbuilder/collector.py @@ -195,7 +195,6 @@ def from_rasterio_dataset(dataset) -> Tuple[BoundingBox, BoundingBox, List[float class MapGeoTiffToAssetMetadata: """Extracts AssetMetadata from each GeoTIFF file. - TODO: name could be better TODO: VVVV move functionality to GeoTiffMetadataCollector """ diff --git a/stacbuilder/config.py b/stacbuilder/config.py index d83f25e..69b70d5 100644 --- a/stacbuilder/config.py +++ b/stacbuilder/config.py @@ -74,6 +74,7 @@ class EOBandConfig(BaseModel): # TODO: the EO extension calls this common_name, maybe we should use that instead of 'name' # https://github.com/stac-extensions/eo#band-object + # See GH issue: https://github.com/VitoTAP/stac-catalog-builder/issues/29 name: str = Field(description="common_name of the band.") description: str = Field(description="Description of the band.") @@ -249,8 +250,6 @@ class AlternateHrefConfig(BaseModel): class CollectionConfig(BaseModel): """Model, store configuration of a STAC collection""" - # TODO: add nested configuration object for how to group collections - # Currently the default class is GroupMetadataByYear and there are no options to choose a grouping. model_config = ConfigDict(from_attributes=True) collection_id: str diff --git a/stacbuilder/exceptions.py b/stacbuilder/exceptions.py index 1e87a33..6a79fc4 100644 --- a/stacbuilder/exceptions.py +++ b/stacbuilder/exceptions.py @@ -36,3 +36,7 @@ class InvalidConfiguration(STACBuilderException): """ pass + + +class DataValidationError(Exception): + """Raised when one of the validations on our data processing fails.""" diff --git a/stacbuilder/metadata.py b/stacbuilder/metadata.py index 808a314..fb72263 100644 --- a/stacbuilder/metadata.py +++ b/stacbuilder/metadata.py @@ -106,7 +106,6 @@ class AssetMetadata: # for the EO and Raster STAC extensions (eo:bands and raster:bands) # TODO: will probably also need the spatial resolution. - # TODO: more explicit handling of data extracted from path or href, should be a class perhaps. PROPS_FROM_HREFS = [ "item_id", "asset_id", @@ -157,8 +156,7 @@ def __init__( asset_id. """ - # TODO: item_type is currently a misnomer, more like an asset type. - # We use this to find the corresponding asset definition config in the CollectionConfig + # We use asset_type to find the corresponding asset definition config in the CollectionConfig self._asset_type: Optional[str] = None # Temporal extent @@ -364,7 +362,6 @@ def proj_geometry_as_wkt(self) -> Optional[str]: @property def proj_bbox_as_polygon(self) -> Optional[Polygon]: # TODO: [decide] convert this RO property to a method or not? - # TODO: method name could be better if not self._bbox_projected: return None return self._bbox_projected.as_polygon() @@ -629,7 +626,6 @@ def get_differences(self, other) -> Dict[str, Any]: class GeodataframeExporter: """Utility class to export metadata and STAC items as geopandas GeoDataframes. - TODO: find a better name for GeodataframeExporter TODO: This is currently a class with only static methods, perhaps a module would be beter. """ diff --git a/stacbuilder/terracatalog.py b/stacbuilder/terracatalog.py index 1ad2a49..ebabbcc 100644 --- a/stacbuilder/terracatalog.py +++ b/stacbuilder/terracatalog.py @@ -15,7 +15,6 @@ from pprint import pprint from typing import Any, Dict, List, Optional, Tuple - import geopandas as gpd import numpy as np import pandas as pd @@ -27,12 +26,14 @@ from terracatalogueclient.config import CatalogueEnvironment from terracatalogueclient import ProductFile -from stacbuilder.metadata import AssetMetadata from stacbuilder.boundingbox import BoundingBox from stacbuilder.collector import IMetadataCollector from stacbuilder.config import AssetConfig, CollectionConfig, RasterBandConfig, ProviderModel +from stacbuilder.exceptions import DataValidationError +from stacbuilder.metadata import AssetMetadata from stacbuilder.projections import reproject_bounding_box + _logger = logging.getLogger(__name__) @@ -223,7 +224,7 @@ def guess_datatype(self, bit_per_value: int) -> str: To be improved """ # TODO: need to add setting so we know if we should assume it is an int type or a a float type - # While float types with less than 32 bits doe exist, they are not common. + # While float types with less than 32 bits do exist, they are not common. # Not sure if EO ever uses those, but they do exist in other industries # (for example it is commonly used in the EXR image format, but I haven't heard of any other examples) # So for 16 bits and less we could assume it must be an int. @@ -516,19 +517,36 @@ def get_products_as_dataframe(self) -> gpd.GeoDataFrame: self._df_asset_metadata = gdf_asset_md self._save_dataframes() - # TODO: these asserts are some temporary checks. Or do we need to keep them after all? # Verify we have no duplicate products, # i.e. the number of unique product IDs must be == to the number of products. self._log_progress_message("START sanity checks: no duplicate products present and received all products ...") product_ids = set(p.id for p in all_products) - assert len(product_ids) == len(all_products), "The result should not contain duplicate products." - assert len(product_ids) == len(assets_md), "Each products should correspond to exactly 1 AssetMetadata instance" + + if len(product_ids) != len(all_products): + raise DataValidationError( + "Sanity check failed in get_products_as_dataframe:" + + " The result should not contain duplicate products." + + " len(product_ids) != len(all_products)" + + f"{len(product_ids)=} {len(all_products)=}" + ) + + if len(product_ids) != len(assets_md): + raise DataValidationError( + "Sanity check failed in get_products_as_dataframe:" + + " Each products should correspond to exactly 1 AssetMetadata instance." + + " len(product_ids) != len(assets_md)" + + f"{len(product_ids)=} {len(assets_md)=}" + ) # Check that we have processed all products, based on the product count reported by the terracatalogueclient. if not self.max_products: - assert ( - len(product_ids) == num_prods - ), "Number of products in result must be the product count reported by terracataloguiclient" + if len(product_ids) != num_prods: + raise DataValidationError( + "Sanity check failed in get_products_as_dataframe:" + + "Number of products in result must be the product count reported by terracataloguiclient" + + "len(product_ids) != num_prods" + + f"{len(product_ids)=} product count: {num_prods=}" + ) self._log_progress_message("DONE sanity checks") self._log_progress_message("DONE: get_products_as_dataframe") @@ -641,12 +659,6 @@ def create_asset_metadata(self, product: tcc.Product) -> AssetMetadata: raise asset_metadata.proj_epsg = epsg_code or EPSG_4326_LATLON - # if data_links: - # asset_metadata.asset_type = first_link.get("title") - # file_type = first_link.get("type") - # asset_metadata.media_type = AssetMetadata.mime_to_media_type(file_type) - # asset_metadata.file_size = first_link.get("length") - asset_metadata.bbox_lat_lon = BoundingBox.from_list(product.bbox, epsg=EPSG_4326_LATLON) asset_metadata.geometry_lat_lon = product.geometry if epsg_code: