Skip to content

Commit

Permalink
Issue #16 Remove old TODOs and replace asserts with more informative …
Browse files Browse the repository at this point in the history
…excpections
  • Loading branch information
JohanKJSchreurs committed Mar 6, 2024
1 parent b2e863a commit 3ed18af
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 24 deletions.
1 change: 0 additions & 1 deletion stacbuilder/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ def from_config(cls, config: AlternateHrefConfig) -> "AlternateHrefGenerator":
class MapMetadataToSTACItem:
"""Converts AssetMetadata objects to STAC Items.
TODO: class name could be better
TODO: find better name for item_assets_configs, maybe asset_definition_configs.
"""

Expand Down
1 change: 0 additions & 1 deletion stacbuilder/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@ def from_rasterio_dataset(dataset) -> Tuple[BoundingBox, BoundingBox, List[float
class MapGeoTiffToAssetMetadata:
"""Extracts AssetMetadata from each GeoTIFF file.
TODO: name could be better
TODO: VVVV move functionality to GeoTiffMetadataCollector
"""

Expand Down
3 changes: 1 addition & 2 deletions stacbuilder/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class EOBandConfig(BaseModel):

# TODO: the EO extension calls this common_name, maybe we should use that instead of 'name'
# https://github.com/stac-extensions/eo#band-object
# See GH issue: https://github.com/VitoTAP/stac-catalog-builder/issues/29
name: str = Field(description="common_name of the band.")

description: str = Field(description="Description of the band.")
Expand Down Expand Up @@ -249,8 +250,6 @@ class AlternateHrefConfig(BaseModel):
class CollectionConfig(BaseModel):
"""Model, store configuration of a STAC collection"""

# TODO: add nested configuration object for how to group collections
# Currently the default class is GroupMetadataByYear and there are no options to choose a grouping.
model_config = ConfigDict(from_attributes=True)

collection_id: str
Expand Down
4 changes: 4 additions & 0 deletions stacbuilder/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,7 @@ class InvalidConfiguration(STACBuilderException):
"""

pass


class DataValidationError(Exception):
"""Raised when one of the validations on our data processing fails."""
6 changes: 1 addition & 5 deletions stacbuilder/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ class AssetMetadata:
# for the EO and Raster STAC extensions (eo:bands and raster:bands)
# TODO: will probably also need the spatial resolution.

# TODO: more explicit handling of data extracted from path or href, should be a class perhaps.
PROPS_FROM_HREFS = [
"item_id",
"asset_id",
Expand Down Expand Up @@ -157,8 +156,7 @@ def __init__(
asset_id.
"""

# TODO: item_type is currently a misnomer, more like an asset type.
# We use this to find the corresponding asset definition config in the CollectionConfig
# We use asset_type to find the corresponding asset definition config in the CollectionConfig
self._asset_type: Optional[str] = None

# Temporal extent
Expand Down Expand Up @@ -364,7 +362,6 @@ def proj_geometry_as_wkt(self) -> Optional[str]:
@property
def proj_bbox_as_polygon(self) -> Optional[Polygon]:
# TODO: [decide] convert this RO property to a method or not?
# TODO: method name could be better
if not self._bbox_projected:
return None
return self._bbox_projected.as_polygon()
Expand Down Expand Up @@ -629,7 +626,6 @@ def get_differences(self, other) -> Dict[str, Any]:
class GeodataframeExporter:
"""Utility class to export metadata and STAC items as geopandas GeoDataframes.
TODO: find a better name for GeodataframeExporter
TODO: This is currently a class with only static methods, perhaps a module would be beter.
"""

Expand Down
42 changes: 27 additions & 15 deletions stacbuilder/terracatalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from pprint import pprint
from typing import Any, Dict, List, Optional, Tuple


import geopandas as gpd
import numpy as np
import pandas as pd
Expand All @@ -27,12 +26,14 @@
from terracatalogueclient.config import CatalogueEnvironment
from terracatalogueclient import ProductFile

from stacbuilder.metadata import AssetMetadata
from stacbuilder.boundingbox import BoundingBox
from stacbuilder.collector import IMetadataCollector
from stacbuilder.config import AssetConfig, CollectionConfig, RasterBandConfig, ProviderModel
from stacbuilder.exceptions import DataValidationError
from stacbuilder.metadata import AssetMetadata
from stacbuilder.projections import reproject_bounding_box


_logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -223,7 +224,7 @@ def guess_datatype(self, bit_per_value: int) -> str:
To be improved
"""
# TODO: need to add setting so we know if we should assume it is an int type or a a float type
# While float types with less than 32 bits doe exist, they are not common.
# While float types with less than 32 bits do exist, they are not common.
# Not sure if EO ever uses those, but they do exist in other industries
# (for example it is commonly used in the EXR image format, but I haven't heard of any other examples)
# So for 16 bits and less we could assume it must be an int.
Expand Down Expand Up @@ -516,19 +517,36 @@ def get_products_as_dataframe(self) -> gpd.GeoDataFrame:
self._df_asset_metadata = gdf_asset_md
self._save_dataframes()

# TODO: these asserts are some temporary checks. Or do we need to keep them after all?
# Verify we have no duplicate products,
# i.e. the number of unique product IDs must be == to the number of products.
self._log_progress_message("START sanity checks: no duplicate products present and received all products ...")
product_ids = set(p.id for p in all_products)
assert len(product_ids) == len(all_products), "The result should not contain duplicate products."
assert len(product_ids) == len(assets_md), "Each products should correspond to exactly 1 AssetMetadata instance"

if len(product_ids) != len(all_products):
raise DataValidationError(
"Sanity check failed in get_products_as_dataframe:"
+ " The result should not contain duplicate products."
+ " len(product_ids) != len(all_products)"
+ f"{len(product_ids)=} {len(all_products)=}"
)

if len(product_ids) != len(assets_md):
raise DataValidationError(
"Sanity check failed in get_products_as_dataframe:"
+ " Each products should correspond to exactly 1 AssetMetadata instance."
+ " len(product_ids) != len(assets_md)"
+ f"{len(product_ids)=} {len(assets_md)=}"
)

# Check that we have processed all products, based on the product count reported by the terracatalogueclient.
if not self.max_products:
assert (
len(product_ids) == num_prods
), "Number of products in result must be the product count reported by terracataloguiclient"
if len(product_ids) != num_prods:
raise DataValidationError(
"Sanity check failed in get_products_as_dataframe:"
+ "Number of products in result must be the product count reported by terracataloguiclient"
+ "len(product_ids) != num_prods"
+ f"{len(product_ids)=} product count: {num_prods=}"
)
self._log_progress_message("DONE sanity checks")

self._log_progress_message("DONE: get_products_as_dataframe")
Expand Down Expand Up @@ -641,12 +659,6 @@ def create_asset_metadata(self, product: tcc.Product) -> AssetMetadata:
raise
asset_metadata.proj_epsg = epsg_code or EPSG_4326_LATLON

# if data_links:
# asset_metadata.asset_type = first_link.get("title")
# file_type = first_link.get("type")
# asset_metadata.media_type = AssetMetadata.mime_to_media_type(file_type)
# asset_metadata.file_size = first_link.get("length")

asset_metadata.bbox_lat_lon = BoundingBox.from_list(product.bbox, epsg=EPSG_4326_LATLON)
asset_metadata.geometry_lat_lon = product.geometry
if epsg_code:
Expand Down

0 comments on commit 3ed18af

Please sign in to comment.