Skip to content

Commit

Permalink
fix bugs with oper products and zero-area products
Browse files Browse the repository at this point in the history
  • Loading branch information
favyen2 committed Feb 13, 2025
1 parent 51a2ee1 commit 19c7ac3
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 12 deletions.
35 changes: 23 additions & 12 deletions rslearn/data_sources/gcp_public_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,20 @@ def _read_index(
result = client.query(query_str)

for row in tqdm.tqdm(result, desc=desc):
# Validate product ID has correct number of sections and that it is MSIL1C.
# We must do this before checking source_url because we want to skip the
# products that say OPER instead of MSIL1C.
product_id = row["product_id"]
product_id_parts = product_id.split("_")
if len(product_id_parts) < 7:
continue
product_type = product_id_parts[1]
if product_type != "MSIL1C":
continue
time_str = product_id_parts[2]
tile_id = product_id_parts[5]
assert tile_id[0] == "T"

# Figure out what the product folder is for this entry.
# Some entries have source_url correct and others have base_url correct.
# If base_url is correct, then it seems the source_url always ends in
Expand All @@ -281,23 +295,12 @@ def _read_index(
raise ValueError(
f"Unexpected value '{row['source_url']}' in column 'source_url'"
+ f" and '{row['base_url']} in column 'base_url'"
+ f"for product {row['product_id']}"
)

# Build the blob prefix based on the product ID and granule ID.
# The blob prefix is the prefix to the JP2 image files on GCS.
product_id = row["product_id"]
product_id_parts = product_id.split("_")
if len(product_id_parts) < 7:
continue
product_type = product_id_parts[1]
if product_type != "MSIL1C":
continue
time_str = product_id_parts[2]
tile_id = product_id_parts[5]
assert tile_id[0] == "T"

granule_id = row["granule_id"]

blob_prefix = (
f"{product_folder}/GRANULE/{granule_id}/IMG_DATA/{tile_id}_{time_str}_"
)
Expand Down Expand Up @@ -471,6 +474,14 @@ def _get_item_by_name(self, name: str) -> Sentinel2Item:
if not geometry.shp.is_valid:
geometry.shp = shapely.make_valid(geometry.shp)

# Some products have incorrect geometry in their XML metadata that has zero area.
# For example, S2B_MSIL1C_20190111T193659_N0207_R056_T08MLS_20190111T205033.SAFE.
# So here we check for zero-area geometry and mark the item corrupt if found.
if geometry.shp.area == 0:
raise CorruptItemException(
f"XML for item {name} shows geometry with zero area"
)

return Sentinel2Item(
name=name,
geometry=geometry,
Expand Down
17 changes: 17 additions & 0 deletions tests/integration/data_sources/test_gcp_public_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@
MissingXMLException,
Sentinel2,
)
from rslearn.log_utils import get_logger
from rslearn.tile_stores import DefaultTileStore, TileStoreWithLayer
from rslearn.utils import STGeometry

TEST_BAND = "B04"

logger = get_logger(__name__)


class TestSentinel2:
"""Tests the Sentinel2 data source."""
Expand Down Expand Up @@ -216,3 +219,17 @@ def test_product_with_missing_bands(sentinel2_without_rtree: Sentinel2) -> None:

with pytest.raises(CorruptItemException):
sentinel2_without_rtree.get_item_by_name(item_name)


def test_product_with_incorrect_geometry(sentinel2_without_rtree: Sentinel2) -> None:
    """Verify that a product with zero-area geometry raises CorruptItemException."""
    # This is an example product where the geometry is a MultiLineString (area is 0).
    item_name = "S2B_MSIL1C_20190111T193659_N0207_R056_T08MLS_20190111T205033"

    with pytest.raises(CorruptItemException):
        item = sentinel2_without_rtree.get_item_by_name(item_name)
        # Only reached when no exception was raised: log the geometry that was
        # returned so the subsequent "DID NOT RAISE" failure is easier to debug.
        # Pass the geometry itself as the lazy %-argument; wrapping it in braces
        # would build a set literal rather than format the geometry.
        logger.error(
            "item should not have been returned, the geometry is %s", item.geometry
        )

0 comments on commit 19c7ac3

Please sign in to comment.