diff --git a/apps/dc_tools/odc/apps/dc_tools/_version.py b/apps/dc_tools/odc/apps/dc_tools/_version.py
index 11ef09286..f3291e93b 100644
--- a/apps/dc_tools/odc/apps/dc_tools/_version.py
+++ b/apps/dc_tools/odc/apps/dc_tools/_version.py
@@ -1 +1 @@
-__version__ = "0.2.13"
+__version__ = "0.2.14"
diff --git a/apps/dc_tools/odc/apps/dc_tools/azure_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/azure_to_dc.py
index 2fb90f7ab..52ee4ec4f 100644
--- a/apps/dc_tools/odc/apps/dc_tools/azure_to_dc.py
+++ b/apps/dc_tools/odc/apps/dc_tools/azure_to_dc.py
@@ -39,7 +39,7 @@ def dump_list_to_odc(
     update: Optional[bool] = False,
     update_if_exists: Optional[bool] = False,
     allow_unsafe: Optional[bool] = False,
-    archive_less_mature: Optional[bool] = False,
+    archive_less_mature: Optional[int] = None,
     publish_action: Optional[str] = None,
 ):
     ds_added = 0
@@ -102,7 +102,7 @@ def cli(
     allow_unsafe: bool,
     stac: bool,
     statsd_setting: str,
-    archive_less_mature: bool,
+    archive_less_mature: int,
     publish_action: str,
     account_url: str,
     container_name: str,
diff --git a/apps/dc_tools/odc/apps/dc_tools/cop_dem_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/cop_dem_to_dc.py
index 3a9cc06e5..0acd4b788 100644
--- a/apps/dc_tools/odc/apps/dc_tools/cop_dem_to_dc.py
+++ b/apps/dc_tools/odc/apps/dc_tools/cop_dem_to_dc.py
@@ -107,7 +107,7 @@ def process_uri_tile(
     dc: Datacube,
     doc2ds: Doc2Dataset,
     update_if_exists: bool = True,
-    archive_less_mature: bool = False,
+    archive_less_mature: int = None,
     publish_action: str = None,
 ) -> Tuple[pystac.Item, str]:
     product_name = f"dem_{product}"
diff --git a/apps/dc_tools/odc/apps/dc_tools/esa_worldcover_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/esa_worldcover_to_dc.py
index a1bd75286..8474bc410 100644
--- a/apps/dc_tools/odc/apps/dc_tools/esa_worldcover_to_dc.py
+++ b/apps/dc_tools/odc/apps/dc_tools/esa_worldcover_to_dc.py
@@ -117,7 +117,7 @@ def process_uri_tile(
     dc: Datacube,
     doc2ds: Doc2Dataset,
     update_if_exists: bool = True,
-    archive_less_mature: bool = False,
+    archive_less_mature: int = None,
     publish_action: str = None,
 ) -> Tuple[pystac.Item, str]:
     product_name = "esa_worldcover_" + map_version["year"]
@@ -166,7 +166,7 @@ def esa_wc_to_dc(
     limit: int,
     update: bool,
     n_workers: int = 100,
-    archive_less_mature: bool = False,
+    archive_less_mature: int = None,
     publish_action: str = None,
 ) -> Tuple[int, int]:
     doc2ds = Doc2Dataset(dc.index)
diff --git a/apps/dc_tools/odc/apps/dc_tools/stac_api_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/stac_api_to_dc.py
index 34ec66b8d..35863f260 100644
--- a/apps/dc_tools/odc/apps/dc_tools/stac_api_to_dc.py
+++ b/apps/dc_tools/odc/apps/dc_tools/stac_api_to_dc.py
@@ -125,7 +125,7 @@ def process_item(
     allow_unsafe: bool,
     rewrite: Optional[Tuple[str, str]] = None,
     rename_product: Optional[str] = None,
-    archive_less_mature: bool = False,
+    archive_less_mature: int = None,
     publish_action: bool = False,
 ):
     meta, uri, stac = item_to_meta_uri(item, rewrite, rename_product)
@@ -150,7 +150,7 @@ def stac_api_to_odc(
     allow_unsafe: bool = True,
     rewrite: Optional[Tuple[str, str]] = None,
     rename_product: Optional[str] = None,
-    archive_less_mature: bool = False,
+    archive_less_mature: int = None,
     publish_action: Optional[str] = None,
 ) -> Tuple[int, int, int]:
     doc2ds = Doc2Dataset(dc.index)
diff --git a/apps/dc_tools/odc/apps/dc_tools/utils.py b/apps/dc_tools/odc/apps/dc_tools/utils.py
index 05409d441..60d78b47d 100644
--- a/apps/dc_tools/odc/apps/dc_tools/utils.py
+++ b/apps/dc_tools/odc/apps/dc_tools/utils.py
@@ -3,7 +3,7 @@
 import importlib_resources
 from datadog import statsd, initialize
 from odc.aws.queue import publish_to_topic
-from typing import Iterable, Optional, Union
+from typing import Optional
 
 from datacube import Datacube
 from datacube.index.hl import Doc2Dataset
@@ -113,13 +113,17 @@ class SkippedException(Exception):
 
 
 archive_less_mature = click.option(
     "--archive-less-mature",
-    is_flag=True,
-    default=False,
+    is_flag=False,
+    flag_value=500,
+    default=None,
+    type=int,
     help=(
         "Archive existing any datasets that match product, "
         "time and region-code, but have lower dataset-maturity."
         "Note: An error will be raised and the dataset add will "
         "fail if a matching dataset with higher or equal dataset-maturity."
+        "Can specify an amount of leniency for comparing timestamps, provided in milliseconds. "
+        "Default value is 500ms."
     ),
 )
@@ -176,7 +180,7 @@ def index_update_dataset(
     update: bool = False,
     update_if_exists: bool = False,
     allow_unsafe: bool = False,
-    archive_less_mature: Optional[Union[bool, Iterable[str]]] = None,
+    archive_less_mature: Optional[int] = None,
     publish_action: Optional[str] = None,
     stac_doc: Optional[dict] = None,
 ) -> int:
@@ -191,13 +195,12 @@ def index_update_dataset(
     :param update_if_exists: If true allow insert or update.
     :param allow_unsafe: Allow unsafe (arbitrary) dataset updates.
     :param archive_less_mature: Enforce dataset maturity.
-        * If None (the default) or False or an empty iterable, ignore dataset maturity.
-        * If True, enforce dataset maturity by looking for existing datasets with same product, region_code and time
+        * If None (the default), ignore dataset maturity.
+        * If int, enforce dataset maturity by looking for existing datasets with same product, region_code and time
          values. If a less mature match is found, it is archived and replaced with the new dataset being inserted.
          If a match of the same or greater maturity is found a SkippedException is raised.
-        * If an iterable of valid search field names is provided, it is used as the "grouping" fields for
-          identifying dataset maturity matches.
-          (i.e. `archive_less_mature=True` is the same as `archive_less_mature=['region_code', 'time'])
+          The integer value is used as the leniency, in milliseconds, applied when comparing
+          timestamps, to allow for slight discrepancies between otherwise matching datasets. Default is 500ms.
     :param publish_action: SNS topic arn to publish action to.
     :param stac_doc: STAC document for publication to SNS topic.
     :return: Returns nothing. Raises an exception if anything goes wrong.
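A note on the reworked click option above: `is_flag=False` combined with `flag_value` makes the option's value optional. A minimal, self-contained sketch of the resulting behaviour (not part of this diff; the `demo` command is hypothetical):

import click


@click.command()
@click.option(
    "--archive-less-mature",
    is_flag=False,   # the option accepts a value...
    flag_value=500,  # ...but the bare flag falls back to 500
    default=None,    # omitting the flag entirely yields None
    type=int,
)
def demo(archive_less_mature):
    # `demo` -> leniency=None
    # `demo --archive-less-mature` -> leniency=500
    # `demo --archive-less-mature 1500` -> leniency=1500
    click.echo(f"leniency={archive_less_mature}")


if __name__ == "__main__":
    demo()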
@@ -221,8 +224,9 @@ def index_update_dataset(
     archive_stacs = []
     added = False
     updated = False
-    if archive_less_mature and publish_action:
-        dupes = dc.index.datasets.find_less_mature(ds, 500)
+
+    if isinstance(archive_less_mature, int) and publish_action:
+        dupes = dc.index.datasets.find_less_mature(ds, archive_less_mature)
         for dupe in dupes:
             archive_stacs.append(ds_to_stac(dupe))
 
diff --git a/apps/dc_tools/tests/test_fs_to_dc.py b/apps/dc_tools/tests/test_fs_to_dc.py
index 2daa7a61e..49edadef9 100644
--- a/apps/dc_tools/tests/test_fs_to_dc.py
+++ b/apps/dc_tools/tests/test_fs_to_dc.py
@@ -51,6 +51,36 @@ def test_archive_less_mature(odc_db, test_data_dir, nrt_dsid, final_dsid):
     assert dc.index.datasets.get(nrt_dsid).archived_time is not None
 
 
+def test_dont_archive_less_mature(odc_db, test_data_dir, nrt_dsid, final_dsid):
+    # No archiving should be done if --archive-less-mature is not set
+    dc = odc_db
+    runner = CliRunner()
+
+    # Index NRT dataset
+    result = runner.invoke(
+        fs_to_dc_cli,
+        [
+            test_data_dir,
+            "--glob=**/maturity-nrt.odc-metadata.yaml",
+        ],
+    )
+    assert result.exit_code == 0
+    assert dc.index.datasets.get(final_dsid) is None
+    assert dc.index.datasets.get(nrt_dsid).archived_time is None
+
+    # Index Final dataset (should not auto-archive NRT)
+    result = runner.invoke(
+        fs_to_dc_cli,
+        [
+            test_data_dir,
+            "--glob=**/maturity-final.odc-metadata.yaml",
+        ],
+    )
+    assert result.exit_code == 0
+    assert dc.index.datasets.get(final_dsid).archived_time is None
+    assert dc.index.datasets.get(nrt_dsid).archived_time is None
+
+
 def test_keep_more_mature(odc_db, test_data_dir, nrt_dsid, final_dsid):
     dc = odc_db
     runner = CliRunner()
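For completeness, a hedged sketch of how the explicit-leniency path could be exercised alongside the tests above. This test is illustrative only and not part of the diff: the test name and the 1500 ms value are invented, and the import path for the fs-to-dc CLI is assumed to match the test module's existing imports.

# Illustrative only. Assumes the same pytest fixtures (odc_db, test_data_dir,
# nrt_dsid, final_dsid) used by the tests above.
from click.testing import CliRunner

from odc.apps.dc_tools.fs_to_dc import cli as fs_to_dc_cli  # assumed import path


def test_archive_less_mature_custom_leniency(odc_db, test_data_dir, nrt_dsid, final_dsid):
    dc = odc_db
    runner = CliRunner()

    # Index the NRT dataset with an explicit 1500 ms leniency window
    result = runner.invoke(
        fs_to_dc_cli,
        [
            test_data_dir,
            "--glob=**/maturity-nrt.odc-metadata.yaml",
            "--archive-less-mature",
            "1500",
        ],
    )
    assert result.exit_code == 0

    # Indexing the Final dataset should auto-archive the NRT one, tolerating
    # timestamp differences of up to 1.5 seconds between the two
    result = runner.invoke(
        fs_to_dc_cli,
        [
            test_data_dir,
            "--glob=**/maturity-final.odc-metadata.yaml",
            "--archive-less-mature",
            "1500",
        ],
    )
    assert result.exit_code == 0
    assert dc.index.datasets.get(nrt_dsid).archived_time is not None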