diff --git a/CHANGES.md b/CHANGES.md index 8788ee1..2c82e87 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,8 @@ - OPTIM: Don't download an archive stored on the cloud when trying to read a vector stored inside it in `vectors.read` - OPTIM: Don't download files stored on cloud when applying `ci.assert_files_equal` on them +- OPTIM: Offer the ability to give the archived file list (as returned by `path.get_archived_file_list`) directly to `path.get_archived_path`, as computing this list is expensive with large archives stored on the cloud (and thus better done only once). + Propagated into `path.get_archived_rio_path` and `vectors.read`. ## 1.44.0 (2024-12-09) diff --git a/CI/SCRIPTS/test_path.py b/CI/SCRIPTS/test_path.py index 78a7cc9..44171ab 100644 --- a/CI/SCRIPTS/test_path.py +++ b/CI/SCRIPTS/test_path.py @@ -91,6 +91,11 @@ def test_archived_paths(): ci.assert_raster_equal(tif_ok, tif_list[0]) ci.assert_raster_equal(tif_ok, tif_tar) + file_list = path.get_archived_file_list(zip_file) + ci.assert_raster_equal( + tif_ok, path.get_archived_rio_path(zip_file, tif_regex, file_list=file_list) + ) + # VECTORS vect_name = "map-overlay.kml" vec_ok_path = ok_folder.joinpath(vect_name) diff --git a/CI/SCRIPTS/test_vectors.py b/CI/SCRIPTS/test_vectors.py index a3dca0a..0ecd93a 100644 --- a/CI/SCRIPTS/test_vectors.py +++ b/CI/SCRIPTS/test_vectors.py @@ -290,6 +290,12 @@ def test_read_archived(): vectors.read(tar_landsat, archive_regex=map_overlay_regex), ) + file_list = path.get_archived_file_list(tar_landsat) + ci.assert_geom_equal( + map_overlay_extracted, + vectors.read(tar_landsat, archive_regex=map_overlay_regex, file_list=file_list), + ) + def test_window(): """Test read with window""" diff --git a/sertit/path.py b/sertit/path.py index 20ff09b..1d0d2c7 100644 --- a/sertit/path.py +++ b/sertit/path.py @@ -188,6 +188,7 @@ def get_archived_path( file_regex: str, as_list: bool = False, case_sensitive: bool = False, + file_list: list = None, ) -> Union[list, AnyPathType]: """ Get 
archived file path from inside the archive. @@ -202,6 +203,7 @@ def get_archived_path( file_regex (str): File regex (used by re) as it can be found in the getmembers() list as_list (bool): If true, returns a list (including all found files). If false, returns only the first match case_sensitive (bool): If true, the regex is case-sensitive. + file_list (list): Pre-computed list of the files contained in the archive (as returned by get_archived_file_list). Optional: if not given, it is computed from the archive. Returns: Union[list, str]: Path from inside the zipfile @@ -214,7 +216,10 @@ def get_archived_path( """ # Get file list archive_path = AnyPath(archive_path) - file_list = get_archived_file_list(archive_path) + + # Reuse the file list given by the caller if any: listing the content of a large archive stored on the cloud is expensive, so only do it when needed + if file_list is None: + file_list = get_archived_file_list(archive_path) # Search for file regex = ( @@ -236,7 +241,10 @@ def get_archived_path( def get_archived_rio_path( - archive_path: AnyPathStrType, file_regex: str, as_list: bool = False + archive_path: AnyPathStrType, + file_regex: str, + as_list: bool = False, + file_list: list = None, ) -> Union[list, AnyPathType]: """ Get archived file path from inside the archive, to be read with rasterio: @@ -283,7 +291,9 @@ def get_archived_rio_path( raise TypeError("Only .zip and .tar files can be read from inside its archive.") # Search for file - archived_band_paths = get_archived_path(archive_path, file_regex, as_list=True) + archived_band_paths = get_archived_path( + archive_path, file_regex, as_list=True, file_list=file_list + ) # Convert to rio path if is_cloud_path(archive_path): diff --git a/sertit/vectors.py b/sertit/vectors.py index 901cb37..818b7ee 100644 --- a/sertit/vectors.py +++ b/sertit/vectors.py @@ -422,7 +422,8 @@ def read( archive_regex (str): [Archive only] Regex for the wanted vector inside the archive window (Any): Anything that can be returned as a bbox (i.e. path, gpd.GeoPandas, Iterable, ...). 
In case of an iterable, assumption is made it corresponds to geographic bounds. Mimics :code:`rasters.read(..., window=)`. If given, :code:`bbox` is ignored. - **kwargs: Additional arguments used in gpd.read_file + **kwargs: Additional arguments used in gpd.read_file. + You can also give :code:`file_list`, the pre-computed list of the files contained in the archive (as returned by :code:`path.get_archived_file_list`), as computing it is expensive with large archives stored on the cloud. If not given, it is computed from the archive. Returns: gpd.GeoDataFrame: Read vector as a GeoDataFrame @@ -477,7 +478,9 @@ def read( # Manage archive case if vector_path.suffix in [".tar", ".zip"]: prefix = vector_path.suffix[-3:] - file_list = path.get_archived_file_list(vector_path) + file_list = kwargs.pop("file_list", None)  # No computed default here: it would be evaluated (and the archive listed) even when file_list is given + if file_list is None: + file_list = path.get_archived_file_list(vector_path) try: regex = re.compile(archive_regex)