Skip to content

Commit

Permalink
OPTIM: Offer the ability to give the archived file list directly to `…
Browse files Browse the repository at this point in the history
…path.get_archived_file_list`, as this operation is expensive when done with large archives stored on the cloud (and thus better done only once). Propagated into `path.get_archived_path`, `path.get_archived_rio_path` and `vectors.read`.
  • Loading branch information
remi-braun committed Dec 10, 2024
1 parent 8f98c4c commit a17a70d
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

- OPTIM: Don't download an archive stored on the cloud when trying to read a vector stored inside it in `vectors.read`
- OPTIM: Don't download files stored on cloud when applying `ci.assert_files_equal` on them
- OPTIM: Offer the ability to pass a precomputed archived file list (as returned by `path.get_archived_file_list`) directly to `path.get_archived_path`, `path.get_archived_rio_path` and `vectors.read`, as listing the files of an archive is expensive when done with large archives stored on the cloud (and thus better done only once).

## 1.44.0 (2024-12-09)

Expand Down
5 changes: 5 additions & 0 deletions CI/SCRIPTS/test_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ def test_archived_paths():
ci.assert_raster_equal(tif_ok, tif_list[0])
ci.assert_raster_equal(tif_ok, tif_tar)

file_list = path.get_archived_file_list(zip_file)
ci.assert_raster_equal(
tif_ok, path.get_archived_rio_path(zip_file, tif_regex, file_list=file_list)
)

# VECTORS
vect_name = "map-overlay.kml"
vec_ok_path = ok_folder.joinpath(vect_name)
Expand Down
6 changes: 6 additions & 0 deletions CI/SCRIPTS/test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,12 @@ def test_read_archived():
vectors.read(tar_landsat, archive_regex=map_overlay_regex),
)

file_list = path.get_archived_file_list(tar_landsat)
ci.assert_geom_equal(
map_overlay_extracted,
vectors.read(tar_landsat, archive_regex=map_overlay_regex, file_list=file_list),
)


def test_window():
"""Test read with window"""
Expand Down
16 changes: 13 additions & 3 deletions sertit/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ def get_archived_path(
file_regex: str,
as_list: bool = False,
case_sensitive: bool = False,
file_list: list = None,
) -> Union[list, AnyPathType]:
"""
Get archived file path from inside the archive.
Expand All @@ -202,6 +203,7 @@ def get_archived_path(
file_regex (str): File regex (used by re) as it can be found in the getmembers() list
as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
case_sensitive (bool): If true, the regex is case-sensitive.
file_list (list): List of files to get archived from. Optional, if not given it will be re-computed.
Returns:
Union[list, str]: Path from inside the zipfile
Expand All @@ -214,7 +216,10 @@ def get_archived_path(
"""
# Get file list
archive_path = AnyPath(archive_path)
file_list = get_archived_file_list(archive_path)

# Offer the ability to give the file list directly, as this operation is expensive when done with large archives stored on the cloud
if file_list is None:
file_list = get_archived_file_list(archive_path)

# Search for file
regex = (
Expand All @@ -236,7 +241,10 @@ def get_archived_path(


def get_archived_rio_path(
archive_path: AnyPathStrType, file_regex: str, as_list: bool = False
archive_path: AnyPathStrType,
file_regex: str,
as_list: bool = False,
file_list: list = None,
) -> Union[list, AnyPathType]:
"""
Get archived file path from inside the archive, to be read with rasterio:
Expand Down Expand Up @@ -283,7 +291,9 @@ def get_archived_rio_path(
raise TypeError("Only .zip and .tar files can be read from inside its archive.")

# Search for file
archived_band_paths = get_archived_path(archive_path, file_regex, as_list=True)
archived_band_paths = get_archived_path(
archive_path, file_regex, as_list=True, file_list=file_list
)

# Convert to rio path
if is_cloud_path(archive_path):
Expand Down
7 changes: 5 additions & 2 deletions sertit/vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,8 @@ def read(
archive_regex (str): [Archive only] Regex for the wanted vector inside the archive
window (Any): Anything that can be returned as a bbox (i.e. path, gpd.GeoPandas, Iterable, ...).
In case of an iterable, assumption is made it corresponds to geographic bounds. Mimics :code:`rasters.read(..., window=)`. If given, :code:`bbox` is ignored.
**kwargs: Additional arguments used in gpd.read_file
**kwargs: Additional arguments used in gpd.read_file.
You can also give :code:`file_list`, the list of files of the archive to get the vector from, as this operation is expensive when done with large archives stored on the cloud.
Returns:
gpd.GeoDataFrame: Read vector as a GeoDataFrame
Expand Down Expand Up @@ -477,7 +478,9 @@ def read(
# Manage archive case
if vector_path.suffix in [".tar", ".zip"]:
prefix = vector_path.suffix[-3:]
file_list = path.get_archived_file_list(vector_path)
file_list = kwargs.pop(
"file_list", path.get_archived_file_list(vector_path)
)

try:
regex = re.compile(archive_regex)
Expand Down

0 comments on commit a17a70d

Please sign in to comment.