Skip to content

Commit

Permalink
OPTIM: Offer the ability to give the archived file list directly to `…
Browse files Browse the repository at this point in the history
…files.read_archived_file`. Propagated into `files.read_archived_xml` and `files.read_archived_html`
  • Loading branch information
remi-braun committed Dec 10, 2024
1 parent a17a70d commit ab16e05
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 14 deletions.
4 changes: 2 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

- OPTIM: Don't download an archive stored on the cloud when trying to read a vector stored inside it in `vectors.read`
- OPTIM: Don't download files stored on cloud when applying `ci.assert_files_equal` on them
- OPTIM: Offer the ability to give the archived file list directly to `path.get_archived_file_list`, as this operation is expensive when done with large archives stored on the cloud (and thus better done only once).
Propagated into `path.get_archived_path`, `path.get_archived_rio_path` and `vectors.read`.
- OPTIM: Offer the ability to give the archived file list directly to `path.get_archived_file_list` and `files.read_archived_file`, as computing this list is expensive for large archives stored on the cloud (and is thus better done only once).
Propagated into `path.get_archived_path`, `path.get_archived_rio_path`, `vectors.read`, `files.read_archived_xml` and `files.read_archived_html`.

## 1.44.0 (2024-12-09)

Expand Down
16 changes: 16 additions & 0 deletions CI/SCRIPTS/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,27 @@ def test_archived_files():
ci.assert_html_equal(html_ok, html.fromstring(file_zip))
ci.assert_html_equal(html_ok, html.fromstring(file_tar))

file_list = path.get_archived_file_list(html_zip_file)
ci.assert_html_equal(
html_ok,
html.fromstring(
files.read_archived_file(html_zip_file, html_regex, file_list=file_list)
),
)

# HTML
html_zip = files.read_archived_html(html_zip_file, html_regex)
html_tar = files.read_archived_html(html_tar_file, html_regex)
ci.assert_html_equal(html_ok, html_zip)
ci.assert_html_equal(html_ok, html_tar)
ci.assert_html_equal(
html_ok,
files.read_archived_html(
html_tar_file,
html_regex,
file_list=path.get_archived_file_list(html_tar_file),
),
)

# ERRORS
with pytest.raises(TypeError):
Expand Down
36 changes: 24 additions & 12 deletions sertit/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,9 @@ def get_archived_rio_path(
return path.get_archived_rio_path(archive_path, file_regex, as_list)


def read_archived_file(archive_path: AnyPathStrType, regex: str) -> bytes:
def read_archived_file(
archive_path: AnyPathStrType, regex: str, file_list: list = None
) -> bytes:
"""
Read archived file (in bytes) from :code:`zip` or :code:`tar` archives.
Expand All @@ -386,6 +388,7 @@ def read_archived_file(archive_path: AnyPathStrType, regex: str) -> bytes:
Args:
archive_path (AnyPathStrType): Archive path
regex (str): Regex (used by re) as it can be found in the getmembers() list
file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
Returns:
bytes: Archived file in bytes
Expand All @@ -399,16 +402,19 @@ def read_archived_file(archive_path: AnyPathStrType, regex: str) -> bytes:
try:
if archive_path.suffix == ".tar":
with tarfile.open(archive_path) as tar_ds:
tar_mb = tar_ds.getmembers()
name_list = [mb.name for mb in tar_mb]
band_name = list(filter(regex.match, name_list))[0]
tarinfo = [mb for mb in tar_mb if mb.name == band_name][0]
# file_list is not very useful for TAR files...
if file_list is None:
tar_mb = tar_ds.getmembers()
file_list = [mb.name for mb in tar_mb]
name = list(filter(regex.match, file_list))[0]
tarinfo = tar_ds.getmember(name)
file_str = tar_ds.extractfile(tarinfo).read()
elif archive_path.suffix == ".zip":
with zipfile.ZipFile(archive_path) as zip_ds:
name_list = [f.filename for f in zip_ds.filelist]
band_name = list(filter(regex.match, name_list))[0]
file_str = zip_ds.read(band_name)
if file_list is None:
file_list = [f.filename for f in zip_ds.filelist]
name = list(filter(regex.match, file_list))[0]
file_str = zip_ds.read(name)

elif archive_path.suffix == ".tar.gz":
raise TypeError(
Expand All @@ -426,7 +432,9 @@ def read_archived_file(archive_path: AnyPathStrType, regex: str) -> bytes:
return file_str


def read_archived_xml(archive_path: AnyPathStrType, xml_regex: str) -> etree._Element:
def read_archived_xml(
archive_path: AnyPathStrType, xml_regex: str, file_list: list = None
) -> etree._Element:
"""
Read archived XML from :code:`zip` or :code:`tar` archives.
Expand All @@ -435,6 +443,7 @@ def read_archived_xml(archive_path: AnyPathStrType, xml_regex: str) -> etree._El
Args:
archive_path (AnyPathStrType): Archive path
xml_regex (str): XML regex (used by re) as it can be found in the getmembers() list
file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
Returns:
etree._Element: XML file
Expand All @@ -445,12 +454,14 @@ def read_archived_xml(archive_path: AnyPathStrType, xml_regex: str) -> etree._El
>>> read_archived_xml(arch_path, file_regex)
<Element LANDSAT_METADATA_FILE at 0x1c90007f8c8>
"""
xml_bytes = read_archived_file(archive_path, xml_regex)
xml_bytes = read_archived_file(archive_path, xml_regex, file_list=file_list)

return etree.fromstring(xml_bytes)


def read_archived_html(archive_path: AnyPathStrType, regex: str) -> html.HtmlElement:
def read_archived_html(
archive_path: AnyPathStrType, regex: str, file_list: list = None
) -> html.HtmlElement:
"""
Read archived HTML from :code:`zip` or :code:`tar` archives.
Expand All @@ -459,6 +470,7 @@ def read_archived_html(archive_path: AnyPathStrType, regex: str) -> html.HtmlEle
Args:
archive_path (AnyPathStrType): Archive path
regex (str): HTML regex (used by re) as it can be found in the getmembers() list
file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
Returns:
html.HtmlElement: HTML file
Expand All @@ -469,7 +481,7 @@ def read_archived_html(archive_path: AnyPathStrType, regex: str) -> html.HtmlEle
>>> read_archived_html(arch_path, file_regex)
<Element html at 0x1c90007f8c8>
"""
html_bytes = read_archived_file(archive_path, regex)
html_bytes = read_archived_file(archive_path, regex, file_list=file_list)

return html.fromstring(html_bytes)

Expand Down
1 change: 1 addition & 0 deletions sertit/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ def get_archived_rio_path(
archive_path (AnyPathStrType): Archive path
file_regex (str): File regex (used by re) as it can be found in the getmembers() list
as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
Returns:
Union[list, str]: Band path that can be read by rasterio
Expand Down

0 comments on commit ab16e05

Please sign in to comment.