OPTIM: Offer the ability to give the archived file list directly to `files.read_archived_file`. Propagated into `files.read_archived_xml` and `files.read_archived_html`
sertit · Dec 10, 2024 · ab16e05 · ab16e05
1 parent a17a70d
commit ab16e05
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 14 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -4,8 +4,8 @@
 
 - OPTIM: Don't download an archive stored on the cloud when trying to read a vector stored inside it in `vectors.read`
 - OPTIM: Don't download files stored on cloud when applying `ci.assert_files_equal` on them
-- OPTIM: Offer the ability to give the archived file list directly to `path.get_archived_file_list`, as this operation is expensive when done with large archives stored on the cloud (and thus better done only once).
-  Propagated into `path.get_archived_path`, `path.get_archived_rio_path` and `vectors.read`.
+- OPTIM: Offer the ability to give the archived file list directly to `path.get_archived_file_list` and `files.read_archived_file`, as this operation is expensive when done with large archives stored on the cloud (and thus better done only once).
+  Propagated into `path.get_archived_path`, `path.get_archived_rio_path`, `vectors.read`, `files.read_archived_xml` and `files.read_archived_html`.
 
 ## 1.44.0 (2024-12-09)
 

diff --git a/CI/SCRIPTS/test_files.py b/CI/SCRIPTS/test_files.py
@@ -149,11 +149,27 @@ def test_archived_files():
         ci.assert_html_equal(html_ok, html.fromstring(file_zip))
         ci.assert_html_equal(html_ok, html.fromstring(file_tar))
 
+        file_list = path.get_archived_file_list(html_zip_file)
+        ci.assert_html_equal(
+            html_ok,
+            html.fromstring(
+                files.read_archived_file(html_zip_file, html_regex, file_list=file_list)
+            ),
+        )
+
         # HTML
         html_zip = files.read_archived_html(html_zip_file, html_regex)
         html_tar = files.read_archived_html(html_tar_file, html_regex)
         ci.assert_html_equal(html_ok, html_zip)
         ci.assert_html_equal(html_ok, html_tar)
+        ci.assert_html_equal(
+            html_ok,
+            files.read_archived_html(
+                html_tar_file,
+                html_regex,
+                file_list=path.get_archived_file_list(html_tar_file),
+            ),
+        )
 
         # ERRORS
         with pytest.raises(TypeError):

diff --git a/sertit/files.py b/sertit/files.py
@@ -377,7 +377,9 @@ def get_archived_rio_path(
     return path.get_archived_rio_path(archive_path, file_regex, as_list)
 
 
-def read_archived_file(archive_path: AnyPathStrType, regex: str) -> bytes:
+def read_archived_file(
+    archive_path: AnyPathStrType, regex: str, file_list: list = None
+) -> bytes:
     """
     Read archived file (in bytes) from :code:`zip` or :code:`tar` archives.
 
@@ -386,6 +388,7 @@ def read_archived_file(archive_path: AnyPathStrType, regex: str) -> bytes:
     Args:
         archive_path (AnyPathStrType): Archive path
         regex (str): Regex (used by re) as it can be found in the getmembers() list
+        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
 
     Returns:
          bytes: Archived file in bytes
@@ -399,16 +402,19 @@ def read_archived_file(archive_path: AnyPathStrType, regex: str) -> bytes:
     try:
         if archive_path.suffix == ".tar":
             with tarfile.open(archive_path) as tar_ds:
-                tar_mb = tar_ds.getmembers()
-                name_list = [mb.name for mb in tar_mb]
-                band_name = list(filter(regex.match, name_list))[0]
-                tarinfo = [mb for mb in tar_mb if mb.name == band_name][0]
+                # file_list is not very useful for TAR files...
+                if file_list is None:
+                    tar_mb = tar_ds.getmembers()
+                    file_list = [mb.name for mb in tar_mb]
+                name = list(filter(regex.match, file_list))[0]
+                tarinfo = tar_ds.getmember(name)
                 file_str = tar_ds.extractfile(tarinfo).read()
         elif archive_path.suffix == ".zip":
             with zipfile.ZipFile(archive_path) as zip_ds:
-                name_list = [f.filename for f in zip_ds.filelist]
-                band_name = list(filter(regex.match, name_list))[0]
-                file_str = zip_ds.read(band_name)
+                if file_list is None:
+                    file_list = [f.filename for f in zip_ds.filelist]
+                name = list(filter(regex.match, file_list))[0]
+                file_str = zip_ds.read(name)
 
         elif archive_path.suffix == ".tar.gz":
             raise TypeError(
@@ -426,7 +432,9 @@ def read_archived_file(archive_path: AnyPathStrType, regex: str) -> bytes:
     return file_str
 
 
-def read_archived_xml(archive_path: AnyPathStrType, xml_regex: str) -> etree._Element:
+def read_archived_xml(
+    archive_path: AnyPathStrType, xml_regex: str, file_list: list = None
+) -> etree._Element:
     """
     Read archived XML from :code:`zip` or :code:`tar` archives.
 
@@ -435,6 +443,7 @@ def read_archived_xml(archive_path: AnyPathStrType, xml_regex: str) -> etree._El
     Args:
         archive_path (AnyPathStrType): Archive path
         xml_regex (str): XML regex (used by re) as it can be found in the getmembers() list
+        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
 
     Returns:
          etree._Element: XML file
@@ -445,12 +454,14 @@ def read_archived_xml(archive_path: AnyPathStrType, xml_regex: str) -> etree._El
         >>> read_archived_xml(arch_path, file_regex)
         <Element LANDSAT_METADATA_FILE at 0x1c90007f8c8>
     """
-    xml_bytes = read_archived_file(archive_path, xml_regex)
+    xml_bytes = read_archived_file(archive_path, xml_regex, file_list=file_list)
 
     return etree.fromstring(xml_bytes)
 
 
-def read_archived_html(archive_path: AnyPathStrType, regex: str) -> html.HtmlElement:
+def read_archived_html(
+    archive_path: AnyPathStrType, regex: str, file_list: list = None
+) -> html.HtmlElement:
     """
     Read archived HTML from :code:`zip` or :code:`tar` archives.
 
@@ -459,6 +470,7 @@ def read_archived_html(archive_path: AnyPathStrType, regex: str) -> html.HtmlEle
     Args:
         archive_path (AnyPathStrType): Archive path
         regex (str): HTML regex (used by re) as it can be found in the getmembers() list
+        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
 
     Returns:
          html.HtmlElement: HTML file
@@ -469,7 +481,7 @@ def read_archived_html(archive_path: AnyPathStrType, regex: str) -> html.HtmlEle
         >>> read_archived_html(arch_path, file_regex)
         <Element html at 0x1c90007f8c8>
     """
-    html_bytes = read_archived_file(archive_path, regex)
+    html_bytes = read_archived_file(archive_path, regex, file_list=file_list)
 
     return html.fromstring(html_bytes)
 

diff --git a/sertit/path.py b/sertit/path.py
@@ -268,6 +268,7 @@ def get_archived_rio_path(
         archive_path (AnyPathStrType): Archive path
         file_regex (str): File regex (used by re) as it can be found in the getmembers() list
         as_list (bool): If true, returns a list (including all found files). If false, returns only the first match
+        file_list (list): List of files contained in the archive. Optional, if not given it will be re-computed.
 
     Returns:
         Union[list, str]: Band path that can be read by rasterio