Skip to content

Commit

Permalink
Ensure all archive members are unpacked on a second call (#365)
Browse files Browse the repository at this point in the history
After running Untar/Unzip with a single member passed and then running
again with `members=None` we'd only check that the unzipped folder 
exists and then return only the single member. This ensures that we 
return all of the requested files by checking that they exist in the unpacked
folder. 
Co-authored-by: Leonardo Uieda <[email protected]>
  • Loading branch information
jni authored Feb 19, 2024
1 parent 1204d76 commit ec7f3ee
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 44 deletions.
93 changes: 66 additions & 27 deletions pooch/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""
Post-processing hooks
"""
import abc
import os
import bz2
import gzip
Expand All @@ -19,9 +20,9 @@
from .utils import get_logger


class ExtractorProcessor: # pylint: disable=too-few-public-methods
class ExtractorProcessor(abc.ABC): # pylint: disable=too-few-public-methods
"""
Base class for extractions from compressed archives.
Abstract base class for extractions from compressed archives.
Subclasses can be used with :meth:`pooch.Pooch.fetch` and
:func:`pooch.retrieve` to unzip a downloaded data file into a folder in the
Expand All @@ -34,16 +35,43 @@ class ExtractorProcessor: # pylint: disable=too-few-public-methods
If None, will unpack all files in the archive. Otherwise, *members*
must be a list of file names to unpack from the archive. Only these
files will be unpacked.
extract_dir : str or None
If None, files will be unpacked to the default location (a folder in
the same location as the downloaded zip file, with a suffix added).
Otherwise, files will be unpacked to ``extract_dir``, which is
interpreted as a *relative path* (relative to the cache location
provided by :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch`).
"""

# String appended to unpacked archive. To be implemented in subclass
suffix = None

def __init__(self, members=None, extract_dir=None):
self.members = members
self.extract_dir = extract_dir

@property
@abc.abstractmethod
def suffix(self):
"""
String appended to unpacked archive folder name.
Only used if extract_dir is None.
MUST BE IMPLEMENTED BY CHILD CLASSES.
"""

@abc.abstractmethod
def _all_members(self, fname):
"""
Return all the members in the archive.
MUST BE IMPLEMENTED BY CHILD CLASSES.
"""

@abc.abstractmethod
def _extract_file(self, fname, extract_dir):
"""
This method receives an argument for the archive to extract and the
destination path.
MUST BE IMPLEMENTED BY CHILD CLASSES.
"""

def __call__(self, fname, action, pooch):
"""
Extract all files from the given archive.
Expand All @@ -69,27 +97,23 @@ def __call__(self, fname, action, pooch):
A list of the full path to all files in the extracted archive.
"""
if self.suffix is None and self.extract_dir is None:
raise NotImplementedError(
"Derived classes must define either a 'suffix' attribute or "
"an 'extract_dir' attribute."
)
if self.extract_dir is None:
self.extract_dir = fname + self.suffix
else:
archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0]
self.extract_dir = os.path.join(archive_dir, self.extract_dir)
# Get a list of everyone who is supposed to be in the unpacked folder
# so we can check if they are all there or if we need to extract new
# files.
if self.members is None or not self.members:
members = self._all_members(fname)
else:
members = self.members
if (
(action in ("update", "download"))
or (not os.path.exists(self.extract_dir))
or (
(self.members is not None)
and (
not all(
os.path.exists(os.path.join(self.extract_dir, m))
for m in self.members
)
)
or not all(
os.path.exists(os.path.join(self.extract_dir, m)) for m in members
)
):
# Make sure that the folder with the extracted files exists
Expand All @@ -111,13 +135,6 @@ def __call__(self, fname, action, pooch):

return fnames

def _extract_file(self, fname, extract_dir):
"""
This method receives an argument for the archive to extract and the
destination path. MUST BE IMPLEMENTED BY CHILD CLASSES.
"""
raise NotImplementedError


class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods
"""
Expand Down Expand Up @@ -146,7 +163,18 @@ class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods
"""

suffix = ".unzip"
@property
def suffix(self):
"""
String appended to unpacked archive folder name.
Only used if extract_dir is None.
"""
return ".unzip"

def _all_members(self, fname):
"""Return all members from a given archive."""
with ZipFile(fname, "r") as zip_file:
return zip_file.namelist()

def _extract_file(self, fname, extract_dir):
"""
Expand Down Expand Up @@ -207,7 +235,18 @@ class Untar(ExtractorProcessor): # pylint: disable=too-few-public-methods
:meth:`pooch.Pooch.fetch`).
"""

suffix = ".untar"
@property
def suffix(self):
"""
String appended to unpacked archive folder name.
Only used if extract_dir is None.
"""
return ".untar"

def _all_members(self, fname):
"""Return all members from a given archive."""
with TarFile.open(fname, "r") as tar_file:
return [info.name for info in tar_file.getmembers()]

def _extract_file(self, fname, extract_dir):
"""
Expand Down
66 changes: 49 additions & 17 deletions pooch/tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pytest

from .. import Pooch
from ..processors import Unzip, Untar, ExtractorProcessor, Decompress
from ..processors import Unzip, Untar, Decompress

from .utils import pooch_test_url, pooch_test_registry, check_tiny_data, capture_log

Expand Down Expand Up @@ -97,22 +97,6 @@ def test_decompress_fails():
assert "pooch.Unzip/Untar" in exception.value.args[0]


@pytest.mark.network
def test_extractprocessor_fails():
"The base class should be used and should fail when passed to fecth"
with TemporaryDirectory() as local_store:
# Setup a pooch in a temp dir
pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)
processor = ExtractorProcessor()
with pytest.raises(NotImplementedError) as exception:
pup.fetch("tiny-data.tar.gz", processor=processor)
assert "'suffix'" in exception.value.args[0]
processor.suffix = "tar.gz"
with pytest.raises(NotImplementedError) as exception:
pup.fetch("tiny-data.tar.gz", processor=processor)
assert not exception.value.args


@pytest.mark.network
@pytest.mark.parametrize(
"target_path", [None, "some_custom_path"], ids=["default_path", "custom_path"]
Expand Down Expand Up @@ -255,3 +239,51 @@ def _unpacking_expected_paths_and_logs(archive, members, path, name):
log_lines.append(f"Extracting '{member}'")
true_paths = set(true_paths)
return true_paths, log_lines


@pytest.mark.network
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpacking_members_then_no_members(processor_class, extension):
"""
Test that calling with valid members then without them works.
https://github.com/fatiando/pooch/issues/364
"""
with TemporaryDirectory() as local_store:
pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)

# Do a first fetch with an existing member
processor1 = processor_class(members=["store/tiny-data.txt"])
filenames1 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames1) == 1

# Do a second fetch with no members
processor2 = processor_class()
filenames2 = pup.fetch("store" + extension, processor=processor2)
assert len(filenames2) > 1


@pytest.mark.network
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpacking_wrong_members_then_no_members(processor_class, extension):
"""
Test that calling with invalid members then without them works.
https://github.com/fatiando/pooch/issues/364
"""
with TemporaryDirectory() as local_store:
pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)

# Do a first fetch with incorrect member
processor1 = processor_class(members=["not-a-valid-file.csv"])
filenames1 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames1) == 0

# Do a second fetch with no members
processor2 = processor_class()
filenames2 = pup.fetch("store" + extension, processor=processor2)
assert len(filenames2) > 0

0 comments on commit ec7f3ee

Please sign in to comment.