Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: resume archive extraction by skipping existing files #786

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
test for actual size and not only > 0
  • Loading branch information
SiQube committed Jan 2, 2025

Verified

This commit was signed with the committer’s verified signature.
dplocki Dawid Płocki
commit c06ded89254dbe1426a033ab97976a6c7043eb74
58 changes: 37 additions & 21 deletions src/pymovements/utils/archives.py
Original file line number Diff line number Diff line change
@@ -42,6 +42,7 @@ def extract_archive(
remove_finished: bool = False,
remove_top_level: bool = True,
verbose: int = 1,
resume: bool = False,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

verbose should come after resume

Copy link
Contributor

@dkrako dkrako Jan 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which means this change would be a breaking change as verbose can be passed as a positional argument.

I strongly prefer verbose as the last argument. We should probably add a *, after the recursive parameter and enforce passing remove_finished, remove_top_level, resume and verbose as keyword arguments.

We can either introduce the breaking change in this PR or in a separate one. What do you think?

Also we should keep this issue in mind when enhancing signatures in the future. To be future-proof and backwards compatible it is necessary to keep the number of positional arguments to a comfortable minimum.

) -> Path:
"""Extract an archive.

@@ -65,6 +66,8 @@ def extract_archive(
Verbosity levels: (1) Print messages for extracting each dataset resource without printing
messages for recursive archives. (2) Print additional messages for each recursive archive
extract. (default: 1)
resume: bool
Resume previous extraction. (default: False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a few more info like this:

Resume previous extraction by skipping existing files. Checks for correct size of existing files but not integrity. (default: False)


Returns
-------
@@ -90,7 +93,7 @@ def extract_archive(
print(f'Extracting {source_path.name} to {destination_path}')

# Extract file and remove archive if desired.
extractor(source_path, destination_path, compression_type)
extractor(source_path, destination_path, compression_type, resume)
if remove_finished:
source_path.unlink()

@@ -129,6 +132,7 @@ def extract_archive(
remove_finished=remove_finished,
remove_top_level=remove_top_level,
verbose=0 if verbose < 2 else 2,
resume=resume,
)

return destination_path
@@ -138,7 +142,7 @@ def _extract_tar(
source_path: Path,
destination_path: Path,
compression: str | None,
skip: bool = True,
resume: bool = True,
) -> None:
"""Extract a tar archive.

@@ -150,28 +154,32 @@ def _extract_tar(
Path to the directory the file will be extracted to.
compression: str | None
Compression filename suffix.
skip: bool
Skip already extracted files. (default: True)
resume: bool
Resume previous extraction. (default: True)
"""
with tarfile.open(source_path, f'r:{compression[1:]}' if compression else 'r') as archive:
for member in archive.getnames():
if (
os.path.exists(os.path.join(destination_path, member)) and
member[-4:] not in _ARCHIVE_EXTRACTORS and
tarfile.TarInfo(os.path.join(destination_path, member)).size > 0 and
skip
):
continue
for member in archive.getmembers():
member_name = member.name
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I doubt these two lines increase readability. They are probably just a leftover which should be refactored away

member_size = member.size
member_dest_path = os.path.join(destination_path, member_name)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while you're at it, you can probably move this line into the following if-clause

if resume:
if (
os.path.exists(member_dest_path) and
member_name[-4:] not in _ARCHIVE_EXTRACTORS and
member_size == os.path.getsize(member_dest_path)
):
continue
if sys.version_info < (3, 12): # pragma: <3.12 cover
archive.extract(member, destination_path)
archive.extract(member_name, destination_path)
else: # pragma: >=3.12 cover
archive.extract(member, destination_path, filter='tar')
archive.extract(member_name, destination_path, filter='tar')


def _extract_zip(
source_path: Path,
destination_path: Path,
compression: str | None,
resume: bool,
) -> None:
"""Extract a zip archive.

@@ -186,13 +194,21 @@ def _extract_zip(
"""
compression_id = _ZIP_COMPRESSION_MAP[compression] if compression else zipfile.ZIP_STORED
with zipfile.ZipFile(source_path, 'r', compression=compression_id) as archive:
for member in archive.namelist():
if (
os.path.exists(os.path.join(destination_path, member)) and
member[-4:] not in _ARCHIVE_EXTRACTORS
):
continue
archive.extract(member, destination_path)
for member in archive.filelist:
member_filename = member.filename
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

member_dest_path = os.path.join(destination_path, member_filename)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

if resume:
member_size = member.file_size
if (
os.path.exists(member_dest_path) and
member_filename[-4:] not in _ARCHIVE_EXTRACTORS and
member_size == os.path.getsize(member_dest_path)
):
continue
else:
archive.extract(member_filename, destination_path)
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in contrast to the tar implementation, there is no need for an additional if-else

archive.extract(member_filename, destination_path)


_ARCHIVE_EXTRACTORS: dict[str, Callable[[Path, Path, str | None], None]] = {
14 changes: 10 additions & 4 deletions tests/unit/utils/archives_test.py
Original file line number Diff line number Diff line change
@@ -500,31 +500,35 @@ def test_decompress_unknown_compression_suffix():


@pytest.mark.parametrize(
('recursive', 'remove_top_level', 'expected_files'),
('recursive', 'remove_top_level', 'expected_files', 'resume'),
[
pytest.param(
False, False,
False,
False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
),
True,
id='recursive_false_remove_finished_false',
),
pytest.param(
True, False,
True,
False,
(
'toplevel',
os.path.join('toplevel', 'recursive.zip'),
os.path.join('toplevel', 'recursive'),
os.path.join('toplevel', 'recursive', 'singlechild'),
os.path.join('toplevel', 'recursive', 'singlechild', 'test.file'),
),
False,
id='recursive_true_remove_finished_false',
),
],
)
def test_extract_archive_destination_path_not_None_no_remove_top_level_no_remove_finished_twice(
dkrako marked this conversation as resolved.
Show resolved Hide resolved
recursive, remove_top_level, archive, tmp_path, expected_files,
recursive, remove_top_level, archive, tmp_path, expected_files, resume,
):
destination_path = tmp_path / pathlib.Path('tmpfoo')
extract_archive(
@@ -533,13 +537,15 @@ def test_extract_archive_destination_path_not_None_no_remove_top_level_no_remove
recursive=recursive,
remove_finished=False,
remove_top_level=remove_top_level,
resume=resume,
)
extract_archive(
source_path=archive,
destination_path=destination_path,
recursive=recursive,
remove_finished=False,
remove_top_level=remove_top_level,
resume=resume,
)

if destination_path.is_file():