From 7091b0ae91d82f560d651117ad8e1f01580397b1 Mon Sep 17 00:00:00 2001 From: Michal Charemza Date: Sun, 16 Jul 2023 10:21:27 +0100 Subject: [PATCH 1/3] refactor: towards modes that accept uncompressed size and crc32 --- stream_zip.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/stream_zip.py b/stream_zip.py index e5cce48..d64a875 100644 --- a/stream_zip.py +++ b/stream_zip.py @@ -11,16 +11,16 @@ _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY = object() def NO_COMPRESSION_32(offset, default_get_compressobj): - return _NO_COMPRESSION_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj + return _NO_COMPRESSION_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None def NO_COMPRESSION_64(offset, default_get_compressobj): - return _NO_COMPRESSION_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj + return _NO_COMPRESSION_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None def ZIP_32(offset, default_get_compressobj): - return _ZIP_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj + return _ZIP_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None def ZIP_64(offset, default_get_compressobj): - return _ZIP_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj + return _ZIP_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None def ZIP_AUTO(uncompressed_size, level=9): def method_compressobj(offset, default_get_compressobj): @@ -37,7 +37,7 @@ def method_compressobj(offset, default_get_compressobj): # so Python could be causing extra deflate-chunks output which could break the limit. 
However, couldn't # get output of sized 4293656841 to break the Zip32 bound of 0xffffffff here for any level, including 0 method = _ZIP_64 if uncompressed_size > 4293656841 or offset > 0xffffffff else _ZIP_32 - return (method, _AUTO_UPGRADE_CENTRAL_DIRECTORY, lambda: zlib.compressobj(level=level, memLevel=8, wbits=-zlib.MAX_WBITS)) + return (method, _AUTO_UPGRADE_CENTRAL_DIRECTORY, lambda: zlib.compressobj(level=level, memLevel=8, wbits=-zlib.MAX_WBITS), None, None) return method_compressobj @@ -114,7 +114,7 @@ def _raise_if_beyond(offset, maximum, exception_class): if offset > maximum: raise exception_class() - def _zip_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks): + def _zip_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) @@ -177,7 +177,7 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra 0xffffffff, # Offset of local header - since zip64 ), name_encoded, extra - def _zip_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks): + def _zip_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) @@ -255,7 +255,7 @@ def _zip_data(chunks, _get_compress_obj, max_uncompressed_size, max_compressed_s return uncompressed_size, compressed_size, crc_32 - def _no_compression_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks): + def _no_compression_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, 
crc_32, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) @@ -314,7 +314,7 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at ), name_encoded, extra - def _no_compression_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks): + def _no_compression_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) @@ -380,7 +380,7 @@ def _chunks(): return chunks, size, crc_32 for name, modified_at, mode, method, chunks in files: - _method, _auto_upgrade_central_directory, _get_compress_obj = method(offset, get_compressobj) + _method, _auto_upgrade_central_directory, _get_compress_obj, uncompressed_size, crc_32 = method(offset, get_compressobj) name_encoded = name.encode('utf-8') _raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError) @@ -409,7 +409,7 @@ def _chunks(): _no_compression_64_local_header_and_data if _method is _NO_COMPRESSION_64 else \ _no_compression_32_local_header_and_data - central_directory_header_entry, name_encoded, extra = yield from data_func(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, evenly_sized(chunks)) + central_directory_header_entry, name_encoded, extra = yield from data_func(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, evenly_sized(chunks)) central_directory_size += len(central_directory_header_signature) + len(central_directory_header_entry) + len(name_encoded) + len(extra) central_directory.append((central_directory_header_entry, name_encoded, extra)) From 3e0d7d074032628efa9f70ffd896691e5e482d4d Mon Sep 17 00:00:00 2001 From: Michal 
Charemza Date: Sun, 16 Jul 2023 10:27:32 +0100 Subject: [PATCH 2/3] refactor: rename variable to make clear its buffered --- stream_zip.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stream_zip.py b/stream_zip.py index d64a875..ee4ab85 100644 --- a/stream_zip.py +++ b/stream_zip.py @@ -2,8 +2,8 @@ from struct import Struct import zlib -_NO_COMPRESSION_32 = object() -_NO_COMPRESSION_64 = object() +_NO_COMPRESSION_BUFFERED_32 = object() +_NO_COMPRESSION_BUFFERED_64 = object() _ZIP_32 = object() _ZIP_64 = object() @@ -11,10 +11,10 @@ _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY = object() def NO_COMPRESSION_32(offset, default_get_compressobj): - return _NO_COMPRESSION_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None + return _NO_COMPRESSION_BUFFERED_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None def NO_COMPRESSION_64(offset, default_get_compressobj): - return _NO_COMPRESSION_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None + return _NO_COMPRESSION_BUFFERED_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None def ZIP_32(offset, default_get_compressobj): return _ZIP_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None @@ -406,7 +406,7 @@ def _chunks(): data_func = \ _zip_64_local_header_and_data if _method is _ZIP_64 else \ _zip_32_local_header_and_data if _method is _ZIP_32 else \ - _no_compression_64_local_header_and_data if _method is _NO_COMPRESSION_64 else \ + _no_compression_64_local_header_and_data if _method is _NO_COMPRESSION_BUFFERED_64 else \ _no_compression_32_local_header_and_data central_directory_header_entry, name_encoded, extra = yield from data_func(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, evenly_sized(chunks)) @@ -416,7 +416,7 @@ def _chunks(): zip_64_central_directory = zip_64_central_directory \ or (_auto_upgrade_central_directory 
is _AUTO_UPGRADE_CENTRAL_DIRECTORY and offset > 0xffffffff) \ or (_auto_upgrade_central_directory is _AUTO_UPGRADE_CENTRAL_DIRECTORY and len(central_directory) > 0xffff) \ - or _method in (_ZIP_64, _NO_COMPRESSION_64) + or _method in (_ZIP_64, _NO_COMPRESSION_BUFFERED_64) max_central_directory_length, max_central_directory_start_offset, max_central_directory_size = \ (0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff) if zip_64_central_directory else \ From ec9258387448c816e31c716517e4010878be4d18 Mon Sep 17 00:00:00 2001 From: Michal Charemza Date: Sun, 16 Jul 2023 11:45:06 +0100 Subject: [PATCH 3/3] feat: dynamic mode to stream write uncompressed files --- docs/exceptions.md | 12 ++++ docs/methods.md | 14 ++-- stream_zip.py | 162 +++++++++++++++++++++++++++++++++++++++++++-- test_stream_zip.py | 64 ++++++++++++++++++ 4 files changed, 243 insertions(+), 9 deletions(-) diff --git a/docs/exceptions.md b/docs/exceptions.md index e92a4e5..878b7a8 100644 --- a/docs/exceptions.md +++ b/docs/exceptions.md @@ -12,6 +12,18 @@ Exceptions raised by the source iterables are passed through the `stream_zip` fu Base class for errors relating to invalid arguments + - **ZipIntegrityError** + + An integrity check failed + + - **CRC32IntegrityError** + + The CRC32 calculated from data did not match the CRC32 passed into the method + + - **UncompressedSizeIntegrityError** + + The uncompressed size of data did not match the uncompressed size passed into the method + - **ZipOverflowError** (also inherits from the **OverflowError** built-in) The size or positions of data in the ZIP are too large to store using the requested method diff --git a/docs/methods.md b/docs/methods.md index 29c7830..2268df0 100644 --- a/docs/methods.md +++ b/docs/methods.md @@ -3,26 +3,30 @@ Each member file of the ZIP is compressed with one following methods. 
-## ZIP_32, NO_COMPRESSION_32 +## ZIP_32, NO_COMPRESSION_32, NO_COMPRESSION_32(uncompressed_size, crc_32) These methods are the historical standard methods for ZIP files. `ZIP_32` compresses the file by default, but it is affected the `get_compressobj` parameter to `stream_unzip`. For example, by passing `get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=0)`, the `level=0` part would result in this file not being compressed. Its size would increase slightly due to overhead of the underlying algorithm. -`NO_COMPRESSION_32` does not compress the member file, and is not affected by the `get_compressobj` parameter to `stream_unzip`. However, its entire contents is buffered in memory before output begins, and so should not be used for large files. It size does not increase - in the final ZIP file the contents of each `NO_COMPRESSION_32` member file is present byte-for-byte. +Both `NO_COMPRESSION_32` and `NO_COMPRESSION_32(uncompressed_size, crc_32)` store the contents of the file in the ZIP uncompressed exactly as supplied, and are not affected by the `get_compressobj` parameter to `stream_unzip`. -Each member file is limited to 4GiB (gibibyte). This limitation is on the uncompressed size of the data, and (if `ZIP_32`) the compressed size of the data, and how far the start of the member file is from the beginning in the final ZIP file. A `ZIP_32` or `NO_COMPRESSION_32` file can also not be later than the 65,535th member file in a ZIP. If a file only has `ZIP_32` or `NO_COMPRESSION_32` members, the entire file is a Zip32 file, and end of the final member file must be less than 4GiB from the beginning of the final ZIP. If these limits are breached, a `ZipOverflowError` will be raised. +For `NO_COMPRESSION_32` the entire contents are buffered in memory before output begins, and so should not be used for large files. 
For `NO_COMPRESSION_32(uncompressed_size, crc_32)` the contents are streamed, but at the price of having to determine the uncompressed size and CRC 32 of the contents beforehand. These limitations, although awkward when writing the ZIP, allow the ZIP file to be read in a streaming way. + +Each member file using one of these methods is limited to 4GiB (gibibyte). This limitation is on the uncompressed size of the data, and (if `ZIP_32`) the compressed size of the data, and how far the start of the member file is from the beginning in the final ZIP file. Also, each member file cannot be later than the 65,535th member file in a ZIP. If a file has only these members, the entire file is a Zip32 file, and the end of the final member file must be less than 4GiB from the beginning of the final ZIP. If these limits are breached, a `ZipOverflowError` will be raised. This has very high support. You can usually assume anything that can open a ZIP file can open ZIP files with only `ZIP_32` or `NO_COMPRESSION_32` members. -## ZIP_64, NO_COMPRESSION_64 +## ZIP_64, NO_COMPRESSION_64, NO_COMPRESSION_64(uncompressed_size, crc_32) These methods use the Zip64 extension to the original ZIP format. `ZIP_64` compresses the file by default, but it is affected the `get_compressobj` parameter to `stream_unzip`. For example, by passing `get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=0)`, the `level=0` part would result in this file not being compressed. However, its size would increase slightly due to overhead of the underlying algorithm. -`NO_COMPRESSION_64` does not compress the member file, and is not affected by the `get_compressobj` parameter to `stream_unzip`. However, its entire contents is buffered in memory before output begins, and so should not be used for large files. It size does not increase - in the final ZIP file the contents of each `NO_COMPRESSION_32` member file is present byte-for-byte. 
+Both `NO_COMPRESSION_64` and `NO_COMPRESSION_64(uncompressed_size, crc_32)` store the contents of the file in the ZIP uncompressed exactly as supplied, and are not affected by the `get_compressobj` parameter to `stream_unzip`. + +For `NO_COMPRESSION_64` the entire contents are buffered in memory before output begins, and so should not be used for large files. For `NO_COMPRESSION_64(uncompressed_size, crc_32)` the contents are streamed, but at the price of having to determine the uncompressed size and CRC 32 of the contents beforehand. These limitations, although awkward when writing the ZIP, allow the ZIP file to be read in a streaming way. Each member file is limited to 16EiB (exbibyte). This limitation is on the uncompressed size of the data, and (if `ZIP_64`) the compressed size of the data, and how far the member starts from the beginning in the final ZIP file. If these limits are breached, a `ZipOverflowError` will be raised. diff --git a/stream_zip.py b/stream_zip.py index ee4ab85..ccdf611 100644 --- a/stream_zip.py +++ b/stream_zip.py @@ -2,20 +2,42 @@ from struct import Struct import zlib +# Private methods + _NO_COMPRESSION_BUFFERED_32 = object() _NO_COMPRESSION_BUFFERED_64 = object() +_NO_COMPRESSION_STREAMED_32 = object() +_NO_COMPRESSION_STREAMED_64 = object() _ZIP_32 = object() _ZIP_64 = object() _AUTO_UPGRADE_CENTRAL_DIRECTORY = object() _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY = object() -def NO_COMPRESSION_32(offset, default_get_compressobj): +def __NO_COMPRESSION_BUFFERED_32(offset, default_get_compressobj): return _NO_COMPRESSION_BUFFERED_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None -def NO_COMPRESSION_64(offset, default_get_compressobj): +def __NO_COMPRESSION_BUFFERED_64(offset, default_get_compressobj): return _NO_COMPRESSION_BUFFERED_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None +def __NO_COMPRESSION_STREAMED_32(uncompressed_size, crc_32): + def method_compressobj(offset, 
default_get_compressobj): + return _NO_COMPRESSION_STREAMED_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, uncompressed_size, crc_32 + return method_compressobj + +def __NO_COMPRESSION_STREAMED_64(uncompressed_size, crc_32): + def method_compressobj(offset, default_get_compressobj): + return _NO_COMPRESSION_STREAMED_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, uncompressed_size, crc_32 + return method_compressobj + +# Public methods + +def NO_COMPRESSION_32(uncompressed_size, crc_32): + return __NO_COMPRESSION_STREAMED_32(uncompressed_size, crc_32) + +def NO_COMPRESSION_64(uncompressed_size, crc_32): + return __NO_COMPRESSION_STREAMED_64(uncompressed_size, crc_32) + def ZIP_32(offset, default_get_compressobj): return _ZIP_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None @@ -379,7 +401,125 @@ def _chunks(): return chunks, size, crc_32 + def _no_compression_streamed_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + file_offset = offset + + _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) + + extra = zip_64_local_extra_struct.pack( + zip_64_extra_signature, + 16, # Size of extra + uncompressed_size, # Uncompressed + uncompressed_size, # Compressed + ) + mod_at_unix_extra + yield from _(local_header_signature) + yield from _(local_header_struct.pack( + 45, # Version + b'\x00\x08', # Flags - utf-8 file names + 0, # Compression - no compression + mod_at_ms_dos, + crc_32, + 0xffffffff, # Compressed size - since zip64 + 0xffffffff, # Uncompressed size - since zip64 + len(name_encoded), + len(extra), + )) + yield from _(name_encoded) + yield from _(extra) + + yield from _no_compression_streamed_data(chunks, uncompressed_size, crc_32, 0xffffffffffffffff) + + extra = zip_64_central_directory_extra_struct.pack( + zip_64_extra_signature, + 24, # Size of extra + 
uncompressed_size, # Uncompressed + uncompressed_size, # Compressed + file_offset, + ) + mod_at_unix_extra + return central_directory_header_struct.pack( + 45, # Version made by + 3, # System made by (UNIX) + 45, # Version required + 0, # Reserved + b'\x00\x08', # Flags - utf-8 file names + 0, # Compression - none + mod_at_ms_dos, + crc_32, + 0xffffffff, # Compressed size - since zip64 + 0xffffffff, # Uncompressed size - since zip64 + len(name_encoded), + len(extra), + 0, # File comment length + 0, # Disk number + 0, # Internal file attributes - is binary + external_attr, + 0xffffffff, # File offset - since zip64 + ), name_encoded, extra + + + def _no_compression_streamed_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + file_offset = offset + + _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) + + extra = mod_at_unix_extra + yield from _(local_header_signature) + yield from _(local_header_struct.pack( + 20, # Version + b'\x00\x08', # Flags - utf-8 file names + 0, # Compression - no compression + mod_at_ms_dos, + crc_32, + uncompressed_size, # Compressed + uncompressed_size, # Uncompressed + len(name_encoded), + len(extra), + )) + yield from _(name_encoded) + yield from _(extra) + + yield from _no_compression_streamed_data(chunks, uncompressed_size, crc_32, 0xffffffff) + + return central_directory_header_struct.pack( + 20, # Version made by + 3, # System made by (UNIX) + 20, # Version required + 0, # Reserved + b'\x00\x08', # Flags - utf-8 file names + 0, # Compression - none + mod_at_ms_dos, + crc_32, + uncompressed_size, # Compressed + uncompressed_size, # Uncompressed + len(name_encoded), + len(extra), + 0, # File comment length + 0, # Disk number + 0, # Internal file attributes - is binary + external_attr, + file_offset, + ), name_encoded, extra + + def _no_compression_streamed_data(chunks, uncompressed_size, crc_32, maximum_size): 
+ actual_crc_32 = zlib.crc32(b'') + size = 0 + for chunk in chunks: + actual_crc_32 = zlib.crc32(chunk, actual_crc_32) + size += len(chunk) + _raise_if_beyond(size, maximum=maximum_size, exception_class=UncompressedSizeOverflowError) + yield from _(chunk) + + if actual_crc_32 != crc_32: + raise CRC32IntegrityError() + + if size != uncompressed_size: + raise UncompressedSizeIntegrityError() + for name, modified_at, mode, method, chunks in files: + method = \ + __NO_COMPRESSION_BUFFERED_32 if method is NO_COMPRESSION_32 else \ + __NO_COMPRESSION_BUFFERED_64 if method is NO_COMPRESSION_64 else \ + method _method, _auto_upgrade_central_directory, _get_compress_obj, uncompressed_size, crc_32 = method(offset, get_compressobj) name_encoded = name.encode('utf-8') @@ -407,7 +547,9 @@ def _chunks(): _zip_64_local_header_and_data if _method is _ZIP_64 else \ _zip_32_local_header_and_data if _method is _ZIP_32 else \ _no_compression_64_local_header_and_data if _method is _NO_COMPRESSION_BUFFERED_64 else \ - _no_compression_32_local_header_and_data + _no_compression_32_local_header_and_data if _method is _NO_COMPRESSION_BUFFERED_32 else \ + _no_compression_streamed_64_local_header_and_data if _method is _NO_COMPRESSION_STREAMED_64 else \ + _no_compression_streamed_32_local_header_and_data central_directory_header_entry, name_encoded, extra = yield from data_func(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, evenly_sized(chunks)) central_directory_size += len(central_directory_header_signature) + len(central_directory_header_entry) + len(name_encoded) + len(extra) @@ -416,7 +558,7 @@ def _chunks(): zip_64_central_directory = zip_64_central_directory \ or (_auto_upgrade_central_directory is _AUTO_UPGRADE_CENTRAL_DIRECTORY and offset > 0xffffffff) \ or (_auto_upgrade_central_directory is _AUTO_UPGRADE_CENTRAL_DIRECTORY and len(central_directory) > 0xffff) \ - or _method in (_ZIP_64, _NO_COMPRESSION_BUFFERED_64) + or 
_method in (_ZIP_64, _NO_COMPRESSION_BUFFERED_64, _NO_COMPRESSION_STREAMED_64) max_central_directory_length, max_central_directory_start_offset, max_central_directory_size = \ (0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff) if zip_64_central_directory else \ @@ -491,6 +633,18 @@ class ZipValueError(ZipError, ValueError): pass +class ZipIntegrityError(ZipValueError): + pass + + +class CRC32IntegrityError(ZipIntegrityError): + pass + + +class UncompressedSizeIntegrityError(ZipIntegrityError): + pass + + class ZipOverflowError(ZipValueError, OverflowError): pass diff --git a/test_stream_zip.py b/test_stream_zip.py index ee12ab2..f332a05 100644 --- a/test_stream_zip.py +++ b/test_stream_zip.py @@ -4,6 +4,7 @@ import os import stat import subprocess +import zlib from tempfile import TemporaryDirectory from zipfile import ZipFile @@ -17,6 +18,8 @@ ZIP_AUTO, ZIP_64, ZIP_32, + CRC32IntegrityError, + UncompressedSizeIntegrityError, CompressedSizeOverflowError, UncompressedSizeOverflowError, OffsetOverflowError, @@ -100,6 +103,67 @@ def files(): ] +@pytest.mark.parametrize( + "method", + [ + NO_COMPRESSION_32, + NO_COMPRESSION_64, + ], +) +def test_with_stream_unzip_with_no_compresion_known_crc_32(method): + now = datetime.fromisoformat('2021-01-01 21:01:12') + mode = stat.S_IFREG | 0o600 + + def files(): + yield 'file-1', now, mode, method(20000, zlib.crc32(b'a' * 10000 + b'b' * 10000)), (b'a' * 10000, b'b' * 10000) + yield 'file-2', now, mode, method(2, zlib.crc32(b'c' + b'd')), (b'c', b'd') + + assert [(b'file-1', 20000, b'a' * 10000 + b'b' * 10000), (b'file-2', 2, b'cd')] == [ + (name, size, b''.join(chunks)) + for name, size, chunks in stream_unzip(stream_zip(files())) + ] + + +@pytest.mark.parametrize( + "method", + [ + NO_COMPRESSION_32, + NO_COMPRESSION_64, + ], +) +def test_with_stream_unzip_with_no_compresion_bad_crc_32(method): + now = datetime.fromisoformat('2021-01-01 21:01:12') + mode = stat.S_IFREG | 0o600 + + def files(): + yield 'file-1', now, 
mode, method(20000, zlib.crc32(b'a' * 10000 + b'b' * 10000)), (b'a' * 10000, b'b' * 10000) + yield 'file-1', now, mode, method(1, zlib.crc32(b'')), (b'a',) + + with pytest.raises(CRC32IntegrityError): + for name, size, chunks in stream_unzip(stream_zip(files())): + pass + + +@pytest.mark.parametrize( + "method", + [ + NO_COMPRESSION_32, + NO_COMPRESSION_64, + ], +) +def test_with_stream_unzip_with_no_compresion_bad_size(method): + now = datetime.fromisoformat('2021-01-01 21:01:12') + mode = stat.S_IFREG | 0o600 + + def files(): + yield 'file-1', now, mode, method(20000, zlib.crc32(b'a' * 10000 + b'b' * 10000)), (b'a' * 10000, b'b' * 10000) + yield 'file-1', now, mode, method(1, zlib.crc32(b'')), (b'',) + + with pytest.raises(UncompressedSizeIntegrityError): + for name, size, chunks in stream_unzip(stream_zip(files())): + pass + + def test_with_stream_unzip_auto_small(): now = datetime.fromisoformat('2021-01-01 21:01:12') mode = stat.S_IFREG | 0o600