From c64d926b9f293c7ea67fe090b707f2817150d9cc Mon Sep 17 00:00:00 2001 From: Michal Charemza Date: Thu, 4 Jan 2024 13:06:38 +0000 Subject: [PATCH 1/4] ci: bump version of pytest Without this for some reason the following error can happen when we add other dependencies in later commits: > ImportError: cannot import name 'Config' from 'pytest' (Specifically the dependencies of pycryptodome and pyzipper) Have to admit I'm not really sure why, and haven't spent that long trying to work out why, but I think it's reasonable to use a more recent version even without this issue and not really worth the time investigating it. We're not using the most recent version of pytest because I don't think it can be installed with Python 3.6.7 that we're still testing on. Would like to maintain support for older Python for as long as possible to not cause annoyances for people that are restricted to older Python but want the latest stream-zip. --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1cf5107..06695e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,13 +20,13 @@ classifiers = [ [project.optional-dependencies] dev = [ "coverage>=6.2", - "pytest>=6.2.5", + "pytest>=7.0.1", "pytest-cov>=3.0.0", "stream-unzip>=0.0.86" ] ci = [ "coverage==6.2", - "pytest==6.2.5", + "pytest==7.0.1", "pytest-cov==3.0.0", "stream-unzip==0.0.86" ] From f1c008b74ce2319aec19e942b983087911e2aaba Mon Sep 17 00:00:00 2001 From: Michal Charemza Date: Thu, 4 Jan 2024 13:46:44 +0000 Subject: [PATCH 2/4] refactor: flags defined for each mode in single place This reduces a little bit of duplication for each mode - the flags are now defined in a single line. Also, the flags are defined as a 2-byte unsigned integer rather than a 2-byte bytes array. These changes allow for the flags to be more easily dynamic for each mode - set in a single place and then use binary or | to combine flags. 
This will be useful in the upcoming change that adds AES encryption for each mode that will require the first bit of the flags to be set if encryption is enabled. --- stream_zip.py | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/stream_zip.py b/stream_zip.py index ccdf611..aa46ddb 100644 --- a/stream_zip.py +++ b/stream_zip.py @@ -94,14 +94,14 @@ def up_to(num): def get_zipped_chunks_uneven(): local_header_signature = b'PK\x03\x04' - local_header_struct = Struct(' Date: Thu, 4 Jan 2024 14:55:51 +0000 Subject: [PATCH 3/4] refactor: allow compression type to be injected into each mode function This reduces a bit of duplication, and more easily allows the compression type of "99" to be set in a later commit, which is one of the things that indicates that AES encryption is used. --- stream_zip.py | 52 +++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/stream_zip.py b/stream_zip.py index aa46ddb..0d9c118 100644 --- a/stream_zip.py +++ b/stream_zip.py @@ -139,7 +139,7 @@ def _raise_if_beyond(offset, maximum, exception_class): if offset > maximum: raise exception_class() - def _zip_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _zip_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) @@ -156,7 +156,7 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra yield from _(local_header_struct.pack( 45, # Version flags, - 8, # Compression - deflate + compression, mod_at_ms_dos, 0, # CRC32 - 0 since data descriptor 0xffffffff, # Compressed size - since zip64 @@ -190,7 +190,7 @@ def 
_zip_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra 45, # Version required 0, # Reserved flags, - 8, # Compression - deflate + compression, mod_at_ms_dos, crc_32, 0xffffffff, # Compressed size - since zip64 @@ -204,7 +204,7 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra 0xffffffff, # Offset of local header - since zip64 ), name_encoded, extra - def _zip_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _zip_32_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) @@ -216,7 +216,7 @@ def _zip_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra yield from _(local_header_struct.pack( 20, # Version flags, - 8, # Compression - deflate + compression, mod_at_ms_dos, 0, # CRC32 - 0 since data descriptor 0, # Compressed size - 0 since data descriptor @@ -243,7 +243,7 @@ def _zip_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra 20, # Version required 0, # Reserved flags, - 8, # Compression - deflate + compression, mod_at_ms_dos, crc_32, compressed_size, @@ -284,7 +284,7 @@ def _zip_data(chunks, _get_compress_obj, max_uncompressed_size, max_compressed_s return uncompressed_size, compressed_size, crc_32 - def _no_compression_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _no_compression_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) @@ -303,7 +303,7 @@ 
def _no_compression_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at yield from _(local_header_struct.pack( 45, # Version flags, - 0, # Compression - no compression + compression, mod_at_ms_dos, crc_32, 0xffffffff, # Compressed size - since zip64 @@ -330,7 +330,7 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at 45, # Version required 0, # Reserved flags, - 0, # Compression - none + compression, mod_at_ms_dos, crc_32, 0xffffffff, # Compressed size - since zip64 @@ -345,7 +345,7 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at ), name_encoded, extra - def _no_compression_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _no_compression_32_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) @@ -359,7 +359,7 @@ def _no_compression_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at yield from _(local_header_struct.pack( 20, # Version flags, - 0, # Compression - no compression + compression, mod_at_ms_dos, crc_32, size, # Compressed @@ -379,7 +379,7 @@ def _no_compression_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at 20, # Version required 0, # Reserved flags, - 0, # Compression - none + compression, mod_at_ms_dos, crc_32, size, # Compressed @@ -412,7 +412,7 @@ def _chunks(): return chunks, size, crc_32 - def _no_compression_streamed_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _no_compression_streamed_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): 
file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) @@ -429,7 +429,7 @@ def _no_compression_streamed_64_local_header_and_data(name_encoded, mod_at_ms_do yield from _(local_header_struct.pack( 45, # Version flags, - 0, # Compression - no compression + compression, mod_at_ms_dos, crc_32, 0xffffffff, # Compressed size - since zip64 @@ -455,7 +455,7 @@ def _no_compression_streamed_64_local_header_and_data(name_encoded, mod_at_ms_do 45, # Version required 0, # Reserved flags, - 0, # Compression - none + compression, mod_at_ms_dos, crc_32, 0xffffffff, # Compressed size - since zip64 @@ -470,7 +470,7 @@ def _no_compression_streamed_64_local_header_and_data(name_encoded, mod_at_ms_do ), name_encoded, extra - def _no_compression_streamed_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _no_compression_streamed_32_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) @@ -482,7 +482,7 @@ def _no_compression_streamed_32_local_header_and_data(name_encoded, mod_at_ms_do yield from _(local_header_struct.pack( 20, # Version flags, - 0, # Compression - no compression + compression, mod_at_ms_dos, crc_32, uncompressed_size, # Compressed @@ -501,7 +501,7 @@ def _no_compression_streamed_32_local_header_and_data(name_encoded, mod_at_ms_do 20, # Version required 0, # Reserved flags, - 0, # Compression - none + compression, mod_at_ms_dos, crc_32, uncompressed_size, # Compressed @@ -558,15 +558,15 @@ def _no_compression_streamed_data(chunks, uncompressed_size, crc_32, maximum_siz (mode << 16) | \ (0x10 if name_encoded[-1:] == b'/' else 0x0) # MS-DOS directory - data_func = \ - _zip_64_local_header_and_data if _method is 
_ZIP_64 else \ - _zip_32_local_header_and_data if _method is _ZIP_32 else \ - _no_compression_64_local_header_and_data if _method is _NO_COMPRESSION_BUFFERED_64 else \ - _no_compression_32_local_header_and_data if _method is _NO_COMPRESSION_BUFFERED_32 else \ - _no_compression_streamed_64_local_header_and_data if _method is _NO_COMPRESSION_STREAMED_64 else \ - _no_compression_streamed_32_local_header_and_data + data_func, compression = \ + (_zip_64_local_header_and_data, 8) if _method is _ZIP_64 else \ + (_zip_32_local_header_and_data, 8) if _method is _ZIP_32 else \ + (_no_compression_64_local_header_and_data, 0) if _method is _NO_COMPRESSION_BUFFERED_64 else \ + (_no_compression_32_local_header_and_data, 0) if _method is _NO_COMPRESSION_BUFFERED_32 else \ + (_no_compression_streamed_64_local_header_and_data, 0) if _method is _NO_COMPRESSION_STREAMED_64 else \ + (_no_compression_streamed_32_local_header_and_data, 0) - central_directory_header_entry, name_encoded, extra = yield from data_func(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, evenly_sized(chunks)) + central_directory_header_entry, name_encoded, extra = yield from data_func(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, evenly_sized(chunks)) central_directory_size += len(central_directory_header_signature) + len(central_directory_header_entry) + len(name_encoded) + len(extra) central_directory.append((central_directory_header_entry, name_encoded, extra)) From 28287a3343a7b821c838a29d6d5bed39877fe649 Mon Sep 17 00:00:00 2001 From: Michal Charemza Date: Wed, 3 Jan 2024 16:30:29 +0000 Subject: [PATCH 4/4] feat: AES-2 encryption This adds AES-2 encryption as requested/discussed in https://github.com/uktrade/stream-zip/issues/93 and defined at https://www.winzip.com/en/support/aes-encryption/ For now, AES-2 is used over AES-1 to prevent leakage of information via CRC-32 for 
small files, at the price of not having a checksum on the uncompressed plain text data (although there is an HMAC check on the encrypted compressed data as part of AES-2). In a later change, we should be able to make it AES-1 for larger files as recommended at https://www.winzip.com/en/support/aes-encryption/, but not doing this now to keep this change reasonably small. --- .github/workflows/test.yml | 11 ++ README.md | 2 + docs/features.md | 2 + docs/get-started.md | 17 +++ pyproject.toml | 10 +- stream_zip.py | 209 +++++++++++++++++++++++++------------ test_stream_zip.py | 193 +++++++++++++++++++++++++++++++++- 7 files changed, 374 insertions(+), 70 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e0442ad..92bbc97 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,6 +34,17 @@ jobs: - name: "Install bsdcpio" run: | ./install-libarachive.sh + - name: "Install 7z" + run: | + mkdir bin + ( + cd ./bin + wget https://www.7-zip.org/a/7z2301-linux-x64.tar.xz + echo "23babcab045b78016e443f862363e4ab63c77d75bc715c0b3463f6134cbcf318 7z2301-linux-x64.tar.xz" | sha256sum --check + tar -xJf ./7z2301-linux-x64.tar.xz 7zz + rm 7z2301-linux-x64.tar.xz + echo "$PWD" >> $GITHUB_PATH + ) - name: "Install python dependencies" run: | pip install ".[ci]" diff --git a/README.md b/README.md index c95cf77..9f17872 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,8 @@ In addition to being memory efficient (with some [limitations](https://stream-zi - Can construct ZIP files that contain directories, including empty directories +- Can construct password protected/encrypted ZIP files adhering to the [WinZip AE-2 specification](https://www.winzip.com/en/support/aes-encryption/). + - Allows the specification of permissions on the member files and directories (although not all clients respect them) - By default stores modification time as an extended timestamp. 
An extended timestamp is a more accurate timestamp than the original ZIP format allows diff --git a/docs/features.md b/docs/features.md index b6da671..aedce93 100644 --- a/docs/features.md +++ b/docs/features.md @@ -14,6 +14,8 @@ In addition to being memory efficient (with some [limitations](/get-started/#lim - Can construct ZIP files that contain directories, including empty directories +- Can construct password protected/encrypted ZIP files adhering to the [WinZip AE-2 specification](https://www.winzip.com/en/support/aes-encryption/). + - Allows the specification of permissions on the member files and directories (although not all clients respect them) - By default stores modification time as an extended timestamp. An extended timestamp is a more accurate timestamp than the original ZIP format allows diff --git a/docs/get-started.md b/docs/get-started.md index c0cea13..4cf7468 100644 --- a/docs/get-started.md +++ b/docs/get-started.md @@ -147,6 +147,23 @@ The `stat.S_IFDIR` on the file is technically optional, but is probably good pra It is not required to have a directory member file in order to have files in that directory. So this pattern is most useful to have empty directories in the ZIP. +## Password + +The data of ZIP files can be password protected by passing a password as the `password` parameter to `stream_zip`. + +```python +password_protected_zipped_chunks = stream_zip(member_files(), password='my-password') +``` + +Note: + +1. This encrypts the data with AES-256, adhering to the [WinZip AE-2 specification](https://www.winzip.com/en/support/aes-encryption/). + +2. This is seen as more secure than ZipCrypto, the original mechanism of password protecting ZIP files, but fewer clients can open such ZIP files. + +3. While a step forward from ZipCrypto, it has flaws that you should be aware of before using it. See ["Attacking and Repairing the WinZip Encryption Scheme" by Tadayoshi Kohno](https://homes.cs.washington.edu/~yoshi/papers/WinZip/winzip.pdf). 
+ + ## Methods Each member file is compressed with a method that must be specified in client code. See [Methods](/methods/) for an explanation of each. diff --git a/pyproject.toml b/pyproject.toml index 06695e9..754aff7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,19 +16,25 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Topic :: System :: Archiving :: Compression", ] +dependencies = [ + "pycryptodome>=3.10.1", +] [project.optional-dependencies] dev = [ "coverage>=6.2", "pytest>=7.0.1", "pytest-cov>=3.0.0", - "stream-unzip>=0.0.86" + "stream-unzip>=0.0.86", + "pyzipper>=0.3.6", ] ci = [ + "pycryptodome==3.10.1", "coverage==6.2", "pytest==7.0.1", "pytest-cov==3.0.0", - "stream-unzip==0.0.86" + "stream-unzip==0.0.86", + "pyzipper==0.3.6", ] [project.urls] diff --git a/stream_zip.py b/stream_zip.py index 0d9c118..8ca9265 100644 --- a/stream_zip.py +++ b/stream_zip.py @@ -1,7 +1,13 @@ from collections import deque from struct import Struct +import secrets import zlib +from Crypto.Cipher import AES +from Crypto.Hash import HMAC, SHA1 +from Crypto.Util import Counter +from Crypto.Protocol.KDF import PBKDF2 + # Private methods _NO_COMPRESSION_BUFFERED_32 = object() @@ -63,7 +69,12 @@ def method_compressobj(offset, default_get_compressobj): return method_compressobj -def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9), extended_timestamps=True): +def stream_zip(files, chunk_size=65536, + get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9), + extended_timestamps=True, + password=None, + get_crypto_random=lambda num_bytes: secrets.token_bytes(num_bytes), +): def evenly_sized(chunks): chunk = b'' @@ -119,8 +130,12 @@ def get_zipped_chunks_uneven(): mod_at_unix_extra_signature = b'UT' mod_at_unix_extra_struct = Struct('<2sH1sl') + aes_extra_signature = b'\x01\x99' + aes_extra_struct = Struct('<2sHH2sBH') + modified_at_struct = Struct(' maximum: raise exception_class() - 
def _zip_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _with_returned(gen): + # We leverage the not-often used "return value" of generators. Here, we want to iterate + # over chunks (to encrypt them), but still return the same "return value". So we use a + # bit of a trick to extract the return value but still have access to the chunks as + # we iterate over them + + return_value = None + def with_return_value(): + nonlocal return_value + return_value = yield from gen + + return ((lambda: return_value), with_return_value()) + + def _encrypt_dummy(chunks): + get_return_value, chunks_with_return = _with_returned(chunks) + for chunk in chunks_with_return: + yield from _(chunk) + return get_return_value() + + def _encrypt_aes(chunks): + key_length = 32 + salt_length = 16 + password_verification_length = 2 + + salt = get_crypto_random(salt_length) + yield from _(salt) + + keys = PBKDF2(password, salt, 2 * key_length + password_verification_length, 1000) + yield from _(keys[-password_verification_length:]) + + encrypter = AES.new( + keys[:key_length], AES.MODE_CTR, + counter=Counter.new(nbits=128, little_endian=True), + ) + hmac = HMAC.new(keys[key_length:key_length*2], digestmod=SHA1) + + get_return_value, chunks_with_return = _with_returned(chunks) + for chunk in chunks_with_return: + encrypted_chunk = encrypter.encrypt(chunk) + hmac.update(encrypted_chunk) + yield from _(encrypted_chunk) + + yield from _(hmac.digest()[:10]) + + return get_return_value() + + def _zip_64_local_header_and_data(compression, aes_size_increase, aes_flags, name_encoded, mod_at_ms_dos, mod_at_unix_extra, aes_extra, external_attr, uncompressed_size, crc_32, crc_32_mask, _get_compress_obj, encryption_func, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) @@ -149,8 +210,8 @@ def 
_zip_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_ 16, # Size of extra 0, # Uncompressed size - since data descriptor 0, # Compressed size - since data descriptor - ) + mod_at_unix_extra - flags = data_descriptor_flag | utf8_flag + ) + mod_at_unix_extra + aes_extra + flags = aes_flags | data_descriptor_flag | utf8_flag yield from _(local_header_signature) yield from _(local_header_struct.pack( @@ -167,15 +228,17 @@ def _zip_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_ yield from _(name_encoded) yield from _(extra) - uncompressed_size, compressed_size, crc_32 = yield from _zip_data( + uncompressed_size, raw_compressed_size, crc_32 = yield from encryption_func(_zip_data( chunks, _get_compress_obj, max_uncompressed_size=0xffffffffffffffff, max_compressed_size=0xffffffffffffffff, - ) + )) + compressed_size = raw_compressed_size + aes_size_increase + masked_crc_32 = crc_32 & crc_32_mask yield from _(data_descriptor_signature) - yield from _(data_descriptor_zip_64_struct.pack(crc_32, compressed_size, uncompressed_size)) + yield from _(data_descriptor_zip_64_struct.pack(masked_crc_32, compressed_size, uncompressed_size)) extra = zip_64_central_directory_extra_struct.pack( zip_64_extra_signature, @@ -183,7 +246,7 @@ def _zip_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_ uncompressed_size, compressed_size, file_offset, - ) + mod_at_unix_extra + ) + mod_at_unix_extra + aes_extra return central_directory_header_struct.pack( 45, # Version made by 3, # System made by (UNIX) @@ -192,7 +255,7 @@ def _zip_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_ flags, compression, mod_at_ms_dos, - crc_32, + masked_crc_32, 0xffffffff, # Compressed size - since zip64 0xffffffff, # Uncompressed size - since zip64 len(name_encoded), @@ -204,13 +267,13 @@ def _zip_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_ 0xffffffff, # Offset of local header - since zip64 ), 
name_encoded, extra - def _zip_32_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _zip_32_local_header_and_data(compression, aes_size_increase, aes_flags, name_encoded, mod_at_ms_dos, mod_at_unix_extra, aes_extra, external_attr, uncompressed_size, crc_32, crc_32_mask, _get_compress_obj, encryption_func, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) - extra = mod_at_unix_extra - flags = data_descriptor_flag | utf8_flag + extra = mod_at_unix_extra + aes_extra + flags = aes_flags | data_descriptor_flag | utf8_flag yield from _(local_header_signature) yield from _(local_header_struct.pack( @@ -227,15 +290,17 @@ def _zip_32_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_ yield from _(name_encoded) yield from _(extra) - uncompressed_size, compressed_size, crc_32 = yield from _zip_data( + uncompressed_size, raw_compressed_size, crc_32 = yield from encryption_func(_zip_data( chunks, _get_compress_obj, max_uncompressed_size=0xffffffff, max_compressed_size=0xffffffff, - ) + )) + compressed_size = raw_compressed_size + aes_size_increase + masked_crc_32 = crc_32 & crc_32_mask yield from _(data_descriptor_signature) - yield from _(data_descriptor_zip_32_struct.pack(crc_32, compressed_size, uncompressed_size)) + yield from _(data_descriptor_zip_32_struct.pack(masked_crc_32, compressed_size, uncompressed_size)) return central_directory_header_struct.pack( 20, # Version made by @@ -245,7 +310,7 @@ def _zip_32_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_ flags, compression, mod_at_ms_dos, - crc_32, + masked_crc_32, compressed_size, uncompressed_size, len(name_encoded), @@ -273,31 +338,33 @@ def _zip_data(chunks, _get_compress_obj, max_uncompressed_size, max_compressed_s _raise_if_beyond(compressed_size, maximum=max_compressed_size, 
exception_class=CompressedSizeOverflowError) - yield from _(compressed_chunk) + yield compressed_chunk compressed_chunk = compress_obj.flush() compressed_size += len(compressed_chunk) _raise_if_beyond(compressed_size, maximum=max_compressed_size, exception_class=CompressedSizeOverflowError) - yield from _(compressed_chunk) + yield compressed_chunk return uncompressed_size, compressed_size, crc_32 - def _no_compression_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _no_compression_64_local_header_and_data(compression, aes_size_increase, aes_flags, name_encoded, mod_at_ms_dos, mod_at_unix_extra, aes_extra, external_attr, uncompressed_size, crc_32, crc_32_mask, _get_compress_obj, encryption_func, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) - chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffffffffffff) + chunks, uncompressed_size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffffffffffff) + compressed_size = uncompressed_size + aes_size_increase extra = zip_64_local_extra_struct.pack( zip_64_extra_signature, 16, # Size of extra - size, # Uncompressed - size, # Compressed - ) + mod_at_unix_extra - flags = utf8_flag + uncompressed_size, + compressed_size, + ) + mod_at_unix_extra + aes_extra + flags = aes_flags | utf8_flag + masked_crc_32 = crc_32 & crc_32_mask yield from _(local_header_signature) yield from _(local_header_struct.pack( @@ -305,7 +372,7 @@ def _no_compression_64_local_header_and_data(compression, name_encoded, mod_at_m flags, compression, mod_at_ms_dos, - crc_32, + masked_crc_32, 0xffffffff, # Compressed size - since zip64 0xffffffff, # Uncompressed size - since zip64 len(name_encoded), @@ -314,16 +381,15 @@ def _no_compression_64_local_header_and_data(compression, name_encoded, mod_at_m 
yield from _(name_encoded) yield from _(extra) - for chunk in chunks: - yield from _(chunk) + yield from encryption_func(chunks) extra = zip_64_central_directory_extra_struct.pack( zip_64_extra_signature, 24, # Size of extra - size, # Uncompressed - size, # Compressed + uncompressed_size, + compressed_size, file_offset, - ) + mod_at_unix_extra + ) + mod_at_unix_extra + aes_extra return central_directory_header_struct.pack( 45, # Version made by 3, # System made by (UNIX) @@ -332,7 +398,7 @@ def _no_compression_64_local_header_and_data(compression, name_encoded, mod_at_m flags, compression, mod_at_ms_dos, - crc_32, + masked_crc_32, 0xffffffff, # Compressed size - since zip64 0xffffffff, # Uncompressed size - since zip64 len(name_encoded), @@ -345,15 +411,17 @@ def _no_compression_64_local_header_and_data(compression, name_encoded, mod_at_m ), name_encoded, extra - def _no_compression_32_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _no_compression_32_local_header_and_data(compression, aes_size_increase, aes_flags, name_encoded, mod_at_ms_dos, mod_at_unix_extra, aes_extra, external_attr, uncompressed_size, crc_32, crc_32_mask, _get_compress_obj, encryption_func, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) - chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffff) + chunks, uncompressed_size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffff) - extra = mod_at_unix_extra - flags = utf8_flag + compressed_size = uncompressed_size + aes_size_increase + extra = mod_at_unix_extra + aes_extra + flags = aes_flags | utf8_flag + masked_crc_32 = crc_32 & crc_32_mask yield from _(local_header_signature) yield from _(local_header_struct.pack( @@ -361,17 +429,16 @@ def _no_compression_32_local_header_and_data(compression, 
name_encoded, mod_at_m flags, compression, mod_at_ms_dos, - crc_32, - size, # Compressed - size, # Uncompressed + masked_crc_32, + compressed_size, + uncompressed_size, len(name_encoded), len(extra), )) yield from _(name_encoded) yield from _(extra) - for chunk in chunks: - yield from _(chunk) + yield from encryption_func(chunks) return central_directory_header_struct.pack( 20, # Version made by @@ -381,9 +448,9 @@ def _no_compression_32_local_header_and_data(compression, name_encoded, mod_at_m flags, compression, mod_at_ms_dos, - crc_32, - size, # Compressed - size, # Uncompressed + masked_crc_32, + compressed_size, + uncompressed_size, len(name_encoded), len(extra), 0, # File comment length @@ -412,18 +479,20 @@ def _chunks(): return chunks, size, crc_32 - def _no_compression_streamed_64_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _no_compression_streamed_64_local_header_and_data(compression, aes_size_increase, aes_flags, name_encoded, mod_at_ms_dos, mod_at_unix_extra, aes_extra, external_attr, uncompressed_size, crc_32, crc_32_mask, _get_compress_obj, encryption_func, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) + compressed_size = uncompressed_size + aes_size_increase extra = zip_64_local_extra_struct.pack( zip_64_extra_signature, 16, # Size of extra - uncompressed_size, # Uncompressed - uncompressed_size, # Compressed - ) + mod_at_unix_extra - flags = utf8_flag + uncompressed_size, + compressed_size, + ) + mod_at_unix_extra + aes_extra + flags = aes_flags | utf8_flag + masked_crc_32 = crc_32 & crc_32_mask yield from _(local_header_signature) yield from _(local_header_struct.pack( @@ -431,7 +500,7 @@ def _no_compression_streamed_64_local_header_and_data(compression, name_encoded, flags, compression, mod_at_ms_dos, - crc_32, + masked_crc_32, 0xffffffff, # Compressed size 
- since zip64 0xffffffff, # Uncompressed size - since zip64 len(name_encoded), @@ -440,15 +509,15 @@ def _no_compression_streamed_64_local_header_and_data(compression, name_encoded, yield from _(name_encoded) yield from _(extra) - yield from _no_compression_streamed_data(chunks, uncompressed_size, crc_32, 0xffffffffffffffff) + yield from encryption_func(_no_compression_streamed_data(chunks, uncompressed_size, crc_32, 0xffffffffffffffff)) extra = zip_64_central_directory_extra_struct.pack( zip_64_extra_signature, 24, # Size of extra - uncompressed_size, # Uncompressed - uncompressed_size, # Compressed + uncompressed_size, + compressed_size, file_offset, - ) + mod_at_unix_extra + ) + mod_at_unix_extra + aes_extra return central_directory_header_struct.pack( 45, # Version made by 3, # System made by (UNIX) @@ -457,7 +526,7 @@ def _no_compression_streamed_64_local_header_and_data(compression, name_encoded, flags, compression, mod_at_ms_dos, - crc_32, + masked_crc_32, 0xffffffff, # Compressed size - since zip64 0xffffffff, # Uncompressed size - since zip64 len(name_encoded), @@ -470,13 +539,15 @@ def _no_compression_streamed_64_local_header_and_data(compression, name_encoded, ), name_encoded, extra - def _no_compression_streamed_32_local_header_and_data(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, chunks): + def _no_compression_streamed_32_local_header_and_data(compression, aes_size_increase, aes_flags, name_encoded, mod_at_ms_dos, mod_at_unix_extra, aes_extra, external_attr, uncompressed_size, crc_32, crc_32_mask, _get_compress_obj, encryption_func, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) - extra = mod_at_unix_extra - flags = utf8_flag + compressed_size = uncompressed_size + aes_size_increase + extra = mod_at_unix_extra + aes_extra + flags = aes_flags | utf8_flag + masked_crc_32 = crc_32 & crc_32_mask yield from 
_(local_header_signature) yield from _(local_header_struct.pack( @@ -484,16 +555,16 @@ def _no_compression_streamed_32_local_header_and_data(compression, name_encoded, flags, compression, mod_at_ms_dos, - crc_32, - uncompressed_size, # Compressed - uncompressed_size, # Uncompressed + masked_crc_32, + compressed_size, + uncompressed_size, len(name_encoded), len(extra), )) yield from _(name_encoded) yield from _(extra) - yield from _no_compression_streamed_data(chunks, uncompressed_size, crc_32, 0xffffffff) + yield from encryption_func(_no_compression_streamed_data(chunks, uncompressed_size, crc_32, 0xffffffff)) return central_directory_header_struct.pack( 20, # Version made by @@ -503,9 +574,9 @@ def _no_compression_streamed_32_local_header_and_data(compression, name_encoded, flags, compression, mod_at_ms_dos, - crc_32, - uncompressed_size, # Compressed - uncompressed_size, # Uncompressed + masked_crc_32, + compressed_size, + uncompressed_size, len(name_encoded), len(extra), 0, # File comment length @@ -522,7 +593,7 @@ def _no_compression_streamed_data(chunks, uncompressed_size, crc_32, maximum_siz actual_crc_32 = zlib.crc32(chunk, actual_crc_32) size += len(chunk) _raise_if_beyond(size, maximum=maximum_size, exception_class=UncompressedSizeOverflowError) - yield from _(chunk) + yield chunk if actual_crc_32 != crc_32: raise CRC32IntegrityError() @@ -558,7 +629,7 @@ def _no_compression_streamed_data(chunks, uncompressed_size, crc_32, maximum_siz (mode << 16) | \ (0x10 if name_encoded[-1:] == b'/' else 0x0) # MS-DOS directory - data_func, compression = \ + data_func, raw_compression = \ (_zip_64_local_header_and_data, 8) if _method is _ZIP_64 else \ (_zip_32_local_header_and_data, 8) if _method is _ZIP_32 else \ (_no_compression_64_local_header_and_data, 0) if _method is _NO_COMPRESSION_BUFFERED_64 else \ @@ -566,7 +637,11 @@ def _no_compression_streamed_data(chunks, uncompressed_size, crc_32, maximum_siz (_no_compression_streamed_64_local_header_and_data, 0) if 
_method is _NO_COMPRESSION_STREAMED_64 else \ (_no_compression_streamed_32_local_header_and_data, 0) - central_directory_header_entry, name_encoded, extra = yield from data_func(compression, name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, uncompressed_size, crc_32, _get_compress_obj, evenly_sized(chunks)) + compression, aes_size_increase, aes_flags, aes_extra, crc_32_mask, encryption_func = \ + (99, 28, aes_flag, aes_extra_struct.pack(aes_extra_signature, 7, 2, b'AE', 3, raw_compression), 0, _encrypt_aes) if password is not None else \ + (raw_compression, 0, 0, b'', 0xffffffff, _encrypt_dummy) + + central_directory_header_entry, name_encoded, extra = yield from data_func(compression, aes_size_increase, aes_flags, name_encoded, mod_at_ms_dos, mod_at_unix_extra, aes_extra, external_attr, uncompressed_size, crc_32, crc_32_mask, _get_compress_obj, encryption_func, evenly_sized(chunks)) central_directory_size += len(central_directory_header_signature) + len(central_directory_header_entry) + len(name_encoded) + len(extra) central_directory.append((central_directory_header_entry, name_encoded, extra)) diff --git a/test_stream_zip.py b/test_stream_zip.py index 623197e..17dafbb 100644 --- a/test_stream_zip.py +++ b/test_stream_zip.py @@ -6,10 +6,12 @@ import subprocess import zlib from tempfile import TemporaryDirectory +from struct import Struct from zipfile import ZipFile import pytest -from stream_unzip import UnsupportedZip64Error, stream_unzip +import pyzipper +from stream_unzip import IncorrectAESPasswordError, UnsupportedZip64Error, stream_unzip from stream_zip import ( stream_zip, @@ -1082,3 +1084,192 @@ def test_unzip_modification_time_extended_timestamps_disabled(method, timezone, subprocess.run(['unzip', f'{d}/test.zip', '-d', d], env={'TZ': timezone}) assert os.path.getmtime('my_file') == expected_modified_at.timestamp() + + +@pytest.mark.parametrize( + "method", + [ + ZIP_32, + ZIP_64, + NO_COMPRESSION_64, + NO_COMPRESSION_64(18, 1571107898), + 
NO_COMPRESSION_32, + NO_COMPRESSION_32(18, 1571107898), + ], +) +def test_password_unzips_with_stream_unzip(method): + now = datetime.strptime('2021-01-01 21:01:12', '%Y-%m-%d %H:%M:%S') + mode = stat.S_IFREG | 0o600 + password = 'my-pass' + + files = ( + ('file-1', now, mode, method, (b'a' * 9, b'b' * 9)), + ) + + assert b''.join( + chunk + for _, _, chunks in stream_unzip(stream_zip(files, password=password), password=password) + for chunk in chunks + ) == b'a' * 9 + b'b' * 9 + + +@pytest.mark.parametrize( + "method", + [ + ZIP_32, + ZIP_64, + NO_COMPRESSION_64, + NO_COMPRESSION_64(18, 1571107898), + NO_COMPRESSION_32, + NO_COMPRESSION_32(18, 1571107898), + ], +) +def test_bad_password_not_unzips_with_stream_unzip(method): + now = datetime.strptime('2021-01-01 21:01:12', '%Y-%m-%d %H:%M:%S') + mode = stat.S_IFREG | 0o600 + password = 'my-pass' + + files = ( + ('file-1', now, mode, method, (b'a' * 9, b'b' * 9)), + ) + + with pytest.raises(IncorrectAESPasswordError): + list(stream_unzip(stream_zip(files, password=password), password='not')) + + +@pytest.mark.parametrize( + "method", + [ + ZIP_32, + ZIP_64, + NO_COMPRESSION_64, + NO_COMPRESSION_64(18, 1571107898), + NO_COMPRESSION_32, + NO_COMPRESSION_32(18, 1571107898), + ], +) +def test_password_unzips_with_7z(method): + now = datetime.strptime('2021-01-01 21:01:12', '%Y-%m-%d %H:%M:%S') + mode = stat.S_IFREG | 0o600 + password = 'my-pass' + + files = ( + ('file-1', now, mode, method, (b'a' * 9, b'b' * 9)), + ) + + with \ + TemporaryDirectory() as d, \ + cwd(d): \ + + with open('test.zip', 'wb') as fp: + for zipped_chunk in stream_zip(files, password=password): + fp.write(zipped_chunk) + + r = subprocess.run(['7zz', '-pmy-pass', 'e', 'test.zip']) + assert r.returncode == 0 + + for file in files: + with open(file[0], 'rb') as f: + assert f.read() == (b'a' * 9 ) + (b'b' * 9) + + +@pytest.mark.parametrize( + "method", + [ + ZIP_32, + ZIP_64, + NO_COMPRESSION_64, + NO_COMPRESSION_64(18, 1571107898), + 
NO_COMPRESSION_32, + NO_COMPRESSION_32(18, 1571107898), + ], +) +def test_password_unzips_with_pyzipper(method): + now = datetime.strptime('2021-01-01 21:01:12', '%Y-%m-%d %H:%M:%S') + mode = stat.S_IFREG | 0o600 + password = 'my-pass' + + files = ( + ('file-1', now, mode, method, (b'a' * 9, b'b' * 9)), + ) + + with \ + TemporaryDirectory() as d, \ + cwd(d): \ + + with open('test.zip', 'wb') as fp: + for zipped_chunk in stream_zip(files, password=password): + fp.write(zipped_chunk) + + with pyzipper.AESZipFile('test.zip') as zf: + zf.setpassword(password.encode()) + zf.testzip() + assert zf.read('file-1') == (b'a' * 9 ) + (b'b' * 9) + + +@pytest.mark.parametrize( + "method", + [ + ZIP_32, + ZIP_64, + NO_COMPRESSION_64, + NO_COMPRESSION_64(18, 1571107898), + NO_COMPRESSION_32, + NO_COMPRESSION_32(18, 1571107898), + ], +) +def test_password_bytes_not_deterministic(method): + now = datetime.strptime('2021-01-01 21:01:12', '%Y-%m-%d %H:%M:%S') + mode = stat.S_IFREG | 0o600 + password = 'my-pass' + + files = ( + ('file-1', now, mode, method, (b'a' * 9, b'b' * 9)), + ) + + assert b''.join(stream_zip(files, password=password)) != b''.join(stream_zip(files, password=password)) + + +@pytest.mark.parametrize( + "method", + [ + ZIP_32, + ZIP_64, + NO_COMPRESSION_64, + NO_COMPRESSION_64(18, 1571107898), + NO_COMPRESSION_32, + NO_COMPRESSION_32(18, 1571107898), + ], +) +def test_crc_32_not_in_file(method): + # AE-2 should not have the CRC_32, so we check that the CRC_32 isn't anywhere in the file. This + # is "too strong" as a check, because it could just happen to appear in the cipher text, which + # would be fine. The cipher text is by default non-deterministic due to its random salt, and + # so this could be a flaky test and fail randomly.
To make the test not flaky, we make the + # bytes of the file completely deterministic, by forcing the random numbers used to generate + # the salt to be non-random + + now = datetime.strptime('2021-01-01 21:01:12', '%Y-%m-%d %H:%M:%S') + mode = stat.S_IFREG | 0o600 + password = 'my-pass' + + files = ( + ('file-1', now, mode, method, (b'a' * 9, b'b' * 9)), + ) + crc_32 = Struct('