diff --git a/docs/advanced-usage.md b/docs/advanced-usage.md index d8e4a52..c0fb372 100644 --- a/docs/advanced-usage.md +++ b/docs/advanced-usage.md @@ -22,3 +22,15 @@ for zipped_chunk in stream_zip(unzipped_files(), chunk_size=65536): This one size is used both for input - splitting or gathering any uncompressed data into `chunk_size` bytes before attempting to compress it, and in output - splitting or gathering any compressed data into `chunk_size` bytes before returning it to client code. There may be performance differences with a different `chunk_size` values. The default chunk_size may not be optimal for your use case. + + +## Without extended timestamps + +By default so-called extended timestamps are included in the ZIP, which store the modification time of member files more accurately than the original ZIP format allows. To omit the extended timestamps, you can pass `extended_timestamps=False` to `stream_zip`. + +```python +for zipped_chunk in stream_zip(unzipped_files(), extended_timestamps=False): + print(zipped_chunk) +``` + +This is useful to keep the total number of bytes down as much as possible. This is also useful when creating Open Document files using `stream_zip`. Open Document files cannot have extended timestamps in their member files if they are to pass validation. 
diff --git a/stream_zip.py b/stream_zip.py index bbf9683..e5cce48 100644 --- a/stream_zip.py +++ b/stream_zip.py @@ -41,7 +41,7 @@ def method_compressobj(offset, default_get_compressobj): return method_compressobj -def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9)): +def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9), extended_timestamps=True): def evenly_sized(chunks): chunk = b'' @@ -94,6 +94,9 @@ def get_zipped_chunks_uneven(): zip_64_local_extra_struct = Struct('<2sHQQ') zip_64_central_directory_extra_struct = Struct('<2sHQQQ') + mod_at_unix_extra_signature = b'UT' + mod_at_unix_extra_struct = Struct('<2sH1sl') + modified_at_struct = Struct('<HH') @@ -110,7 +113,7 @@ def get_zipped_chunks_uneven(): def _raise_if_beyond(offset, maximum, exception_class): if offset > maximum: raise exception_class() - def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, chunks): + def _zip_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) @@ -121,13 +124,13 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _ 16, # Size of extra 0, # Uncompressed size - since data descriptor 0, # Compressed size - since data descriptor - ) + ) + mod_at_unix_extra yield from _(local_header_signature) yield from _(local_header_struct.pack( 45, # Version b'\x08\x08', # Flags - data descriptor and utf-8 file names 8, # Compression - deflate - mod_at_encoded, + mod_at_ms_dos, 0, # CRC32 - 0 since data descriptor 0xffffffff, # Compressed size - since zip64 0xffffffff, # Uncompressed size - since zip64 @@ -153,7 +156,7 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _ uncompressed_size, compressed_size, file_offset, - ) + ) + mod_at_unix_extra return central_directory_header_struct.pack( 45, # Version made by 3, # 
System made by (UNIX) @@ -161,7 +164,7 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _ 0, # Reserved b'\x08\x08', # Flags - data descriptor and utf-8 file names 8, # Compression - deflate - mod_at_encoded, + mod_at_ms_dos, crc_32, 0xffffffff, # Compressed size - since zip64 0xffffffff, # Uncompressed size - since zip64 @@ -174,24 +177,26 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _ 0xffffffff, # Offset of local header - since zip64 ), name_encoded, extra - def _zip_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, chunks): + def _zip_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) + extra = mod_at_unix_extra yield from _(local_header_signature) yield from _(local_header_struct.pack( 20, # Version b'\x08\x08', # Flags - data descriptor and utf-8 file names 8, # Compression - deflate - mod_at_encoded, + mod_at_ms_dos, 0, # CRC32 - 0 since data descriptor 0, # Compressed size - 0 since data descriptor 0, # Uncompressed size - 0 since data descriptor len(name_encoded), - 0, # Length of local extra + len(extra), )) yield from _(name_encoded) + yield from _(extra) uncompressed_size, compressed_size, crc_32 = yield from _zip_data( chunks, @@ -203,7 +208,6 @@ def _zip_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _ yield from _(data_descriptor_signature) yield from _(data_descriptor_zip_32_struct.pack(crc_32, compressed_size, uncompressed_size)) - extra = b'' return central_directory_header_struct.pack( 20, # Version made by 3, # System made by (UNIX) @@ -211,7 +215,7 @@ def _zip_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _ 0, # Reserved b'\x08\x08', # Flags - data descriptor and utf-8 file names 8, # Compression - deflate - 
mod_at_encoded, + mod_at_ms_dos, crc_32, compressed_size, uncompressed_size, @@ -251,7 +255,7 @@ def _zip_data(chunks, _get_compress_obj, max_uncompressed_size, max_compressed_s return uncompressed_size, compressed_size, crc_32 - def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, chunks): + def _no_compression_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError) @@ -263,13 +267,13 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, exter 16, # Size of extra size, # Uncompressed size, # Compressed - ) + ) + mod_at_unix_extra yield from _(local_header_signature) yield from _(local_header_struct.pack( 45, # Version b'\x00\x08', # Flags - utf-8 file names 0, # Compression - no compression - mod_at_encoded, + mod_at_ms_dos, crc_32, 0xffffffff, # Compressed size - since zip64 0xffffffff, # Uncompressed size - since zip64 @@ -288,7 +292,7 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, exter size, # Uncompressed size, # Compressed file_offset, - ) + ) + mod_at_unix_extra return central_directory_header_struct.pack( 45, # Version made by 3, # System made by (UNIX) @@ -296,7 +300,7 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, exter 0, # Reserved b'\x00\x08', # Flags - utf-8 file names 0, # Compression - none - mod_at_encoded, + mod_at_ms_dos, crc_32, 0xffffffff, # Compressed size - since zip64 0xffffffff, # Uncompressed size - since zip64 @@ -310,20 +314,20 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, exter ), name_encoded, extra - def _no_compression_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, chunks): + def _no_compression_32_local_header_and_data(name_encoded, mod_at_ms_dos, 
mod_at_unix_extra, external_attr, _get_compress_obj, chunks): file_offset = offset _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError) chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffff) - extra = b'' + extra = mod_at_unix_extra yield from _(local_header_signature) yield from _(local_header_struct.pack( 20, # Version b'\x00\x08', # Flags - utf-8 file names 0, # Compression - no compression - mod_at_encoded, + mod_at_ms_dos, crc_32, size, # Compressed size, # Uncompressed @@ -343,7 +347,7 @@ def _no_compression_32_local_header_and_data(name_encoded, mod_at_encoded, exter 0, # Reserved b'\x00\x08', # Flags - utf-8 file names 0, # Compression - none - mod_at_encoded, + mod_at_ms_dos, crc_32, size, # Compressed size, # Uncompressed @@ -381,7 +385,7 @@ def _chunks(): name_encoded = name.encode('utf-8') _raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError) - mod_at_encoded = modified_at_struct.pack( + mod_at_ms_dos = modified_at_struct.pack( int(modified_at.second / 2) | \ (modified_at.minute << 5) | \ (modified_at.hour << 11), @@ -389,6 +393,12 @@ def _chunks(): (modified_at.month << 5) | \ (modified_at.year - 1980) << 9, ) + mod_at_unix_extra = mod_at_unix_extra_struct.pack( + mod_at_unix_extra_signature, + 5, # Size of extra + b'\x01', # Only modification time (as opposed to also other times) + int(modified_at.timestamp()), + ) if extended_timestamps else b'' external_attr = \ (mode << 16) | \ (0x10 if name_encoded[-1:] == b'/' else 0x0) # MS-DOS directory @@ -399,7 +409,7 @@ def _chunks(): _no_compression_64_local_header_and_data if _method is _NO_COMPRESSION_64 else \ _no_compression_32_local_header_and_data - central_directory_header_entry, name_encoded, extra = yield from data_func(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, evenly_sized(chunks)) + central_directory_header_entry, name_encoded, extra = yield from 
data_func(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, evenly_sized(chunks)) central_directory_size += len(central_directory_header_signature) + len(central_directory_header_entry) + len(name_encoded) + len(extra) central_directory.append((central_directory_header_entry, name_encoded, extra)) diff --git a/test_stream_zip.py b/test_stream_zip.py index 80cb2e6..ee12ab2 100644 --- a/test_stream_zip.py +++ b/test_stream_zip.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone, timedelta from io import BytesIO import contextlib import os @@ -899,3 +899,122 @@ def test_bsdio_empty_directory(method, trailing_slash, mode, expected_mode): subprocess.run([bsdcpio, f'{d}/test.zip', '-d', d]) assert stat.filemode(os.lstat('my-dir').st_mode) == expected_mode + + +@pytest.mark.parametrize( + "method", + [ + ZIP_32, + ZIP_64, + NO_COMPRESSION_64, + NO_COMPRESSION_32, + ], +) +@pytest.mark.parametrize( + "modified_at,expected_time", + [ + (datetime(2011, 1, 1, 1, 2, 3, 123), (2011, 1, 1, 1, 2, 2)), + (datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), (2011, 1, 1, 1, 2, 2)), + (datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), (2011, 1, 1, 1, 2, 2)), + (datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), (2011, 1, 1, 1, 2, 2)), + (datetime(2011, 1, 1, 1, 2, 3, 123), (2011, 1, 1, 1, 2, 2)), + (datetime(2011, 1, 1, 1, 2, 4, 123, tzinfo=timezone(timedelta(hours=0))), (2011, 1, 1, 1, 2, 4)), + (datetime(2011, 1, 1, 1, 2, 4, 123, tzinfo=timezone(timedelta(hours=1))), (2011, 1, 1, 1, 2, 4)), + (datetime(2011, 1, 1, 1, 2, 4, 123, tzinfo=timezone(timedelta(hours=-1))), (2011, 1, 1, 1, 2, 4)), + ], +) +def test_zipfile_modification_time(method, modified_at, expected_time): + member_files = ( + ('my_file', modified_at, stat.S_IFREG | 0o600, method, ()), + ) + zipped_chunks = stream_zip(member_files) + + def extracted(): + with 
ZipFile(BytesIO(b''.join(zipped_chunks))) as my_zip: + for my_info in my_zip.infolist(): + with my_zip.open(my_info.filename) as my_file: + yield ( + my_info.filename, + my_info.date_time, + ) + + assert [('my_file', expected_time)] == list(extracted()) + + +@pytest.mark.parametrize( + "method", + [ + ZIP_32, + ZIP_64, + NO_COMPRESSION_64, + NO_COMPRESSION_32, + ], +) +@pytest.mark.parametrize( + "timezone,modified_at", + [ + ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123)), + ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))), + ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))), + ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))), + ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123)), + ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))), + ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))), + ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))), + ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123)), + ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))), + ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))), + ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))), + ], +) +def test_unzip_modification_time(method, timezone, modified_at): + member_files = ( + ('my_file', modified_at, stat.S_IFREG | 0o600, method, ()), + ) + zipped_chunks = stream_zip(member_files) + + with \ + TemporaryDirectory() as d, \ + cwd(d): \ + + with open('test.zip', 'wb') as fp: + for zipped_chunk in zipped_chunks: + fp.write(zipped_chunk) + + subprocess.run(['unzip', f'{d}/test.zip', '-d', d], env={'TZ': timezone}) + + assert os.path.getmtime('my_file') == int(modified_at.timestamp()) + + +@pytest.mark.parametrize( + "method", + [ + ZIP_32, + ZIP_64, + NO_COMPRESSION_64, + NO_COMPRESSION_32, + ], +) 
+@pytest.mark.parametrize( + "timezone,modified_at,expected_modified_at", + [ + ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), datetime(2011, 1, 1, 2, 2, 2, 0)), + ], +) +def test_unzip_modification_time_extended_timestamps_disabled(method, timezone, modified_at, expected_modified_at): + member_files = ( + ('my_file', modified_at, stat.S_IFREG | 0o600, method, ()), + ) + zipped_chunks = stream_zip(member_files, extended_timestamps=False) + + with \ + TemporaryDirectory() as d, \ + cwd(d): \ + + with open('test.zip', 'wb') as fp: + for zipped_chunk in zipped_chunks: + fp.write(zipped_chunk) + + subprocess.run(['unzip', f'{d}/test.zip', '-d', d], env={'TZ': timezone}) + + assert os.path.getmtime('my_file') == expected_modified_at.timestamp()