Merge pull request #62 from uktrade/feat/more-accurate-timestamps
feat: more accurate modification timestamps for clients that support it
michalc authored Jul 15, 2023
2 parents 20e9e97 + af58338 commit 3f331c7
Showing 3 changed files with 164 additions and 23 deletions.
12 changes: 12 additions & 0 deletions docs/advanced-usage.md
@@ -22,3 +22,15 @@ for zipped_chunk in stream_zip(unzipped_files(), chunk_size=65536):
This one size is used both for input - splitting or gathering any uncompressed data into `chunk_size` bytes before attempting to compress it, and for output - splitting or gathering any compressed data into `chunk_size` bytes before returning it to client code.

There may be performance differences with different `chunk_size` values. The default `chunk_size` may not be optimal for your use case.


## Without extended timestamps

By default, so-called extended timestamps are included in the ZIP. These store the modification times of member files more accurately than the original ZIP format allows. To omit the extended timestamps, pass `extended_timestamps=False` to `stream_zip`.

```python
for zipped_chunk in stream_zip(unzipped_files(), extended_timestamps=False):
print(zipped_chunk)
```

This is useful to keep the total number of bytes in the ZIP as low as possible. It is also needed when creating Open Document files using `stream_zip`: Open Document files cannot contain extended timestamps in their member files if they are to pass validation.
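
For context on the accuracy gap this section refers to: the original ZIP format stores modification times in MS-DOS format, which has 2-second resolution and no timezone, so clients that ignore extended timestamps see odd seconds rounded down. A minimal sketch of this, mirroring the `test_zipfile_modification_time` test added below (the member file tuple and values are illustrative):

```python
from datetime import datetime
from io import BytesIO
from stat import S_IFREG
from zipfile import ZipFile

from stream_zip import stream_zip, ZIP_32

member_files = (
    ('my_file', datetime(2011, 1, 1, 1, 2, 3), S_IFREG | 0o600, ZIP_32, (b'contents',)),
)
zipped = b''.join(stream_zip(member_files))

with ZipFile(BytesIO(zipped)) as my_zip:
    # Python's zipfile reads the MS-DOS field, not the extended timestamp,
    # so the odd second is rounded down: (2011, 1, 1, 1, 2, 2)
    print(my_zip.infolist()[0].date_time)
```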
54 changes: 32 additions & 22 deletions stream_zip.py
@@ -41,7 +41,7 @@ def method_compressobj(offset, default_get_compressobj):
return method_compressobj


def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9)):
def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9), extended_timestamps=True):

def evenly_sized(chunks):
chunk = b''
@@ -94,6 +94,9 @@ def get_zipped_chunks_uneven():
zip_64_local_extra_struct = Struct('<2sHQQ')
zip_64_central_directory_extra_struct = Struct('<2sHQQQ')

mod_at_unix_extra_signature = b'UT'
mod_at_unix_extra_struct = Struct('<2sH1sl')

modified_at_struct = Struct('<HH')

central_directory = deque()
@@ -111,7 +114,7 @@ def _raise_if_beyond(offset, maximum, exception_class):
if offset > maximum:
raise exception_class()

def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, chunks):
def _zip_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks):
file_offset = offset

_raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError)
@@ -121,13 +124,13 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _
16, # Size of extra
0, # Uncompressed size - since data descriptor
0, # Compressed size - since data descriptor
)
) + mod_at_unix_extra
yield from _(local_header_signature)
yield from _(local_header_struct.pack(
45, # Version
b'\x08\x08', # Flags - data descriptor and utf-8 file names
8, # Compression - deflate
mod_at_encoded,
mod_at_ms_dos,
0, # CRC32 - 0 since data descriptor
0xffffffff, # Compressed size - since zip64
0xffffffff, # Uncompressed size - since zip64
@@ -153,15 +156,15 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _
uncompressed_size,
compressed_size,
file_offset,
)
) + mod_at_unix_extra
return central_directory_header_struct.pack(
45, # Version made by
3, # System made by (UNIX)
45, # Version required
0, # Reserved
b'\x08\x08', # Flags - data descriptor and utf-8 file names
8, # Compression - deflate
mod_at_encoded,
mod_at_ms_dos,
crc_32,
0xffffffff, # Compressed size - since zip64
0xffffffff, # Uncompressed size - since zip64
@@ -174,24 +177,26 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _
0xffffffff, # Offset of local header - since zip64
), name_encoded, extra

def _zip_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, chunks):
def _zip_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks):
file_offset = offset

_raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError)

extra = mod_at_unix_extra
yield from _(local_header_signature)
yield from _(local_header_struct.pack(
20, # Version
b'\x08\x08', # Flags - data descriptor and utf-8 file names
8, # Compression - deflate
mod_at_encoded,
mod_at_ms_dos,
0, # CRC32 - 0 since data descriptor
0, # Compressed size - 0 since data descriptor
0, # Uncompressed size - 0 since data descriptor
len(name_encoded),
0, # Length of local extra
len(extra),
))
yield from _(name_encoded)
yield from _(extra)

uncompressed_size, compressed_size, crc_32 = yield from _zip_data(
chunks,
@@ -203,15 +208,14 @@ def _zip_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _
yield from _(data_descriptor_signature)
yield from _(data_descriptor_zip_32_struct.pack(crc_32, compressed_size, uncompressed_size))

extra = b''
return central_directory_header_struct.pack(
20, # Version made by
3, # System made by (UNIX)
20, # Version required
0, # Reserved
b'\x08\x08', # Flags - data descriptor and utf-8 file names
8, # Compression - deflate
mod_at_encoded,
mod_at_ms_dos,
crc_32,
compressed_size,
uncompressed_size,
@@ -251,7 +255,7 @@ def _zip_data(chunks, _get_compress_obj, max_uncompressed_size, max_compressed_s

return uncompressed_size, compressed_size, crc_32

def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, chunks):
def _no_compression_64_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks):
file_offset = offset

_raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError)
@@ -263,13 +267,13 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, exter
16, # Size of extra
size, # Uncompressed
size, # Compressed
)
) + mod_at_unix_extra
yield from _(local_header_signature)
yield from _(local_header_struct.pack(
45, # Version
b'\x00\x08', # Flags - utf-8 file names
0, # Compression - no compression
mod_at_encoded,
mod_at_ms_dos,
crc_32,
0xffffffff, # Compressed size - since zip64
0xffffffff, # Uncompressed size - since zip64
@@ -288,15 +292,15 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, exter
size, # Uncompressed
size, # Compressed
file_offset,
)
) + mod_at_unix_extra
return central_directory_header_struct.pack(
45, # Version made by
3, # System made by (UNIX)
45, # Version required
0, # Reserved
b'\x00\x08', # Flags - utf-8 file names
0, # Compression - none
mod_at_encoded,
mod_at_ms_dos,
crc_32,
0xffffffff, # Compressed size - since zip64
0xffffffff, # Uncompressed size - since zip64
@@ -310,20 +314,20 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, exter
), name_encoded, extra


def _no_compression_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, chunks):
def _no_compression_32_local_header_and_data(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, chunks):
file_offset = offset

_raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError)

chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffff)

extra = b''
extra = mod_at_unix_extra
yield from _(local_header_signature)
yield from _(local_header_struct.pack(
20, # Version
b'\x00\x08', # Flags - utf-8 file names
0, # Compression - no compression
mod_at_encoded,
mod_at_ms_dos,
crc_32,
size, # Compressed
size, # Uncompressed
@@ -343,7 +347,7 @@ def _no_compression_32_local_header_and_data(name_encoded, mod_at_encoded, exter
0, # Reserved
b'\x00\x08', # Flags - utf-8 file names
0, # Compression - none
mod_at_encoded,
mod_at_ms_dos,
crc_32,
size, # Compressed
size, # Uncompressed
@@ -381,14 +385,20 @@ def _chunks():
name_encoded = name.encode('utf-8')
_raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError)

mod_at_encoded = modified_at_struct.pack(
mod_at_ms_dos = modified_at_struct.pack(
int(modified_at.second / 2) | \
(modified_at.minute << 5) | \
(modified_at.hour << 11),
modified_at.day | \
(modified_at.month << 5) | \
(modified_at.year - 1980) << 9,
)
mod_at_unix_extra = mod_at_unix_extra_struct.pack(
mod_at_unix_extra_signature,
5, # Size of extra
b'\x01', # Only modification time (as opposed to also other times)
int(modified_at.timestamp()),
) if extended_timestamps else b''
external_attr = \
(mode << 16) | \
(0x10 if name_encoded[-1:] == b'/' else 0x0) # MS-DOS directory
@@ -399,7 +409,7 @@
_no_compression_64_local_header_and_data if _method is _NO_COMPRESSION_64 else \
_no_compression_32_local_header_and_data

central_directory_header_entry, name_encoded, extra = yield from data_func(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, evenly_sized(chunks))
central_directory_header_entry, name_encoded, extra = yield from data_func(name_encoded, mod_at_ms_dos, mod_at_unix_extra, external_attr, _get_compress_obj, evenly_sized(chunks))
central_directory_size += len(central_directory_header_signature) + len(central_directory_header_entry) + len(name_encoded) + len(extra)
central_directory.append((central_directory_header_entry, name_encoded, extra))

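The new `UT` (extended timestamp) extra field written above is 9 bytes in total: a 2-byte signature, a 2-byte data size, a 1-byte flags field (`b'\x01'` meaning only the modification time is present), and a 4-byte signed Unix timestamp, matching `Struct('<2sH1sl')`. A standalone sketch of the packing and unpacking (the helper names are illustrative, not part of the library):

```python
from datetime import datetime, timezone
from struct import Struct

mod_at_unix_extra_struct = Struct('<2sH1sl')  # signature, data size, flags, seconds

def pack_ut_extra(modified_at):
    # Data size is 5: 1 flags byte + 4 timestamp bytes
    return mod_at_unix_extra_struct.pack(b'UT', 5, b'\x01', int(modified_at.timestamp()))

def parse_ut_extra(extra):
    signature, _, flags, seconds = mod_at_unix_extra_struct.unpack(extra[:9])
    assert signature == b'UT' and flags == b'\x01'
    return datetime.fromtimestamp(seconds, tz=timezone.utc)

extra = pack_ut_extra(datetime(2011, 1, 1, 1, 2, 3, tzinfo=timezone.utc))
print(parse_ut_extra(extra))  # 2011-01-01 01:02:03+00:00
```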
121 changes: 120 additions & 1 deletion test_stream_zip.py
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timezone, timedelta
from io import BytesIO
import contextlib
import os
@@ -899,3 +899,122 @@ def test_bsdio_empty_directory(method, trailing_slash, mode, expected_mode):
subprocess.run([bsdcpio, f'{d}/test.zip', '-d', d])

assert stat.filemode(os.lstat('my-dir').st_mode) == expected_mode


@pytest.mark.parametrize(
"method",
[
ZIP_32,
ZIP_64,
NO_COMPRESSION_64,
NO_COMPRESSION_32,
],
)
@pytest.mark.parametrize(
"modified_at,expected_time",
[
(datetime(2011, 1, 1, 1, 2, 3, 123), (2011, 1, 1, 1, 2, 2)),
(datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), (2011, 1, 1, 1, 2, 2)),
(datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), (2011, 1, 1, 1, 2, 2)),
(datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), (2011, 1, 1, 1, 2, 2)),
(datetime(2011, 1, 1, 1, 2, 4, 123), (2011, 1, 1, 1, 2, 4)),
(datetime(2011, 1, 1, 1, 2, 4, 123, tzinfo=timezone(timedelta(hours=0))), (2011, 1, 1, 1, 2, 4)),
(datetime(2011, 1, 1, 1, 2, 4, 123, tzinfo=timezone(timedelta(hours=1))), (2011, 1, 1, 1, 2, 4)),
(datetime(2011, 1, 1, 1, 2, 4, 123, tzinfo=timezone(timedelta(hours=-1))), (2011, 1, 1, 1, 2, 4)),
],
)
def test_zipfile_modification_time(method, modified_at, expected_time):
member_files = (
('my_file', modified_at, stat.S_IFREG | 0o600, method, ()),
)
zipped_chunks = stream_zip(member_files)

def extracted():
with ZipFile(BytesIO(b''.join(zipped_chunks))) as my_zip:
for my_info in my_zip.infolist():
with my_zip.open(my_info.filename) as my_file:
yield (
my_info.filename,
my_info.date_time,
)

assert [('my_file', expected_time)] == list(extracted())


@pytest.mark.parametrize(
"method",
[
ZIP_32,
ZIP_64,
NO_COMPRESSION_64,
NO_COMPRESSION_32,
],
)
@pytest.mark.parametrize(
"timezone,modified_at",
[
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123)),
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))),
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))),
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123)),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123)),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))),
],
)
def test_unzip_modification_time(method, timezone, modified_at):
member_files = (
('my_file', modified_at, stat.S_IFREG | 0o600, method, ()),
)
zipped_chunks = stream_zip(member_files)

with \
TemporaryDirectory() as d, \
cwd(d):

with open('test.zip', 'wb') as fp:
for zipped_chunk in zipped_chunks:
fp.write(zipped_chunk)

subprocess.run(['unzip', f'{d}/test.zip', '-d', d], env={'TZ': timezone})

assert os.path.getmtime('my_file') == int(modified_at.timestamp())


@pytest.mark.parametrize(
"method",
[
ZIP_32,
ZIP_64,
NO_COMPRESSION_64,
NO_COMPRESSION_32,
],
)
@pytest.mark.parametrize(
"timezone,modified_at,expected_modified_at",
[
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), datetime(2011, 1, 1, 2, 2, 2, 0)),
],
)
def test_unzip_modification_time_extended_timestamps_disabled(method, timezone, modified_at, expected_modified_at):
member_files = (
('my_file', modified_at, stat.S_IFREG | 0o600, method, ()),
)
zipped_chunks = stream_zip(member_files, extended_timestamps=False)

with \
TemporaryDirectory() as d, \
cwd(d):

with open('test.zip', 'wb') as fp:
for zipped_chunk in zipped_chunks:
fp.write(zipped_chunk)

subprocess.run(['unzip', f'{d}/test.zip', '-d', d], env={'TZ': timezone})

assert os.path.getmtime('my_file') == expected_modified_at.timestamp()
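
One possible complementary assertion, not part of this diff: Python's zipfile exposes the raw central directory extra field bytes on `ZipInfo`, so the presence or absence of the `UT` field can be checked directly (`has_ut_extra` is a hypothetical helper):

```python
from datetime import datetime
from io import BytesIO
from stat import S_IFREG
from zipfile import ZipFile

from stream_zip import stream_zip, ZIP_32

def has_ut_extra(zipped_bytes):
    # Walk the (id, size, data) records of the central directory extra field
    with ZipFile(BytesIO(zipped_bytes)) as my_zip:
        extra = my_zip.infolist()[0].extra
    while extra:
        signature, size = extra[:2], int.from_bytes(extra[2:4], 'little')
        if signature == b'UT':
            return True
        extra = extra[4 + size:]
    return False

member_files = (
    ('my_file', datetime(2011, 1, 1, 1, 2, 3), S_IFREG | 0o600, ZIP_32, (b'contents',)),
)
assert has_ut_extra(b''.join(stream_zip(member_files)))
assert not has_ut_extra(b''.join(stream_zip(member_files, extended_timestamps=False)))
```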
