Skip to content

Commit

Permalink
feat: clamp timestamps to fit header fields
Browse files Browse the repository at this point in the history
  • Loading branch information
frederikaalund committed May 4, 2024
1 parent c8a4503 commit 15139e3
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 22 deletions.
15 changes: 15 additions & 0 deletions docs/get-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,21 @@ Each member file is compressed with a method that must be specified in client co
In general, not every valid ZIP file can be stream unzipped. However, all files generated by stream-zip are suitable for stream unzipping, for example by [stream-unzip](https://stream-unzip.docs.trade.gov.uk/).


## Timestamps

File timestamps (e.g., "modified at") have to fit into the ZIP file format.
Therefore, stream-zip both rounds and clamps timestamps to make them fit the ZIP file format.

If `extended_timestamps=True` (the default):

* Timestamps are clamped between 1970-01-01 00:00:00 UTC and 2038-01-19 03:14:07 UTC (both inclusive)
* Timestamps are rounded down to 1-second precision

If `extended_timestamps=False`:

* Timestamps are clamped between 1980-01-01 00:00:00 and 2107-12-31 23:59:59 (both inclusive)
* Timestamps are rounded down to 2-second precision

## Limitations

The `NO_COMPRESSION_32` and `NO_COMPRESSION_64` methods do not stream - they buffer the entire binary contents of the file in memory before output. They do this to calculate the length and CRC 32 to output them before the binary contents in the ZIP. This is required in order for ZIP to be stream unzippable.
Expand Down
32 changes: 25 additions & 7 deletions stream_zip.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import deque
from datetime import datetime
from struct import Struct
import asyncio
import secrets
Expand All @@ -21,6 +22,17 @@
# Two distinct sentinel objects (every `object()` instance is unique).
# NOTE(review): presumably they mark whether a compression method may
# auto-upgrade its central directory entry - confirm against the code that
# consumes them.
_AUTO_UPGRADE_CENTRAL_DIRECTORY = object()
_NO_AUTO_UPGRADE_CENTRAL_DIRECTORY = object()

# Lower clamp for the MS-DOS date/time header fields: the first instant of
# 1980, the year the MS-DOS year field counts from.
_MS_DOS_DATE_BEGIN = datetime(year=1980, month=1, day=1)

# Upper clamp: the last second of the final representable year. The year is
# stored as an offset from 1980 in a 7-bit unsigned integer, so the largest
# offset is 2**7 - 1 = 127, i.e. year 2107.
_MS_DOS_DATE_END = datetime(
    _MS_DOS_DATE_BEGIN.year + (1 << 7) - 1,
    12,
    31,
    23,
    59,
    59,
)

def __NO_COMPRESSION_BUFFERED_32(offset, default_get_compressobj):
    """Return the 5-tuple describing the buffered 32-bit no-compression method.

    The tuple holds, in order: the `_NO_COMPRESSION_BUFFERED_32` marker, the
    `_NO_AUTO_UPGRADE_CENTRAL_DIRECTORY` sentinel, the caller-supplied
    `default_get_compressobj` passed through unchanged, and two `None`
    placeholders. `offset` is accepted but not used.
    """
    method_spec = (
        _NO_COMPRESSION_BUFFERED_32,
        _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY,
        default_get_compressobj,
        None,
        None,
    )
    return method_spec

Expand Down Expand Up @@ -612,19 +624,25 @@ def _no_compression_streamed_data(chunks, uncompressed_size, crc_32, maximum_siz
name_encoded = name.encode('utf-8')
_raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError)

# Remove time zone information (if any) during clamp
mod_datetime_ms_dos = min(max(modified_at.replace(tzinfo=None), _MS_DOS_DATE_BEGIN), _MS_DOS_DATE_END)
mod_at_ms_dos = modified_at_struct.pack(
int(modified_at.second / 2) | \
(modified_at.minute << 5) | \
(modified_at.hour << 11),
modified_at.day | \
(modified_at.month << 5) | \
(modified_at.year - 1980) << 9,
(mod_datetime_ms_dos.second // 2) | \
(mod_datetime_ms_dos.minute << 5) | \
(mod_datetime_ms_dos.hour << 11),
mod_datetime_ms_dos.day | \
(mod_datetime_ms_dos.month << 5) | \
(mod_datetime_ms_dos.year - 1980) << 9,
)
mod_at_unix_extra = mod_at_unix_extra_struct.pack(
mod_at_unix_extra_signature,
5, # Size of extra
b'\x01', # Only modification time (as opposed to also other times)
int(modified_at.timestamp()),
# Clamp timestamp to fit the field size (4-byte signed integer)
# In principle, the lower limit should be `-2**31` but we set it
# to zero to avoid issues with common zip utilities like `unzip`.
# Said tools do not correctly interpret negative timestamps.
max(min(int(modified_at.timestamp()), 2**31 - 1), 0),
) if extended_timestamps else b''
external_attr = \
(mode << 16) | \
Expand Down
84 changes: 69 additions & 15 deletions test_stream_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -989,6 +989,25 @@ def test_bsdio_empty_directory(method, trailing_slash, mode, expected_mode):
@pytest.mark.parametrize(
"modified_at,expected_time",
[
# Datetimes near the 1980 epoch used in the MS-DOS header.
# Note the 2-second precision and the cutoff of everything before the epoch.
(datetime(1979, 12, 31, 23, 59, 58), (1980, 1, 1, 0, 0, 0)),
(datetime(1979, 12, 31, 23, 59, 59), (1980, 1, 1, 0, 0, 0)),
(datetime(1980, 1, 1, 0, 0, 0), (1980, 1, 1, 0, 0, 0)),
(datetime(1980, 1, 1, 0, 0, 1), (1980, 1, 1, 0, 0, 0)),
(datetime(1980, 1, 1, 0, 0, 2), (1980, 1, 1, 0, 0, 2)),
(datetime(1980, 1, 1, 0, 0, 3), (1980, 1, 1, 0, 0, 2)),
(datetime(1980, 1, 1, 0, 0, 4), (1980, 1, 1, 0, 0, 4)),
# Datetimes near year 2108 test the maximum datetime that the MS-DOS
# header can store. Again, note the 2-second precision.
(datetime(2107, 12, 31, 23, 59, 56), (2107, 12, 31, 23, 59, 56)),
(datetime(2107, 12, 31, 23, 59, 57), (2107, 12, 31, 23, 59, 56)),
(datetime(2107, 12, 31, 23, 59, 58), (2107, 12, 31, 23, 59, 58)),
(datetime(2107, 12, 31, 23, 59, 59), (2107, 12, 31, 23, 59, 58)),
(datetime(2108, 1, 1, 0, 0, 0), (2107, 12, 31, 23, 59, 58)),
(datetime(2108, 1, 1, 0, 0, 1), (2107, 12, 31, 23, 59, 58)),
(datetime(2108, 1, 1, 0, 0, 2), (2107, 12, 31, 23, 59, 58)),
# Miscellaneous
(datetime(2011, 1, 1, 1, 2, 3, 123), (2011, 1, 1, 1, 2, 2)),
(datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), (2011, 1, 1, 1, 2, 2)),
(datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), (2011, 1, 1, 1, 2, 2)),
Expand Down Expand Up @@ -1027,27 +1046,40 @@ def extracted():
],
)
@pytest.mark.parametrize(
"timezone,modified_at",
"timezone,modified_at,expected_modified_at",
[
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123)),
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))),
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))),
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123)),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123)),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))),
# Datetimes near the UNIX epoch (1970)
('UTC+0', datetime(1969, 12, 31, 23, 59, 58), datetime(1970, 1, 1, 0, 0, 0)),
('UTC+0', datetime(1969, 12, 31, 23, 59, 59), datetime(1970, 1, 1, 0, 0, 0)),
('UTC+0', datetime(1970, 1, 1, 0, 0, 0), None),
# Datetimes near the maximum representable datetime in the UNIX timestamp header
# (4-byte signed integer counting the number of seconds since 1970)
('UTC+0', datetime(2038, 1, 19, 3, 14, 7), None),
('UTC+0', datetime(2038, 1, 19, 3, 14, 8), datetime(2038, 1, 19, 3, 14, 7)),
('UTC+0', datetime(2038, 1, 19, 3, 14, 9), datetime(2038, 1, 19, 3, 14, 7)),
('UTC+0', datetime(2038, 1, 19, 3, 14, 10), datetime(2038, 1, 19, 3, 14, 7)),
# Miscellaneous
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123), None),
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None),
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None),
('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), None),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None),
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123), None),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None),
('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None),
],
)
def test_unzip_modification_time(method, timezone, modified_at):
def test_unzip_modification_time(method, timezone, modified_at, expected_modified_at):
member_files = (
('my_file', modified_at, stat.S_IFREG | 0o600, method, ()),
)
zipped_chunks = stream_zip(member_files)
if expected_modified_at is None:
expected_modified_at = modified_at

with \
TemporaryDirectory() as d, \
Expand All @@ -1059,7 +1091,7 @@ def test_unzip_modification_time(method, timezone, modified_at):

subprocess.run(['unzip', f'{d}/test.zip', '-d', d], env={'TZ': timezone})

assert os.path.getmtime('my_file') == int(modified_at.timestamp())
assert os.path.getmtime('my_file') == int(expected_modified_at.timestamp())


@pytest.mark.parametrize(
Expand All @@ -1074,6 +1106,28 @@ def test_unzip_modification_time(method, timezone, modified_at):
@pytest.mark.parametrize(
"timezone,modified_at,expected_modified_at",
[
# Datetimes near the 1980 epoch used in the MS-DOS header.
# Note the 2-second precision and the cutoff of everything before the epoch.
("UTC+0", datetime(1979, 12, 31, 23, 59, 58), datetime(1980, 1, 1, 0, 0, 0)),
("UTC+0", datetime(1979, 12, 31, 23, 59, 59), datetime(1980, 1, 1, 0, 0, 0)),
("UTC+0", datetime(1980, 1, 1, 0, 0, 0), datetime(1980, 1, 1, 0, 0, 0)),
("UTC+0", datetime(1980, 1, 1, 0, 0, 1), datetime(1980, 1, 1, 0, 0, 0)),
("UTC+0", datetime(1980, 1, 1, 0, 0, 2), datetime(1980, 1, 1, 0, 0, 2)),
("UTC+0", datetime(1980, 1, 1, 0, 0, 3), datetime(1980, 1, 1, 0, 0, 2)),
("UTC+0", datetime(1980, 1, 1, 0, 0, 4), datetime(1980, 1, 1, 0, 0, 4)),
# Datetimes near the end of year 2100. The MS-DOS header can store datetimes
# up to the end of year 2107, but `unzip` misbehaves after year 2100 (see
# below). Again, note the 2-second precision.
("UTC+0", datetime(2100, 12, 31, 23, 59, 56), datetime(2100, 12, 31, 23, 59, 56)),
("UTC+0", datetime(2100, 12, 31, 23, 59, 57), datetime(2100, 12, 31, 23, 59, 56)),
("UTC+0", datetime(2100, 12, 31, 23, 59, 58), datetime(2100, 12, 31, 23, 59, 58)),
("UTC+0", datetime(2100, 12, 31, 23, 59, 59), datetime(2100, 12, 31, 23, 59, 58)),
# The upper limit for the datetime field is supposed to be the end of year 2107.
# In practice, however, we see very strange behaviour from `unzip` after year 2100.
# It seems that there is an off-by-one bug in `unzip` for dates after year 2100.
("UTC+0", datetime(2101, 1, 1, 0, 0, 0), datetime(2101, 1, 2, 0, 0, 0)),
("UTC+0", datetime(2101, 1, 2, 0, 0, 0), datetime(2101, 1, 3, 0, 0, 0)),
("UTC+0", datetime(2101, 1, 3, 0, 0, 0), datetime(2101, 1, 4, 0, 0, 0)),
# Miscellaneous
('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), datetime(2011, 1, 1, 2, 2, 2, 0)),
],
)
Expand Down

0 comments on commit 15139e3

Please sign in to comment.