From 15139e394321079a7aaf3aa9681379bcacb25d78 Mon Sep 17 00:00:00 2001 From: Frederik Aalund Date: Fri, 12 Apr 2024 11:52:45 +0000 Subject: [PATCH] feat: clamp timestamps to fit header fields --- docs/get-started.md | 15 ++++++++ stream_zip.py | 32 +++++++++++++---- test_stream_zip.py | 84 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 109 insertions(+), 22 deletions(-) diff --git a/docs/get-started.md b/docs/get-started.md index c0cea13..e7997a8 100644 --- a/docs/get-started.md +++ b/docs/get-started.md @@ -157,6 +157,21 @@ Each member file is compressed with a method that must be specified in client co In general, not all valid ZIP files are possible to be stream unzipped. However, all files generated by stream-zip are suitable for stream unzipping, for example by [stream-unzip](https://stream-unzip.docs.trade.gov.uk/). +## Timestamps + +File timestamps (e.g., "modified at") have to fit into the ZIP file format. +Therefore, stream-zip both rounds and clamps timestamps to make them fit the ZIP file format. + +If `extended_timestamps=True` (the default): + + * Timestamps are clamped between 1970-1-1 and 2038-1-19 (both inclusive) + * Timestamps are rounded down with 1-second precision + +If `extended_timestamps=False`: + + * Timestamps are clamped between 1980-1-1 and 2107-12-31 (both inclusive) + * Timestamps are rounded down with 2-second precision + ## Limitations The `NO_COMPRESSION_32` and `NO_COMPRESSION_64` methods do not stream - they buffer the entire binary contents of the file in memory before output. They do this to calculate the length and CRC 32 to output them before the binary contents in the ZIP. This is required in order for ZIP to be stream unzippable. 
diff --git a/stream_zip.py b/stream_zip.py index 7f2bfd1..7abfdf5 100644 --- a/stream_zip.py +++ b/stream_zip.py @@ -1,4 +1,5 @@ from collections import deque +from datetime import datetime from struct import Struct import asyncio import secrets @@ -21,6 +22,17 @@ _AUTO_UPGRADE_CENTRAL_DIRECTORY = object() _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY = object() +_MS_DOS_DATE_BEGIN = datetime(1980, 1, 1) +_MS_DOS_DATE_END = datetime( + # Max year since 1980 representable in a 7-bit unsigned integer + year=_MS_DOS_DATE_BEGIN.year + 2**7-1, + month=12, + day=31, + hour=23, + minute=59, + second=59, +) + def __NO_COMPRESSION_BUFFERED_32(offset, default_get_compressobj): return _NO_COMPRESSION_BUFFERED_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj, None, None @@ -612,19 +624,25 @@ def _no_compression_streamed_data(chunks, uncompressed_size, crc_32, maximum_siz name_encoded = name.encode('utf-8') _raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError) + # Remove time zone information (if any) during clamp + mod_datetime_ms_dos = min(max(modified_at.replace(tzinfo=None), _MS_DOS_DATE_BEGIN), _MS_DOS_DATE_END) mod_at_ms_dos = modified_at_struct.pack( - int(modified_at.second / 2) | \ - (modified_at.minute << 5) | \ - (modified_at.hour << 11), - modified_at.day | \ - (modified_at.month << 5) | \ - (modified_at.year - 1980) << 9, + (mod_datetime_ms_dos.second // 2) | \ + (mod_datetime_ms_dos.minute << 5) | \ + (mod_datetime_ms_dos.hour << 11), + mod_datetime_ms_dos.day | \ + (mod_datetime_ms_dos.month << 5) | \ + (mod_datetime_ms_dos.year - 1980) << 9, ) mod_at_unix_extra = mod_at_unix_extra_struct.pack( mod_at_unix_extra_signature, 5, # Size of extra b'\x01', # Only modification time (as opposed to also other times) - int(modified_at.timestamp()), + # Clamp timestamp to fit the field size (4-byte signed integer) + # In principle, the lower limit should be `-2**31` but we set it + # to zero to avoid issues with common zip 
utilities like `unzip`. + # Said tools do not correctly interpret negative timestamps. + max(min(int(modified_at.timestamp()), 2**31 - 1), 0), ) if extended_timestamps else b'' external_attr = \ (mode << 16) | \ diff --git a/test_stream_zip.py b/test_stream_zip.py index 8c06617..dcb42b6 100644 --- a/test_stream_zip.py +++ b/test_stream_zip.py @@ -989,6 +989,25 @@ def test_bsdio_empty_directory(method, trailing_slash, mode, expected_mode): @pytest.mark.parametrize( "modified_at,expected_time", [ + # Datetimes near the 1980 epoch used in the MS-DOS header. + # Note the 2-second precision and the cutoff of everything before the epoch. + (datetime(1979, 12, 31, 23, 59, 58), (1980, 1, 1, 0, 0, 0)), + (datetime(1979, 12, 31, 23, 59, 59), (1980, 1, 1, 0, 0, 0)), + (datetime(1980, 1, 1, 0, 0, 0), (1980, 1, 1, 0, 0, 0)), + (datetime(1980, 1, 1, 0, 0, 1), (1980, 1, 1, 0, 0, 0)), + (datetime(1980, 1, 1, 0, 0, 2), (1980, 1, 1, 0, 0, 2)), + (datetime(1980, 1, 1, 0, 0, 3), (1980, 1, 1, 0, 0, 2)), + (datetime(1980, 1, 1, 0, 0, 4), (1980, 1, 1, 0, 0, 4)), + # Datetimes near year 2108 test the maximum datetime that the MS-DOS + # header can store. Again, note the 2-second precision. 
+ (datetime(2107, 12, 31, 23, 59, 56), (2107, 12, 31, 23, 59, 56)), + (datetime(2107, 12, 31, 23, 59, 57), (2107, 12, 31, 23, 59, 56)), + (datetime(2107, 12, 31, 23, 59, 58), (2107, 12, 31, 23, 59, 58)), + (datetime(2107, 12, 31, 23, 59, 59), (2107, 12, 31, 23, 59, 58)), + (datetime(2108, 1, 1, 0, 0, 0), (2107, 12, 31, 23, 59, 58)), + (datetime(2108, 1, 1, 0, 0, 1), (2107, 12, 31, 23, 59, 58)), + (datetime(2108, 1, 1, 0, 0, 2), (2107, 12, 31, 23, 59, 58)), + # Miscellaneous (datetime(2011, 1, 1, 1, 2, 3, 123), (2011, 1, 1, 1, 2, 2)), (datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), (2011, 1, 1, 1, 2, 2)), (datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), (2011, 1, 1, 1, 2, 2)), @@ -1027,27 +1046,40 @@ def extracted(): ], ) @pytest.mark.parametrize( - "timezone,modified_at", + "timezone,modified_at,expected_modified_at", [ - ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123)), - ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))), - ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))), - ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))), - ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123)), - ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))), - ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))), - ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))), - ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123)), - ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0)))), - ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1)))), - ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1)))), + # Datetimes near the UNIX epoch (1970) + ('UTC+0', datetime(1969, 12, 31, 23, 59, 58), datetime(1970, 1, 1, 0, 0, 0)), + ('UTC+0', datetime(1969, 12, 31, 23, 59, 59), datetime(1970, 1, 1, 0, 
0, 0)), + ('UTC+0', datetime(1970, 1, 1, 0, 0, 0), None), + # Datetimes near the maximum representable datetime in the UNIX timestamp header + # (4-byte signed integer counting the number of seconds since 1970) + ('UTC+0', datetime(2038, 1, 19, 3, 14, 7), None), + ('UTC+0', datetime(2038, 1, 19, 3, 14, 8), datetime(2038, 1, 19, 3, 14, 7)), + ('UTC+0', datetime(2038, 1, 19, 3, 14, 9), datetime(2038, 1, 19, 3, 14, 7)), + ('UTC+0', datetime(2038, 1, 19, 3, 14, 10), datetime(2038, 1, 19, 3, 14, 7)), + # Miscellaneous + ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123), None), + ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None), + ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None), + ('UTC+0', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None), + ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), None), + ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None), + ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None), + ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None), + ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123), None), + ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=0))), None), + ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=1))), None), + ('UTC-1', datetime(2011, 1, 1, 1, 2, 3, 123, tzinfo=timezone(timedelta(hours=-1))), None), ], ) -def test_unzip_modification_time(method, timezone, modified_at): +def test_unzip_modification_time(method, timezone, modified_at, expected_modified_at): member_files = ( ('my_file', modified_at, stat.S_IFREG | 0o600, method, ()), ) zipped_chunks = stream_zip(member_files) + if expected_modified_at is None: + expected_modified_at = modified_at with \ TemporaryDirectory() as d, \ @@ -1059,7 +1091,7 @@ def test_unzip_modification_time(method, timezone, modified_at): 
subprocess.run(['unzip', f'{d}/test.zip', '-d', d], env={'TZ': timezone}) - assert os.path.getmtime('my_file') == int(modified_at.timestamp()) + assert os.path.getmtime('my_file') == int(expected_modified_at.timestamp()) @pytest.mark.parametrize( @@ -1074,6 +1106,28 @@ def test_unzip_modification_time(method, timezone, modified_at): @pytest.mark.parametrize( "timezone,modified_at,expected_modified_at", [ + # Datetimes near the 1980 epoch used in the MS-DOS header. + # Note the 2-second precision and the cutoff of everything before the epoch. + ("UTC+0", datetime(1979, 12, 31, 23, 59, 58), datetime(1980, 1, 1, 0, 0, 0)), + ("UTC+0", datetime(1979, 12, 31, 23, 59, 59), datetime(1980, 1, 1, 0, 0, 0)), + ("UTC+0", datetime(1980, 1, 1, 0, 0, 0), datetime(1980, 1, 1, 0, 0, 0)), + ("UTC+0", datetime(1980, 1, 1, 0, 0, 1), datetime(1980, 1, 1, 0, 0, 0)), + ("UTC+0", datetime(1980, 1, 1, 0, 0, 2), datetime(1980, 1, 1, 0, 0, 2)), + ("UTC+0", datetime(1980, 1, 1, 0, 0, 3), datetime(1980, 1, 1, 0, 0, 2)), + ("UTC+0", datetime(1980, 1, 1, 0, 0, 4), datetime(1980, 1, 1, 0, 0, 4)), + # Datetimes near year 2108 test the maximum datetime that the MS-DOS + # header can store. Again, note the 2-second precision. + ("UTC+0", datetime(2100, 12, 31, 23, 59, 56), datetime(2100, 12, 31, 23, 59, 56)), + ("UTC+0", datetime(2100, 12, 31, 23, 59, 57), datetime(2100, 12, 31, 23, 59, 56)), + ("UTC+0", datetime(2100, 12, 31, 23, 59, 58), datetime(2100, 12, 31, 23, 59, 58)), + ("UTC+0", datetime(2100, 12, 31, 23, 59, 59), datetime(2100, 12, 31, 23, 59, 58)), + # The upper limit for the datetime field is supposed to be the end of year 2107. + # In practice, however, we see very strange behaviour from `unzip` after year 2100. + # It seems that there is an off-by-one bug in `unzip` for dates after year 2100. 
+ ("UTC+0", datetime(2101, 1, 1, 0, 0, 0), datetime(2101, 1, 2, 0, 0, 0)), + ("UTC+0", datetime(2101, 1, 2, 0, 0, 0), datetime(2101, 1, 3, 0, 0, 0)), + ("UTC+0", datetime(2101, 1, 3, 0, 0, 0), datetime(2101, 1, 4, 0, 0, 0)), + # Miscellaneous ('UTC+1', datetime(2011, 1, 1, 1, 2, 3, 123), datetime(2011, 1, 1, 2, 2, 2, 0)), ], )