Skip to content

Commit

Permalink
pythongh-88339: enable fast seeking of uncompressed unencrypted zipfile.ZipExtFile (pythonGH-27737)
Browse files Browse the repository at this point in the history

Avoid reading all of the intermediate data in uncompressed items in a zip file when the user seeks forward.

Contributed by: @JuniorJPDJ
  • Loading branch information
JuniorJPDJ authored Aug 6, 2022
1 parent 56af5a2 commit 330f1d5
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 5 deletions.
2 changes: 2 additions & 0 deletions Lib/test/test_zipfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2032,6 +2032,7 @@ def test_seek_tell(self):
fp.seek(bloc, os.SEEK_CUR)
self.assertEqual(fp.tell(), bloc)
self.assertEqual(fp.read(5), txt[bloc:bloc+5])
self.assertEqual(fp.tell(), bloc + 5)
fp.seek(0, os.SEEK_END)
self.assertEqual(fp.tell(), len(txt))
fp.seek(0, os.SEEK_SET)
Expand All @@ -2049,6 +2050,7 @@ def test_seek_tell(self):
fp.seek(bloc, os.SEEK_CUR)
self.assertEqual(fp.tell(), bloc)
self.assertEqual(fp.read(5), txt[bloc:bloc+5])
self.assertEqual(fp.tell(), bloc + 5)
fp.seek(0, os.SEEK_END)
self.assertEqual(fp.tell(), len(txt))
fp.seek(0, os.SEEK_SET)
Expand Down
24 changes: 19 additions & 5 deletions Lib/zipfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,7 @@ def __init__(self, fileobj, mode, zipinfo, pwd=None,
self._orig_compress_size = zipinfo.compress_size
self._orig_file_size = zipinfo.file_size
self._orig_start_crc = self._running_crc
self._orig_crc = self._expected_crc
self._seekable = True
except AttributeError:
pass
Expand Down Expand Up @@ -1069,17 +1070,17 @@ def seekable(self):
raise ValueError("I/O operation on closed file.")
return self._seekable

def seek(self, offset, whence=0):
def seek(self, offset, whence=os.SEEK_SET):
if self.closed:
raise ValueError("seek on closed file.")
if not self._seekable:
raise io.UnsupportedOperation("underlying stream is not seekable")
curr_pos = self.tell()
if whence == 0: # Seek from start of file
if whence == os.SEEK_SET:
new_pos = offset
elif whence == 1: # Seek from current position
elif whence == os.SEEK_CUR:
new_pos = curr_pos + offset
elif whence == 2: # Seek from EOF
elif whence == os.SEEK_END:
new_pos = self._orig_file_size + offset
else:
raise ValueError("whence must be os.SEEK_SET (0), "
Expand All @@ -1094,14 +1095,27 @@ def seek(self, offset, whence=0):
read_offset = new_pos - curr_pos
buff_offset = read_offset + self._offset

if buff_offset >= 0 and buff_offset < len(self._readbuffer):
# Fast seek uncompressed unencrypted file
if self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0:
# disable CRC checking after first seeking - it would be invalid
self._expected_crc = None
# seek actual file taking already buffered data into account
read_offset -= len(self._readbuffer) - self._offset
self._fileobj.seek(read_offset, os.SEEK_CUR)
self._left -= read_offset
read_offset = 0
# flush read buffer
self._readbuffer = b''
self._offset = 0
elif buff_offset >= 0 and buff_offset < len(self._readbuffer):
# Just move the _offset index if the new position is in the _readbuffer
self._offset = buff_offset
read_offset = 0
elif read_offset < 0:
# Position is before the current position. Reset the ZipExtFile
self._fileobj.seek(self._orig_compress_start)
self._running_crc = self._orig_start_crc
self._expected_crc = self._orig_crc
self._compress_left = self._orig_compress_size
self._left = self._orig_file_size
self._readbuffer = b''
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
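Enable fast forward seeking in uncompressed, unencrypted :class:`zipfile.ZipExtFile` objects: the underlying file is now seeked directly instead of reading and discarding the intermediate data.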

0 comments on commit 330f1d5

Please sign in to comment.