diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index cd451c9227a..6a231bcf22b 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -22,6 +22,7 @@ from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution from pip._internal.network.session import PipSession as Session from pip._internal.network.utils import HEADERS +from pip._internal.utils.logging import indent_log logger = logging.getLogger(__name__) @@ -40,6 +41,11 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistributi """ try: with LazyHTTPFile(url, session) as lazy_file: + with indent_log(): + logger.debug("begin prefetching for %s", name) + lazy_file.prefetch_contiguous_dist_info(name) + logger.debug("done prefetching for %s", name) + # For read-only ZIP files, ZipFile only needs methods read, # seek, seekable and tell, not the whole IO protocol. wheel = MemoryWheel(lazy_file.name, lazy_file) @@ -145,6 +151,11 @@ def __next__(self) -> bytes: raise NotImplementedError +# The central directory for tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is +# 944931 bytes, for a 459424488 byte file (about 486x as large). +_DEFAULT_INITIAL_FETCH = 1_000_000 + + class LazyHTTPFile(ReadOnlyIOWrapper): """File-like object mapped to a ZIP file over HTTP. @@ -159,7 +170,10 @@ class LazyHTTPFile(ReadOnlyIOWrapper): _domains_without_negative_range: ClassVar[set[str]] = set() def __init__( - self, url: str, session: Session, initial_chunk_size: int = CONTENT_CHUNK_SIZE + self, + url: str, + session: Session, + initial_chunk_size: int = _DEFAULT_INITIAL_FETCH, ) -> None: # Add delete=False and print the file's `.name` to debug invalid virtual zips. super().__init__(cast(BinaryIO, NamedTemporaryFile())) @@ -172,21 +186,20 @@ def __init__( self._length, initial_chunk = self._extract_content_length(initial_chunk_size) self.truncate(self._length) - # The central directory for - # tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is 944931 bytes, for - # a 459424488 byte file (about 486x as large). - self._minimum_fetch_granularity = max(initial_chunk_size, self._length // 400) if initial_chunk is None: # If we could not download any file contents yet (e.g. if negative byte # ranges were not supported), then download all of this at once, hopefully # pulling in the entire central directory. - initial_start = max(0, self._length - self._minimum_fetch_granularity) + initial_start = max(0, self._length - initial_chunk_size) self._download(initial_start, self._length) else: - self.seek(-len(initial_chunk), io.SEEK_END) - self._file.write(initial_chunk) - self._left.append(self._length - len(initial_chunk)) - self._right.append(self._length - 1) + # If we could download file contents, then write them to the end of the + # file and set up our bisect boundaries by hand. + with self._stay(): + self.seek(-len(initial_chunk), io.SEEK_END) + self._file.write(initial_chunk) + self._left.append(self._length - len(initial_chunk)) + self._right.append(self._length - 1) def read(self, size: int = -1) -> bytes: """Read up to size bytes from the object and return them. @@ -195,17 +208,18 @@ def read(self, size: int = -1) -> bytes: all bytes until EOF are returned. Fewer than size bytes may be returned if EOF is reached. """ - # BUG does not download correctly if size is unspecified cur = self.tell() + logger.debug("read size %d at %d", size, cur) if size < 0: assert cur <= self._length download_size = self._length - cur elif size == 0: - return b'' + return b"" else: - download_size = max(size, self._minimum_fetch_granularity) + download_size = size stop = min(cur + download_size, self._length) - self._download(cur, stop - 1) + logger.debug("read->download from %d->%d", cur, stop) + self._download(cur, stop) return self._file.read(size) def __enter__(self) -> LazyHTTPFile: @@ -260,7 +274,9 @@ def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, byte return (file_length, tail.content) raise HTTPRangeRequestUnsupported(f"could not parse content-range: {range_arg}") - def _extract_content_length(self, initial_chunk_size: int) -> tuple[int, Optional[bytes]]: + def _extract_content_length( + self, initial_chunk_size: int + ) -> tuple[int, Optional[bytes]]: domain = urlparse(self._url).netloc if domain in self._domains_without_negative_range: return (self._content_length_from_head(), None) @@ -364,6 +380,8 @@ def _merge( def _download(self, start: int, end: int) -> None: """Download bytes from start to end inclusively.""" + # Reducing by 1 to get an inclusive end range. + end -= 1 with self._stay(): left = bisect_left(self._right, start) right = bisect_right(self._left, end) @@ -372,3 +390,35 @@ def _download(self, start: int, end: int) -> None: self.seek(start) for chunk in response.iter_content(CONTENT_CHUNK_SIZE): self._file.write(chunk) + + def prefetch_contiguous_dist_info(self, name: str) -> None: + """ + Read contents of entire dist-info section of wheel. + + pip will read every entry in this directory when generating a dist from a wheel, + so prepopulating the file contents avoids waiting for multiple range requests. + """ + dist_info_prefix = re.compile(r"^[^/]*\.dist-info/") + start: Optional[int] = None + end: Optional[int] = None + + zf = ZipFile(self) + + for info in zf.infolist(): + if start is None: + if dist_info_prefix.search(info.filename): + start = info.header_offset + continue + else: + if not dist_info_prefix.search(info.filename): + end = info.header_offset + break + if start is None: + raise UnsupportedWheel( + f"no {dist_info_prefix} directory found for {name} in {self.name}" + ) + # If the last entries of the zip are the .dist-info/ dir (as usual), then give + # us everything until the start of the central directory. + if end is None: + end = zf.start_dir + self._download(start, end)