rewrite some parts of lazy wheel

pypa · Aug 7, 2023 · 8c35424 · 8c35424
1 parent 477c5bb
commit 8c35424
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 40 deletions.
diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py
@@ -39,12 +39,10 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistributi
     is raised.
     """
     try:
-        with LazyZipOverHTTP(url, session) as zf:
-            zf.prefetch_dist_info()
-
+        with LazyHTTPFile(url, session) as lazy_file:
             # For read-only ZIP files, ZipFile only needs methods read,
             # seek, seekable and tell, not the whole IO protocol.
-            wheel = MemoryWheel(zf.name, zf)
+            wheel = MemoryWheel(lazy_file.name, lazy_file)
             # After the context manager exits, wheel.name is intentionally an invalid file.
             return get_wheel_distribution(wheel, canonicalize_name(name))
     except (BadZipFile, UnsupportedWheel):
@@ -147,7 +145,7 @@ def __next__(self) -> bytes:
         raise NotImplementedError
 
 
-class LazyZipOverHTTP(ReadOnlyIOWrapper):
+class LazyHTTPFile(ReadOnlyIOWrapper):
     """File-like object mapped to a ZIP file over HTTP.
 
     This uses HTTP range requests to lazily fetch the file's content,
@@ -161,20 +159,30 @@ class LazyZipOverHTTP(ReadOnlyIOWrapper):
     _domains_without_negative_range: ClassVar[set[str]] = set()
 
     def __init__(
-        self, url: str, session: Session, chunk_size: int = CONTENT_CHUNK_SIZE
+        self, url: str, session: Session, initial_chunk_size: int = CONTENT_CHUNK_SIZE
     ) -> None:
+        # Add delete=False and print the file's `.name` to debug invalid virtual zips.
         super().__init__(cast(BinaryIO, NamedTemporaryFile()))
 
         self._request_count = 0
         self._session = session
         self._url = url
-        self._chunk_size = chunk_size
         self._left: list[int] = []
         self._right: list[int] = []
 
-        self._length, initial_chunk = self._extract_content_length()
+        self._length, initial_chunk = self._extract_content_length(initial_chunk_size)
         self.truncate(self._length)
-        if initial_chunk is not None:
+        # E.g. tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is 459424488
+        # bytes with a 944931-byte central directory (about 1/486 of the file), so
+        # fetching at least 1/400 of the file should cover the directory at once.
+        self._minimum_fetch_granularity = max(initial_chunk_size, self._length // 400)
+        if initial_chunk is None:
+            # If we could not download any file contents yet (e.g. if negative byte
+            # ranges were not supported), then fetch the file's tail in one request,
+            # hopefully pulling in the entire central directory.
+            initial_start = max(0, self._length - self._minimum_fetch_granularity)
+            self._download(initial_start, self._length)
+        else:
             self.seek(-len(initial_chunk), io.SEEK_END)
             self._file.write(initial_chunk)
             self._left.append(self._length - len(initial_chunk))
@@ -192,28 +200,27 @@ def read(self, size: int = -1) -> bytes:
         if size < 0:
             assert cur <= self._length
             download_size = self._length - cur
+        elif size == 0:
+            return b''
         else:
-            download_size = max(size, self._chunk_size)
+            download_size = max(size, self._minimum_fetch_granularity)
         stop = min(cur + download_size, self._length)
         self._download(cur, stop - 1)
         return self._file.read(size)
 
-    def __enter__(self) -> LazyZipOverHTTP:
+    def __enter__(self) -> LazyHTTPFile:
         super().__enter__()
         return self
 
     def __exit__(self, *exc: Any) -> None:
-        logger.debug("requests for url %s: %s", self._url, self._request_count)
+        logger.debug("%d requests for url %s", self._request_count, self._url)
         super().__exit__(*exc)
 
     def _content_length_from_head(self) -> int:
         self._request_count += 1
         head = self._session.head(self._url, headers=HEADERS)
         head.raise_for_status()
         assert head.status_code == codes.ok
-        # S3 provides lowercased headers, and in the normal case these will return the
-        # same as 'Content-Length'.
-        # FIXME: provide documentation for this?
         return int(head.headers["content-length"])
 
     @staticmethod
@@ -222,10 +229,10 @@ def _parse_full_length_from_content_range(arg: str) -> Optional[int]:
             return int(m.group(1))
         return None
 
-    def _try_initial_chunk_request(self) -> tuple[int, bytes]:
+    def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]:
         headers = HEADERS.copy()
         # Perform a negative range index, which is not supported by some servers.
-        headers["Range"] = f"bytes=-{self._chunk_size}"
+        headers["Range"] = f"bytes=-{initial_chunk_size}"
         # TODO: Get range requests to be correctly cached
         headers["Cache-Control"] = "no-cache"
         # TODO: If-Match (etag) to detect file changed during fetch would be a
@@ -243,7 +250,7 @@ def _try_initial_chunk_request(self) -> tuple[int, bytes]:
         if code == codes.ok:
             # If we got a 200 OK with more bytes than the requested range, then we
             # assume the server does not support range requests.
-            if len(tail) > self._chunk_size:
+            if len(tail) > initial_chunk_size:
                 raise HTTPRangeRequestUnsupported("returned complete file contents")
         elif code != codes.partial_content:
             raise HTTPRangeRequestUnsupported("did not receive partial content or ok")
@@ -253,14 +260,14 @@ def _try_initial_chunk_request(self) -> tuple[int, bytes]:
             return (file_length, tail.content)
         raise HTTPRangeRequestUnsupported(f"could not parse content-range: {range_arg}")
 
-    def _extract_content_length(self) -> tuple[int, Optional[bytes]]:
+    def _extract_content_length(self, initial_chunk_size: int) -> tuple[int, Optional[bytes]]:
         domain = urlparse(self._url).netloc
         if domain in self._domains_without_negative_range:
             return (self._content_length_from_head(), None)
 
         # Initial range request for just the end of the file.
         try:
-            return self._try_initial_chunk_request()
+            return self._try_initial_chunk_request(initial_chunk_size)
         except HTTPError as e:
             resp = e.response
             code = resp.status_code
@@ -306,7 +313,7 @@ def _stay(self) -> Iterator[None]:
     def _check_zip(self) -> None:
         """Check and download until the file is a valid ZIP."""
         end = self._length - 1
-        for start in reversed(range(0, end, self._chunk_size)):
+        for start in reversed(range(0, end, CONTENT_CHUNK_SIZE)):
             self._download(start, end)
             with self._stay():
                 try:
@@ -363,23 +370,5 @@ def _download(self, start: int, end: int) -> None:
             for start, end in self._merge(start, end, left, right):
                 response = self._stream_response(start, end)
                 self.seek(start)
-                for chunk in response.iter_content(self._chunk_size):
+                for chunk in response.iter_content(CONTENT_CHUNK_SIZE):
                     self._file.write(chunk)
-
-    def prefetch_dist_info(self) -> None:
-        """
-        Read contents of entire dist-info section of wheel.
-
-        pip wants to read WHEEL and METADATA.
-        """
-        with self._stay():
-            zf = ZipFile(self)
-            infolist = zf.infolist()
-            for info in infolist:
-                # should be (wheel filename without extension etc) + (.dist-info/)
-                if ".dist-info/" in info.filename:
-                    start = info.header_offset
-                    end = zf.start_dir
-                    self.seek(start)
-                    self.read(end - start)
-                    break
diff --git a/src/pip/_internal/utils/wheel.py b/src/pip/_internal/utils/wheel.py
@@ -70,6 +70,7 @@ def wheel_dist_info_dir(source: ZipFile, name: str) -> str:
 
 def read_wheel_metadata_file(source: ZipFile, path: str) -> bytes:
     try:
+        logger.debug("extracting entry '%s' from zip '%s'", path, source.fp.name)
         return source.read(path)
         # BadZipFile for general corruption, KeyError for missing entry,
         # and RuntimeError for password-protected files