From f28046bfa240a2e339afd2386cc449f5751276cc Mon Sep 17 00:00:00 2001 From: John Sirois Date: Mon, 30 Sep 2024 18:24:46 -0700 Subject: [PATCH] Fix Zip extraction UTF-8 handling for Python 2.7. (#2546) Previously, zips with non-ASCII file names could cause extraction errors under Python 2.7 on Linux. Fixes #298 --- CHANGES.md | 9 +++ pex/common.py | 119 +++++++++++++++++++++++----- pex/layout.py | 11 +-- pex/version.py | 2 +- tests/integration/test_issue_298.py | 84 ++++++++++++++++++++ tests/test_common.py | 10 +-- tests/test_enum.py | 6 +- 7 files changed, 201 insertions(+), 40 deletions(-) create mode 100644 tests/integration/test_issue_298.py diff --git a/CHANGES.md b/CHANGES.md index e63e88a3a..fe7fb412d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,14 @@ # Release Notes +## 2.20.2 + +This release fixes an old bug handling certain sdist zips under +Python 2.7 as well missing support for Python 3.13's `PYTHON_COLORS` +env var. + +* Fix Zip extraction UTF-8 handling for Python 2.7. (#2546) +* Add repl support for `PYTHON_COLORS`. (#2545) + ## 2.20.1 This release fixes Pex `--interpreter-constraint` handling such that diff --git a/pex/common.py b/pex/common.py index 0799e4969..f3baa821d 100644 --- a/pex/common.py +++ b/pex/common.py @@ -19,9 +19,10 @@ from collections import defaultdict, namedtuple from datetime import datetime from uuid import uuid4 +from zipfile import ZipFile, ZipInfo from pex.enum import Enum -from pex.typing import TYPE_CHECKING +from pex.typing import TYPE_CHECKING, cast if TYPE_CHECKING: from typing import ( @@ -43,7 +44,7 @@ Union, ) - _Text = TypeVar("_Text", str, Text) + _Text = TypeVar("_Text", bytes, str, Text) # We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of # https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). @@ -170,14 +171,25 @@ def teardown(self): _MKDTEMP_SINGLETON = MktempTeardownRegistry() -class PermPreservingZipFile(zipfile.ZipFile, object): - """A ZipFile that works around https://bugs.python.org/issue15795.""" +class ZipFileEx(ZipFile): + """A ZipFile that works around several issues in the stdlib. + + See: + + https://bugs.python.org/issue15795 + + https://github.com/pex-tool/pex/issues/298 + """ class ZipEntry(namedtuple("ZipEntry", ["info", "data"])): pass @classmethod - def zip_entry_from_file(cls, filename, arcname=None, date_time=None): + def zip_entry_from_file( + cls, + filename, # type: str + arcname=None, # type: Optional[str] + date_time=None, # type: Optional[time.struct_time] + ): + # type: (...) -> ZipEntry """Construct a ZipEntry for a file on the filesystem. Usually a similar `zip_info_from_file` method is provided by `ZipInfo`, but it is not @@ -196,27 +208,43 @@ def zip_entry_from_file(cls, filename, arcname=None, date_time=None): arcname += "/" if date_time is None: date_time = time.localtime(st.st_mtime) - zinfo = zipfile.ZipInfo(filename=arcname, date_time=date_time[:6]) - zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes + zip_info = zipfile.ZipInfo(filename=arcname, date_time=date_time[:6]) + zip_info.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes if isdir: - zinfo.file_size = 0 - zinfo.external_attr |= 0x10 # MS-DOS directory flag - zinfo.compress_type = zipfile.ZIP_STORED + zip_info.file_size = 0 + zip_info.external_attr |= 0x10 # MS-DOS directory flag + zip_info.compress_type = zipfile.ZIP_STORED data = b"" else: - zinfo.file_size = st.st_size - zinfo.compress_type = zipfile.ZIP_DEFLATED + zip_info.file_size = st.st_size + zip_info.compress_type = zipfile.ZIP_DEFLATED with open(filename, "rb") as fp: data = fp.read() - return cls.ZipEntry(info=zinfo, data=data) + return cls.ZipEntry(info=zip_info, data=data) - def _extract_member(self, member, targetpath, pwd): - result = super(PermPreservingZipFile, self)._extract_member(member, targetpath, pwd) + def _extract_member( + self, + member, # type: Union[str, ZipInfo] + targetpath, # type: str + pwd, # type: Optional[bytes] + ): + # type: (...) -> str + + # MyPy doesn't see the superclass private method. + result = super(ZipFileEx, self)._extract_member( # type: ignore[misc] + member, targetpath, pwd + ) info = member if isinstance(member, zipfile.ZipInfo) else self.getinfo(member) self._chmod(info, result) - return result + return cast(str, result) + + @staticmethod + def _chmod( + info, # type: ZipInfo + path, # type: Text + ): + # type: (...) -> None - def _chmod(self, info, path): # This magic works to extract perm bits from the 32 bit external file attributes field for # unix-created zip files, for the layout, see: # https://www.forensicswiki.org/wiki/ZIP#External_file_attributes @@ -224,10 +252,57 @@ def _chmod(self, info, path): attr = info.external_attr >> 16 os.chmod(path, attr) + # Python 3 also takes PathLike[str] for the path arg, but we only ever pass str since we support + # Python 2.7 and don't use pathlib as a result. + def extractall( # type: ignore[override] + self, + path=None, # type: Optional[str] + members=None, # type: Optional[Iterable[Union[str, ZipInfo]]] + pwd=None, # type: Optional[bytes] + ): + # type: (...) -> None + if sys.version_info[0] != 2: + return super(ZipFileEx, self).extractall(path=path, members=members, pwd=pwd) + + # Under Python 2.7, ZipFile does not handle Zip entry name encoding correctly. Older Zip + # standards supported IBM code page 437 and newer support UTF-8. The newer support is + # indicated by the bit 11 flag in the file header. + # From https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT section + # "4.4.4 general purpose bit flag: (2 bytes)": + # + # Bit 11: Language encoding flag (EFS). If this bit is set, + # the filename and comment fields for this file + # MUST be encoded using UTF-8. (see APPENDIX D) + # + # N.B.: MyPy fails to see this code can be reached for Python 2.7. + efs_bit = 1 << 11 # type: ignore[unreachable] + + target_path = path or os.getcwd() + for member in members or self.infolist(): + info = member if isinstance(member, ZipInfo) else self.getinfo(member) + encoding = "utf-8" if info.flag_bits & efs_bit else "cp437" + member_path = info.filename.encode(encoding) + target = target_path.encode(encoding) + + rel_dir = os.path.dirname(member_path) + abs_dir = os.path.join(target, rel_dir) + abs_path = os.path.join(abs_dir, os.path.basename(member_path)) + if member_path.endswith(b"/"): + safe_mkdir(abs_path) + else: + safe_mkdir(abs_dir) + with open(abs_path, "wb") as tfp, self.open(info) as zf_entry: + shutil.copyfileobj(zf_entry, tfp) + self._chmod(info, abs_path) + @contextlib.contextmanager -def open_zip(path, *args, **kwargs): - # type: (Text, *Any, **Any) -> Iterator[PermPreservingZipFile] +def open_zip( + path, # type: Text + *args, # type: Any + **kwargs # type: Any +): + # type: (...) -> Iterator[ZipFileEx] """A contextmanager for zip files. Passes through positional and kwargs to zipfile.ZipFile. @@ -237,8 +312,8 @@ def open_zip(path, *args, **kwargs): # extensions across all Pex supported Pythons. kwargs.setdefault("allowZip64", True) - with contextlib.closing(PermPreservingZipFile(path, *args, **kwargs)) as zip: - yield zip + with contextlib.closing(ZipFileEx(path, *args, **kwargs)) as zip_fp: + yield zip_fp def deterministic_walk(*args, **kwargs): @@ -336,7 +411,7 @@ def safe_delete(filename): def safe_rmtree(directory): - # type: (Text) -> None + # type: (_Text) -> None """Delete a directory if it's present. If it's not present, no-op. diff --git a/pex/layout.py b/pex/layout.py index 2576ec9a6..d8ee4b54c 100644 --- a/pex/layout.py +++ b/pex/layout.py @@ -11,14 +11,7 @@ from pex.atomic_directory import atomic_directory from pex.cache import access as cache_access from pex.cache.dirs import CacheDir -from pex.common import ( - PermPreservingZipFile, - is_script, - open_zip, - safe_copy, - safe_mkdir, - safe_mkdtemp, -) +from pex.common import ZipFileEx, is_script, open_zip, safe_copy, safe_mkdir, safe_mkdtemp from pex.enum import Enum from pex.tracer import TRACER from pex.typing import TYPE_CHECKING @@ -395,7 +388,7 @@ def close(self): def open(self): # type: () -> zipfile.ZipFile if self._zfp is None: - self._zfp = PermPreservingZipFile(self.path) + self._zfp = ZipFileEx(self.path) return self._zfp def __getstate__(self): diff --git a/pex/version.py b/pex/version.py index 451441830..3d29bb977 100644 --- a/pex/version.py +++ b/pex/version.py @@ -1,4 +1,4 @@ # Copyright 2015 Pex project contributors. # Licensed under the Apache License, Version 2.0 (see LICENSE). -__version__ = "2.20.1" +__version__ = "2.20.2" diff --git a/tests/integration/test_issue_298.py b/tests/integration/test_issue_298.py new file mode 100644 index 000000000..caf5fc26d --- /dev/null +++ b/tests/integration/test_issue_298.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +# Copyright 2024 Pex project contributors. +# Licensed under the Apache License, Version 2.0 (see LICENSE). + +from __future__ import absolute_import + +import os.path +import shutil +import subprocess +from textwrap import dedent + +from pex.common import safe_open +from pex.fetcher import URLFetcher +from pex.typing import TYPE_CHECKING +from testing.docker import skip_unless_docker + +if TYPE_CHECKING: + from typing import Any + + +@skip_unless_docker +def test_confounding_encoding( + tmpdir, # type: Any + pex_project_dir, # type: str +): + # type: (...) -> None + + sdists = os.path.join(str(tmpdir), "sdists") + with safe_open( + os.path.join(sdists, "CherryPy-7.1.0.zip"), "wb" + ) as write_fp, URLFetcher().get_body_stream( + "https://files.pythonhosted.org/packages/" + "b7/dd/e95de2d7042bd53009e8673ca489effebd4a35d9b64b75ecfcca160efaf6/CherryPy-7.1.0.zip" + ) as read_fp: + shutil.copyfileobj(read_fp, write_fp) + + dest = os.path.join(str(tmpdir), "dest") + subprocess.check_call( + args=[ + "docker", + "run", + "--rm", + "-v", + "{code}:/code".format(code=pex_project_dir), + "-w", + "/code", + "-v", + "{sdists}:/sdists".format(sdists=sdists), + "-v", + "{dest}:/dest".format(dest=dest), + "-e", + "LANG=en_US.ISO-8859-1", + "python:2.7-slim", + "python2.7", + "-c", + dedent( + """\ + from __future__ import absolute_import + + import os + import zipfile + + from pex.common import open_zip + + + with zipfile.ZipFile("/sdists/CherryPy-7.1.0.zip") as zf: + try: + zf.extractall("/dest") + raise AssertionError( + "Expected standard Python 2.7 ZipFile.extractall to fail." + ) + except UnicodeEncodeError as e: + pass + + with open_zip("/sdists/CherryPy-7.1.0.zip") as zf: + zf.extractall("/dest") + """ + ), + ] + ) + + assert os.path.isfile( + os.path.join(dest, "CherryPy-7.1.0", "cherrypy", "test", "static", "Слава Україні.html") + ) diff --git a/tests/test_common.py b/tests/test_common.py index 96ce3ade8..e1fabeb25 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -9,7 +9,7 @@ from pex.common import ( Chroot, - PermPreservingZipFile, + ZipFileEx, can_write_dir, chmod_plus_x, deterministic_walk, @@ -51,7 +51,7 @@ def zip_fixture(): assert extract_perms(one) != extract_perms(two) zip_file = os.path.join(target_dir, "test.zip") - with contextlib.closing(PermPreservingZipFile(zip_file, "w")) as zf: + with contextlib.closing(ZipFileEx(zip_file, "w")) as zf: zf.write(one, "one") zf.write(two, "two") @@ -61,7 +61,7 @@ def zip_fixture(): def test_perm_preserving_zipfile_extractall(): # type: () -> None with zip_fixture() as (zip_file, extract_dir, one, two): - with contextlib.closing(PermPreservingZipFile(zip_file)) as zf: + with contextlib.closing(ZipFileEx(zip_file)) as zf: zf.extractall(extract_dir) assert extract_perms(one) == extract_perms(os.path.join(extract_dir, "one")) @@ -71,7 +71,7 @@ def test_perm_preserving_zipfile_extractall(): def test_perm_preserving_zipfile_extract(): # type: () -> None with zip_fixture() as (zip_file, extract_dir, one, two): - with contextlib.closing(PermPreservingZipFile(zip_file)) as zf: + with contextlib.closing(ZipFileEx(zip_file)) as zf: zf.extract("one", path=extract_dir) zf.extract("two", path=extract_dir) @@ -98,7 +98,7 @@ def assert_chroot_perms(copyfn): zip_path = os.path.join(src, "chroot.zip") chroot.zip(zip_path) with temporary_dir() as extract_dir: - with contextlib.closing(PermPreservingZipFile(zip_path)) as zf: + with contextlib.closing(ZipFileEx(zip_path)) as zf: zf.extractall(extract_dir) assert extract_perms(one) == extract_perms(os.path.join(extract_dir, "one")) diff --git a/tests/test_enum.py b/tests/test_enum.py index 759b688e5..8d0af7946 100644 --- a/tests/test_enum.py +++ b/tests/test_enum.py @@ -5,7 +5,7 @@ import pytest from pex.atomic_directory import AtomicDirectory -from pex.common import PermPreservingZipFile +from pex.common import ZipFileEx from pex.compatibility import PY2 from pex.enum import Enum, qualified_name @@ -116,9 +116,9 @@ def test_qualified_name(): AtomicDirectory.work_dir ), "Expected @property to be handled." - expected_prefix = "pex.common." if PY2 else "pex.common.PermPreservingZipFile." + expected_prefix = "pex.common." if PY2 else "pex.common.ZipFileEx." assert expected_prefix + "zip_entry_from_file" == qualified_name( - PermPreservingZipFile.zip_entry_from_file + ZipFileEx.zip_entry_from_file ), "Expected @classmethod to be handled." class Test(object):