diff --git a/.github/workflows/deploy-package-to-pypi.yml b/.github/workflows/deploy-package-to-pypi.yml index 8257c1a..e8bd265 100644 --- a/.github/workflows/deploy-package-to-pypi.yml +++ b/.github/workflows/deploy-package-to-pypi.yml @@ -5,29 +5,161 @@ on: types: [published] jobs: - build: + build-source: + name: Build source package runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-python@v3 + - name: Update version in pyproject.toml from current git tag + run: | + sed -i "s/0\\.0\\.0\\.dev0/${GITHUB_REF_NAME}/g" pyproject.toml + + - uses: actions/setup-python@v4 + with: + python-version: "3.13" # quoted so YAML keeps the version a string, like the matrix entries + + - name: Build package + run: | + pip install build + python -m build --sdist + + - uses: actions/upload-artifact@v4 with: - python-version: 3.11 + name: source + path: ./dist + + build-linux: + name: Build Linux wheels + runs-on: ubuntu-latest + strategy: + matrix: + image: + - "manylinux2014_x86_64" + - "musllinux_1_1_x86_64" + - "manylinux2014_aarch64" + - "musllinux_1_1_aarch64" + folder: + - "cp37-cp37m" + - "cp38-cp38" + - "cp39-cp39" + - "cp310-cp310" + - "cp311-cp311" + - "cp312-cp312" + - "cp313-cp313" + + steps: + - uses: actions/checkout@v4 - name: Update version in pyproject.toml from current git tag + run: | + sed -i "s/0\\.0\\.0\\.dev0/${GITHUB_REF_NAME}/g" pyproject.toml + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + with: + platforms: arm64 + + - name: Build packages run: >- - sed -i "s/0\\.0\\.0\\.dev0/${GITHUB_REF/refs\/tags\/v/}/g" pyproject.toml + docker run --rm -v ${{ github.workspace }}:/app quay.io/pypa/${{ matrix.image }} bash -c ' + cd /app && + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && + . 
"$HOME/.cargo/env" && + /opt/python/${{ matrix.folder }}/bin/python -m build --wheel && + auditwheel repair $(ls dist/*.whl) && + rm dist/*.whl && + cp wheelhouse/*.whl dist + ' - - run: | + - uses: actions/upload-artifact@v4 + with: + name: linux-${{ matrix.image }}-${{ matrix.folder }} # unique per matrix cell, as upload-artifact@v4 requires + path: ./dist + + build-macos: + name: Build macOS wheels + strategy: + matrix: + os: + - "macos-12" + - "macos-13" + - "macos-14" # ARM + python-version: + - "3.7.1" + - "3.8.10" + - "3.9.13" + - "3.10.11" + - "3.11.9" + - "3.12.6" + - "3.13.0" + exclude: + - python-version: "3.7.1" + os: "macos-14" + runs-on: '${{ matrix.os }}' + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: '${{ matrix.python-version }}' + + - name: Update version in pyproject.toml from current git tag + run: | + sed -i "" "s/0\\.0\\.0\\.dev0/${GITHUB_REF_NAME}/g" pyproject.toml + + - name: Build package + run: | + pip install build + python -m build --wheel + + - uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.os }}-${{ matrix.python-version }} + path: ./dist + + build-windows: + name: Build Windows wheels + strategy: + matrix: + os: + - "windows-2019" + python-version: + - "3.7.1" + - "3.8.0" + - "3.9.0" + - "3.10.0" + - "3.11.0" + - "3.12.0" + - "3.13.0" + runs-on: '${{ matrix.os }}' + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: '${{ matrix.python-version }}' + + - name: Update version in pyproject.toml from current git tag + run: | + (Get-Content pyproject.toml).Replace('0.0.0.dev0', $Env:GITHUB_REF_NAME) | Set-Content pyproject.toml + + - name: Build package + run: | pip install build - python -m build + python -m build --wheel - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: + name: ${{ matrix.os }}-${{ matrix.python-version }} path: ./dist deploy: - needs: [build] + needs: + - build-source + - build-linux + - build-macos + - build-windows environment: name: 
pypi url: https://pypi.org/project/stream-unzip/ @@ -37,9 +169,20 @@ jobs: permissions: id-token: write steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 + with: + path: ./dist + + # The "merge-multiple" option of download-artifact seems to cause corruption when there are + # multiple files of the same name, which happens because some different macOS versions + # make the exact same Python package. So we avoid that and do a manual move of packages + # to the top level for upload + - name: Move packages to top level + run: | + find ./dist -mindepth 2 -type f -exec mv -t ./dist -i '{}' + + rm -R -- ./dist/*/ - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: - packages_dir: artifact/ + packages_dir: ./dist/ diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..578af57 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,181 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + +[[package]] +name = "libc" +version = "0.2.159" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "portable-atomic" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" + +[[package]] +name = "proc-macro2" +version = "1.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d922163ba1f79c04bc49073ba7b32fd5a8d3b76a87c955921234b8e77333c51" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94845622d88ae274d2729fcefc850e63d7a3ddff5e3ce11bd88486db9f1d357d" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e655aad15e09b94ffdb3ce3d217acf652e26bbc37697ef012f5e5e348c716e5e" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1e3f09eecd94618f60a455a23def79f79eba4dc561a97324bf9ac8c6df30ce" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "stream_unzip_zipcrypto_decrypt" +version = "0.1.0" +dependencies = [ + "crc32fast", + "pyo3", +] + +[[package]] +name = "syn" +version = "2.0.79" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d886801 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "stream_unzip_zipcrypto_decrypt" +version = "0.1.0" +edition = "2021" + +[lib] +name = "stream_unzip_zipcrypto_decrypt" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.22.5", features = ["extension-module", "gil-refs"] } +crc32fast = "1.4.2" diff --git a/README.md b/README.md index 22ad42d..a964489 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ In addition to being memory efficient, stream-unzip supports: - WinZip-style AES-encrypted / password-protected ZIPs. Python's zipfile module cannot open AES-encrypted ZIPs. -- Legacy-encrypted / password-protected ZIP files. This is also known as ZipCrypto/Zip 2.0. +- Legacy-encrypted / password-protected ZIP files. This is also known as ZipCrypto/Zip 2.0. Decrypting ZipCrypto with stream-unzip is approximately 10 times faster than Python's zipfile module. - ZIP files created by Java's ZipOutputStream that are larger than 4GiB. At the time of writing libarchive-based stream readers cannot read these without error. 
diff --git a/docs/features.md b/docs/features.md index 54a073f..4867ba4 100644 --- a/docs/features.md +++ b/docs/features.md @@ -15,7 +15,7 @@ In addition to being memory efficient, stream-unzip supports: - WinZip-style AES-encrypted / password-protected ZIPs. Python's zipfile module cannot open AES-encrypted ZIPs. -- Legacy-encrypted / password-protected ZIP files. This is also known as ZipCrypto/Zip 2.0. +- Legacy-encrypted / password-protected ZIP files. This is also known as ZipCrypto/Zip 2.0. Decrypting ZipCrypto with stream-unzip is approximately 10 times faster than Python's zipfile module. - ZIP files created by Java's ZipOutputStream that are larger than 4GiB. At the time of writing libarchive-based stream readers cannot read these without error. diff --git a/pyproject.toml b/pyproject.toml index 3f32316..5bd7158 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" +requires = ["maturin>=0.12,<0.13"] +build-backend = "maturin" [project] name = "stream-unzip" @@ -37,7 +37,8 @@ ci = [ "Documentation" = "https://stream-unzip.docs.trade.gov.uk/" "Source" = "https://github.com/uktrade/stream-unzip" -[tool.hatch.build] +[tool.maturin] include = [ "stream_unzip.py", + "src/**" ] diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..0e4f4ed --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,91 @@ +use pyo3::prelude::*; +use pyo3::types::PyBytes; +use crc32fast::Hasher; + +// ZipCrypto key initialization vector and constants +const ZIPCRYPTO_KEY0: u32 = 0x12345678; +const ZIPCRYPTO_KEY1: u32 = 0x23456789; +const ZIPCRYPTO_KEY2: u32 = 0x34567890; + +#[derive(Clone)] +struct ZipCrypto { + keys: [u32; 3], +} + +impl ZipCrypto { + fn new() -> Self { + ZipCrypto { + keys: [ZIPCRYPTO_KEY0, ZIPCRYPTO_KEY1, ZIPCRYPTO_KEY2], + } + } + + #[inline(always)] + fn init_password(&mut self, password: &[u8]) { + for &b in password { + self.update_keys(b); + } + } + + #[inline(always)] + 
fn update_keys(&mut self, byte: u8) { + self.keys[0] = !crc32_update(!self.keys[0], &[byte]); + + self.keys[1] = self + .keys[1] + .wrapping_add(self.keys[0] & 0xFF) + .wrapping_mul(134775813) + .wrapping_add(1); + + let temp_byte = (self.keys[1] >> 24) as u8; + self.keys[2] = !crc32_update(!self.keys[2], &[temp_byte]); + } + + #[inline(always)] + fn decrypt_byte(&mut self, byte: u8) -> u8 { + let temp = (self.keys[2] | 2) as u16; + let key = (((temp.wrapping_mul(temp ^ 1)) >> 8) & 0xFF) as u8; + let decrypted = byte ^ key; + self.update_keys(decrypted); + decrypted + } + + #[inline(always)] + fn decrypt_chunk(&mut self, chunk: &[u8]) -> Vec<u8> { + chunk.iter().map(|&b| self.decrypt_byte(b)).collect() + } +} + +#[inline(always)] +fn crc32_update(crc: u32, data: &[u8]) -> u32 { + let mut hasher = Hasher::new_with_initial(crc); + hasher.update(data); + hasher.finalize() +} + +#[pyclass] +struct StreamUnzipZipCryptoDecryptor { + zipcrypto: ZipCrypto, +} + +#[pymethods] +impl StreamUnzipZipCryptoDecryptor { + #[new] + fn new(password: &[u8]) -> Self { + let mut zipcrypto = ZipCrypto::new(); + zipcrypto.init_password(password); + StreamUnzipZipCryptoDecryptor { zipcrypto } + } + + // Decrypts one chunk, advancing the cipher key state, and returns the decrypted bytes + fn __call__<'py>(&mut self, py: Python<'py>, chunk: Vec<u8>) -> PyResult<&'py PyBytes> { + let result = self.zipcrypto.decrypt_chunk(&chunk); + // Return the decrypted result as a Python bytes object so it can be used in Python code + Ok(PyBytes::new(py, &result)) + } +} + +#[pymodule] +fn stream_unzip_zipcrypto_decrypt(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::<StreamUnzipZipCryptoDecryptor>()?; + Ok(()) +} diff --git a/stream_unzip.py b/stream_unzip.py index 8d09080..83883c5 100644 --- a/stream_unzip.py +++ b/stream_unzip.py @@ -11,6 +11,8 @@ from stream_inflate import stream_inflate64 +from stream_unzip_zipcrypto_decrypt import StreamUnzipZipCryptoDecryptor + NO_ENCRYPTION = object() ZIP_CRYPTO = object() @@ -226,39 +228,7 @@ def 
get_extra_value(extra, if_true, signature, exception_if_missing, min_length, return value def decrypt_weak_decompress(chunks, decompress, is_done, num_unused): - # There are a few optimisations that make this code unusual: - # - There is code repetition (to avoid function calls inside loops) - # - We assign global variables to local (to avoid the dictionary lookups globals involve) - # - Use bytearray rather than bytes (to avoid allocating memory) - # - Avoids intermediate statements/variables (to minimise unnecessary operations) - # From some light tests these make it ~5%-10% faster than Python's zipfile (although it - # does use similar optimisations from what I can tell) - key_0 = 305419896 - key_1 = 591751049 - key_2 = 878082192 - crc32 = zlib.crc32 - bytearray_byte = bytearray(1) - - def decrypt(chunk): - nonlocal key_0, key_1, key_2 - chunk = bytearray(chunk) - for i, byte in enumerate(chunk): - temp = key_2 | 2 - byte ^= ((temp * (temp ^ 1)) >> 8) & 0xFF - bytearray_byte[0] = byte - key_0 = ~crc32(bytearray_byte, ~key_0) & 0xFFFFFFFF - key_1 = ((((key_1 + (key_0 & 0xFF)) & 0xFFFFFFFF) * 134775813) + 1) & 0xFFFFFFFF - bytearray_byte[0] = key_1 >> 24 - key_2 = ~crc32(bytearray_byte, ~key_2) & 0xFFFFFFFF - chunk[i] = byte - return chunk - - for byte in password: - bytearray_byte[0] = byte - key_0 = ~crc32(bytearray_byte, ~key_0) & 0xFFFFFFFF - key_1 = ((((key_1 + (key_0 & 0xFF)) & 0xFFFFFFFF) * 134775813) + 1) & 0xFFFFFFFF - bytearray_byte[0] = key_1 >> 24 - key_2 = ~crc32(bytearray_byte, ~key_2) & 0xFFFFFFFF + decrypt = StreamUnzipZipCryptoDecryptor(password) encryption_header = decrypt(get_num(12)) check_password_byte = \