diff --git a/.docker/docker-compose.ci-test.yml b/.docker/docker-compose.ci-test.yml index e47ea4a..c01b639 100644 --- a/.docker/docker-compose.ci-test.yml +++ b/.docker/docker-compose.ci-test.yml @@ -4,9 +4,9 @@ version: "3" services: - gotenberg: + gotenberg-client-test-server: image: docker.io/gotenberg/gotenberg:7.9.2 - hostname: gotenberg - container_name: gotenberg + hostname: gotenberg-client-test-server + container_name: gotenberg-client-test-server network_mode: host restart: unless-stopped diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d166e59..0c7e7a0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,6 +64,11 @@ jobs: docker compose --file ${GITHUB_WORKSPACE}/.docker/docker-compose.ci-test.yml up --detach echo "Wait for container to be started" sleep 5 + - + name: Install poppler-utils + run: | + sudo apt-get update + sudo apt-get install --yes --no-install-reccomends poppler-utils - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -72,7 +77,9 @@ jobs: cache: 'pip' - name: Install Hatch - run: pip install --upgrade hatch + run: | + python3 -m pip install --upgrade pip + pip install --upgrade hatch - name: Run tests run: hatch run cov diff --git a/CHANGELOG.md b/CHANGELOG.md index de445a1..97c805c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed + +- An issue with the sorting of merging PDFs. Expanded testing to cover the merged ordering + ## [0.2.0] - 2023-10-16 ### Added diff --git a/src/gotenberg_client/_base.py b/src/gotenberg_client/_base.py index 41a69e8..4beb317 100644 --- a/src/gotenberg_client/_base.py +++ b/src/gotenberg_client/_base.py @@ -79,23 +79,22 @@ def get_files(self) -> RequestFiles: files = {} for filename in self._file_map: file_path = self._file_map[filename] - # Gotenberg requires these to have the specific name - filepath_name = filename if filename in {"index.html", "header.html", "footer.html"} else file_path.name # Helpful but not necessary to provide the mime type when possible mime_type = guess_mime_type(file_path) if mime_type is not None: files.update( - {filepath_name: (filepath_name, self._stack.enter_context(file_path.open("rb")), mime_type)}, + {filename: (filename, self._stack.enter_context(file_path.open("rb")), mime_type)}, ) else: # pragma: no cover - files.update({filepath_name: (filepath_name, self._stack.enter_context(file_path.open("rb")))}) # type: ignore + files.update({filename: (filename, self._stack.enter_context(file_path.open("rb")))}) # type: ignore return files def _add_file_map(self, filepath: Path, name: Optional[str] = None) -> None: """ Small helper to handle bookkeeping of files for later opening. The name is optional to support those things which are required to have a certain name + generally for ordering or just to be found at all """ if name is None: name = filepath.name diff --git a/src/gotenberg_client/_merge.py b/src/gotenberg_client/_merge.py index 52848a1..70ff94e 100644 --- a/src/gotenberg_client/_merge.py +++ b/src/gotenberg_client/_merge.py @@ -4,6 +4,8 @@ from pathlib import Path from typing import List +from httpx import Client + from gotenberg_client._base import BaseApi from gotenberg_client._base import BaseRoute @@ -13,15 +15,20 @@ class MergeRoute(BaseRoute): Handles the merging of a given set of files """ + def __init__(self, client: Client, api_route: str) -> None: + super().__init__(client, api_route) + self._next = 1 + def merge(self, files: List[Path]) -> "MergeRoute": """ Adds the given files into the file mapping. This method will maintain the ordering of the list. Calling this method multiple times may not merge in the expected ordering """ - for idx, filepath in enumerate(files): + for filepath in files: # Include index to enforce ordering - self._add_file_map(filepath, f"{idx}_{filepath.name}") + self._add_file_map(filepath, f"{self._next}_{filepath.name}") + self._next += 1 return self diff --git a/src/gotenberg_client/_types_compat.py b/src/gotenberg_client/_types_compat.py index 91bfbc1..8ccf339 100644 --- a/src/gotenberg_client/_types_compat.py +++ b/src/gotenberg_client/_types_compat.py @@ -4,7 +4,7 @@ import sys -if sys.version_info >= (3, 11): +if sys.version_info >= (3, 11): # pragma: no cover from typing import Self -else: +else: # pragma: no cover from typing_extensions import Self # noqa: F401 diff --git a/src/gotenberg_client/_utils.py b/src/gotenberg_client/_utils.py index eec25ff..5c80146 100644 --- a/src/gotenberg_client/_utils.py +++ b/src/gotenberg_client/_utils.py @@ -19,7 +19,7 @@ def optional_to_form(value: Optional[Union[bool, int, float, str]], name: str) - return {name: str(value).lower()} -def guess_mime_type_stdlib(url: Path) -> Optional[str]: +def guess_mime_type_stdlib(url: Path) -> Optional[str]: # pragma: no cover """ Uses the standard library to guess a mimetype """ diff --git a/tests/samples/a_merge_second.pdf b/tests/samples/a_merge_second.pdf new file mode 100644 index 0000000..2955c8d Binary files /dev/null and b/tests/samples/a_merge_second.pdf differ diff --git a/tests/samples/sample1.pdf b/tests/samples/sample1.pdf old mode 100755 new mode 100644 diff --git a/tests/samples/z_first_merge.pdf b/tests/samples/z_first_merge.pdf new file mode 100644 index 0000000..4f74613 Binary files /dev/null and b/tests/samples/z_first_merge.pdf differ diff --git a/tests/test_merge.py b/tests/test_merge.py index fc869fc..3239dba 100644 --- a/tests/test_merge.py +++ b/tests/test_merge.py @@ -1,7 +1,7 @@ import shutil +import subprocess import tempfile from pathlib import Path -from typing import List import pikepdf import pytest @@ -15,18 +15,27 @@ from tests.utils import call_run_with_server_error_handling -@pytest.fixture() -def create_files(): +def extract_text(pdf_path: Path) -> str: """ - Creates 2 files in a temporary directory and cleans them up - after their use + Using pdftotext from poppler, extracts the text of a PDF into a file, + then reads the file contents and returns it """ - temp_dir = Path(tempfile.mkdtemp()) - test_file = SAMPLE_DIR / "sample1.pdf" - other_test_file = temp_dir / "sample2.pdf" - other_test_file.write_bytes(test_file.read_bytes()) - yield [test_file, other_test_file] - shutil.rmtree(temp_dir, ignore_errors=True) + with tempfile.NamedTemporaryFile( + mode="w+", + ) as tmp: + subprocess.run( + [ # noqa: S603 + shutil.which("pdftotext"), + "-q", + "-layout", + "-enc", + "UTF-8", + str(pdf_path), + tmp.name, + ], + check=True, + ) + return tmp.read() class TestMergePdfs: @@ -37,12 +46,15 @@ class TestMergePdfs: def test_merge_files_pdf_a( self, client: GotenbergClient, - create_files: List[Path], gt_format: PdfAFormat, pike_format: str, ): with client.merge.merge() as route: - resp = call_run_with_server_error_handling(route.merge(create_files).pdf_format(gt_format)) + resp = call_run_with_server_error_handling( + route.merge([SAMPLE_DIR / "z_first_merge.pdf", SAMPLE_DIR / "a_merge_second.pdf"]).pdf_format( + gt_format, + ), + ) assert resp.status_code == codes.OK assert "Content-Type" in resp.headers @@ -58,14 +70,31 @@ def test_merge_files_pdf_a( if SAVE_OUTPUTS: (SAVE_DIR / f"test_libre_office_convert_xlsx_format_{pike_format}.pdf").write_bytes(resp.content) - def test_pdf_a_multiple_file( + def test_merge_multiple_file( self, client: GotenbergClient, - create_files: List[Path], ): - with client.merge.merge() as route: - resp = call_run_with_server_error_handling(route.merge(create_files)) + if shutil.which("pdftotext") is None: + pytest.skip("No pdftotext executable found") + else: + with client.merge.merge() as route: + # By default, these would not merge correctly + route.merge([SAMPLE_DIR / "z_first_merge.pdf", SAMPLE_DIR / "a_merge_second.pdf"]) + resp = call_run_with_server_error_handling(route) + + assert resp.status_code == codes.OK + assert "Content-Type" in resp.headers + assert resp.headers["Content-Type"] == "application/pdf" + + with tempfile.NamedTemporaryFile(mode="wb") as tmp: + tmp.write(resp.content) + + text = extract_text(tmp.name) + lines = text.split("\n") + # Extra is empty line + assert len(lines) == 3 + assert "first PDF to be merged." in lines[0] + assert "second PDF to be merged." in lines[1] - assert resp.status_code == codes.OK - assert "Content-Type" in resp.headers - assert resp.headers["Content-Type"] == "application/pdf" + if SAVE_OUTPUTS: + (SAVE_DIR / "test_pdf_a_multiple_file.pdf").write_bytes(resp.content) diff --git a/tests/utils.py b/tests/utils.py index 0db432b..29f97d7 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -33,16 +33,16 @@ def call_run_with_server_error_handling(route: BaseRoute) -> Response: one attempt to parse. This will wait the following: - - Attempt 1 - 20s following failure - - Attempt 2 - 40s following failure - - Attempt 3 - 80s following failure - - Attempt 4 - 160s - - Attempt 5 - 320s + - Attempt 1 - 5s following failure + - Attempt 2 - 10s following failure + - Attempt 3 - 20s following failure + - Attempt 4 - 40s following failure + - Attempt 5 - 80s following failure """ result = None succeeded = False - retry_time = 20.0 + retry_time = 5.0 retry_count = 0 max_retry_count = 5