From a55de2df66e7c829e153ab5cec552f69c05332ef Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 25 Apr 2024 14:24:58 +0200 Subject: [PATCH 01/15] Windows specific prefix check (magick) --- src/invoice2data/input/tesseract.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py index 7bea12dd..b5fbef01 100644 --- a/src/invoice2data/input/tesseract.py +++ b/src/invoice2data/input/tesseract.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- - +import platform import shutil import tempfile import mimetypes @@ -32,17 +32,21 @@ def to_text(path: str, area_details: dict = None): """ # Check for dependencies. Needs Tesseract and Imagemagick installed. + current_platform = platform.platform() + if current_platform.startswith("win32"): + convert_command_prefix = "magick" + else: + convert_command_prefix = "convert" if not shutil.which("tesseract"): raise EnvironmentError("tesseract not installed.") - if not shutil.which("convert"): + if not shutil.which(convert_command_prefix): raise EnvironmentError("imagemagick not installed.") language = get_languages() logger.debug("tesseract language arg is, %s", language) timeout = 180 - # convert the (multi-page) pdf file to a 300dpi png - convert = [ + convert = [convert_command_prefix] + [ "convert", "-units", "PixelsPerInch", From 3c61b97166dd85d409411c6cc87c72b19cf60704 Mon Sep 17 00:00:00 2001 From: alexm96 <38464076+alexm96@users.noreply.github.com> Date: Thu, 25 Apr 2024 14:33:38 +0200 Subject: [PATCH 02/15] test runner, add windows tests --- .github/workflows/main.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 88698867..22077961 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,7 +9,7 @@ jobs: strategy: matrix: python-version: [3.8, 3.9, 3.10.x, 3.11, 3.12] - os: [ubuntu-latest] + os: [ubuntu-latest, windows-latest] steps: - uses: actions/checkout@v2 @@ -18,10 +18,18 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install system dependencies (Linux) + if: runner.os == 'Linux' run: | sudo apt update sudo apt install tesseract-ocr poppler-utils imagemagick ghostscript pip install -U ocrmypdf + + - name: Install system dependencies (Windows) + if: runner.os == 'Windows' + run: | + choco install tesseract poppler imagemagick ghostscript + pip install -U ocrmypdf + - name: Install testing dependencies run: | pip install -U wheel pip From b5259a2de2b07920a8406b00e39c052e6ad3bf19 Mon Sep 17 00:00:00 2001 From: bosd Date: Wed, 24 Jul 2024 21:45:12 +0200 Subject: [PATCH 03/15] Update main.yml --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 22077961..3cf71e3b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -6,6 +6,8 @@ jobs: test: timeout-minutes: 10 runs-on: ${{ matrix.os }} + env: + PYTHONIOENCODING: "utf8" strategy: matrix: python-version: [3.8, 3.9, 3.10.x, 3.11, 3.12] From d44f7f386a4d6c148dc05ca0b3301b7bb44cdc64 Mon Sep 17 00:00:00 2001 From: bosd Date: Thu, 25 Jul 2024 07:24:21 +0200 Subject: [PATCH 04/15] allow to run tesseract choco script by adding -y flag --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3cf71e3b..70bc2439 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -23,7 +23,7 @@ jobs: if: runner.os == 'Linux' run: | sudo apt update - sudo apt install tesseract-ocr poppler-utils imagemagick ghostscript + sudo apt install tesseract-ocr poppler-utils imagemagick ghostscript -y pip install -U ocrmypdf - name: Install system dependencies (Windows) From 009a3d88cf4b93a6a32409fc93533a761efeaec8 Mon Sep 17 00:00:00 2001 From: bosd Date: Thu, 25 Jul 2024 12:19:36 +0200 Subject: [PATCH 05/15] Update main.yml --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 70bc2439..c86cf864 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -23,13 +23,13 @@ jobs: if: runner.os == 'Linux' run: | sudo apt update - sudo apt install tesseract-ocr poppler-utils imagemagick ghostscript -y + sudo apt install tesseract-ocr poppler-utils imagemagick ghostscript pip install -U ocrmypdf - name: Install system dependencies (Windows) if: runner.os == 'Windows' run: | - choco install tesseract poppler imagemagick ghostscript + choco install tesseract poppler imagemagick ghostscript -y pip install -U ocrmypdf - name: Install testing dependencies From 21269d4234767152601f0a9b56ab0d22c2c6f681 Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 11:21:56 +0100 Subject: [PATCH 06/15] Update choco command --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c86cf864..c1b3f7ab 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -29,7 +29,7 @@ jobs: - name: Install system dependencies (Windows) if: runner.os == 'Windows' run: | - choco install tesseract poppler imagemagick ghostscript -y + choco install --yes --no-progress --ignore-checksums ghostscript tesseract poppler imagemagick pip install -U ocrmypdf - name: Install testing dependencies From 92b859b33caafd3266bd33d8c57a13cf93426036 Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 11:28:33 +0100 Subject: [PATCH 07/15] Update main.yml --- .github/workflows/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c1b3f7ab..995c0849 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -29,7 +29,8 @@ jobs: - name: Install system dependencies (Windows) if: runner.os == 'Windows' run: | - choco install --yes --no-progress --ignore-checksums ghostscript tesseract poppler imagemagick + choco install --yes --no-progress --pre tesseract + choco install --yes --no-progress --ignore-checksums ghostscript poppler imagemagick pip install -U ocrmypdf - name: Install testing dependencies From 7cadce062361ba2c25a8fe7624c2038c937fe8e8 Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 12:07:40 +0100 Subject: [PATCH 08/15] Skip tesseract test on windows --- tests/test_lib.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_lib.py b/tests/test_lib.py index 3e4ba7f6..a78277b7 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -34,6 +34,10 @@ def have_pdfplumber(): needs_pdfplumber = unittest.skipIf(not have_pdfplumber(), reason="requires pdfplumber\n") +skip_on_windows = pytest.mark.skipif( + sys.platform.startswith("win"), + reason="Tesseract executable cannot be found in Windows test environment. FIXME", +) def _extract_data_for_export(): @@ -108,6 +112,7 @@ def test_extract_data_pdfplumber(self): print("Testing pdfplumber with file", file) extract_data(file, None, pdfplumber) + @skip_on_windows def test_tesseract_for_return(self): png_files = get_sample_files('.png') for file in png_files: From 38c0572f31bcaddea5f309dc532cc15b58fd290b Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 12:09:56 +0100 Subject: [PATCH 09/15] Fixup --- tests/test_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.py b/tests/test_lib.py index a78277b7..18f33cf4 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -34,7 +34,7 @@ def have_pdfplumber(): needs_pdfplumber = unittest.skipIf(not have_pdfplumber(), reason="requires pdfplumber\n") -skip_on_windows = pytest.mark.skipif( +skip_on_windows = unittest.skipif( sys.platform.startswith("win"), reason="Tesseract executable cannot be found in Windows test environment. FIXME", ) From 4a1e8c9b9ac7d4ae8cf3c19d647c66d73f531660 Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 12:11:55 +0100 Subject: [PATCH 10/15] Fixup --- tests/test_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.py b/tests/test_lib.py index 18f33cf4..c9278f04 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -34,7 +34,7 @@ def have_pdfplumber(): needs_pdfplumber = unittest.skipIf(not have_pdfplumber(), reason="requires pdfplumber\n") -skip_on_windows = unittest.skipif( +skip_on_windows = unittest.skipIf( sys.platform.startswith("win"), reason="Tesseract executable cannot be found in Windows test environment. FIXME", ) From 8d0c26eff70cecd2cf1ea19815b1deab1a0d6ea7 Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 12:14:25 +0100 Subject: [PATCH 11/15] Fixup --- tests/test_lib.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_lib.py b/tests/test_lib.py index c9278f04..08816dce 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -11,6 +11,7 @@ # https://docs.python.org/3.10/library/unittest.html#test-cases import os +import sys try: from StringIO import StringIO # noqa: F401 From 5164864c272e65d9a2fc2e60b16d6dfae2b9d465 Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 12:29:47 +0100 Subject: [PATCH 12/15] Skip cli tests on windows --- tests/test_cli.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 597b5eca..3a4f9cc3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -12,6 +12,7 @@ import datetime import os +import sys import json import shutil import csv @@ -39,7 +40,10 @@ def have_ocrmypdf(): needs_ocrmypdf = unittest.skipIf(not have_ocrmypdf(), reason="requires ocrmypdf") - +skip_on_windows = unittest.skipIf( + sys.platform.startswith("win"), + reason="Tesseract executable cannot be found in Windows test environment. FIXME", +) class TestCLI(unittest.TestCase): def setUp(self): @@ -74,6 +78,7 @@ def test_debug(self): # TODO: move result comparison to own test module. # TODO: parse output files instaed of comparing them byte-by-byte. + @skip_on_windows def test_content_json(self): input_files = get_sample_files(('.pdf', '.txt')) json_files = get_sample_files('.json') @@ -141,6 +146,7 @@ def test_output_format_date_xml(self): self.assertTrue(False, 'Unexpected date format') os.remove(test_file) + @skip_on_windows def test_copy(self): # folder = pkg_resources.resource_filename(__name__, 'pdfs') directory = os.path.dirname("tests/copy_test/pdf/") @@ -213,6 +219,7 @@ def get_filename_format_test_data(self, filename_format): ) return data + @skip_on_windows def test_copy_with_default_filename_format(self): copy_dir = os.path.join('tests', 'copy_test', 'pdf') # make sure directory is deleted @@ -234,6 +241,7 @@ def test_copy_with_default_filename_format(self): shutil.rmtree(os.path.dirname(copy_dir), ignore_errors=True) + @skip_on_windows def test_copy_with_custom_filename_format(self): copy_dir = os.path.join('tests', 'copy_test', 'pdf') filename_format = "Custom Prefix {date} {invoice_number}.pdf" @@ -274,6 +282,7 @@ def test_area(self): # Where the pdf has to be ocr'd first # before any keywords can be matched + @skip_on_windows @needs_ocrmypdf def test_ocrmypdf(self): pdf_files = get_sample_files("saeco.pdf", exclude_input_specific=False) @@ -303,6 +312,7 @@ def test_ocrmypdf(self): # Test the fallback from pdf to text to ocrmypdf. # with ocrmypdf installed + @skip_on_windows @needs_ocrmypdf def test_fallback_with_ocrmypdf(self): pdf_files = get_sample_files("saeco.pdf", exclude_input_specific=False) From 60ac1bef0affca05e613609b764f430a54930927 Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 12:36:10 +0100 Subject: [PATCH 13/15] Fixup --- tests/test_cli.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 3a4f9cc3..e43ab2ab 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -96,6 +96,7 @@ def test_content_json(self): self.assertTrue(False, 'Failed to verify parsing result for ' + jfile) os.remove(test_files) + @skip_on_windows def test_output_format_date_json(self): pdf_files = get_sample_files('free_fiber.pdf') test_file = 'test_compare.json' @@ -129,6 +130,7 @@ def test_output_format_date_csv(self): self.assertTrue(False, 'Unexpected date format') os.remove(test_file) + @skip_on_windows def test_output_format_date_xml(self): pdf_files = get_sample_files('free_fiber.pdf') test_file = 'test_compare.xml' @@ -263,6 +265,7 @@ def test_copy_with_custom_filename_format(self): shutil.rmtree(os.path.dirname(copy_dir), ignore_errors=True) + @skip_on_windows def test_area(self): pdf_files = get_sample_files('NetpresseInvoice.pdf') test_file = 'test_area.json' From d55d9208d877ba16384c66d2f2108c208841b3ab Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 12:42:18 +0100 Subject: [PATCH 14/15] Refresh env In attempt to let invoice2data find the tesseract executable. --- .github/workflows/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 995c0849..2c56f99e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,6 +30,7 @@ jobs: if: runner.os == 'Windows' run: | choco install --yes --no-progress --pre tesseract + call RefreshEnv.cmd choco install --yes --no-progress --ignore-checksums ghostscript poppler imagemagick pip install -U ocrmypdf From 87d75439f5e7bd77a0a1bd3306406472a964022b Mon Sep 17 00:00:00 2001 From: bosd <11499387+bosd@users.noreply.github.com> Date: Sat, 16 Nov 2024 12:47:25 +0100 Subject: [PATCH 15/15] Fixup --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2c56f99e..41b9531b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,7 +30,7 @@ jobs: if: runner.os == 'Windows' run: | choco install --yes --no-progress --pre tesseract - call RefreshEnv.cmd + refreshenv choco install --yes --no-progress --ignore-checksums ghostscript poppler imagemagick pip install -U ocrmypdf