From 88ffe5b5dfe659b558b651aea679ef755a56a30d Mon Sep 17 00:00:00 2001
From: Jonathan Chang <jonathan@skylight.digital>
Date: Wed, 20 Nov 2024 15:06:41 -0800
Subject: [PATCH] Implement tesseract backend (#375)

* Initial tesserocr

* drop pytesseract

* Use actual raw API backend for confidence score

* ensure PIL image is passed

* Guess at tessdata path

* Install tesseract as part of docker setup

* documentation

* lint check

* Use tesserocr api instead of pathlib shenanigans

* Update docstring

* Fix path detection crash

* Strip tesseract output

* Update tests for tesseract comparisons

* Update CI runs
---
 .github/workflows/ocr-tests.yml   |  1 +
 OCR/Dockerfile                    |  2 +-
 OCR/dev-dockerfile                |  3 +-
 OCR/ocr/services/tesseract_ocr.py | 63 +++++++++++++++++++++++++++++++
 OCR/poetry.lock                   | 37 +++++++++++++++++-
 OCR/pyproject.toml                |  1 +
 OCR/tests/ocr_test.py             | 28 ++++++++------
 7 files changed, 119 insertions(+), 16 deletions(-)
 create mode 100644 OCR/ocr/services/tesseract_ocr.py

diff --git a/.github/workflows/ocr-tests.yml b/.github/workflows/ocr-tests.yml
index 6523fa91..c3754eee 100644
--- a/.github/workflows/ocr-tests.yml
+++ b/.github/workflows/ocr-tests.yml
@@ -17,6 +17,7 @@ jobs:
           python-version: "3.10"
       - name: Install dependencies
         run: |
+          sudo apt install tesseract-ocr-eng tesseract-ocr -y
           python -m pip install --upgrade pip
           pip install poetry
           poetry install --with dev
diff --git a/OCR/Dockerfile b/OCR/Dockerfile
index 3ccc0314..0d54ab72 100644
--- a/OCR/Dockerfile
+++ b/OCR/Dockerfile
@@ -1,6 +1,6 @@
 FROM python:3.10-bullseye
 
-RUN apt-get update && apt-get install ffmpeg libsm6 libxext6  -y
+RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y
 
 RUN pip install poetry
 
diff --git a/OCR/dev-dockerfile b/OCR/dev-dockerfile
index c88c659b..f135fad6 100644
--- a/OCR/dev-dockerfile
+++ b/OCR/dev-dockerfile
@@ -1,7 +1,6 @@
 FROM python:3.10-bullseye
 
-
-RUN apt-get update && apt-get install ffmpeg libsm6 libxext6  -y
+RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y
 
 RUN pip install poetry
 
diff --git a/OCR/ocr/services/tesseract_ocr.py b/OCR/ocr/services/tesseract_ocr.py
new file mode 100644
index 00000000..294d80f1
--- /dev/null
+++ b/OCR/ocr/services/tesseract_ocr.py
@@ -0,0 +1,76 @@
+import os
+
+import tesserocr
+import numpy as np
+from PIL import Image
+
+
+class TesseractOCR:
+    """OCR backend that extracts text from labeled image segments using Tesseract through `tesserocr`."""
+
+    @staticmethod
+    def _guess_tessdata_path(wanted_lang="eng") -> bytes:
+        """
+        Attempts to guess potential locations for the `tessdata` folder.
+
+        The `tessdata` folder is needed to use pre-trained Tesseract OCR data, though the automatic detection
+        provided in `tesserocr` may not be reliable. Instead iterate over common paths on various systems (e.g.,
+        Red Hat, Ubuntu, macOS) and check for the presence of a `tessdata` folder.
+
+        If `TESSDATA_PREFIX` is available in the environment, this function will check that location first.
+        If no candidate location is readable and provides `wanted_lang`, fall back to automatic detection
+        provided by `tesserocr` and the tesseract API.
+
+        `wanted_lang` (str): a desired language to search for. Defaults to English `eng`.
+        """
+        candidate_paths = [
+            "/usr/local/share/tesseract/tessdata",
+            "/usr/share/tesseract/tessdata",
+            "/usr/share/tesseract-ocr/4.00/tessdata",
+            "/opt/homebrew/share/tessdata",
+            "/opt/local/share/tessdata",
+        ]
+
+        # Prepend env variable if defined so it is tried before the built-in guesses.
+        # `list.insert` takes the index first, then the item.
+        if "TESSDATA_PREFIX" in os.environ:
+            candidate_paths.insert(0, os.environ["TESSDATA_PREFIX"])
+
+        # Test candidate paths
+        for path in candidate_paths:
+            # When compiled for certain systems (macOS), libtesseract aborts due to an untrapped exception if it
+            # cannot access the path for any reason (e.g., does not exist, lacks read permissions). Attempt to
+            # enumerate the directory and, if it fails, skip this path.
+            try:
+                os.listdir(path)
+            except OSError:
+                continue
+
+            retpath, langs = tesserocr.get_languages(path)
+            if wanted_lang in langs:
+                return retpath
+
+        # Nothing matched, just return the default path
+        return tesserocr.get_languages()[0]
+
+    def image_to_text(self, segments: dict[str, np.ndarray]) -> dict[str, tuple[str, float]]:
+        """
+        Runs Tesseract over each labeled image segment and collects the recognized text.
+
+        `segments` (dict): maps a label to the image array for that segment. Entries whose image is `None`
+        are skipped and do not appear in the result.
+
+        Returns a dict mapping each processed label to a tuple of the recognized text (with surrounding
+        whitespace stripped) and the mean text confidence as returned by `MeanTextConf()`.
+        """
+        digitized: dict[str, tuple[str, float]] = {}
+        # The API handle is opened once and reused for every segment.
+        with tesserocr.PyTessBaseAPI(path=self._guess_tessdata_path()) as api:
+            for label, image in segments.items():
+                if image is None:
+                    continue
+
+                api.SetImage(Image.fromarray(image))
+                digitized[label] = (api.GetUTF8Text().strip(), api.MeanTextConf())
+
+        return digitized
diff --git a/OCR/poetry.lock b/OCR/poetry.lock
index 86844331..befd0e25 100644
--- a/OCR/poetry.lock
+++ b/OCR/poetry.lock
@@ -2697,6 +2697,41 @@ mpmath = ">=1.1.0,<1.4"
 [package.extras]
 dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"]
 
+[[package]]
+name = "tesserocr"
+version = "2.7.1"
+description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
+optional = false
+python-versions = "*"
+files = [
+    {file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
+    {file = "tesserocr-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bb5d336ebf2cc47cd0d117cadc8b25b2e558f54fb9a2dedaa28a14cb5a6b437"},
+    {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3ff7f6d6b5c12dd31b80842eb0892b661a41ca3edf0e6cc1e54ec2c14552ceef"},
+    {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ae794c5434373f4afa4c7f8b59f19fde810f8caf096d8bb701a4b2f3a6739460"},
+    {file = "tesserocr-2.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0a0895a4d9ff6a34f5a6f203fe0c9899f31d6f2378ae99be80605637b622687b"},
+    {file = "tesserocr-2.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c3187d14b95c866aa1d34cc374a53d583e2168742eefe33347e4790af70338e"},
+    {file = "tesserocr-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec52be3d82136430081427062ad0211a52fc38fa28fe58e216b89f840354f216"},
+    {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:44e71b3e8da36b2567760309398689ea9785ee62db3ff21140a9ea6941a233c4"},
+    {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e31a49d7784e7e52fe656719145c3a872856d67daa9bfb340c2990db00e023e9"},
+    {file = "tesserocr-2.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:37abde15c1c940d691305fd87836e4cad25a1434799729c324bbcd2277bcae44"},
+    {file = "tesserocr-2.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1b6349d35d333d420d24acf1953ad6f1d5613ffcde462c62126b68bdfca12753"},
+    {file = "tesserocr-2.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42f009cde8479f3b339da12a8e419fd9559b64b13bc08a248bd0833c6ae94331"},
+    {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e13204b3b92fac76ece6e33f55eba6335b30e379f4a7b75e285c2ad05762027"},
+    {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:65afdec0c5dc09a4a23a62e65524989cd940af41be1603e251a64ac10de9babf"},
+    {file = "tesserocr-2.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5f59fb072c90bff8aa6a365fc82b747c2668b7b48233901728b155860d1ff9"},
+    {file = "tesserocr-2.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f62d662e3002868384e14e8cd620bdedf34ab9f9fc3ebbce527cfe032a7485ee"},
+    {file = "tesserocr-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e80051812685bd521bc17cb70cf1480ffbb3e54ccc2883e90d5bcda15f8278ea"},
+    {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:2690cb2330fc9349d68ff027cbdac09693fdda36470836b196c04f16dcc99e9d"},
+    {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d01ebd094103451ecb77b6510ade2f6bb064c51413ff35b135f649f3d6067a67"},
+    {file = "tesserocr-2.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8069ae6cd9ea3c056b6a596bc99f501ee9f95d6fd2928fcaffb9777071c210d"},
+    {file = "tesserocr-2.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2d3d23223d0a448877fb91af83c46ce95ff0a497a82fa93e93068148c9712e5"},
+    {file = "tesserocr-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef8a09a44c2e96bab0f40dbf0633767d063680d86b79365b43fc4e1234219694"},
+    {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6e613213ea5b64db06f2cba0b93c3656b7e6aec2d9b2d2e929edf49da7143225"},
+    {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4a8888b765e26680a6e34b8ec09b7bb85a17e08cea76f0661eafe2a84254562a"},
+    {file = "tesserocr-2.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64f25763e56c4c29b808e59b485c930cac46b6a1ac8eadd994086dc40a29d3a1"},
+    {file = "tesserocr-2.7.1.tar.gz", hash = "sha256:3744c5c8bbabf18172849c7731be00dc2e5e44f8c556d37c850e788794ae0af4"},
+]
+
 [[package]]
 name = "tokenizers"
 version = "0.20.0"
@@ -3548,4 +3583,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "539bc8a4aa19c47e8f5eff93dcf57c986d19a9c3197d79071284f8f38d9c7200"
+content-hash = "2993b1bb088823677673c61c126ae057f2032e87edbe3a772fee9b9c21ba2396"
diff --git a/OCR/pyproject.toml b/OCR/pyproject.toml
index 119c5eb8..8b4de10e 100644
--- a/OCR/pyproject.toml
+++ b/OCR/pyproject.toml
@@ -16,6 +16,7 @@ transformers = {extras = ["torch"], version = "^4.45.1"}
 pillow = "^10.3.0"
 
 datasets = "^3.0.1"
+tesserocr = "^2.7.1"
 [tool.poetry.group.dev.dependencies]
 lxml = "^5.3.0"
 docopt = "^0.6.2"
diff --git a/OCR/tests/ocr_test.py b/OCR/tests/ocr_test.py
index 1f6647aa..53450ccb 100644
--- a/OCR/tests/ocr_test.py
+++ b/OCR/tests/ocr_test.py
@@ -2,12 +2,14 @@
 
 import numpy as np
 import cv2 as cv
+import pytest
 
 from ocr.services.image_segmenter import (
     ImageSegmenter,
     segment_by_color_bounding_box,
 )
 from ocr.services.image_ocr import ImageOCR
+from ocr.services.tesseract_ocr import TesseractOCR
 
 
 path = os.path.dirname(__file__)
@@ -19,8 +21,9 @@
 
 
 class TestOCR:
-    def test_extra_blank_space(self):
-        ocr = ImageOCR()
+    @pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
+    def test_extra_blank_space(self, ocr_class):
+        ocr = ocr_class()
         paragraph = cv.imread(paragraph_image_path, cv.IMREAD_COLOR)
         padding = (200, 200, 200, 200)
         paragraph_extra_space = cv.copyMakeBorder(paragraph, *padding, cv.BORDER_CONSTANT, value=(255, 255, 255))
@@ -34,11 +37,12 @@ def test_split_text_blocks(self):
         result = ocr.split_text_blocks(img)
         assert np.array_equiv(result, img)
 
-    def test_ocr_printed(self):
+    @pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
+    def test_ocr_printed(self, ocr_class):
         segmenter = ImageSegmenter(
             segmentation_function=segment_by_color_bounding_box,
         )
-        ocr = ImageOCR()
+        ocr = ocr_class()
 
         results = ocr.image_to_text(
             segmenter.load_and_segment(
@@ -51,25 +55,27 @@ def test_ocr_printed(self):
         patient_id, patient_confidence = results["nbs_patient_id"]
         cas_id, cas_confidence = results["nbs_cas_id"]
 
-        assert patient_id == "SIENNA HAMPTON"
+        assert patient_id.upper() == "SIENNA HAMPTON"
         assert cas_id == "123555"
 
-    def test_ocr_paragraph(self):
-        ocr = ImageOCR()
+    @pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
+    def test_ocr_paragraph(self, ocr_class):
+        ocr = ocr_class()
         segment = {"text": cv.imread(paragraph_image_path, cv.IMREAD_COLOR)}
         results = ocr.image_to_text(segment)
         text, confidence = results["text"]
         assert (
-            text
+            text.upper().replace("\n", " ")
             == "THIS TEST WAS DEVELOPED AND ITS ANALYTICAL PERFORMANCE CHARACTERISTICS HAVE BEEN DETERMINED BY QUEST DIAGNOSTICS NICHOLS INSTITUTE SAN JUAN CAPISTRANO. IT HAS NOT BEEN CLEARED OR APPROVED BY FDA. THIS ASSAY HAS BEEN VALIDATED PURSUANT TO THE CLIA REGULATIONS AND IS USED FOR CLINICAL PURPOSES."
         )
         assert confidence > 50
 
-    def test_confidence_values_returned(self):
+    @pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
+    def test_confidence_values_returned(self, ocr_class):
         segmenter = ImageSegmenter(
             segmentation_function=segment_by_color_bounding_box,
         )
-        ocr = ImageOCR()
+        ocr = ocr_class()
 
         results = ocr.image_to_text(
             segmenter.load_and_segment(
@@ -82,7 +88,5 @@ def test_confidence_values_returned(self):
         patient_id, patient_confidence = results["nbs_patient_id"]
         cas_id, cas_confidence = results["nbs_cas_id"]
 
-        assert isinstance(patient_confidence, float)
-        assert isinstance(cas_confidence, float)
         assert patient_confidence > 0
         assert cas_confidence > 0