Implement tesseract backend (#375)

* Initial tesserocr
* drop pytesseract
* Use actual raw API backend for confidence score
* ensure PIL image is passed
* Guess at tessdata path
* Install tesseract as part of docker setup
* documentation
* lint check
* Use tesserocr api instead of pathlib shenanigans
* Update docstring
* Fix path detection crash
* Strip tesseract output
* Update tests for tesseract comparisons
* Update CI runs
CDCgov · Nov 20, 2024 · 88ffe5b · 88ffe5b
1 parent aed0f28
commit 88ffe5b
Show file tree

Hide file tree

Showing 7 changed files with 119 additions and 16 deletions.
diff --git a/.github/workflows/ocr-tests.yml b/.github/workflows/ocr-tests.yml
@@ -17,6 +17,7 @@ jobs:
           python-version: "3.10"
       - name: Install dependencies
         run: |
+          sudo apt install tesseract-ocr-eng tesseract-ocr -y
           python -m pip install --upgrade pip
           pip install poetry
           poetry install --with dev

diff --git a/OCR/Dockerfile b/OCR/Dockerfile
@@ -1,6 +1,6 @@
 FROM python:3.10-bullseye
 
-RUN apt-get update && apt-get install ffmpeg libsm6 libxext6  -y
+RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y
 
 RUN pip install poetry
 

diff --git a/OCR/dev-dockerfile b/OCR/dev-dockerfile
@@ -1,7 +1,6 @@
 FROM python:3.10-bullseye
 
-
-RUN apt-get update && apt-get install ffmpeg libsm6 libxext6  -y
+RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y
 
 RUN pip install poetry
 

diff --git a/OCR/ocr/services/tesseract_ocr.py b/OCR/ocr/services/tesseract_ocr.py
@@ -0,0 +1,63 @@
+import os
+
+import tesserocr
+import numpy as np
+from PIL import Image
+
+
+class TesseractOCR:
+    @staticmethod
+    def _guess_tessdata_path(wanted_lang="eng") -> bytes:
+        """
+        Attempts to guess potential locations for the `tessdata` folder.
+
+        The `tessdata` folder is needed to use pre-trained Tesseract OCR data, though the automatic detection
+        provided in `tesserocr` may not be reliable. Instead iterate over common paths on various systems (e.g.,
+        Red Hat, Ubuntu, macOS) and check for the presence of a `tessdata` folder.
+
+        If `TESSDATA_PREFIX` is available in the environment, this function will check that location first.
+        If all guessed locations do not exist, fall back to automatic detection provided by `tesserocr` and
+        the tesseract API.
+
+        `wanted_lang` (str): a desired language to search for. Defaults to English `eng`.
+        """
+        candidate_paths = [
+            "/usr/local/share/tesseract/tessdata",
+            "/usr/share/tesseract/tessdata",
+            "/usr/share/tesseract-ocr/4.00/tessdata",
+            "/opt/homebrew/share/tessdata",
+            "/opt/local/share/tessdata",
+        ]
+
+        # Prepend env variable if defined
+        if "TESSDATA_PREFIX" in os.environ:
+            candidate_paths.insert(os.environ["TESSDATA_PREFIX"], 0)
+
+        # Test candidate paths
+        for path in candidate_paths:
+            # When compiled for certain systems (macOS), libtesseract aborts due to an untrapped exception if it
+            # cannot access the path for any reason (e.g., does not exist, lacks read permissions). Attempt to
+            # enumerate the directory and, if it fails, skip this path.
+            try:
+                os.listdir(path)
+            except OSError:
+                continue
+
+            retpath, langs = tesserocr.get_languages(path)
+            if wanted_lang in langs:
+                return retpath
+
+        # Nothing matched, just return the default path
+        return tesserocr.get_languages()[0]
+
+    def image_to_text(self, segments: dict[str, np.ndarray]) -> dict[str, tuple[str, float]]:
+        digitized: dict[str, tuple[str, float]] = {}
+        with tesserocr.PyTessBaseAPI(path=self._guess_tessdata_path()) as api:
+            for label, image in segments.items():
+                if image is None:
+                    continue
+
+                api.SetImage(Image.fromarray(image))
+                digitized[label] = (api.GetUTF8Text().strip(), api.MeanTextConf())
+
+        return digitized
diff --git a/OCR/poetry.lock b/OCR/poetry.lock
diff --git a/OCR/pyproject.toml b/OCR/pyproject.toml
@@ -16,6 +16,7 @@ transformers = {extras = ["torch"], version = "^4.45.1"}
 pillow = "^10.3.0"
 
 datasets = "^3.0.1"
+tesserocr = "^2.7.1"
 [tool.poetry.group.dev.dependencies]
 lxml = "^5.3.0"
 docopt = "^0.6.2"

diff --git a/OCR/tests/ocr_test.py b/OCR/tests/ocr_test.py
@@ -2,12 +2,14 @@
 
 import numpy as np
 import cv2 as cv
+import pytest
 
 from ocr.services.image_segmenter import (
     ImageSegmenter,
     segment_by_color_bounding_box,
 )
 from ocr.services.image_ocr import ImageOCR
+from ocr.services.tesseract_ocr import TesseractOCR
 
 
 path = os.path.dirname(__file__)
@@ -19,8 +21,9 @@
 
 
 class TestOCR:
-    def test_extra_blank_space(self):
-        ocr = ImageOCR()
+    @pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
+    def test_extra_blank_space(self, ocr_class):
+        ocr = ocr_class()
         paragraph = cv.imread(paragraph_image_path, cv.IMREAD_COLOR)
         padding = (200, 200, 200, 200)
         paragraph_extra_space = cv.copyMakeBorder(paragraph, *padding, cv.BORDER_CONSTANT, value=(255, 255, 255))
@@ -34,11 +37,12 @@ def test_split_text_blocks(self):
         result = ocr.split_text_blocks(img)
         assert np.array_equiv(result, img)
 
-    def test_ocr_printed(self):
+    @pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
+    def test_ocr_printed(self, ocr_class):
         segmenter = ImageSegmenter(
             segmentation_function=segment_by_color_bounding_box,
         )
-        ocr = ImageOCR()
+        ocr = ocr_class()
 
         results = ocr.image_to_text(
             segmenter.load_and_segment(
@@ -51,25 +55,27 @@ def test_ocr_printed(self):
         patient_id, patient_confidence = results["nbs_patient_id"]
         cas_id, cas_confidence = results["nbs_cas_id"]
 
-        assert patient_id == "SIENNA HAMPTON"
+        assert patient_id.upper() == "SIENNA HAMPTON"
         assert cas_id == "123555"
 
-    def test_ocr_paragraph(self):
-        ocr = ImageOCR()
+    @pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
+    def test_ocr_paragraph(self, ocr_class):
+        ocr = ocr_class()
         segment = {"text": cv.imread(paragraph_image_path, cv.IMREAD_COLOR)}
         results = ocr.image_to_text(segment)
         text, confidence = results["text"]
         assert (
-            text
+            text.upper().replace("\n", " ")
             == "THIS TEST WAS DEVELOPED AND ITS ANALYTICAL PERFORMANCE CHARACTERISTICS HAVE BEEN DETERMINED BY QUEST DIAGNOSTICS NICHOLS INSTITUTE SAN JUAN CAPISTRANO. IT HAS NOT BEEN CLEARED OR APPROVED BY FDA. THIS ASSAY HAS BEEN VALIDATED PURSUANT TO THE CLIA REGULATIONS AND IS USED FOR CLINICAL PURPOSES."
         )
         assert confidence > 50
 
-    def test_confidence_values_returned(self):
+    @pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
+    def test_confidence_values_returned(self, ocr_class):
         segmenter = ImageSegmenter(
             segmentation_function=segment_by_color_bounding_box,
         )
-        ocr = ImageOCR()
+        ocr = ocr_class()
 
         results = ocr.image_to_text(
             segmenter.load_and_segment(
@@ -82,7 +88,5 @@ def test_confidence_values_returned(self):
         patient_id, patient_confidence = results["nbs_patient_id"]
         cas_id, cas_confidence = results["nbs_cas_id"]
 
-        assert isinstance(patient_confidence, float)
-        assert isinstance(cas_confidence, float)
         assert patient_confidence > 0
         assert cas_confidence > 0