Skip to content

Commit

Permalink
Implement tesseract backend (#375)
Browse files Browse the repository at this point in the history
* Initial tesserocr

* drop pytesseract

* Use actual raw API backend for confidence score

* ensure PIL image is passed

* Guess at tessdata path

* Install tesseract as part of docker setup

* documentation

* lint check

* Use tesserocr api instead of pathlib shenanigans

* Update docstring

* Fix path detection crash

* Strip tesseract output

* Update tests for tesseract comparisons

* Update CI runs
  • Loading branch information
jonchang authored Nov 20, 2024
1 parent aed0f28 commit 88ffe5b
Show file tree
Hide file tree
Showing 7 changed files with 119 additions and 16 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ocr-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
sudo apt install tesseract-ocr-eng tesseract-ocr -y
python -m pip install --upgrade pip
pip install poetry
poetry install --with dev
Expand Down
2 changes: 1 addition & 1 deletion OCR/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM python:3.10-bullseye

RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y

RUN pip install poetry

Expand Down
3 changes: 1 addition & 2 deletions OCR/dev-dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
FROM python:3.10-bullseye


RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y

RUN pip install poetry

Expand Down
63 changes: 63 additions & 0 deletions OCR/ocr/services/tesseract_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import os

import tesserocr
import numpy as np
from PIL import Image


class TesseractOCR:
    """OCR backend that extracts text from image segments using Tesseract through `tesserocr`."""

    @staticmethod
    def _guess_tessdata_path(wanted_lang="eng") -> str:
        """
        Attempts to guess potential locations for the `tessdata` folder.

        The `tessdata` folder is needed to use pre-trained Tesseract OCR data, though the automatic detection
        provided in `tesserocr` may not be reliable. Instead iterate over common paths on various systems (e.g.,
        Red Hat, Ubuntu, macOS) and check for the presence of a `tessdata` folder.

        If `TESSDATA_PREFIX` is set (and non-empty) in the environment, this function will check that location
        first.

        If none of the guessed locations is readable and contains `wanted_lang`, fall back to automatic
        detection provided by `tesserocr` and the tesseract API.

        `wanted_lang` (str): a desired language to search for. Defaults to English `eng`.

        Returns the tessdata path as reported by `tesserocr.get_languages` for the first matching candidate,
        or the `tesserocr` default path when no candidate matches.
        """
        candidate_paths = [
            "/usr/local/share/tesseract/tessdata",
            "/usr/share/tesseract/tessdata",
            "/usr/share/tesseract-ocr/4.00/tessdata",
            "/opt/homebrew/share/tessdata",
            "/opt/local/share/tessdata",
        ]

        # Prepend env variable if defined so that it is tried before the hard-coded guesses.
        # Note the argument order: `list.insert(index, item)`.
        env_path = os.environ.get("TESSDATA_PREFIX")
        if env_path:
            candidate_paths.insert(0, env_path)

        # Test candidate paths
        for path in candidate_paths:
            # When compiled for certain systems (macOS), libtesseract aborts due to an untrapped exception if it
            # cannot access the path for any reason (e.g., does not exist, lacks read permissions). Attempt to
            # enumerate the directory and, if it fails, skip this path.
            try:
                os.listdir(path)
            except OSError:
                continue

            retpath, langs = tesserocr.get_languages(path)
            if wanted_lang in langs:
                return retpath

        # Nothing matched, just return the default path
        return tesserocr.get_languages()[0]

    def image_to_text(self, segments: dict[str, np.ndarray]) -> dict[str, tuple[str, float]]:
        """
        Runs Tesseract over each labelled image segment.

        `segments` (dict): maps a label to the image array for that segment. Entries whose image is `None`
        are skipped and do not appear in the result.

        Returns a dict mapping each processed label to a `(text, confidence)` tuple, where `text` is the
        recognized text with surrounding whitespace stripped and `confidence` is the value returned by
        `MeanTextConf()` for that segment.
        """
        digitized: dict[str, tuple[str, float]] = {}

        # A single API instance is reused for every segment and released when the `with` block exits.
        with tesserocr.PyTessBaseAPI(path=self._guess_tessdata_path()) as api:
            for label, image in segments.items():
                if image is None:
                    continue

                # Each segment is converted to a PIL image before being handed to `SetImage`.
                api.SetImage(Image.fromarray(image))
                digitized[label] = (api.GetUTF8Text().strip(), api.MeanTextConf())

        return digitized
37 changes: 36 additions & 1 deletion OCR/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions OCR/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ transformers = {extras = ["torch"], version = "^4.45.1"}
pillow = "^10.3.0"

datasets = "^3.0.1"
tesserocr = "^2.7.1"
[tool.poetry.group.dev.dependencies]
lxml = "^5.3.0"
docopt = "^0.6.2"
Expand Down
28 changes: 16 additions & 12 deletions OCR/tests/ocr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

import numpy as np
import cv2 as cv
import pytest

from ocr.services.image_segmenter import (
ImageSegmenter,
segment_by_color_bounding_box,
)
from ocr.services.image_ocr import ImageOCR
from ocr.services.tesseract_ocr import TesseractOCR


path = os.path.dirname(__file__)
Expand All @@ -19,8 +21,9 @@


class TestOCR:
def test_extra_blank_space(self):
ocr = ImageOCR()
@pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
def test_extra_blank_space(self, ocr_class):
ocr = ocr_class()
paragraph = cv.imread(paragraph_image_path, cv.IMREAD_COLOR)
padding = (200, 200, 200, 200)
paragraph_extra_space = cv.copyMakeBorder(paragraph, *padding, cv.BORDER_CONSTANT, value=(255, 255, 255))
Expand All @@ -34,11 +37,12 @@ def test_split_text_blocks(self):
result = ocr.split_text_blocks(img)
assert np.array_equiv(result, img)

def test_ocr_printed(self):
@pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
def test_ocr_printed(self, ocr_class):
segmenter = ImageSegmenter(
segmentation_function=segment_by_color_bounding_box,
)
ocr = ImageOCR()
ocr = ocr_class()

results = ocr.image_to_text(
segmenter.load_and_segment(
Expand All @@ -51,25 +55,27 @@ def test_ocr_printed(self):
patient_id, patient_confidence = results["nbs_patient_id"]
cas_id, cas_confidence = results["nbs_cas_id"]

assert patient_id == "SIENNA HAMPTON"
assert patient_id.upper() == "SIENNA HAMPTON"
assert cas_id == "123555"

def test_ocr_paragraph(self):
ocr = ImageOCR()
@pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
def test_ocr_paragraph(self, ocr_class):
ocr = ocr_class()
segment = {"text": cv.imread(paragraph_image_path, cv.IMREAD_COLOR)}
results = ocr.image_to_text(segment)
text, confidence = results["text"]
assert (
text
text.upper().replace("\n", " ")
== "THIS TEST WAS DEVELOPED AND ITS ANALYTICAL PERFORMANCE CHARACTERISTICS HAVE BEEN DETERMINED BY QUEST DIAGNOSTICS NICHOLS INSTITUTE SAN JUAN CAPISTRANO. IT HAS NOT BEEN CLEARED OR APPROVED BY FDA. THIS ASSAY HAS BEEN VALIDATED PURSUANT TO THE CLIA REGULATIONS AND IS USED FOR CLINICAL PURPOSES."
)
assert confidence > 50

def test_confidence_values_returned(self):
@pytest.mark.parametrize("ocr_class", [TesseractOCR, ImageOCR])
def test_confidence_values_returned(self, ocr_class):
segmenter = ImageSegmenter(
segmentation_function=segment_by_color_bounding_box,
)
ocr = ImageOCR()
ocr = ocr_class()

results = ocr.image_to_text(
segmenter.load_and_segment(
Expand All @@ -82,7 +88,5 @@ def test_confidence_values_returned(self):
patient_id, patient_confidence = results["nbs_patient_id"]
cas_id, cas_confidence = results["nbs_cas_id"]

assert isinstance(patient_confidence, float)
assert isinstance(cas_confidence, float)
assert patient_confidence > 0
assert cas_confidence > 0

0 comments on commit 88ffe5b

Please sign in to comment.