From c8d2b3994b17600e4d2d9d5815db6324a81d3712 Mon Sep 17 00:00:00 2001 From: Jonathan Chang Date: Mon, 9 Dec 2024 11:03:36 -0800 Subject: [PATCH] Use better tesseract training dataset (#459) * Download tessdata-best instead of tessdata-fast This should improve OCR performance using tesseract with only a mild increase in runtime and container size. * Drop unused `cdifflib` --- OCR/Dockerfile | 10 +++++++--- OCR/dev-dockerfile | 11 +++++++---- OCR/ocr/services/tesseract_ocr.py | 1 + OCR/poetry.lock | 15 +-------------- OCR/pyproject.toml | 3 +-- 5 files changed, 17 insertions(+), 23 deletions(-) diff --git a/OCR/Dockerfile b/OCR/Dockerfile index 0d54ab72..68787a90 100644 --- a/OCR/Dockerfile +++ b/OCR/Dockerfile @@ -1,6 +1,10 @@ -FROM python:3.10-bullseye +FROM python:3.10-slim -RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y +RUN apt-get update && apt-get install libgl1 tesseract-ocr-eng tesseract-ocr -y + +# Use larger "best" training data, rather than "fast" +# Python one-liner because we don't have curl or wget +RUN python3 -c 'from urllib.request import urlopen; open("/usr/share/tesseract-ocr/5/tessdata/eng.traineddata", "wb").write(urlopen("https://github.com/tesseract-ocr/tessdata_best/raw/refs/heads/main/eng.traineddata").read())' RUN pip install poetry @@ -8,7 +12,7 @@ COPY /ocr /ocr COPY poetry.lock . COPY pyproject.toml . -RUN poetry install --without dev +RUN poetry install --only main && poetry cache list | xargs -n1 poetry cache clear --all ENTRYPOINT ["poetry", "run", "api"] diff --git a/OCR/dev-dockerfile b/OCR/dev-dockerfile index f135fad6..d2e02d6b 100644 --- a/OCR/dev-dockerfile +++ b/OCR/dev-dockerfile @@ -1,6 +1,10 @@ -FROM python:3.10-bullseye +FROM python:3.10-slim -RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 tesseract-ocr-eng tesseract-ocr -y +RUN apt-get update && apt-get install libgl1 tesseract-ocr-eng tesseract-ocr -y + +# Use larger "best" training data, rather than "fast" +# Python one-liner because we don't have curl or wget +RUN python3 -c 'from urllib.request import urlopen; open("/usr/share/tesseract-ocr/5/tessdata/eng.traineddata", "wb").write(urlopen("https://github.com/tesseract-ocr/tessdata_best/raw/refs/heads/main/eng.traineddata").read())' RUN pip install poetry @@ -8,5 +12,4 @@ COPY ./pyproject.toml /ocr/pyproject.toml COPY ./poetry.lock /ocr/poetry.lock WORKDIR /ocr -RUN poetry install - +RUN poetry install && poetry cache list | xargs -n1 poetry cache clear --all diff --git a/OCR/ocr/services/tesseract_ocr.py b/OCR/ocr/services/tesseract_ocr.py index 9ce4a78e..c796a065 100644 --- a/OCR/ocr/services/tesseract_ocr.py +++ b/OCR/ocr/services/tesseract_ocr.py @@ -36,6 +36,7 @@ def _guess_tessdata_path(wanted_lang="eng") -> bytes: "/usr/local/share/tesseract/tessdata", "/usr/share/tesseract/tessdata", "/usr/share/tesseract-ocr/4.00/tessdata", + "/usr/share/tesseract-ocr/5/tessdata", "/opt/homebrew/share/tessdata", "/opt/local/share/tessdata", ] diff --git a/OCR/poetry.lock b/OCR/poetry.lock index 1802ca4b..73ee4c68 100644 --- a/OCR/poetry.lock +++ b/OCR/poetry.lock @@ -231,19 +231,6 @@ docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphi tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] -[[package]] -name = "cdifflib" -version = "1.2.6" -description = "C implementation of parts of difflib" -optional = false -python-versions = ">=3.4" -files = [ - {file = "cdifflib-1.2.6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:bd236fc9e166e911f8ad87d89c1d1ade4e33df6f67e8c34fbe5f2bd89f0225f1"}, - {file = "cdifflib-1.2.6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:3b509f3a2b51abe45af36dc074a878ba3309a48968683a14e4104b46c2ef5b44"}, - {file = "cdifflib-1.2.6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:90c3bc02f3812f8def2e5b901345795a73e0547e9ea7aab2153200f4c84cab44"}, - {file = "cdifflib-1.2.6.tar.gz", hash = "sha256:57517c390392a71d59e9d7e799e9b685eaf9e07812fc8f234540ff19c4b03e66"}, -] - [[package]] name = "certifi" version = "2024.8.30" @@ -3583,4 +3570,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "2993b1bb088823677673c61c126ae057f2032e87edbe3a772fee9b9c21ba2396" +content-hash = "45a5fb639bff707f562a3f4d44cb73158f74a9b0debb5c5bdd7c930931dd6944" diff --git a/OCR/pyproject.toml b/OCR/pyproject.toml index 8b4de10e..eb95539b 100644 --- a/OCR/pyproject.toml +++ b/OCR/pyproject.toml @@ -10,13 +10,12 @@ python = "^3.10" numpy = "^1.26.4" opencv-python = "^4.9.0.80" levenshtein = "^0.25.1" -cdifflib = "^1.2.6" fastapi = {extras = ["standard"], version = "^0.112.1"} transformers = {extras = ["torch"], version = "^4.45.1"} pillow = "^10.3.0" - datasets = "^3.0.1" tesserocr = "^2.7.1" + [tool.poetry.group.dev.dependencies] lxml = "^5.3.0" docopt = "^0.6.2"