Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Introduce automatic language detection in TesseractOcrCliModel #800

Merged
merged 3 commits into from
Jan 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 72 additions & 4 deletions docling/models/tesseract_ocr_cli_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import tempfile
from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, Optional, Tuple
from typing import Iterable, List, Optional, Tuple

import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
Expand All @@ -14,6 +14,7 @@
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)
Expand All @@ -29,10 +30,13 @@ def __init__(self, enabled: bool, options: TesseractCliOcrOptions):

self._name: Optional[str] = None
self._version: Optional[str] = None
self._tesseract_languages: Optional[List[str]] = None
self._script_prefix: Optional[str] = None

if self.enabled:
try:
self._get_name_and_version()
self._set_languages_and_prefix()

except Exception as exc:
raise RuntimeError(
Expand Down Expand Up @@ -74,12 +78,20 @@ def _get_name_and_version(self) -> Tuple[str, str]:
return name, version

def _run_tesseract(self, ifilename: str):

r"""
Run tesseract CLI
"""
cmd = [self.options.tesseract_cmd]

if self.options.lang is not None and len(self.options.lang) > 0:
if "auto" in self.options.lang:
lang = self._detect_language(ifilename)
if lang is not None:
cmd.append("-l")
cmd.append(lang)
elif self.options.lang is not None and len(self.options.lang) > 0:
cmd.append("-l")
cmd.append("+".join(self.options.lang))

if self.options.path is not None:
cmd.append("--tessdata-dir")
cmd.append(self.options.path)
Expand Down Expand Up @@ -107,6 +119,63 @@ def _run_tesseract(self, ifilename: str):

return df_filtered

def _detect_language(self, ifilename: str) -> Optional[str]:
    r"""
    Run tesseract in PSM 0 mode to detect the language

    The tesseract CLI is invoked with ``--psm 0 -l osd`` on ``ifilename`` and
    its stdout is parsed as ``key: value`` lines. The first ``Script`` entry
    is mapped through ``map_tesseract_script`` and prefixed with
    ``self._script_prefix`` to build the language name.

    Parameters
    ----------
    ifilename : str
        Path of the image file handed to tesseract.

    Returns
    -------
    Optional[str]
        The language name to pass to ``-l``, or ``None`` when tesseract
        printed nothing, reported no script, or the resulting language is
        not in ``self._tesseract_languages``.
    """
    assert self._tesseract_languages is not None

    cmd = [self.options.tesseract_cmd]
    cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
    _log.info("command: {}".format(" ".join(cmd)))
    proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
    output, _ = proc.communicate()
    decoded_data = output.decode("utf-8")

    # pandas raises EmptyDataError on empty input; treat "tesseract printed
    # nothing" the same way as "no Script line was reported".
    if not decoded_data.strip():
        _log.warning("Tesseract cannot detect the script of the page")
        return None

    df = pd.read_csv(
        io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
    )
    scripts = df.loc[df["key"] == "Script"].value.tolist()
    if len(scripts) == 0:
        _log.warning("Tesseract cannot detect the script of the page")
        return None

    # str() guards against a non-string cell (e.g. NaN for an empty value),
    # which has no .strip().
    script = map_tesseract_script(str(scripts[0]).strip())
    lang = f"{self._script_prefix}{script}"

    # Check if the detected language has been installed
    if lang not in self._tesseract_languages:
        msg = f"Tesseract detected the script '{script}' and language '{lang}'."
        msg += " However this language is not installed in your system and will be ignored."
        _log.warning(msg)
        return None

    _log.debug(
        f"Using tesseract model for the detected script '{script}' and language '{lang}'"
    )
    return lang

def _set_languages_and_prefix(self):
    r"""
    Read and set the languages installed in tesseract and decide the script prefix

    Runs the tesseract CLI with ``--list-langs``, stores the parsed entries in
    ``self._tesseract_languages`` and sets ``self._script_prefix`` to
    ``"script/"`` when any entry starts with that prefix, otherwise to ``""``.
    """
    # Ask tesseract for everything it has installed
    list_cmd = [self.options.tesseract_cmd, "--list-langs"]
    _log.info("command: {}".format(" ".join(list_cmd)))
    proc = Popen(list_cmd, stdout=PIPE, stderr=DEVNULL)
    raw_stdout, _ = proc.communicate()

    listing = pd.read_csv(io.StringIO(raw_stdout.decode("utf-8")), header=None)
    # The first row is dropped (presumably the banner line tesseract prints
    # before the language names -- confirm against the CLI output).
    installed = listing[0].tolist()[1:]
    self._tesseract_languages = installed

    # Script models may be listed under a "script/" sub-directory
    under_script_dir = [name.startswith("script/") for name in installed]
    self._script_prefix = "script/" if any(under_script_dir) else ""

def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
Expand All @@ -121,7 +190,6 @@ def __call__(
yield page
else:
with TimeRecorder(conv_res, "ocr"):

ocr_rects = self.get_ocr_rects(page)

all_ocr_cells = []
Expand Down
74 changes: 37 additions & 37 deletions docling/models/tesseract_ocr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)
Expand All @@ -20,6 +21,7 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions):

self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.osd_reader = None

if self.enabled:
install_errmsg = (
Expand Down Expand Up @@ -47,8 +49,8 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions):
except:
raise ImportError(install_errmsg)

_, tesserocr_languages = tesserocr.get_languages()
if not tesserocr_languages:
_, self._tesserocr_languages = tesserocr.get_languages()
if not self._tesserocr_languages:
raise ImportError(missing_langs_errmsg)

# Initialize the tesseractAPI
Expand All @@ -57,7 +59,7 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions):

self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}

if any([l.startswith("script/") for l in tesserocr_languages]):
if any([l.startswith("script/") for l in self._tesserocr_languages]):
self.script_prefix = "script/"
else:
self.script_prefix = ""
Expand All @@ -72,14 +74,14 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions):
tesserocr_kwargs["path"] = self.options.path

if lang == "auto":
self.reader = tesserocr.PyTessBaseAPI(
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
self.osd_reader = tesserocr.PyTessBaseAPI(
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
)
else:
self.reader = tesserocr.PyTessBaseAPI(
**{"lang": lang} | tesserocr_kwargs,
)

self.reader_RIL = tesserocr.RIL

def __del__(self):
Expand All @@ -96,15 +98,14 @@ def __call__(
yield from page_batch
return

import tesserocr

for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "ocr"):
assert self.reader is not None
assert self._tesserocr_languages is not None

ocr_rects = self.get_ocr_rects(page)

Expand All @@ -117,43 +118,42 @@ def __call__(
scale=self.scale, cropbox=ocr_rect
)

# Retrieve text snippets with their bounding boxes
self.reader.SetImage(high_res_image)
local_reader = self.reader
if "auto" in self.options.lang:
assert self.osd_reader is not None

if self.options.lang == ["auto"]:
osd = self.reader.DetectOrientationScript()
self.osd_reader.SetImage(high_res_image)
osd = self.osd_reader.DetectOrientationScript()

# No text, probably
if osd is None:
continue

script = osd["script_name"]

if script == "Katakana" or script == "Hiragana":
script = "Japanese"
elif script == "Han":
script = "HanS"
elif script == "Korean":
script = "Hangul"

_log.debug(
f'Using model for the detected script "{script}"'
)

if script not in self.script_readers:
self.script_readers[script] = tesserocr.PyTessBaseAPI(
path=self.reader.GetDatapath(),
lang=f"{self.script_prefix}{script}",
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
)

local_reader = self.script_readers[script]
local_reader.SetImage(high_res_image)
else:
local_reader = self.reader

script = map_tesseract_script(script)
lang = f"{self.script_prefix}{script}"

# Check if the detected language is present in the system
if lang not in self._tesserocr_languages:
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
msg += " However this language is not installed in your system and will be ignored."
_log.warning(msg)
else:
if script not in self.script_readers:
import tesserocr

self.script_readers[script] = (
tesserocr.PyTessBaseAPI(
path=self.reader.GetDatapath(),
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
)
)
local_reader = self.script_readers[script]

local_reader.SetImage(high_res_image)
boxes = local_reader.GetComponentImages(
self.reader_RIL.TEXTLINE, True
)
Expand Down
9 changes: 9 additions & 0 deletions docling/utils/ocr_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
def map_tesseract_script(script: str) -> str:
    r"""
    Map a detected script name onto the name used for the tesseract model

    "Katakana" and "Hiragana" map to "Japanese", "Han" maps to "HanS" and
    "Korean" maps to "Hangul". Any other value is returned unchanged.
    """
    renamed = {
        "Katakana": "Japanese",
        "Hiragana": "Japanese",
        "Han": "HanS",
        "Korean": "Hangul",
    }
    return renamed.get(script, script)
37 changes: 37 additions & 0 deletions docs/examples/tesseract_lang_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


def main():
    """Convert the sample PDF with full-page tesseract OCR and print it as markdown.

    The OCR language is set to ``["auto"]`` so the tesseract engine detects it.
    """
    source_pdf = Path("./tests/data/2206.01062.pdf")

    # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
    # ocr_options = TesseractOcrOptions(lang=["auto"])
    ocr_options = TesseractCliOcrOptions(lang=["auto"])

    pdf_option = PdfFormatOption(
        pipeline_options=PdfPipelineOptions(
            do_ocr=True, force_full_page_ocr=True, ocr_options=ocr_options
        ),
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    converted = converter.convert(source_pdf).document
    print(converted.export_to_markdown())


if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ nav:
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py
- "Force full page OCR": examples/full_page_ocr.py
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
- "Accelerator options": examples/run_with_accelerator.py
- "Simple translation": examples/translate.py
- ✂️ Chunking:
Expand Down
1 change: 1 addition & 0 deletions tests/test_e2e_ocr_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def test_e2e_conversions():
TesseractOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
TesseractCliOcrOptions(force_full_page_ocr=True),
TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
RapidOcrOptions(force_full_page_ocr=True),
]

Expand Down
Loading