From 5243dc71144cfddd641e1ca0fcb26fc3171f52a2 Mon Sep 17 00:00:00 2001 From: Sami Liedes Date: Tue, 2 Jul 2024 11:47:30 +0200 Subject: [PATCH] Add hOCR output format --- easyocr/cli.py | 2 +- easyocr/easyocr.py | 4 +++- easyocr/utils.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/easyocr/cli.py b/easyocr/cli.py index e8520b8b1d..b70006c65e 100644 --- a/easyocr/cli.py +++ b/easyocr/cli.py @@ -229,7 +229,7 @@ def parse_args(): parser.add_argument( "--output_format", type=str, - choices=["standard", 'dict', 'json'], + choices=["standard", 'dict', 'json', "hocr"], default='standard', help="output format.", ) diff --git a/easyocr/easyocr.py b/easyocr/easyocr.py index 681c05b3ce..310727b32d 100644 --- a/easyocr/easyocr.py +++ b/easyocr/easyocr.py @@ -4,7 +4,7 @@ from .utils import group_text_box, get_image_list, calculate_md5, get_paragraph,\ download_and_unzip, printProgressBar, diff, reformat_input,\ make_rotated_img_list, set_result_with_confidence,\ - reformat_input_batched, merge_to_free + reformat_input_batched, merge_to_free, to_hocr from .config import * from bidi.algorithm import get_display import numpy as np @@ -434,6 +434,8 @@ def recognize(self, img_cv_grey, horizontal_list=None, free_list=None,\ return [json.dumps({'boxes':[list(map(int, lst)) for lst in item[0]],'text':item[1],'confident':item[2]}, ensure_ascii=False) for item in result] elif output_format == 'free_merge': return merge_to_free(result, free_list) + elif output_format == "hocr": + return to_hocr(result) else: return result diff --git a/easyocr/utils.py b/easyocr/utils.py index 987baf2c9a..56687e907f 100644 --- a/easyocr/utils.py +++ b/easyocr/utils.py @@ -8,6 +8,7 @@ from PIL import Image, JpegImagePlugin from scipy import ndimage import hashlib +import html import sys, os from zipfile import ZipFile from .imgproc import loadImage @@ -383,6 +384,49 @@ def decode_wordbeamsearch(self, mat, beamWidth=5): texts.append(string) return texts +OCR_PREAMBLE = """ + + + + + + + + + + +
+""".strip() + + +# In order to get a browser-renderable HTML file, you can add this before the closing tag: +# +# + +OCR_POSTAMBLE = """
+ + +""".splitlines() + +def to_hocr(result): + content = [] + min_x0, min_y0, max_x1, max_y1 = 1e9, 1e9, 0, 0 + for box, text, confidence in result: + # We have the corners of the box, clockwise from top-left + c1, _, c3, _ = [[int(x) for x in c] for c in box] + x0, y0 = c1 + x1, y1 = c3 + min_x0 = min(min_x0, x0) + min_y0 = min(min_y0, y0) + max_x1 = max(max_x1, x1) + max_y1 = max(max_y1, y1) + content.append(' {text}'.format( + x0=x0, y0=y0, x1=x1, y1=y1, text=html.escape(text) + )) + preamble = OCR_PREAMBLE.format(x0=min_x0, y0=min_y0, x1=max_x1, y1=max_y1).splitlines() + return preamble + content + OCR_POSTAMBLE + def merge_to_free(merge_result, free_list): merge_result_buf, mr_buf = [], []