Skip to content

Commit

Permalink
fix(ocr): checking the abnormal post correction feature added (#264)
Browse files Browse the repository at this point in the history
* fix(ocr): checking the abnormal post correction feature added

* fix(fix-ocr): test case of hocr is updated
  • Loading branch information
kaldan007 committed Dec 21, 2023
1 parent 5e70b7a commit e94d65d
Show file tree
Hide file tree
Showing 14 changed files with 323 additions and 288 deletions.
5 changes: 4 additions & 1 deletion openpecha/formatters/ocr/google_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class GoogleVisionFormatter(OCRFormatter):

def __init__(self, output_path=None, metadata=None):
    """Set up the formatter through the parent constructor.

    Args:
        output_path: forwarded unchanged to the parent constructor.
        metadata: forwarded unchanged to the parent constructor.
    """
    super().__init__(output_path, metadata)
    # Set after the parent constructor runs so this value is the one
    # that sticks: the post-processing check is off for this formatter.
    self.check_postprocessing = False

def has_space_attached(self, symbol):
"""Checks if symbol has space followed by it or not
Expand Down Expand Up @@ -175,6 +176,7 @@ def get_char_base_bboxes_and_avg_width(self, response):
widths = []
for page in response['fullTextAnnotation']['pages']:
for block in page['blocks']:
cur_line_boxes = []
for paragraph in block['paragraphs']:
for word in paragraph['words']:
bbox = self.dict_to_bbox(word)
Expand All @@ -199,7 +201,8 @@ def get_char_base_bboxes_and_avg_width(self, response):
# language = self.get_language_code_from_gv_poly(word)
# instead we use our custom detection system
bbox.language = self.get_main_language_code(cur_word)
bboxes.append(bbox)
cur_line_boxes.append(bbox)
bboxes.append(cur_line_boxes)
avg_width = statistics.mean(widths) if widths else None
logging.debug("average char width: %f", avg_width)
return bboxes, avg_width
Expand Down
8 changes: 6 additions & 2 deletions openpecha/formatters/ocr/hocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,12 +269,14 @@ def get_boxes(self, hocr_page_html):
hocr_html = BeautifulSoup(hocr_page_html, 'html.parser')
line_boxes = hocr_html.find_all("span", {"class": "ocr_line"})
for line_box in line_boxes:
cur_line_boxes = []
self.word_span = 0
word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
for word_box in word_boxes:
bbox = self.parse_box(line_box,word_box)
if bbox is not None:
bboxes.append(bbox)
cur_line_boxes.append(bbox)
bboxes.append(cur_line_boxes)
return bboxes

def get_boxes_for_IA(self, page_html):
Expand All @@ -291,12 +293,14 @@ def get_boxes_for_IA(self, page_html):
for paragraph_html in paragraphs_html:
line_boxes = paragraph_html.find_all("span", {"class": "ocr_line"})
for line_box in line_boxes:
cur_line_boxes = []
self.word_span = 0
word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
for word_box in word_boxes:
bbox = self.parse_box(line_box,word_box)
if bbox is not None:
bboxes.append(bbox)
cur_line_boxes.append(bbox)
bboxes.append(cur_line_boxes)
return bboxes


Expand Down
15 changes: 14 additions & 1 deletion openpecha/formatters/ocr/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def __init__(self, output_path=None, metadata=None):
self.language_annotation_min_len = ANNOTATION_MINIMAL_LEN
self.remove_rotated_boxes = True
self.remove_duplicate_symbols = True
self.check_postprocessing = True
self.script_to_lang_map = DEFAULT_SCRIPT_TO_LANG_MAPPING
self.max_low_conf_per_page = ANNOTATION_MAX_LOW_CONF_PER_PAGE

Expand Down Expand Up @@ -471,10 +472,22 @@ def bbox_line_has_characters(self, bbox_line):
line += bbox.text
logging.debug("ignoring line '%s', detected as noise", line)
return False

def has_abnormal_postprocessing(self, original_bboxes, postprocessed_bboxes):
    """Tell whether post-processing changed the line count implausibly.

    Only the lengths of the two arguments are compared. The outcome is
    flagged as abnormal when post-processing produced more lines than
    the original grouping, or when it dropped more lines than it kept
    (the original has more than twice as many lines as the result).

    Args:
        original_bboxes: sized collection with one entry per line of
            the original grouping.
        postprocessed_bboxes: sized collection with one entry per line
            after post-processing.

    Returns:
        True when the line count change is abnormal, False otherwise.
    """
    kept_line_count = len(postprocessed_bboxes)
    lost_line_count = len(original_bboxes) - kept_line_count
    # Normal means: no lines were gained, and at most as many lines
    # were lost as remain after post-processing.
    return not 0 <= lost_line_count <= kept_line_count

def build_page(self, bboxes, image_number, image_filename, state, avg_char_width=None):
sorted_bboxes = self.sort_bboxes(bboxes)
flatten_bboxes = []
for line_bboxes in bboxes:
for bbox in line_bboxes:
flatten_bboxes.append(bbox)
sorted_bboxes = self.sort_bboxes(flatten_bboxes)
bbox_lines = self.get_bbox_lines(sorted_bboxes)
if self.check_postprocessing and self.has_abnormal_postprocessing(bboxes, bbox_lines):
bbox_lines = bboxes
page_start_cc = state["base_layer_len"]
page_word_confidences = []
for bbox_line in bbox_lines:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def test_remove_overlap_and_duplicates():
ocr_formatter = OCRFormatter()
ocr_formatter.remove_duplicate_symbols = True
ocr_formatter.same_line_ratio_threshold = 0.2
ocr_formatter.check_postprocessing = False

ocr_formatter.build_page(bboxes, 1, "I1PD958780125", state, avg_char_width)
base = state['base_layer']
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
id: 9582dfdd80474733935072ce5600516d
id: 0271de46ee424b6b83416ac97c6b82f4
annotation_type: Language
revision: '00001'
annotations:
9b6e5568a64d42c499351b493ebdfd30:
c56897b32ad846f5a317542ec9c0b4ba:
span:
start: 39
end: 170
language: en
199d197b000443f5b2555c9d7364b268:
8bd03239973342f5adc6fe72f31500da:
span:
start: 199
end: 949
Expand Down
Loading

0 comments on commit e94d65d

Please sign in to comment.