Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ocr): checking the abnormal post correction feature added #264

Merged
merged 2 commits into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion openpecha/formatters/ocr/google_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class GoogleVisionFormatter(OCRFormatter):

# Initializer for the Google Vision formatter: forwards output_path and
# metadata unchanged to the parent class initializer.
def __init__(self, output_path=None, metadata=None):
super().__init__(output_path, metadata)
# Turn the post-processing sanity check off for this formatter.
# NOTE(review): presumably this overrides a parent default of True so that
# build_page never falls back to the raw per-line boxes here; confirm
# against the base formatter.
self.check_postprocessing = False

def has_space_attached(self, symbol):
"""Checks if symbol has space followed by it or not
Expand Down Expand Up @@ -175,6 +176,7 @@ def get_char_base_bboxes_and_avg_width(self, response):
widths = []
for page in response['fullTextAnnotation']['pages']:
for block in page['blocks']:
cur_line_boxes = []
for paragraph in block['paragraphs']:
for word in paragraph['words']:
bbox = self.dict_to_bbox(word)
Expand All @@ -199,7 +201,8 @@ def get_char_base_bboxes_and_avg_width(self, response):
# language = self.get_language_code_from_gv_poly(word)
# instead we use our custom detection system
bbox.language = self.get_main_language_code(cur_word)
bboxes.append(bbox)
cur_line_boxes.append(bbox)
bboxes.append(cur_line_boxes)
avg_width = statistics.mean(widths) if widths else None
logging.debug("average char width: %f", avg_width)
return bboxes, avg_width
Expand Down
8 changes: 6 additions & 2 deletions openpecha/formatters/ocr/hocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,12 +269,14 @@ def get_boxes(self, hocr_page_html):
hocr_html = BeautifulSoup(hocr_page_html, 'html.parser')
line_boxes = hocr_html.find_all("span", {"class": "ocr_line"})
for line_box in line_boxes:
cur_line_boxes = []
self.word_span = 0
word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
for word_box in word_boxes:
bbox = self.parse_box(line_box,word_box)
if bbox is not None:
bboxes.append(bbox)
cur_line_boxes.append(bbox)
bboxes.append(cur_line_boxes)
return bboxes

def get_boxes_for_IA(self, page_html):
Expand All @@ -291,12 +293,14 @@ def get_boxes_for_IA(self, page_html):
for paragraph_html in paragraphs_html:
line_boxes = paragraph_html.find_all("span", {"class": "ocr_line"})
for line_box in line_boxes:
cur_line_boxes = []
self.word_span = 0
word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
for word_box in word_boxes:
bbox = self.parse_box(line_box,word_box)
if bbox is not None:
bboxes.append(bbox)
cur_line_boxes.append(bbox)
bboxes.append(cur_line_boxes)
return bboxes


Expand Down
15 changes: 14 additions & 1 deletion openpecha/formatters/ocr/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def __init__(self, output_path=None, metadata=None):
self.language_annotation_min_len = ANNOTATION_MINIMAL_LEN
self.remove_rotated_boxes = True
self.remove_duplicate_symbols = True
self.check_postprocessing = True
self.script_to_lang_map = DEFAULT_SCRIPT_TO_LANG_MAPPING
self.max_low_conf_per_page = ANNOTATION_MAX_LOW_CONF_PER_PAGE

Expand Down Expand Up @@ -471,10 +472,22 @@ def bbox_line_has_characters(self, bbox_line):
line += bbox.text
logging.debug("ignoring line '%s', detected as noise", line)
return False

def has_abnormal_postprocessing(self, original_bboxes, postprocessed_bboxes):
    """Report whether post-processing changed the line count implausibly.

    Only the lengths of the two arguments are compared.

    Args:
        original_bboxes: sized collection of lines before post-processing.
        postprocessed_bboxes: sized collection of lines after post-processing.

    Returns:
        True when post-processing produced more entries than the original,
        or when the number of entries lost is greater than the number that
        remain; False otherwise.
    """
    remaining = len(postprocessed_bboxes)
    lost = len(original_bboxes) - remaining
    # A negative loss means lines were added; a loss larger than what is
    # left means more than half of the original lines disappeared.
    return lost < 0 or lost > remaining

def build_page(self, bboxes, image_number, image_filename, state, avg_char_width=None):
sorted_bboxes = self.sort_bboxes(bboxes)
flatten_bboxes = []
for line_bboxes in bboxes:
for bbox in line_bboxes:
flatten_bboxes.append(bbox)
sorted_bboxes = self.sort_bboxes(flatten_bboxes)
bbox_lines = self.get_bbox_lines(sorted_bboxes)
if self.check_postprocessing and self.has_abnormal_postprocessing(bboxes, bbox_lines):
bbox_lines = bboxes
page_start_cc = state["base_layer_len"]
page_word_confidences = []
for bbox_line in bbox_lines:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def test_remove_overlap_and_duplicates():
ocr_formatter = OCRFormatter()
ocr_formatter.remove_duplicate_symbols = True
ocr_formatter.same_line_ratio_threshold = 0.2
ocr_formatter.check_postprocessing = False

ocr_formatter.build_page(bboxes, 1, "I1PD958780125", state, avg_char_width)
base = state['base_layer']
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
id: 9582dfdd80474733935072ce5600516d
id: 0271de46ee424b6b83416ac97c6b82f4
annotation_type: Language
revision: '00001'
annotations:
9b6e5568a64d42c499351b493ebdfd30:
c56897b32ad846f5a317542ec9c0b4ba:
span:
start: 39
end: 170
language: en
199d197b000443f5b2555c9d7364b268:
8bd03239973342f5adc6fe72f31500da:
span:
start: 199
end: 949
Expand Down
Loading
Loading