OpenPecha · eroux · Dec 21, 2023 · Dec 9, 2023 · Dec 19, 2023
@@ -46,6 +46,7 @@ class GoogleVisionFormatter(OCRFormatter):
 
     def __init__(self, output_path=None, metadata=None):
         super().__init__(output_path, metadata)
+        self.check_postprocessing = False  # flag read by build_page: False skips its has_abnormal_postprocessing fallback
 
     def has_space_attached(self, symbol):
         """Checks if symbol has space followed by it or not
@@ -175,6 +176,7 @@ def get_char_base_bboxes_and_avg_width(self, response):
         widths = []
         for page in response['fullTextAnnotation']['pages']:
             for block in page['blocks']:
+                cur_line_boxes = []
                 for paragraph in block['paragraphs']:
                     for word in paragraph['words']:
                         bbox = self.dict_to_bbox(word)
@@ -199,7 +201,8 @@ def get_char_base_bboxes_and_avg_width(self, response):
                             # language = self.get_language_code_from_gv_poly(word)
                             # instead we use our custom detection system
                             bbox.language = self.get_main_language_code(cur_word)
-                            bboxes.append(bbox)
+                            cur_line_boxes.append(bbox)
+                bboxes.append(cur_line_boxes)
         avg_width = statistics.mean(widths) if widths else None
         logging.debug("average char width: %f", avg_width)
         return bboxes, avg_width

@@ -269,12 +269,14 @@ def get_boxes(self, hocr_page_html):
         hocr_html = BeautifulSoup(hocr_page_html, 'html.parser')
         line_boxes = hocr_html.find_all("span", {"class": "ocr_line"})
         for line_box in line_boxes:
+            cur_line_boxes = []
             self.word_span = 0
             word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
             for word_box in word_boxes:
                 bbox = self.parse_box(line_box,word_box)
                 if bbox is not None:
-                    bboxes.append(bbox)
+                    cur_line_boxes.append(bbox)
+            bboxes.append(cur_line_boxes)
         return bboxes
 
     def get_boxes_for_IA(self, page_html):
@@ -291,12 +293,14 @@ def get_boxes_for_IA(self, page_html):
         for paragraph_html in paragraphs_html:
             line_boxes = paragraph_html.find_all("span", {"class": "ocr_line"})
             for line_box in line_boxes:
+                cur_line_boxes = []
                 self.word_span = 0
                 word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
                 for word_box in word_boxes:
                     bbox = self.parse_box(line_box,word_box)
                     if bbox is not None:
-                        bboxes.append(bbox)
+                        cur_line_boxes.append(bbox)
+                bboxes.append(cur_line_boxes)
         return bboxes
 
 

@@ -128,6 +128,7 @@ def __init__(self, output_path=None, metadata=None):
         self.language_annotation_min_len = ANNOTATION_MINIMAL_LEN
         self.remove_rotated_boxes = True
         self.remove_duplicate_symbols = True
+        self.check_postprocessing = True
         self.script_to_lang_map = DEFAULT_SCRIPT_TO_LANG_MAPPING
         self.max_low_conf_per_page = ANNOTATION_MAX_LOW_CONF_PER_PAGE
 
@@ -471,10 +472,22 @@ def bbox_line_has_characters(self, bbox_line):
                 line += bbox.text
             logging.debug("ignoring line '%s', detected as noise", line)
         return False
+
+    def has_abnormal_postprocessing(self, original_bboxes, postprocessed_bboxes):
+        """Return True if the post-processed list is longer than the original or under half its length."""
+        kept = len(postprocessed_bboxes)
+        dropped = len(original_bboxes) - kept
+        return dropped < 0 or dropped > kept
 
     def build_page(self, bboxes, image_number, image_filename, state, avg_char_width=None):
-        sorted_bboxes = self.sort_bboxes(bboxes)
+        flatten_bboxes = []
+        for line_bboxes in bboxes:
+            for bbox in line_bboxes:
+                flatten_bboxes.append(bbox) 
+        sorted_bboxes = self.sort_bboxes(flatten_bboxes)
         bbox_lines = self.get_bbox_lines(sorted_bboxes)
+        if self.check_postprocessing and self.has_abnormal_postprocessing(bboxes, bbox_lines):
+            bbox_lines = bboxes
         page_start_cc = state["base_layer_len"]
         page_word_confidences = []
         for bbox_line in bbox_lines:

@@ -32,6 +32,7 @@ def test_remove_overlap_and_duplicates():
     ocr_formatter = OCRFormatter()
     ocr_formatter.remove_duplicate_symbols = True
     ocr_formatter.same_line_ratio_threshold = 0.2
+    ocr_formatter.check_postprocessing = False
 
     ocr_formatter.build_page(bboxes, 1, "I1PD958780125", state, avg_char_width)
     base = state['base_layer']

@@ -1,13 +1,13 @@
-id: 9582dfdd80474733935072ce5600516d
+id: 0271de46ee424b6b83416ac97c6b82f4
 annotation_type: Language
 revision: '00001'
 annotations:
-  9b6e5568a64d42c499351b493ebdfd30:
+  c56897b32ad846f5a317542ec9c0b4ba:
     span:
       start: 39
       end: 170
     language: en
-  199d197b000443f5b2555c9d7364b268:
+  8bd03239973342f5adc6fe72f31500da:
     span:
       start: 199
       end: 949