From c1c07983620f41c16a30191146c9e331e26cad7c Mon Sep 17 00:00:00 2001
From: Jonathan Chang <jonathan@skylight.digital>
Date: Mon, 21 Oct 2024 10:46:07 -0700
Subject: [PATCH] Implement fine text block breaking algorithm (#339)

* init work on fine text blocks algo

* wip

* improve "phrase" kernel

* add test
---
 OCR/ocr/services/image_ocr.py | 46 ++++++++++++++++++++---------------
 OCR/tests/ocr_test.py         | 11 ++++++++-
 2 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/OCR/ocr/services/image_ocr.py b/OCR/ocr/services/image_ocr.py
index c483bdb4..c8d91f48 100644
--- a/OCR/ocr/services/image_ocr.py
+++ b/OCR/ocr/services/image_ocr.py
@@ -52,6 +52,21 @@ def merge_bounding_boxes(boxes: list) -> Iterator[list]:
         # Return the final box
         yield [current[0], current[1], current[2] - current[0], current[3] - current[1]]
 
+    def identify_blocks(self, input_image: np.ndarray, kernel: np.ndarray):
+        """
+        Given an input image and a morphological operation kernel, returns unique (non-overlapping)
+        bounding boxes of potential text regions.
+        """
+        # Apply an inverted binary threshold to `input_image`, then dilate with `kernel` to "expand" the size of text blocks
+        _, thresh = cv.threshold(cv.cvtColor(input_image, cv.COLOR_BGR2GRAY), 128, 255, cv.THRESH_BINARY_INV)
+        dial = cv.dilate(thresh, kernel)
+
+        # Find contours, keeping only the outermost outlines (`RETR_EXTERNAL`) and compressing straight segments to their endpoints (`CHAIN_APPROX_SIMPLE`)
+        contours, _ = cv.findContours(dial, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
+
+        # Simplify each contour into a bounding box and merge potential overlaps
+        return self.merge_bounding_boxes([cv.boundingRect(contour) for contour in contours])
+
     def deskew_image_text(self, image: np.ndarray, line_length_prop=0.5, max_skew_angle=10) -> np.ndarray:
         """
         Deskew an image using Hough transforms to detect lines.
@@ -88,28 +103,25 @@ def split_text_blocks(self, image: np.ndarray, line_length_prop=0.5) -> list[np.
         line_length = image.shape[1] * line_length_prop
         rotated = self.deskew_image_text(image, line_length_prop)
 
-        # Invert threshold and dilate using a horizontal kernel to "expand" the size of text blocks
-        _, thresh = cv.threshold(cv.cvtColor(rotated, cv.COLOR_BGR2GRAY), 128, 255, cv.THRESH_BINARY_INV)
-
+        # Kernels for morphological operations.
         # Kernel height of 1 implies a minimum separation between lines of 1px
-        kernel = np.ones([1, int(line_length)], np.uint8)
-        dial = cv.dilate(thresh, kernel)
-
-        # Estimate contours, only looking for outlines (`RETR_EXTERNAL`) and simplifying the shapes (`CHAIN_APPROX_SIMPLE`)
-        contours, _ = cv.findContours(dial, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
-
-        # Simplify each contour into a bounding box
-        bbox = [cv.boundingRect(contour) for contour in contours]
+        line_kernel = np.ones([1, int(line_length)], np.uint8)
+        # Cross-shaped kernel, 11px wide by 5px tall, that joins nearby characters so words/phrases separated by blank space form distinct blocks.
+        word_kernel = cv.getStructuringElement(cv.MORPH_CROSS, (11, 5))
 
         acc = []
-        # Merge overlapping bounding boxes, then sort the bounding boxes by y-position (top to bottom)
-        for x, y, w, h in sorted(self.merge_bounding_boxes(bbox), key=lambda x: x[1]):
+
+        # Sort identified lines by y-position (top to bottom)
+        for x, y, w, h in sorted(self.identify_blocks(rotated, line_kernel), key=lambda x: x[1]):
             # Filter lines that are too tiny and probably invalid
-            if h < 10:
+            if h < 5:
                 continue
 
             res = rotated[y : (y + h), x : (x + w)]
-            acc.append(res)
+
+            # Sort identified text blocks (putative words or phrases) by x-position (left to right)
+            for x, y, w, h in sorted(self.identify_blocks(res, word_kernel), key=lambda x: x[0]):
+                acc.append(res[y : (y + h), x : (x + w)])
 
         # If we skipped all potential text blocks due to filtering conditions, return the
         # original image anyway.
@@ -128,10 +140,6 @@ def image_to_text(self, segments: dict[str, np.ndarray]) -> dict[str, tuple[str,
 
             text_blocks = self.split_text_blocks(image)
 
-            # Ignore output from `split_text_blocks` algorithm if only one text block is detected
-            if len(text_blocks) == 1:
-                text_blocks = [image]
-
             for block in text_blocks:
                 pixel_values = self.processor(images=block, return_tensors="pt").pixel_values
                 with torch.no_grad():
diff --git a/OCR/tests/ocr_test.py b/OCR/tests/ocr_test.py
index d8cd5277..1f6647aa 100644
--- a/OCR/tests/ocr_test.py
+++ b/OCR/tests/ocr_test.py
@@ -19,6 +19,15 @@
 
 
 class TestOCR:
+    def test_extra_blank_space(self):
+        ocr = ImageOCR()
+        paragraph = cv.imread(paragraph_image_path, cv.IMREAD_COLOR)
+        padding = (200, 200, 200, 200)
+        paragraph_extra_space = cv.copyMakeBorder(paragraph, *padding, cv.BORDER_CONSTANT, value=(255, 255, 255))
+        segment = {"text": paragraph, "text_extra": paragraph_extra_space}
+        results = ocr.image_to_text(segment)
+        assert results["text"][0] == results["text_extra"][0]
+
     def test_split_text_blocks(self):
         ocr = ImageOCR()
         img = np.ones([10, 10, 3], np.uint8)
@@ -52,7 +61,7 @@ def test_ocr_paragraph(self):
         text, confidence = results["text"]
         assert (
             text
-            == "THIS TEST WAS DEVELOPED AND ITS ANALYTICAL PERFORMANCE CHARACTERISTICS HAVE BEEN DETERMINED BY QUEST DIAGNOSTICS NICHOLS INSTITUTE SAN JUAN CAPISTRAND. IT HAS NOT BEEN CLEARED OR APPROVED BY FDA. THIS ASSAY HAS BEEN VALIDATED PURSUANT TO THE CLIA REGULATIONS AND IS USED FOR CLINICAL PURPOSES."
+            == "THIS TEST WAS DEVELOPED AND ITS ANALYTICAL PERFORMANCE CHARACTERISTICS HAVE BEEN DETERMINED BY QUEST DIAGNOSTICS NICHOLS INSTITUTE SAN JUAN CAPISTRANO. IT HAS NOT BEEN CLEARED OR APPROVED BY FDA. THIS ASSAY HAS BEEN VALIDATED PURSUANT TO THE CLIA REGULATIONS AND IS USED FOR CLINICAL PURPOSES."
         )
         assert confidence > 50