improve "phrase" kernel

CDCgov · Oct 18, 2024 · 2d5bf9a · 2d5bf9a
1 parent 692cbdb
commit 2d5bf9a
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 6 deletions.
diff --git a/OCR/ocr/services/image_ocr.py b/OCR/ocr/services/image_ocr.py
@@ -54,7 +54,8 @@ def merge_bounding_boxes(boxes: list) -> Iterator[list]:
 
     def identify_blocks(self, input_image: np.ndarray, kernel: np.ndarray):
         """
-        Given an input image and a morphological operation kernel, return bounding boxes of
+        Given an input image and a morphological operation kernel, returns unique (non-overlapping)
+        bounding boxes of potential text regions.
         """
         # Invert threshold `input_image` and dilate using `kernel` to "expand" the size of text blocks
         _, thresh = cv.threshold(cv.cvtColor(input_image, cv.COLOR_BGR2GRAY), 128, 255, cv.THRESH_BINARY_INV)
@@ -93,7 +94,6 @@ def deskew_image_text(self, image: np.ndarray, line_length_prop=0.5, max_skew_an
         rotation_mat = cv.getRotationMatrix2D((image.shape[1] / 2, image.shape[0] / 2), skew_angle, 1)
         return cv.warpAffine(np.array(image, dtype=np.uint8), rotation_mat, (image.shape[1], image.shape[0]))
 
-
     def split_text_blocks(self, image: np.ndarray, line_length_prop=0.5) -> list[np.ndarray]:
         """
         Splits an image with text in it into possibly multiple images, one for each line.
@@ -106,18 +106,22 @@ def split_text_blocks(self, image: np.ndarray, line_length_prop=0.5) -> list[np.
         # Kernels for morphological operations.
         # Kernel height of 1 implies a minimum separation between lines of 1px
         line_kernel = np.ones([1, int(line_length)], np.uint8)
-        # 3x3 cross-shaped kernel to help identify words in blank space.
-        word_kernel = cv.getStructuringElement(cv.MORPH_CROSS, (3, 3))
+        # 11x5 (width x height) cross-shaped kernel to help identify words or phrases separated by blank space.
+        word_kernel = cv.getStructuringElement(cv.MORPH_CROSS, (11, 5))
 
         acc = []
 
         # Sort identified lines by y-position (top to bottom)
         for x, y, w, h in sorted(self.identify_blocks(rotated, line_kernel), key=lambda x: x[1]):
+            # Skip lines less than 5px tall; they are too small and probably not valid text
+            if h < 5:
+                continue
+
             res = rotated[y : (y + h), x : (x + w)]
 
             # Sort identified text blocks (putative words or phrases) by x-position (left to right)
             for x, y, w, h in sorted(self.identify_blocks(res, word_kernel), key=lambda x: x[0]):
-                acc.append(res[y:(y+h), x:(x+w)])
+                acc.append(res[y : (y + h), x : (x + w)])
 
         # If we skipped all potential text blocks due to filtering conditions, return the
         # original image anyway.

diff --git a/OCR/tests/ocr_test.py b/OCR/tests/ocr_test.py
@@ -52,7 +52,7 @@ def test_ocr_paragraph(self):
         text, confidence = results["text"]
         assert (
             text
-            == "THIS TEST WAS DEVELOPED AND ITS ANALYTICAL PERFORMANCE CHARACTERISTICS HAVE BEEN DETERMINED BY QUEST DIAGNOSTICS NICHOLS INSTITUTE SAN JUAN CAPISTRAND. IT HAS NOT BEEN CLEARED OR APPROVED BY FDA. THIS ASSAY HAS BEEN VALIDATED PURSUANT TO THE CLIA REGULATIONS AND IS USED FOR CLINICAL PURPOSES."
+            == "THIS TEST WAS DEVELOPED AND ITS ANALYTICAL PERFORMANCE CHARACTERISTICS HAVE BEEN DETERMINED BY QUEST DIAGNOSTICS NICHOLS INSTITUTE SAN JUAN CAPISTRANO. IT HAS NOT BEEN CLEARED OR APPROVED BY FDA. THIS ASSAY HAS BEEN VALIDATED PURSUANT TO THE CLIA REGULATIONS AND IS USED FOR CLINICAL PURPOSES."
         )
         assert confidence > 50