Skip to content

Commit

Permalink
Implement fine text block breaking algorithm (#339)
Browse files Browse the repository at this point in the history
* init work on fine text blocks algo

* wip

* improve "phrase" kernel

* add test
  • Loading branch information
jonchang authored Oct 21, 2024
1 parent 08371b5 commit c1c0798
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 20 deletions.
46 changes: 27 additions & 19 deletions OCR/ocr/services/image_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,21 @@ def merge_bounding_boxes(boxes: list) -> Iterator[list]:
# Return the final box
yield [current[0], current[1], current[2] - current[0], current[3] - current[1]]

def identify_blocks(self, input_image: np.ndarray, kernel: np.ndarray):
"""
Given an input image and a morphological operation kernel, returns unique (non-overlapping)
bounding boxes of potential text regions.
"""
# Invert threshold `input_image` and dilate using `kernel` to "expand" the size of text blocks
_, thresh = cv.threshold(cv.cvtColor(input_image, cv.COLOR_BGR2GRAY), 128, 255, cv.THRESH_BINARY_INV)
dial = cv.dilate(thresh, kernel)

# Estimate contours, only looking for outlines (`RETR_EXTERNAL`) and simplifying the shapes (`CHAIN_APPROX_SIMPLE`)
contours, _ = cv.findContours(dial, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)

# Simplify each contour into a bounding box and merge potential overlaps
return self.merge_bounding_boxes([cv.boundingRect(contour) for contour in contours])

def deskew_image_text(self, image: np.ndarray, line_length_prop=0.5, max_skew_angle=10) -> np.ndarray:
"""
Deskew an image using Hough transforms to detect lines.
Expand Down Expand Up @@ -88,28 +103,25 @@ def split_text_blocks(self, image: np.ndarray, line_length_prop=0.5) -> list[np.
line_length = image.shape[1] * line_length_prop
rotated = self.deskew_image_text(image, line_length_prop)

# Invert threshold and dilate using a horizontal kernel to "expand" the size of text blocks
_, thresh = cv.threshold(cv.cvtColor(rotated, cv.COLOR_BGR2GRAY), 128, 255, cv.THRESH_BINARY_INV)

# Kernels for morphological operations.
# Kernel height of 1 implies a minimum separation between lines of 1px
kernel = np.ones([1, int(line_length)], np.uint8)
dial = cv.dilate(thresh, kernel)

# Estimate contours, only looking for outlines (`RETR_EXTERNAL`) and simplifying the shapes (`CHAIN_APPROX_SIMPLE`)
contours, _ = cv.findContours(dial, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)

# Simplify each contour into a bounding box
bbox = [cv.boundingRect(contour) for contour in contours]
line_kernel = np.ones([1, int(line_length)], np.uint8)
# 11x5 cross-shaped kernel to help identify words in blank space.
word_kernel = cv.getStructuringElement(cv.MORPH_CROSS, (11, 5))

acc = []
# Merge overlapping bounding boxes, then sort the bounding boxes by y-position (top to bottom)
for x, y, w, h in sorted(self.merge_bounding_boxes(bbox), key=lambda x: x[1]):

# Sort identified lines by y-position (top to bottom)
for x, y, w, h in sorted(self.identify_blocks(rotated, line_kernel), key=lambda x: x[1]):
# Filter lines that are too tiny and probably invalid
if h < 10:
if h < 5:
continue

res = rotated[y : (y + h), x : (x + w)]
acc.append(res)

# Sort identified text blocks (putative words or phrases) by x-position (left to right)
for x, y, w, h in sorted(self.identify_blocks(res, word_kernel), key=lambda x: x[0]):
acc.append(res[y : (y + h), x : (x + w)])

# If we skipped all potential text blocks due to filtering conditions, return the
# original image anyway.
Expand All @@ -128,10 +140,6 @@ def image_to_text(self, segments: dict[str, np.ndarray]) -> dict[str, tuple[str,

text_blocks = self.split_text_blocks(image)

# Ignore output from `split_text_blocks` algorithm if only one text block is detected
if len(text_blocks) == 1:
text_blocks = [image]

for block in text_blocks:
pixel_values = self.processor(images=block, return_tensors="pt").pixel_values
with torch.no_grad():
Expand Down
11 changes: 10 additions & 1 deletion OCR/tests/ocr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@


class TestOCR:
def test_extra_blank_space(self):
ocr = ImageOCR()
paragraph = cv.imread(paragraph_image_path, cv.IMREAD_COLOR)
padding = (200, 200, 200, 200)
paragraph_extra_space = cv.copyMakeBorder(paragraph, *padding, cv.BORDER_CONSTANT, value=(255, 255, 255))
segment = {"text": paragraph, "text_extra": paragraph_extra_space}
results = ocr.image_to_text(segment)
assert results["text"][0] == results["text_extra"][0]

def test_split_text_blocks(self):
ocr = ImageOCR()
img = np.ones([10, 10, 3], np.uint8)
Expand Down Expand Up @@ -52,7 +61,7 @@ def test_ocr_paragraph(self):
text, confidence = results["text"]
assert (
text
== "THIS TEST WAS DEVELOPED AND ITS ANALYTICAL PERFORMANCE CHARACTERISTICS HAVE BEEN DETERMINED BY QUEST DIAGNOSTICS NICHOLS INSTITUTE SAN JUAN CAPISTRAND. IT HAS NOT BEEN CLEARED OR APPROVED BY FDA. THIS ASSAY HAS BEEN VALIDATED PURSUANT TO THE CLIA REGULATIONS AND IS USED FOR CLINICAL PURPOSES."
== "THIS TEST WAS DEVELOPED AND ITS ANALYTICAL PERFORMANCE CHARACTERISTICS HAVE BEEN DETERMINED BY QUEST DIAGNOSTICS NICHOLS INSTITUTE SAN JUAN CAPISTRANO. IT HAS NOT BEEN CLEARED OR APPROVED BY FDA. THIS ASSAY HAS BEEN VALIDATED PURSUANT TO THE CLIA REGULATIONS AND IS USED FOR CLINICAL PURPOSES."
)
assert confidence > 50

Expand Down

0 comments on commit c1c0798

Please sign in to comment.