Skip to content

Commit

Permalink
Move color matches to a list and test against that list
Browse files Browse the repository at this point in the history
  • Loading branch information
arinkulshi committed May 20, 2024
1 parent 3fe261b commit 9c4b7c8
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 17 deletions.
24 changes: 18 additions & 6 deletions OCR/ocr/services/pdf_field_extractor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import json
import os
import random
from typing import Dict, List, Optional, Tuple

import pypdf
import os
from pdf2image import convert_from_path


Expand All @@ -17,6 +18,7 @@ def __init__(self, file_path: str):
self.file_path = file_path
self.reader = None
self.form_fields = []
self.color_matches = []

def initialize_reader(self, base_path: Optional[str] = None) -> None:
"""
Expand Down Expand Up @@ -135,7 +137,7 @@ def update_annotations_and_save(
json.dump(color_label_map, json_file, indent=4)
return output_path, labels_path

def mark_rectangles_on_pdf(self) -> Tuple[str, str]:
def mark_rectangles_on_pdf(self):
"""
Process the PDF to add rectangle annotations and save the document along with a JSON mapping file.
Expand All @@ -150,6 +152,7 @@ def mark_rectangles_on_pdf(self) -> Tuple[str, str]:

color_label_map = {}
count = 0
color_matches = []

Check failure on line 155 in OCR/ocr/services/pdf_field_extractor.py

View workflow job for this annotation

GitHub Actions / python

Ruff (F841)

OCR/ocr/services/pdf_field_extractor.py:155:9: F841 Local variable `color_matches` is assigned to but never used
for page in self.reader.pages:
annotations = page.get("/Annots", pypdf.generic.ArrayObject())

Expand All @@ -168,11 +171,11 @@ def mark_rectangles_on_pdf(self) -> Tuple[str, str]:

new_annot = self.create_rectangle_annotation(rect, color)
new_annotations.append(new_annot)

pdf_color = new_annot.get("/C").get_object()
pdf_color_values = [int(color_val * 255) for color_val in pdf_color]
pdf_color_str = ",".join(map(str, pdf_color_values))
self.color_matches.append((color_str, pdf_color_str))
if count < 5:
pdf_color = new_annot.get("/C").get_object()
pdf_color_values = [int(color_val * 255) for color_val in pdf_color]
pdf_color_str = ",".join(map(str, pdf_color_values))
print(f"Color in labels file: {color_str}")
print(f"Color in PDF annotation: {pdf_color_str}")
count += 1
Expand Down Expand Up @@ -211,3 +214,12 @@ def pdf_to_images(self, path) -> List[str]:
image_paths.append(image_path)

return image_paths

def get_color_matches(self) -> List[Tuple[str, str]]:
    """
    Expose the recorded label/annotation color pairs so tests can compare them.

    The stored list is handed back as-is (no copy is made), so the caller
    receives the same object held in ``self.color_matches``.

    Returns:
        List[Tuple[str, str]]: Pairs of (color in the labels file, color in the PDF annotation).
    """
    recorded_pairs = self.color_matches
    return recorded_pairs
20 changes: 9 additions & 11 deletions OCR/tests/pdf_field_extractor_test.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import pytest
from ocr.services.pdf_field_extractor import PDFFieldExtractor
import os

import pypdf
import re
import pytest

from ocr.services.pdf_field_extractor import PDFFieldExtractor


@pytest.fixture
Expand Down Expand Up @@ -30,7 +31,7 @@ def test_generate_random_color(pdf_extractor):
assert isinstance(color, str), "Output should be a string"
parts = color.split(",")
assert len(parts) == 3, "Color should have three parts"
all(int(part) >= 0 and int(part) <= 255 for part in parts), "All RGB values should be within 0-255"
all(0 <= int(part) <= 255 for part in parts), "All RGB values should be within 0-255"


def test_create_rectangle_annotation(pdf_extractor):
Expand All @@ -48,13 +49,10 @@ def test_document_creation(pdf_extractor, mocker):
assert labels == "path_to_labels", "Should return the correct path to the labels JSON"


def test_end_to_end_segment_creation(pdf_extractor, mocker, capsys):
mocker.patch.object(pdf_extractor, "update_annotations_and_save", return_value=("path_to_pdf", "path_to_labels"))
output, labels = pdf_extractor.mark_rectangles_on_pdf()
captured = capsys.readouterr()
color_matches = re.findall(
r"Color in labels file: (\d+,\d+,\d+)\nColor in PDF annotation: (\d+,\d+,\d+)", captured.out
)
def test_end_to_end_segment_creation(pdf_extractor):
pdf_extractor.initialize_reader()
pdf_extractor.mark_rectangles_on_pdf()
color_matches = pdf_extractor.get_color_matches()
for label_color, pdf_color in color_matches:
assert (
label_color == pdf_color
Expand Down

0 comments on commit 9c4b7c8

Please sign in to comment.