From 9c4b7c880d027d202c2cd6ec3dae0114825234f5 Mon Sep 17 00:00:00 2001 From: Arindam Kulshi Date: Mon, 20 May 2024 08:30:32 -0700 Subject: [PATCH] move color matches to a list and tested against a list --- OCR/ocr/services/pdf_field_extractor.py | 24 ++++++++++++++++++------ OCR/tests/pdf_field_extractor_test.py | 20 +++++++++----------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/OCR/ocr/services/pdf_field_extractor.py b/OCR/ocr/services/pdf_field_extractor.py index bee7856e..8c035293 100644 --- a/OCR/ocr/services/pdf_field_extractor.py +++ b/OCR/ocr/services/pdf_field_extractor.py @@ -1,8 +1,9 @@ import json +import os import random from typing import Dict, List, Optional, Tuple + import pypdf -import os from pdf2image import convert_from_path @@ -17,6 +18,7 @@ def __init__(self, file_path: str): self.file_path = file_path self.reader = None self.form_fields = [] + self.color_matches = [] def initialize_reader(self, base_path: Optional[str] = None) -> None: """ @@ -135,7 +137,7 @@ def update_annotations_and_save( json.dump(color_label_map, json_file, indent=4) return output_path, labels_path - def mark_rectangles_on_pdf(self) -> Tuple[str, str]: + def mark_rectangles_on_pdf(self): """ Process the PDF to add rectangle annotations and save the document along with a JSON mapping file. @@ -150,6 +152,7 @@ def mark_rectangles_on_pdf(self) -> Tuple[str, str]: color_label_map = {} count = 0 + color_matches = [] for page in self.reader.pages: annotations = page.get("/Annots", pypdf.generic.ArrayObject()) @@ -168,11 +171,11 @@ def mark_rectangles_on_pdf(self) -> Tuple[str, str]: new_annot = self.create_rectangle_annotation(rect, color) new_annotations.append(new_annot) - + pdf_color = new_annot.get("/C").get_object() + pdf_color_values = [int(color_val * 255) for color_val in pdf_color] + pdf_color_str = ",".join(map(str, pdf_color_values)) + self.color_matches.append((color_str, pdf_color_str)) if count < 5: - pdf_color = new_annot.get("/C").get_object() - pdf_color_values = [int(color_val * 255) for color_val in pdf_color] - pdf_color_str = ",".join(map(str, pdf_color_values)) print(f"Color in labels file: {color_str}") print(f"Color in PDF annotation: {pdf_color_str}") count += 1 @@ -211,3 +214,12 @@ def pdf_to_images(self, path) -> List[str]: image_paths.append(image_path) return image_paths + + def get_color_matches(self) -> List[Tuple[str, str]]: + """ + Get the color matches for testing purposes + + Returns: + List[Tuple[str, str]]: A list of tuples containing the colors in labels and the colors in the PDF annotations. + """ + return self.color_matches diff --git a/OCR/tests/pdf_field_extractor_test.py b/OCR/tests/pdf_field_extractor_test.py index 0ac50648..17b49518 100644 --- a/OCR/tests/pdf_field_extractor_test.py +++ b/OCR/tests/pdf_field_extractor_test.py @@ -1,8 +1,9 @@ -import pytest -from ocr.services.pdf_field_extractor import PDFFieldExtractor import os + import pypdf -import re +import pytest + +from ocr.services.pdf_field_extractor import PDFFieldExtractor @pytest.fixture @@ -30,7 +31,7 @@ def test_generate_random_color(pdf_extractor): assert isinstance(color, str), "Output should be a string" parts = color.split(",") assert len(parts) == 3, "Color should have three parts" - all(int(part) >= 0 and int(part) <= 255 for part in parts), "All RGB values should be within 0-255" + all(0 <= int(part) <= 255 for part in parts), "All RGB values should be within 0-255" def test_create_rectangle_annotation(pdf_extractor): @@ -48,13 +49,10 @@ def test_document_creation(pdf_extractor, mocker): assert labels == "path_to_labels", "Should return the correct path to the labels JSON" -def test_end_to_end_segment_creation(pdf_extractor, mocker, capsys): - mocker.patch.object(pdf_extractor, "update_annotations_and_save", return_value=("path_to_pdf", "path_to_labels")) - output, labels = pdf_extractor.mark_rectangles_on_pdf() - captured = capsys.readouterr() - color_matches = re.findall( - r"Color in labels file: (\d+,\d+,\d+)\nColor in PDF annotation: (\d+,\d+,\d+)", captured.out - ) +def test_end_to_end_segment_creation(pdf_extractor): + pdf_extractor.initialize_reader() + pdf_extractor.mark_rectangles_on_pdf() + color_matches = pdf_extractor.get_color_matches() for label_color, pdf_color in color_matches: assert ( label_color == pdf_color