From 9c4b7c880d027d202c2cd6ec3dae0114825234f5 Mon Sep 17 00:00:00 2001
From: Arindam Kulshi <akulshi04@gmail.com>
Date: Mon, 20 May 2024 08:30:32 -0700
Subject: [PATCH] Move color matches to a list and test against that list

---
 OCR/ocr/services/pdf_field_extractor.py | 24 ++++++++++++++++++------
 OCR/tests/pdf_field_extractor_test.py   | 20 +++++++++-----------
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/OCR/ocr/services/pdf_field_extractor.py b/OCR/ocr/services/pdf_field_extractor.py
index bee7856e..8c035293 100644
--- a/OCR/ocr/services/pdf_field_extractor.py
+++ b/OCR/ocr/services/pdf_field_extractor.py
@@ -1,8 +1,9 @@
 import json
+import os
 import random
 from typing import Dict, List, Optional, Tuple
+
 import pypdf
-import os
 from pdf2image import convert_from_path
 
 
@@ -17,6 +18,7 @@ def __init__(self, file_path: str):
         self.file_path = file_path
         self.reader = None
         self.form_fields = []
+        self.color_matches = []
 
     def initialize_reader(self, base_path: Optional[str] = None) -> None:
         """
@@ -135,7 +137,7 @@ def update_annotations_and_save(
             json.dump(color_label_map, json_file, indent=4)
         return output_path, labels_path
 
-    def mark_rectangles_on_pdf(self) -> Tuple[str, str]:
+    def mark_rectangles_on_pdf(self):
         """
         Process the PDF to add rectangle annotations and save the document along with a JSON mapping file.
 
@@ -150,6 +152,7 @@ def mark_rectangles_on_pdf(self) -> Tuple[str, str]:
 
         color_label_map = {}
         count = 0
+        color_matches = []
         for page in self.reader.pages:
             annotations = page.get("/Annots", pypdf.generic.ArrayObject())
 
@@ -168,11 +171,11 @@ def mark_rectangles_on_pdf(self) -> Tuple[str, str]:
 
                     new_annot = self.create_rectangle_annotation(rect, color)
                     new_annotations.append(new_annot)
-
+                    pdf_color = new_annot.get("/C").get_object()
+                    pdf_color_values = [int(color_val * 255) for color_val in pdf_color]
+                    pdf_color_str = ",".join(map(str, pdf_color_values))
+                    self.color_matches.append((color_str, pdf_color_str))
                     if count < 5:
-                        pdf_color = new_annot.get("/C").get_object()
-                        pdf_color_values = [int(color_val * 255) for color_val in pdf_color]
-                        pdf_color_str = ",".join(map(str, pdf_color_values))
                         print(f"Color in labels file: {color_str}")
                         print(f"Color in PDF annotation: {pdf_color_str}")
                         count += 1
@@ -211,3 +214,12 @@ def pdf_to_images(self, path) -> List[str]:
             image_paths.append(image_path)
 
         return image_paths
+
+    def get_color_matches(self) -> List[Tuple[str, str]]:
+        """
+        Get the (label color, PDF annotation color) string pairs recorded by mark_rectangles_on_pdf, for testing purposes
+
+        Returns:
+            List[Tuple[str, str]]: A list of tuples containing the colors in labels and the colors in the PDF annotations.
+        """
+        return self.color_matches
diff --git a/OCR/tests/pdf_field_extractor_test.py b/OCR/tests/pdf_field_extractor_test.py
index 0ac50648..17b49518 100644
--- a/OCR/tests/pdf_field_extractor_test.py
+++ b/OCR/tests/pdf_field_extractor_test.py
@@ -1,8 +1,9 @@
-import pytest
-from ocr.services.pdf_field_extractor import PDFFieldExtractor
 import os
+
 import pypdf
-import re
+import pytest
+
+from ocr.services.pdf_field_extractor import PDFFieldExtractor
 
 
 @pytest.fixture
@@ -30,7 +31,7 @@ def test_generate_random_color(pdf_extractor):
     assert isinstance(color, str), "Output should be a string"
     parts = color.split(",")
     assert len(parts) == 3, "Color should have three parts"
-    all(int(part) >= 0 and int(part) <= 255 for part in parts), "All RGB values should be within 0-255"
+    all(0 <= int(part) <= 255 for part in parts), "All RGB values should be within 0-255"
 
 
 def test_create_rectangle_annotation(pdf_extractor):
@@ -48,13 +49,10 @@ def test_document_creation(pdf_extractor, mocker):
     assert labels == "path_to_labels", "Should return the correct path to the labels JSON"
 
 
-def test_end_to_end_segment_creation(pdf_extractor, mocker, capsys):
-    mocker.patch.object(pdf_extractor, "update_annotations_and_save", return_value=("path_to_pdf", "path_to_labels"))
-    output, labels = pdf_extractor.mark_rectangles_on_pdf()
-    captured = capsys.readouterr()
-    color_matches = re.findall(
-        r"Color in labels file: (\d+,\d+,\d+)\nColor in PDF annotation: (\d+,\d+,\d+)", captured.out
-    )
+def test_end_to_end_segment_creation(pdf_extractor):
+    pdf_extractor.initialize_reader()
+    pdf_extractor.mark_rectangles_on_pdf()
+    color_matches = pdf_extractor.get_color_matches()
     for label_color, pdf_color in color_matches:
         assert (
             label_color == pdf_color