diff --git a/OCR/ocr/services/pdf_field_extractor_old.py b/OCR/ocr/services/pdf_field_extractor_old.py deleted file mode 100644 index e50a6a34..00000000 --- a/OCR/ocr/services/pdf_field_extractor_old.py +++ /dev/null @@ -1,78 +0,0 @@ -import pypdf -import os -from pdf2image import convert_from_bytes -from PIL import Image -import fitz - - -class PDFFieldExtractor: - def __init__(self, file_path): - self.file_path = file_path - self.reader = None - self.form_fields = [] - - def initialize_reader(self, base_path=None): - if base_path is None: - base_path = os.path.dirname(__file__) - full_path = os.path.join(base_path, self.file_path) - self.reader = pypdf.PdfReader(full_path) - - def close_reader(self): - if self.reader is not None: - self.reader.stream.close() # Close the stream explicitly - self.reader = None - - def segment_fields(self, field_names): - # Iterate through each page in the PDF - if self.reader is None: - raise ValueError("PDF reader is not initialized. Call initialize_reader() first.") - for page in self.reader.pages: - # Check if there are annotations (textboxes are considered annotations) - if "/Annots" in page: - annotations = page["/Annots"] - for annot in annotations: - if isinstance(annot, pypdf.generic.IndirectObject): - annot = annot.get_object() - field = annot.get("/T") - rect = annot.get("/Rect") - if field and rect: - field_str = str(field) - self.form_fields.append((field_str, rect)) - - def extract_images(self): - # extracts the images from the rectangle coordinates - doc = fitz.open(self.file_path) - page = doc[0] - page_rect = page.mediabox - - for field_name, rect in self.form_fields: - left, bottom, right, top = [int(coord) for coord in rect] - top = page_rect[3] - top - bottom = page_rect[3] - bottom - - # Render the page and crop the image - pix = page.get_pixmap(clip=fitz.Rect(left, top, right, bottom), dpi=300) - img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) - img.save(f'extracted_{field_name.replace("/", "_")}.png') - break - - def pdf_to_images(self): - # converts the pdf to a png image - if self.reader is None: - raise ValueError("PDF reader is not initialized. Call initialize_reader() first.") - - # Define the path to save images - output_folder = os.getcwd() - os.makedirs(output_folder, exist_ok=True) - - # Convert each page to an image - images = convert_from_bytes(self.reader.stream.getvalue(), dpi=300) # High DPI for better quality - image_paths = [] - - # Save each image - for i, image in enumerate(images): - image_path = os.path.join(output_folder, f"page_{i+1}.png") - image.save(image_path, "PNG") - image_paths.append(image_path) - - return image_paths diff --git a/OCR/ocr/services/pdf_image_conversion_script.py b/OCR/ocr/services/pdf_image_conversion_script.py deleted file mode 100644 index 9a52656e..00000000 --- a/OCR/ocr/services/pdf_image_conversion_script.py +++ /dev/null @@ -1,23 +0,0 @@ -import os -from pdf2image import convert_from_path - - -def pdf_to_images(path): - # Define the path to save images - output_folder = os.getcwd() - os.makedirs(output_folder, exist_ok=True) - - # Convert each page to an image using the pdf2image library - image_paths = [] - images = convert_from_path(path, dpi=300) # High DPI for better quality - - # Save each image to the defined path - for i, image in enumerate(images): - image_path = os.path.join(output_folder, f"page_{i+1}.png") - image.save(image_path, "PNG") - image_paths.append(image_path) - - return image_paths - - -pdf_to_images("/Users/arindamkulshi/IDWA/IDWA/OCR/tests/assets/fillable_marked.pdf") diff --git a/OCR/tests/assets/output.json b/OCR/tests/assets/output.json deleted file mode 100644 index 1452bdea..00000000 --- a/OCR/tests/assets/output.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "nbs_patient_id": "1902320", - "nbs_investigation_id": "123458", - "patient_first_name": "OHN", - "patient_last_name": "SMITH", - "patient_address": "123 ROSE LANE", - "patient_city": "SAN RAMON", - "patient_country": "CONTRA COSTA", - "patient_zip": "94583", - "patient_region": "NCAL", - "patient_phone": "(925) 943-2455", - "patient_parent": "JOHAS", - "physician_name": "DR.KING", - "physician_phone": "(925) 454-4322", - "physician_address": "SR: 43 DAY LAN SAM RAMAN, CALIMIA 94583", - "reported_by_name": "DR.KING COME AGAIN US ONCLUSIVE", - "reporter_agency": "CDC SUMMARY", - "reporter_agency_phone": "(925) 964-3488", - "reporter_agency_date_reported": "02/07/2024", - "investigator_name": "ADAM SMITH", - "investigator_agency_name": "STLTAGENCY", - "investigator_agency_phone_number": "(925) 862-3211", - "investigator_agency_email": "ISMOITH@GMAIL.COM", - "investigation_agency_start_date": "06/21/2029", - "investigation_agency_completion_date": "06/19/2029", - "patient_dob": "10/12/1998", - "patient_age": "24", - "patient_place_of_birth_other": "CASHIER", - "patient_race_other": "-", - "patient__date_location_delivery": "06/21/2023", - "obstetrician's_name,_address,_and_phone_#": "425 JAMES SHEET SAH RAMON, CARLIMIA, 94582", - "patient_hospitalized_at": "MERCY HOSPITAL", - "patient_admitted": "11/16/2022", - "discharged": "11/16/2022", - "duration_Of_Stay": "25 6", - "diagnosis_date": "11/11/2922", - "symptomatic_onset_date": "-", - "symptomatic_end_date": "-9.95", - "date_of_bilirubin_test": "IT", - "hepatitis_date_of_death": "LOTLE", - "date_of_lab_test": "11/23/202", - "testing_facility": "NEW MEXICO", - "alt_sgpt_result": "45", - "alt_upper_limit_normal": "100", - "ast_sgpt_result": "+54", - "ast_upper_limit_normal": "120", - "date_of_alt_result": "05/18/2023", - "date_of_ast_result": "05/24/2023", - "vaccination_history_year": "2000041" - } - \ No newline at end of file diff --git a/OCR/tests/assets/presentation_show/per_example_marked_labels.json b/OCR/tests/assets/presentation_show/per_example_marked_labels.json deleted file mode 100644 index 5d3ddff8..00000000 --- a/OCR/tests/assets/presentation_show/per_example_marked_labels.json +++ /dev/null @@ -1,160 +0,0 @@ -{ - "41,252,255": "invalid_string_0.5605351260882224", - "23,105,108": "invalid_string_0.8966159851301492", - "25,121,245": "invalid_string_0.8054654336519245", - "82,28,104": "NBS INVESTIGATION ID 1", - "234,193,39": "invalid_string_0.5838974266478411", - "105,7,113": "Address", - "83,42,215": "City", - "153,180,123": "County", - "227,122,77": "Zip", - "12,132,65": "Region", - "13,57,33": "Phone_1", - "173,45,232": "ParentGuardian", - "104,110,134": "Physician", - "174,227,252": "Phone_2", - "52,148,63": "Address 1", - "107,236,254": "Address 2", - "67,57,152": "Check box if history of homelessness in last 6 months", - "103,51,204": "Reported by", - "184,173,196": "Agency", - "71,72,144": "Phone_3", - "196,201,162": "Month1", - "156,42,36": "Day1", - "209,43,167": "Year1", - "32,190,163": "Investigated by", - "59,137,163": "Agency_2", - "173,43,5": "Phone_4", - "233,156,75": "Email", - "207,237,71": "Month2", - "49,80,87": "Day2", - "65,134,62": "Year2", - "184,250,169": "Month3", - "251,176,159": "Day3", - "26,189,32": "Year3", - "49,237,214": "Month4", - "189,4,178": "Day4", - "116,57,0": "Year4", - "80,173,124": "AGE", - "57,94,47": "invalid_string_0.7208335460109319", - "30,141,59": "Other", - "220,210,175": "invalid_string_0.8479278438146428", - "105,113,250": "invalid_string_0.7507941635473869", - "162,246,130": "invalid_string_0.8308565950534758", - "111,152,163": "White", - "122,157,72": "Black", - "204,51,11": "Asian", - "155,34,41": "Native Hawaiian or Other Pac Islander", - "150,11,11": "Am Indian or Alaska Native", - "225,9,81": "Unknown_3", - "161,74,6": "undefined_14", - "185,187,81": "Other_2", - "178,223,22": "invalid_string_0.6390488130163259", - "1,124,190": "invalid_string_0.7021717315173803", - "255,64,200": "invalid_string_0.162992466611241", - "15,215,27": "invalid_string_0.9730733396349347", - "233,245,195": "invalid_string_0.15532696519664813", - "187,82,187": "If yes Mothers age at date of infant birth", - "35,136,6": "Infant birth weight", - "91,40,197": "lbs", - "16,68,158": "oz OR", - "238,98,230": "Unknown_5", - "60,254,110": "invalid_string_0.8671924547664371", - "225,0,168": "Month5", - "143,169,182": "Day5", - "240,113,209": "Year5", - "157,34,170": "invalid_string_0.9459024060583142", - "151,162,140": "invalid_string_0.6454094539715259", - "98,186,83": "Obstetricians name address and phone 1", - "251,137,154": "Obstetricians name address and phone 2", - "2,195,242": "Month6", - "101,166,8": "Day6", - "248,197,108": "Year6", - "13,8,220": "Month7", - "212,202,155": "Day7", - "162,227,181": "Year7", - "45,146,17": "Month8", - "147,47,132": "Day8", - "183,200,115": "Year8", - "185,106,153": "Final Cough Duration total of days", - "69,226,106": "invalid_string_0.14513825470553632", - "201,39,16": "invalid_string_0.40193986397486425", - "92,16,37": "invalid_string_0.30566366019702085", - "149,189,15": "invalid_string_0.5291564927990424", - "212,61,150": "invalid_string_0.955800430825003", - "219,86,5": "invalid_string_0.20931474499662028", - "58,30,62": "invalid_string_0.25514587549946754", - "55,65,158": "invalid_string_0.34253021501743586", - "166,187,247": "invalid_string_0.8618180806834933", - "73,250,72": "invalid_string_0.7149184781235941", - "66,203,0": "Month9", - "238,171,164": "Day9", - "209,129,71": "Year9", - "131,76,223": "invalid_string_0.020150434301934195", - "76,169,175": "invalid_string_0.48344496104297685", - "157,247,55": "invalid_string_0.5056960939085311", - "26,0,247": "invalid_string_0.12308788243304625", - "45,55,231": "invalid_string_0.674655208066316", - "56,176,233": "invalid_string_0.11593493525680065", - "156,109,155": "invalid_string_0.7897160223327044", - "217,70,231": "Other_5", - "237,176,151": "invalid_string_0.5820500189645772", - "32,104,164": "invalid_string_0.8391709165012194", - "68,90,27": "invalid_string_0.026454101625835946", - "123,5,174": "invalid_string_0.605243701705765", - "232,85,120": "invalid_string_0.1311346196396641", - "145,110,37": "invalid_string_0.3855133021102556", - "214,125,121": "Azithromycin", - "20,36,154": "invalid_string_0.6733478445043256", - "83,198,140": "Month10", - "115,237,94": "Day10", - "13,150,29": "Year10", - "17,249,202": "days_1", - "204,246,137": "Bactrim", - "103,4,203": "Month11", - "37,109,50": "Day11", - "17,132,166": "Year11", - "147,230,28": "days_2", - "175,77,81": "Clarithromycin", - "141,253,183": "Month12", - "236,218,89": "Day12", - "149,141,134": "Year12", - "216,129,225": "days_3", - "199,94,149": "Erythromycin", - "150,9,204": "Month13", - "110,136,128": "Day13", - "255,183,7": "Year13", - "184,72,102": "days_4", - "45,216,202": "Other Abx1", - "215,50,45": "Other_3", - "142,132,42": "Month14", - "114,54,61": "Day14", - "144,216,106": "Year14", - "18,177,101": "days_5", - "246,161,79": "Other Abx 2", - "16,32,138": "Other_4", - "55,239,144": "Month15", - "90,3,253": "Day15", - "229,53,173": "Year15", - "11,107,240": "days_6", - "5,241,222": "invalid_string_0.6626685908884317", - "137,3,94": "invalid_string_0.6181188725469583", - "29,101,33": "Hospitalized at", - "201,4,105": "Admit month", - "64,27,250": "admit day", - "219,198,241": "admit year", - "0,186,55": "discharge month", - "184,184,163": "discharge day", - "174,160,115": "discharge year", - "227,252,79": "Duration of Stay", - "18,41,163": "invalid_string_0.918431788462065", - "99,163,210": "Month16", - "56,254,69": "Day16", - "53,53,226": "Year16", - "216,189,194": "invalid_string_0.14962989558911177", - "25,82,16": "invalid_string_0.4116693176992734", - "33,201,205": "invalid_string_0.7005161681585382", - "41,195,210": "invalid_string_0.5215430319434585", - "248,106,14": "invalid_string_0.2053305895501707", - "92,52,237": "invalid_string_0.00521647438578432" -} \ No newline at end of file