forked from lykawang/PET-Report-Summarization
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_data.py
138 lines (122 loc) · 4.92 KB
/
read_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import pandas as pd
import PyPDF2
from datetime import datetime
# Directory whose entries are listed (non-recursively) to find the PDF
# reports to parse; a relative path, so it is resolved against the current
# working directory.
pdf_folder = 'reports_data'
# Path of the Excel workbook that receives one row per parsed report.
output_file = 'output.xlsx'
# Function to extract text from a single PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF stored at *file_path*.

    The file is opened in binary mode and read through ``PyPDF2.PdfReader``.
    The text extracted from each page is joined, in page order, with a
    newline between consecutive pages.
    """
    with open(file_path, 'rb') as pdf_stream:
        # Extraction has to happen while the stream is still open.
        page_texts = [
            page.extract_text()
            for page in PyPDF2.PdfReader(pdf_stream).pages
        ]
    return '\n'.join(page_texts)
# Helper function to format date
def format_date(date_str):
    """Convert a ``'DD Mon YYYY'`` date string to ``'YYYY-MM-DD'``.

    The input is parsed with the ``'%d %b %Y'`` pattern. A string that does
    not match that pattern is returned unchanged.
    """
    try:
        return datetime.strptime(date_str, '%d %b %Y').strftime('%Y-%m-%d')
    except ValueError:
        # Not in the expected layout: hand the text back untouched.
        return date_str
# Function to clean text from headers and page numbers
def clean_text(text):
    """Strip report boilerplate, page markers and contact lines from *text*.

    Steps, in order:

    1. Every occurrence of the fixed banner phrases is deleted. The longer
       ``PET Scan Report`` variants are listed before the bare phrase so
       they are removed whole.
    2. The text is split into lines and each line is stripped of
       surrounding whitespace.
    3. A line starting with ``Page`` is dropped together with the four
       lines that follow it.
    4. Lines starting with ``MRN (Trial ID):``, ``Date of Service:`` or
       ``Tel:`` are dropped.

    Returns the remaining lines joined with newline characters.
    """
    banner_phrases = (
        "BC Cancer Agency Transcription Text",
        "BCCA PET Scan Report",
        "PET Scan Report (MM)",
        "BC Cancer Agency - Vancouver Centre",
        "DIAGNOSTIC REPORT - PET SCAN",
        "PET Scan Report",
    )
    for phrase in banner_phrases:
        text = text.replace(phrase, "")

    dropped_prefixes = ('MRN (Trial ID):', 'Date of Service:', 'Tel:')
    kept = []
    pending_skips = 0  # lines still to discard after a 'Page' marker
    for raw_line in text.split('\n'):
        if pending_skips:
            # Lines following a page marker are discarded without inspection.
            pending_skips -= 1
            continue
        stripped = raw_line.strip()
        if stripped.startswith('Page'):
            pending_skips = 4
            continue
        if not stripped.startswith(dropped_prefixes):
            kept.append(stripped)
    return '\n'.join(kept)
# Function to parse information from extracted text
def _field_between(line, label, next_label=None):
    """Return the stripped text that follows *label* in *line*.

    When *next_label* is given, the value stops where that label begins.
    An empty string is returned if *label* does not occur in *line*, so a
    header line with missing fields no longer raises ``IndexError``.
    """
    if label not in line:
        return ''
    value = line.split(label)[1]
    if next_label is not None:
        value = value.split(next_label)[0]
    return value.strip()


def parse_information(text):
    """Parse the raw text of one report into a flat dict of fields.

    The text is first passed through ``clean_text``. Each remaining line is
    then handled as follows:

    * A line starting with ``Scan Date:`` supplies ``Scan Date`` and
      ``Dictated on`` (both passed through ``format_date``).
    * A line starting with ``Trial ID:`` supplies ``Trial ID``, ``Sex`` and
      ``Birth`` (the latter passed through ``format_date``).
    * A line starting with one of the known section headings opens that
      section; whatever follows the first ``:`` on the line becomes the
      section's initial content (replacing any earlier content).
    * Any other non-empty line is appended to the currently open section.

    Args:
        text: Raw text of a single report.

    Returns:
        A dict with the keys ``Scan Date``, ``Dictated on``, ``Trial ID``,
        ``Sex``, ``Birth``, ``Procedure``, ``Referring Physician``,
        ``Clinical History``, ``Clinical Information``, ``History``,
        ``Technique``, ``Comparison``, ``Findings or PET Findings`` and
        ``Impression``. Fields that were not found are empty strings.
    """
    data = {
        'Scan Date': '',
        'Dictated on': '',
        'Trial ID': '',
        'Sex': '',
        'Birth': '',
        'Procedure': '',
        'Referring Physician': '',
        'Clinical History': '',
        'Clinical Information': '',
        'History': '',
        'Technique': '',
        'Comparison': '',
        'Findings or PET Findings': '',
        'Impression': ''
    }
    # Heading text -> output field. Only these exact Title Case / UPPER CASE
    # spellings are recognised. Order matters: the first heading that
    # prefixes a line wins.
    section_keys = {
        'Procedure': 'Procedure',
        'PROCEDURE': 'Procedure',
        'Referring Physician': 'Referring Physician',
        'REFERRING PHYSICIAN': 'Referring Physician',
        'Clinical History': 'Clinical History',
        'CLINICAL HISTORY': 'Clinical History',
        'Clinical Information': 'Clinical Information',
        'CLINICAL INFORMATION': 'Clinical Information',
        'History': 'History',
        'HISTORY': 'History',
        'Technique': 'Technique',
        'TECHNIQUE': 'Technique',
        'Comparison': 'Comparison',
        'COMPARISON': 'Comparison',
        'Findings': 'Findings or PET Findings',
        'FINDINGS': 'Findings or PET Findings',
        'PET Findings': 'Findings or PET Findings',
        'PET FINDINGS': 'Findings or PET Findings',
        'Impression': 'Impression',
        'IMPRESSION': 'Impression'
    }
    current_section = None
    text = clean_text(text)  # Clean headers and page numbers before parsing
    for line in text.split('\n'):
        line = line.strip()
        if line.startswith('Scan Date:'):
            data['Scan Date'] = format_date(
                _field_between(line, 'Scan Date:', 'Dictated on:'))
            data['Dictated on'] = format_date(
                _field_between(line, 'Dictated on:', 'Trial ID:'))
        elif line.startswith('Trial ID:'):
            data['Trial ID'] = _field_between(line, 'Trial ID:', 'Sex:')
            data['Sex'] = _field_between(line, 'Sex:', 'Birth:')
            data['Birth'] = format_date(_field_between(line, 'Birth:'))

        # Look the heading up once per line instead of scanning the table
        # a second time to decide whether the line is a continuation.
        heading = next(
            (key for key in section_keys if line.startswith(key)), None)
        if heading is not None:
            current_section = section_keys[heading]
            # Content starts after the first colon; a heading without a
            # colon contributes no content of its own.
            content_start = line.find(':') + 1 if ':' in line else len(line)
            data[current_section] = line[content_start:].strip()
        elif current_section and line:
            # Join continuation lines with single spaces, without a leading
            # space when the heading line carried no content and without
            # extra spaces for blank lines.
            if data[current_section]:
                data[current_section] += ' ' + line
            else:
                data[current_section] = line
    return data
# List to store data from all PDFs
all_data = []
# Process each PDF in the folder. os.listdir() returns entries in arbitrary
# order, so the listing is sorted to make the row order of the spreadsheet
# reproducible; the extension test ignores case so '.PDF' files are included.
for pdf_file in sorted(os.listdir(pdf_folder)):
    if pdf_file.lower().endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        report_data = parse_information(text)
        all_data.append(report_data)
# Create a DataFrame and write it to an Excel file. Passing `columns` fixes
# the column order and, unlike selecting the columns afterwards, also works
# when no PDF was found (an empty frame has no columns to select from, which
# raised KeyError).
df = pd.DataFrame(
    all_data,
    columns=['Scan Date', 'Dictated on', 'Trial ID', 'Sex', 'Birth',
             'Procedure', 'Referring Physician', 'Clinical History',
             'Clinical Information', 'History', 'Technique', 'Comparison',
             'Findings or PET Findings', 'Impression'],
)
df.to_excel(output_file, index=False)
print('Data extraction complete. Output saved to:', output_file)