-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_processor.py
106 lines (88 loc) · 3.91 KB
/
pdf_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pytesseract
from PIL import Image
from io import BytesIO
import cv2
import easyocr
import numpy as np
from pdf2image import convert_from_path
import PyPDF2
class PDFProcessor:
    """Extract text and metadata from a PDF supplied as a binary stream.

    Text is taken from the PDF's text layer via PyPDF2 first.  Pages that
    yield no text are rasterised with pdf2image and run through OCR
    (Tesseract by default; EasyOCR via ``extract_text_with_easyocr``).

    The stream is read exactly once, in ``__init__``.  The bytes are kept on
    the instance so that later rasterisation works from the same data rather
    than re-reading a stream that is already at end-of-file.
    """

    def __init__(self, file):
        """Read *file* and try to open its contents with PyPDF2.

        Args:
            file: Binary file-like object exposing ``read()``.

        A failure to read or parse is reported on stdout and leaves
        ``self.reader`` set to ``None``; nothing is raised from here.
        """
        self.file = file
        self.pages_text = []
        self.reader = None
        self._pdf_bytes = b""
        self._easyocr_reader = None
        try:
            self._pdf_bytes = self.file.read()
            self.reader = PyPDF2.PdfReader(BytesIO(self._pdf_bytes))
        except Exception as e:
            # Deliberately broad: a damaged upload must not abort construction,
            # the OCR entry points may still be usable on the raw bytes.
            print(f"Error opening with PyPDF2: {e}")
            self.reader = None

    def extract_text_by_page(self):
        """Return one text string per page, using OCR where a page has no text.

        If anything fails while walking the pages, the partial result is
        discarded and the whole document is OCR'd instead.

        Returns:
            list[str]: Page texts in page order (also stored in
            ``self.pages_text``).

        Raises:
            ValueError: If the PDF could not be opened in ``__init__``.
        """
        if self.reader is None:
            raise ValueError("PDF document is not initialized.")
        self.pages_text = []
        try:
            pages = self.reader.pages
            total = len(pages)
            for i, page in enumerate(pages):
                print(f"Processing page {i + 1}/{total}...")
                extracted_text = page.extract_text()
                if extracted_text:
                    print(f"Text extracted from page {i + 1}: {extracted_text[:5000]}...\n")
                    self.pages_text.append(extracted_text)
                else:
                    print(f"No text found on page {i + 1}, switching to OCR...")
                    self.pages_text.append(self.extract_text_with_ocr_per_page(i))
        except Exception as e:
            print(f"Error encountered: {e}, switching to OCR for entire document...")
            self.pages_text = self.extract_text_with_ocr()
        return self.pages_text

    def _render_pages(self, **kwargs):
        """Rasterise the cached PDF bytes into a list of PIL images.

        Keyword arguments are forwarded to ``pdf2image.convert_from_bytes``.
        """
        # Imported here so the class does not depend on an edit to the
        # module's import block; convert_from_path only accepts a file path.
        from pdf2image import convert_from_bytes

        return convert_from_bytes(self._pdf_bytes, **kwargs)

    def _ocr_image(self, image):
        """Run Tesseract on one rendered page (a PIL image) and return its text."""
        # PIL delivers RGB channel order, whereas preprocess_image_for_ocr
        # converts from OpenCV's BGR order, so swap the channels first.
        rgb = np.array(image.convert("RGB"))
        bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
        return pytesseract.image_to_string(self.preprocess_image_for_ocr(bgr))

    def extract_text_with_ocr_per_page(self, page_number):
        """OCR a single page.

        Args:
            page_number: Zero-based page index.

        Returns:
            str: Text recognised on that page.
        """
        # pdf2image numbers pages from 1.
        images = self._render_pages(first_page=page_number + 1, last_page=page_number + 1)
        parts = []
        for image in images:
            print(f"Processing OCR for page {page_number + 1}...")
            parts.append(self._ocr_image(image))
        return "".join(parts)

    def extract_text_with_ocr(self):
        """OCR every page of the document with Tesseract.

        Returns:
            list[str]: Recognised text, one entry per page.
        """
        images = self._render_pages()
        total = len(images)
        ocr_text = []
        for i, image in enumerate(images, start=1):
            print(f"Processing OCR for page {i}/{total}...")
            ocr_text.append(self._ocr_image(image))
        return ocr_text

    def preprocess_image_for_ocr(self, image):
        """Binarise and smooth an image before handing it to OCR.

        Args:
            image: ``numpy`` array, either BGR colour or already grayscale
                (two-dimensional).

        Returns:
            The processed single-channel image.
        """
        # A 2-D array is already single-channel; cvtColor would reject it.
        if image.ndim == 2:
            gray_image = image
        else:
            gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        adaptive_thresh = cv2.adaptiveThreshold(
            gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )
        blurred_image = cv2.GaussianBlur(adaptive_thresh, (5, 5), 0)
        # With a 1x1 kernel the dilation leaves the image effectively unchanged;
        # kept so the kernel size remains a single tuning point.
        kernel = np.ones((1, 1), np.uint8)
        return cv2.dilate(blurred_image, kernel, iterations=1)

    def extract_text_with_easyocr(self):
        """OCR every page of the document with EasyOCR (English).

        Returns:
            list[str]: One entry per page; each recognised fragment is
            followed by a newline.
        """
        # Building the EasyOCR reader is costly, so create it on first use
        # and keep it for later calls.
        if self._easyocr_reader is None:
            self._easyocr_reader = easyocr.Reader(['en'])
        images = self._render_pages()
        total = len(images)
        ocr_text = []
        for i, image in enumerate(images, start=1):
            print(f"Processing EasyOCR for page {i}/{total}...")
            results = self._easyocr_reader.readtext(np.array(image))
            ocr_text.append("".join(f"{text}\n" for _, text, _ in results))
        return ocr_text

    def extract_metadata(self):
        """Return the document's metadata, or ``{}`` when none is available.

        Uses the reader opened in ``__init__`` rather than re-reading the
        input stream.  Errors are reported on stdout and yield ``{}``.
        """
        if self.reader is None:
            print("Error extracting metadata: PDF document is not initialized.")
            return {}
        try:
            metadata = self.reader.metadata
        except Exception as e:
            print(f"Error extracting metadata: {e}")
            return {}
        return metadata if metadata is not None else {}