Skip to content

Commit

Permalink
Merge pull request #241 from phidatahq/pdf-image-reader-phi-800
Browse files Browse the repository at this point in the history
pdf-image-reader-phi-800
  • Loading branch information
ashpreetbedi authored May 14, 2024
2 parents 1fc3308 + a47eab6 commit 253a419
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 0 deletions.
64 changes: 64 additions & 0 deletions phi/document/reader/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,67 @@ def read(self, url: str) -> List[Document]:
chunked_documents.extend(self.chunk_document(document))
return chunked_documents
return documents


class PDFImageReader(Reader):
    """Reader for PDF files with images.

    For every page, the text returned by the PDF library is combined with the
    text that RapidOCR recognizes in the images embedded in that page.
    """

    def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
        """Read a PDF and return its pages as documents.

        Args:
            pdf: The PDF to read, given as a string path, a ``Path`` or a
                file-like object. It is handed to pypdf's ``PdfReader`` as is.

        Returns:
            One ``Document`` per page, whose content is the page text, a
            newline, and the OCR text of the page's images. When ``self.chunk``
            is truthy, each page document is passed through
            ``self.chunk_document`` and the resulting chunks are returned
            instead.

        Raises:
            ValueError: If ``pdf`` is falsy.
            ImportError: If ``pypdf`` or ``rapidocr_onnxruntime`` cannot be
                imported.
        """
        if not pdf:
            raise ValueError("No pdf provided")

        # Both packages are optional dependencies, so import them lazily and
        # chain the original error to show which one is actually missing.
        try:
            import rapidocr_onnxruntime as rapidocr
            from pypdf import PdfReader as DocumentReader  # noqa: F401
        except ImportError as e:
            raise ImportError("`pypdf or rapidocr_onnxruntime` not installed") from e

        # Derive a document name from the path or from the file object's
        # ``name``; this is best-effort, so any failure (e.g. a stream without
        # a ``name`` attribute) falls back to a generic name.
        doc_name = ""
        try:
            if isinstance(pdf, str):
                doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
            else:
                doc_name = pdf.name.split(".")[0]
        except Exception:
            doc_name = "pdf"

        logger.info(f"Reading: {doc_name}")
        doc_reader = DocumentReader(pdf)

        # Initialize RapidOCR once and reuse it for every image.
        ocr = rapidocr.RapidOCR()

        documents: List[Document] = []
        for page_number, page in enumerate(doc_reader.pages, start=1):
            # extract_text() may yield nothing; normalize that to "".
            page_text = page.extract_text() or ""
            images_text_list: List = []

            for image_object in page.images:
                # The OCR call returns a pair; only the first element (the
                # recognition result) is needed here.
                ocr_result, _ = ocr(image_object.data)

                # Each result item carries the recognized text at index 1.
                if ocr_result:
                    images_text_list.extend(item[1] for item in ocr_result)

            images_text: str = "\n".join(images_text_list)
            content = page_text + "\n" + images_text

            documents.append(
                Document(
                    name=doc_name,
                    id=f"{doc_name}_{page_number}",
                    meta_data={"page": page_number},
                    content=content,
                )
            )

        if self.chunk:
            chunked_documents = []
            for document in documents:
                chunked_documents.extend(self.chunk_document(document))
            return chunked_documents

        return documents
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ module = [
"psycopg.*",
"pypdf.*",
"qdrant_client.*",
"rapidocr_onnxruntime.*",
"requests.*",
"simplejson.*",
"serpapi.*",
Expand Down

0 comments on commit 253a419

Please sign in to comment.