Merge pull request #241 from phidatahq/pdf-image-reader-phi-800

pdf-image-reader-phi-800
phidatahq · May 14, 2024 · 253a419 · 253a419
2 parents 1fc3308 + a47eab6
commit 253a419
Show file tree

Hide file tree

Showing 2 changed files with 65 additions and 0 deletions.
diff --git a/phi/document/reader/pdf.py b/phi/document/reader/pdf.py
@@ -87,3 +87,67 @@ def read(self, url: str) -> List[Document]:
                 chunked_documents.extend(self.chunk_document(document))
             return chunked_documents
         return documents
+
+
+class PDFImageReader(Reader):
+    """Reader for PDF files that also extracts text from embedded images.
+
+    Each page yields one Document whose content is the page's extractable
+    text, a newline, and the text RapidOCR recognises in the page's images.
+    """
+
+    def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
+        """Read a PDF and return one Document per page (or their chunks).
+
+        Args:
+            pdf: Location of the PDF as a ``str`` or ``Path``, or a file-like
+                object; it is handed to ``pypdf.PdfReader`` unchanged.
+
+        Returns:
+            One Document per page, or the documents produced by
+            ``self.chunk_document`` for each page when ``self.chunk`` is truthy.
+
+        Raises:
+            ValueError: If ``pdf`` is falsy.
+            ImportError: If ``pypdf`` or ``rapidocr_onnxruntime`` cannot be
+                imported.
+        """
+        if not pdf:
+            raise ValueError("No pdf provided")
+
+        try:
+            import rapidocr_onnxruntime as rapidocr
+            from pypdf import PdfReader as DocumentReader  # noqa: F401
+        except ImportError as e:
+            # Chain the original error so the traceback names the missing module.
+            raise ImportError("`pypdf or rapidocr_onnxruntime` not installed") from e
+
+        doc_name = ""
+        try:
+            if isinstance(pdf, str):
+                doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
+            else:
+                # A file object's ``name`` can be a full path (e.g.
+                # "./docs/a.pdf"); reduce it to the final component first so
+                # directories or a leading "." do not end up in the name.
+                doc_name = Path(pdf.name).name.split(".")[0]
+        except Exception:
+            # Best effort: inputs without a usable ``name`` fall back to "pdf".
+            doc_name = "pdf"
+
+        logger.info(f"Reading: {doc_name}")
+        doc_reader = DocumentReader(pdf)
+
+        # Initialize RapidOCR
+        ocr = rapidocr.RapidOCR()
+
+        documents = []
+        for page_number, page in enumerate(doc_reader.pages, start=1):
+            page_text = page.extract_text() or ""
+            images_text_list: List = []
+
+            for image_object in page.images:
+                # Perform OCR on the image; the second return value is unused.
+                ocr_result, _ = ocr(image_object.data)
+
+                # Index 1 of every OCR item is collected as the recognised text.
+                if ocr_result:
+                    images_text_list.extend(item[1] for item in ocr_result)
+
+            images_text: str = "\n".join(images_text_list)
+            # The "\n" separator is always emitted, even when no image text
+            # was found, so the content of such a page ends with a newline.
+            content = page_text + "\n" + images_text
+
+            documents.append(
+                Document(
+                    name=doc_name,
+                    id=f"{doc_name}_{page_number}",
+                    meta_data={"page": page_number},
+                    content=content,
+                )
+            )
+
+        if self.chunk:
+            chunked_documents = []
+            for document in documents:
+                chunked_documents.extend(self.chunk_document(document))
+            return chunked_documents
+
+        return documents
diff --git a/pyproject.toml b/pyproject.toml
@@ -110,6 +110,7 @@ module = [
   "psycopg.*",
   "pypdf.*",
   "qdrant_client.*",
+  "rapidocr_onnxruntime.*",
   "requests.*",
   "simplejson.*",
   "serpapi.*",