Skip to content

Commit

Permalink
Add Docling PDF Support (#50)
Browse files Browse the repository at this point in the history
* add docling as a dependency

* "now we're thinking with docling logging"

* "update logger"

* output to cdsw home (won't work locally) and don't fail indexing

* note for next monday

* don't log the whole output process object

* run docling help

* -v

* write to txt file

* -vv

* use docling if it works

* save output to a docling-specifically named log file

* gate docling processing on an env var. refactor nop to simple_file for clarity

* add docling option to the env & a todo about page numbers with docling

* add the docling log file to the gitignore

* fix mypy issues

* fix a todo

---------

Co-authored-by: jwatson <[email protected]>
Co-authored-by: Michael Liu <[email protected]>
  • Loading branch information
3 people authored Dec 3, 2024
1 parent 03f0b93 commit 4f9178e
Show file tree
Hide file tree
Showing 8 changed files with 897 additions and 52 deletions.
3 changes: 3 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,6 @@ CAII_EMBEDDING_ENDPOINT_NAME=

# set this to true if you have uv installed on your system; otherwise don't include this
USE_SYSTEM_UV=true

# set this to true to enable enhanced pdf processing with docling
USE_ENHANCED_PDF_PROCESSING=false
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
!.idea/google-java-format.xml
chat_store.json
databases/
**/docling-output.txt
4 changes: 4 additions & 0 deletions .project-metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ environment_variables:
default: ""
description: "AWS Secret Access Key"
required: true
USE_ENHANCED_PDF_PROCESSING:
default: "false"
description: "Use enhanced PDF processing for better text extraction. This option makes PDF parsing take significantly longer. A GPU is highly recommended to speed up the process."
required: false
CAII_DOMAIN:
default: ""
description: "The domain of the CAII service. Setting this will enable CAII as the sole source for both inference and embedding models."
Expand Down
6 changes: 3 additions & 3 deletions llm-service/app/ai/indexing/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@
from .readers.csv import CSVReader
from .readers.docx import DocxReader
from .readers.json import JSONReader
from .readers.nop import NopReader
from .readers.simple_file import SimpleFileReader
from .readers.pdf import PDFReader

logger = logging.getLogger(__name__)

READERS: Dict[str, Type[BaseReader]] = {
".pdf": PDFReader,
".txt": NopReader,
".md": NopReader,
".txt": SimpleFileReader,
".md": SimpleFileReader,
".docx": DocxReader,
".csv": CSVReader,
".json": JSONReader,
Expand Down
39 changes: 36 additions & 3 deletions llm-service/app/ai/indexing/readers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,34 @@
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
# DATA.
#

import logging
import os
import subprocess
from pathlib import Path
from typing import Any, List
from subprocess import CompletedProcess
from typing import Any

from llama_index.core.schema import Document, TextNode
from llama_index.readers.file import PDFReader as LlamaIndexPDFReader

from .base_reader import BaseReader
from .simple_file import SimpleFileReader

logger = logging.getLogger(__name__)


class PDFReader(BaseReader):
def __init__(self, *args: Any, **kwargs: Any) -> None:
    """Set up the standard llama-index PDF parser plus a markdown fallback reader.

    Extra positional/keyword arguments are forwarded unchanged to the
    BaseReader and to the SimpleFileReader used for docling output.
    """
    super().__init__(*args, **kwargs)
    # return_full_document=False keeps one Document per page, preserving
    # page boundaries for downstream page-label metadata.
    self.inner = LlamaIndexPDFReader(return_full_document=False)
    # Chunks the markdown emitted by the docling CLI (see process_with_docling).
    self.markdown_reader = SimpleFileReader(*args, **kwargs)

def load_chunks(self, file_path: Path) -> list[TextNode]:
logger.debug(f"{file_path=}")
chunks: list[TextNode] = self.process_with_docling(file_path)
if chunks:
return chunks

def load_chunks(self, file_path: Path) -> List[TextNode]:
pages = self.inner.load_data(file_path)

page_labels = [page.metadata["page_label"] for page in pages]
Expand Down Expand Up @@ -88,3 +100,24 @@ def find_label(start_index: int) -> str:
chunk.metadata["page_label"] = chunk_label

return chunks


def process_with_docling(self, file_path: Path) -> list[TextNode] | None:
    """Attempt enhanced PDF extraction by shelling out to the docling CLI.

    Docling converts the PDF into a sibling markdown file, which is then
    chunked with the markdown reader. Returns the chunks on success, or
    ``None`` when docling is disabled (``USE_ENHANCED_PDF_PROCESSING`` env
    var), fails, or produces no markdown — callers fall back to plain
    llama-index PDF parsing in that case.
    """
    if os.getenv("USE_ENHANCED_PDF_PROCESSING", "false").lower() != "true":
        return None

    directory = file_path.parent
    logger.debug(f"{directory=}")

    command = ["docling", "-v", "--abort-on-error", f"--output={directory}", str(file_path)]
    # Docling is verbose; capture its stdout/stderr in a dedicated log file
    # (gitignored) instead of polluting the service logs.
    with open("docling-output.txt", "a") as log_file:
        result: CompletedProcess[bytes] = subprocess.run(command, stdout=log_file, stderr=log_file)
    logger.debug(f"docling return code = {result.returncode}")

    # todo: figure out page numbers & look into the docling llama-index integration
    markdown_file_path = file_path.with_suffix(".md")
    if result.returncode != 0 or not markdown_file_path.exists():
        # Conversion failed or produced nothing usable; signal fallback.
        return None

    chunks = self.markdown_reader.load_chunks(markdown_file_path)
    # update chunk metadata to point at the original pdf
    for chunk in chunks:
        chunk.metadata["file_name"] = file_path.name
    return chunks
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from .base_reader import BaseReader


class NopReader(BaseReader):
class SimpleFileReader(BaseReader):
def load_chunks(self, file_path: Path) -> List[TextNode]:
with open(file_path, "r") as f:
document = Document(text=f.read())
Expand Down
3 changes: 2 additions & 1 deletion llm-service/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ dependencies = [
"llama-index-vector-stores-qdrant==0.2.17",
"docx2txt>=0.8",
"pandas>=2.2.3",
"fastapi-utils>=0.8.0"
"fastapi-utils>=0.8.0",
"docling>=2.7.0",
]
requires-python = "==3.10.*"
readme = "README.md"
Expand Down
891 changes: 847 additions & 44 deletions llm-service/uv.lock

Large diffs are not rendered by default.

0 comments on commit 4f9178e

Please sign in to comment.