Skip to content

Commit

Permalink
Add Docling PDF Support (#50)
Browse files Browse the repository at this point in the history
* add docling as a dependency

* "now we're thinking with docling logging"

* "update logger"

* output to cdsw home (won't work locally) and don't fail indexing

* note for next monday

* don't log the whole output process object

* run docling help

* -v

* write to txt file

* -vv

* use docling if it works

* save output to a docling-specifically named log file

* gate docling processing on an env var. refactor nop to simple_file for clarity

* add docling option to the env & a todo about page numbers with docling

* add the docling log file to the gitignore

* fix mypy issues

* fix a todo

---------

Co-authored-by: jwatson <[email protected]>
Co-authored-by: Michael Liu <[email protected]>
  • Loading branch information
3 people authored Dec 3, 2024
1 parent 03f0b93 commit 4f9178e
Show file tree
Hide file tree
Showing 8 changed files with 897 additions and 52 deletions.
3 changes: 3 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,6 @@ CAII_EMBEDDING_ENDPOINT_NAME=

# set this to true if you have uv installed on your system; otherwise don't include this
USE_SYSTEM_UV=true

# set this to true to enable enhanced pdf processing with docling
USE_ENHANCED_PDF_PROCESSING=false
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
!.idea/google-java-format.xml
chat_store.json
databases/
**/docling-output.txt
4 changes: 4 additions & 0 deletions .project-metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ environment_variables:
default: ""
description: "AWS Secret Access Key"
required: true
USE_ENHANCED_PDF_PROCESSING:
default: "false"
description: "Use enhanced PDF processing for better text extraction. This option makes PDF parsing take significantly longer. A GPU is highly recommended to speed up the process."
required: false
CAII_DOMAIN:
default: ""
description: "The domain of the CAII service. Setting this will enable CAII as the sole source for both inference and embedding models."
Expand Down
6 changes: 3 additions & 3 deletions llm-service/app/ai/indexing/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@
from .readers.csv import CSVReader
from .readers.docx import DocxReader
from .readers.json import JSONReader
from .readers.nop import NopReader
from .readers.simple_file import SimpleFileReader
from .readers.pdf import PDFReader

logger = logging.getLogger(__name__)

READERS: Dict[str, Type[BaseReader]] = {
".pdf": PDFReader,
".txt": NopReader,
".md": NopReader,
".txt": SimpleFileReader,
".md": SimpleFileReader,
".docx": DocxReader,
".csv": CSVReader,
".json": JSONReader,
Expand Down
39 changes: 36 additions & 3 deletions llm-service/app/ai/indexing/readers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,34 @@
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
# DATA.
#

import logging
import os
import subprocess
from pathlib import Path
from typing import Any, List
from subprocess import CompletedProcess
from typing import Any

from llama_index.core.schema import Document, TextNode
from llama_index.readers.file import PDFReader as LlamaIndexPDFReader

from .base_reader import BaseReader
from .simple_file import SimpleFileReader

logger = logging.getLogger(__name__)


class PDFReader(BaseReader):
def __init__(self, *args: Any, **kwargs: Any) -> None:
    """Set up the standard llama-index PDF parser plus a markdown fallback reader.

    Extra positional/keyword arguments are forwarded unchanged to the
    BaseReader and to the SimpleFileReader used for docling output.
    """
    super().__init__(*args, **kwargs)
    # return_full_document=False keeps one Document per page, preserving
    # page boundaries for downstream page-label metadata.
    self.inner = LlamaIndexPDFReader(return_full_document=False)
    # Chunks the markdown emitted by the docling CLI (see process_with_docling).
    self.markdown_reader = SimpleFileReader(*args, **kwargs)

def load_chunks(self, file_path: Path) -> list[TextNode]:
logger.debug(f"{file_path=}")
chunks: list[TextNode] = self.process_with_docling(file_path)
if chunks:
return chunks

def load_chunks(self, file_path: Path) -> List[TextNode]:
pages = self.inner.load_data(file_path)

page_labels = [page.metadata["page_label"] for page in pages]
Expand Down Expand Up @@ -88,3 +100,24 @@ def find_label(start_index: int) -> str:
chunk.metadata["page_label"] = chunk_label

return chunks


def process_with_docling(self, file_path: Path) -> list[TextNode] | None:
    """Attempt enhanced PDF extraction by shelling out to the docling CLI.

    Docling converts the PDF into a sibling markdown file, which is then
    chunked with the markdown reader. Returns the chunks on success, or
    ``None`` when docling is disabled (``USE_ENHANCED_PDF_PROCESSING`` env
    var), fails, or produces no markdown — callers fall back to plain
    llama-index PDF parsing in that case.
    """
    if os.getenv("USE_ENHANCED_PDF_PROCESSING", "false").lower() != "true":
        return None

    directory = file_path.parent
    logger.debug(f"{directory=}")

    command = ["docling", "-v", "--abort-on-error", f"--output={directory}", str(file_path)]
    # Docling is verbose; capture its stdout/stderr in a dedicated log file
    # (gitignored) instead of polluting the service logs.
    with open("docling-output.txt", "a") as log_file:
        result: CompletedProcess[bytes] = subprocess.run(command, stdout=log_file, stderr=log_file)
    logger.debug(f"docling return code = {result.returncode}")

    # todo: figure out page numbers & look into the docling llama-index integration
    markdown_file_path = file_path.with_suffix(".md")
    if result.returncode != 0 or not markdown_file_path.exists():
        # Conversion failed or produced nothing usable; signal fallback.
        return None

    chunks = self.markdown_reader.load_chunks(markdown_file_path)
    # update chunk metadata to point at the original pdf
    for chunk in chunks:
        chunk.metadata["file_name"] = file_path.name
    return chunks
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from .base_reader import BaseReader


class NopReader(BaseReader):
class SimpleFileReader(BaseReader):
def load_chunks(self, file_path: Path) -> List[TextNode]:
with open(file_path, "r") as f:
document = Document(text=f.read())
Expand Down
3 changes: 2 additions & 1 deletion llm-service/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ dependencies = [
"llama-index-vector-stores-qdrant==0.2.17",
"docx2txt>=0.8",
"pandas>=2.2.3",
"fastapi-utils>=0.8.0"
"fastapi-utils>=0.8.0",
"docling>=2.7.0",
]
requires-python = "==3.10.*"
readme = "README.md"
Expand Down
891 changes: 847 additions & 44 deletions llm-service/uv.lock

Large diffs are not rendered by default.

0 comments on commit 4f9178e

Please sign in to comment.