feat: upgrade endpoint to docling v2 (#13)

* upgrade endpoint to docling v2

Signed-off-by: Michele Dolfi <[email protected]>

* fix Containerfile

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
DS4SD · Dec 19, 2024 · b00718b · b00718b
1 parent 3824aa6
commit b00718b
Show file tree

Hide file tree

Showing 5 changed files with 2,657 additions and 2,015 deletions.
diff --git a/Containerfile b/Containerfile
@@ -20,7 +20,7 @@ RUN if [ "$CPU_ONLY" = "true" ]; then \
 ENV HF_HOME=/tmp/
 ENV TORCH_HOME=/tmp/
 
-RUN poetry run python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
+RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
 
 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
@@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve
 
 EXPOSE 5000
 
-CMD ["poetry", "run", "uvicorn", "--port", "5000", "docling_serve.app:app"]
+CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]
diff --git a/docling_serve/app.py b/docling_serve/app.py
@@ -1,21 +1,55 @@
 import base64
+import hashlib
 from contextlib import asynccontextmanager
+from enum import Enum
 from io import BytesIO
-from pathlib import Path
-from typing import Any, Dict, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import httpx
 from docling.datamodel.base_models import (
     ConversionStatus,
     DocumentStream,
-    PipelineOptions,
+    ErrorItem,
+    InputFormat,
 )
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrOptions,
+    PdfPipelineOptions,
+    RapidOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.utils.profiling import ProfilingItem
+from docling_core.types.doc import DoclingDocument, ImageRefMode
+from docling_core.utils.file import resolve_remote_filename
+from fastapi import FastAPI, HTTPException, Response
+from pydantic import AnyHttpUrl, BaseModel
+
+
+# TODO: import enum from Docling, once it is exposed
+class OcrEngine(str, Enum):
+    EASYOCR = "easyocr"
+    TESSERACT = "tesseract"
+    RAPIDOCR = "rapidocr"
+
+
+class ConvertOptions(BaseModel):
+    output_docling_document: bool = True
+    output_markdown: bool = False
+    output_html: bool = False
+    do_ocr: bool = True
+    ocr_engine: OcrEngine = OcrEngine.EASYOCR
+    ocr_lang: Optional[List[str]] = None
+    force_ocr: bool = False
+    do_table_structure: bool = True
+    include_images: bool = True
+    images_scale: float = 2.0
+
 
-from docling_serve.settings import Settings
+class DocumentConvertBase(BaseModel):
+    options: ConvertOptions = ConvertOptions()
 
 
 class HttpSource(BaseModel):
@@ -28,37 +62,124 @@ class FileSource(BaseModel):
     filename: str
 
 
-class ConvertDocumentHttpSourceRequest(BaseModel):
+class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
     http_source: HttpSource
 
 
-class ConvertDocumentFileSourceRequest(BaseModel):
+class ConvertDocumentFileSourceRequest(DocumentConvertBase):
     file_source: FileSource
 
 
+class DocumentResponse(BaseModel):
+    markdown: Optional[str] = None
+    docling_document: Optional[DoclingDocument] = None
+    html: Optional[str] = None
+
+
 class ConvertDocumentResponse(BaseModel):
-    content_md: str
+    document: DocumentResponse
+    status: ConversionStatus
+    errors: List[ErrorItem] = []
+    timings: Dict[str, ProfilingItem] = {}
+
+
+class ConvertDocumentErrorResponse(BaseModel):
+    status: ConversionStatus
+    # errors: List[ErrorItem] = []
 
 
 ConvertDocumentRequest = Union[
     ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
 ]
 
 
-models = {}
+class MarkdownTextResponse(Response):
+    media_type = "text/markdown"
+
+
+class HealthCheckResponse(BaseModel):
+    status: str = "ok"
+
+
+def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
+
+    if options.ocr_engine == OcrEngine.EASYOCR:
+        try:
+            import easyocr  # noqa: F401
+        except ImportError:
+            raise HTTPException(
+                status_code=400,
+                detail="The requested OCR engine"
+                f" (ocr_engine={options.ocr_engine.value})"
+                " is not available on this system. Please choose another OCR engine "
+                "or contact your system administrator.",
+            )
+        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
+    elif options.ocr_engine == OcrEngine.TESSERACT:
+        try:
+            import tesserocr  # noqa: F401
+        except ImportError:
+            raise HTTPException(
+                status_code=400,
+                detail="The requested OCR engine"
+                f" (ocr_engine={options.ocr_engine.value})"
+                " is not available on this system. Please choose another OCR engine "
+                "or contact your system administrator.",
+            )
+        ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
+    elif options.ocr_engine == OcrEngine.RAPIDOCR:
+        try:
+            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
+        except ImportError:
+            raise HTTPException(
+                status_code=400,
+                detail="The requested OCR engine"
+                f" (ocr_engine={options.ocr_engine.value})"
+                " is not available on this system. Please choose another OCR engine "
+                "or contact your system administrator.",
+            )
+        ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
+    else:
+        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")
+
+    if options.ocr_lang is not None:
+        ocr_options.lang = options.ocr_lang
+
+    pipeline_options = PdfPipelineOptions(
+        do_ocr=options.do_ocr,
+        ocr_options=ocr_options,
+        do_table_structure=options.do_table_structure,
+        generate_page_images=options.include_images,
+        generate_picture_images=options.include_images,
+        images_scale=options.images_scale,
+    )
+
+    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()
+
+    return pipeline_options, options_hash
+
+
+converters: Dict[str, DocumentConverter] = {}
 
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # Converter
-    settings = Settings()
-    pipeline_options = PipelineOptions()
-    pipeline_options.do_ocr = settings.do_ocr
-    pipeline_options.do_table_structure = settings.do_table_structure
-    models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
+    # settings = Settings()
+
+    # Converter with default options
+    pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
+    converters[options_hash] = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+            InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+        }
+    )
+
+    converters[options_hash].initialize_pipeline(InputFormat.PDF)
+
     yield
 
-    models.clear()
+    converters.clear()
 
 
 app = FastAPI(
@@ -67,10 +188,14 @@ async def lifespan(app: FastAPI):
 )
 
 
-@app.post("/convert")
-def convert_pdf_document(
+@app.get("/health")
+def health() -> HealthCheckResponse:
+    return HealthCheckResponse()
+
+
+def _convert_document(
     body: ConvertDocumentRequest,
-) -> ConvertDocumentResponse:
+) -> ConversionResult:
 
     filename: str
     buf: BytesIO
@@ -81,16 +206,74 @@ def convert_pdf_document(
     elif isinstance(body, ConvertDocumentHttpSourceRequest):
         http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
         buf = BytesIO(http_res.content)
-        filename = Path(
-            body.http_source.url
-        ).name  # TODO: use better way to detect filename, e.g. from Content-Disposition
+        filename = resolve_remote_filename(
+            http_url=AnyHttpUrl(body.http_source.url),
+            response_headers=dict(**http_res.headers),
+        )
+
+    doc_input = DocumentStream(name=filename, stream=buf)
+
+    pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
+    if options_hash not in converters:
+        converters[options_hash] = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+                InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+            }
+        )
+
+    result: ConversionResult = converters[options_hash].convert(doc_input)
+
+    if result is None or result.status == ConversionStatus.SKIPPED:
+        raise HTTPException(status_code=400, detail=result.errors)
+
+    if result is None or result.status not in {
+        ConversionStatus.SUCCESS,
+    }:
+        raise HTTPException(
+            status_code=500, detail={"errors": result.errors, "status": result.status}
+        )
+
+    return result
 
-    docs_input = DocumentConversionInput.from_streams(
-        [DocumentStream(filename=filename, stream=buf)]
+
+@app.post(
+    "/convert",
+)
+def convert_document(
+    body: ConvertDocumentRequest,
+) -> ConvertDocumentResponse:
+
+    result = _convert_document(body=body)
+
+    image_mode = (
+        ImageRefMode.EMBEDDED
+        if body.options.include_images
+        else ImageRefMode.PLACEHOLDER
+    )
+    doc_resp = DocumentResponse()
+    if body.options.output_docling_document:
+        doc_resp.docling_document = result.document
+    if body.options.output_markdown:
+        doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
+    if body.options.output_html:
+        doc_resp.html = result.document.export_to_html(image_mode=image_mode)
+
+    return ConvertDocumentResponse(
+        document=doc_resp, status=result.status, timings=result.timings
     )
-    result: ConversionResult = next(models["converter"].convert(docs_input), None)
 
-    if result is None or result.status != ConversionStatus.SUCCESS:
-        raise HTTPException(status_code=500, detail={"errors": result.errors})
 
-    return ConvertDocumentResponse(content_md=result.render_as_markdown())
+@app.post("/convert/markdown", response_class=MarkdownTextResponse)
+def convert_document_md(
+    body: ConvertDocumentRequest,
+) -> MarkdownTextResponse:
+    result = _convert_document(body=body)
+    image_mode = (
+        ImageRefMode.EMBEDDED
+        if body.options.include_images
+        else ImageRefMode.PLACEHOLDER
+    )
+    return MarkdownTextResponse(
+        result.document.export_to_markdown(image_mode=image_mode)
+    )
diff --git a/docling_serve/settings.py b/docling_serve/settings.py
@@ -2,7 +2,5 @@
 
 
 class Settings(BaseSettings):
-    do_ocr: bool = True
-    do_table_structure: bool = True
 
     model_config = SettingsConfigDict(env_prefix="DOCLING_")