Skip to content

Commit

Permalink
feat: upgrade endpoint to docling v2 (#13)
Browse files Browse the repository at this point in the history
* upgrade endpoint to docling v2

Signed-off-by: Michele Dolfi <[email protected]>

* fix Containerfile

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Dec 19, 2024
1 parent 3824aa6 commit b00718b
Show file tree
Hide file tree
Showing 5 changed files with 2,657 additions and 2,015 deletions.
4 changes: 2 additions & 2 deletions Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ RUN if [ "$CPU_ONLY" = "true" ]; then \
ENV HF_HOME=/tmp/
ENV TORCH_HOME=/tmp/

RUN poetry run python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'

# On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4
Expand All @@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve

EXPOSE 5000

CMD ["poetry", "run", "uvicorn", "--port", "5000", "docling_serve.app:app"]
CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]
245 changes: 214 additions & 31 deletions docling_serve/app.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,55 @@
import base64
import hashlib
from contextlib import asynccontextmanager
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Union
from typing import Any, Dict, List, Optional, Tuple, Union

import httpx
from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
ErrorItem,
InputFormat,
)
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.profiling import ProfilingItem
from docling_core.types.doc import DoclingDocument, ImageRefMode
from docling_core.utils.file import resolve_remote_filename
from fastapi import FastAPI, HTTPException, Response
from pydantic import AnyHttpUrl, BaseModel


# TODO: import enum from Docling, once it is exposed
class OcrEngine(str, Enum):
    """Identifiers of the OCR engines a conversion request may select.

    Members are also plain strings, so each one compares equal to its
    lowercase value.
    """

    EASYOCR = "easyocr"
    TESSERACT = "tesseract"
    RAPIDOCR = "rapidocr"


class ConvertOptions(BaseModel):
    """Per-request switches controlling conversion and the returned outputs."""

    # --- Which representations the /convert response should populate ---
    output_docling_document: bool = True
    output_markdown: bool = False
    output_html: bool = False

    # --- OCR configuration ---
    # Forwarded as `do_ocr` to the PDF pipeline options.
    do_ocr: bool = True
    # Selects which engine-specific OCR options class gets built.
    ocr_engine: OcrEngine = OcrEngine.EASYOCR
    # When not None, overrides the `lang` attribute of the OCR options.
    ocr_lang: Optional[List[str]] = None
    # Forwarded as `force_full_page_ocr` to the OCR options.
    force_ocr: bool = False

    # --- Layout / image handling ---
    do_table_structure: bool = True
    # Enables page and picture image generation, and picks embedded vs.
    # placeholder image references when exporting.
    include_images: bool = True
    images_scale: float = 2.0


from docling_serve.settings import Settings
class DocumentConvertBase(BaseModel):
    """Shared base of the conversion request bodies; carries the options."""

    # An all-defaults ConvertOptions is used when the request omits `options`.
    options: ConvertOptions = ConvertOptions()


class HttpSource(BaseModel):
Expand All @@ -28,37 +62,124 @@ class FileSource(BaseModel):
filename: str


class ConvertDocumentHttpSourceRequest(BaseModel):
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
    """Conversion request whose document is described by an `HttpSource`."""

    http_source: HttpSource


class ConvertDocumentFileSourceRequest(BaseModel):
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
    """Conversion request whose document is supplied as a `FileSource`."""

    file_source: FileSource


class DocumentResponse(BaseModel):
    """Converted document, in whichever output representations were filled in.

    Every field defaults to None, so a representation that was not produced
    is simply left unset.
    """

    markdown: Optional[str] = None
    docling_document: Optional[DoclingDocument] = None
    html: Optional[str] = None


class ConvertDocumentResponse(BaseModel):
content_md: str
document: DocumentResponse
status: ConversionStatus
errors: List[ErrorItem] = []
timings: Dict[str, ProfilingItem] = {}


class ConvertDocumentErrorResponse(BaseModel):
    """Error payload that currently carries only the conversion status."""

    status: ConversionStatus
    # An error list is sketched here but not enabled:
    # errors: List[ErrorItem] = []


ConvertDocumentRequest = Union[
ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
]


models = {}
class MarkdownTextResponse(Response):
    """`Response` subclass that declares the `text/markdown` media type."""

    media_type = "text/markdown"


class HealthCheckResponse(BaseModel):
    """Body returned by the health endpoint; `status` defaults to "ok"."""

    status: str = "ok"


def _build_ocr_options(options: ConvertOptions) -> OcrOptions:
    """Return the OCR options for the requested engine.

    Before building the options, the engine's Python backend is probed by
    importing it, so that an unavailable engine is reported to the client
    instead of failing later.

    Raises:
        HTTPException: status 400 when the backend of the requested engine
            cannot be imported.
        RuntimeError: when `options.ocr_engine` is not a known engine.
    """
    # Function-scope import keeps this block independent of the file header.
    import importlib

    # (engine, module to probe, attribute that module must expose, options class)
    engine_specs = (
        (OcrEngine.EASYOCR, "easyocr", None, EasyOcrOptions),
        (OcrEngine.TESSERACT, "tesserocr", None, TesseractOcrOptions),
        (OcrEngine.RAPIDOCR, "rapidocr_onnxruntime", "RapidOCR", RapidOcrOptions),
    )

    # Equality (not hashing) is used for the lookup, matching an if/elif chain.
    spec = next(
        (entry for entry in engine_specs if options.ocr_engine == entry[0]),
        None,
    )
    if spec is None:
        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")
    _, module_name, required_attr, options_cls = spec

    try:
        backend = importlib.import_module(module_name)
        if required_attr is not None:
            getattr(backend, required_attr)
    except (ImportError, AttributeError) as err:
        # Chain the original failure so server logs show why the import failed.
        raise HTTPException(
            status_code=400,
            detail="The requested OCR engine"
            f" (ocr_engine={options.ocr_engine.value})"
            " is not available on this system. Please choose another OCR engine "
            "or contact your system administrator.",
        ) from err

    return options_cls(force_full_page_ocr=options.force_ocr)


def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
    """Translate request options into PDF pipeline options plus a digest.

    Args:
        options: The conversion options received with the request.

    Returns:
        A tuple `(pipeline_options, options_hash)` where `options_hash` is the
        hexadecimal SHA-1 digest of the JSON dump of `pipeline_options`.
        Requests that resolve to the same pipeline options therefore get the
        same digest. It identifies a configuration and is not a security
        measure.

    Raises:
        HTTPException: status 400 when the requested OCR engine is not
            installed.
        RuntimeError: when the OCR engine value is not recognised.
    """
    ocr_options = _build_ocr_options(options)

    # Only override the engine's own language default when explicitly asked.
    if options.ocr_lang is not None:
        ocr_options.lang = options.ocr_lang

    pipeline_options = PdfPipelineOptions(
        do_ocr=options.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=options.do_table_structure,
        generate_page_images=options.include_images,
        generate_picture_images=options.include_images,
        images_scale=options.images_scale,
    )

    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()

    return pipeline_options, options_hash


converters: Dict[str, DocumentConverter] = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
# Converter
settings = Settings()
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = settings.do_ocr
pipeline_options.do_table_structure = settings.do_table_structure
models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
# settings = Settings()

# Converter with default options
pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
converters[options_hash] = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
}
)

converters[options_hash].initialize_pipeline(InputFormat.PDF)

yield

models.clear()
converters.clear()


app = FastAPI(
Expand All @@ -67,10 +188,14 @@ async def lifespan(app: FastAPI):
)


@app.post("/convert")
def convert_pdf_document(
@app.get("/health")
def health() -> HealthCheckResponse:
    """Answer health probes with a default `HealthCheckResponse`."""
    payload = HealthCheckResponse()
    return payload


def _convert_document(
body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
) -> ConversionResult:

filename: str
buf: BytesIO
Expand All @@ -81,16 +206,74 @@ def convert_pdf_document(
elif isinstance(body, ConvertDocumentHttpSourceRequest):
http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
buf = BytesIO(http_res.content)
filename = Path(
body.http_source.url
).name # TODO: use better way to detect filename, e.g. from Content-Disposition
filename = resolve_remote_filename(
http_url=AnyHttpUrl(body.http_source.url),
response_headers=dict(**http_res.headers),
)

doc_input = DocumentStream(name=filename, stream=buf)

pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
if options_hash not in converters:
converters[options_hash] = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
}
)

result: ConversionResult = converters[options_hash].convert(doc_input)

if result is None or result.status == ConversionStatus.SKIPPED:
raise HTTPException(status_code=400, detail=result.errors)

if result is None or result.status not in {
ConversionStatus.SUCCESS,
}:
raise HTTPException(
status_code=500, detail={"errors": result.errors, "status": result.status}
)

return result

docs_input = DocumentConversionInput.from_streams(
[DocumentStream(filename=filename, stream=buf)]

@app.post("/convert")
def convert_document(
    body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
    """Convert the submitted document and return the requested representations.

    The conversion itself is delegated to `_convert_document`. This handler
    only decides which output formats to fill in, based on the `output_*`
    flags of the request options.
    """
    conversion = _convert_document(body=body)
    requested = body.options

    # Images are embedded in the exports only when the caller asked for them;
    # otherwise a placeholder reference is emitted.
    if requested.include_images:
        image_mode = ImageRefMode.EMBEDDED
    else:
        image_mode = ImageRefMode.PLACEHOLDER

    payload = DocumentResponse()
    if requested.output_docling_document:
        payload.docling_document = conversion.document
    if requested.output_markdown:
        payload.markdown = conversion.document.export_to_markdown(
            image_mode=image_mode
        )
    if requested.output_html:
        payload.html = conversion.document.export_to_html(image_mode=image_mode)

    return ConvertDocumentResponse(
        document=payload,
        status=conversion.status,
        timings=conversion.timings,
    )
result: ConversionResult = next(models["converter"].convert(docs_input), None)

if result is None or result.status != ConversionStatus.SUCCESS:
raise HTTPException(status_code=500, detail={"errors": result.errors})

return ConvertDocumentResponse(content_md=result.render_as_markdown())
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
def convert_document_md(
    body: ConvertDocumentRequest,
) -> MarkdownTextResponse:
    """Convert the submitted document and return it as a Markdown response."""
    conversion = _convert_document(body=body)

    # Embed images only when the request options ask for them.
    if body.options.include_images:
        image_mode = ImageRefMode.EMBEDDED
    else:
        image_mode = ImageRefMode.PLACEHOLDER

    markdown_text = conversion.document.export_to_markdown(image_mode=image_mode)
    return MarkdownTextResponse(markdown_text)
2 changes: 0 additions & 2 deletions docling_serve/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,5 @@


class Settings(BaseSettings):
do_ocr: bool = True
do_table_structure: bool = True

model_config = SettingsConfigDict(env_prefix="DOCLING_")
Loading

0 comments on commit b00718b

Please sign in to comment.