DS4SD · guimou · Dec 10, 2024
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,40 @@
+# Python bytecode caches. Unlike .gitignore, patterns here are anchored at the
+# build-context root, so a leading "**/" is needed to match at any depth.
+**/__pycache__/
+**/*.pyc
+**/*.pyo
+**/*.pyd
+
+# Ignore virtual environments
+env/
+venv/
+
+# Ignore development artifacts
+**/*.log
+**/*.db
+**/*.sqlite3
+
+# Ignore configuration and sensitive files
+**/.env
+**/*.env
+*.ini
+*.cfg
+
+# Ignore IDE and editor settings
+.vscode/
+.idea/
+**/*.swp
+**/*.swo
+
+# Ignore Git files
+.git/
+.gitignore
+
+# Ignore Docker files themselves (optional if not needed in the image)
+.dockerignore
+Dockerfile*
+
+# Ignore build artifacts (if applicable)
+build/
+dist/
+**/*.egg-info
diff --git a/Containerfile b/Containerfile
@@ -1,32 +1,59 @@
-FROM python:3.11-slim-bookworm
+ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s
+
+FROM ${BASE_IMAGE}
 
 ARG CPU_ONLY=false
-WORKDIR /docling-serve
 
-RUN apt-get update \
-    && apt-get install -y libgl1 libglib2.0-0 curl wget git \
-    && apt-get clean
+USER 0
 
-RUN pip install --no-cache-dir poetry
+###################################################################################################
+# OS Layer                                                                                        #
+###################################################################################################
 
-COPY pyproject.toml poetry.lock README.md /docling-serve/
+RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
+    dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
+    dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
+    dnf config-manager --enable crb && \
+    dnf -y update && \
+    dnf install -y $(cat /tmp/os-packages.txt) && \
+    dnf -y clean all && \
+    rm -rf /var/cache/dnf
 
-RUN if [ "$CPU_ONLY" = "true" ]; then \
-    poetry install --no-root --with cpu; \
-    else \
-        poetry install --no-root; \
-    fi
+ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
 
-ENV HF_HOME=/tmp/
-ENV TORCH_HOME=/tmp/
+###################################################################################################
+# Docling layer                                                                                   #
+###################################################################################################
 
-RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
+USER 1001
+
+WORKDIR /opt/app-root/src
 
 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
 
-COPY ./docling_serve /docling-serve/docling_serve
+ENV LANG=en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+ENV PYTHONIOENCODING=utf-8
+
+COPY --chown=1001:0 pyproject.toml poetry.lock models_download.py README.md ./
+# Install the locked dependencies (CPU_ONLY=true adds the "cpu" group), run models_download.py, then give the group the owner's permissions.
+RUN pip install --no-cache-dir poetry && \
+    # We are already in a virtual environment, so there is no need to create a new one; we only activate it.
+    poetry config virtualenvs.create false && \
+    source /opt/app-root/bin/activate && \
+    if [ "$CPU_ONLY" = "true" ]; then \
+        poetry install --no-root --no-cache --no-interaction --all-extras --with cpu --without dev; \
+    else \
+        poetry install --no-root --no-cache --no-interaction --all-extras --without dev; \
+    fi && \
+    echo "Downloading models..." && \
+    python models_download.py && \
+    chown -R 1001:0 /opt/app-root/src && \
+    chmod -R g=u /opt/app-root/src
+# No --chmod here: an octal mode is applied to copied directories as well as files, and 664 would strip their search (x) bit.
+COPY --chown=1001:0 ./docling_serve ./docling_serve
 
-EXPOSE 5000
+EXPOSE 8080
 
-CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]
+CMD ["python", "docling_serve/app.py"]
diff --git a/Makefile b/Makefile
@@ -25,14 +25,14 @@ md-lint-file:
 	$(CMD_PREFIX) touch .markdown-lint
 
 .PHONY: docling-serve-cpu-image
-docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" continaer image
+docling-serve-cpu-image: Containerfile ## Build docling-serve "cpu only" container image
 	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve CPU ONLY]"
 	$(CMD_PREFIX) docker build --build-arg CPU_ONLY=true -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve-cpu:$(TAG) .
 	$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) ghcr.io/ds4sd/docling-serve-cpu:main
 	$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve-cpu:$(TAG) quay.io/ds4sd/docling-serve-cpu:main
 
 .PHONY: docling-serve-gpu-image
-docling-serve-gpu-image: Containerfile ## Build docling-serve continaer image with GPU support
+docling-serve-gpu-image: Containerfile ## Build docling-serve container image with GPU support
 	$(ECHO_PREFIX) printf "  %-12s Containerfile\n" "[docling-serve with GPU]"
 	$(CMD_PREFIX) docker build --build-arg CPU_ONLY=false -f Containerfile --platform linux/amd64 -t ghcr.io/ds4sd/docling-serve:$(TAG) .
 	$(CMD_PREFIX) docker tag ghcr.io/ds4sd/docling-serve:$(TAG) ghcr.io/ds4sd/docling-serve:main