From e947eaa597e517bd3720a359202d911edde0661c Mon Sep 17 00:00:00 2001
From: Zheng Ma
Date: Mon, 23 Dec 2024 11:55:40 +0100
Subject: [PATCH] add models_download.py script to download model files at
 docker-building time, to avoid model download issues at run time.

---
 Containerfile      |  8 ++++----
 models_download.py | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 4 deletions(-)
 create mode 100644 models_download.py

diff --git a/Containerfile b/Containerfile
index 61ba2ba..dbf2dc5 100644
--- a/Containerfile
+++ b/Containerfile
@@ -9,18 +9,18 @@ RUN apt-get update \
 
 RUN pip install --no-cache-dir poetry
 
-COPY pyproject.toml poetry.lock README.md /docling-serve/
+COPY pyproject.toml poetry.lock README.md models_download.py /docling-serve/
 
 RUN if [ "$CPU_ONLY" = "true" ]; then \
     poetry install --no-root --with cpu; \
     else \
     poetry install --no-root; \
-    fi
+    fi && \
+    poetry run python models_download.py
 
 ENV HF_HOME=/tmp/
 ENV TORCH_HOME=/tmp/
 
-RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
 
 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
@@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve
 
 EXPOSE 5000
 
-CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]
+CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "--log-level", "debug", "docling_serve.app:app"]
diff --git a/models_download.py b/models_download.py
new file mode 100644
index 0000000..c429be5
--- /dev/null
+++ b/models_download.py
@@ -0,0 +1,40 @@
+import os
+import zipfile
+
+import requests
+from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+
+# Download Docling models
+StandardPdfPipeline.download_models_hf(force=True)
+load_pretrained_nlp_models(verbose=True)
+
+# Download EasyOCR models
+urls = [
+    "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip",
+    "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip"
+]
+
+local_zip_paths = [
+    "/root/latin_g2.zip",
+    "/root/craft_mlt_25k.zip"
+]
+
+extract_path = "/root/.EasyOCR/model/"
+
+# Create the extract directory if it doesn't exist
+os.makedirs(extract_path, exist_ok=True)
+os.makedirs(os.path.dirname(local_zip_paths[0]), exist_ok=True)  # Create directory for zip files
+
+for url, local_zip_path in zip(urls, local_zip_paths):
+    # Download the file
+    response = requests.get(url)
+    with open(local_zip_path, "wb") as file:
+        file.write(response.content)
+
+    # Unzip the file
+    with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
+        zip_ref.extractall(extract_path)
+
+    # Clean up the zip file
+    os.remove(local_zip_path)