From e947eaa597e517bd3720a359202d911edde0661c Mon Sep 17 00:00:00 2001
From: Zheng Ma
Date: Mon, 23 Dec 2024 11:55:40 +0100
Subject: [PATCH] add models_download.py script to download model files at
 docker-building time, to avoid model download issues at run time.

---
 Containerfile      |  8 ++++----
 models_download.py | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 4 deletions(-)
 create mode 100644 models_download.py

diff --git a/Containerfile b/Containerfile
index 61ba2ba..dbf2dc5 100644
--- a/Containerfile
+++ b/Containerfile
@@ -9,18 +9,18 @@ RUN apt-get update \
 
 RUN pip install --no-cache-dir poetry
 
-COPY pyproject.toml poetry.lock README.md /docling-serve/
+COPY pyproject.toml poetry.lock README.md models_download.py /docling-serve/
 
 RUN if [ "$CPU_ONLY" = "true" ]; then \
     poetry install --no-root --with cpu; \
     else \
     poetry install --no-root; \
-    fi
+    fi && \
+    poetry run python models_download.py
 
 ENV HF_HOME=/tmp/
 ENV TORCH_HOME=/tmp/
 
-RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
 
 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
@@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve
 
 EXPOSE 5000
 
-CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]
+CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "--log-level", "debug", "docling_serve.app:app"]
diff --git a/models_download.py b/models_download.py
new file mode 100644
index 0000000..c429be5
--- /dev/null
+++ b/models_download.py
@@ -0,0 +1,40 @@
+import os
+import zipfile
+
+import requests
+from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+
+# Download Docling models
+StandardPdfPipeline.download_models_hf(force=True)
+load_pretrained_nlp_models(verbose=True)
+
+# Download EasyOCR models
+urls = [
+    "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip",
+    "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip"
+]
+
+local_zip_paths = [
+    "/root/latin_g2.zip",
+    "/root/craft_mlt_25k.zip"
+]
+
+extract_path = "/root/.EasyOCR/model/"
+
+# Create the extract directory if it doesn't exist
+os.makedirs(extract_path, exist_ok=True)
+os.makedirs(os.path.dirname(local_zip_paths[0]), exist_ok=True)  # Create directory for zip files
+
+for url, local_zip_path in zip(urls, local_zip_paths):
+    # Download the file
+    response = requests.get(url)
+    with open(local_zip_path, "wb") as file:
+        file.write(response.content)
+
+    # Unzip the file
+    with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
+        zip_ref.extractall(extract_path)
+
+    # Clean up the zip file
+    os.remove(local_zip_path)