diff --git a/model-server-llama-cpp/Dockerfile b/model-server-llama-cpp/Dockerfile
index 03a3d88..35b31fb 100644
--- a/model-server-llama-cpp/Dockerfile
+++ b/model-server-llama-cpp/Dockerfile
@@ -1,32 +1,49 @@
-ARG BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04
-FROM ${BASE_IMAGE}
-
-ENV MODEL_DIR="/content/saved-model"
-ENV HOST=0.0.0.0
-ENV PORT=8080
-
-WORKDIR /content
+ARG BASE_IMAGE=substratusai/base:latest
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 as build
 
 RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/apt \
     apt-get update && \
     apt-get -y --no-install-recommends install \
-    python3 python3-pip git build-essential gcc wget \
+    python3 python3-pip python3-venv git build-essential gcc wget \
     ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev && \
     mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
     rm -rf /var/lib/apt/lists/*
-
-RUN ln -s /usr/bin/python3 /usr/bin/python
+# Ensures that the python and pip executables used
+# in the image will be those from our virtualenv.
+RUN python3 -m venv /venv
+ENV PATH="/venv/bin:$PATH"
 
 # setting build related env vars
 ENV CUDA_DOCKER_ARCH=all
 ENV LLAMA_CUBLAS=1
-ENV USE_MLOCK=0
 
 # Install depencencies
-RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN /venv/bin/python3 -m pip install --upgrade pip wheel pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git && \
+    cd llama-cpp-python && make build.cuda
 
 # Install llama-cpp-python (build with cuda)
-RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+# RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 /venv/bin/python3 -m pip install llama-cpp-python
+
+
+FROM ${BASE_IMAGE}
+
+WORKDIR /content
+ENV USE_MLOCK=0
+ENV MODEL_DIR="/content/saved-model"
+ENV HOST=0.0.0.0
+ENV PORT=8080
+ENV PATH="/venv/bin:$PATH"
+COPY --from=build /venv /venv
+COPY --from=build /llama-cpp-python /llama-cpp-python
+RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/apt \
+    apt-get update && \
+    apt-get -y --no-install-recommends install \
+    git wget \
+    ocl-icd-libopencl1 clinfo libclblast1 libopenblas0 && \
+    mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
+    rm -rf /var/lib/apt/lists/*
+
 
 
 COPY scripts/ scripts/
diff --git a/model-server-llama-cpp/README.md b/model-server-llama-cpp/README.md
index af813b9..3d86bae 100644
--- a/model-server-llama-cpp/README.md
+++ b/model-server-llama-cpp/README.md
@@ -9,5 +9,24 @@ The image expects a single GGML model as a single bin file under the /content/sa
 ### Building
 Build the image:
 ```sh
-docker build -t substratusai/model-server-llama-cpp .
+docker build -t llama-cpp .
+```
+
+### Testing
+Download a GGML model:
+```bash
+curl -L -o model.bin https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q2_K.bin
+```
+
+Run the image with that model:
+```bash
+docker run --gpus=all -d -p 8080:8080 \
+  -v $PWD/model.bin:/content/saved-model/model.bin llama-cpp
+```
+
+Verify it's working:
+```bash
+curl http://localhost:8080/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{ "prompt": "Who was the first president of the United States?", "stop": ["."]}'
 ```
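
The `/v1/completions` check added to the README can also be driven from a script. Below is a minimal sketch of the same request from Python using the `requests` library; it assumes the container is running locally on port 8080 (as in the `docker run` example) and that the server returns an OpenAI-style completions payload with the generated text under `choices[0].text`.

```python
# Minimal sketch: the same completion request as the curl example in the
# README, sent from Python. Assumes the llama-cpp container is listening
# on localhost:8080 and returns an OpenAI-style response body.
import requests

resp = requests.post(
    "http://localhost:8080/v1/completions",
    json={
        "prompt": "Who was the first president of the United States?",
        "stop": ["."],
    },
    timeout=300,
)
resp.raise_for_status()
# OpenAI-style completions nest the generated text under choices[0].text.
print(resp.json()["choices"][0]["text"])
```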