ensure llama cpp can use GPU (#36)
Hitting an issue when trying out a multistage build with venv:
```
CUDA error 304 at /tmp/pip-install-2buew0g7/llama-cpp-python_d94ee4c9feba4392a5a6259b67b5556f/vendor/llama.cpp/ggml-cuda.cu:5056: OS call failed or operation not supported on this OS
```
samos123 authored Aug 28, 2023
1 parent ad53c22 commit d72d4fa
Showing 3 changed files with 65 additions and 14 deletions.
41 changes: 28 additions & 13 deletions model-server-llama-cpp/Dockerfile
@@ -1,31 +1,46 @@
ARG BASE_IMAGE=substratusai/base:latest
FROM ${BASE_IMAGE}

ENV MODEL_DIR="/content/saved-model"
ENV HOST=0.0.0.0
ENV PORT=8080

WORKDIR /content
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 as build

RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/apt \
apt-get update && \
apt-get -y --no-install-recommends install \
python3 python3-pip git build-essential gcc wget \
python3 python3-pip python3-venv git build-essential gcc wget \
ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev && \
mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
rm -rf /var/lib/apt/lists/*

RUN ln -s /usr/bin/python3 /usr/bin/python
# Ensures that the python and pip executables used
# in the image will be those from our virtualenv.
RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

# setting build related env vars
ENV CUDA_DOCKER_ARCH=all
ENV LLAMA_CUBLAS=1

# Install dependencies
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
RUN /venv/bin/python3 -m pip install --upgrade pip wheel pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings

RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git && \
cd llama-cpp-python && make build.cuda

FROM ${BASE_IMAGE}

WORKDIR /content
ENV USE_MLOCK=0
ENV MODEL_DIR="/content/saved-model"
ENV HOST=0.0.0.0
ENV PORT=8080
ENV PATH="/venv/bin:$PATH"
COPY --from=build /venv /venv
COPY --from=build /llama-cpp-python /llama-cpp-python
RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/apt \
apt-get update && \
apt-get -y --no-install-recommends install \
git wget \
ocl-icd-libopencl1 clinfo libclblast1 libopenblas0 && \
mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
rm -rf /var/lib/apt/lists/*

# Install llama-cpp-python (build with cuda)
RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

COPY scripts/ scripts/

36 changes: 35 additions & 1 deletion model-server-llama-cpp/README.md
@@ -9,5 +9,39 @@ The image expects a single GGML model as a single bin file under the /content/sa
### Building
Build the image:
```sh
docker build -t substratusai/model-server-llama-cpp .
docker build -t llama-cpp .
```
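
The runtime stage is built from the `BASE_IMAGE` build argument (default `substratusai/base:latest` in the Dockerfile above), so it can be overridden at build time if you need a different base; for example:
```sh
docker build --build-arg BASE_IMAGE=substratusai/base:latest -t llama-cpp .
```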

### Testing
Download a GGML model:
```bash
curl -L -o model-ggml.bin https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q2_K.bin
```

Convert the model to GGUF with this [script](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-ggmlv3-to-gguf.py):
```bash
convert-llama-ggmlv3-to-gguf.py --input model-ggml.bin --output model.bin
```

Run the image with that model:
```bash
docker run --gpus=all -d -p 8080:8080 --security-opt seccomp=unconfined \
-v $PWD/model.bin:/content/saved-model/model.bin --cap-add SYS_RESOURCE \
-e USE_MLOCK=0 -e MODEL=/content/saved-model/model.bin \
-e N_GPU_LAYERS=30 llama-cpp
```
Setting `N_GPU_LAYERS=30` offloads 30 layers to the GPU. You can increase that number if you have more GPU memory available.
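
To confirm the layers were actually offloaded, check the container logs and GPU memory usage. The exact log wording varies across llama.cpp versions, so treat the `grep` pattern below as a rough filter:
```bash
# llama.cpp prints how many layers it offloaded while loading the model
docker logs $(docker ps -lq) 2>&1 | grep -i offload

# the server process should also show up in nvidia-smi with memory allocated
nvidia-smi
```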

Verify it's working:
```bash
curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{ "prompt": "Who was the first president of the United States?", "stop": ["."]}'
```
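
You can also pass standard OpenAI-style completion parameters such as `max_tokens`, `temperature`, and `stop` in the request body (exact parameter support depends on the llama-cpp-python version baked into the image):
```bash
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "prompt": "List three facts about George Washington.",
        "max_tokens": 128,
        "temperature": 0.7,
        "stop": ["\n\n"]
      }'
```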

You can also run an OpenAI-compatible UI to test it out:
```bash
docker run -e OPENAI_API_KEY=notused -e OPENAI_API_HOST=http://localhost:8080 \
--net=host -p 3000:3000 ghcr.io/mckaywrigley/chatbot-ui:main
```
2 changes: 2 additions & 0 deletions model-server-llama-cpp/scripts/serve.sh
@@ -4,4 +4,6 @@ set -xe

ls ${MODEL_DIR}
export MODEL=$(find "${MODEL_DIR}" -type f -iname "*.bin" | head -n 1)
# TODO figure out how to automatically set N_GPU_LAYERS when a GPU is available
export N_GPU_LAYERS="${PARAM_N_GPU_LAYERS:-${N_GPU_LAYERS:-0}}"
PYTHONUNBUFFERED=1 python3 -m llama_cpp.server
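
One possible way to address the TODO above (sketched here as an illustration, not part of this commit) is to probe for a GPU with `nvidia-smi` before choosing a default for `N_GPU_LAYERS`:
```bash
# Illustrative sketch: offload layers only when a GPU is visible in the container.
if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L >/dev/null 2>&1; then
  export N_GPU_LAYERS="${PARAM_N_GPU_LAYERS:-${N_GPU_LAYERS:-30}}"
else
  export N_GPU_LAYERS="${PARAM_N_GPU_LAYERS:-${N_GPU_LAYERS:-0}}"
fi
```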
