llama-cpp add support for cpu
samos123 committed Aug 29, 2023
1 parent d72d4fa commit f153836
Showing 3 changed files with 52 additions and 11 deletions.
30 changes: 26 additions & 4 deletions .github/workflows/build-and-push.yaml
@@ -209,18 +209,40 @@ jobs:
username: "${{ vars.DOCKERHUB_USERNAME }}"
password: "${{ secrets.DOCKERHUB_TOKEN }}"
- name: Docker meta
id: meta
id: meta-cpu
uses: docker/metadata-action@v4
with:
images: substratusai/${{ github.job }}
flavor: |
latest=auto
suffix=-cpu
- name: Build and push
id: build-and-push
uses: docker/build-push-action@v4
with:
context: ./${{ github.job }}
platforms: "linux/amd64"
build-args: |
BASE_IMAGE=${{ needs.base.outputs.tag }}
COMPUTE_TYPE=cpu
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta-cpu.outputs.tags }}
labels: ${{ steps.meta-cpu.outputs.labels }}
- name: Docker meta
id: meta-gpu
uses: docker/metadata-action@v4
with:
images: substratusai/${{ github.job }}
flavor: |
latest=auto
suffix=-gpu,onlatest=true
- name: Build and push
uses: docker/build-push-action@v4
with:
context: ./${{ github.job }}
platforms: "linux/amd64"
build-args: |
BASE_IMAGE=${{ needs.base.outputs.tag }}
COMPUTE_TYPE=gpu
push: true
tags: ${{ steps.meta-gpu.outputs.tags }}
labels: ${{ steps.meta-gpu.outputs.labels }}
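The two CI builds above differ only in the tag suffix and the `COMPUTE_TYPE` build arg. For local debugging they can be reproduced roughly with plain `docker build`; a minimal sketch, assuming the job (and therefore the build context) is the `model-server-llama-cpp` directory and letting `BASE_IMAGE` fall back to its Dockerfile default rather than the freshly built base tag the workflow passes:

```bash
# Rough local equivalent of the two CI builds; the directory name is an
# assumption, and BASE_IMAGE is left at its Dockerfile default
# (substratusai/base:latest) instead of the workflow's needs.base.outputs.tag.
docker build -t llama-cpp:cpu --build-arg COMPUTE_TYPE=cpu ./model-server-llama-cpp
docker build -t llama-cpp:gpu --build-arg COMPUTE_TYPE=gpu ./model-server-llama-cpp
```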
8 changes: 6 additions & 2 deletions model-server-llama-cpp/Dockerfile
@@ -1,5 +1,7 @@
ARG BASE_IMAGE=substratusai/base:latest
ARG COMPUTE_TYPE=gpu
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 as build
ARG COMPUTE_TYPE

RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/apt \
apt-get update && \
@@ -20,8 +20,10 @@ ENV LLAMA_CUBLAS=1
# Install dependencies
RUN /venv/bin/python3 -m pip install --upgrade pip wheel pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings

RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git && \
cd llama-cpp-python && make build.cuda
RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git
WORKDIR /llama-cpp-python
RUN if [ "$COMPUTE_TYPE" = "gpu" ]; then make build.cuda; fi
RUN if [ "$COMPUTE_TYPE" = "cpu" ]; then make build.openblas; fi

FROM ${BASE_IMAGE}

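Since the CPU and GPU variants otherwise look identical, it can be useful to confirm which backend a built image actually compiled in. A minimal sketch, under two assumptions not visible in this diff: that the final image keeps the `/venv` virtualenv from the build stage, and that the installed llama-cpp-python exposes the `llama_print_system_info()` binding:

```bash
# Hypothetical backend check (assumes /venv exists in the final image and that
# llama_cpp exposes llama_print_system_info); "BLAS = 1" in the output
# indicates an accelerated (cuBLAS or OpenBLAS) build.
docker run --rm --entrypoint /venv/bin/python3 llama-cpp:cpu \
  -c "import llama_cpp; print(llama_cpp.llama_print_system_info().decode())"
```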
25 changes: 20 additions & 5 deletions model-server-llama-cpp/README.md
@@ -7,12 +7,17 @@ The image expects a single GGML model as a single bin file under the /content/sa
## Usage for testing

### Building
Build the image:
Build the image for CPU:
```sh
docker build -t llama-cpp .
docker build -t llama-cpp:cpu --build-arg "COMPUTE_TYPE=cpu" .
```

### Testing
Build the image for GPU:
```bash
docker build -t llama-cpp:gpu --build-arg "COMPUTE_TYPE=gpu" .
```

### Download and convert a model
Download a GGML model:
```bash
curl -L -o model-ggml.bin https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q2_K.bin
@@ -22,13 +27,23 @@ Convert the model to GGUF with this [script](https://github.com/ggerganov/llama.
```bash
convert-llama-ggmlv3-to-gguf.py --input model-ggml.bin --output model.bin
```
### Run the image
You can run it in CPU-only mode or with a GPU.

Run the image with that model using CPU:
```bash
docker run -d -p 8080:8080 --security-opt seccomp=unconfined \
-v $PWD/model.bin:/content/saved-model/model.bin --cap-add SYS_RESOURCE \
-e USE_MLOCK=0 -e MODEL=/content/saved-model/model.bin \
llama-cpp:cpu
```

Run the image with that model:
Run the image with that model using GPU:
```bash
docker run --gpus=all -d -p 8080:8080 --security-opt seccomp=unconfined \
-v $PWD/model.bin:/content/saved-model/model.bin --cap-add SYS_RESOURCE \
-e USE_MLOCK=0 -e MODEL=/content/saved-model/model.bin \
-e N_GPU_LAYERS=30 llama-cpp
-e N_GPU_LAYERS=30 llama-cpp:gpu
```
Note that `N_GPU_LAYERS=30` offloads 30 layers to the GPU. You can increase
that number if you have more GPU memory available.
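Once a container is running, a quick request exercises the server end to end. A minimal smoke test, assuming the image serves llama-cpp-python's OpenAI-compatible API on the mapped port 8080 (the endpoint path comes from upstream llama-cpp-python and is not shown in this commit):

```bash
# Hypothetical smoke test against the OpenAI-compatible completions endpoint.
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Hello, my name is", "max_tokens": 16}'
```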
