diff --git a/.github/workflows/build-and-push.yaml b/.github/workflows/build-and-push.yaml
index 841d149..4bd1769 100644
--- a/.github/workflows/build-and-push.yaml
+++ b/.github/workflows/build-and-push.yaml
@@ -209,18 +209,40 @@ jobs:
           username: "${{ vars.DOCKERHUB_USERNAME }}"
           password: "${{ secrets.DOCKERHUB_TOKEN }}"
       - name: Docker meta
-        id: meta
+        id: meta-cpu
         uses: docker/metadata-action@v4
         with:
           images: substratusai/${{ github.job }}
+          flavor: |
+            latest=auto
+            suffix=-cpu
       - name: Build and push
-        id: build-and-push
         uses: docker/build-push-action@v4
         with:
           context: ./${{ github.job }}
           platforms: "linux/amd64"
           build-args: |
             BASE_IMAGE=${{ needs.base.outputs.tag }}
+            COMPUTE_TYPE=cpu
           push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
+          tags: ${{ steps.meta-cpu.outputs.tags }}
+          labels: ${{ steps.meta-cpu.outputs.labels }}
+      - name: Docker meta
+        id: meta-gpu
+        uses: docker/metadata-action@v4
+        with:
+          images: substratusai/${{ github.job }}
+          flavor: |
+            latest=auto
+            suffix=-gpu,onlatest=true
+      - name: Build and push
+        uses: docker/build-push-action@v4
+        with:
+          context: ./${{ github.job }}
+          platforms: "linux/amd64"
+          build-args: |
+            BASE_IMAGE=${{ needs.base.outputs.tag }}
+            COMPUTE_TYPE=gpu
+          push: true
+          tags: ${{ steps.meta-gpu.outputs.tags }}
+          labels: ${{ steps.meta-gpu.outputs.labels }}
diff --git a/model-server-llama-cpp/Dockerfile b/model-server-llama-cpp/Dockerfile
index 325c9bf..40a8062 100644
--- a/model-server-llama-cpp/Dockerfile
+++ b/model-server-llama-cpp/Dockerfile
@@ -1,5 +1,7 @@
 ARG BASE_IMAGE=substratusai/base:latest
+ARG COMPUTE_TYPE=gpu
 FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 as build
+ARG COMPUTE_TYPE
 
 RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/apt \
     apt-get update && \
@@ -20,8 +22,10 @@ ENV LLAMA_CUBLAS=1
 
 # Install depencencies
 RUN /venv/bin/python3 -m pip install --upgrade pip wheel pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
 
-RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git && \
-    cd llama-cpp-python && make build.cuda
+RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git
+WORKDIR /llama-cpp-python
+RUN if [ "$COMPUTE_TYPE" = "gpu" ]; then make build.cuda; fi
+RUN if [ "$COMPUTE_TYPE" = "cpu" ]; then make build.openblas; fi
 
 FROM ${BASE_IMAGE}
diff --git a/model-server-llama-cpp/README.md b/model-server-llama-cpp/README.md
index 87cf420..a6be4e6 100644
--- a/model-server-llama-cpp/README.md
+++ b/model-server-llama-cpp/README.md
@@ -7,12 +7,17 @@ The image expects a single GGML model as a single bin file under the /content/sa
 ## Usage for testing
 
 ### Building
-Build the image:
+Build the image for CPU:
 ```sh
-docker build -t llama-cpp .
+docker build -t llama-cpp:cpu --build-arg "COMPUTE_TYPE=cpu" .
 ```
 
-### Testing
+Build the image for GPU:
+```bash
+docker build -t llama-cpp:gpu --build-arg "COMPUTE_TYPE=gpu" .
+```
+
+### Download and convert a model
 Download a GGML model:
 ```bash
 curl -L -o model-ggml.bin https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q2_K.bin
@@ -22,13 +27,23 @@ Convert the model to GGUF with this [script](https://github.com/ggerganov/llama.
 ```bash
 convert-llama-ggmlv3-to-gguf.py --input model-ggml.bin --output model.bin
 ```
+### Run the image
+You can run it in CPU-only mode or with a GPU.
+
+Run the image with that model using CPU:
+```bash
+docker run -d -p 8080:8080 --security-opt seccomp=unconfined \
+  -v $PWD/model.bin:/content/saved-model/model.bin --cap-add SYS_RESOURCE \
+  -e USE_MLOCK=0 -e MODEL=/content/saved-model/model.bin \
+  llama-cpp:cpu
+```
 
-Run the image with that model:
+Run the image with that model using GPU:
 ```bash
 docker run --gpus=all -d -p 8080:8080 --security-opt seccomp=unconfined \
   -v $PWD/model.bin:/content/saved-model/model.bin --cap-add SYS_RESOURCE \
   -e USE_MLOCK=0 -e MODEL=/content/saved-model/model.bin \
-  -e N_GPU_LAYERS=30 llama-cpp
+  -e N_GPU_LAYERS=30 llama-cpp:gpu
 ```
 Note that `N_GPU_LAYERS` will cause it to load 30 layers to the GPU. You can increase that number from `30` to something more if you have more GPU memory available.
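Once either container is up, a quick smoke test confirms the server is answering. This is a minimal sketch under one assumption not confirmed by the diff: that the image starts llama-cpp-python's OpenAI-compatible HTTP server on port 8080, as the `-p 8080:8080` mapping above suggests.

```bash
# Smoke test (assumption: the container serves llama-cpp-python's
# OpenAI-compatible API on the port mapped above).
# List the loaded model:
curl http://localhost:8080/v1/models

# Request a short completion:
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Hello, my name is", "max_tokens": 16}'
```

Both endpoints belong to the OpenAI-style surface that llama-cpp-python's `llama_cpp.server` module exposes; if the image wires the server up differently, adjust the paths accordingly.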
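On the CI side, the two `Docker meta` steps should publish distinct per-variant tags. A hypothetical illustration for a `v1.2.3` release, assuming docker/metadata-action's documented `flavor` semantics and a job named `model-server-llama-cpp` (the job name is not shown in the diff):

```bash
# Expected tags (assumption, not verified against an actual workflow run):
#   meta-cpu (suffix=-cpu):                substratusai/model-server-llama-cpp:v1.2.3-cpu
#                                          substratusai/model-server-llama-cpp:latest
#                                          (onlatest defaults to false, so latest is unsuffixed)
#   meta-gpu (suffix=-gpu,onlatest=true):  substratusai/model-server-llama-cpp:v1.2.3-gpu
#                                          substratusai/model-server-llama-cpp:latest-gpu
docker pull substratusai/model-server-llama-cpp:latest-gpu
```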