diff --git a/model-server-llama-cpp/Dockerfile b/model-server-llama-cpp/Dockerfile
index 03a3d88..35b31fb 100644
--- a/model-server-llama-cpp/Dockerfile
+++ b/model-server-llama-cpp/Dockerfile
@@ -1,32 +1,49 @@
-ARG BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04
-FROM ${BASE_IMAGE}
-
-ENV MODEL_DIR="/content/saved-model"
-ENV HOST=0.0.0.0
-ENV PORT=8080
-
-WORKDIR /content
+ARG BASE_IMAGE=substratusai/base:latest
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 as build
 
 RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/apt \
     apt-get update && \
     apt-get -y --no-install-recommends install \
-    python3 python3-pip git build-essential gcc wget \
+    python3 python3-pip python3-venv git build-essential gcc wget \
     ocl-icd-opencl-dev opencl-headers clinfo libclblast-dev libopenblas-dev && \
     mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
     rm -rf /var/lib/apt/lists/*
-
-RUN ln -s /usr/bin/python3 /usr/bin/python
+# Ensures that the python and pip executables used
+# in the image will be those from our virtualenv.
+RUN python3 -m venv /venv
+ENV PATH="/venv/bin:$PATH"
 
 # setting build related env vars
 ENV CUDA_DOCKER_ARCH=all
 ENV LLAMA_CUBLAS=1
-ENV USE_MLOCK=0
 
 # Install depencencies
-RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN /venv/bin/python3 -m pip install --upgrade pip wheel pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git && \
+    cd llama-cpp-python && make build.cuda
 
 # Install llama-cpp-python (build with cuda)
-RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+# RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 /venv/bin/python3 -m pip install llama-cpp-python
+
+
+FROM ${BASE_IMAGE}
+
+WORKDIR /content
+ENV USE_MLOCK=0
+ENV MODEL_DIR="/content/saved-model"
+ENV HOST=0.0.0.0
+ENV PORT=8080
+ENV PATH="/venv/bin:$PATH"
+COPY --from=build /venv /venv
+COPY --from=build /llama-cpp-python /llama-cpp-python
+RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/apt \
+    apt-get update && \
+    apt-get -y --no-install-recommends install \
+    git wget \
+    ocl-icd-libopencl1 clinfo libclblast1 libopenblas0 && \
+    mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
+    rm -rf /var/lib/apt/lists/*
+
 
 
 COPY scripts/ scripts/
diff --git a/model-server-llama-cpp/README.md b/model-server-llama-cpp/README.md
index af813b9..3d86bae 100644
--- a/model-server-llama-cpp/README.md
+++ b/model-server-llama-cpp/README.md
@@ -9,5 +9,24 @@ The image expects a single GGML model as a single bin file under the /content/sa
 ### Building
 Build the image:
 ```sh
-docker build -t substratusai/model-server-llama-cpp .
+docker build -t llama-cpp .
+```
+
+### Testing
+Download a GGML model:
+```bash
+curl -L -o model.bin https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q2_K.bin
+```
+
+Run the image with that model:
+```bash
+docker run --gpus=all -d -p 8080:8080 \
+  -v $PWD/model.bin:/content/saved-model/model.bin llama-cpp
+```
+
+Verify it's working:
+```bash
+curl http://localhost:8080/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{ "prompt": "Who was the first president of the United States?", "stop": ["."]}'
 ```
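
The `/v1/completions` check added to the README can also be driven from a script. Below is a minimal sketch of the same request from Python using the `requests` library; it assumes the container is running locally on port 8080 (as in the `docker run` example) and that the server returns an OpenAI-style completions payload with the generated text under `choices[0].text`.

```python
# Minimal sketch: the same completion request as the curl example in the
# README, sent from Python. Assumes the llama-cpp container is listening
# on localhost:8080 and returns an OpenAI-style response body.
import requests

resp = requests.post(
    "http://localhost:8080/v1/completions",
    json={
        "prompt": "Who was the first president of the United States?",
        "stop": ["."],
    },
    timeout=300,
)
resp.raise_for_status()
# OpenAI-style completions nest the generated text under choices[0].text.
print(resp.json()["choices"][0]["text"])
```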