Dockerfile (new file: 182 additions, 0 deletions)
@@ -0,0 +1,182 @@
# syntax=docker/dockerfile:1.7-labs

########################################
# Kolosal Server – CUDA Docker image
# - Multi-stage: build (devel) -> runtime (slim)
# - Defaults to GPU (CUDA) build
# - Prefers config_basic.yaml as default config (falls back to config_rms.yaml, then config.yaml)
# - Copies only required runtime bits
########################################

ARG CUDA_VERSION=12.4.1
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
FROM ${BASE_IMAGE} AS build

ARG DEBIAN_FRONTEND=noninteractive
ARG TZ=UTC
ARG BUILD_TYPE=Release
ARG ENABLE_CUDA=ON
ARG ENABLE_NATIVE_OPTIMIZATION=OFF
ARG USE_PODOFO=ON
ARG CMAKE_VERSION=3.27.9

ENV TZ=${TZ} \
CC=gcc \
CXX=g++ \
BUILD_TYPE=${BUILD_TYPE}

# Build dependencies (system CURL required by inference/CMakeLists on Linux)
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential git pkg-config ca-certificates curl \
cmake ninja-build ccache \
libcurl4-openssl-dev libssl-dev libbz2-dev \
libomp-dev libblas-dev liblapack-dev \
# PDF (PoDoFo) optional deps – safe to install even if disabled
libfreetype6-dev libjpeg-dev libpng-dev libtiff-dev libxml2-dev libfontconfig1-dev \
&& rm -rf /var/lib/apt/lists/*

# Upgrade to pinned CMake (>=3.23 required by PoDoFo)
RUN set -eux; \
ver="$(cmake --version 2>/dev/null | awk 'NR==1{print $3}' || true)"; \
if [ "$ver" != "${CMAKE_VERSION}" ]; then \
cd /tmp; \
curl -fsSL -o cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz; \
tar -xf cmake.tar.gz; \
cp -r cmake-${CMAKE_VERSION}-linux-x86_64/bin/* /usr/local/bin/; \
cp -r cmake-${CMAKE_VERSION}-linux-x86_64/share/cmake* /usr/local/share/ || true; \
rm -rf cmake-* cmake.tar.gz; \
fi; \
cmake --version

# Speed up rebuilds
ENV PATH=/usr/lib/ccache:${PATH} \
CCACHE_DIR=/root/.ccache \
CCACHE_MAXSIZE=1G

WORKDIR /src

# Copy repository (rely on .dockerignore to keep context small)
COPY . .

# Initialize submodules when available (no-op if not a git context)
RUN if [ -d .git ]; then git submodule update --init --recursive; else echo "No .git directory – skipping submodules"; fi

# Ensure llama.cpp source is present (fallback when submodules are not in context)
RUN set -eux; \
if [ ! -f inference/external/llama.cpp/CMakeLists.txt ] && [ ! -f external/llama.cpp/CMakeLists.txt ]; then \
echo "[Docker build] llama.cpp not found in repo – cloning shallow copy..."; \
mkdir -p inference/external; \
git clone --depth=1 https://github.com/ggerganov/llama.cpp.git inference/external/llama.cpp; \
else \
echo "[Docker build] Found llama.cpp sources in repo"; \
fi

# Configure & build (CUDA by default)
RUN set -eux; \
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCMAKE_C_COMPILER=${CC} -DCMAKE_CXX_COMPILER=${CXX} \
-DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \
-DUSE_CUDA=${ENABLE_CUDA} \
-DUSE_PODOFO=${USE_PODOFO} \
-DGGML_CUDA_NO_VMM=ON \
-DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so \
-DBUILD_TESTING=OFF \
-DBUILD_INFERENCE_TESTS=OFF; \
cmake --build build --config ${BUILD_TYPE}

# Verify single-config output dir (project sets output to build/<TYPE>)
RUN set -eux; \
OUTDIR="build/${BUILD_TYPE}"; \
test -x "${OUTDIR}/kolosal-server" || { echo "Build output not found at ${OUTDIR}"; ls -la build || true; exit 1; };

# Collect runtime payload
RUN set -eux; \
OUTDIR="build/${BUILD_TYPE}"; \
strip -s "${OUTDIR}/kolosal-server" || true; \
mkdir -p /out/bin /out/config /out/libs /out/licenses; \
cp "${OUTDIR}/kolosal-server" /out/bin/; \
# Prefer basic (vanilla) config by default, then fall back
if [ -f configs/config_basic.yaml ]; then \
cp configs/config_basic.yaml /out/config/config_basic.yaml; \
cp configs/config_basic.yaml /out/config/config.yaml; \
elif [ -f configs/config_rms.yaml ]; then \
cp configs/config_rms.yaml /out/config/config_rms.yaml; \
cp configs/config_rms.yaml /out/config/config.yaml; \
elif [ -f config_basic.yaml ]; then \
cp config_basic.yaml /out/config/config_basic.yaml; \
cp config_basic.yaml /out/config/config.yaml; \
elif [ -f config_rms.yaml ]; then \
cp config_rms.yaml /out/config/config_rms.yaml; \
cp config_rms.yaml /out/config/config.yaml; \
elif [ -f config.yaml ]; then \
cp config.yaml /out/config/config.yaml; \
else \
echo "No config found; you can mount one at runtime"; \
fi; \
# Shared libs placed by post-build step alongside the exe
for p in libllama-*.so* libkolosal_server.so*; do \
# Leave $p unquoted so the glob expands against ${OUTDIR}
if ls ${OUTDIR}/$p 1>/dev/null 2>&1; then \
cp -n ${OUTDIR}/$p /out/libs/ || true; \
fi; \
done; \
# Non-system dependencies referenced by the binary
ldd "${OUTDIR}/kolosal-server" | awk '{for(i=1;i<=NF;i++) if ($i ~ /\//) print $i}' | sort -u > /tmp/libs.txt || true; \
while read -r lib; do case "$lib" in /lib/*|/usr/lib/*) ;; *) cp -n "$lib" /out/libs/ 2>/dev/null || true ;; esac; done < /tmp/libs.txt; \
cp LICENSE /out/licenses/ 2>/dev/null || true; \
echo "Collected libs:"; ls -1 /out/libs || true

########################################
# Runtime image
########################################
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime

ARG DEBIAN_FRONTEND=noninteractive
ENV LD_LIBRARY_PATH=/usr/local/lib:/app/libs:/usr/local/cuda/lib64 \
KOL_MODELS_DIR=/app/models

# Minimal runtime deps (keep in sync with ldd if needed)
RUN apt-get update && apt-get install -y --no-install-recommends \
libcurl4 libssl3 libbz2-1.0 zlib1g libgomp1 ca-certificates curl tini \
libblas3 liblapack3 libgfortran5 \
# PoDoFo runtime libs (safe if unused)
libfreetype6 libjpeg-turbo8 libpng16-16 libtiff5 libxml2 fontconfig \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY --from=build /out/bin/kolosal-server /usr/local/bin/kolosal-server
COPY --from=build /out/config /app/config
COPY --from=build /out/libs /app/libs
COPY --from=build /out/licenses /licenses

# Make inference libs discoverable and refresh linker cache
RUN set -eux; \
mkdir -p /usr/local/lib; \
if ls /app/libs/libllama-*.so* 1>/dev/null 2>&1; then cp /app/libs/libllama-*.so* /usr/local/lib/ || true; fi; \
if ls /app/libs/libkolosal_server.so* 1>/dev/null 2>&1; then cp /app/libs/libkolosal_server.so* /usr/local/lib/ || true; fi; \
ldconfig || true

# Simple entrypoint wrapper – prefer config_basic.yaml, then config_rms.yaml, then config.yaml
RUN printf '%s\n' '#!/bin/sh' \
'set -e' \
'CONFIG="/app/config/config_basic.yaml"' \
'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config_rms.yaml"; fi' \
'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config.yaml"; fi' \
'echo "Starting kolosal-server with: $CONFIG"' \
'exec kolosal-server --config "$CONFIG"' \
> /usr/local/bin/kolosal-entry.sh && chmod +x /usr/local/bin/kolosal-entry.sh

# Non-root runtime
RUN useradd -r -u 10001 -d /app kolosal && chown -R kolosal:kolosal /app
USER kolosal

VOLUME ["/app/models", "/app/data"]

EXPOSE 8080

HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
CMD curl -fsS http://localhost:8080/v1/health || exit 1

ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/kolosal-entry.sh"]
README.md (166 additions, 0 deletions)
@@ -24,6 +24,172 @@ A high-performance inference server for large language models with OpenAI-compatible

## Quick Start

### Docker (CUDA GPU)

Prerequisites:
- NVIDIA GPU + drivers on host
- NVIDIA Container Toolkit installed (smoke test below)
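
You can sanity-check the toolkit before building; a quick smoke test (the CUDA base tag here is just an example):

```bash
# Should print the host GPU table if --gpus passthrough works
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```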

Build (CUDA by default):

```powershell
docker build -t kolosal-server:cuda . --build-arg BUILD_TYPE=Release --build-arg ENABLE_CUDA=ON
```
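
Other build arguments from the Dockerfile can be overridden the same way. As a sketch, a CPU-only build of the server (the `ENABLE_CUDA` arg maps to the project's `USE_CUDA` CMake option; the image still derives from the CUDA base):

```bash
# CPU-only binary; optionally enable native CPU tuning (not portable across hosts)
docker build -t kolosal-server:cpu . \
  --build-arg ENABLE_CUDA=OFF \
  --build-arg ENABLE_NATIVE_OPTIMIZATION=ON
```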

Run on GPU (the image prefers config_basic.yaml, falling back to config_rms.yaml and config.yaml):

```powershell
# expose port 8080; mount models dir (optional)
docker run --rm --gpus all -p 8080:8080 -v ${PWD}\models:/app/models kolosal-server:cuda
```

Use a custom config (for example, your edited config_rms.yaml in the local configs folder):

```powershell
docker run --rm --gpus all -p 8080:8080 `
-v ${PWD}\models:/app/models `
-v ${PWD}\configs:/app/config `
kolosal-server:cuda
```

Health check:

```powershell
curl http://localhost:8080/v1/health
```

### Docker (prebuilt images from GHCR)

Use a ready-made image from GitHub Container Registry so you don’t rebuild on every VM.

#### Prerequisites

- NVIDIA GPU + drivers on the host
- NVIDIA Container Toolkit installed (for `--gpus all`)
- Open port 8080/tcp on your VM firewall/security group if accessing remotely (example below)
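
For example, on a VM where `ufw` is the active firewall (an assumption; a cloud security group may govern this instead):

```bash
# Allow inbound API traffic on the server port
sudo ufw allow 8080/tcp
sudo ufw status
```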

#### Pull the image

Windows PowerShell:

```powershell
docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
```

Linux/macOS:

```bash
docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
```

If the package is private, log in first with a Personal Access Token (scope: `read:packages`):

```powershell
$env:GHCR_TOKEN = "<YOUR_GHCR_PAT>"
echo $env:GHCR_TOKEN | docker login ghcr.io -u <YOUR_GH_USERNAME> --password-stdin
```
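
Linux/macOS:

```bash
# Same login flow with a PAT (read:packages scope)
export GHCR_TOKEN="<YOUR_GHCR_PAT>"
echo "$GHCR_TOKEN" | docker login ghcr.io -u <YOUR_GH_USERNAME> --password-stdin
```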

#### Run (GPU)

Minimal run (exposes 8080 and runs with GPU):

Windows PowerShell:

```powershell
docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
```

Linux/macOS:

```bash
docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
```

Mount a models directory (recommended):

Windows PowerShell:

```powershell
docker run -d --name kolosal-server --gpus all --restart unless-stopped `
-p 8080:8080 `
-v C:\kolosal\models:/app/models `
ghcr.io/kolosalai/kolosal-server:v0.0.1
```

Linux/macOS:

```bash
docker run -d --name kolosal-server --gpus all --restart unless-stopped \
-p 8080:8080 \
-v $PWD/models:/app/models \
ghcr.io/kolosalai/kolosal-server:v0.0.1
```

#### Choose a config

The image's entrypoint resolves the config in this order inside the container:

1. `/app/config/config_basic.yaml` (embedding-only “vanilla”)
2. `/app/config/config_rms.yaml`
3. `/app/config/config.yaml`
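
The entrypoint echoes which file it picked, so you can confirm the selection from the container logs:

```bash
# The startup line names the chosen config
docker logs kolosal-server 2>&1 | grep "Starting kolosal-server with"
```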

To use your own config, bind-mount it to `/app/config/config.yaml`:

Windows PowerShell:

```powershell
docker run -d --name kolosal-server --gpus all --restart unless-stopped `
-p 8080:8080 `
-v ${PWD}\configs\config_basic.yaml:/app/config/config.yaml:ro `
-v ${PWD}\models:/app/models `
ghcr.io/kolosalai/kolosal-server:v0.0.1
```

Linux/macOS:

```bash
docker run -d --name kolosal-server --gpus all --restart unless-stopped \
-p 8080:8080 \
-v $PWD/configs/config_basic.yaml:/app/config/config.yaml:ro \
-v $PWD/models:/app/models \
ghcr.io/kolosalai/kolosal-server:v0.0.1
```

Note: Ensure `server.allow_public_access: true` in your config if you’ll call the API from outside the container/host.
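
Once enabled, verify remote reachability from another machine (replace `<VM_IP>` with your server's address):

```bash
# Run from outside the VM; requires 8080/tcp to be open
curl http://<VM_IP>:8080/v1/health
```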

#### Verify and logs

```powershell
curl http://localhost:8080/v1/health
docker logs -f kolosal-server
```

If a model shows as “unloaded”, that is expected when `load_immediately` is false; it loads on first use.
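
Because the API is OpenAI-compatible, any first request triggers the load. A minimal sketch, assuming a chat model is defined in your config (`<model-id>` is a placeholder for its id):

```bash
# Hypothetical first request; use a model id from your config
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "<model-id>", "messages": [{"role": "user", "content": "Hello"}]}'
```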

#### Persist data

- Models: mount a host folder to `/app/models`
- App data/cache: mount a host folder to `/app/data` if desired
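
Putting both together (host paths are examples):

```bash
# Persist models and app data across container replacements
docker run -d --name kolosal-server --gpus all --restart unless-stopped \
  -p 8080:8080 \
  -v $PWD/models:/app/models \
  -v $PWD/data:/app/data \
  ghcr.io/kolosalai/kolosal-server:v0.0.1
```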

#### Update or rollback

```powershell
# Update to a new tag
docker pull ghcr.io/kolosalai/kolosal-server:v0.0.2
docker rm -f kolosal-server
docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.2

# Rollback to previous
docker rm -f kolosal-server
docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
```

#### Troubleshooting

- Connection reset: confirm `-p 8080:8080` and `server.allow_public_access: true` in the config.
- GPU not used: ensure NVIDIA drivers + NVIDIA Container Toolkit; run with `--gpus all`; check `nvidia-smi` on host.
- Missing models: mount a models directory to `/app/models` or use paths/URLs in your config.
- 404s: check path (`/v1/health`, `/v1/models`, etc.).
- Authentication: set `auth.enabled` and `api_keys` in your config and send the `X-API-Key` header (example below).
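
For the authentication case, a request with the key header looks like this (the key value is whatever you configured under `api_keys`):

```bash
# Send the configured key in the X-API-Key header
curl -H "X-API-Key: <your-api-key>" http://localhost:8080/v1/models
```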

### Linux (Recommended)

#### Prerequisites