diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..057dcf1e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,182 @@
+# syntax=docker/dockerfile:1.7-labs
+
+########################################
+# Kolosal Server – CUDA Docker image
+# - Multi-stage: build (devel) -> runtime (slim)
+# - Defaults to GPU (CUDA) build
+# - Defaults to config_basic.yaml, falling back to config_rms.yaml
+# - Copies only required runtime bits
+########################################
+
+ARG CUDA_VERSION=12.4.1
+ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+FROM ${BASE_IMAGE} AS build
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG TZ=UTC
+ARG BUILD_TYPE=Release
+ARG ENABLE_CUDA=ON
+ARG ENABLE_NATIVE_OPTIMIZATION=OFF
+ARG USE_PODOFO=ON
+ARG CMAKE_VERSION=3.27.9
+
+ENV TZ=${TZ} \
+    CC=gcc \
+    CXX=g++ \
+    BUILD_TYPE=${BUILD_TYPE}
+
+# Build dependencies (system CURL required by inference/CMakeLists on Linux)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential git pkg-config ca-certificates curl \
+    cmake ninja-build ccache \
+    libcurl4-openssl-dev libssl-dev libbz2-dev \
+    libomp-dev libblas-dev liblapack-dev \
+    # PDF (PoDoFo) optional deps – safe to install even if disabled
+    libfreetype6-dev libjpeg-dev libpng-dev libtiff-dev libxml2-dev libfontconfig1-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade to pinned CMake (>=3.23 required by PoDoFo)
+RUN set -eux; \
+    ver="$(cmake --version 2>/dev/null | awk 'NR==1{print $3}' || true)"; \
+    need="${CMAKE_VERSION}"; \
+    if [ "$ver" != "$need" ]; then \
+        cd /tmp; \
+        curl -fsSL -o cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz; \
+        tar -xf cmake.tar.gz; \
+        cp -r cmake-${CMAKE_VERSION}-linux-x86_64/bin/* /usr/local/bin/; \
+        cp -r cmake-${CMAKE_VERSION}-linux-x86_64/share/cmake* /usr/local/share/ || true; \
+        rm -rf cmake-* cmake.tar.gz; \
+    fi; \
+    cmake --version
+
+# Speed up rebuilds
+ENV PATH=/usr/lib/ccache:${PATH} \
+    CCACHE_DIR=/root/.ccache \
+    CCACHE_MAXSIZE=1G
+
+WORKDIR /src
+
+# Copy repository (rely on .dockerignore to keep context small)
+COPY . .
+
+# Initialize submodules when available (no-op if not a git context)
+RUN if [ -d .git ]; then git submodule update --init --recursive; else echo "No .git directory – skipping submodules"; fi
+
+# Ensure llama.cpp source is present (fallback when submodules are not in context)
+RUN set -eux; \
+    if [ ! -f inference/external/llama.cpp/CMakeLists.txt ] && [ ! -f external/llama.cpp/CMakeLists.txt ]; then \
+        echo "[Docker build] llama.cpp not found in repo – cloning shallow copy..."; \
+        mkdir -p inference/external; \
+        git clone --depth=1 https://github.com/ggerganov/llama.cpp.git inference/external/llama.cpp; \
+    else \
+        echo "[Docker build] Found llama.cpp sources in repo"; \
+    fi
+
+# Configure & build (CUDA by default)
+RUN set -eux; \
+    cmake -S . -B build -G Ninja \
+        -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+        -DCMAKE_C_COMPILER=${CC} -DCMAKE_CXX_COMPILER=${CXX} \
+        -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \
+        -DUSE_CUDA=${ENABLE_CUDA} \
+        -DUSE_PODOFO=${USE_PODOFO} \
+        -DGGML_CUDA_NO_VMM=ON \
+        -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so \
+        -DBUILD_TESTING=OFF \
+        -DBUILD_INFERENCE_TESTS=OFF; \
+    cmake --build build --config ${BUILD_TYPE}
+
+# Verify the single-config output dir (project places binaries under build/<BUILD_TYPE>)
+RUN set -eux; \
+    OUTDIR="build/${BUILD_TYPE}"; \
+    test -x "${OUTDIR}/kolosal-server" || { echo "Build output not found at ${OUTDIR}"; ls -la build || true; exit 1; };
+
+# Collect runtime payload
+RUN set -eux; \
+    OUTDIR="build/${BUILD_TYPE}"; \
+    strip -s "${OUTDIR}/kolosal-server" || true; \
+    mkdir -p /out/bin /out/config /out/libs /out/licenses; \
+    cp "${OUTDIR}/kolosal-server" /out/bin/; \
+    # Prefer basic (vanilla) config by default, then fall back
+    if [ -f configs/config_basic.yaml ]; then \
+        cp configs/config_basic.yaml /out/config/config_basic.yaml; \
+        cp configs/config_basic.yaml /out/config/config.yaml; \
+    elif [ -f configs/config_rms.yaml ]; then \
+        cp configs/config_rms.yaml /out/config/config_rms.yaml; \
+        cp configs/config_rms.yaml /out/config/config.yaml; \
+    elif [ -f config_basic.yaml ]; then \
+        cp config_basic.yaml /out/config/config_basic.yaml; \
+        cp config_basic.yaml /out/config/config.yaml; \
+    elif [ -f config_rms.yaml ]; then \
+        cp config_rms.yaml /out/config/config_rms.yaml; \
+        cp config_rms.yaml /out/config/config.yaml; \
+    elif [ -f config.yaml ]; then \
+        cp config.yaml /out/config/config.yaml; \
+    else \
+        echo "No config found; you can mount one at runtime"; \
+    fi; \
+    # Shared libs placed by post-build step alongside the exe
+    for p in libllama-*.so* libkolosal_server.so*; do \
+        if ls "${OUTDIR}/"$p 1>/dev/null 2>&1; then \
+            cp -n "${OUTDIR}/"$p /out/libs/ || true; \
+        fi; \
+    done; \
+    # Non-system dependencies referenced by the binary
+    ldd "${OUTDIR}/kolosal-server" | awk '{for(i=1;i<=NF;i++) if ($i ~ /\//) print $i}' | sort -u > /tmp/libs.txt || true; \
+    while read -r lib; do case "$lib" in /lib/*|/usr/lib/*) ;; *) cp -n "$lib" /out/libs/ 2>/dev/null || true ;; esac; done < /tmp/libs.txt; \
+    cp LICENSE /out/licenses/ 2>/dev/null || true; \
+    echo "Collected libs:"; ls -1 /out/libs || true
+
+########################################
+# Runtime image
+########################################
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime
+
+ARG DEBIAN_FRONTEND=noninteractive
+ENV LD_LIBRARY_PATH=/usr/local/lib:/app/libs:/usr/local/cuda/lib64 \
+    KOL_MODELS_DIR=/app/models
+
+# Minimal runtime deps (keep in sync with ldd if needed)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libcurl4 libssl3 libbz2-1.0 zlib1g libgomp1 ca-certificates curl tini \
+    libblas3 liblapack3 libgfortran5 \
+    # PoDoFo runtime libs (safe if unused)
+    libfreetype6 libjpeg-turbo8 libpng16-16 libtiff5 libxml2 fontconfig \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY --from=build /out/bin/kolosal-server /usr/local/bin/kolosal-server
+COPY --from=build /out/config /app/config
+COPY --from=build /out/libs /app/libs
+COPY --from=build /out/licenses /licenses
+
+# Make inference libs discoverable and refresh linker cache
+RUN set -eux; \
+    mkdir -p /usr/local/lib; \
+    if ls /app/libs/libllama-*.so* 1>/dev/null 2>&1; then cp /app/libs/libllama-*.so* /usr/local/lib/ || true; fi; \
+    if ls /app/libs/libkolosal_server.so* 1>/dev/null 2>&1; then cp /app/libs/libkolosal_server.so* /usr/local/lib/ || true; fi; \
+    ldconfig || true
+
+# Simple entrypoint wrapper – prefer config_basic.yaml, then config_rms.yaml, then config.yaml
+RUN printf '%s\n' '#!/bin/sh' \
+    'set -e' \
+    'CONFIG="/app/config/config_basic.yaml"' \
+    'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config_rms.yaml"; fi' \
+    'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config.yaml"; fi' \
+    'echo "Starting kolosal-server with: $CONFIG"' \
+    'exec kolosal-server --config "$CONFIG"' \
+    > /usr/local/bin/kolosal-entry.sh && chmod +x /usr/local/bin/kolosal-entry.sh
+
+# Non-root runtime
+RUN useradd -r -u 10001 -d /app kolosal && chown -R kolosal:kolosal /app
+USER kolosal
+
+VOLUME ["/app/models", "/app/data"]
+
+EXPOSE 8080
+
+HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
+    CMD curl -fsS http://localhost:8080/v1/health || exit 1
+
+ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/kolosal-entry.sh"]
diff --git a/README.md b/README.md
index e83b5782..a3a056e3 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,172 @@ A high-performance inference server for large language models with OpenAI-compat
 
 ## Quick Start
 
+### Docker (CUDA GPU)
+
+Prerequisites:
+- NVIDIA GPU + drivers on host
+- NVIDIA Container Toolkit installed
+
+Build (CUDA by default):
+
+```powershell
+docker build -t kolosal-server:cuda . --build-arg BUILD_TYPE=Release --build-arg ENABLE_CUDA=ON
+```
+
+Run on GPU (the image defaults to configs/config_basic.yaml, falling back to config_rms.yaml):
+
+```powershell
+# expose port 8080; mount models dir (optional)
+docker run --rm --gpus all -p 8080:8080 -v ${PWD}\models:/app/models kolosal-server:cuda
+```
+
+Use a custom config by mounting your local `configs` folder (the entrypoint picks `config_basic.yaml`, then `config_rms.yaml`, then `config.yaml`):
+
+```powershell
+docker run --rm --gpus all -p 8080:8080 `
+  -v ${PWD}\models:/app/models `
+  -v ${PWD}\configs:/app/config `
+  kolosal-server:cuda
+```
+
+Health check:
+
+```powershell
+curl http://localhost:8080/v1/health
+```
+
+### Docker (prebuilt images from GHCR)
+
+Use a ready-made image from GitHub Container Registry so you don’t rebuild on every VM.
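+
+If you prefer Docker Compose on the VM, the sketch below mirrors the `docker run` commands in the subsections that follow (a sketch only: it assumes the same image tag, the NVIDIA Container Toolkit on the host, and the config baked into the image; adjust tags, paths, and GPU settings to your environment):
+
+```yaml
+# docker-compose.yml (illustrative sketch; not a file shipped in this repo)
+services:
+  kolosal-server:
+    image: ghcr.io/kolosalai/kolosal-server:v0.0.1
+    restart: unless-stopped
+    ports:
+      - "8080:8080"
+    volumes:
+      - ./models:/app/models   # persist models on the host
+      # - ./configs/config_basic.yaml:/app/config/config_basic.yaml:ro   # optional: your own config
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+```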
+
+#### Prerequisites
+
+- NVIDIA GPU + drivers on the host
+- NVIDIA Container Toolkit installed (for `--gpus all`)
+- Open port 8080/tcp on your VM firewall/security group if accessing remotely
+
+#### Pull the image
+
+Windows PowerShell:
+
+```powershell
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+If your package is private, log in first with a Personal Access Token (scopes: `read:packages`):
+
+```powershell
+$env:GHCR_TOKEN = "YOUR_GHCR_TOKEN"
+echo $env:GHCR_TOKEN | docker login ghcr.io -u YOUR_GITHUB_USERNAME --password-stdin
+```
+
+#### Run (GPU)
+
+Minimal run (exposes 8080 and runs with GPU):
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Mount a models directory (recommended):
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped `
+  -p 8080:8080 `
+  -v C:\kolosal\models:/app/models `
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped \
+  -p 8080:8080 \
+  -v $PWD/models:/app/models \
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+#### Choose a config
+
+The image's entrypoint picks a config in this order:
+
+- Prefer `/app/config/config_basic.yaml` (embedding-only “vanilla”)
+- Fall back to `/app/config/config_rms.yaml`
+- Fall back to `/app/config/config.yaml`
+
+To use your own config, bind-mount it over `/app/config/config_basic.yaml` (the file the entrypoint prefers):
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped `
+  -p 8080:8080 `
+  -v ${PWD}\configs\config_basic.yaml:/app/config/config_basic.yaml:ro `
+  -v ${PWD}\models:/app/models `
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped \
+  -p 8080:8080 \
+  -v $PWD/configs/config_basic.yaml:/app/config/config_basic.yaml:ro \
+  -v $PWD/models:/app/models \
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Note: Ensure `server.allow_public_access: true` in your config if you’ll call the API from outside the container/host.
+
+#### Verify and logs
+
+```powershell
+curl http://localhost:8080/v1/health
+docker logs -f kolosal-server
+```
+
+If a model shows as “unloaded”, that’s expected when `load_immediately` is false; it will load on first use.
+
+#### Persist data
+
+- Models: mount a host folder to `/app/models`
+- App data/cache: mount a host folder to `/app/data` if desired
+
+#### Update or rollback
+
+```powershell
+# Update to a new tag
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.2
+docker rm -f kolosal-server
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.2
+
+# Rollback to previous
+docker rm -f kolosal-server
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+#### Troubleshooting
+
+- Connection reset: confirm `-p 8080:8080` and `server.allow_public_access: true` in the config.
+- GPU not used: ensure NVIDIA drivers + NVIDIA Container Toolkit; run with `--gpus all`; check `nvidia-smi` on the host.
+- Missing models: mount a models directory to `/app/models` or use paths/URLs in your config.
+- 404s: check path (`/v1/health`, `/v1/models`, etc.).
+- Authentication: set `auth.enabled` and `api_keys` in your config and send `X-API-Key` header.
+
 ### Linux (Recommended)
 
 #### Prerequisites
diff --git a/configs/config_basic.yaml b/configs/config_basic.yaml
new file mode 100644
index 00000000..1956607e
--- /dev/null
+++ b/configs/config_basic.yaml
@@ -0,0 +1,102 @@
+# Minimal config for deployments that only need embeddings into Qdrant.
+# Assumptions:
+# - Exposes HTTP on 8080 and binds to all interfaces.
+# - Only an embedding model is loaded (no LLMs).
+# - Qdrant is reachable at the configured host/port (use "localhost" on a single VM,
+#   or the service DNS name like "qdrant" when running in Kubernetes/Helm).
+
+server:
+  port: "8080"
+  host: "0.0.0.0"
+  idle_timeout: 300
+  allow_public_access: true
+  allow_internet_access: false
+
+logging:
+  level: INFO
+  file: ""
+  access_log: false
+  quiet_mode: false
+  show_request_details: false
+
+auth:
+  enabled: true
+  require_api_key: false
+  api_key_header: X-API-Key
+  api_keys:
+    - change_me_if_enabled
+  rate_limit:
+    enabled: true
+    max_requests: 100
+    window_size: 60
+  cors:
+    enabled: true
+    allow_credentials: false
+    max_age: 86400
+    allowed_origins: ["*"]
+    allowed_methods: [GET, POST, PUT, DELETE, OPTIONS, HEAD, PATCH]
+    allowed_headers: [Content-Type, Authorization, X-Requested-With, Accept, Origin]
+
+# Disable internet search for this minimal embedding-only setup
+search:
+  enabled: false
+
+database:
+  qdrant:
+    enabled: true
+    host: "localhost"   # On K8s/Helm, set to the Qdrant service name, e.g., "qdrant"
+    port: 6333
+    collection_name: "documents"
+    default_embedding_model: "qwen3-embedding-4b"
+    timeout: 30
+    api_key: ""
+    max_connections: 10
+    connection_timeout: 5
+
+inference_engines:
+  - name: llama-cuda
+    library_path: /usr/local/lib/libllama-cuda.so
+    version: 1.0.0
+    description: CUDA-accelerated inference engine for embeddings
+    load_on_startup: true
+  - name: llama-cpu
+    library_path: /usr/local/lib/libllama-cpu.so
+    version: 1.0.0
+    description: CPU fallback (optional)
+    load_on_startup: false
+
+default_inference_engine: llama-cuda
+
+features:
+  health_check: true
+  metrics: true
+
+embedding_autoscaling:
+  enabled: true
+  min_instances: 1
+  max_instances: 4
+  scale_up_threshold: 10
+  scale_down_threshold: 2
+  scale_up_delay: 30
+  scale_down_delay: 300
+  check_interval: 15
+
+# Only the embedding model is defined here; no LLMs.
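+#
+# Example request against this config once the server is running (a sketch: it assumes
+# the server exposes an OpenAI-compatible /v1/embeddings route on port 8080 and that
+# require_api_key stays false; send an X-API-Key header if you enable it):
+#
+#   curl -s http://localhost:8080/v1/embeddings \
+#     -H "Content-Type: application/json" \
+#     -d '{"model": "qwen3-embedding-4b", "input": "hello world"}'
+#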
+models:
+  - id: qwen3-embedding-4b
+    path: https://huggingface.co/kolosal/qwen3-embedding-4b/resolve/main/Qwen3-Embedding-4B-Q4_K_M.gguf
+    type: embedding
+    load_immediately: true
+    main_gpu_id: 0
+    inference_engine: llama-cuda
+    load_params:
+      n_ctx: 4096
+      n_keep: 1024
+      use_mmap: true
+      use_mlock: false
+      n_parallel: 1
+      cont_batching: true
+      warmup: false
+      n_gpu_layers: 100   # Set to -1 to offload all layers if VRAM allows
+      n_batch: 2048
+      n_ubatch: 512
diff --git a/configs/config_rms.yaml b/configs/config_rms.yaml
index 555ac174..836eb9ed 100644
--- a/configs/config_rms.yaml
+++ b/configs/config_rms.yaml
@@ -1,9 +1,9 @@
 # Server Configuration
 server:
-  port: "8084"                    # Port number to run the server on
+  port: "8080"                    # Port number to run the server on (Docker exposes 8080)
   host: "0.0.0.0"                 # Host to bind the server; 0.0.0.0 means all available interfaces
   idle_timeout: 300               # Idle timeout in seconds
-  allow_public_access: false      # Allow access from other devices on the same network
+  allow_public_access: true       # Allow access from other devices on the same network
   allow_internet_access: false    # Allow internet access (requires proper port forwarding)
 
 # Logging Configuration
@@ -54,7 +54,7 @@ auth:
 # Search Integration (e.g., with SearXNG)
 search:
   enabled: true
-  searxng_url: http://localhost:8090    # URL of SearXNG or compatible search engine
+  searxng_url: https://searx.stream     # URL of SearXNG or compatible search engine
   timeout: 30                           # Search timeout in seconds
   max_results: 20                       # Maximum number of results returned
   default_engine: ""                    # Optional default search engine
@@ -79,14 +79,22 @@ database:
 
 # Inference Engine Definitions
 inference_engines:
+  # CUDA (GPU) engine used inside Docker/Linux runtime
+  - name: llama-cuda
+    library_path: /usr/local/lib/libllama-cuda.so   # Path inside Docker image; present at runtime
+    version: 1.0.0
+    description: NVIDIA CUDA-accelerated inference engine for LLaMA models
+    load_on_startup: true
+
+  # Optional CPU fallback (useful for Windows dev; will be ignored in Docker if not present)
   - name: llama-cpu
-    library_path: ./build/Release/llama-cpu.dll   # Path to the inference engine library
+    library_path: /usr/local/lib/libllama-cpu.so
     version: 1.0.0
     description: CPU-based inference engine for LLaMA models
-    load_on_startup: true   # Whether to load this engine when the server starts
+    load_on_startup: false
 
 # Default inference engine to use
-default_inference_engine: llama-cpu
+default_inference_engine: llama-cuda
 
 # General feature toggles
 features:
@@ -111,7 +119,7 @@ models:
     type: embedding
     load_immediately: true
     main_gpu_id: 0
-    inference_engine: llama-cpu
+    inference_engine: llama-cuda
     load_params:
       n_ctx: 4096
       n_keep: 1024
@@ -120,7 +128,7 @@ models:
       n_parallel: 1
       cont_batching: true
      warmup: false
-      n_gpu_layers: 100
+      n_gpu_layers: 100   # Increase or set to -1 to offload all layers if VRAM allows
       n_batch: 2048
       n_ubatch: 512
 
@@ -129,7 +137,7 @@ models:
     type: llm
     load_immediately: true
     main_gpu_id: 0
-    inference_engine: llama-cpu
+    inference_engine: llama-cuda
     load_params:
       n_ctx: 2048
       n_keep: 1024
@@ -138,6 +146,6 @@ models:
       n_parallel: 1
       cont_batching: true
       warmup: false
-      n_gpu_layers: 100
+      n_gpu_layers: 100   # Increase or set to -1 to offload all layers if VRAM allows
       n_batch: 2048
       n_ubatch: 512
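
With `llama-cuda` now the default engine in both configs, it is worth confirming that containers on this host can actually see the GPU before debugging the server itself. A minimal check (a sketch: it assumes the NVIDIA Container Toolkit is installed, that pulling `nvidia/cuda:12.4.1-base-ubuntu22.04` is acceptable, and the container name `kolosal-server` from the README examples):

```bash
# 1. Verify the host toolkit can pass the GPU into any container
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi

# 2. Verify the running kolosal-server container sees the same GPU
docker exec kolosal-server nvidia-smi
```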