diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..057dcf1e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,182 @@
+# syntax=docker/dockerfile:1.7-labs
+
+########################################
+# Kolosal Server – CUDA Docker image
+# - Multi-stage: build (devel) -> runtime (slim)
+# - Defaults to GPU (CUDA) build
+# - Defaults to config_basic.yaml, falling back to config_rms.yaml
+# - Copies only required runtime bits
+########################################
+
+ARG CUDA_VERSION=12.4.1
+ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+FROM ${BASE_IMAGE} AS build
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG TZ=UTC
+ARG BUILD_TYPE=Release
+ARG ENABLE_CUDA=ON
+ARG ENABLE_NATIVE_OPTIMIZATION=OFF
+ARG USE_PODOFO=ON
+ARG CMAKE_VERSION=3.27.9
+
+ENV TZ=${TZ} \
+    CC=gcc \
+    CXX=g++ \
+    BUILD_TYPE=${BUILD_TYPE}
+
+# Build dependencies (system CURL required by inference/CMakeLists on Linux)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential git pkg-config ca-certificates curl \
+    cmake ninja-build ccache \
+    libcurl4-openssl-dev libssl-dev libbz2-dev \
+    libomp-dev libblas-dev liblapack-dev \
+    # PDF (PoDoFo) optional deps – safe to install even if disabled
+    libfreetype6-dev libjpeg-dev libpng-dev libtiff-dev libxml2-dev libfontconfig1-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade to pinned CMake (>=3.23 required by PoDoFo)
+RUN set -eux; \
+    ver="$(cmake --version 2>/dev/null | awk 'NR==1{print $3}' || true)"; \
+    need="${CMAKE_VERSION}"; \
+    if [ "$ver" != "$need" ]; then \
+        cd /tmp; \
+        curl -fsSL -o cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz; \
+        tar -xf cmake.tar.gz; \
+        cp -r cmake-${CMAKE_VERSION}-linux-x86_64/bin/* /usr/local/bin/; \
+        cp -r cmake-${CMAKE_VERSION}-linux-x86_64/share/cmake* /usr/local/share/ || true; \
+        rm -rf cmake-* cmake.tar.gz; \
+    fi; \
+    cmake --version
+
+# Speed up rebuilds
+ENV PATH=/usr/lib/ccache:${PATH} \
+    CCACHE_DIR=/root/.ccache \
+    CCACHE_MAXSIZE=1G
+
+WORKDIR /src
+
+# Copy repository (rely on .dockerignore to keep context small)
+COPY . .
+
+# Initialize submodules when available (no-op if not a git context)
+RUN if [ -d .git ]; then git submodule update --init --recursive; else echo "No .git directory – skipping submodules"; fi
+
+# Ensure llama.cpp source is present (fallback when submodules are not in context)
+RUN set -eux; \
+    if [ ! -f inference/external/llama.cpp/CMakeLists.txt ] && [ ! -f external/llama.cpp/CMakeLists.txt ]; then \
+        echo "[Docker build] llama.cpp not found in repo – cloning shallow copy..."; \
+        mkdir -p inference/external; \
+        git clone --depth=1 https://github.com/ggerganov/llama.cpp.git inference/external/llama.cpp; \
+    else \
+        echo "[Docker build] Found llama.cpp sources in repo"; \
+    fi
+
+# Configure & build (CUDA by default)
+RUN set -eux; \
+    cmake -S . -B build -G Ninja \
+        -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+        -DCMAKE_C_COMPILER=${CC} -DCMAKE_CXX_COMPILER=${CXX} \
+        -DENABLE_NATIVE_OPTIMIZATION=${ENABLE_NATIVE_OPTIMIZATION} \
+        -DUSE_CUDA=${ENABLE_CUDA} \
+        -DUSE_PODOFO=${USE_PODOFO} \
+        -DGGML_CUDA_NO_VMM=ON \
+        -DCUDA_CUDA_LIBRARY=/usr/local/cuda/lib64/stubs/libcuda.so \
+        -DBUILD_TESTING=OFF \
+        -DBUILD_INFERENCE_TESTS=OFF; \
+    cmake --build build --config ${BUILD_TYPE}
+
+# Verify the single-config output dir (project places binaries under build/<BUILD_TYPE>)
+RUN set -eux; \
+    OUTDIR="build/${BUILD_TYPE}"; \
+    test -x "${OUTDIR}/kolosal-server" || { echo "Build output not found at ${OUTDIR}"; ls -la build || true; exit 1; };
+
+# Collect runtime payload
+RUN set -eux; \
+    OUTDIR="build/${BUILD_TYPE}"; \
+    strip -s "${OUTDIR}/kolosal-server" || true; \
+    mkdir -p /out/bin /out/config /out/libs /out/licenses; \
+    cp "${OUTDIR}/kolosal-server" /out/bin/; \
+    # Prefer basic (vanilla) config by default, then fall back
+    if [ -f configs/config_basic.yaml ]; then \
+        cp configs/config_basic.yaml /out/config/config_basic.yaml; \
+        cp configs/config_basic.yaml /out/config/config.yaml; \
+    elif [ -f configs/config_rms.yaml ]; then \
+        cp configs/config_rms.yaml /out/config/config_rms.yaml; \
+        cp configs/config_rms.yaml /out/config/config.yaml; \
+    elif [ -f config_basic.yaml ]; then \
+        cp config_basic.yaml /out/config/config_basic.yaml; \
+        cp config_basic.yaml /out/config/config.yaml; \
+    elif [ -f config_rms.yaml ]; then \
+        cp config_rms.yaml /out/config/config_rms.yaml; \
+        cp config_rms.yaml /out/config/config.yaml; \
+    elif [ -f config.yaml ]; then \
+        cp config.yaml /out/config/config.yaml; \
+    else \
+        echo "No config found; you can mount one at runtime"; \
+    fi; \
+    # Shared libs placed by post-build step alongside the exe
+    for p in libllama-*.so* libkolosal_server.so*; do \
+        if ls "${OUTDIR}/"$p 1>/dev/null 2>&1; then \
+            cp -n "${OUTDIR}/"$p /out/libs/ || true; \
+        fi; \
+    done; \
+    # Non-system dependencies referenced by the binary
+    ldd "${OUTDIR}/kolosal-server" | awk '{for(i=1;i<=NF;i++) if ($i ~ /\//) print $i}' | sort -u > /tmp/libs.txt || true; \
+    while read -r lib; do case "$lib" in /lib/*|/usr/lib/*) ;; *) cp -n "$lib" /out/libs/ 2>/dev/null || true ;; esac; done < /tmp/libs.txt; \
+    cp LICENSE /out/licenses/ 2>/dev/null || true; \
+    echo "Collected libs:"; ls -1 /out/libs || true
+
+########################################
+# Runtime image
+########################################
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu22.04 AS runtime
+
+ARG DEBIAN_FRONTEND=noninteractive
+ENV LD_LIBRARY_PATH=/usr/local/lib:/app/libs:/usr/local/cuda/lib64 \
+    KOL_MODELS_DIR=/app/models
+
+# Minimal runtime deps (keep in sync with ldd if needed)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libcurl4 libssl3 libbz2-1.0 zlib1g libgomp1 ca-certificates curl tini \
+    libblas3 liblapack3 libgfortran5 \
+    # PoDoFo runtime libs (safe if unused)
+    libfreetype6 libjpeg-turbo8 libpng16-16 libtiff5 libxml2 fontconfig \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY --from=build /out/bin/kolosal-server /usr/local/bin/kolosal-server
+COPY --from=build /out/config /app/config
+COPY --from=build /out/libs /app/libs
+COPY --from=build /out/licenses /licenses
+
+# Make inference libs discoverable and refresh linker cache
+RUN set -eux; \
+    mkdir -p /usr/local/lib; \
+    if ls /app/libs/libllama-*.so* 1>/dev/null 2>&1; then cp /app/libs/libllama-*.so* /usr/local/lib/ || true; fi; \
+    if ls /app/libs/libkolosal_server.so* 1>/dev/null 2>&1; then cp /app/libs/libkolosal_server.so* /usr/local/lib/ || true; fi; \
+    ldconfig || true
+
+# Simple entrypoint wrapper – prefer config_basic.yaml, then config_rms.yaml, then config.yaml
+RUN printf '%s\n' '#!/bin/sh' \
+    'set -e' \
+    'CONFIG="/app/config/config_basic.yaml"' \
+    'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config_rms.yaml"; fi' \
+    'if [ ! -f "$CONFIG" ]; then CONFIG="/app/config/config.yaml"; fi' \
+    'echo "Starting kolosal-server with: $CONFIG"' \
+    'exec kolosal-server --config "$CONFIG"' \
+    > /usr/local/bin/kolosal-entry.sh && chmod +x /usr/local/bin/kolosal-entry.sh
+
+# Non-root runtime
+RUN useradd -r -u 10001 -d /app kolosal && chown -R kolosal:kolosal /app
+USER kolosal
+
+VOLUME ["/app/models", "/app/data"]
+
+EXPOSE 8080
+
+HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
+    CMD curl -fsS http://localhost:8080/v1/health || exit 1
+
+ENTRYPOINT ["/usr/bin/tini", "--", "/usr/local/bin/kolosal-entry.sh"]
diff --git a/README.md b/README.md
index e83b5782..a3a056e3 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,172 @@ A high-performance inference server for large language models with OpenAI-compat
 
 ## Quick Start
 
+### Docker (CUDA GPU)
+
+Prerequisites:
+- NVIDIA GPU + drivers on host
+- NVIDIA Container Toolkit installed
+
+Build (CUDA by default):
+
+```powershell
+docker build -t kolosal-server:cuda . --build-arg BUILD_TYPE=Release --build-arg ENABLE_CUDA=ON
+```
+
+Run on GPU (the image defaults to configs/config_basic.yaml, falling back to config_rms.yaml):
+
+```powershell
+# expose port 8080; mount models dir (optional)
+docker run --rm --gpus all -p 8080:8080 -v ${PWD}\models:/app/models kolosal-server:cuda
+```
+
+Use a custom config by mounting your local `configs` folder (the entrypoint picks `config_basic.yaml`, then `config_rms.yaml`, then `config.yaml`):
+
+```powershell
+docker run --rm --gpus all -p 8080:8080 `
+  -v ${PWD}\models:/app/models `
+  -v ${PWD}\configs:/app/config `
+  kolosal-server:cuda
+```
+
+Health check:
+
+```powershell
+curl http://localhost:8080/v1/health
+```
+
+### Docker (prebuilt images from GHCR)
+
+Use a ready-made image from GitHub Container Registry so you don’t rebuild on every VM.
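+
+If you prefer Docker Compose on the VM, the sketch below mirrors the `docker run` commands in the subsections that follow (a sketch only: it assumes the same image tag, the NVIDIA Container Toolkit on the host, and the config baked into the image; adjust tags, paths, and GPU settings to your environment):
+
+```yaml
+# docker-compose.yml (illustrative sketch; not a file shipped in this repo)
+services:
+  kolosal-server:
+    image: ghcr.io/kolosalai/kolosal-server:v0.0.1
+    restart: unless-stopped
+    ports:
+      - "8080:8080"
+    volumes:
+      - ./models:/app/models   # persist models on the host
+      # - ./configs/config_basic.yaml:/app/config/config_basic.yaml:ro   # optional: your own config
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+```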
+
+#### Prerequisites
+
+- NVIDIA GPU + drivers on the host
+- NVIDIA Container Toolkit installed (for `--gpus all`)
+- Open port 8080/tcp on your VM firewall/security group if accessing remotely
+
+#### Pull the image
+
+Windows PowerShell:
+
+```powershell
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+If your package is private, log in first with a Personal Access Token (scopes: `read:packages`):
+
+```powershell
+$env:GHCR_TOKEN = "YOUR_GHCR_TOKEN"
+echo $env:GHCR_TOKEN | docker login ghcr.io -u YOUR_GITHUB_USERNAME --password-stdin
+```
+
+#### Run (GPU)
+
+Minimal run (exposes 8080 and runs with GPU):
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Mount a models directory (recommended):
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped `
+  -p 8080:8080 `
+  -v C:\kolosal\models:/app/models `
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped \
+  -p 8080:8080 \
+  -v $PWD/models:/app/models \
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+#### Choose a config
+
+The image's entrypoint picks a config in this order:
+
+- Prefer `/app/config/config_basic.yaml` (embedding-only “vanilla”)
+- Fall back to `/app/config/config_rms.yaml`
+- Fall back to `/app/config/config.yaml`
+
+To use your own config, bind-mount it over `/app/config/config_basic.yaml` (the file the entrypoint prefers):
+
+Windows PowerShell:
+
+```powershell
+docker run -d --name kolosal-server --gpus all --restart unless-stopped `
+  -p 8080:8080 `
+  -v ${PWD}\configs\config_basic.yaml:/app/config/config_basic.yaml:ro `
+  -v ${PWD}\models:/app/models `
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Linux/macOS:
+
+```bash
+docker run -d --name kolosal-server --gpus all --restart unless-stopped \
+  -p 8080:8080 \
+  -v $PWD/configs/config_basic.yaml:/app/config/config_basic.yaml:ro \
+  -v $PWD/models:/app/models \
+  ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+Note: Ensure `server.allow_public_access: true` in your config if you’ll call the API from outside the container/host.
+
+#### Verify and logs
+
+```powershell
+curl http://localhost:8080/v1/health
+docker logs -f kolosal-server
+```
+
+If a model shows as “unloaded”, that’s expected when `load_immediately` is false; it will load on first use.
+
+#### Persist data
+
+- Models: mount a host folder to `/app/models`
+- App data/cache: mount a host folder to `/app/data` if desired
+
+#### Update or rollback
+
+```powershell
+# Update to a new tag
+docker pull ghcr.io/kolosalai/kolosal-server:v0.0.2
+docker rm -f kolosal-server
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.2
+
+# Rollback to previous
+docker rm -f kolosal-server
+docker run -d --name kolosal-server --gpus all --restart unless-stopped -p 8080:8080 ghcr.io/kolosalai/kolosal-server:v0.0.1
+```
+
+#### Troubleshooting
+
+- Connection reset: confirm `-p 8080:8080` and `server.allow_public_access: true` in the config.
+- GPU not used: ensure NVIDIA drivers + NVIDIA Container Toolkit; run with `--gpus all`; check `nvidia-smi` on the host.
+- Missing models: mount a models directory to `/app/models` or use paths/URLs in your config.
+- 404s: check path (`/v1/health`, `/v1/models`, etc.).
+- Authentication: set `auth.enabled` and `api_keys` in your config and send `X-API-Key` header.
+
 ### Linux (Recommended)
 
 #### Prerequisites
diff --git a/configs/config_basic.yaml b/configs/config_basic.yaml
new file mode 100644
index 00000000..1956607e
--- /dev/null
+++ b/configs/config_basic.yaml
@@ -0,0 +1,102 @@
+# Minimal config for deployments that only need embeddings into Qdrant.
+# Assumptions:
+# - Exposes HTTP on 8080 and binds to all interfaces.
+# - Only an embedding model is loaded (no LLMs).
+# - Qdrant is reachable at the configured host/port (use "localhost" on a single VM,
+#   or the service DNS name like "qdrant" when running in Kubernetes/Helm).
+
+server:
+  port: "8080"
+  host: "0.0.0.0"
+  idle_timeout: 300
+  allow_public_access: true
+  allow_internet_access: false
+
+logging:
+  level: INFO
+  file: ""
+  access_log: false
+  quiet_mode: false
+  show_request_details: false
+
+auth:
+  enabled: true
+  require_api_key: false
+  api_key_header: X-API-Key
+  api_keys:
+    - change_me_if_enabled
+  rate_limit:
+    enabled: true
+    max_requests: 100
+    window_size: 60
+  cors:
+    enabled: true
+    allow_credentials: false
+    max_age: 86400
+    allowed_origins: ["*"]
+    allowed_methods: [GET, POST, PUT, DELETE, OPTIONS, HEAD, PATCH]
+    allowed_headers: [Content-Type, Authorization, X-Requested-With, Accept, Origin]
+
+# Disable internet search for this minimal embedding-only setup
+search:
+  enabled: false
+
+database:
+  qdrant:
+    enabled: true
+    host: "localhost"   # On K8s/Helm, set to the Qdrant service name, e.g., "qdrant"
+    port: 6333
+    collection_name: "documents"
+    default_embedding_model: "qwen3-embedding-4b"
+    timeout: 30
+    api_key: ""
+    max_connections: 10
+    connection_timeout: 5
+
+inference_engines:
+  - name: llama-cuda
+    library_path: /usr/local/lib/libllama-cuda.so
+    version: 1.0.0
+    description: CUDA-accelerated inference engine for embeddings
+    load_on_startup: true
+  - name: llama-cpu
+    library_path: /usr/local/lib/libllama-cpu.so
+    version: 1.0.0
+    description: CPU fallback (optional)
+    load_on_startup: false
+
+default_inference_engine: llama-cuda
+
+features:
+  health_check: true
+  metrics: true
+
+embedding_autoscaling:
+  enabled: true
+  min_instances: 1
+  max_instances: 4
+  scale_up_threshold: 10
+  scale_down_threshold: 2
+  scale_up_delay: 30
+  scale_down_delay: 300
+  check_interval: 15
+
+# Only the embedding model is defined here; no LLMs.
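+#
+# Example request against this config once the server is running (a sketch: it assumes
+# the server exposes an OpenAI-compatible /v1/embeddings route on port 8080 and that
+# require_api_key stays false; send an X-API-Key header if you enable it):
+#
+#   curl -s http://localhost:8080/v1/embeddings \
+#     -H "Content-Type: application/json" \
+#     -d '{"model": "qwen3-embedding-4b", "input": "hello world"}'
+#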
+models:
+  - id: qwen3-embedding-4b
+    path: https://huggingface.co/kolosal/qwen3-embedding-4b/resolve/main/Qwen3-Embedding-4B-Q4_K_M.gguf
+    type: embedding
+    load_immediately: true
+    main_gpu_id: 0
+    inference_engine: llama-cuda
+    load_params:
+      n_ctx: 4096
+      n_keep: 1024
+      use_mmap: true
+      use_mlock: false
+      n_parallel: 1
+      cont_batching: true
+      warmup: false
+      n_gpu_layers: 100   # Set to -1 to offload all layers if VRAM allows
+      n_batch: 2048
+      n_ubatch: 512
diff --git a/configs/config_rms.yaml b/configs/config_rms.yaml
index 555ac174..836eb9ed 100644
--- a/configs/config_rms.yaml
+++ b/configs/config_rms.yaml
@@ -1,9 +1,9 @@
 # Server Configuration
 server:
-  port: "8084"                    # Port number to run the server on
+  port: "8080"                    # Port number to run the server on (Docker exposes 8080)
   host: "0.0.0.0"                 # Host to bind the server; 0.0.0.0 means all available interfaces
   idle_timeout: 300               # Idle timeout in seconds
-  allow_public_access: false      # Allow access from other devices on the same network
+  allow_public_access: true       # Allow access from other devices on the same network
   allow_internet_access: false    # Allow internet access (requires proper port forwarding)
 
 # Logging Configuration
@@ -54,7 +54,7 @@ auth:
 # Search Integration (e.g., with SearXNG)
 search:
   enabled: true
-  searxng_url: http://localhost:8090    # URL of SearXNG or compatible search engine
+  searxng_url: https://searx.stream     # URL of SearXNG or compatible search engine
   timeout: 30                           # Search timeout in seconds
   max_results: 20                       # Maximum number of results returned
   default_engine: ""                    # Optional default search engine
@@ -79,14 +79,22 @@ database:
 
 # Inference Engine Definitions
 inference_engines:
+  # CUDA (GPU) engine used inside Docker/Linux runtime
+  - name: llama-cuda
+    library_path: /usr/local/lib/libllama-cuda.so   # Path inside Docker image; present at runtime
+    version: 1.0.0
+    description: NVIDIA CUDA-accelerated inference engine for LLaMA models
+    load_on_startup: true
+
+  # Optional CPU fallback (useful for Windows dev; will be ignored in Docker if not present)
   - name: llama-cpu
-    library_path: ./build/Release/llama-cpu.dll   # Path to the inference engine library
+    library_path: /usr/local/lib/libllama-cpu.so
     version: 1.0.0
     description: CPU-based inference engine for LLaMA models
-    load_on_startup: true   # Whether to load this engine when the server starts
+    load_on_startup: false
 
 # Default inference engine to use
-default_inference_engine: llama-cpu
+default_inference_engine: llama-cuda
 
 # General feature toggles
 features:
@@ -111,7 +119,7 @@ models:
     type: embedding
     load_immediately: true
     main_gpu_id: 0
-    inference_engine: llama-cpu
+    inference_engine: llama-cuda
     load_params:
       n_ctx: 4096
       n_keep: 1024
@@ -120,7 +128,7 @@ models:
       n_parallel: 1
       cont_batching: true
      warmup: false
-      n_gpu_layers: 100
+      n_gpu_layers: 100   # Increase or set to -1 to offload all layers if VRAM allows
       n_batch: 2048
       n_ubatch: 512
 
@@ -129,7 +137,7 @@ models:
     type: llm
     load_immediately: true
     main_gpu_id: 0
-    inference_engine: llama-cpu
+    inference_engine: llama-cuda
     load_params:
       n_ctx: 2048
       n_keep: 1024
@@ -138,6 +146,6 @@ models:
       n_parallel: 1
       cont_batching: true
       warmup: false
-      n_gpu_layers: 100
+      n_gpu_layers: 100   # Increase or set to -1 to offload all layers if VRAM allows
       n_batch: 2048
       n_ubatch: 512
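
With `llama-cuda` now the default engine in both configs, it is worth confirming that containers on this host can actually see the GPU before debugging the server itself. A minimal check (a sketch: it assumes the NVIDIA Container Toolkit is installed, that pulling `nvidia/cuda:12.4.1-base-ubuntu22.04` is acceptable, and the container name `kolosal-server` from the README examples):

```bash
# 1. Verify the host toolkit can pass the GPU into any container
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi

# 2. Verify the running kolosal-server container sees the same GPU
docker exec kolosal-server nvidia-smi
```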