diff --git a/inference-engine/llamacpp/Dockerfile b/inference-engine/llamacpp/Dockerfile
new file mode 100644
index 00000000..ac2011a1
--- /dev/null
+++ b/inference-engine/llamacpp/Dockerfile
@@ -0,0 +1,7 @@
+# syntax=docker/dockerfile:1
+
+FROM scratch
+ARG TARGETOS
+ARG TARGETARCH
+ARG ACCEL
+COPY --from=release-artifacts /com.docker.llama-server.native.$TARGETOS.$ACCEL.$TARGETARCH /com.docker.llama-server.native.$TARGETOS.$ACCEL.$TARGETARCH
diff --git a/inference-engine/llamacpp/Makefile b/inference-engine/llamacpp/Makefile
new file mode 100644
index 00000000..d61de267
--- /dev/null
+++ b/inference-engine/llamacpp/Makefile
@@ -0,0 +1,90 @@
+ifeq ($(OS),Windows_NT)
+    DETECTED_OS := Windows
+else
+    UNAME_S := $(shell uname -s)
+    ifeq ($(UNAME_S),Linux)
+        DETECTED_OS := Linux
+    endif
+    ifeq ($(UNAME_S),Darwin)
+        DETECTED_OS := macOS
+    endif
+endif
+
+BUILD_DIR := build
+INSTALL_DIR := install
+NATIVE_DIR := native
+
+.PHONY: build clean install-deps install-dir
+
+build: install-deps
+ifeq ($(DETECTED_OS),macOS)
+	@echo "Building for macOS..."
+	@echo "Configuring CMake..."
+	cmake -B $(BUILD_DIR) \
+		-DCMAKE_CXX_COMPILER=clang++ \
+		-DCMAKE_C_COMPILER=clang \
+		-DCMAKE_BUILD_TYPE=Release \
+		-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3 \
+		-DCMAKE_MACOSX_RPATH=ON \
+		-DCMAKE_INSTALL_RPATH='@executable_path/../lib' \
+		-DGGML_NATIVE=OFF \
+		-DGGML_OPENMP=OFF \
+		-DLLAMA_CURL=OFF \
+		-GNinja \
+		-S $(NATIVE_DIR)
+	@echo "Building..."
+	cmake --build $(BUILD_DIR) --config Release
+	@echo "Installing..."
+	cmake --install $(BUILD_DIR) \
+		--config Release \
+		--prefix $(INSTALL_DIR)
+	@echo "Cleaning install directory..."
+	rm -rf $(INSTALL_DIR)/lib/cmake
+	rm -rf $(INSTALL_DIR)/lib/pkgconfig
+	rm -rf $(INSTALL_DIR)/include
+	@echo "Build complete! Binaries are in $(INSTALL_DIR)"
+else ifeq ($(DETECTED_OS),Linux)
+	@echo "Linux build not implemented yet"
+	@exit 1
+else ifeq ($(DETECTED_OS),Windows)
+	@echo "Windows build not implemented yet"
+	@exit 1
+else
+	@echo "Unsupported OS: $(DETECTED_OS)"
+	@exit 1
+endif
+
+install-deps:
+ifeq ($(DETECTED_OS),macOS)
+	@echo "Installing build dependencies for macOS..."
+	@if ! command -v ninja >/dev/null 2>&1; then \
+		echo "Installing Ninja..."; \
+		brew install ninja; \
+	else \
+		echo "Ninja already installed"; \
+	fi
+else ifeq ($(DETECTED_OS),Linux)
+	@echo "Linux dependency installation not implemented yet"
+	@exit 1
+else ifeq ($(DETECTED_OS),Windows)
+	@echo "Windows dependency installation not implemented yet"
+	@exit 1
+else
+	@echo "Unsupported OS: $(DETECTED_OS)"
+	@exit 1
+endif
+
+clean:
+	rm -rf $(BUILD_DIR)
+	rm -rf $(INSTALL_DIR)
+
+install-dir:
+	@echo "$(INSTALL_DIR)"
+
+help:
+	@echo "Available targets:"
+	@echo "  build        - Build llama.cpp (macOS only for now)"
+	@echo "  install-deps - Install build dependencies"
+	@echo "  install-dir  - Print install directory path"
+	@echo "  clean        - Clean build artifacts"
+	@echo "  help         - Show this help"
diff --git a/inference-engine/llamacpp/README.md b/inference-engine/llamacpp/README.md
new file mode 100644
index 00000000..e147612d
--- /dev/null
+++ b/inference-engine/llamacpp/README.md
@@ -0,0 +1,6 @@
+# llama.cpp inference runtime
+
+This repo contains implementations of the llama.cpp inference runtime.
+
+* native/ - contains an implementation based on `llama.cpp`'s native server
+  implementation
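The top-level Dockerfile above does no compilation itself; it only copies a prebuilt server tree out of a separate `release-artifacts` build context into a scratch image, so it is typically driven through `docker buildx` with a named context. A rough sketch of such an invocation follows; the artifacts directory, image tag, and ACCEL value are assumptions, not part of this change:

    # Package linux/amd64 CPU artifacts supplied via a named build context.
    docker buildx build \
        --platform linux/amd64 \
        --build-arg ACCEL=cpu \
        --build-context release-artifacts=./artifacts \
        -f Dockerfile \
        -t llama-server-packaging:cpu .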
diff --git a/inference-engine/llamacpp/native/.gitignore b/inference-engine/llamacpp/native/.gitignore
new file mode 100644
index 00000000..567609b1
--- /dev/null
+++ b/inference-engine/llamacpp/native/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/inference-engine/llamacpp/native/CMakeLists.txt b/inference-engine/llamacpp/native/CMakeLists.txt
new file mode 100644
index 00000000..e369c73f
--- /dev/null
+++ b/inference-engine/llamacpp/native/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.13)
+
+project(
+  com.docker.llama-server.native
+  DESCRIPTION "DD inference server, based on llama.cpp native server"
+  LANGUAGES C CXX
+)
+
+option(DDLLAMA_BUILD_SERVER "Build the DD llama.cpp server executable" ON)
+option(DDLLAMA_BUILD_UTILS "Build utilities, e.g. nv-gpu-info" OFF)
+set(DDLLAMA_PATCH_COMMAND "patch" CACHE STRING "patch command")
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+if (DDLLAMA_BUILD_SERVER)
+  set(LLAMA_BUILD_COMMON ON)
+  add_subdirectory(vendor/llama.cpp)
+  add_subdirectory(vendor/llama.cpp/tools/mtmd)
+  add_subdirectory(src/server)
+endif()
+
+if (WIN32 AND DDLLAMA_BUILD_UTILS)
+  add_subdirectory(src/nv-gpu-info)
+endif()
diff --git a/inference-engine/llamacpp/native/README.md b/inference-engine/llamacpp/native/README.md
new file mode 100644
index 00000000..de7d54f6
--- /dev/null
+++ b/inference-engine/llamacpp/native/README.md
@@ -0,0 +1,10 @@
+# Native llama-server for DD
+
+## Building
+
+    cmake -B build
+    cmake --build build --parallel 8 --config Release
+
+## Running
+
+    DD_INF_UDS=<path to unix socket> ./build/bin/com.docker.llama-server --model <path to model>
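The native CMake project gates the server and the Windows-only utilities behind the two DDLLAMA_* options. A minimal configure sketch, run from inside `native/` and assuming the Release settings used elsewhere in this change:

    # Build only the server (the default); utilities stay off.
    cmake -B build -DCMAKE_BUILD_TYPE=Release -DDDLLAMA_BUILD_SERVER=ON -DDDLLAMA_BUILD_UTILS=OFF
    cmake --build build --parallel 8 --config Release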
diff --git a/inference-engine/llamacpp/native/cuda.Dockerfile b/inference-engine/llamacpp/native/cuda.Dockerfile
new file mode 100644
index 00000000..4c42f813
--- /dev/null
+++ b/inference-engine/llamacpp/native/cuda.Dockerfile
@@ -0,0 +1,53 @@
+# syntax=docker/dockerfile:1
+
+ARG CUDA_VERSION=12.9.0
+ARG CUDA_IMAGE_VARIANT=ubuntu24.04
+
+FROM nvidia/cuda:${CUDA_VERSION}-devel-${CUDA_IMAGE_VARIANT} AS builder
+
+ARG TARGETARCH
+ARG CUDA_IMAGE_VARIANT
+
+COPY native/install-clang.sh .
+RUN ./install-clang.sh "${CUDA_IMAGE_VARIANT}"
+
+WORKDIR /llama-server
+
+COPY .git .git
+COPY native/CMakeLists.txt .
+COPY native/src src
+COPY native/vendor vendor
+
+# Fix submodule .git file to point to correct location in container
+RUN echo "gitdir: ../../.git/modules/native/vendor/llama.cpp" > vendor/llama.cpp/.git && \
+    sed -i 's|worktree = ../../../../../native/vendor/llama.cpp|worktree = /llama-server/vendor/llama.cpp|' .git/modules/native/vendor/llama.cpp/config
+
+ENV CC=/usr/bin/clang-20
+ENV CXX=/usr/bin/clang++-20
+RUN echo "-B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DBUILD_SHARED_LIBS=ON \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON \
+    -DGGML_NATIVE=OFF \
+    -DGGML_OPENMP=OFF \
+    -DGGML_CUDA=ON \
+    -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+    -DLLAMA_CURL=OFF \
+    -GNinja \
+    -S ." > cmake-flags
+RUN cmake $(cat cmake-flags)
+RUN cmake --build build --config Release
+RUN cmake --install build --config Release --prefix install
+
+RUN rm install/bin/*.py
+RUN rm -r install/lib/cmake
+RUN rm -r install/lib/pkgconfig
+RUN rm -r install/include
+
+FROM scratch AS final
+
+ARG TARGETARCH
+ARG CUDA_VERSION
+
+COPY --from=builder /llama-server/install /com.docker.llama-server.native.linux.cuda$CUDA_VERSION.$TARGETARCH
diff --git a/inference-engine/llamacpp/native/generic.Dockerfile b/inference-engine/llamacpp/native/generic.Dockerfile
new file mode 100644
index 00000000..f55f4550
--- /dev/null
+++ b/inference-engine/llamacpp/native/generic.Dockerfile
@@ -0,0 +1,62 @@
+# syntax=docker/dockerfile:1
+
+ARG BASE_IMAGE=ubuntu:25.10
+
+FROM ${BASE_IMAGE} AS builder
+
+ARG TARGETARCH
+
+RUN apt-get update && apt-get install -y cmake ninja-build git build-essential curl
+
+COPY native/install-vulkan.sh .
+RUN ./install-vulkan.sh
+
+ENV VULKAN_SDK=/opt/vulkan
+ENV PATH=$VULKAN_SDK/bin:$PATH
+ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib
+ENV CMAKE_PREFIX_PATH=$VULKAN_SDK
+ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig
+
+WORKDIR /llama-server
+
+COPY .git .git
+COPY native/CMakeLists.txt .
+COPY native/src src
+COPY native/vendor vendor
+
+# Fix submodule .git file to point to correct location in container
+RUN echo "gitdir: ../../.git/modules/native/vendor/llama.cpp" > vendor/llama.cpp/.git && \
+    sed -i 's|worktree = ../../../../../native/vendor/llama.cpp|worktree = /llama-server/vendor/llama.cpp|' .git/modules/native/vendor/llama.cpp/config
+
+RUN echo "-B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_NATIVE=OFF \
+    -DGGML_OPENMP=OFF \
+    -DLLAMA_CURL=OFF \
+    -DGGML_VULKAN=ON \
+    -GNinja \
+    -S ." > cmake-flags
+RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+        echo " -DBUILD_SHARED_LIBS=ON \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON" >> cmake-flags; \
+    elif [ "${TARGETARCH}" = "arm64" ]; then \
+        echo " -DBUILD_SHARED_LIBS=OFF" >> cmake-flags; \
+    else \
+        echo "${TARGETARCH} is not supported"; \
+        exit 1; \
+    fi
+RUN cmake $(cat cmake-flags)
+RUN cmake --build build --config Release -j 4
+RUN cmake --install build --config Release --prefix install
+
+RUN rm install/bin/*.py
+RUN rm -r install/lib/cmake
+RUN rm -r install/lib/pkgconfig
+RUN rm -r install/include
+
+FROM scratch AS final
+
+ARG TARGETARCH
+
+COPY --from=builder /llama-server/install /com.docker.llama-server.native.linux.cpu.$TARGETARCH
diff --git a/inference-engine/llamacpp/native/install-clang.sh b/inference-engine/llamacpp/native/install-clang.sh
new file mode 100755
index 00000000..a4ae0ab3
--- /dev/null
+++ b/inference-engine/llamacpp/native/install-clang.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+main() {
+    set -eux -o pipefail
+
+    apt-get update && apt-get install -y cmake ninja-build git wget gnupg2
+    wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
+
+    if [ "$1" = "ubuntu22.04" ]; then
+        echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-20 main" >> /etc/apt/sources.list
+        echo "deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-20 main" >> /etc/apt/sources.list
+    elif [ "$1" = "ubuntu24.04" ]; then
+        echo "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-20 main" >> /etc/apt/sources.list
+        echo "deb-src http://apt.llvm.org/noble/ llvm-toolchain-noble-20 main" >> /etc/apt/sources.list
+    else
+        echo "distro variant not supported yet"
+        exit 1
+    fi
+
+    apt-get update && apt-get install -y clang-20 lldb-20 lld-20
+}
+
+main "$@"
+
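Both engine Dockerfiles expect the repository checkout (including `.git` and the vendored llama.cpp submodule) as the build context and leave their output under `/com.docker.llama-server.native.*` in a scratch image. A hedged example of driving them from `inference-engine/llamacpp`; the image tags are placeholders:

    # CPU/Vulkan image for amd64, then a CUDA image with the default 12.9.0 toolkit.
    docker buildx build -f native/generic.Dockerfile --platform linux/amd64 -t llama-server-native:cpu .
    docker buildx build -f native/cuda.Dockerfile --platform linux/amd64 --build-arg CUDA_VERSION=12.9.0 -t llama-server-native:cuda .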
diff --git a/inference-engine/llamacpp/native/install-vulkan.sh b/inference-engine/llamacpp/native/install-vulkan.sh
new file mode 100755
index 00000000..989b4088
--- /dev/null
+++ b/inference-engine/llamacpp/native/install-vulkan.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+main() {
+    set -eux -o pipefail
+
+    apt-get install -y glslc libvulkan-dev
+}
+
+main "$@"
+
diff --git a/inference-engine/llamacpp/native/src/nv-gpu-info/CMakeLists.txt b/inference-engine/llamacpp/native/src/nv-gpu-info/CMakeLists.txt
new file mode 100644
index 00000000..f7289c9e
--- /dev/null
+++ b/inference-engine/llamacpp/native/src/nv-gpu-info/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(TARGET com.docker.nv-gpu-info)
+
+add_library(nvapi STATIC IMPORTED)
+set_target_properties(nvapi PROPERTIES
+  IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/vendor/nvapi/amd64/nvapi64.lib"
+  INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/vendor/nvapi"
+)
+
+add_executable(${TARGET} nv-gpu-info.c)
+install(TARGETS ${TARGET} RUNTIME)
+
+target_link_libraries(${TARGET} nvapi)
\ No newline at end of file
diff --git a/inference-engine/llamacpp/native/src/nv-gpu-info/nv-gpu-info.c b/inference-engine/llamacpp/native/src/nv-gpu-info/nv-gpu-info.c
new file mode 100644
index 00000000..4ae9e07e
--- /dev/null
+++ b/inference-engine/llamacpp/native/src/nv-gpu-info/nv-gpu-info.c
@@ -0,0 +1,75 @@
+#include <stdio.h>
+#include "nvapi.h"
+
+#pragma comment(lib, "nvapi64.lib")
+int main() {
+    NvAPI_Status status = NVAPI_OK;
+    NvAPI_ShortString error_str = { 0 };
+
+    status = NvAPI_Initialize();
+    if (status != NVAPI_OK) {
+        NvAPI_GetErrorMessage(status, error_str);
+        printf("Failed to initialise NVAPI: %s\n", error_str);
+        return -1;
+    }
+
+    NvU32 driver_version;
+    NvAPI_ShortString build_branch;
+
+    status = NvAPI_SYS_GetDriverAndBranchVersion(&driver_version, build_branch);
+    if (status != NVAPI_OK) {
+        NvAPI_GetErrorMessage(status, error_str);
+        printf("Failed to retrieve driver info: %s\n", error_str);
+        return -1;
+    }
+
+    printf("driver version: %u\n", driver_version);
+    printf("build branch string: %s\n", build_branch);
+
+    NV_PHYSICAL_GPUS_V1 nvPhysicalGPUs = { 0 };
+    nvPhysicalGPUs.version = NV_PHYSICAL_GPUS_VER1;
+
+    status = NvAPI_SYS_GetPhysicalGPUs(&nvPhysicalGPUs);
+    if (status != NVAPI_OK) {
+        NvAPI_GetErrorMessage(status, error_str);
+        printf("Failed to retrieve physical GPU descriptors: %s\n", error_str);
+        return -1;
+    }
+
+    for (NvU32 i = 0; i < nvPhysicalGPUs.gpuHandleCount; i++) {
+        NvPhysicalGpuHandle gpu = nvPhysicalGPUs.gpuHandleData[i].hPhysicalGpu;
+
+        NvAPI_ShortString gpu_name = { 0 };
+        status = NvAPI_GPU_GetFullName(gpu, gpu_name);
+        if (status == NVAPI_OK) {
+            printf("GPU[%d]: full name: %s\n", i, gpu_name);
+        } else {
+            printf("GPU[%d]: full name: error\n", i);
+        }
+
+        NvU32 devid;
+        NvU32 subsysid;
+        NvU32 revid;
+        NvU32 extid;
+        status = NvAPI_GPU_GetPCIIdentifiers(gpu, &devid, &subsysid, &revid, &extid);
+        if (status == NVAPI_OK) {
+            printf("GPU[%d]: pci ids: device_id: 0x%04x; subsystem_id: 0x%04x; revision_id: 0x%04x; ext_device_id: 0x%04x\n",
+                i, devid, subsysid, revid, extid);
+        } else {
+            printf("GPU[%d]: pci ids: error\n", i);
+        }
+
+        NV_GPU_MEMORY_INFO_EX_V1 nvMemoryInfo = { 0 };
+        nvMemoryInfo.version = NV_GPU_MEMORY_INFO_EX_VER_1;
+
+        status = NvAPI_GPU_GetMemoryInfoEx(gpu, &nvMemoryInfo);
+        if (status == NVAPI_OK) {
+            printf("GPU[%d]: dedicated memory: %lld\n",
+                i, nvMemoryInfo.dedicatedVideoMemory);
+        } else {
+            printf("GPU[%d]: dedicated memory: error\n", i);
+        }
+    }
+
+    return 0;
+}
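The nv-gpu-info probe is only added on Windows when DDLLAMA_BUILD_UTILS is enabled, and it links the vendored static NVAPI import library. A sketch of how it might be configured and built there; the generator choice is an assumption:

    # From native/ on Windows, build just the GPU probe.
    cmake -B build -G "Visual Studio 17 2022" -DDDLLAMA_BUILD_UTILS=ON -DDDLLAMA_BUILD_SERVER=OFF
    cmake --build build --config Release --target com.docker.nv-gpu-info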
diff --git a/inference-engine/llamacpp/promote-rc.Dockerfile b/inference-engine/llamacpp/promote-rc.Dockerfile
new file mode 100644
index 00000000..957935da
--- /dev/null
+++ b/inference-engine/llamacpp/promote-rc.Dockerfile
@@ -0,0 +1,5 @@
+# syntax=docker/dockerfile:1
+
+ARG BASE_IMAGE
+
+FROM ${BASE_IMAGE}
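promote-rc.Dockerfile re-bases on whatever BASE_IMAGE is supplied, so an already-published release-candidate image can be retagged as the final release without rebuilding it. A hedged example; the registry and tags are placeholders:

    # Promote an RC image to its release tag.
    docker buildx build -f promote-rc.Dockerfile \
        --build-arg BASE_IMAGE=registry.example.com/llama-server:1.2.3-rc1 \
        -t registry.example.com/llama-server:1.2.3 .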