diff --git a/inference-engine/llamacpp/Dockerfile b/inference-engine/llamacpp/Dockerfile
new file mode 100644
index 00000000..ac2011a1
--- /dev/null
+++ b/inference-engine/llamacpp/Dockerfile
@@ -0,0 +1,7 @@
+# syntax=docker/dockerfile:1
+
+FROM scratch
+ARG TARGETOS
+ARG TARGETARCH
+ARG ACCEL
+COPY --from=release-artifacts /com.docker.llama-server.native.$TARGETOS.$ACCEL.$TARGETARCH /com.docker.llama-server.native.$TARGETOS.$ACCEL.$TARGETARCH
diff --git a/inference-engine/llamacpp/Makefile b/inference-engine/llamacpp/Makefile
new file mode 100644
index 00000000..d61de267
--- /dev/null
+++ b/inference-engine/llamacpp/Makefile
@@ -0,0 +1,90 @@
+ifeq ($(OS),Windows_NT)
+    DETECTED_OS := Windows
+else
+    UNAME_S := $(shell uname -s)
+    ifeq ($(UNAME_S),Linux)
+        DETECTED_OS := Linux
+    endif
+    ifeq ($(UNAME_S),Darwin)
+        DETECTED_OS := macOS
+    endif
+endif
+
+BUILD_DIR := build
+INSTALL_DIR := install
+NATIVE_DIR := native
+
+.PHONY: build clean install-deps install-dir
+
+build: install-deps
+ifeq ($(DETECTED_OS),macOS)
+	@echo "Building for macOS..."
+	@echo "Configuring CMake..."
+	cmake -B $(BUILD_DIR) \
+		-DCMAKE_CXX_COMPILER=clang++ \
+		-DCMAKE_C_COMPILER=clang \
+		-DCMAKE_BUILD_TYPE=Release \
+		-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3 \
+		-DCMAKE_MACOSX_RPATH=ON \
+		-DCMAKE_INSTALL_RPATH='@executable_path/../lib' \
+		-DGGML_NATIVE=OFF \
+		-DGGML_OPENMP=OFF \
+		-DLLAMA_CURL=OFF \
+		-GNinja \
+		-S $(NATIVE_DIR)
+	@echo "Building..."
+	cmake --build $(BUILD_DIR) --config Release
+	@echo "Installing..."
+	cmake --install $(BUILD_DIR) \
+		--config Release \
+		--prefix $(INSTALL_DIR)
+	@echo "Cleaning install directory..."
+	rm -rf $(INSTALL_DIR)/lib/cmake
+	rm -rf $(INSTALL_DIR)/lib/pkgconfig
+	rm -rf $(INSTALL_DIR)/include
+	@echo "Build complete! Binaries are in $(INSTALL_DIR)"
+else ifeq ($(DETECTED_OS),Linux)
+	@echo "Linux build not implemented yet"
+	@exit 1
+else ifeq ($(DETECTED_OS),Windows)
+	@echo "Windows build not implemented yet"
+	@exit 1
+else
+	@echo "Unsupported OS: $(DETECTED_OS)"
+	@exit 1
+endif
+
+install-deps:
+ifeq ($(DETECTED_OS),macOS)
+	@echo "Installing build dependencies for macOS..."
+	@if ! command -v ninja >/dev/null 2>&1; then \
+		echo "Installing Ninja..."; \
+		brew install ninja; \
+	else \
+		echo "Ninja already installed"; \
+	fi
+else ifeq ($(DETECTED_OS),Linux)
+	@echo "Linux dependency installation not implemented yet"
+	@exit 1
+else ifeq ($(DETECTED_OS),Windows)
+	@echo "Windows dependency installation not implemented yet"
+	@exit 1
+else
+	@echo "Unsupported OS: $(DETECTED_OS)"
+	@exit 1
+endif
+
+clean:
+	rm -rf $(BUILD_DIR)
+	rm -rf $(INSTALL_DIR)
+
+install-dir:
+	@echo "$(INSTALL_DIR)"
+
+help:
+	@echo "Available targets:"
+	@echo "  build        - Build llama.cpp (macOS only for now)"
+	@echo "  install-deps - Install build dependencies"
+	@echo "  install-dir  - Print install directory path"
+	@echo "  clean        - Clean build artifacts"
+	@echo "  help         - Show this help"
diff --git a/inference-engine/llamacpp/README.md b/inference-engine/llamacpp/README.md
new file mode 100644
index 00000000..e147612d
--- /dev/null
+++ b/inference-engine/llamacpp/README.md
@@ -0,0 +1,6 @@
+# llama.cpp inference runtime
+
+This repo contains implementations of the llama.cpp inference runtime.
+
+* native/ - contains an implementation based on `llama.cpp`'s native server
+  implementation
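The top-level Dockerfile above does no compilation itself; it only copies a prebuilt server tree out of a separate `release-artifacts` build context into a scratch image, so it is typically driven through `docker buildx` with a named context. A rough sketch of such an invocation follows; the artifacts directory, image tag, and ACCEL value are assumptions, not part of this change:

    # Package linux/amd64 CPU artifacts supplied via a named build context.
    docker buildx build \
        --platform linux/amd64 \
        --build-arg ACCEL=cpu \
        --build-context release-artifacts=./artifacts \
        -f Dockerfile \
        -t llama-server-packaging:cpu .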
diff --git a/inference-engine/llamacpp/native/.gitignore b/inference-engine/llamacpp/native/.gitignore
new file mode 100644
index 00000000..567609b1
--- /dev/null
+++ b/inference-engine/llamacpp/native/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/inference-engine/llamacpp/native/CMakeLists.txt b/inference-engine/llamacpp/native/CMakeLists.txt
new file mode 100644
index 00000000..e369c73f
--- /dev/null
+++ b/inference-engine/llamacpp/native/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.13)
+
+project(
+  com.docker.llama-server.native
+  DESCRIPTION "DD inference server, based on llama.cpp native server"
+  LANGUAGES C CXX
+)
+
+option(DDLLAMA_BUILD_SERVER "Build the DD llama.cpp server executable" ON)
+option(DDLLAMA_BUILD_UTILS "Build utilities, e.g. nv-gpu-info" OFF)
+set(DDLLAMA_PATCH_COMMAND "patch" CACHE STRING "patch command")
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+if (DDLLAMA_BUILD_SERVER)
+  set(LLAMA_BUILD_COMMON ON)
+  add_subdirectory(vendor/llama.cpp)
+  add_subdirectory(vendor/llama.cpp/tools/mtmd)
+  add_subdirectory(src/server)
+endif()
+
+if (WIN32 AND DDLLAMA_BUILD_UTILS)
+  add_subdirectory(src/nv-gpu-info)
+endif()
diff --git a/inference-engine/llamacpp/native/README.md b/inference-engine/llamacpp/native/README.md
new file mode 100644
index 00000000..de7d54f6
--- /dev/null
+++ b/inference-engine/llamacpp/native/README.md
@@ -0,0 +1,10 @@
+# Native llama-server for DD
+
+## Building
+
+    cmake -B build
+    cmake --build build --parallel 8 --config Release
+
+## Running
+
+    DD_INF_UDS=<path to unix socket> ./build/bin/com.docker.llama-server --model <path to model>
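The native CMake project gates the server and the Windows-only utilities behind the two DDLLAMA_* options. A minimal configure sketch, run from inside `native/` and assuming the Release settings used elsewhere in this change:

    # Build only the server (the default); utilities stay off.
    cmake -B build -DCMAKE_BUILD_TYPE=Release -DDDLLAMA_BUILD_SERVER=ON -DDDLLAMA_BUILD_UTILS=OFF
    cmake --build build --parallel 8 --config Release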
diff --git a/inference-engine/llamacpp/native/cuda.Dockerfile b/inference-engine/llamacpp/native/cuda.Dockerfile
new file mode 100644
index 00000000..4c42f813
--- /dev/null
+++ b/inference-engine/llamacpp/native/cuda.Dockerfile
@@ -0,0 +1,53 @@
+# syntax=docker/dockerfile:1
+
+ARG CUDA_VERSION=12.9.0
+ARG CUDA_IMAGE_VARIANT=ubuntu24.04
+
+FROM nvidia/cuda:${CUDA_VERSION}-devel-${CUDA_IMAGE_VARIANT} AS builder
+
+ARG TARGETARCH
+ARG CUDA_IMAGE_VARIANT
+
+COPY native/install-clang.sh .
+RUN ./install-clang.sh "${CUDA_IMAGE_VARIANT}"
+
+WORKDIR /llama-server
+
+COPY .git .git
+COPY native/CMakeLists.txt .
+COPY native/src src
+COPY native/vendor vendor
+
+# Fix submodule .git file to point to correct location in container
+RUN echo "gitdir: ../../.git/modules/native/vendor/llama.cpp" > vendor/llama.cpp/.git && \
+    sed -i 's|worktree = ../../../../../native/vendor/llama.cpp|worktree = /llama-server/vendor/llama.cpp|' .git/modules/native/vendor/llama.cpp/config
+
+ENV CC=/usr/bin/clang-20
+ENV CXX=/usr/bin/clang++-20
+RUN echo "-B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DBUILD_SHARED_LIBS=ON \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON \
+    -DGGML_NATIVE=OFF \
+    -DGGML_OPENMP=OFF \
+    -DGGML_CUDA=ON \
+    -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+    -DLLAMA_CURL=OFF \
+    -GNinja \
+    -S ." > cmake-flags
+RUN cmake $(cat cmake-flags)
+RUN cmake --build build --config Release
+RUN cmake --install build --config Release --prefix install
+
+RUN rm install/bin/*.py
+RUN rm -r install/lib/cmake
+RUN rm -r install/lib/pkgconfig
+RUN rm -r install/include
+
+FROM scratch AS final
+
+ARG TARGETARCH
+ARG CUDA_VERSION
+
+COPY --from=builder /llama-server/install /com.docker.llama-server.native.linux.cuda$CUDA_VERSION.$TARGETARCH
diff --git a/inference-engine/llamacpp/native/generic.Dockerfile b/inference-engine/llamacpp/native/generic.Dockerfile
new file mode 100644
index 00000000..f55f4550
--- /dev/null
+++ b/inference-engine/llamacpp/native/generic.Dockerfile
@@ -0,0 +1,62 @@
+# syntax=docker/dockerfile:1
+
+ARG BASE_IMAGE=ubuntu:25.10
+
+FROM ${BASE_IMAGE} AS builder
+
+ARG TARGETARCH
+
+RUN apt-get update && apt-get install -y cmake ninja-build git build-essential curl
+
+COPY native/install-vulkan.sh .
+RUN ./install-vulkan.sh
+
+ENV VULKAN_SDK=/opt/vulkan
+ENV PATH=$VULKAN_SDK/bin:$PATH
+ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib
+ENV CMAKE_PREFIX_PATH=$VULKAN_SDK
+ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig
+
+WORKDIR /llama-server
+
+COPY .git .git
+COPY native/CMakeLists.txt .
+COPY native/src src
+COPY native/vendor vendor
+
+# Fix submodule .git file to point to correct location in container
+RUN echo "gitdir: ../../.git/modules/native/vendor/llama.cpp" > vendor/llama.cpp/.git && \
+    sed -i 's|worktree = ../../../../../native/vendor/llama.cpp|worktree = /llama-server/vendor/llama.cpp|' .git/modules/native/vendor/llama.cpp/config
+
+RUN echo "-B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_NATIVE=OFF \
+    -DGGML_OPENMP=OFF \
+    -DLLAMA_CURL=OFF \
+    -DGGML_VULKAN=ON \
+    -GNinja \
+    -S ." > cmake-flags
+RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+        echo " -DBUILD_SHARED_LIBS=ON \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON" >> cmake-flags; \
+    elif [ "${TARGETARCH}" = "arm64" ]; then \
+        echo " -DBUILD_SHARED_LIBS=OFF" >> cmake-flags; \
+    else \
+        echo "${TARGETARCH} is not supported"; \
+        exit 1; \
+    fi
+RUN cmake $(cat cmake-flags)
+RUN cmake --build build --config Release -j 4
+RUN cmake --install build --config Release --prefix install
+
+RUN rm install/bin/*.py
+RUN rm -r install/lib/cmake
+RUN rm -r install/lib/pkgconfig
+RUN rm -r install/include
+
+FROM scratch AS final
+
+ARG TARGETARCH
+
+COPY --from=builder /llama-server/install /com.docker.llama-server.native.linux.cpu.$TARGETARCH
diff --git a/inference-engine/llamacpp/native/install-clang.sh b/inference-engine/llamacpp/native/install-clang.sh
new file mode 100755
index 00000000..a4ae0ab3
--- /dev/null
+++ b/inference-engine/llamacpp/native/install-clang.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+main() {
+    set -eux -o pipefail
+
+    apt-get update && apt-get install -y cmake ninja-build git wget gnupg2
+    wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
+
+    if [ "$1" = "ubuntu22.04" ]; then
+        echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-20 main" >> /etc/apt/sources.list
+        echo "deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-20 main" >> /etc/apt/sources.list
+    elif [ "$1" = "ubuntu24.04" ]; then
+        echo "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-20 main" >> /etc/apt/sources.list
+        echo "deb-src http://apt.llvm.org/noble/ llvm-toolchain-noble-20 main" >> /etc/apt/sources.list
+    else
+        echo "distro variant not supported yet"
+        exit 1
+    fi
+
+    apt-get update && apt-get install -y clang-20 lldb-20 lld-20
+}
+
+main "$@"
+
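Both engine Dockerfiles expect the repository checkout (including `.git` and the vendored llama.cpp submodule) as the build context and leave their output under `/com.docker.llama-server.native.*` in a scratch image. A hedged example of driving them from `inference-engine/llamacpp`; the image tags are placeholders:

    # CPU/Vulkan image for amd64, then a CUDA image with the default 12.9.0 toolkit.
    docker buildx build -f native/generic.Dockerfile --platform linux/amd64 -t llama-server-native:cpu .
    docker buildx build -f native/cuda.Dockerfile --platform linux/amd64 --build-arg CUDA_VERSION=12.9.0 -t llama-server-native:cuda .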
diff --git a/inference-engine/llamacpp/native/install-vulkan.sh b/inference-engine/llamacpp/native/install-vulkan.sh
new file mode 100755
index 00000000..989b4088
--- /dev/null
+++ b/inference-engine/llamacpp/native/install-vulkan.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+main() {
+    set -eux -o pipefail
+
+    apt-get install -y glslc libvulkan-dev
+}
+
+main "$@"
+
diff --git a/inference-engine/llamacpp/native/src/nv-gpu-info/CMakeLists.txt b/inference-engine/llamacpp/native/src/nv-gpu-info/CMakeLists.txt
new file mode 100644
index 00000000..f7289c9e
--- /dev/null
+++ b/inference-engine/llamacpp/native/src/nv-gpu-info/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(TARGET com.docker.nv-gpu-info)
+
+add_library(nvapi STATIC IMPORTED)
+set_target_properties(nvapi PROPERTIES
+  IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/vendor/nvapi/amd64/nvapi64.lib"
+  INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/vendor/nvapi"
+)
+
+add_executable(${TARGET} nv-gpu-info.c)
+install(TARGETS ${TARGET} RUNTIME)
+
+target_link_libraries(${TARGET} nvapi)
\ No newline at end of file
diff --git a/inference-engine/llamacpp/native/src/nv-gpu-info/nv-gpu-info.c b/inference-engine/llamacpp/native/src/nv-gpu-info/nv-gpu-info.c
new file mode 100644
index 00000000..4ae9e07e
--- /dev/null
+++ b/inference-engine/llamacpp/native/src/nv-gpu-info/nv-gpu-info.c
@@ -0,0 +1,75 @@
+#include <stdio.h>
+#include "nvapi.h"
+
+#pragma comment(lib, "nvapi64.lib")
+int main() {
+    NvAPI_Status status = NVAPI_OK;
+    NvAPI_ShortString error_str = { 0 };
+
+    status = NvAPI_Initialize();
+    if (status != NVAPI_OK) {
+        NvAPI_GetErrorMessage(status, error_str);
+        printf("Failed to initialise NVAPI: %s\n", error_str);
+        return -1;
+    }
+
+    NvU32 driver_version;
+    NvAPI_ShortString build_branch;
+
+    status = NvAPI_SYS_GetDriverAndBranchVersion(&driver_version, build_branch);
+    if (status != NVAPI_OK) {
+        NvAPI_GetErrorMessage(status, error_str);
+        printf("Failed to retrieve driver info: %s\n", error_str);
+        return -1;
+    }
+
+    printf("driver version: %u\n", driver_version);
+    printf("build branch string: %s\n", build_branch);
+
+    NV_PHYSICAL_GPUS_V1 nvPhysicalGPUs = { 0 };
+    nvPhysicalGPUs.version = NV_PHYSICAL_GPUS_VER1;
+
+    status = NvAPI_SYS_GetPhysicalGPUs(&nvPhysicalGPUs);
+    if (status != NVAPI_OK) {
+        NvAPI_GetErrorMessage(status, error_str);
+        printf("Failed to retrieve physical GPU descriptors: %s\n", error_str);
+        return -1;
+    }
+
+    for (NvU32 i = 0; i < nvPhysicalGPUs.gpuHandleCount; i++) {
+        NvPhysicalGpuHandle gpu = nvPhysicalGPUs.gpuHandleData[i].hPhysicalGpu;
+
+        NvAPI_ShortString gpu_name = { 0 };
+        status = NvAPI_GPU_GetFullName(gpu, gpu_name);
+        if (status == NVAPI_OK) {
+            printf("GPU[%d]: full name: %s\n", i, gpu_name);
+        } else {
+            printf("GPU[%d]: full name: error\n", i);
+        }
+
+        NvU32 devid;
+        NvU32 subsysid;
+        NvU32 revid;
+        NvU32 extid;
+        status = NvAPI_GPU_GetPCIIdentifiers(gpu, &devid, &subsysid, &revid, &extid);
+        if (status == NVAPI_OK) {
+            printf("GPU[%d]: pci ids: device_id: 0x%04x; subsystem_id: 0x%04x; revision_id: 0x%04x; ext_device_id: 0x%04x\n",
+                i, devid, subsysid, revid, extid);
+        } else {
+            printf("GPU[%d]: pci ids: error\n", i);
+        }
+
+        NV_GPU_MEMORY_INFO_EX_V1 nvMemoryInfo = { 0 };
+        nvMemoryInfo.version = NV_GPU_MEMORY_INFO_EX_VER_1;
+
+        status = NvAPI_GPU_GetMemoryInfoEx(gpu, &nvMemoryInfo);
+        if (status == NVAPI_OK) {
+            printf("GPU[%d]: dedicated memory: %lld\n",
+                i, nvMemoryInfo.dedicatedVideoMemory);
+        } else {
+            printf("GPU[%d]: dedicated memory: error\n", i);
+        }
+    }
+
+    return 0;
+}
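The nv-gpu-info probe is only added on Windows when DDLLAMA_BUILD_UTILS is enabled, and it links the vendored static NVAPI import library. A sketch of how it might be configured and built there; the generator choice is an assumption:

    # From native/ on Windows, build just the GPU probe.
    cmake -B build -G "Visual Studio 17 2022" -DDDLLAMA_BUILD_UTILS=ON -DDDLLAMA_BUILD_SERVER=OFF
    cmake --build build --config Release --target com.docker.nv-gpu-info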
diff --git a/inference-engine/llamacpp/promote-rc.Dockerfile b/inference-engine/llamacpp/promote-rc.Dockerfile
new file mode 100644
index 00000000..957935da
--- /dev/null
+++ b/inference-engine/llamacpp/promote-rc.Dockerfile
@@ -0,0 +1,5 @@
+# syntax=docker/dockerfile:1
+
+ARG BASE_IMAGE
+
+FROM ${BASE_IMAGE}
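promote-rc.Dockerfile re-bases on whatever BASE_IMAGE is supplied, so an already-published release-candidate image can be retagged as the final release without rebuilding it. A hedged example; the registry and tags are placeholders:

    # Promote an RC image to its release tag.
    docker buildx build -f promote-rc.Dockerfile \
        --build-arg BASE_IMAGE=registry.example.com/llama-server:1.2.3-rc1 \
        -t registry.example.com/llama-server:1.2.3 .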