TensorRT-LLM backend bump to latest version + misc fixes #2791

Merged: 30 commits, Dec 13, 2024

Commits
0f17415
misc(cmake) update dependencies
mfuntowicz Nov 18, 2024
7a81040
feat(hardware) enable new hardware.hpp and unittests
mfuntowicz Nov 18, 2024
1830fe8
test(ctest) enable address sanitizer
mfuntowicz Nov 18, 2024
3a2698f
feat(backend): initial rewrite of the backend for simplicity
mfuntowicz Nov 18, 2024
6d35657
feat(backend): remove all the logs from hardware.hpp
mfuntowicz Nov 18, 2024
9bb6309
feat(backend): added some logging
mfuntowicz Nov 30, 2024
87272ff
feat(backend): enable compiler warning if support for RVO not applying
mfuntowicz Nov 30, 2024
702dc9c
feat(backend): missing return statement
mfuntowicz Nov 30, 2024
25c6bbe
feat(backend): introduce backend_workspace_t to store precomputed inf…
mfuntowicz Nov 30, 2024
df99164
feat(backend): delete previous backend impl
mfuntowicz Dec 1, 2024
fd7e2b5
feat(backend): more impl
mfuntowicz Dec 1, 2024
71e700a
feat(backend): use latest trtllm main version to have g++ >= 13 compa…
mfuntowicz Dec 1, 2024
879e1a4
feat(backend): allow overriding which Python to use
mfuntowicz Dec 2, 2024
a7bad25
feat(backend): fix backend_exception_t -> backend_error_t naming
mfuntowicz Dec 2, 2024
2f8634e
feat(backend): impl missing generation_step_t as return value of pull…
mfuntowicz Dec 2, 2024
874bc28
feat(backend): make backend_workspace_t::engines_folder constexpr
mfuntowicz Dec 3, 2024
16ba2f5
feat(backend): fix main.rs retrieving the tokenizer
mfuntowicz Dec 3, 2024
c94b9de
feat(backend): add guard to multiple header definitions
mfuntowicz Dec 3, 2024
ad3ed0d
test(backend): add more unittest
mfuntowicz Dec 3, 2024
881527a
feat(backend): remove constexpr from par
mfuntowicz Dec 3, 2024
6253064
feat(backend): remove constexpig
mfuntowicz Dec 3, 2024
cc6bc33
test(backend): more test coverage
mfuntowicz Dec 3, 2024
b6dbf60
chore(trtllm): update dependency towards 0.15.0
mfuntowicz Dec 4, 2024
460f290
effectively cancel the request on the executor
mfuntowicz Dec 4, 2024
300f6c6
feat(backend) fix moving backend when pulling
mfuntowicz Dec 4, 2024
b3cd5ea
feat(backend): make sure we can easily cancel request on the executor
mfuntowicz Dec 5, 2024
049f4ac
feat(backend): fix missing "0" field access
mfuntowicz Dec 5, 2024
f0cd474
misc(backend): fix reborrowing Pin<&mut T> as described in the doc ht…
mfuntowicz Dec 5, 2024
ab6591e
chore: Add doc and CI for TRTLLM (#2799)
Hugoch Dec 11, 2024
1640da7
misc(backend): indent
mfuntowicz Dec 13, 2024
Files changed
10 changes: 10 additions & 0 deletions .github/workflows/build.yaml
@@ -8,6 +8,7 @@ on:
description: Hardware
# options:
# - cuda
# - cuda-trtllm
# - rocm
# - intel
required: true
@@ -52,6 +53,15 @@ jobs:
export platform=""
export extra_pytest=""
;;
cuda-trtllm)
export dockerfile="Dockerfile_trtllm"
export label_extension="-trtllm"
export docker_volume="/mnt/cache"
export docker_devices=""
export runs_on="ubuntu-latest"
export platform=""
export extra_pytest=""
;;
rocm)
export dockerfile="Dockerfile_amd"
export label_extension="-rocm"
2 changes: 1 addition & 1 deletion .github/workflows/ci_build.yaml
@@ -37,7 +37,7 @@ jobs:
# fail-fast is true by default
fail-fast: false
matrix:
hardware: ["cuda", "rocm", "intel-xpu", "intel-cpu"]
hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu"]
uses: ./.github/workflows/build.yaml # calls the one above ^
permissions:
contents: write
55 changes: 1 addition & 54 deletions Cargo.lock

Generated file; diff not rendered.

21 changes: 13 additions & 8 deletions Dockerfile_trtllm
@@ -1,5 +1,5 @@
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
ARG OMPI_VERSION="4.1.6"
ARG OMPI_VERSION="4.1.7rc1"

# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest AS chef
@@ -10,26 +10,29 @@ COPY . .
RUN cargo chef prepare --recipe-path recipe.json

# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt update && apt install -y \
build-essential \
cmake \
curl \
gcc \
g++ \
gcc-14 \
g++-14 \
git \
git-lfs \
libssl-dev \
libucx-dev \
ninja-build \
pkg-config \
pipx \
python3 \
python3-dev \
python3-setuptools \
tar \
wget
wget && \
pipx ensurepath

ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
@@ -83,13 +86,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
cd backends/trtllm && \
CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release

FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \
FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
python3 -m pip install transformers tokenizers
pipx ensurepath && \
pipx install --include-deps transformers tokenizers

WORKDIR /usr/local/tgi/bin

ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
ENV TOKENIZERS_PARALLELISM=false
ENV OMPI_MCA_plm_rsh_agent=""
54 changes: 42 additions & 12 deletions backends/trtllm/CMakeLists.txt
@@ -13,10 +13,11 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
endif ()

project(tgi-trtllm-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD 23)

include(FetchContent)
include(ExternalProject)
include(CheckCXXCompilerFlag)

option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
@@ -29,27 +30,42 @@ set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE ST
find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)

#### External dependencies ####
include(cmake/fmt.cmake)
include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
endif()

# Detect whether the compiler can warn when named return value optimization (NRVO) cannot be applied to a function
check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NRVO)
if(${COMPILER_SUPPORT_WARNING_ON_NRVO})
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wnrvo")
endif()

# Let's build TRTLLM as part of CMake
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")

# Tell CMake not to try to override the RPATH for executorWorker, as it has no information on how to do so
set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)

# TGI TRTLLM Backend definition
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp csrc/backend.cpp)
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_include_directories(tgi_trtllm_backend_impl PRIVATE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
# $<INSTALL_INTERFACE:csrc>
)
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt)
target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
else()
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)
endif ()

# Install all the artifacts in CMAKE_INSTALL_PREFIX under include/, lib/ and bin/ so they are easy to link against and locate later
install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
@@ -60,16 +76,30 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
message(STATUS "Building tests")
FetchContent_Declare(
Catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2
GIT_TAG v3.6.0
URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz
)
FetchContent_MakeAvailable(Catch2)

# add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
# target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml)
add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp)
target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
else()
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)
endif ()

if(CMAKE_BUILD_TYPE MATCHES "Debug")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address)
endif()

list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
include(CTest)
include(Catch)
# catch_discover_tests(tgi_trtllm_backend_tests)
catch_discover_tests(tgi_trtllm_backend_tests)
endif ()
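As context for the `check_cxx_compiler_flag("-Wnrvo" ...)` addition in this file: GCC 14 introduces `-Wnrvo`, which warns when a function returning a local object by value is written so that named return value optimization cannot be applied. A minimal, hypothetical C++ sketch of the kind of pattern the warning targets (not part of this PR; `make_engine_paths` and the paths are invented for illustration):

```cpp
#include <string>
#include <vector>

// NRVO applies: a single named local is returned on every path, so it can
// be constructed directly in the caller's return slot.
std::vector<std::string> default_engine_paths() {
    std::vector<std::string> paths;
    paths.emplace_back("/usr/local/tensorrt/engines");
    return paths;
}

// NRVO is typically defeated here: two different locals are alive at the
// return statements, so at least one path has to move/copy its result.
// Building with -Wnrvo (GCC 14) is meant to flag patterns like this one.
std::vector<std::string> make_engine_paths(bool include_fallback) {
    std::vector<std::string> primary{"/usr/local/tensorrt/engines"};
    std::vector<std::string> with_fallback{"/usr/local/tensorrt/engines", "/tmp/engines"};
    if (include_fallback) {
        return with_fallback;
    }
    return primary;
}
```

The flag only affects diagnostics, so guarding it behind `check_cxx_compiler_flag` keeps older compilers building without noise.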
9 changes: 5 additions & 4 deletions backends/trtllm/Cargo.toml
@@ -7,20 +7,21 @@ homepage.workspace = true

[dependencies]
async-trait = "0.1"
async-stream = "0.3"
#async-stream = "0.3"
clap = { version = "4.5", features = ["derive"] }
cxx = "1.0"
hashbrown = "0.14"
hf-hub = { workspace = true }
log = { version = "0.4", features = [] }
#log = { version = "0.4", features = [] }
text-generation-router = { path = "../../router" }
tokenizers = { workspace = true }
tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokio-stream = "0.1.15"
thiserror = "1.0.63"
tracing = "0.1"
tracing-opentelemetry = "0.25"
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
#tracing-opentelemetry = "0.25"
#tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
pyo3 = { workspace = true }

[build-dependencies]
cmake = "0.1"
34 changes: 20 additions & 14 deletions backends/trtllm/build.rs
@@ -4,7 +4,7 @@ use std::env;
use std::env::consts::ARCH;
use std::path::{absolute, PathBuf};

const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
const CUDA_REQUIRED_VERSION: &str = "12.6";
const MPI_REQUIRED_VERSION: &str = "4.1";
@@ -43,8 +43,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
install_path = absolute(out_dir).expect("cannot happen").join(install_path);
}

let _ = cmake::Config::new(".")
.uses_cxx11()
let mut config = cmake::Config::new(".");
config.uses_cxx11()
.generator("Ninja")
.profile(match is_debug {
true => "Debug",
@@ -53,9 +53,16 @@
.env("OPT_LEVEL", opt_level)
.define("CMAKE_INSTALL_PREFIX", &install_path)
.define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
.define("Python3_ROOT_DIR", "../venv")
.define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path)
.build();
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path);

// Allow overriding which Python interpreter to use ...
if let Some(python3) = option_env!("Python3_EXECUTABLE") {
config.define("Python3_EXECUTABLE", python3);
}

config.build();

// Additional transitive CMake dependencies
let deps_folder = out_dir.join("build").join("_deps");
@@ -90,26 +97,25 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
CFG.include_prefix = "backends/trtllm";
cxx_build::bridge("src/lib.rs")
.static_flag(true)
.include(deps_folder.join("fmt-src").join("include"))
.std("c++23")
.include(deps_folder.join("spdlog-src").join("include"))
.include(deps_folder.join("json-src").join("include"))
.include(deps_folder.join("trtllm-src").join("cpp").join("include"))
.include("/usr/local/cuda/include")
.include("/usr/local/tensorrt/include")
.file("src/ffi.cpp")
.std("c++20")
.define("NDEBUG", ndebug)
.include("csrc/")
.file("csrc/ffi.hpp")
.define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
.compile("tgi_trtllm_backend");

println!("cargo:rerun-if-changed=CMakeLists.txt");
println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
println!("cargo:rerun-if-changed=cmake/json.cmake");
println!("cargo:rerun-if-changed=cmake/fmt.cmake");
println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
println!("cargo:rerun-if-changed=include/backend.h");
println!("cargo:rerun-if-changed=lib/backend.cpp");
println!("cargo:rerun-if-changed=include/ffi.h");
println!("cargo:rerun-if-changed=src/ffi.cpp");
println!("cargo:rerun-if-changed=csrc/backend.hpp");
println!("cargo:rerun-if-changed=csrc/backend.cpp");
println!("cargo:rerun-if-changed=csrc/hardware.hpp");
println!("cargo:rerun-if-changed=csrc/ffi.hpp");
}

fn main() {
6 changes: 0 additions & 6 deletions backends/trtllm/cmake/fmt.cmake

This file was deleted.
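Relatedly, the backends/trtllm/CMakeLists.txt changes above move the test suite to Catch2 v3.7.1 (fetched as a release tarball) and enable `catch_discover_tests`, which registers every `TEST_CASE` with CTest automatically. A minimal, hypothetical sketch in that style — the real `tests/test_hardware.cpp` and `tests/test_backend.cpp` are not shown in this diff, and `clamp_batch_size` is an invented helper used only to illustrate the pattern:

```cpp
#include <catch2/catch_test_macros.hpp>

// Invented helper for illustration only; it does not exist in the backend sources.
static int clamp_batch_size(int requested, int max_supported) {
    return requested > max_supported ? max_supported : requested;
}

TEST_CASE("batch size is clamped to the supported maximum", "[hardware]") {
    REQUIRE(clamp_batch_size(8, 4) == 4);
    REQUIRE(clamp_batch_size(2, 4) == 2);
}
```

With `Catch2::Catch2WithMain` providing `main()`, `ctest` then lists and runs each discovered case individually.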
