
Commit 982038d

Merge branch 'main' into supportFP8BlockWideEp
2 parents 22c253d + b0558c7

39 files changed (+937, -630 lines)


.dockerignore

Lines changed: 1 addition & 0 deletions
@@ -9,5 +9,6 @@ examples/**/.git
 examples/**/*.bin
 examples/**/*.engine
 examples/**/*.onnx
+examples/**/*.safetensors
 examples/**/c-model
 examples/models/core/gpt/gpt*

.github/CODEOWNERS

Lines changed: 6 additions & 5 deletions
@@ -1,10 +1,5 @@
 # This file defines code ownership rules for the repository.

-# The following rule should only be uncommented on release branches (e.g., release/0.19).
-# The rule below requires that any PR to release/**/* branches must be approved by at least one member
-# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
-# Without approval from a member of this team, PRs cannot be merged to release branches.
-# * @NVIDIA/trt-llm-release-branch-approval

 ## TensorRT-LLM Infra
 ### CI
@@ -160,3 +155,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
 # from a member of this team, PRs affecting public APIs cannot be merged to main or release branches.
 /tests/unittest/api_stability/ @NVIDIA/trt-llm-noncommitted-api-review-committee
 /tests/unittest/api_stability/references_committed/ @NVIDIA/trt-llm-committed-api-review-committee
+
+# The following rule should only be uncommented on release branches (e.g., release/0.19).
+# The rule below requires that any PR to release/**/* branches must be approved by at least one member
+# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
+# Without approval from a member of this team, PRs cannot be merged to release branches.
+# * @NVIDIA/trt-llm-release-branch-approval

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 28 additions & 13 deletions
@@ -392,8 +392,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
         std::vector<int64_t> output_shape = {num_rows, unpadded_hidden_size_val};
         auto output = torch::empty(output_shape, input.options().dtype(mOutputDtype));

-        WorkspaceInfo workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
-            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode);
+        WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
+            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);

         auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
         kernels::MoeMinLatencyParams min_latency_params{};
@@ -553,8 +553,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
         min_latency_params.experts_to_token_score = static_cast<float*>(experts_to_token_score.data_ptr());
         min_latency_params.active_expert_global_ids = static_cast<int*>(active_expert_global_ids.data_ptr());

-        WorkspaceInfo workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
-            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode);
+        WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
+            static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);

         auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);

@@ -709,6 +709,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
     // e.g. 16 nvfp4 elements are packed into a single int64 element
     int64_t mInnerDimMultiplier;
     char* mProfileWorkspace = nullptr;
+    WorkspaceInfo workspace_info;

     bool mUseDeepSeekFP8BlockScaling = false;
     bool mUseW4GroupScaling = false;
@@ -757,9 +758,9 @@ class FusedMoeRunner : public torch::CustomClassHolder
         mKernelRunner->setTactic(best_gemm1_profile, best_gemm2_profile);
     }

-    WorkspaceInfo getWorkspaceInfo(int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size,
+    WorkspaceInfo const& getWorkspaceInfo(int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size,
         int num_experts, int experts_per_token, ActivationType activation_type,
-        kernels::MOEParallelismConfig const& parallelismConfig, bool min_latency_mode)
+        kernels::MOEParallelismConfig const& parallelismConfig, bool min_latency_mode, cudaStream_t stream)
     {
         size_t moe_workspace_size = mKernelRunner->getWorkspaceSize(num_rows, hidden_size, inter_size, num_experts,
             experts_per_token, activation_type, parallelismConfig, /* use_lora */ false, mUseDeepSeekFP8BlockScaling,
@@ -768,15 +769,29 @@ class FusedMoeRunner : public torch::CustomClassHolder

         std::vector<size_t> workspaces{moe_workspace_size, src_to_dest_map_size};

-        size_t total_workspace_size = common::calculateTotalWorkspaceSize(workspaces.data(), workspaces.size());
+        int64_t const total_workspace_size = common::calculateTotalWorkspaceSize(workspaces.data(), workspaces.size());

-        WorkspaceInfo info{};
-        info.workspace = torch::empty({static_cast<long>(total_workspace_size)},
-            torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
-        info.src_to_dest_map
-            = common::nextWorkspacePtr(static_cast<int8_t*>(info.workspace.data_ptr()), moe_workspace_size);
+        bool is_capturing = tensorrt_llm::common::isCapturing(stream);
+        // Always allocate workspace when capturing cuda graph to avoid illegal memory access during replay
+        if (is_capturing || workspace_info.workspace.numel() < total_workspace_size)
+        {
+            if (is_capturing)
+            {
+                TLLM_LOG_DEBUG(
+                    "Allocating MoE workspace with %ld bytes size during cuda graph capture", total_workspace_size);
+            }
+            else
+            {
+                TLLM_LOG_DEBUG("MoE workspace size is not enough, increase the size from %ld bytes to %ld bytes",
+                    workspace_info.workspace.numel(), total_workspace_size);
+            }
+            workspace_info.workspace = torch::empty({static_cast<long>(total_workspace_size)},
+                torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
+        }
+        workspace_info.src_to_dest_map
+            = common::nextWorkspacePtr(static_cast<int8_t*>(workspace_info.workspace.data_ptr()), moe_workspace_size);

-        return info;
+        return workspace_info;
     }

     kernels::QuantParams getQuantParams(int64_t const num_experts_on_rank, int64_t const hidden_size,
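The substance of this change: `getWorkspaceInfo` now returns a reference to a member `workspace_info` instead of allocating a fresh tensor on every call. The buffer is reallocated only when the cached one is too small, and is always reallocated while a CUDA graph is being captured so the captured addresses stay valid on replay. A minimal PyTorch sketch of the same caching pattern (hypothetical class and names, not the TensorRT-LLM implementation):

```python
import torch


class WorkspaceCache:
    """Sketch of a reusable workspace buffer, mirroring the C++ change above."""

    def __init__(self) -> None:
        # Persistent buffer, grown on demand and reused across calls.
        self._workspace = torch.empty(0, dtype=torch.int8, device="cuda")

    def get(self, required_bytes: int) -> torch.Tensor:
        # Always reallocate while a CUDA graph is being captured so the
        # addresses recorded in the graph remain valid during replay;
        # otherwise only grow the buffer when it is too small.
        is_capturing = torch.cuda.is_current_stream_capturing()
        if is_capturing or self._workspace.numel() < required_bytes:
            self._workspace = torch.empty(
                required_bytes, dtype=torch.int8, device="cuda", requires_grad=False)
        return self._workspace
```

Callers then carve sub-buffers out of the returned tensor, analogous to `common::nextWorkspacePtr` in the C++ code.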

docker/Dockerfile.multi

Lines changed: 2 additions & 1 deletion
@@ -174,7 +174,8 @@ FROM wheel AS tritonbuild
 WORKDIR /src/tensorrt_llm
 RUN pip install /src/tensorrt_llm/build/tensorrt_llm*.whl
 COPY ./triton_backend/ ./triton_backend/
-RUN bash ./triton_backend/inflight_batcher_llm/scripts/build.sh
+ARG TRITON_BASE_TAG
+RUN bash ./triton_backend/inflight_batcher_llm/scripts/build.sh -s "r${TRITON_BASE_TAG%-py3}"


 FROM release AS tritonrelease
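The `${TRITON_BASE_TAG%-py3}` expansion strips a trailing `-py3` from the Triton base-image tag, and the leading `r` turns the result into the branch name passed to the build script's `-s` flag. A small Python sketch of that string transformation (the tag value below is made up for illustration):

```python
def triton_branch_from_base_tag(base_tag: str) -> str:
    # Mirrors the shell expansion r${TRITON_BASE_TAG%-py3}: drop a trailing
    # "-py3" suffix if present, then prefix with "r".
    return "r" + base_tag.removesuffix("-py3")


# Hypothetical tag, for illustration only.
print(triton_branch_from_base_tag("25.05-py3"))  # -> r25.05
```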

docker/Makefile

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ CODE_DIR ?= /code/tensorrt_llm
 EXTRA_VOLUMES ?=
 CCACHE_DIR ?= $(CODE_DIR)/cpp/.ccache
 CONAN_DIR ?= $(CODE_DIR)/cpp/.conan
-USER_CACHE_DIR ?= $(HOME_DIR)/.cache
+USER_CACHE_DIR ?= $(shell readlink -f "${HOME_DIR}/.cache")
 RUN_CMD ?=
 CONTAINER_NAME ?= tensorrt_llm
 WORK_DIR ?= $(CODE_DIR)
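`readlink -f` canonicalizes the host cache path (following any symlinks in `$HOME_DIR/.cache`) before it is used, presumably so a symlinked cache directory still resolves to a real path when mounted into the container. For reference, a Python equivalent of that resolution, shown only as an illustration:

```python
import os


def canonical_user_cache_dir(home_dir: str) -> str:
    # Same effect as `readlink -f "$HOME_DIR/.cache"`: follow symlinks and
    # return the absolute, canonical path.
    return os.path.realpath(os.path.join(home_dir, ".cache"))


print(canonical_user_cache_dir(os.path.expanduser("~")))
```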

docs/source/index.rst

Lines changed: 6 additions & 0 deletions
@@ -160,6 +160,12 @@ Welcome to TensorRT-LLM's Documentation!
    blogs/XQA-kernel.md
    blogs/tech_blog/*

+.. toctree::
+   :maxdepth: 2
+   :caption: Use TensorRT Engine
+   :hidden:
+
+   legacy/tensorrt_quickstart.md

 Indices and tables
 ==================
docs/source/legacy/tensorrt_quickstart.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# LLM API with TensorRT Engine
+A simple inference example with TinyLlama using the LLM API:
+
+```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+:language: python
+:linenos:
+```
+
+For more advanced usage including distributed inference, multimodal, and speculative decoding, please refer to this [README](../../../examples/llm-api/README.md).
examples/llm-api/_tensorrt_engine/quickstart_example.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+from tensorrt_llm import LLM, SamplingParams
+
+
+def main():
+
+    # Model could accept HF model name, a path to local HF model,
+    # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create a sampling params.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    for output in llm.generate(prompts, sampling_params):
+        print(
+            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+        )
+
+    # Got output like
+    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+    # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+    # Prompt: 'The capital of France is', Generated text: 'Paris.'
+    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
+
+
+if __name__ == '__main__':
+    main()

examples/llm-api/llm_runtime.py

Lines changed: 2 additions & 3 deletions
@@ -29,8 +29,7 @@ def example_cuda_graph_config():
              cuda_graph_config=cuda_graph_config,  # Enable CUDA graphs
              max_batch_size=4,
              max_seq_len=512,
-              kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True))
+              kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5))

     prompts = [
         "Hello, my name is",
@@ -56,7 +55,7 @@ def example_kv_cache_config():
              max_batch_size=8,
              max_seq_len=1024,
              kv_cache_config=KvCacheConfig(
-                  free_gpu_memory_fraction=0.85,
+                  free_gpu_memory_fraction=0.5,
                  enable_block_reuse=True))

     prompts = [

examples/llm-api/quickstart_example.py

Lines changed: 8 additions & 2 deletions
@@ -1,11 +1,17 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import BuildConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change


 def main():

+    build_config = BuildConfig()
+    build_config.max_batch_size = 256
+    build_config.max_num_tokens = 1024
+
     # Model could accept HF model name, a path to local HF model,
     # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              build_config=build_config)

     # Sample prompts.
     prompts = [
