@@ -141,9 +141,9 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
141141 --num-requests 24 > /tmp/benchmarking_64k.txt
142142
143143cat << EOF > /tmp/extra-llm-api-config.yml
144- use_cuda_graph: true
145- cuda_graph_padding_enabled : true
146- cuda_graph_batch_sizes : [1, 4, 8, 12]
144+ cuda_graph_config:
145+   padding_enabled : true
146+   batch_sizes : [1, 4, 8, 12]
147147EOF
148148
149149trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \
@@ -168,9 +168,9 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
168168 --num-requests 4 > /tmp/benchmarking_128k.txt
169169
170170cat << EOF > /tmp/extra-llm-api-config.yml
171- use_cuda_graph: true
172- cuda_graph_padding_enabled : true
173- cuda_graph_batch_sizes : [1, 2]
171+ cuda_graph_config:
172+   padding_enabled : true
173+   batch_sizes : [1, 2]
174174moe_max_num_tokens: 16384
175175EOF
176176
@@ -236,19 +236,19 @@ To serve the model using `trtllm-serve`:
236236
237237``` bash
238238cat > ./extra-llm-api-config.yml << EOF
239- use_cuda_graph: true
240- cuda_graph_padding_enabled : true
241- cuda_graph_batch_sizes :
242- - 1
243- - 2
244- - 4
245- - 8
246- - 16
247- - 32
248- - 64
249- - 128
250- - 256
251- - 384
239+ cuda_graph_config:
240+   padding_enabled : true
241+   batch_sizes :
242+     - 1
243+     - 2
244+     - 4
245+     - 8
246+     - 16
247+     - 32
248+     - 64
249+     - 128
250+     - 256
251+     - 384
252252print_iter_log: true
253253enable_attention_dp: true
254254EOF
@@ -315,19 +315,19 @@ And you can launch two generation servers on port 8002 and 8003 with:
315315export TRTLLM_USE_UCX_KVCACHE=1
316316
317317cat > ./gen-extra-llm-api-config.yml << EOF
318- use_cuda_graph: true
319- cuda_graph_padding_enabled : true
320- cuda_graph_batch_sizes :
321- - 1
322- - 2
323- - 4
324- - 8
325- - 16
326- - 32
327- - 64
328- - 128
329- - 256
330- - 384
318+ cuda_graph_config:
319+   padding_enabled : true
320+   batch_sizes :
321+     - 1
322+     - 2
323+     - 4
324+     - 8
325+     - 16
326+     - 32
327+     - 64
328+     - 128
329+     - 256
330+     - 384
331331print_iter_log: true
332332enable_attention_dp: true
333333EOF
@@ -537,19 +537,19 @@ python3 /path/to/TensorRT-LLM/benchmarks/cpp/prepare_dataset.py \
537537 --input-mean=1024 --output-mean=2048 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt
538538
539539cat > /path/to/TensorRT-LLM/extra-llm-api-config.yml << EOF
540- use_cuda_graph: true
541- cuda_graph_padding_enabled : true
542- cuda_graph_batch_sizes :
543- - 1
544- - 2
545- - 4
546- - 8
547- - 16
548- - 32
549- - 64
550- - 128
551- - 256
552- - 384
540+ cuda_graph_config:
541+   padding_enabled : true
542+   batch_sizes :
543+     - 1
544+     - 2
545+     - 4
546+     - 8
547+     - 16
548+     - 32
549+     - 64
550+     - 128
551+     - 256
552+     - 384
553553print_iter_log: true
554554enable_attention_dp: true
555555EOF
0 commit comments