
Commit 6e48ac2

chore: remove cuda_graph_ prefix from cuda_graph_config field members. (#5585)
Signed-off-by: nv-guomingz <[email protected]>
1 parent 16fc993 commit 6e48ac2
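
This change drops the redundant `cuda_graph_` prefix from the members of `cuda_graph_config`: `cuda_graph_padding_enabled` becomes `padding_enabled`, `cuda_graph_batch_sizes` becomes `batch_sizes`, and `cuda_graph_max_batch_size` becomes `max_batch_size`. The updated docs and example configs also stop setting a standalone `use_cuda_graph` flag and instead enable CUDA graphs by providing the `cuda_graph_config` block. As a quick orientation before the per-file diffs, here is a minimal sketch of the new layout for an `extra-llm-api-config.yml`; the batch sizes and surrounding keys are illustrative values taken from the updated docs below.

```bash
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
  padding_enabled: true
  batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
print_iter_log: true
enable_attention_dp: true
EOF
```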

File tree

16 files changed, +193 -210 lines changed


docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md

Lines changed: 27 additions & 27 deletions
@@ -196,20 +196,20 @@ We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 pytorch_backend_config:
-  use_cuda_graph: true
-  cuda_graph_padding_enabled: true
-  cuda_graph_batch_sizes:
-  - 896
-  - 512
-  - 256
-  - 128
-  - 64
-  - 32
-  - 16
-  - 8
-  - 4
-  - 2
-  - 1
+  cuda_graph_config:
+    padding_enabled: true
+    batch_sizes:
+    - 896
+    - 512
+    - 256
+    - 128
+    - 64
+    - 32
+    - 16
+    - 8
+    - 4
+    - 2
+    - 1
   print_iter_log: true
   kv_cache_dtype: fp8
   enable_attention_dp: true
@@ -264,19 +264,19 @@ YOUR_DATA_PATH=./dataset.txt
 
 cat >./extra-llm-api-config.yml <<EOF
 pytorch_backend_config:
-  use_cuda_graph: true
-  cuda_graph_padding_enabled: true
-  cuda_graph_batch_sizes:
-  - 1
-  - 2
-  - 4
-  - 8
-  - 16
-  - 32
-  - 64
-  - 128
-  - 256
-  - 384
+  cuda_graph_config:
+    padding_enabled: true
+    batch_sizes:
+    - 1
+    - 2
+    - 4
+    - 8
+    - 16
+    - 32
+    - 64
+    - 128
+    - 256
+    - 384
   print_iter_log: ${PRINT_ITER_LOG}
   enable_attention_dp: true
 EOF

docs/source/performance/perf-overview.md

Lines changed: 18 additions & 18 deletions
@@ -200,24 +200,24 @@ trtllm-bench --model $model_name throughput --dataset $dataset_file --backend py
 
 `llm_options.yml`
 ```yaml
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-- 384
-- 512
-- 1024
-- 2048
-- 4096
-- 8192
+cuda_graph_config:
+  padding_enabled: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+  - 384
+  - 512
+  - 1024
+  - 2048
+  - 4096
+  - 8192
 ```
 
 In majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue.
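
For context, the sketch below shows how that fraction is passed to the `trtllm-bench` throughput run referenced in the hunk header above. The `$model_name`/`$dataset_file` variables come from that command; the `--backend pytorch` value and the `--extra_llm_api_options` flag pointing at `llm_options.yml` are assumptions here, not part of this diff.

```bash
# Sketch only: raise the free KV cache fraction from the default 0.90 to 0.95,
# falling back to 0.90 if the run hits an out-of-memory error.
trtllm-bench --model $model_name throughput \
  --dataset $dataset_file \
  --backend pytorch \
  --extra_llm_api_options llm_options.yml \
  --kv_cache_free_gpu_mem_fraction 0.95
```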

examples/llm-api/llm_mgmn_trtllm_bench.sh

Lines changed: 0 additions & 2 deletions
@@ -74,8 +74,6 @@ srun -l \
 
 # This is optional
 cat > /tmp/pytorch_extra_args.txt << EOF
-use_cuda_graph: false
-cuda_graph_padding_enabled: false
 print_iter_log: true
 enable_attention_dp: false
 EOF

examples/models/core/deepseek_v3/README.md

Lines changed: 45 additions & 45 deletions
@@ -141,9 +141,9 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 --num-requests 24 > /tmp/benchmarking_64k.txt
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes: [1, 4, 8, 12]
+cuda_graph_config:
+  padding_enabled: true
+  batch_sizes: [1, 4, 8, 12]
 EOF
 
 trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \
@@ -168,9 +168,9 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 --num-requests 4 > /tmp/benchmarking_128k.txt
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes: [1, 2]
+cuda_graph_config:
+  padding_enabled: true
+  batch_sizes: [1, 2]
 moe_max_num_tokens: 16384
 EOF
 
@@ -236,19 +236,19 @@ To serve the model using `trtllm-serve`:
 
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-- 384
+cuda_graph_config:
+  padding_enabled: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+  - 384
 print_iter_log: true
 enable_attention_dp: true
 EOF
@@ -315,19 +315,19 @@ And you can launch two generation servers on port 8002 and 8003 with:
 export TRTLLM_USE_UCX_KVCACHE=1
 
 cat >./gen-extra-llm-api-config.yml <<EOF
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-- 384
+cuda_graph_config:
+  padding_enabled: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+  - 384
 print_iter_log: true
 enable_attention_dp: true
 EOF
@@ -537,19 +537,19 @@ python3 /path/to/TensorRT-LLM/benchmarks/cpp/prepare_dataset.py \
 --input-mean=1024 --output-mean=2048 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt
 
 cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <<EOF
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-- 384
+cuda_graph_config:
+  padding_enabled: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+  - 384
 print_iter_log: true
 enable_attention_dp: true
 EOF

examples/models/core/qwen/README.md

Lines changed: 26 additions & 26 deletions
@@ -733,19 +733,19 @@ To serve the model using `trtllm-serve`:
 
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-- 384
+cuda_graph_config:
+  padding_enabled: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+  - 384
 print_iter_log: true
 enable_attention_dp: true
 EOF
@@ -809,19 +809,19 @@ And you can launch two generation servers on port 8002 and 8003 with:
 export TRTLLM_USE_UCX_KVCACHE=1
 
 cat >./gen-extra-llm-api-config.yml <<EOF
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-- 384
+cuda_graph_config:
+  padding_enabled: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+  - 384
 print_iter_log: true
 enable_attention_dp: true
 EOF

examples/pytorch/quickstart_advanced.py

Lines changed: 2 additions & 2 deletions
@@ -187,8 +187,8 @@ def setup_llm(args):
         spec_config = None
 
     cuda_graph_config = CudaGraphConfig(
-        cuda_graph_batch_sizes=args.cuda_graph_batch_sizes,
-        cuda_graph_padding_enabled=args.cuda_graph_padding_enabled,
+        batch_sizes=args.cuda_graph_batch_sizes,
+        padding_enabled=args.cuda_graph_padding_enabled,
     ) if args.use_cuda_graph else None
     llm = LLM(
         model=args.model_dir,

tensorrt_llm/_torch/auto_deploy/transformations/transform.py

Lines changed: 1 addition & 4 deletions
@@ -194,10 +194,7 @@ def __call__(self, cm: CachedSequenceInterface) -> GraphModule:
 
         cm.info.set_generate_only_batch()
         compiler_kwargs = {
-            "cuda_graph_batch_sizes": self.ad_config.cuda_graph_config.cuda_graph_batch_sizes
-            if hasattr(self.ad_config, "cuda_graph_config")
-            and self.ad_config.cuda_graph_config is not None
-            else None,
+            "cuda_graph_batch_sizes": self.ad_config.cuda_graph_batch_sizes,
             "num_batched_inputs": 2,  # TODO (lucaslie): improve once we have a config system...
         }
         egm_compiled = compile_and_capture(

tensorrt_llm/bench/benchmark/utils/general.py

Lines changed: 2 additions & 2 deletions
@@ -149,9 +149,9 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
 
     pyt_options = {
         "cuda_graph_config": {
-            "cuda_graph_padding_enabled":
+            "padding_enabled":
             True,
-            "cuda_graph_max_batch_size":
+            "max_batch_size":
             max_batch_size if cuda_graph_batch_sizes is None else 0,
         },
         "kv_cache_dtype": kv_cache_dtype,
