@@ -149,7 +149,6 @@ export TRTLLM_ENABLE_PDL=1
trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
throughput \
--dataset $YOUR_DATA_PATH \
- --backend pytorch \
--num_requests 10 \
--concurrency 1 \
--max_batch_size 1 \
@@ -161,7 +160,6 @@ trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
Explanation:
- `trtllm-bench`: A CLI benchmarking utility that aims to make it easier for users to reproduce our officially published benchmarks. See [TensorRT-LLM Benchmarking](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html) for details.
- `--dataset`: Prompt dataset used to benchmark. Our official benchmark dataset has ISL = 1K, OSL = 2K.
- - `--backend`: Inference backend. Here we use PyTorch backend.
- `--num_requests`: Number of requests used for the benchmark.
- `--concurrency`: Total concurrency for the system.
- `--max_batch_size`: Max batch size per rank.
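
Taken together, the flags above assemble into the post-change invocation below. This is a minimal sketch rather than the full tuned command from the guide: PyTorch is now the default backend, so `--backend pytorch` is simply dropped, and `$YOUR_DATA_PATH` is assumed to point at the ISL = 1K / OSL = 2K dataset described above.

```shell
# Minimal latency-style run (sketch); trailing tuning flags from the
# full guide are omitted here for brevity.
trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
    throughput \
    --dataset $YOUR_DATA_PATH \
    --num_requests 10 \
    --concurrency 1 \
    --max_batch_size 1
```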
@@ -216,7 +214,6 @@ EOF
trtllm-bench --model nvidia/DeepSeek-R1-0528-FP4 \
throughput \
--dataset ${YOUR_DATA_PATH} \
- --backend pytorch \
--tp 8 --ep 8 \
--extra_llm_api_options ./extra-llm-api-config.yml \
--max_batch_size 896
@@ -285,7 +282,6 @@ trtllm-bench -m nvidia/DeepSeek-R1-FP4 \
--ep 8 \
--warmup 0 \
--dataset ${YOUR_DATA_PATH} \
- --backend pytorch \
--max_batch_size 384 \
--max_num_tokens 1536 \
--num_requests 49152 \
@@ -325,7 +321,6 @@ EOF
trtllm-bench --model deepseek-ai/DeepSeek-R1 \
throughput \
--dataset $YOUR_DATA_PATH \
- --backend pytorch \
--num_requests 10 \
--max_batch_size 1 \
--tp 8 \
@@ -380,7 +375,6 @@ trtllm-bench -m deepseek-ai/DeepSeek-R1 \
--ep 8 \
--warmup 0 \
--dataset $YOUR_DATA_PATH \
- --backend pytorch \
--max_batch_size 128 \
--max_num_tokens 1151 \
--num_requests 5120 \
docs/source/performance/perf-analysis.md (0 additions, 1 deletion)
@@ -83,7 +83,6 @@ TLLM_PROFILE_START_STOP=100-150 nsys profile \
--model_path ${MODEL_PATH} \
throughput \
--dataset /tmp/dataset.txt --warmup 0 \
- --backend pytorch \
--streaming
```
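
Once the profiled run finishes, the capture can be summarized without opening the GUI. A minimal sketch, assuming no `-o` flag was passed in the elided part of the command, in which case nsys writes `report1.nsys-rep` to the working directory:

```shell
# Print CUDA kernel and NVTX summaries from the captured profile.
nsys stats report1.nsys-rep
```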

docs/source/performance/perf-benchmarking.md (1 addition, 3 deletions)
@@ -438,7 +438,7 @@ for build heuristics.
```

```shell
- trtllm-bench --model meta-llama/Llama-3.1-8B --model_path /Ckpt/Path/To/Llama-3.1-8B throughput --dataset /tmp/synthetic_128_128.txt --backend pytorch
+ trtllm-bench --model meta-llama/Llama-3.1-8B --model_path /Ckpt/Path/To/Llama-3.1-8B throughput --dataset /tmp/synthetic_128_128.txt

# Example output
<snip verbose logging>
@@ -544,7 +544,6 @@ lora_config:
trtllm-bench --model /path/to/base/model \
throughput \
--dataset synthetic_lora_data.json \
- --backend pytorch \
--extra_llm_api_options extra-llm-api-options.yaml
```
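
The command above relies on `extra-llm-api-options.yaml` to carry the LoRA settings. A minimal sketch of that file, assuming the `lora_config` schema referenced in the hunk header above; the adapter path and rank are placeholders, not tuned settings:

```shell
# Write a placeholder extra-llm-api-options.yaml (assumed lora_config fields).
cat > extra-llm-api-options.yaml << EOF
lora_config:
  lora_dir:
    - /path/to/lora/adapter
  max_lora_rank: 64
EOF
```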

@@ -586,7 +585,6 @@ Run the benchmark:
trtllm-bench --model Qwen/Qwen2-VL-2B-Instruct \
throughput \
--dataset mm_data.jsonl \
- --backend pytorch \
--num_requests 10 \
--max_batch_size 4 \
--modality image
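
The `--dataset` file here is line-delimited JSON, one multimodal request per line. A sketch of a single record, with the caveat that the key names (`task_id`, `prompt`, `media_paths`) are an assumption about the dataset schema rather than a confirmed spec:

```shell
# Write one assumed-format record; adjust keys to the documented schema.
cat > mm_data.jsonl << EOF
{"task_id": 0, "prompt": "Describe the image.", "media_paths": ["/path/to/image.jpg"]}
EOF
```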
examples/models/core/deepseek_v3/README.md (3 additions, 7 deletions)
@@ -150,7 +150,6 @@ trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \
--tp 8 --ep 8 \
--warmup 0 \
--dataset /tmp/benchmarking_64k.txt \
- --backend pytorch \
--max_batch_size 12 \
--max_num_tokens 65548 \
--kv_cache_free_gpu_mem_fraction 0.6 \
@@ -179,7 +178,6 @@ trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \
--tp 8 --ep 8 \
--warmup 0 \
--dataset /tmp/benchmarking_128k.txt \
- --backend pytorch \
--max_batch_size 2 \
--max_num_tokens 131074 \
--kv_cache_free_gpu_mem_fraction 0.3 \
@@ -512,7 +510,7 @@ mpirun \
-H <HOST1>:8,<HOST2>:8 \
-mca plm_rsh_args "-p 2233" \
--allow-run-as-root -n 16 \
- trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path /models/DeepSeek-V3 throughput --backend pytorch --max_batch_size 161 --max_num_tokens 1160 --dataset /workspace/tensorrt_llm/dataset_isl1000.txt --tp 16 --ep 8 --kv_cache_free_gpu_mem_fraction 0.95 --extra_llm_api_options /workspace/tensorrt_llm/extra-llm-api-config.yml --concurrency 4096 --streaming
+ trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path /models/DeepSeek-V3 throughput --max_batch_size 161 --max_num_tokens 1160 --dataset /workspace/tensorrt_llm/dataset_isl1000.txt --tp 16 --ep 8 --kv_cache_free_gpu_mem_fraction 0.95 --extra_llm_api_options /workspace/tensorrt_llm/extra-llm-api-config.yml --concurrency 4096 --streaming
```

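Before committing to the full benchmark, it can be worth verifying that MPI actually reaches all 16 ranks across both hosts. A quick sketch reusing the same host layout and SSH port:

```shell
# Should print each hostname 8 times; if not, fix the MPI/SSH setup first.
mpirun \
    -H <HOST1>:8,<HOST2>:8 \
    -mca plm_rsh_args "-p 2233" \
    --allow-run-as-root -n 16 \
    hostname
```
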
#### Slurm
@@ -524,7 +522,7 @@ trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path /
--container-image=<CONTAINER_IMG> \
--container-mounts=/workspace:/workspace \
--container-workdir /workspace \
bash -c "trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path <YOUR_MODEL_DIR> throughput --backend pytorch --max_batch_size 161 --max_num_tokens 1160 --dataset /workspace/dataset.txt --tp 16 --ep 4 --kv_cache_free_gpu_mem_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml"
bash -c "trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path <YOUR_MODEL_DIR> throughput --max_batch_size 161 --max_num_tokens 1160 --dataset /workspace/dataset.txt --tp 16 --ep 4 --kv_cache_free_gpu_mem_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml"
```
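
A common failure mode here is the dataset or config file not being visible inside the container. A quick sketch to confirm the mounts before the full run, reusing the same container flags (the srun allocation flags, elided in the hunk above, stay as in your setup):

```shell
srun ... \
    --container-image=<CONTAINER_IMG> \
    --container-mounts=/workspace:/workspace \
    --container-workdir /workspace \
    bash -c "ls -l /workspace/dataset.txt ./extra-llm-api-config.yml"
```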


@@ -592,7 +590,7 @@ DS_R1_NVFP4_MODEL_PATH=/path/to/DeepSeek-R1 # optional
trtllm-llmapi-launch trtllm-bench \
--model deepseek-ai/DeepSeek-R1 \
--model_path $DS_R1_NVFP4_MODEL_PATH \
- throughput --backend pytorch \
+ throughput \
--num_requests 49152 \
--max_batch_size 384 --max_num_tokens 1536 \
--concurrency 3072 \
@@ -644,7 +642,6 @@ trtllm-bench \
--model deepseek-ai/DeepSeek-V3 \
--model_path /models/DeepSeek-V3 \
throughput \
- --backend pytorch \
--max_batch_size ${MAX_BATCH_SIZE} \
--max_num_tokens ${MAX_NUM_TOKENS} \
--dataset dataset.txt \
@@ -666,7 +663,6 @@ mpirun -H <HOST1>:8,<HOST2>:8 \
--model deepseek-ai/DeepSeek-V3 \
--model_path /models/DeepSeek-V3 \
throughput \
- --backend pytorch \
--max_batch_size ${MAX_BATCH_SIZE} \
--max_num_tokens ${MAX_NUM_TOKENS} \
--dataset dataset.txt \
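
Both invocations in this section read `MAX_BATCH_SIZE` and `MAX_NUM_TOKENS` from the environment before running. A minimal sketch, reusing the 161/1160 values from the multi-node example earlier in this file as illustrative rather than tuned settings:

```shell
export MAX_BATCH_SIZE=161
export MAX_NUM_TOKENS=1160
```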
examples/wide_ep/ep_load_balancer/README.md (0 additions, 3 deletions)
@@ -41,7 +41,6 @@ trtllm-bench --model ${MODEL_NAME} \
--ep 32 \
--extra_llm_api_options ./extra_llm_api_options.yaml \
--kv_cache_free_gpu_mem_fraction 0.75 \
- --backend pytorch \
--dataset ./dataset.json \
--warmup 0 \
--eos_id -1
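
Here `--eos_id -1` tells the benchmark to ignore the end-of-sequence token, so every request generates its full output length and the expert load stays steady for the statistics run. The commands also assume `MODEL_NAME` is already exported; a minimal sketch, with the checkpoint name as an illustrative assumption:

```shell
# Any large-MoE checkpoint supported by the load balancer works here.
export MODEL_NAME=deepseek-ai/DeepSeek-R1
```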
@@ -133,7 +132,6 @@ trtllm-bench --model ${MODEL_NAME} \
--ep 36 \
--extra_llm_api_options ./extra_llm_api_options_eplb.yaml \
--kv_cache_free_gpu_mem_fraction 0.75 \
- --backend pytorch \
--dataset ./dataset.json \
--warmup 0 \
--eos_id -1
@@ -200,7 +198,6 @@ trtllm-bench --model ${MODEL_NAME} \
--ep 36 \
--extra_llm_api_options ./extra_llm_api_options_eplb.yaml \
--kv_cache_free_gpu_mem_fraction 0.75 \
- --backend pytorch \
--dataset ./dataset.json \
--warmup 0 \
--eos_id -1