Skip to content

Commit b1dc84b

Browse files
[TRTLLM-7399][test] Add DS-R1/Qwen3 test cases for RTX 6000 (#7662)
Signed-off-by: Pamela <[email protected]>
Signed-off-by: Pamela Peng <[email protected]>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent 48fda86 commit b1dc84b

File tree

3 files changed

+30
-0
lines changed

3 files changed

+30
-0
lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3022,9 +3022,13 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
30223022
[
30233023
(4, 1, 4, False, False, False, "TRTLLM",
30243024
True), # TP8 has bug when we use TRTLLM moe backend and eagle3
3025+
(4, 1, 4, False, False, False, "CUTLASS", False),
3026+
(4, 1, 4, False, False, False, "CUTLASS", True),
30253027
],
30263028
ids=[
30273029
"latency_moe_trtllm_eagle3",
3030+
"latency_moe_cutlass",
3031+
"latency_moe_cutlass_eagle3",
30283032
],
30293033
)
30303034
def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,

tests/integration/defs/test_e2e.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1844,6 +1844,14 @@ def test_ptp_quickstart(llm_root, llm_venv):
18441844
pytest.param('Qwen3-30B-A3B',
18451845
'Qwen3/Qwen3-30B-A3B',
18461846
marks=pytest.mark.skip_less_device_memory(80000)),
1847+
pytest.param(
1848+
'Qwen3-30B-A3B_fp8_hf',
1849+
'Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf',
1850+
marks=(skip_pre_hopper, pytest.mark.skip_less_device_memory(40000))),
1851+
pytest.param(
1852+
'Qwen3-30B-A3B_nvfp4_hf',
1853+
'Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf',
1854+
marks=(skip_pre_blackwell, pytest.mark.skip_less_device_memory(20000))),
18471855
pytest.param('Llama3.3-70B-FP8',
18481856
'modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8',
18491857
marks=skip_pre_blackwell),

tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,12 @@ l0_rtx_pro_6000:
3030
- test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8]
3131
- test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1]
3232
- test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-FP8-Mixtral-8x7B-Instruct-v0.1-fp8]
33+
- test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] # 3mins
34+
- test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B_fp8_hf-Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf] # 3mins
35+
- test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B_nvfp4_hf-Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf] # 2mins
36+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] # 8mins
37+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] # 8 mins
38+
3339
- condition:
3440
ranges:
3541
system_gpu_count:
@@ -81,3 +87,15 @@ l0_rtx_pro_6000:
8187
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
8288
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
8389
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
90+
# - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] # hopper only
91+
# - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
92+
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
93+
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
94+
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0]
95+
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2]
96+
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_cutlass] # 20 mins
97+
# - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_cutlass_eagle3] # failed
98+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
99+
# - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] # failed
100+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False]
101+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True]

0 commit comments

Comments (0)