add qwen3 235b eagle3 ci

byshiue · byshiue · commit cb560fb9b511 · 2025-07-30T16:17:22.000+08:00
Signed-off-by: bhsueh &lt;11360707+byshiue@users.noreply.github.com&gt;
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -86,6 +86,10 @@ Qwen3/Qwen3-235B-A22B:
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 85.78
+  - spec_dec_algo: Eagle
+    quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 85.78
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -170,6 +170,10 @@ Qwen3/Qwen3-235B-A22B:
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 86
+  - spec_dec_algo: Eagle
+    quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 86
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 79.43
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1888,28 +1888,44 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend",
-        [(8, 1, 8, True, True, True, "CUTLASS"),
-         (8, 1, 8, True, True, True, "TRTLLM")],
-        ids=["latency_moe_cutlass", "latency_moe_trtllm"],
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
+        [
+            (8, 1, 8, True, True, True, "CUTLASS", False),
+            (8, 1, 8, True, True, True, "TRTLLM", False),
+            (8, 1, 8, False, False, False, "TRTLLM", True),
+        ],
+        ids=[
+            "latency_moe_cutlass", "latency_moe_trtllm",
+            "latency_moe_trtllm_eagle3"
+        ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                   overlap_scheduler, moe_backend):
+                   overlap_scheduler, moe_backend, eagle3):
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        enable_block_reuse=not eagle3)
+        spec_config = None
+        if eagle3:
+            spec_config = EagleDecodingConfig(
+                max_draft_len=2,
+                speculative_model_dir=
+                f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
+                eagle3_one_model=not eagle3)
         with LLM(
                 f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
                 tensor_parallel_size=tp_size,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
                 **pytorch_config,
                 enable_attention_dp=attention_dp,
-                kv_cache_config=kv_cache_config) as llm:
+                kv_cache_config=kv_cache_config,
+                speculative_config=spec_config) as llm:
+
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -18,3 +18,4 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180)