
Commit 9096fed

byshiue and lancelly authored and committed
[doc][ci][Qwen3][nvbugs 5374145] Add Qwen3 235B eagle3 CI (NVIDIA#6477)
Signed-off-by: bhsueh <[email protected]>
Signed-off-by: Lanyu Liao <[email protected]>
1 parent 82f574b commit 9096fed

File tree (5 files changed: +65 −7 lines)

- examples/models/core/qwen/README.md
- tests/integration/defs/accuracy/references/gsm8k.yaml
- tests/integration/defs/accuracy/references/mmlu.yaml
- tests/integration/defs/accuracy/test_llm_api_pytorch.py
- tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

examples/models/core/qwen/README.md

Lines changed: 33 additions & 0 deletions
```diff
@@ -26,6 +26,7 @@ This document shows how to build and run a [Qwen](https://huggingface.co/Qwen) m
 - [Serving](#serving)
   - [trtllm-serve](#trtllm-serve)
   - [Disaggregated Serving](#disaggregated-serving)
+    - [Eagle3](#eagle3)
   - [Dynamo](#dynamo)
 - [Notes and Troubleshooting](#notes-and-troubleshooting)
 - [Credits](#credits)
@@ -888,6 +889,38 @@ Note that the optimal disaggregated serving configuration (i.e. tp/pp/ep mapping
 on the request parameters, the number of concurrent requests and the GPU type. It is recommended to experiment to identify optimal
 settings for your specific use case.
 
+#### Eagle3
+
+Qwen3 now supports Eagle3 speculative decoding. To enable Eagle3 on Qwen3, set the following arguments when running `trtllm-bench` or `trtllm-serve`:
+
+- `speculative_config.decoding_type: Eagle`
+  Set the decoding type to "Eagle" to enable Eagle3 speculative decoding.
+- `speculative_config.max_draft_len: 3`
+  Set the maximum number of draft tokens generated per step (adjust this value as needed).
+- `speculative_config.speculative_model_dir: <EAGLE3_DRAFT_MODEL_PATH>`
+  Specify the path to the Eagle3 draft model (ensure the corresponding draft model weights are prepared).
+
+Currently, there are some limitations when enabling Eagle3:
+
+1. `attention_dp` is not supported. Disable it or leave the related flag unset (it is disabled by default).
+2. To use `enable_block_reuse`, the KV cache type of the target model and the draft model must be the same. Since the draft model only supports fp16/bf16, you need to disable `enable_block_reuse` when using an fp8 KV cache.
+
+Example `extra-llm-api-config.yml` snippet for Eagle3:
+
+```bash
+echo "
+enable_attention_dp: false
+speculative_config:
+  decoding_type: Eagle
+  max_draft_len: 3
+  speculative_model_dir: <EAGLE3_DRAFT_MODEL_PATH>
+kv_cache_config:
+  enable_block_reuse: false
+" >> ${path_config}
+```
+
+For further details, please refer to [speculative-decoding.md](../../../../docs/source/advanced/speculative-decoding.md).
+
 ### Dynamo
 
 NVIDIA Dynamo is a high-throughput low-latency inference framework designed for serving generative AI and reasoning models in multi-node distributed environments.
```
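For orientation, a minimal sketch of how the YAML written above might be passed to `trtllm-serve` (an editor's addition, not part of this commit): `<QWEN3_MODEL_PATH>` is a placeholder, and the `--tp_size`/`--ep_size`/`--extra_llm_api_options`/`--port` flag names are assumptions to be checked against the trtllm-serve CLI of your TensorRT-LLM version.

```bash
# Hedged sketch (not from this commit): serve Qwen3 with the Eagle3 overrides
# appended to ${path_config} by the README snippet above.
# <QWEN3_MODEL_PATH> is a placeholder; the flag names are assumed, not verified.
path_config=extra-llm-api-config.yml
trtllm-serve <QWEN3_MODEL_PATH> \
    --tp_size 8 \
    --ep_size 8 \
    --extra_llm_api_options ${path_config} \
    --port 8000
```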

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 4 additions & 0 deletions
```diff
@@ -86,6 +86,10 @@ Qwen3/Qwen3-235B-A22B:
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 85.78
+  - spec_dec_algo: Eagle
+    quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 85.78
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
   - quant_algo: FP8
```

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 4 additions & 0 deletions
```diff
@@ -170,6 +170,10 @@ Qwen3/Qwen3-235B-A22B:
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 86
+  - spec_dec_algo: Eagle
+    quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 86
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 79.43
   - quant_algo: FP8
```

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 23 additions & 7 deletions
```diff
@@ -1971,28 +1971,44 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend",
-        [(8, 1, 8, True, True, True, "CUTLASS"),
-         (8, 1, 8, True, True, True, "TRTLLM")],
-        ids=["latency_moe_cutlass", "latency_moe_trtllm"],
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
+        [
+            (8, 1, 8, True, True, True, "CUTLASS", False),
+            (8, 1, 8, True, True, True, "TRTLLM", False),
+            (8, 1, 8, False, False, False, "TRTLLM", True),
+        ],
+        ids=[
+            "latency_moe_cutlass", "latency_moe_trtllm",
+            "latency_moe_trtllm_eagle3"
+        ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                   overlap_scheduler, moe_backend):
+                   overlap_scheduler, moe_backend, eagle3):
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        enable_block_reuse=not eagle3)
+        spec_config = None
+        if eagle3:
+            spec_config = EagleDecodingConfig(
+                max_draft_len=2,
+                speculative_model_dir=
+                f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
+                eagle3_one_model=True)
         with LLM(
                 f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
                 tensor_parallel_size=tp_size,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
                 **pytorch_config,
                 enable_attention_dp=attention_dp,
-                kv_cache_config=kv_cache_config) as llm:
+                kv_cache_config=kv_cache_config,
+                speculative_config=spec_config) as llm:
+
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
```
tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -18,3 +18,4 @@ l0_gb200_multi_nodes:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)
+- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180)
```
