@@ -1888,28 +1888,44 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
18881888 @skip_pre_blackwell
18891889 @pytest .mark .skip_less_mpi_world_size (8 )
18901890 @pytest .mark .parametrize (
1891- "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend" ,
1892- [(8 , 1 , 8 , True , True , True , "CUTLASS" ),
1893- (8 , 1 , 8 , True , True , True , "TRTLLM" )],
1894- ids = ["latency_moe_cutlass" , "latency_moe_trtllm" ],
1891+ "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3" ,
1892+ [
1893+ (8 , 1 , 8 , True , True , True , "CUTLASS" , False ),
1894+ (8 , 1 , 8 , True , True , True , "TRTLLM" , False ),
1895+ (8 , 1 , 8 , False , False , False , "TRTLLM" , True ),
1896+ ],
1897+ ids = [
1898+ "latency_moe_cutlass" , "latency_moe_trtllm" ,
1899+ "latency_moe_trtllm_eagle3"
1900+ ],
18951901 )
18961902 def test_nvfp4 (self , tp_size , pp_size , ep_size , attention_dp , cuda_graph ,
1897- overlap_scheduler , moe_backend ):
1903+ overlap_scheduler , moe_backend , eagle3 ):
18981904
18991905 pytorch_config = dict (
19001906 disable_overlap_scheduler = not overlap_scheduler ,
19011907 cuda_graph_config = CudaGraphConfig () if cuda_graph else None ,
19021908 moe_config = MoeConfig (backend = moe_backend ))
19031909
1904- kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.4 )
1910+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.4 ,
1911+ enable_block_reuse = not eagle3 )
1912+ spec_config = None
1913+ if eagle3 :
1914+ spec_config = EagleDecodingConfig (
1915+ max_draft_len = 2 ,
1916+ speculative_model_dir =
1917+ f"{ llm_models_root ()} /Qwen3/qwen3-235B-eagle3/" ,
1918+ eagle3_one_model = not eagle3 )
19051919 with LLM (
19061920 f"{ llm_models_root ()} /Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf" ,
19071921 tensor_parallel_size = tp_size ,
19081922 pipeline_parallel_size = pp_size ,
19091923 moe_expert_parallel_size = ep_size ,
19101924 ** pytorch_config ,
19111925 enable_attention_dp = attention_dp ,
1912- kv_cache_config = kv_cache_config ) as llm :
1926+ kv_cache_config = kv_cache_config ,
1927+ speculative_config = spec_config ) as llm :
1928+
19131929 task = MMLU (self .MODEL_NAME )
19141930 task .evaluate (llm )
19151931 task = GSM8K (self .MODEL_NAME )
0 commit comments