@@ -2416,7 +2416,8 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             (8, 1, 8, True, True, True, "TRTLLM", False),
         ],
         ids=[
-            "latency_moe_cutlass", "latency_moe_trtllm",
+            "latency_moe_cutlass",
+            "latency_moe_trtllm",
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
@@ -2456,14 +2457,15 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
         [
-            (4, 1, 4, False, False, False, "TRTLLM", True),  # TP8 has bug when we use TRTLLM moe backend and eagle3
+            (4, 1, 4, False, False, False, "TRTLLM",
+             True),  # TP8 has bug when we use TRTLLM moe backend and eagle3
         ],
         ids=[
             "latency_moe_trtllm_eagle3",
         ],
     )
-    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                         overlap_scheduler, moe_backend, eagle3):
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+                         cuda_graph, overlap_scheduler, moe_backend, eagle3):

         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2494,6 +2496,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)

+
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"
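For reference, a minimal sketch (not taken from this diff; the test body and parameter values are illustrative) of the parametrize-with-ids pattern that the hunks above reformat. Each entry in ids labels one parameter tuple positionally, so the collected tests appear as test_nvfp4_sketch[latency_moe_cutlass] and test_nvfp4_sketch[latency_moe_trtllm], and the ids list must have exactly one entry per tuple.

# Illustrative only: parameter names mirror the diff, values are made up.
import pytest


@pytest.mark.parametrize(
    "tp_size,ep_size,moe_backend",
    [
        (8, 8, "CUTLASS"),
        (8, 8, "TRTLLM"),
    ],
    ids=[
        "latency_moe_cutlass",
        "latency_moe_trtllm",
    ],
)
def test_nvfp4_sketch(tp_size, ep_size, moe_backend):
    # pytest pairs each id with the tuple at the same index.
    assert moe_backend in ("CUTLASS", "TRTLLM")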