                                  SamplingParams, TorchCompileConfig)
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_hopper,
+from ..conftest import (get_device_count, get_device_memory, llm_models_root,
+                        parametrize_with_ids, skip_no_hopper,
                         skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
                         skip_pre_hopper)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
@@ -509,19 +510,26 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Maverick-17B-128E-Instruct"
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_memory() < 270000 and get_device_count() < 8:
+            pytest.skip("Not enough memory for this test")
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         with LLM(
                 self.MODEL_PATH,
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=kv_cache_config,
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
@@ -547,20 +555,27 @@ def test_chunked_prefill(self, attn_backend):
             task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_device_memory(80000)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_memory() < 140000 and get_device_count() < 8:
+            pytest.skip("Not enough memory for this test")
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         with LLM(
                 f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -583,7 +598,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
                 moe_expert_parallel_size=ep_size,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -622,16 +638,20 @@ def test_fp8_eagle3(self, tp_size, pp_size, ep_size, torch_compile):
             task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device_memory(80000)
 class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
         with LLM(
                 model_path,
@@ -648,11 +668,13 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
                              ids=["tp8ep8", "tp4"])
     def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
         with LLM(
                 model_path,
@@ -661,6 +683,7 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
@@ -670,11 +693,13 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
                              ids=["tp8ep8", "tp4"])
     def test_fp4(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
         with LLM(
                 model_path,
@@ -715,7 +740,7 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
                              ids=["tp4ep4"])
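For reference, the new get_device_count() / get_device_memory() guards replace the static skip_less_mpi_world_size(8) marker, so a single parametrization can cover both 8-GPU and 4-GPU layouts while each test skips itself on nodes that do not match. Below is a minimal sketch of what such helpers could look like; it is an illustration built on torch.cuda, not the actual ..conftest implementation (which may query nvidia-smi instead), and it assumes the thresholds used above (80000, 140000, 270000) are expressed in MiB.

# Hypothetical stand-ins for the conftest helpers imported in the diff;
# the real implementations may differ.
import torch


def get_device_count() -> int:
    # Number of CUDA devices visible to this process.
    return torch.cuda.device_count()


def get_device_memory() -> int:
    # Total memory of device 0 in MiB (assumed unit of the thresholds
    # used by the tests above).
    props = torch.cuda.get_device_properties(0)
    return props.total_memory // (1024 * 1024)


# Usage mirroring the guard in TestLlama4MaverickInstruct.test_auto_dtype:
# if get_device_memory() < 270000 and get_device_count() < 8:
#     pytest.skip("Not enough memory for this test")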