@@ -568,25 +568,27 @@ def test_auto_dtype_tp8(self):
         task.evaluate(llm,
                       extra_evaluator_kwargs=dict(apply_chat_template=True))

+    @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("eagle3_one_model", [True, False])
-    def test_eagle3_tp8(self, eagle3_one_model):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
+    def test_fp8_eagle3_tp8(self, eagle3_one_model):
+        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
-        pytorch_config = dict(disable_overlap_scheduler=True, )
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
+                 max_batch_size=16,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)

     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
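For context, a minimal standalone sketch of the configuration this test now exercises: FP8 Llama 3.3 70B with EAGLE3 speculative decoding, TP=8, and CUDA graphs restricted to batch size 1. It assumes the tensorrt_llm LLM API names that appear in the diff (LLM, KvCacheConfig, EagleDecodingConfig, CudaGraphConfig); the model paths below are placeholders, not the test's real llm_models_root() locations.

# Sketch only; paths are placeholders for local checkpoint directories.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

# EAGLE3 speculative decoding: the draft model proposes up to
# max_draft_len tokens per step, which the target model then verifies.
spec_config = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model_dir="/models/EAGLE3-LLaMA3.3-Instruct-70B",  # placeholder
    eagle3_one_model=True)

llm = LLM(
    "/models/Llama-3.3-70B-Instruct-fp8",  # placeholder FP8 checkpoint
    tensor_parallel_size=8,
    max_batch_size=16,
    disable_overlap_scheduler=True,
    # Capture CUDA graphs only up to batch size 1, as in the test.
    cuda_graph_config=CudaGraphConfig(max_batch_size=1),
    kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.6),
    speculative_config=spec_config)

with llm:
    outputs = llm.generate(["The capital of France is"])
    print(outputs[0].outputs[0].text)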