@@ -543,25 +543,27 @@ def test_auto_dtype_tp8(self):
543543 task .evaluate (llm ,
544544 extra_evaluator_kwargs = dict (apply_chat_template = True ))
545545
546+ @skip_pre_hopper
546547 @pytest .mark .skip_less_mpi_world_size (8 )
547548 @parametrize_with_ids ("eagle3_one_model" , [True , False ])
548- def test_eagle3_tp8 (self , eagle3_one_model ):
549- model_path = f"{ llm_models_root ()} /llama-3.3-models /Llama-3.3-70B-Instruct"
549+ def test_fp8_eagle3_tp8 (self , eagle3_one_model ):
550+ model_path = f"{ llm_models_root ()} /modelopt-hf-model-hub /Llama-3.3-70B-Instruct-fp8 "
550551 eagle_model_dir = f"{ llm_models_root ()} /EAGLE3-LLaMA3.3-Instruct-70B"
551552 kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.6 )
552553 spec_config = EagleDecodingConfig (max_draft_len = 4 ,
553554 speculative_model_dir = eagle_model_dir ,
554555 eagle3_one_model = eagle3_one_model )
555- pytorch_config = dict (disable_overlap_scheduler = True , )
556+ pytorch_config = dict (
557+ disable_overlap_scheduler = True ,
558+ cuda_graph_config = CudaGraphConfig (max_batch_size = 1 ))
556559 with LLM (model_path ,
560+ max_batch_size = 16 ,
557561 tensor_parallel_size = 8 ,
558562 speculative_config = spec_config ,
559563 kv_cache_config = kv_cache_config ,
560564 ** pytorch_config ) as llm :
561565 task = CnnDailymail (self .MODEL_NAME )
562566 task .evaluate (llm )
563- task = MMLU (self .MODEL_NAME )
564- task .evaluate (llm )
565567
566568 @pytest .mark .skip_less_device (4 )
567569 @skip_pre_hopper
0 commit comments