@@ -480,25 +480,27 @@ def test_auto_dtype_tp8(self):
480480 task .evaluate (llm ,
481481 extra_evaluator_kwargs = dict (apply_chat_template = True ))
482482
@skip_pre_hopper
@pytest.mark.skip_less_mpi_world_size(8)
@parametrize_with_ids("eagle3_one_model", [True, False])
def test_fp8_eagle3_tp8(self, eagle3_one_model):
    """Accuracy test: FP8 Llama-3.3-70B with EAGLE3 speculative decoding on TP8.

    Runs CnnDailymail evaluation with the EAGLE3 draft model attached,
    parametrized over one-model vs. two-model EAGLE3 execution.
    """
    # FP8-quantized target model and its EAGLE3 draft-model checkpoint.
    model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
    eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"

    # Leave GPU memory headroom for the draft model alongside the KV cache.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
    spec_config = EagleDecodingConfig(max_draft_len=4,
                                      speculative_model_dir=eagle_model_dir,
                                      eagle3_one_model=eagle3_one_model)
    # Overlap scheduling is disabled for speculative decoding; CUDA graphs
    # are captured for batch size 1 only.
    pytorch_config = dict(
        disable_overlap_scheduler=True,
        cuda_graph_config=CudaGraphConfig(max_batch_size=1))

    llm = LLM(model_path,
              max_batch_size=16,
              tensor_parallel_size=8,
              speculative_config=spec_config,
              kv_cache_config=kv_cache_config,
              **pytorch_config)
    with llm:
        task = CnnDailymail(self.MODEL_NAME)
        task.evaluate(llm)
503505 @pytest .mark .skip_less_device (4 )
504506 @skip_pre_hopper
0 commit comments