@@ -229,35 +229,6 @@ def test_fp8_beam_search(self):
                           sampling_params=sampling_params,
                           extra_acc_spec="beam_width=4")
 
-    @skip_pre_hopper
-    @pytest.mark.parametrize("eagle3_one_model", [True, False],
-                             ids=["one_model", "two_model"])
-    def test_eagle3(self, eagle3_one_model):
-        pytorch_config = dict(
-            disable_overlap_scheduler=True,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
-        )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.7)
-
-        eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B"
-        target_model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
-
-        draft_len = 4
-        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir,
-                                          eagle3_one_model=eagle3_one_model)
-
-        with LLM(model=target_model_dir,
-                 **pytorch_config,
-                 kv_cache_config=kv_cache_config,
-                 speculative_config=spec_config,
-                 build_config=None) as llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @skip_pre_hopper
     def test_ngram(self):
         pytorch_config = dict(
@@ -370,26 +341,24 @@ def test_auto_dtype_tp8(self):
                           extra_evaluator_kwargs=dict(apply_chat_template=True))
 
     @pytest.mark.skip_less_mpi_world_size(8)
-    @pytest.mark.parametrize("eagle3_one_model", [True, False],
-                             ids=["one_model", "two_model"])
+    @pytest.mark.parametrize("eagle3_one_model", [True, False])
     def test_eagle3_tp8(self, eagle3_one_model):
         model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
+        pytorch_config = dict(disable_overlap_scheduler=True)
         with LLM(model_path,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
-                 kv_cache_config=kv_cache_config) as llm:
-            task = MMLU(self.MODEL_NAME)
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
+            task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GPQADiamond(self.MODEL_NAME)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=dict(apply_chat_template=True))
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
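
For reference, the Eagle3 speculative-decoding pattern exercised by test_eagle3_tp8 can be reproduced outside the test harness. The sketch below is illustration only, not part of this change: the symbols (LLM, EagleDecodingConfig, KvCacheConfig) appear in the diff itself, but the import locations, the placeholder model paths, and the generate() call are assumptions.

    # Minimal sketch, assuming the tensorrt_llm LLM API this test file uses.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig  # assumed import path

    spec_config = EagleDecodingConfig(
        max_draft_len=4,                      # draft tokens proposed per step
        speculative_model_dir="<eagle3-draft-model-dir>",  # placeholder path
        eagle3_one_model=True)                # True: fused one-model mode; False: two-model mode
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)

    with LLM("<target-model-dir>",            # placeholder path
             tensor_parallel_size=8,
             speculative_config=spec_config,
             kv_cache_config=kv_cache_config,
             disable_overlap_scheduler=True) as llm:
        for out in llm.generate(["The capital of France is"]):
            print(out.outputs[0].text)
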
@@ -469,18 +438,21 @@ def test_chunked_prefill(self, attn_backend):
 
     @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("torch_compile", [True, False])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1)],
                              ids=["tp8"])
-    def test_fp8_eagle3(self, cuda_graph, tp_size, pp_size, ep_size):
+    def test_fp8_eagle3(self, tp_size, pp_size, ep_size, torch_compile):
         model_path = f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
         eagle_model_dir = f"{llm_models_root()}/Llama-4-Maverick-17B-128E-Eagle3"
         spec_config = EagleDecodingConfig(max_draft_len=3,
                                           speculative_model_dir=eagle_model_dir)
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.75)
         pytorch_config = dict(
-            disable_overlap_scheduler=not cuda_graph,
             cuda_graph_config=CudaGraphConfig(max_batch_size=8),
-            enable_attention_dp=False)
+            enable_attention_dp=False,
+            torch_compile_config=TorchCompileConfig(
+                enable_fullgraph=torch_compile))
         with LLM(model_path,
                  kv_cache_config=kv_cache_config,
                  tensor_parallel_size=tp_size,
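
The new torch_compile parametrization drives TorchCompileConfig, which gates full-graph compilation of the PyTorch backend config built in this hunk. A minimal sketch of the toggle, with import locations assumed:

    from tensorrt_llm.llmapi import CudaGraphConfig, TorchCompileConfig  # assumed import path

    # enable_fullgraph=True requests a single full-graph torch.compile of the
    # model; False exercises the uncompiled path (the second test variant).
    pytorch_config = dict(
        cuda_graph_config=CudaGraphConfig(max_batch_size=8),
        enable_attention_dp=False,
        torch_compile_config=TorchCompileConfig(enable_fullgraph=True))
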