@@ -336,36 +336,44 @@ def test_ngram(self):
336336 task = GSM8K (self .MODEL_NAME )
337337 task .evaluate (llm )
338338
339- @pytest .mark .parametrize ("overlap_scheduler" , [False ])
340- def test_eagle3 (self , overlap_scheduler ):
339+ @parametrize_with_ids ("overlap_scheduler" , [True , False ])
340+ @parametrize_with_ids ("eagle3_one_model" , [True , False ])
341+ def test_eagle3 (self , overlap_scheduler , eagle3_one_model ):
341342 speculative_decoding_config = {
342343 "decoding_type" : "Eagle" ,
343344 "max_draft_len" : 4 ,
344345 "speculative_model_dir" :
345346 f"{ llm_models_root ()} /EAGLE3-LLaMA3.1-Instruct-8B" ,
346- "eagle3_one_model" : False
347- }
348- kv_cache_config = {
349- "free_gpu_memory_fraction" : 0.5 ,
350- "enable_block_reuse" : False
347+ "eagle3_one_model" : eagle3_one_model
351348 }
352349 ctx_server_config = {
353- "disable_overlap_scheduler" : True ,
350+ "disable_overlap_scheduler" :
351+ True , # BS=1 does not need overlap scheduling
354352 "speculative_config" : speculative_decoding_config ,
355- "kv_cache_config" : kv_cache_config ,
353+ "kv_cache_config" : {
354+ "free_gpu_memory_fraction" : 0.5 ,
355+ "enable_block_reuse" : True # reuse on context requests
356+ },
356357 "max_num_tokens" : 13393 * 2 ,
358+ "max_batch_size" : 1 ,
357359 "cache_transceiver_config" : {
358360 "backend" : "default"
359- }
361+ },
362+ "cuda_graph_config" : None ,
360363 }
361364 gen_server_config = {
362365 "disable_overlap_scheduler" : not overlap_scheduler ,
363366 "speculative_config" : speculative_decoding_config ,
364- "kv_cache_config" : kv_cache_config ,
367+ "kv_cache_config" : {
368+ "free_gpu_memory_fraction" : 0.5 ,
369+ "enable_block_reuse" : False
370+ },
365371 "max_num_tokens" : 13393 * 2 ,
372+ "max_batch_size" : 16 ,
366373 "cache_transceiver_config" : {
367374 "backend" : "default"
368- }
375+ },
376+ "cuda_graph_config" : None ,
369377 }
370378 disaggregated_server_config = {
371379 "hostname" : "localhost" ,