@@ -237,7 +237,8 @@ def test_eagle3(self, eagle3_one_model):
237237 disable_overlap_scheduler = True ,
238238 cuda_graph_config = CudaGraphConfig (batch_sizes = [1 ]),
239239 )
240- kv_cache_config = KvCacheConfig (enable_block_reuse = False )
240+ kv_cache_config = KvCacheConfig (enable_block_reuse = False ,
241+ free_gpu_memory_fraction = 0.7 )
241242
242243 eagle_model_dir = f"{ llm_models_root ()} /EAGLE3-LLaMA3.1-Instruct-8B"
243244 target_model_dir = f"{ llm_models_root ()} /llama-3.1-model/Llama-3.1-8B-Instruct"
@@ -374,12 +375,14 @@ def test_auto_dtype_tp8(self):
374375 def test_eagle3_tp8 (self , eagle3_one_model ):
375376 model_path = f"{ llm_models_root ()} /llama-3.3-models/Llama-3.3-70B-Instruct"
376377 eagle_model_dir = f"{ llm_models_root ()} /EAGLE3-LLaMA3.3-Instruct-70B"
378+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.7 )
377379 spec_config = EagleDecodingConfig (max_draft_len = 4 ,
378380 speculative_model_dir = eagle_model_dir ,
379381 eagle3_one_model = eagle3_one_model )
380382 with LLM (model_path ,
381383 tensor_parallel_size = 8 ,
382- speculative_config = spec_config ) as llm :
384+ speculative_config = spec_config ,
385+ kv_cache_config = kv_cache_config ) as llm :
383386 task = MMLU (self .MODEL_NAME )
384387 task .evaluate (llm )
385388 task = GSM8K (self .MODEL_NAME )
0 commit comments