@@ -226,5 +226,106 @@ def test_deepseek_eagle3():
     pass


+@pytest.mark.parametrize("use_one_model", [True, False])
+def test_multi_eagle3(use_one_model: bool):
+    use_cuda_graph = True
+    attn_backend = "TRTLLM"
+    disable_overlap_scheduler = False
+    enable_block_reuse = False
+    enable_chunked_prefill = False
+
+    # Eagle3 one model works with overlap scheduler and block reuse.
+    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+    if total_mem_gb < 150:
+        pytest.skip("Not enough memory to load target + draft model")
+
+    models_path = llm_models_root()
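+    # The dict below is a minimal Eagle3 draft-model config shaped like
+    # Llama-3.1-8B (hidden_size 4096, 32 heads, 8 KV heads, llama3 rope
+    # scaling). It is written to a temp dir so speculative_model_dir can
+    # point at it without a real draft checkpoint on disk.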
+    eagle_config = {
+        'architectures': ['LlamaForCausalLMEagle3'],
+        'attention_bias': False,
+        'attention_dropout': 0.0,
+        'bos_token_id': 128000,
+        'eos_token_id': [128001, 128008, 128009],
+        'eagle_config': {
+            'use_aux_hidden_state': False,
+            'use_input_layernorm_in_first_layer': True,
+            'use_last_layernorm': True,
+            'use_mtp_layernorm': False
+        },
+        'head_dim': 128,
+        'hidden_act': 'silu',
+        'hidden_size': 4096,
+        'initializer_range': 0.02,
+        'intermediate_size': 16384,
+        'max_position_embeddings': 131072,
+        'mlp_bias': False,
+        'model_type': 'llama',
+        'num_attention_heads': 32,
+        'num_eagle_features': 1,
+        'num_hidden_layers': 2,
+        'num_key_value_heads': 8,
+        'pretraining_tp': 1,
+        'rms_norm_eps': 1e-05,
+        'rope_scaling': {
+            'factor': 8.0,
+            'high_freq_factor': 4.0,
+            'low_freq_factor': 1.0,
+            'original_max_position_embeddings': 8192,
+            'rope_type': 'llama3'
+        },
+        'rope_theta': 500000.0,
+        'tie_word_embeddings': False,
+        'torch_dtype': 'bfloat16',
+        'transformers_version': '4.52.4',
+        'use_cache': True,
+        'vocab_size': 128256,
+        'draft_vocab_size': 128256
+    }
+    with tempfile.TemporaryDirectory() as temp_dir:
+        eagle_model_dir = Path(temp_dir)
+        config_path = eagle_model_dir / "config.json"
+        with config_path.open("w") as f:
+            json.dump(eagle_config, f, indent=2)
+        target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
+
+        # bs > 1 is non-deterministic with in-flight batching (IFB), so there
+        # is a small chance that the reference and speculative outputs do not
+        # match 100%.
+        max_batch_size = 16
+        max_draft_len = 3
+        kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
+                                        free_gpu_memory_fraction=0.5)
+        cuda_graph_config = CudaGraphConfig(
+            batch_sizes=[1]) if use_cuda_graph else None
+
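+        # Base kwargs for the target LLM; load_format="dummy" presumably
+        # initializes random weights instead of loading a real checkpoint,
+        # keeping the test lightweight.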
+        llm_common_config = dict(
+            model=target_model_dir,
+            attn_backend=attn_backend,
+            disable_overlap_scheduler=disable_overlap_scheduler,
+            cuda_graph_config=cuda_graph_config,
+            max_batch_size=max_batch_size,
+            kv_cache_config=kv_cache_config,
+            enable_chunked_prefill=enable_chunked_prefill,
+            load_format="dummy",
+        )
+
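+        # Eagle3 speculative-decoding config pointing at the 2-layer dummy
+        # draft config written above.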
+        spec_config = EagleDecodingConfig(
+            max_draft_len=max_draft_len,
+            speculative_model_dir=eagle_model_dir,
+            # Llama 3 does not support one model eagle.
+            eagle3_one_model=use_one_model,
+            num_eagle_layers=2,
+            load_format="dummy")
+
+        llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
+
+        tok_ids = llm_spec.tokenizer.encode("The future of AI is")
+
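+        # Greedy, streamed generation; the loop just drains the stream to
+        # exercise the speculative path end to end (no output assertions).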
+        sampling_params = SamplingParams(max_tokens=32, temperature=0)
+        for output in llm_spec.generate_async(tok_ids,
+                                              sampling_params,
+                                              streaming=True):
+            pass
+
+
 if __name__ == "__main__":
     unittest.main()