@@ -226,5 +226,108 @@ def test_deepseek_eagle3():
     pass
 
 
+@pytest.mark.parametrize("use_one_model", [True, False])
+def test_multi_eagle3(use_one_model: bool):
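+    """Smoke test for multi-layer Eagle3 speculative decoding.
+
+    Builds a dummy two-layer Eagle3 draft config on the fly, pairs it with
+    Llama-3.1-8B-Instruct as the target model, and streams a short greedy
+    generation with eagle3_one_model both enabled and disabled.
+    """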
+    use_cuda_graph = True
+    attn_backend = "TRTLLM"
+    disable_overlap_scheduler = False
+    enable_block_reuse = False
+    enable_chunked_prefill = False
+
+    # Eagle3 one model works with overlap scheduler and block reuse.
+    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+    if total_mem_gb < 150:
+        pytest.skip("Not enough memory to load target + draft model")
+
+    models_path = llm_models_root()
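+    # Minimal Llama-style Eagle3 draft config with two hidden layers. It is
+    # written to a temporary directory below and the draft weights are loaded
+    # with load_format="dummy", so no real draft checkpoint is required.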
+    eagle_config = {
+        'architectures': ['LlamaForCausalLMEagle3'],
+        'attention_bias': False,
+        'attention_dropout': 0.0,
+        'bos_token_id': 128000,
+        'eos_token_id': [128001, 128008, 128009],
+        'eagle_config': {
+            'use_aux_hidden_state': False,
+            'use_input_layernorm_in_first_layer': True,
+            'use_last_layernorm': True,
+            'use_mtp_layernorm': False
+        },
+        'head_dim': 128,
+        'hidden_act': 'silu',
+        'hidden_size': 4096,
+        'initializer_range': 0.02,
+        'intermediate_size': 16384,
+        'max_position_embeddings': 131072,
+        'mlp_bias': False,
+        'model_type': 'llama',
+        'num_attention_heads': 32,
+        'num_eagle_features': 1,
+        'num_hidden_layers': 2,
+        'num_key_value_heads': 8,
+        'pretraining_tp': 1,
+        'rms_norm_eps': 1e-05,
+        'rope_scaling': {
+            'factor': 8.0,
+            'high_freq_factor': 4.0,
+            'low_freq_factor': 1.0,
+            'original_max_position_embeddings': 8192,
+            'rope_type': 'llama3'
+        },
+        'rope_theta': 500000.0,
+        'tie_word_embeddings': False,
+        'torch_dtype': 'bfloat16',
+        'transformers_version': '4.52.4',
+        'use_cache': True,
+        'vocab_size': 128256,
+        'draft_vocab_size': 128256
+    }
+    with tempfile.TemporaryDirectory() as temp_dir:
+        eagle_model_dir = Path(temp_dir)
+        config_path = eagle_model_dir / "config.json"
+        with config_path.open("w") as f:
+            json.dump(eagle_config, f, indent=2)
+        target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
+
+        # bs > 1 gives non-deterministic results when doing IFB. There is a
+        # slight chance that ref and spec outputs do not match 100%.
+        max_batch_size = 16
+        max_draft_len = 3
+        kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
+                                        free_gpu_memory_fraction=0.5)
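+        # Capture CUDA graphs for batch size 1 only when use_cuda_graph is set.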
+        cuda_graph_config = CudaGraphConfig(
+            batch_sizes=[1]) if use_cuda_graph else None
+
+        llm_common_config = dict(
+            model=target_model_dir,
+            attn_backend=attn_backend,
+            disable_overlap_scheduler=disable_overlap_scheduler,
+            cuda_graph_config=cuda_graph_config,
+            max_batch_size=max_batch_size,
+            kv_cache_config=kv_cache_config,
+            enable_chunked_prefill=enable_chunked_prefill,
+        )
+
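+        # Eagle3 speculative decoding config; num_eagle_layers=2 matches the
+        # two hidden layers in the dummy draft config above.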
+        spec_config = EagleDecodingConfig(
+            max_draft_len=max_draft_len,
+            speculative_model_dir=eagle_model_dir,
+            # Llama 3 does not support one model eagle.
+            eagle3_one_model=use_one_model,
+            num_eagle_layers=2,
+            load_format="dummy")
+
+        llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
+
+        tok_ids = llm_spec.tokenizer.encode("The future of AI is")
+
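+        # Greedy-decode 32 tokens through the streaming async API. The loop body
+        # is a no-op, so this only smoke-tests that multi-layer Eagle3 runs end
+        # to end; outputs are not compared against a reference.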
+        sampling_params = SamplingParams(max_tokens=32, temperature=0)
+        for output in llm_spec.generate_async(tok_ids,
+                                              sampling_params,
+                                              streaming=True):
+            pass
+
+
 if __name__ == "__main__":
     unittest.main()