Implement SupportsEagle3 interface for Llama4 multimodal models

rahul-tuli · rahul-tuli · commit 936ad291a7d8 · 2025-09-30T15:24:11.000Z
Add Eagle3 support to Llama4ForConditionalGeneration by implementing
set_aux_hidden_state_layers() and get_eagle3_aux_hidden_state_layers()
methods. Both methods delegate to the underlying Llama4ForCausalLM
language model, enabling Eagle3 speculative decoding with Llama4
multimodal verifier models.

This allows text-only Eagle3 drafters to work with Llama4 multimodal
verifiers by consuming auxiliary hidden states from specified layers.
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
@@ -54,7 +54,8 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (MultiModalEmbeddings, SupportsEagle3,
+                         SupportsMultiModal, SupportsPP)
 from .llama4 import Llama4ForCausalLM
 from .utils import AutoWeightsLoader, flatten_bn, maybe_prefix
 from .vision import run_dp_sharded_vision_model
@@ -708,8 +709,8 @@ def get_dummy_mm_data(
     info=Mllama4ProcessingInfo,
     dummy_inputs=Mllama4DummyInputsBuilder,
 )
-class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                     SupportsPP):
+class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
+                                     SupportsEagle3):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -758,6 +759,23 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        """Forward the EAGLE3 aux hidden-state layer indices to the LM."""
+        language_model = self.language_model  # wrapped Llama4ForCausalLM
+        assert hasattr(language_model, 'set_aux_hidden_state_layers')
+        language_model.set_aux_hidden_state_layers(layers)
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        """Return the layer indices used for EAGLE3 auxiliary hidden states.
+
+        Note: The GPU model runner will override this with layers from
+        the speculative config if available, providing dynamic configuration.
+        """
+        # Delegate to Llama4ForCausalLM; its result must be returned, not dropped.
+        assert hasattr(self.language_model,
+                       'get_eagle3_aux_hidden_state_layers')
+        return self.language_model.get_eagle3_aux_hidden_state_layers()
+
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[Llama4ImagePatchInputs]:
         # num_images, 1, num_chunks, channel, image_size, image_size