Draft: MultiLayer Eagle

IzzyPutterman · IzzyPutterman · commit 9e57506820cb · 2025-08-25T17:32:49.000-07:00
Signed-off-by: Izzy Putterman <iputterman@nvidia.com>
diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py
@@ -150,6 +150,7 @@ def __init__(
         self.dtype = config.torch_dtype
         self.hidden_size = config.hidden_size
         self.mapping = model_config.mapping
+        self.num_layers = model_config.pretrained_config.num_hidden_layers
 
         if hasattr(config, "target_hidden_size"):
             self.hidden_size_in = config.target_hidden_size
@@ -163,7 +164,13 @@ def __init__(
                              bias=getattr(config, "bias", False),
                              dtype=config.torch_dtype)
 
-        self.midlayer = Eagle3DecoderLayer(model_config, start_layer_idx)
+        if self.num_layers > 1:
+            self.midlayer = nn.ModuleList([
+                Eagle3DecoderLayer(model_config, start_layer_idx + i)
+                for i in range(self.num_layers)
+            ])
+        else:
+            self.midlayer = Eagle3DecoderLayer(model_config, start_layer_idx)
 
         self.norm = RMSNorm(hidden_size=config.hidden_size,
                             eps=config.rms_norm_eps,
@@ -212,11 +219,19 @@ def forward(
         # we expect that to happen outside the model definition. This helps us
         # avoid data-dependent control flow and gives us better CUDA graph
         # coverage.
-        hidden_states, residual = self.midlayer(position_ids=position_ids,
+        if self.num_layers > 1:
+            for layer in self.midlayer:
+                hidden_states, residual = layer(position_ids=position_ids,
                                                 embeds=inputs_embeds,
                                                 hidden_states=hidden_states,
                                                 attn_metadata=attn_metadata,
                                                 spec_metadata=spec_metadata)
+        else:
+            hidden_states, residual = self.midlayer(position_ids=position_ids,
+                                                    embeds=inputs_embeds,
+                                                    hidden_states=hidden_states,
+                                                    attn_metadata=attn_metadata,
+                                                    spec_metadata=spec_metadata)
 
         hidden_states, hidden_states_to_save = self.norm(
             hidden_states, residual)
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -252,6 +252,7 @@ def create_py_executor(
         with mem_monitor.observe_creation_stage(
                 _ExecutorCreationStage.MODEL_ENGINE_DRAFT):
             draft_spec_config = copy.copy(spec_config)
+            draft_spec_config.update_for_draft_init()
             draft_pytorch_backend_config = copy.copy(pytorch_backend_config)
             if spec_config.load_format == "dummy":
                 draft_pytorch_backend_config.load_format = LoadFormat.DUMMY
diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py
@@ -151,7 +151,8 @@ def get_num_spec_layers(spec_config):
     if spec_config.spec_dec_mode.is_mtp():
         return spec_config.num_nextn_predict_layers
     if spec_config.spec_dec_mode.is_eagle3_one_model():
-        return 1
+        num_eagle_layers = spec_config.num_eagle_layers
+        return num_eagle_layers if num_eagle_layers is not None else 1
     return 0
 
 
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
@@ -391,6 +391,11 @@ def validate(self) -> None:
         Do any additional error checking here.
         """
 
+    def update_for_draft_init(self):
+        """
+        Update the config for draft model initialization (called on the copied draft spec config); no-op here.
+        """
+
     @functools.cached_property
     def spec_dec_mode(self):
         # spec_dec_mode has more functionality than the raw decoding_mode string.
@@ -445,7 +450,7 @@ def spec_dec_mode(self):
             return TorchSpeculativeDecodingMode.EAGLE3_ONE_MODEL
         return TorchSpeculativeDecodingMode.EAGLE3
 
-    @functools.cached_property
+    @property
     def num_capture_layers(self):
         """
         Returns the number of layers to capture of the target model.
@@ -456,6 +461,16 @@ def num_capture_layers(self):
             return len(self.eagle3_layers_to_capture)
         return 3
 
+    def update_for_draft_init(self):
+        """
+        Update the config for draft model init: unless eagle3_one_model, capture only layer num_eagle_layers - 1.
+        """
+        if not self.eagle3_one_model:
+            num_layers = self.num_eagle_layers
+            if num_layers is None:  # unset is treated as a single layer, so index 0 is captured
+                num_layers = 1
+            self.eagle3_layers_to_capture = {num_layers - 1}  # set literal: set(int) raises TypeError
+
 
 class UserProvidedDecodingConfig(DecodingBaseConfig):
     # Cannot use real type annotations due to circular imports