 from collections.abc import Callable
-from functools import partial
 from typing import Dict, List, Optional, Tuple, Union

 import torch
@@ -66,7 +65,6 @@ def __init__(
         enable_fused_gemm_swiglu: bool = False,
         enable_fused_gemm_attn_scaling: bool = False,
         enable_trtllm_gen: bool = False,
-        post_load_weights_hook: Optional[Callable] = None,
     ):
         # First, initialize the base class.
         super().__init__(
@@ -88,7 +86,6 @@ def __init__(
         self.enable_fused_gemm_swiglu = enable_fused_gemm_swiglu
         self.enable_fused_gemm_attn_scaling = enable_fused_gemm_attn_scaling
         self.enable_trtllm_gen = enable_trtllm_gen
-        self.post_load_weights_hook = post_load_weights_hook
         self.position_ids = None

     def load_weights(self, weights: List[Dict]):
@@ -123,9 +120,6 @@ def load_weights(self, weights: List[Dict]):
                 self.weight.view(torch.uint8),
                 128).view(torch.float8_e4m3fn)

-        if self.post_load_weights_hook is not None:
-            self.post_load_weights_hook(self)
-
     # Override apply_linear instead of forward so that we can reuse the AllReduce/AllGather logic in the parent class.
     def apply_linear(
         self,
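
The hook call removed from `load_weights` implies that something else now drives the post-load step. Below is a minimal, hypothetical sketch of one way such a dispatcher could look, assuming a model-level loader walks modules and invokes an optional `post_load_weights()` method; the `load_all_weights` name and the `weights_by_module` mapping are illustrative only and not part of this change:

```python
# Hypothetical sketch (not code from this diff) of the dispatch pattern the refactor
# relies on: after weights are loaded, the loader looks for an optional
# post_load_weights() method on each module, so per-layer hook arguments are not needed.
import torch.nn as nn


def load_all_weights(model: nn.Module, weights_by_module: dict) -> None:
    """Load weights module by module, then run each module's post-load step."""
    for name, module in model.named_modules():
        if hasattr(module, "load_weights") and name in weights_by_module:
            module.load_weights(weights_by_module[name])
    # Second pass: let modules finalize derived quantities (e.g. fused-kernel scales)
    # once all of their submodules' weights are in place.
    for module in model.modules():
        post_hook = getattr(module, "post_load_weights", None)
        if callable(post_hook):
            post_hook()
```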
@@ -298,17 +292,6 @@ def __init__(self,
             enable_trtllm_gen=True,
         )

-        # After loading both gate_up_proj and down_proj, we need to set the scales needed by the special kernels and by
-        # the trtllm-gen gemm+swiglu kernel.
-        def post_load_weights_hook(gate_up_proj, down_proj):
-            if gate_up_proj.has_fp8_qdq:
-                # For the special gemm+swiglu kernel, we need to set the inverse of the output scale, which is the inverse
-                # of down_proj's combined input scale.
-                gate_up_proj.inv_output_scale = 1.0 / down_proj.input_scale
-                # For the trtllm-gen gemm+swiglu kernel, we need to set the global scale, which is gate_up_proj's
-                # combined input scale times inv_output_scale.
-                gate_up_proj.trtllm_gen_global_scale = gate_up_proj.combined_scale * gate_up_proj.inv_output_scale
-
         self.down_proj = Llama4MinLatencyLinear(
             self.intermediate_size,
             self.hidden_size,
@@ -320,10 +303,19 @@ def post_load_weights_hook(gate_up_proj, down_proj):
             reduce_output=reduce_output,
             skip_create_weights_in_init=config.skip_create_weights_in_init,
             enable_trtllm_gen=True,
-            post_load_weights_hook=partial(post_load_weights_hook,
-                                           self.gate_up_proj),
         )

+    # After loading both gate_up_proj and down_proj, we need to set the scales needed by the special kernels and by
+    # the trtllm-gen gemm+swiglu kernel.
+    def post_load_weights(self):
+        if self.gate_up_proj.has_fp8_qdq:
+            # For the special gemm+swiglu kernel, we need to set the inverse of the output scale, which is the inverse
+            # of down_proj's combined input scale.
+            self.gate_up_proj.inv_output_scale = 1.0 / self.down_proj.input_scale
+            # For the trtllm-gen gemm+swiglu kernel, we need to set the global scale, which is gate_up_proj's
+            # combined input scale times inv_output_scale.
+            self.gate_up_proj.trtllm_gen_global_scale = self.gate_up_proj.combined_scale * self.gate_up_proj.inv_output_scale
+
     def forward(
         self,
         x: Union[torch.Tensor, Fp4QuantizedTensor],
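
The comments in the new `post_load_weights()` describe the scale folding that the gemm+swiglu kernels expect. A small numeric sketch of that arithmetic follows; the scale values are made up, and only the two formulas (`inv_output_scale` and `trtllm_gen_global_scale`) come from the diff:

```python
# Toy sketch of the FP8 scale bookkeeping performed in post_load_weights() above.
import torch

combined_scale = torch.tensor(0.02)          # gate_up_proj's combined FP8 dequant scale (made up)
down_proj_input_scale = torch.tensor(0.05)   # quantization scale expected at down_proj's input (made up)

# The fused gemm+swiglu kernel writes its activation in down_proj's input quantization,
# so its output scale is down_proj's combined input scale and the kernel takes the inverse.
inv_output_scale = 1.0 / down_proj_input_scale

# The trtllm-gen gemm+swiglu kernel folds "dequantize, then requantize for down_proj"
# into a single global scale.
trtllm_gen_global_scale = combined_scale * inv_output_scale

# Folding check: dequantizing a raw accumulator and requantizing it matches applying
# the single global scale.
raw_acc = torch.tensor(100.0)
assert torch.allclose(raw_acc * combined_scale / down_proj_input_scale,
                      raw_acc * trtllm_gen_global_scale)
```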
@@ -450,7 +442,6 @@ def __init__(
         weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.
         VANILLA,
         apply_router_weight_on_input: bool = False,
-        post_load_weights_hook: Optional[Callable] = None,
     ):

         super().__init__(
@@ -466,8 +457,6 @@ def __init__(
             apply_router_weight_on_input=apply_router_weight_on_input,
         )

-        self.post_load_weights_hook = post_load_weights_hook
-
         # Enable min-latency mode for Llama4 Maverick TP8 EP1.
         self.enable_min_latency_fused_moe = False
         if num_experts == 128 \
@@ -481,12 +470,6 @@ def __init__(
                 and apply_router_weight_on_input:
             self.enable_min_latency_fused_moe = True

-    def load_weights(self, weights: List[Dict]):
-        super().load_weights(weights)
-
-        if self.post_load_weights_hook:
-            self.post_load_weights_hook(self)
-
     def forward(
         self,
         x: Union[torch.Tensor, Fp4QuantizedTensor],
@@ -560,22 +543,6 @@ def __init__(
             overridden_tp_size=1 if self.enable_attention_dp else None,
             reduce_output=False)

-        def post_load_weights_hook(shared_expert, experts):
-            # Set min-latency quant scales for routed experts if we plan to use min-latency MoE kernels.
-            # This is because the routed experts' input scale is after the score multiplication, so we must use the
-            # pre-score scaling input scale, which happens to be shared expert's input scale.
-            if experts.enable_min_latency_fused_moe and hasattr(
-                    shared_expert.gate_up_proj, "input_scale"):
-                pre_score_scaling_input_scale = shared_expert.gate_up_proj.input_scale
-                experts.min_latency_quant_scales = FusedMoEQuantScalesFP8(
-                    fc1_dequant=experts.fc31_dequant.data /
-                    experts.fc31_input_dequant.data *
-                    pre_score_scaling_input_scale,
-                    fc2_quant=experts.fc2_quant,
-                    fc2_dequant=experts.fc2_dequant,
-                    fc1_input_dequant=pre_score_scaling_input_scale,
-                )
-
         self.experts = Llama4MinLatencyFusedMoE(
             routing_method=Llama4RenormalizeMoeRoutingMethod(top_k),
             num_experts=num_experts,
@@ -587,8 +554,7 @@ def post_load_weights_hook(shared_expert, experts):
             weight_loading_mode=MoEWeightLoadingMode.FUSED_GATE_UP_PROJ,
             model_config=model_config,
             apply_router_weight_on_input=True,
-            post_load_weights_hook=partial(post_load_weights_hook,
-                                           self.shared_expert))
+        )

         self.router = Llama4MinLatencyLinear(
             hidden_size,
@@ -597,6 +563,22 @@ def post_load_weights_hook(shared_expert, experts):
             dtype=model_config.pretrained_config.torch_dtype,
             quant_config=None)

+    def post_load_weights(self):
+        # Set min-latency quant scales for routed experts if we plan to use min-latency MoE kernels.
+        # This is because the routed experts' input scale is after the score multiplication, so we must use the
+        # pre-score scaling input scale, which happens to be shared expert's input scale.
+        if self.experts.enable_min_latency_fused_moe and hasattr(
+                self.shared_expert.gate_up_proj, "input_scale"):
+            pre_score_scaling_input_scale = self.shared_expert.gate_up_proj.input_scale
+            self.experts.min_latency_quant_scales = FusedMoEQuantScalesFP8(
+                fc1_dequant=self.experts.fc31_dequant.data /
+                self.experts.fc31_input_dequant.data *
+                pre_score_scaling_input_scale,
+                fc2_quant=self.experts.fc2_quant,
+                fc2_dequant=self.experts.fc2_dequant,
+                fc1_input_dequant=pre_score_scaling_input_scale,
+            )
+
     def compute_routed_output(
         self,
         hidden_states,
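
The `fc1_dequant` expression in the new MoE `post_load_weights()` swaps the calibrated input scale for the pre-score one. A toy check of that algebra is below, under the assumption (typical for FP8 GEMM dequant factors, but not stated in the diff) that `fc31_dequant` was built as weight scale times the post-score input scale; all numbers are made up:

```python
# Toy illustration of the scale swap done in the MoE post_load_weights() above.
import torch

weight_scale = torch.tensor(0.004)            # per-expert FC1 weight scale (made up)
post_score_input_scale = torch.tensor(0.08)   # what fc31_input_dequant holds (made up)
pre_score_input_scale = torch.tensor(0.02)    # shared_expert.gate_up_proj.input_scale (made up)

# Assumed construction of the original dequant factor.
fc31_dequant = weight_scale * post_score_input_scale

# Divide out the post-score input scale and multiply in the pre-score one,
# exactly the expression used for fc1_dequant in the diff.
fc1_dequant = fc31_dequant / post_score_input_scale * pre_score_input_scale
assert torch.allclose(fc1_dequant, weight_scale * pre_score_input_scale)
```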