
Commit bcaa98f

V1 loader default (#4251)
* v1 loader
* update
* update
1 parent e98c1c2 commit bcaa98f

File tree

4 files changed: +68 −9 lines changed


fastdeploy/engine/args_utils.py

Lines changed: 2 additions & 2 deletions
@@ -387,7 +387,7 @@ class EngineArgs:
     Configuration for early stop.
     """

-    load_choices: str = "default"
+    load_choices: str = "default_v1"
     """The format of the model weights to load.
     Options include:
     - "default": default loader.
@@ -715,7 +715,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=str,
             default=EngineArgs.load_choices,
             help="The format of the model weights to load.\
-                default/new_loader.",
+                default/default_v1.",
         )

         # CacheConfig parameters group
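This flips the default loader selection to the new v1 path while keeping the old loader reachable by name. A minimal sketch of how the new default surfaces, assuming only what the diff shows (the `load_choices` dataclass field and its use as the CLI default); the entry point and flag spelling in the comment are inferred, not verified here:

from fastdeploy.engine.args_utils import EngineArgs

# The dataclass default now resolves to the v1 loader.
print(EngineArgs.load_choices)  # "default_v1"

# To pin the old behaviour explicitly, pass the value on the command line,
# e.g. (flag name inferred from the field name):
#   ... --load_choices default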

fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py

Lines changed: 12 additions & 6 deletions
@@ -184,12 +184,20 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
     def create_weights(self, layer: nn.Layer, **extra_weight_attrs):

         if current_platform.is_cuda():
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2]
-            self.down_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size, layer.hidden_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.hidden_size,
+                layer.moe_intermediate_size * 2,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
         else:
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size * 2, layer.hidden_size]
-            self.down_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.moe_intermediate_size * 2,
+                layer.hidden_size,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}

         layer.up_gate_proj_weight = layer.create_parameter(
@@ -209,14 +217,12 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
             {
                 "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
                 "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
-                "model_format": extra_weight_attrs.get("model_format", ""),
             },
         )
         set_weight_attrs(
             layer.down_proj_weight,
             {
                 "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
                 "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
-                "model_format": extra_weight_attrs.get("model_format", ""),
             },
         )
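The weight shapes now use the per-rank expert count rather than the global one, so each worker only materialises parameters for the experts it hosts. A rough, illustrative sketch of that relationship, assuming experts are split evenly across ranks (the helper below is hypothetical and not part of the diff):

def local_expert_count(num_experts: int, world_size: int, rank: int) -> int:
    """How many experts a single rank hosts when experts are sharded evenly."""
    base, remainder = divmod(num_experts, world_size)
    # Earlier ranks absorb any leftover experts.
    return base + (1 if rank < remainder else 0)

# Example: 64 experts over 8 ranks -> 8 local experts per rank, so on CUDA
# up_gate_proj_weight is allocated as [8, hidden_size, moe_intermediate_size * 2].
print(local_expert_count(64, 8, 0))  # 8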

fastdeploy/model_executor/utils.py

Lines changed: 51 additions & 0 deletions
@@ -14,14 +14,18 @@
 # limitations under the License.
 """

+import os
 import re
 from contextlib import contextmanager
 from typing import Any, Optional, Union

 import paddle
+from paddleformers.utils.log import logger

+from fastdeploy import envs
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.utils import get_tensor
+from fastdeploy.platforms import current_platform


 class BitMaskTracker:
@@ -194,6 +198,53 @@ def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
     return fn


+def is_pre_sliced_weight(model_path):
+    rank_dirs = [
+        f for f in os.listdir(model_path) if f.startswith("rank") and os.path.isdir(os.path.join(model_path, f))
+    ]
+    return len(rank_dirs) > 1
+
+
+def v1_loader_support(fd_config):
+    _v1_no_support_archs = ["Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration"]
+
+    def _err_msg(msg: str) -> str:
+        logger.info(msg + "; fallback to the v0 loader for model loading.")
+
+    if not current_platform.is_cuda():
+        _err_msg("v1loader currently does not support backends other than CUDA")
+        return False
+
+    if is_pre_sliced_weight(fd_config.model_config.model):
+        _err_msg("v1 loader currently does not support pre-sliced weights")
+        return False
+
+    if fd_config.parallel_config.use_ep:
+        _err_msg("v1 loader currently does not support expert parallelism")
+        return False
+
+    if envs.FD_MOE_BACKEND.lower() == "marlin":
+        _err_msg("v1 loader currently does not support marlin backend")
+        return False
+
+    if fd_config.quant_config is not None:
+        if fd_config.quant_config.name() == "mix_quant":
+            moe_quant_type = fd_config.quant_config.moe_quant_type
+            dense_quant_type = fd_config.quant_config.dense_quant_type
+        else:
+            moe_quant_type = fd_config.quant_config.name()
+            dense_quant_type = fd_config.quant_config.name()
+        unsupported_quant = {"w4a8", "w4afp8", "wint2"}
+
+        if unsupported_quant & {moe_quant_type, dense_quant_type}:
+            _err_msg("v1 loader currently does not support w4a8/w4afp8/win2 quantization")
+            return False
+    if fd_config.model_config.architectures[0] in _v1_no_support_archs:
+        _err_msg(f"v1 loader currently does not support {fd_config.model_config.architectures[0]}")
+        return False
+    return True
+
+
 @contextmanager
 def temporary_dtype(dtype: str):
     """Temporarily set Paddle default dtype"""

fastdeploy/worker/worker_process.py

Lines changed: 3 additions & 1 deletion
@@ -44,6 +44,7 @@
 from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
 from fastdeploy.inter_communicator import ExistTaskStatus, IPCSignal, ModelWeightsStatus
 from fastdeploy.model_executor.layers.quantization import parse_quant_config
+from fastdeploy.model_executor.utils import v1_loader_support
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.utils import get_logger, optional_type
@@ -812,7 +813,8 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         plas_attention_config=plas_attention_config,
     )
     update_fd_config_for_mm(fd_config)
-
+    if fd_config.load_config.load_choices == "default_v1" and not v1_loader_support(fd_config):
+        fd_config.load_config.load_choices = "default"
     return fd_config
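The worker now requests the v1 loader by default and quietly downgrades when `v1_loader_support` reports an unsupported combination, logging a message ending in "; fallback to the v0 loader for model loading." A stripped-down sketch of the same guard, using stand-in objects rather than the real FDConfig:

class _LoadConfig:  # stand-in for fd_config.load_config
    load_choices = "default_v1"

def resolve_loader(load_config, v1_supported: bool) -> str:
    # Downgrade only when v1 was requested but is not supported.
    if load_config.load_choices == "default_v1" and not v1_supported:
        load_config.load_choices = "default"
    return load_config.load_choices

print(resolve_loader(_LoadConfig(), v1_supported=False))  # "default"
print(resolve_loader(_LoadConfig(), v1_supported=True))   # "default_v1"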
