
Commit bcaa98f

V1 loader default (#4251)
* v1 loader
* update
* update
1 parent e98c1c2 commit bcaa98f

File tree

4 files changed: +68 −9 lines changed


fastdeploy/engine/args_utils.py

Lines changed: 2 additions & 2 deletions
@@ -387,7 +387,7 @@ class EngineArgs:
     Configuration for early stop.
     """

-    load_choices: str = "default"
+    load_choices: str = "default_v1"
     """The format of the model weights to load.
     Options include:
     - "default": default loader.
@@ -715,7 +715,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=str,
             default=EngineArgs.load_choices,
             help="The format of the model weights to load.\
-                default/new_loader.",
+                default/default_v1.",
         )

         # CacheConfig parameters group
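This flips the default loader selection to the new v1 path while keeping the old loader reachable by name. A minimal sketch of how the new default surfaces, assuming only what the diff shows (the `load_choices` dataclass field and its use as the CLI default); the entry point and flag spelling in the comment are inferred, not verified here:

from fastdeploy.engine.args_utils import EngineArgs

# The dataclass default now resolves to the v1 loader.
print(EngineArgs.load_choices)  # "default_v1"

# To pin the old behaviour explicitly, pass the value on the command line,
# e.g. (flag name inferred from the field name):
#   ... --load_choices default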

fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py

Lines changed: 12 additions & 6 deletions
@@ -184,12 +184,20 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
     def create_weights(self, layer: nn.Layer, **extra_weight_attrs):

         if current_platform.is_cuda():
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2]
-            self.down_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size, layer.hidden_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.hidden_size,
+                layer.moe_intermediate_size * 2,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
         else:
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size * 2, layer.hidden_size]
-            self.down_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.moe_intermediate_size * 2,
+                layer.hidden_size,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}

         layer.up_gate_proj_weight = layer.create_parameter(
@@ -209,14 +217,12 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
             {
                 "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
                 "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
-                "model_format": extra_weight_attrs.get("model_format", ""),
             },
         )
         set_weight_attrs(
             layer.down_proj_weight,
             {
                 "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
                 "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
-                "model_format": extra_weight_attrs.get("model_format", ""),
             },
         )
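The weight shapes now use the per-rank expert count rather than the global one, so each worker only materialises parameters for the experts it hosts. A rough, illustrative sketch of that relationship, assuming experts are split evenly across ranks (the helper below is hypothetical and not part of the diff):

def local_expert_count(num_experts: int, world_size: int, rank: int) -> int:
    """How many experts a single rank hosts when experts are sharded evenly."""
    base, remainder = divmod(num_experts, world_size)
    # Earlier ranks absorb any leftover experts.
    return base + (1 if rank < remainder else 0)

# Example: 64 experts over 8 ranks -> 8 local experts per rank, so on CUDA
# up_gate_proj_weight is allocated as [8, hidden_size, moe_intermediate_size * 2].
print(local_expert_count(64, 8, 0))  # 8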

fastdeploy/model_executor/utils.py

Lines changed: 51 additions & 0 deletions
@@ -14,14 +14,18 @@
 # limitations under the License.
 """

+import os
 import re
 from contextlib import contextmanager
 from typing import Any, Optional, Union

 import paddle
+from paddleformers.utils.log import logger

+from fastdeploy import envs
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.utils import get_tensor
+from fastdeploy.platforms import current_platform


 class BitMaskTracker:
@@ -194,6 +198,53 @@ def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
     return fn


+def is_pre_sliced_weight(model_path):
+    rank_dirs = [
+        f for f in os.listdir(model_path) if f.startswith("rank") and os.path.isdir(os.path.join(model_path, f))
+    ]
+    return len(rank_dirs) > 1
+
+
+def v1_loader_support(fd_config):
+    _v1_no_support_archs = ["Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration"]
+
+    def _err_msg(msg: str) -> str:
+        logger.info(msg + "; fallback to the v0 loader for model loading.")
+
+    if not current_platform.is_cuda():
+        _err_msg("v1loader currently does not support backends other than CUDA")
+        return False
+
+    if is_pre_sliced_weight(fd_config.model_config.model):
+        _err_msg("v1 loader currently does not support pre-sliced weights")
+        return False
+
+    if fd_config.parallel_config.use_ep:
+        _err_msg("v1 loader currently does not support expert parallelism")
+        return False
+
+    if envs.FD_MOE_BACKEND.lower() == "marlin":
+        _err_msg("v1 loader currently does not support marlin backend")
+        return False
+
+    if fd_config.quant_config is not None:
+        if fd_config.quant_config.name() == "mix_quant":
+            moe_quant_type = fd_config.quant_config.moe_quant_type
+            dense_quant_type = fd_config.quant_config.dense_quant_type
+        else:
+            moe_quant_type = fd_config.quant_config.name()
+            dense_quant_type = fd_config.quant_config.name()
+        unsupported_quant = {"w4a8", "w4afp8", "wint2"}
+
+        if unsupported_quant & {moe_quant_type, dense_quant_type}:
+            _err_msg("v1 loader currently does not support w4a8/w4afp8/win2 quantization")
+            return False
+    if fd_config.model_config.architectures[0] in _v1_no_support_archs:
+        _err_msg(f"v1 loader currently does not support {fd_config.model_config.architectures[0]}")
+        return False
+    return True
+
+
 @contextmanager
 def temporary_dtype(dtype: str):
     """Temporarily set Paddle default dtype"""

fastdeploy/worker/worker_process.py

Lines changed: 3 additions & 1 deletion
@@ -44,6 +44,7 @@
 from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
 from fastdeploy.inter_communicator import ExistTaskStatus, IPCSignal, ModelWeightsStatus
 from fastdeploy.model_executor.layers.quantization import parse_quant_config
+from fastdeploy.model_executor.utils import v1_loader_support
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.utils import get_logger, optional_type
@@ -812,7 +813,8 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         plas_attention_config=plas_attention_config,
     )
     update_fd_config_for_mm(fd_config)
-
+    if fd_config.load_config.load_choices == "default_v1" and not v1_loader_support(fd_config):
+        fd_config.load_config.load_choices = "default"
     return fd_config
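The worker now requests the v1 loader by default and quietly downgrades when `v1_loader_support` reports an unsupported combination, logging a message ending in "; fallback to the v0 loader for model loading." A stripped-down sketch of the same guard, using stand-in objects rather than the real FDConfig:

class _LoadConfig:  # stand-in for fd_config.load_config
    load_choices = "default_v1"

def resolve_loader(load_config, v1_supported: bool) -> str:
    # Downgrade only when v1 was requested but is not supported.
    if load_config.load_choices == "default_v1" and not v1_supported:
        load_config.load_choices = "default"
    return load_config.load_choices

print(resolve_loader(_LoadConfig(), v1_supported=False))  # "default"
print(resolve_loader(_LoadConfig(), v1_supported=True))   # "default_v1"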
