Commit

fix ds2 config
wtmlon committed Feb 24, 2025
1 parent 5d13984 commit d0e834e
Showing 4 changed files with 1 addition and 23 deletions.
2 changes: 1 addition & 1 deletion paddlenlp/transformers/deepseek_v2/configuration.py
@@ -139,7 +139,7 @@ def __init__(
intermediate_size=11008,
moe_intermediate_size=1407,
num_hidden_layers=30,
num_nextn_predict_layers=1,
num_nextn_predict_layers=0,
num_attention_heads=32,
num_key_value_heads=32,
n_shared_experts=None,
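Note: this change flips the default so that no next-n prediction (MTP) layers are built unless explicitly requested. A minimal sketch of the two behaviors, assuming the config class is exported as DeepseekV2Config (illustrative only, not part of the diff):

from paddlenlp.transformers import DeepseekV2Config

cfg = DeepseekV2Config()                                # new default: num_nextn_predict_layers == 0
mtp_cfg = DeepseekV2Config(num_nextn_predict_layers=1)  # opt back in explicitly
print(cfg.num_nextn_predict_layers, mtp_cfg.num_nextn_predict_layers)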
6 changes: 0 additions & 6 deletions paddlenlp/transformers/deepseek_v2/modeling.py
@@ -946,17 +946,11 @@ def forward(
print("qa input: ", hidden_states._md5sum())
print("qa weight: ", self.q_a_proj.weight._md5sum())
q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
print("qb weight shape: ", self.q_b_proj.weight.shape)
print("qb weight reshape: ", [bsz, q_len, self.num_heads, self.q_head_dim])
print("q output shape: ", q.shape)
print(self.q_a_proj, self.q_b_proj)
q = q.reshape([bsz, q_len, self.num_heads, self.q_head_dim])
q_nope, q_pe = paddle.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], axis=-1)

# DeepSeekV2 kv_lora_rank+qk_rope_head_dim=512+64
print("kva weight: ", self.kv_a_proj_with_mqa.weight._md5sum())
compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
print(self.kv_a_proj_with_mqa, self.kv_b_proj)
compressed_kv, k_pe = paddle.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], axis=-1)
k_pe = k_pe.reshape([bsz, q_len, 1, self.qk_rope_head_dim])

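The prints removed above were tracking tensor shapes around the low-rank query and key/value projections. A standalone shape sketch of that bookkeeping, with kv_lora_rank=512 and qk_rope_head_dim=64 taken from the comment in the hunk and the other sizes assumed for illustration:

import paddle

bsz, q_len, num_heads = 2, 8, 32
qk_nope_head_dim, qk_rope_head_dim = 128, 64            # assumed per-head split
q_head_dim = qk_nope_head_dim + qk_rope_head_dim
kv_lora_rank = 512

# Query path: project, reshape per head, then split into no-position and RoPE parts.
q = paddle.randn([bsz, q_len, num_heads * q_head_dim])
q = q.reshape([bsz, q_len, num_heads, q_head_dim])
q_nope, q_pe = paddle.split(q, [qk_nope_head_dim, qk_rope_head_dim], axis=-1)

# Key/value path: one compressed projection split into latent KV and a shared RoPE key.
compressed_kv = paddle.randn([bsz, q_len, kv_lora_rank + qk_rope_head_dim])
compressed_kv, k_pe = paddle.split(compressed_kv, [kv_lora_rank, qk_rope_head_dim], axis=-1)
k_pe = k_pe.reshape([bsz, q_len, 1, qk_rope_head_dim])

print(q_nope.shape, q_pe.shape, compressed_kv.shape, k_pe.shape)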
14 changes: 0 additions & 14 deletions paddlenlp/transformers/moe_gate.py
@@ -235,18 +235,8 @@ def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle.Tensor:
token_priority = paddle.masked_fill(token_priority, ~valid_mask, 0)
dispatch_mask = F.one_hot(token_priority, capacity).cast(paddle.bool)
valid_mask = valid_mask.unsqueeze(-1).expand(valid_mask.shape + [capacity])
# p = paddle.topk(tmp_scores, k=k, axis=-1, sorted=True)
# print('1', valid_mask)
# print('2', ~valid_mask)
# print('3', dispatch_mask)
# dispatch_mask = paddle.masked_fill(dispatch_mask, ~valid_mask, 0)
dispatch_mask = dispatch_mask * (~valid_mask)

# valid_mask = paddle.logical_and(token_priority >= 0, token_priority < capacity)
# token_priority = paddle.masked_fill(token_priority, ~valid_mask, 0)
# dispatch_mask = F.one_hot(token_priority, capacity).cast(paddle.int32)
# valid_mask = valid_mask.unsqueeze(-1).expand(valid_mask.shape + [capacity])
# dispatch_mask = paddle.masked_fill(dispatch_mask, ~valid_mask, 0)

return dispatch_mask

def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle.Tensor, paddle.Tensor]:
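For context, the hunk above builds a capacity-limited dispatch mask from per-(token, expert) slot indices. A generic toy version of that idea with made-up numbers (not the repo's _priority logic):

import paddle
import paddle.nn.functional as F

capacity = 2
token_priority = paddle.to_tensor([[0, 1], [1, 2], [2, 0], [3, 1]])    # slot per (token, expert)
valid = paddle.logical_and(token_priority >= 0, token_priority < capacity)
clamped = paddle.where(valid, token_priority, paddle.zeros_like(token_priority))
dispatch_mask = F.one_hot(clamped, capacity).cast(paddle.bool)          # [tokens, experts, capacity]
dispatch_mask = paddle.logical_and(dispatch_mask, valid.unsqueeze(-1))  # drop over-capacity slots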
@@ -555,11 +545,7 @@ def topkgating(
"se,sec->sec", topk_masked_gates, token_priority.cast(paddle.get_default_dtype())
)

# print(gates_masked)
# gates_masked = gates_masked.astype("bool").unsqueeze(-1).expand(gates_masked.shape + token_priority.shape[-1:])
# print(gates_masked)
combine_weights = paddle.einsum("se,sec->sec", gates_masked, token_priority.cast(paddle.get_default_dtype()))
# combine_weights = gates_masked * token_priority
dispatch_mask = combine_weights.astype(paddle.bool)

return capacity, combine_weights, dispatch_mask, exp_counts, l_aux, l_zloss
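Similarly, the combine weights and boolean dispatch mask returned here come from weighting each token's capacity slot by its gate probability. A toy sketch of that pattern with assumed shapes (not the gate itself):

import paddle
import paddle.nn.functional as F

s, e, c = 4, 2, 2                                    # tokens, experts, capacity
gates = paddle.rand([s, e])                          # gate probability per (token, expert)
slot = paddle.randint(0, c, [s, e])                  # capacity slot per (token, expert)
slot_onehot = F.one_hot(slot, c).cast(paddle.get_default_dtype())
combine_weights = paddle.einsum("se,sec->sec", gates, slot_onehot)
dispatch_mask = combine_weights.astype(paddle.bool)  # True where a token occupies a slot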
2 changes: 0 additions & 2 deletions paddlenlp/transformers/moe_layer.py
@@ -253,10 +253,8 @@ def forward(
# dispatched_input = paddle.masked_fill_(reshaped_input, dispatch_mask)

if self.expert_parallel_degree > 1:
print(dispatched_input, self.moe_group)
dispatched_input = _AllToAll.apply(dispatched_input, self.moe_group)
# Re-shape after all-to-all: ecm -> gecm
print(dispatched_input.shape)
dispatched_input = dispatched_input.reshape(
[self.expert_parallel_degree, self.moe_num_experts_per_device, -1, d_model]
)
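The removed prints here were inspecting the tensor handed to the all-to-all. A shape-only sketch of the ecm -> gecm reshape that follows it, with assumed sizes (the collective itself is not reproduced):

import paddle

expert_parallel_degree = 2            # g: expert-parallel ranks
experts_per_device = 4                # e: local experts per rank
capacity, d_model = 8, 16

# After all-to-all each rank holds g * e expert shards of [capacity, d_model].
dispatched_input = paddle.randn([expert_parallel_degree * experts_per_device, capacity, d_model])
dispatched_input = dispatched_input.reshape(
    [expert_parallel_degree, experts_per_device, -1, d_model]
)
print(dispatched_input.shape)         # [2, 4, 8, 16]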