Commit

fix ds2 config
wtmlon committed Feb 24, 2025
1 parent 5d13984 commit d0e834e
Showing 4 changed files with 1 addition and 23 deletions.
2 changes: 1 addition & 1 deletion paddlenlp/transformers/deepseek_v2/configuration.py
@@ -139,7 +139,7 @@ def __init__(
intermediate_size=11008,
moe_intermediate_size=1407,
num_hidden_layers=30,
num_nextn_predict_layers=1,
num_nextn_predict_layers=0,
num_attention_heads=32,
num_key_value_heads=32,
n_shared_experts=None,
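Note: this change flips the default so that no next-n prediction (MTP) layers are built unless explicitly requested. A minimal sketch of the two behaviors, assuming the config class is exported as DeepseekV2Config (illustrative only, not part of the diff):

from paddlenlp.transformers import DeepseekV2Config

cfg = DeepseekV2Config()                                # new default: num_nextn_predict_layers == 0
mtp_cfg = DeepseekV2Config(num_nextn_predict_layers=1)  # opt back in explicitly
print(cfg.num_nextn_predict_layers, mtp_cfg.num_nextn_predict_layers)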
6 changes: 0 additions & 6 deletions paddlenlp/transformers/deepseek_v2/modeling.py
@@ -946,17 +946,11 @@ def forward(
print("qa input: ", hidden_states._md5sum())
print("qa weight: ", self.q_a_proj.weight._md5sum())
q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
print("qb weight shape: ", self.q_b_proj.weight.shape)
print("qb weight reshape: ", [bsz, q_len, self.num_heads, self.q_head_dim])
print("q output shape: ", q.shape)
print(self.q_a_proj, self.q_b_proj)
q = q.reshape([bsz, q_len, self.num_heads, self.q_head_dim])
q_nope, q_pe = paddle.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], axis=-1)

# DeepSeekV2 kv_lora_rank+qk_rope_head_dim=512+64
print("kva weight: ", self.kv_a_proj_with_mqa.weight._md5sum())
compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
print(self.kv_a_proj_with_mqa, self.kv_b_proj)
compressed_kv, k_pe = paddle.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], axis=-1)
k_pe = k_pe.reshape([bsz, q_len, 1, self.qk_rope_head_dim])

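The prints removed above were tracking tensor shapes around the low-rank query and key/value projections. A standalone shape sketch of that bookkeeping, with kv_lora_rank=512 and qk_rope_head_dim=64 taken from the comment in the hunk and the other sizes assumed for illustration:

import paddle

bsz, q_len, num_heads = 2, 8, 32
qk_nope_head_dim, qk_rope_head_dim = 128, 64            # assumed per-head split
q_head_dim = qk_nope_head_dim + qk_rope_head_dim
kv_lora_rank = 512

# Query path: project, reshape per head, then split into no-position and RoPE parts.
q = paddle.randn([bsz, q_len, num_heads * q_head_dim])
q = q.reshape([bsz, q_len, num_heads, q_head_dim])
q_nope, q_pe = paddle.split(q, [qk_nope_head_dim, qk_rope_head_dim], axis=-1)

# Key/value path: one compressed projection split into latent KV and a shared RoPE key.
compressed_kv = paddle.randn([bsz, q_len, kv_lora_rank + qk_rope_head_dim])
compressed_kv, k_pe = paddle.split(compressed_kv, [kv_lora_rank, qk_rope_head_dim], axis=-1)
k_pe = k_pe.reshape([bsz, q_len, 1, qk_rope_head_dim])

print(q_nope.shape, q_pe.shape, compressed_kv.shape, k_pe.shape)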
14 changes: 0 additions & 14 deletions paddlenlp/transformers/moe_gate.py
@@ -235,18 +235,8 @@ def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle.Tensor:
token_priority = paddle.masked_fill(token_priority, ~valid_mask, 0)
dispatch_mask = F.one_hot(token_priority, capacity).cast(paddle.bool)
valid_mask = valid_mask.unsqueeze(-1).expand(valid_mask.shape + [capacity])
# p = paddle.topk(tmp_scores, k=k, axis=-1, sorted=True)
# print('1', valid_mask)
# print('2', ~valid_mask)
# print('3', dispatch_mask)
# dispatch_mask = paddle.masked_fill(dispatch_mask, ~valid_mask, 0)
dispatch_mask = dispatch_mask * (~valid_mask)

# valid_mask = paddle.logical_and(token_priority >= 0, token_priority < capacity)
# token_priority = paddle.masked_fill(token_priority, ~valid_mask, 0)
# dispatch_mask = F.one_hot(token_priority, capacity).cast(paddle.int32)
# valid_mask = valid_mask.unsqueeze(-1).expand(valid_mask.shape + [capacity])
# dispatch_mask = paddle.masked_fill(dispatch_mask, ~valid_mask, 0)

return dispatch_mask

def _topk_greedy(self, scores: paddle.Tensor, k: int) -> Tuple[paddle.Tensor, paddle.Tensor]:
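For context, the hunk above builds a capacity-limited dispatch mask from per-(token, expert) slot indices. A generic toy version of that idea with made-up numbers (not the repo's _priority logic):

import paddle
import paddle.nn.functional as F

capacity = 2
token_priority = paddle.to_tensor([[0, 1], [1, 2], [2, 0], [3, 1]])    # slot per (token, expert)
valid = paddle.logical_and(token_priority >= 0, token_priority < capacity)
clamped = paddle.where(valid, token_priority, paddle.zeros_like(token_priority))
dispatch_mask = F.one_hot(clamped, capacity).cast(paddle.bool)          # [tokens, experts, capacity]
dispatch_mask = paddle.logical_and(dispatch_mask, valid.unsqueeze(-1))  # drop over-capacity slots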
@@ -555,11 +545,7 @@ def topkgating(
"se,sec->sec", topk_masked_gates, token_priority.cast(paddle.get_default_dtype())
)

# print(gates_masked)
# gates_masked = gates_masked.astype("bool").unsqueeze(-1).expand(gates_masked.shape + token_priority.shape[-1:])
# print(gates_masked)
combine_weights = paddle.einsum("se,sec->sec", gates_masked, token_priority.cast(paddle.get_default_dtype()))
# combine_weights = gates_masked * token_priority
dispatch_mask = combine_weights.astype(paddle.bool)

return capacity, combine_weights, dispatch_mask, exp_counts, l_aux, l_zloss
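Similarly, the combine weights and boolean dispatch mask returned here come from weighting each token's capacity slot by its gate probability. A toy sketch of that pattern with assumed shapes (not the gate itself):

import paddle
import paddle.nn.functional as F

s, e, c = 4, 2, 2                                    # tokens, experts, capacity
gates = paddle.rand([s, e])                          # gate probability per (token, expert)
slot = paddle.randint(0, c, [s, e])                  # capacity slot per (token, expert)
slot_onehot = F.one_hot(slot, c).cast(paddle.get_default_dtype())
combine_weights = paddle.einsum("se,sec->sec", gates, slot_onehot)
dispatch_mask = combine_weights.astype(paddle.bool)  # True where a token occupies a slot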
2 changes: 0 additions & 2 deletions paddlenlp/transformers/moe_layer.py
@@ -253,10 +253,8 @@ def forward(
# dispatched_input = paddle.masked_fill_(reshaped_input, dispatch_mask)

if self.expert_parallel_degree > 1:
print(dispatched_input, self.moe_group)
dispatched_input = _AllToAll.apply(dispatched_input, self.moe_group)
# Re-shape after all-to-all: ecm -> gecm
print(dispatched_input.shape)
dispatched_input = dispatched_input.reshape(
[self.expert_parallel_degree, self.moe_num_experts_per_device, -1, d_model]
)
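The removed prints here were inspecting the tensor handed to the all-to-all. A shape-only sketch of the ecm -> gecm reshape that follows it, with assumed sizes (the collective itself is not reproduced):

import paddle

expert_parallel_degree = 2            # g: expert-parallel ranks
experts_per_device = 4                # e: local experts per rank
capacity, d_model = 8, 16

# After all-to-all each rank holds g * e expert shards of [capacity, d_model].
dispatched_input = paddle.randn([expert_parallel_degree * experts_per_device, capacity, d_model])
dispatched_input = dispatched_input.reshape(
    [expert_parallel_degree, experts_per_device, -1, d_model]
)
print(dispatched_input.shape)         # [2, 4, 8, 16]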