fix yarn for deepseek-v2
lzhangzz committed Nov 27, 2024
1 parent c6bd5fe commit 3d4f22f
Showing 3 changed files with 27 additions and 10 deletions.
25 changes: 25 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/deepseek2.py
@@ -1,4 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import math
+
 from .base import INPUT_MODELS
 from .llama import LlamaModel, LlamaReader
 
@@ -55,6 +57,23 @@ def mla_norm(self, i: int):
         return (*result, )
 
 
+def get_yarn_attention_factor(rope_scaling: dict):
+
+    scaling_factor = float(rope_scaling['factor'])
+    mscale = rope_scaling['mscale']
+    mscale_all_dim = rope_scaling['mscale_all_dim']
+
+    def yarn_get_mscale(scale=1, mscale=1):
+        if scale <= 1:
+            return 1.0
+        return 0.1 * mscale * math.log(scale) + 1.0
+
+    _mscale = float(
+        yarn_get_mscale(scaling_factor, mscale) /
+        yarn_get_mscale(scaling_factor, mscale_all_dim))
+    return _mscale
+
+
 @INPUT_MODELS.register_module(name='deepseek2')
 class DeepSeek2Model(LlamaModel):
 
@@ -97,4 +116,10 @@ def model_info(self):
                     topk_group=cfg['topk_group'],
                     moe_group_num=cfg['n_group'],
                     tune_layer_num=2)
+        rope_scaling = cfg.get('rope_scaling')
+        if rope_scaling and rope_scaling['type'] == 'yarn':
+            info.update(
+                max_position_embeddings=rope_scaling[
+                    'original_max_position_embeddings'],
+                attention_factor=get_yarn_attention_factor(rope_scaling))
         return info
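
For context, a minimal standalone sketch of what the new helper computes, runnable on its own. The rope_scaling values below are illustrative placeholders (not taken from this commit or any particular checkpoint); only the field names mirror the diff above.

import math

# Hypothetical yarn rope_scaling entry: field names follow the diff above,
# numeric values are made up for illustration.
rope_scaling = {
    'type': 'yarn',
    'factor': 40.0,
    'mscale': 1.0,
    'mscale_all_dim': 0.707,
    'original_max_position_embeddings': 4096,
}


def yarn_get_mscale(scale=1, mscale=1):
    # Same correction as in the diff: no adjustment when scale <= 1.
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


factor = float(rope_scaling['factor'])
attention_factor = (yarn_get_mscale(factor, rope_scaling['mscale']) /
                    yarn_get_mscale(factor, rope_scaling['mscale_all_dim']))
print(attention_factor)  # > 1 here only because mscale > mscale_all_dim in this made-up config

The factor rescales attention to compensate for yarn interpolation; when mscale equals mscale_all_dim, the ratio is exactly 1.0.
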
3 changes: 2 additions & 1 deletion src/turbomind/models/llama/unified_attention_layer.cc
@@ -334,8 +334,9 @@ inline void UnifiedAttentionLayer<T>::forward(TensorMap* outputs, const TensorMa
         };
         float low, high;
         find_correction_range(param_.beta_fast, param_.beta_slow, low, high);
+        // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216
         if (low == high) {
-            high += 0.01f;
+            high += 0.001f;
         }
         params.yarn_ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0;
         params.yarn_ramp_inv_factor_mul_min = 1.0 / (high - low) * low;
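
A quick Python sketch (an illustration under assumptions, not the C++ above; the helper name is made up) of why the low == high guard matters: both ramp parameters divide by (high - low), so a collapsed correction range would otherwise divide by zero, and the 0.001 bump matches the transformers reference linked in the added comment.

def yarn_ramp_params(low, high, eps=0.001):
    # Mirrors the two parameters set in the diff; eps guards the degenerate case.
    if low == high:
        high += eps  # avoid a zero-width ramp, i.e. division by zero below
    inv_factor_div_2 = 1.0 / (high - low) / 2.0
    inv_factor_mul_min = 1.0 / (high - low) * low
    return inv_factor_div_2, inv_factor_mul_min

print(yarn_ramp_params(13.0, 13.0))  # degenerate range stays finite thanks to eps
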
9 changes: 0 additions & 9 deletions src/turbomind/models/llama/unified_decoder.cc
@@ -278,15 +278,6 @@ void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, con
 
     // Wait for `h_cu_q/k_len_` to be consumed
     check_cuda_error(cudaEventSynchronize(ev_h_cu_x_));
-
-    // check_cuda_error(cudaStreamSynchronize(stream_));
-    // if (tp_.rank_ == 0) {
-    //     std::abort();
-    // }
-    // else {
-    //     while (1)
-    //         ;
-    // }
 }
 
 #ifdef ENABLE_FP32
