fix yarn for deepseek-v2
lzhangzz committed Nov 27, 2024
1 parent c6bd5fe commit 3d4f22f
Showing 3 changed files with 27 additions and 10 deletions.
25 changes: 25 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/deepseek2.py
@@ -1,4 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import math
+
 from .base import INPUT_MODELS
 from .llama import LlamaModel, LlamaReader
 
@@ -55,6 +57,23 @@ def mla_norm(self, i: int):
         return (*result, )
 
 
+def get_yarn_attention_factor(rope_scaling: dict):
+
+    scaling_factor = float(rope_scaling['factor'])
+    mscale = rope_scaling['mscale']
+    mscale_all_dim = rope_scaling['mscale_all_dim']
+
+    def yarn_get_mscale(scale=1, mscale=1):
+        if scale <= 1:
+            return 1.0
+        return 0.1 * mscale * math.log(scale) + 1.0
+
+    _mscale = float(
+        yarn_get_mscale(scaling_factor, mscale) /
+        yarn_get_mscale(scaling_factor, mscale_all_dim))
+    return _mscale
+
+
 @INPUT_MODELS.register_module(name='deepseek2')
 class DeepSeek2Model(LlamaModel):
 
@@ -97,4 +116,10 @@ def model_info(self):
                     topk_group=cfg['topk_group'],
                     moe_group_num=cfg['n_group'],
                     tune_layer_num=2)
+        rope_scaling = cfg.get('rope_scaling')
+        if rope_scaling and rope_scaling['type'] == 'yarn':
+            info.update(
+                max_position_embeddings=rope_scaling[
+                    'original_max_position_embeddings'],
+                attention_factor=get_yarn_attention_factor(rope_scaling))
         return info
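
For context, a minimal standalone sketch of what the new helper computes, runnable on its own. The rope_scaling values below are illustrative placeholders (not taken from this commit or any particular checkpoint); only the field names mirror the diff above.

import math

# Hypothetical yarn rope_scaling entry: field names follow the diff above,
# numeric values are made up for illustration.
rope_scaling = {
    'type': 'yarn',
    'factor': 40.0,
    'mscale': 1.0,
    'mscale_all_dim': 0.707,
    'original_max_position_embeddings': 4096,
}


def yarn_get_mscale(scale=1, mscale=1):
    # Same correction as in the diff: no adjustment when scale <= 1.
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


factor = float(rope_scaling['factor'])
attention_factor = (yarn_get_mscale(factor, rope_scaling['mscale']) /
                    yarn_get_mscale(factor, rope_scaling['mscale_all_dim']))
print(attention_factor)  # > 1 here only because mscale > mscale_all_dim in this made-up config

The factor rescales attention to compensate for yarn interpolation; when mscale equals mscale_all_dim, the ratio is exactly 1.0.
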
3 changes: 2 additions & 1 deletion src/turbomind/models/llama/unified_attention_layer.cc
@@ -334,8 +334,9 @@ inline void UnifiedAttentionLayer<T>::forward(TensorMap* outputs, const TensorMa
         };
         float low, high;
         find_correction_range(param_.beta_fast, param_.beta_slow, low, high);
+        // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216
         if (low == high) {
-            high += 0.01f;
+            high += 0.001f;
         }
         params.yarn_ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0;
         params.yarn_ramp_inv_factor_mul_min = 1.0 / (high - low) * low;
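
A quick Python sketch (an illustration under assumptions, not the C++ above; the helper name is made up) of why the low == high guard matters: both ramp parameters divide by (high - low), so a collapsed correction range would otherwise divide by zero, and the 0.001 bump matches the transformers reference linked in the added comment.

def yarn_ramp_params(low, high, eps=0.001):
    # Mirrors the two parameters set in the diff; eps guards the degenerate case.
    if low == high:
        high += eps  # avoid a zero-width ramp, i.e. division by zero below
    inv_factor_div_2 = 1.0 / (high - low) / 2.0
    inv_factor_mul_min = 1.0 / (high - low) * low
    return inv_factor_div_2, inv_factor_mul_min

print(yarn_ramp_params(13.0, 13.0))  # degenerate range stays finite thanks to eps
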
9 changes: 0 additions & 9 deletions src/turbomind/models/llama/unified_decoder.cc
@@ -278,15 +278,6 @@ void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, con
 
     // Wait for `h_cu_q/k_len_` to be consumed
     check_cuda_error(cudaEventSynchronize(ev_h_cu_x_));
-
-    // check_cuda_error(cudaStreamSynchronize(stream_));
-    // if (tp_.rank_ == 0) {
-    //     std::abort();
-    // }
-    // else {
-    //     while (1)
-    //         ;
-    // }
 }
 
 #ifdef ENABLE_FP32
