From 00c435a337b639c1cf818597524c71432717f5ad Mon Sep 17 00:00:00 2001 From: irexyc Date: Tue, 3 Dec 2024 07:13:04 +0000 Subject: [PATCH 1/4] replicate kv for some models when tp is divisible by kv_head_num --- lmdeploy/turbomind/deploy/module.py | 20 +++++++++++++++++++ .../turbomind/deploy/target_model/base.py | 11 ++++++++++ 2 files changed, 31 insertions(+) diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 52497175e..a349e85b5 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -202,6 +202,26 @@ def _reorder_and_merge(self, qkvo): o = torch.zeros_like(q) return qkv, o + def _repeat_kv(self, qkvo, kind: str): + """replicate kv.""" + q, k, v, o = qkvo + head_dim = self.model.model_config.size_per_head + hidden_dim = self.model.model_config.hidden_units + + def _repeat(x): + dim = hidden_dim if kind != 'bias' else 1 + x = x.view(-1, head_dim, dim).repeat(1, self.model.repeat_kv, 1) + x = x.reshape(-1, dim) + return x + + k, v = map(_repeat, (k, v)) + if kind == 'bias': + if o is None: + o = torch.zeros(hidden_dim, dtype=q.dtype, device=q.device) + q, k, v, o = map(torch.squeeze, (q, k, v, o)) + + return (q, k, v, o) + def _export(self, idx: int, qkvo, kind: str, pack_fn, **kwargs): if all(x is None for x in qkvo): return diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index f2c981bb2..7ea1a84f3 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -78,6 +78,17 @@ def __init__(self, self.model_config.expert_inter_size = _pad_inter_size( self.model_config.expert_inter_size, self.model_config.group_size, self.tensor_para_size) + + # head_num is divisible by tp but kv_head_num is not + # and tp is divisible by kv_head_num + assert self.model_config.head_num % self.tensor_para_size == 0 + self.repeat_kv = 0 + if (self.tensor_para_size > self.model_config.kv_head_num 
and + self.tensor_para_size % self.model_config.kv_head_num == 0): + self.repeat_kv = (self.tensor_para_size // + self.model_config.kv_head_num) + self.model_config.kv_head_num = self.tensor_para_size + self.model_config.verify() assert self.model_config.kv_head_num % self.tensor_para_size == 0 From 5e967c0a8c040f82ae1eded44555a9e288be72d4 Mon Sep 17 00:00:00 2001 From: irexyc Date: Mon, 9 Dec 2024 12:58:27 +0000 Subject: [PATCH 2/4] export --- lmdeploy/turbomind/deploy/module.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index a349e85b5..f2a6e4078 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -229,6 +229,8 @@ def _export(self, idx: int, qkvo, kind: str, pack_fn, **kwargs): if is_lora_a: qkv, o = map(transpose, qkvo) else: + if self.model.repeat_kv: + qkvo = self._repeat_kv(qkvo, kind) qkv, o = self._reorder_and_merge(qkvo) self.model.save_split(pack_fn(qkv), self._attn.format(idx, 'w_qkv', kind), From 5750194d83f37881d9c8864c6810a2a62dc6d857 Mon Sep 17 00:00:00 2001 From: irexyc Date: Wed, 18 Dec 2024 02:33:58 +0000 Subject: [PATCH 3/4] update --- lmdeploy/turbomind/deploy/module.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index f2a6e4078..6ca0f5395 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -191,7 +191,7 @@ def __init__(self, model: BaseOutputModel): self.attn_bias = model.model_config.attn_bias def _reorder_and_merge(self, qkvo): - q, k, v, o = map(transpose, qkvo) + q, k, v, o = qkvo # reorder output dim for tm's rotary embedding layout if self.model.permute_qk: q = permute_v2(q, self.head_dim) @@ -210,8 +210,9 @@ def _repeat_kv(self, qkvo, kind: str): def _repeat(x): dim = hidden_dim if kind != 'bias' else 1 - x = x.view(-1, head_dim, dim).repeat(1, self.model.repeat_kv, 1) - 
x = x.reshape(-1, dim) + x = x.t().reshape(-1, head_dim, dim) + x = x.repeat(1, self.model.repeat_kv, 1) + x = x.reshape(-1, dim).t() return x k, v = map(_repeat, (k, v)) @@ -229,6 +230,7 @@ def _export(self, idx: int, qkvo, kind: str, pack_fn, **kwargs): if is_lora_a: qkv, o = map(transpose, qkvo) else: + qkvo = tuple(map(transpose, qkvo)) if self.model.repeat_kv: qkvo = self._repeat_kv(qkvo, kind) qkv, o = self._reorder_and_merge(qkvo) From cd8107d7800e463d02c90046aa18351ab92df5a1 Mon Sep 17 00:00:00 2001 From: irexyc Date: Wed, 18 Dec 2024 11:49:35 +0000 Subject: [PATCH 4/4] update --- lmdeploy/turbomind/deploy/module.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 6ca0f5395..1754161ff 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -210,9 +210,9 @@ def _repeat_kv(self, qkvo, kind: str): def _repeat(x): dim = hidden_dim if kind != 'bias' else 1 - x = x.t().reshape(-1, head_dim, dim) - x = x.repeat(1, self.model.repeat_kv, 1) - x = x.reshape(-1, dim).t() + x = x.reshape(dim, -1, head_dim) + x = x.repeat(1, 1, self.model.repeat_kv) + x = x.reshape(dim, -1) return x k, v = map(_repeat, (k, v))