From 23f06a987303c3f4c12f55a173bc9699c68cb09a Mon Sep 17 00:00:00 2001
From: jimpang
Date: Tue, 21 Nov 2023 15:40:53 +0800
Subject: [PATCH] feat: support Baichuan 2 (7B and 13B)

Baichuan 2 checkpoints reuse Baichuan 1's architecture names in their
configs, so they are detected by vocab size and dispatched to new model
classes. The main modeling difference is the output head: Baichuan 2
L2-normalizes the rows of the lm_head weight before computing logits,
implemented here as NormHead.
---
 vllm/model_executor/model_loader.py    | 13 ++++++++
 vllm/model_executor/models/__init__.py |  6 +++-
 vllm/model_executor/models/baichuan.py | 66 +++++++++++++++++++++++++------
 3 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py
index 54b87c4b866e3..90383bc0d1245 100644
--- a/vllm/model_executor/model_loader.py
+++ b/vllm/model_executor/model_loader.py
@@ -17,6 +17,8 @@
     "AquilaForCausalLM": AquilaForCausalLM,  # AquilaChat2
     "BaiChuanForCausalLM": BaiChuanForCausalLM,  # baichuan-7b
     "BaichuanForCausalLM": BaichuanForCausalLM,  # baichuan-13b
+    "BaiChuan2ForCausalLM": BaiChuan2ForCausalLM,  # baichuan2-7b (RoPE)
+    "Baichuan2ForCausalLM": Baichuan2ForCausalLM,  # baichuan2-13b (ALiBi)
     "BloomForCausalLM": BloomForCausalLM,
     "ChatGLMModel": ChatGLMForCausalLM,
     "FalconForCausalLM": FalconForCausalLM,
@@ -52,6 +54,17 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
     architectures = getattr(config, "architectures", [])
     for arch in architectures:
         if arch in _MODEL_REGISTRY:
+            # Baichuan 2 reuses Baichuan 1's architecture names in its
+            # config; tell the generations apart by vocab size
+            # (Baichuan 2 uses 125696, Baichuan 1 uses 64000).
+            if "baichuan" in arch.lower() and getattr(
+                    config, "vocab_size", 0) == 125696:
+                # The 7B (RoPE) and 13B (ALiBi) variants differ in
+                # intermediate size.
+                if getattr(config, "intermediate_size", 0) == 11008:
+                    return BaiChuan2ForCausalLM
+                elif getattr(config, "intermediate_size", 0) == 13696:
+                    return Baichuan2ForCausalLM
             return _MODEL_REGISTRY[arch]
     raise ValueError(
         f"Model architectures {architectures} are not supported for now. "
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 078d3d74719df..9babb8e0cd8ef 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -1,6 +1,8 @@
 from vllm.model_executor.models.aquila import AquilaForCausalLM
 from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
-                                                 BaichuanForCausalLM)
+                                                 BaichuanForCausalLM,
+                                                 BaiChuan2ForCausalLM,
+                                                 Baichuan2ForCausalLM)
 from vllm.model_executor.models.bloom import BloomForCausalLM
 from vllm.model_executor.models.falcon import FalconForCausalLM
 from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
@@ -21,6 +23,8 @@
     "AquilaForCausalLM",
     "BaiChuanForCausalLM",
     "BaichuanForCausalLM",
+    "BaiChuan2ForCausalLM",
+    "Baichuan2ForCausalLM",
     "BloomForCausalLM",
     "ChatGLMForCausalLM",
     "FalconForCausalLM",
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 93cbc1a8516a7..739d2fb86df9d 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -28,12 +28,14 @@
 import torch
 from torch import nn
 
+from vllm.logger import init_logger
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.attention import (PagedAttentionWithRoPE,
                                                   PagedAttentionWithALiBi)
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (LinearMethodBase,
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               LinearMethodBase,
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
@@ -47,6 +49,8 @@ from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
 
+logger = init_logger(__name__)
+
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
 
 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
@@ -295,29 +299,69 @@ def forward(
         return hidden_states
 
 
+class NormHead(ColumnParallelLinear):
+    """Baichuan 2's lm_head: each weight row is L2-normalized.
+
+    The checkpoint stores the raw weight; it is normalized lazily on
+    first use and the normalized tensor is cached.
+    """
+
+    def __init__(self, hidden_size, vocab_size, bias=False):
+        super().__init__(hidden_size,
+                         vocab_size,
+                         bias=False,
+                         gather_output=False)
+        self.first_flag = True
+
+    def get_weight(self):
+        if self.first_flag:
+            self.first_flag = False
+            self.weight = nn.Parameter(nn.functional.normalize(self.weight))
+        return self.weight
+
+    def forward(self, hidden_states):
+        self.get_weight()
+        return ColumnParallelLinear.forward(self, hidden_states)
+
+
 class BaiChuanBaseForCausalLM(nn.Module):
 
     def __init__(self,
                  config,
                  position_embedding: str,
-                 linear_method: Optional[LinearMethodBase] = None):
+                 linear_method: Optional[LinearMethodBase] = None,
+                 version: str = "1"):
         super().__init__()
         self.config = config
         self.linear_method = linear_method
         self.model = BaiChuanModel(config, position_embedding, linear_method)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.version = version
+        if self.version == "1":
+            self.lm_head = ParallelLMHead(config.vocab_size,
+                                          config.hidden_size)
+        elif self.version == "2":
+            self.lm_head = NormHead(config.hidden_size, config.vocab_size)
+        else:
+            raise ValueError(
+                f"Unsupported Baichuan version: {version!r}; "
+                "only versions 1 and 2 are supported.")
         self.sampler = Sampler(config.vocab_size)
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         kv_caches: List[KVCache],
         input_metadata: InputMetadata,
         cache_events: Optional[List[torch.cuda.Event]],
     ) -> SamplerOutput:
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    input_metadata, cache_events)
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+        # Baichuan 2 computes logits against the normalized lm_head weight.
+        if self.version == "2":
+            lm_head_weight = self.lm_head.get_weight()
+        else:
+            lm_head_weight = self.lm_head.weight
+        next_tokens = self.sampler(lm_head_weight, hidden_states,
                                    input_metadata)
         return next_tokens
@@ -365,3 +409,17 @@ class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):  # baichuan 7b
 
     def __init__(self, config, linear_method: Optional[LinearMethodBase] = None):
         super().__init__(config, "ROPE", linear_method)
+
+
+class Baichuan2ForCausalLM(BaiChuanBaseForCausalLM):  # baichuan2 13b
+
+    def __init__(self, config, linear_method: Optional[LinearMethodBase] = None):
+        logger.info("Initializing Baichuan 2 13B (ALiBi).")
+        super().__init__(config, "ALIBI", linear_method, version="2")
+
+
+class BaiChuan2ForCausalLM(BaiChuanBaseForCausalLM):  # baichuan2 7b
+
+    def __init__(self, config, linear_method: Optional[LinearMethodBase] = None):
+        logger.info("Initializing Baichuan 2 7B (RoPE).")
+        super().__init__(config, "ROPE", linear_method, version="2")
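
Notes (not part of the commit):

As a sanity check, the following self-contained sketch mirrors what
NormHead does, substituting plain nn.Linear for vLLM's
ColumnParallelLinear so it runs in a single process. The class and
variable names here are illustrative, not part of the patch:

    import torch
    from torch import nn

    class NormHead(nn.Linear):
        """Stand-in for the patch's NormHead: weight rows are
        L2-normalized once, lazily, and the result is cached."""

        def __init__(self, hidden_size: int, vocab_size: int):
            super().__init__(hidden_size, vocab_size, bias=False)
            self.first_flag = True

        def get_weight(self) -> torch.Tensor:
            if self.first_flag:
                self.first_flag = False
                # One row per vocab entry; normalize each row to unit norm.
                self.weight = nn.Parameter(
                    nn.functional.normalize(self.weight))
            return self.weight

    with torch.no_grad():
        head = NormHead(hidden_size=8, vocab_size=16)
        w = head.get_weight()
        # Every row has unit L2 norm; repeated calls reuse the cache.
        print(torch.allclose(w.norm(dim=-1), torch.ones(16)))  # True
        print(head.get_weight() is w)                          # True

With the patch applied, a Baichuan 2 checkpoint should be loadable
through vLLM's usual offline API (model name shown as an example):

    from vllm import LLM

    llm = LLM(model="baichuan-inc/Baichuan2-7B-Chat",
              trust_remote_code=True)
    print(llm.generate("The capital of France is")[0].outputs[0].text)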