
Support Falcon models #406

Merged: 25 commits, Oct 18, 2023

Commits (25)
aabbbaf - move q seq info into context (wangruohui, Sep 11, 2023)
e9e6af0 - Merge branch 'pytorch-poc' into falcon (wangruohui, Sep 14, 2023)
415ca10 - falcon aligned (wangruohui, Sep 19, 2023)
1701fba - trust_remote_code_argument (wangruohui, Sep 19, 2023)
82ab6d1 - Merge branch 'trust_remote_code_argument' into falcon (wangruohui, Sep 19, 2023)
95cd446 - fix for falcon (wangruohui, Sep 19, 2023)
25144c5 - comment out debugs (wangruohui, Sep 19, 2023)
ee26a25 - comment out debugs (wangruohui, Sep 19, 2023)
4278671 - use position id in context (wangruohui, Sep 20, 2023)
1fd7380 - remove codes in falcon model (wangruohui, Sep 20, 2023)
96a550f - Revert "comment out debugs" (wangruohui, Sep 20, 2023)
41165b5 - Merge branch 'pytorch-poc' into falcon (wangruohui, Sep 22, 2023)
7bda327 - 7b correct (wangruohui, Sep 22, 2023)
981d0a6 - 1b aligned (wangruohui, Sep 25, 2023)
d389273 - Merge branch 'pytorch-poc' into falcon (wangruohui, Sep 25, 2023)
892b254 - remove debugs (wangruohui, Sep 25, 2023)
0386288 - patch to ignore position ids (wangruohui, Sep 26, 2023)
9673622 - remove debug in alibi, avoid empty inputs (wangruohui, Sep 26, 2023)
6e92564 - fix (wangruohui, Sep 26, 2023)
7e23002 - Merge branch 'pytorch-poc' into falcon (wangruohui, Oct 9, 2023)
f8df92e - Merge branch 'pytorch-poc' into falcon (wangruohui, Oct 10, 2023)
de7b81e - rename dir to replace to "models" (wangruohui, Oct 10, 2023)
4f3b6de - Merge branch 'pytorch-poc' into falcon (wangruohui, Oct 16, 2023)
8496d5c - use position_id and new fill kernel (wangruohui, Oct 16, 2023)
85dc284 - remove useless get_prompt func (wangruohui, Oct 17, 2023)
13 changes: 13 additions & 0 deletions lmdeploy/model.py
@@ -485,6 +485,19 @@ def stop_words(self):
return [151645] # <|im_end|>


@MODELS.register_module(name='falcon')
class Falcon(BaseModel):

def __init__(self):
super().__init__()

def update_input_ids(self, input_ids: List):
Review comment (Collaborator): This function does not get used.

Reply (Author): fixed

if len(input_ids) == 0:
# avoid empty input to model
input_ids = [11]
return input_ids


@MODELS.register_module(name='chatglm2-6b')
class ChatGLM2(BaseModel):

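For reference, a minimal standalone sketch of the empty-input guard introduced in Falcon.update_input_ids above. The free function and the assertions below are illustrative only and are not part of the patch:

```python
from typing import List


def update_input_ids(input_ids: List[int]) -> List[int]:
    """Sketch of the guard in Falcon.update_input_ids.

    An empty prompt would otherwise hand the model an empty tensor, so a
    single placeholder token id (11, the value used in the patch) is
    substituted instead.
    """
    if len(input_ids) == 0:
        input_ids = [11]
    return input_ids


assert update_input_ids([]) == [11]        # empty prompt gets the placeholder
assert update_input_ids([5, 7]) == [5, 7]  # non-empty prompts pass through
```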
17 changes: 16 additions & 1 deletion lmdeploy/pytorch_poc/engine/engine.py
@@ -479,7 +479,22 @@ def __init__(
cache_config = CacheConfig(block_size=64,
num_cpu_blocks=0,
num_gpu_blocks=0)
- if 'chatglm' in model_path:
+ if 'falcon' in model_path:
if hf_config.multi_query:
kv_dim = hf_config.hidden_size // hf_config.num_attention_heads
kv_head = 1
else:
kv_dim = hf_config.hidden_size
kv_head = hf_config.num_attention_heads
model_config = ModelConfig(
kv_dim,
hf_config.num_hidden_layers,
kv_head,
bos_token_id=hf_config.bos_token_id,
eos_token_id=hf_config.eos_token_id,
dtype=torch_dtype,
)
elif 'chatglm' in model_path:
model_config = ModelConfig(
hf_config.hidden_size // hf_config.num_attention_heads *
hf_config.multi_query_group_num,
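The branch above sizes the KV cache differently depending on whether the checkpoint uses multi-query attention. Below is a small sketch of that derivation with Falcon-7B-style numbers plugged in; the helper function is illustrative and assumes the Hugging Face FalconConfig field semantics used in the patch:

```python
def falcon_kv_shape(hidden_size: int, num_attention_heads: int,
                    multi_query: bool):
    """Derive KV-cache sizing the same way the engine patch does (sketch)."""
    if multi_query:
        # Multi-query attention: every query head shares one KV head,
        # so the cache only needs a single head of per-head width.
        kv_dim = hidden_size // num_attention_heads
        kv_head = 1
    else:
        # Standard multi-head attention: one KV head per query head.
        kv_dim = hidden_size
        kv_head = num_attention_heads
    return kv_dim, kv_head


# Falcon-7B-style config: hidden_size=4544, 71 attention heads, multi_query=True
print(falcon_kv_shape(4544, 71, True))    # (64, 1)
print(falcon_kv_shape(4544, 71, False))   # (4544, 71)
```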
7 changes: 6 additions & 1 deletion lmdeploy/pytorch_poc/kernels/alibi_pagedattention.py
@@ -51,6 +51,7 @@ def _fwd_kernel(
K,
V,
sm_scale,
alibi_scale,
B_Start_Loc,
B_Seqlen,
B_kvlen,
@@ -134,8 +135,9 @@ def _fwd_kernel(
qk *= sm_scale

mask = start_n + offs_n[None, :]
- bias = mask.to(tl.float32) * head_slope
+ bias = mask.to(tl.float32) * (head_slope * alibi_scale)
qk += bias

# NOTE: inf - inf = nan, and nan will lead to errors
qk = tl.where(
(history_len + offs_m[:, None]) >= mask,
@@ -191,6 +193,7 @@ def alibi_paged_attention_fwd(
max_input_len: int,
head_offset: int = 0,
num_heads: int = -1,
alibi_scale: float = 1.0,
BLOCK: int = 64,
):
"""Paged attention forward with alibi bias.
@@ -230,6 +233,7 @@
k,
v,
sm_scale,
alibi_scale,
b_start_loc,
b_seq_len,
b_kv_seq_len,
@@ -257,4 +261,5 @@
num_warps=num_warps,
num_stages=1,
)

return
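To make the alibi_scale change concrete, here is a toy single-head PyTorch reference of the bias and causal mask applied inside the Triton kernel. It is a readability aid under the stated assumptions and does not reproduce the paged, blocked layout of the real kernel:

```python
import torch


def add_alibi_bias(qk: torch.Tensor, head_slope: float, history_len: int,
                   alibi_scale: float = 1.0) -> torch.Tensor:
    """Single-head sketch of the bias applied in _fwd_kernel.

    qk: [q_len, kv_len] attention scores, already multiplied by sm_scale.
    The bias grows linearly with the key position and is scaled by
    head_slope * alibi_scale, matching the change in this hunk.
    """
    q_len, kv_len = qk.shape
    key_pos = torch.arange(kv_len, dtype=qk.dtype)   # plays the role of "mask" above
    qk = qk + key_pos[None, :] * (head_slope * alibi_scale)
    # Causal mask: query i sits at absolute position history_len + i and may
    # only attend to keys at positions <= its own.
    q_pos = history_len + torch.arange(q_len)
    return torch.where(q_pos[:, None] >= key_pos[None, :], qk,
                       torch.full_like(qk, float('-inf')))


scores = torch.zeros(2, 4)   # 2 new queries attending over 4 cached keys
print(add_alibi_bias(scores, head_slope=0.25, history_len=2))
```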