diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index cd43e79c9..469ece487 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -51,47 +51,47 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | No | | Baichuan2 | 13B | LLM | Yes | Yes | Yes | No | No | | ChatGLM2 | 6B | LLM | Yes | Yes | Yes | No | No | | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | No | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No | -| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | -| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | +| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | Yes | +| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | -| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | No | Yes | +| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | -| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | -| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | No | Yes | -| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | No | - | -| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | No | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | -| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | -| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | -| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | +| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes | +| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - | +| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - | +| Mono-InternVL | 2B | MLLM | Yes\* | 
Yes | Yes | - | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - | +| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | - | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | No | - | -| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | ```{note} * Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 7ec36d235..d73452328 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -51,47 +51,47 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | No | | Baichuan2 | 13B | LLM | Yes | Yes | Yes | No | No | | ChatGLM2 | 6B | LLM | Yes | Yes | Yes | No | No | | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | No | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No | -| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | -| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | +| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | Yes | +| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | -| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | No | Yes | +| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | -| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | -| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | No | Yes | -| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | No | - | -| LLaVA(1.5,1.6) | 7B-34B | 
MLLM | Yes | Yes | Yes | No | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | -| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | -| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | -| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | +| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes | +| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - | +| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - | +| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - | +| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | - | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | No | - | -| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | ```{note} * Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 0780e9359..71f7a5900 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -27,6 +27,7 @@ 'ChatGLMForConditionalGeneration': 'GLMBlock', 'MixtralForCausalLM': 'MixtralDecoderLayer', 'Qwen2VLForConditionalGeneration': 'Qwen2VLDecoderLayer', + 'MistralForCausalLM': 'MistralDecoderLayer', } NORM_TYPE_MAP = { @@ -44,6 +45,7 @@ 'ChatGLMForConditionalGeneration': 'RMSNorm', 'MixtralForCausalLM': 'MixtralRMSNorm', 'Qwen2VLForConditionalGeneration': 'Qwen2RMSNorm', + 'MistralForCausalLM': 'MistralRMSNorm', } HEAD_NAME_MAP = { @@ -61,6 +63,7 @@ 'ChatGLMForConditionalGeneration': 'output_layer', 'MixtralForCausalLM': 'lm_head', 'Qwen2VLForConditionalGeneration': 'lm_head', + 'MistralForCausalLM': 'lm_head', } diff --git a/lmdeploy/lite/apis/smooth_quant.py b/lmdeploy/lite/apis/smooth_quant.py index 45684602b..c8df67355 100644 --- a/lmdeploy/lite/apis/smooth_quant.py +++ b/lmdeploy/lite/apis/smooth_quant.py @@ -1,70 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. 
- -import os.path as osp -import shutil - import fire import torch from torch import nn -import lmdeploy -from lmdeploy.lite.apis.calibrate import calibrate +from lmdeploy.lite.apis.calibrate import (LAYER_TYPE_MAP, NORM_TYPE_MAP, + calibrate) from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP, awq_layers, smooth_layers) from lmdeploy.lite.utils import collect_target_modules from lmdeploy.pytorch.models import QLinear, QRMSNorm -LAYER_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMDecoderLayer', - 'InternLM2ForCausalLM': 'InternLM2DecoderLayer', - 'QWenLMHeadModel': 'QWenBlock', - 'BaiChuanForCausalLM': 'DecoderLayer', - 'LlamaForCausalLM': 'LlamaDecoderLayer', - 'ChatGLMForConditionalGeneration': 'GLMBlock', -} -NORM_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMRMSNorm', - 'InternLM2ForCausalLM': 'InternLM2RMSNorm', - 'QWenLMHeadModel': 'RMSNorm', - 'BaiChuanForCausalLM': 'RMSNorm', - 'LlamaForCausalLM': 'LlamaRMSNorm', - 'ChatGLMForConditionalGeneration': 'RMSNorm', -} - -LMDEPLOY_ROOT = lmdeploy.__path__[0] - -MODEL_PATH_MAP = { - 'InternLMForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm.py'), - 'InternLM2ForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm2.py'), - 'LlamaForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_llama.py'), - 'BaiChuanForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_baichuan.py') -} - -AUTO_MAP = { - 'InternLMForCausalLM': { - 'AutoConfig': 'configuration_internlm.InternLMConfig', - 'AutoModel': 'modeling_internlm.InternLMForCausalLM', - 'AutoModelForCausalLM': 'modeling_internlm.InternLMForCausalLM' - }, - 'InternLM2ForCausalLM': { - 'AutoConfig': 'configuration_internlm2.InternLMConfig', - 'AutoModelForCausalLM': 'modeling_internlm2.InternLM2ForCausalLM', - 'AutoModel': 'modeling_internlm2.InternLM2ForCausalLM' - }, - 'LlamaForCausalLM': { - 'AutoModel': 'modeling_llama.LlamaForCausalLM', - 'AutoModelForCausalLM': 'modeling_llama.LlamaForCausalLM' - }, - 'BaiChuanForCausalLM': { - 'AutoConfig': 'configuration_baichuan.BaiChuanConfig', - 'AutoModelForCausalLM': 'modeling_baichuan.BaiChuanForCausalLM' - } -} - def smooth_quant(model: str, work_dir: str = './work_dir', @@ -146,11 +91,6 @@ def smooth_quant(model: str, setattr(parent, child_name, q_norm) norm.to('cpu') - if hasattr(model.config, 'auto_map'): - model.config.auto_map.update(AUTO_MAP[type(model).__name__]) - else: - model.config.auto_map = AUTO_MAP[type(model).__name__] - if vl_model: from .auto_awq import save_vl_model save_vl_model(vl_model, model_path, work_dir) @@ -162,8 +102,6 @@ def smooth_quant(model: str, safe_serialization=False) tokenizer.save_pretrained(work_dir) - shutil.copy(MODEL_PATH_MAP[type(model).__name__], work_dir) - if __name__ == '__main__': fire.Fire(smooth_quant) diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index 2efe41b6d..cf03a7521 100644 --- a/lmdeploy/lite/quantization/awq.py +++ b/lmdeploy/lite/quantization/awq.py @@ -50,7 +50,12 @@ 'input_layernorm': ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj'] - } + }, + 'MistralDecoderLayer': { + 'input_layernorm': + ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], + 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj'] + }, } FC_FCS_MAP = { @@ -92,6 +97,10 @@ 'Qwen2VLDecoderLayer': { 'self_attn.v_proj': ['self_attn.o_proj'], 'mlp.up_proj': ['mlp.down_proj'] + }, + 'MistralDecoderLayer': { + 
'self_attn.v_proj': ['self_attn.o_proj'], + 'mlp.up_proj': ['mlp.down_proj'] } } diff --git a/lmdeploy/pytorch/modeling/__init__.py b/lmdeploy/pytorch/modeling/__init__.py deleted file mode 100644 index ef101fec6..000000000 --- a/lmdeploy/pytorch/modeling/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/pytorch/modeling/convert_to_qmodules.py b/lmdeploy/pytorch/modeling/convert_to_qmodules.py deleted file mode 100644 index 4a95c9f16..000000000 --- a/lmdeploy/pytorch/modeling/convert_to_qmodules.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn - -from lmdeploy.pytorch.models import QLinear, QRMSNorm - -LAYER_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMDecoderLayer', - 'InternLM2ForCausalLM': 'InternLM2DecoderLayer', - 'QWenLMHeadModel': 'QWenBlock', - 'BaiChuanForCausalLM': 'DecoderLayer', - 'LlamaForCausalLM': 'LlamaDecoderLayer', -} -NORM_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMRMSNorm', - 'InternLM2ForCausalLM': 'InternLM2RMSNorm', - 'QWenLMHeadModel': 'RMSNorm', - 'BaiChuanForCausalLM': 'RMSNorm', - 'LlamaForCausalLM': 'LlamaRMSNorm', -} - - -def convert_decoder_layer(module, norm_type): - """Converts a given module's child layers from regular Linear or RMSNorm to - their Quantized versions (QLinear, QRMSNorm). - - The conversion is done in place. - """ - for name, child in module.named_children(): - if isinstance(child, nn.Linear): - new_child = QLinear.from_float(child, initialization=False) - setattr(module, name, new_child) - elif type(child).__name__ == norm_type: - new_child = QRMSNorm.from_float(child, initialization=False) - setattr(module, name, new_child) - else: - convert_decoder_layer(child, norm_type) - - -def convert(module, layer_type, norm_type): - """Recursively traverses through given PyTorch module and identifies child - layers that match the specified layer_type and norm_type for conversion to - their Quantized counterparts. - - The conversion is done using the `convert_decoder_layer` function. - """ - for child in module.children(): - if type(child).__name__ == layer_type: - convert_decoder_layer(child, norm_type) - else: - convert(child, layer_type, norm_type) - - -def convert_to_qmodules(model): - """Convert all Linear and RMSNorm in the decoder layers of the model into - their Quantized versions (QLinear, QRMSNorm).""" - layer_type = LAYER_TYPE_MAP[type(model).__name__] - norm_type = NORM_TYPE_MAP[type(model).__name__] - convert(model, layer_type, norm_type) - return diff --git a/lmdeploy/pytorch/modeling/modeling_baichuan.py b/lmdeploy/pytorch/modeling/modeling_baichuan.py deleted file mode 100644 index a790e81d0..000000000 --- a/lmdeploy/pytorch/modeling/modeling_baichuan.py +++ /dev/null @@ -1,824 +0,0 @@ -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers import PreTrainedModel -from transformers.activations import ACT2FN -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules -from lmdeploy.utils import get_logger - -from .configuration_baichuan import BaiChuanConfig - -logger = get_logger('lmdeploy') - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask(input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0): - """Make causal mask used for bi-directional self-attention.""" - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), - torch.tensor(torch.finfo(dtype).min, device=device), - device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device), - mask - ], - dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, - tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, - dtype: torch.dtype, - tgt_len: Optional[int] = None): - """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, - src_seq_len]`.""" - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, - src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), - torch.finfo(dtype).min) - - -class RMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """RMSNorm is equivalent to T5LayerNorm.""" - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - variance = hidden_states.to(torch.float32).pow(2).mean(-1, - keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -class RotaryEmbedding(torch.nn.Module): - """RotaryEmbedding for Baichuan Model. - - This module generates sine and cosine positional encodings based on - the paper "RoFormer: Enhanced Transformer with Rotary Position Embedding". - The purpose of this class is to provide positional embeddings to the - input tensors. It utilizes a cache mechanism to store precomputed - sine and cosine values for speedup. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. 
Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. - If None, defaults to the device of the model. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None): - super().__init__() - index = (torch.arange(0, dim, 2).float().to(device) / dim) - inv_freq = 1.0 / (base**index) - self.register_buffer('inv_freq', inv_freq) - - # Build here to make `torch.jit.trace` work. - self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, - device=self.inv_freq.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - - def forward(self, x, seq_len=None): - """Forward propagation method for the embedding layer. - - Generates positional embeddings for the given input tensor. - """ - # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in - # `__init__`. Keep the logic here just in case. - if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=x.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in - # order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Apply rotary positional embeddings to query and key tensors. - - This function applies the cosine and sine positional embeddings on the - input query (q) and key (k) tensors using element-wise multiplication and - addition. - """ - # The first two dimensions of cos and sin are always 1, - # so we can `squeeze` them. 
- cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class MLP(nn.Module): - """MLP for Baichuan Model.""" - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): - super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -class Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, config: BaiChuanConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError('hidden_size must be divisible by num_heads ' - f'(got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - self.W_pack = nn.Linear(self.hidden_size, - 3 * self.hidden_size, - bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=False) - self.rotary_emb = RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Forward propagation method for the attention layer.""" - bsz, q_len, _ = hidden_states.size() - - proj = self.W_pack(hidden_states) - proj = proj.unflatten(-1, - (3, self.hidden_size)).unsqueeze(0).transpose( - 0, -2).squeeze(-2) - query_states = proj[0].view( - bsz, q_len, self.num_heads, self.head_dim).transpose( - 1, 2) # batch_size x source_len x hidden_size - key_states = proj[1].view(bsz, q_len, - self.num_heads, self.head_dim).transpose( - 1, - 2) # batch_size x target_len x head_size - value_states = proj[2].view( - bsz, q_len, self.num_heads, self.head_dim).transpose( - 1, 2) # batch_size x source_len x hidden_size - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - attn_weights = torch.matmul(query_states, 
key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - 'Attention weights should be of size ' - f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}') - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError('Attention mask should be of size ' - f'{(bsz, 1, q_len, kv_seq_len)},' - f' but is {attention_mask.size()}') - attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - '`attn_output` should be of size ' - f'{(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class DecoderLayer(nn.Module): - """Decoder layer for Baichuan Model.""" - - def __init__(self, config: BaiChuanConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = Attention(config=config) - self.mlp = MLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - torch.FloatTensor]]]: - """ # noqa: E501 - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -class PreTrainedModel(PreTrainedModel): - config_class = BaiChuanConfig - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['DecoderLayer'] - _keys_to_ignore_on_load_unexpected = [r'decoder\.version'] - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, Model): - module.gradient_checkpointing = value - - -class Model(PreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. - Each layer is a [`DecoderLayer`] - - Args: - config: BaiChuanConfig - """ - - def __init__(self, config: BaiChuanConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, - self.padding_idx) - self.layers = nn.ModuleList( - [DecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder. 
- # prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to( - inputs_embeds.device) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask + - combined_attention_mask) - - return combined_attention_mask - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = (use_cache - if use_cache is not None else self.config.use_cache) - - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both decoder_input_ids ' - 'and decoder_inputs_embeds at the same time') - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError('You have to specify either decoder_input_ids ' - 'or decoder_inputs_embeds') - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = (seq_length_with_past + - past_key_values_length) - - if position_ids is None: - device = (input_ids.device - if input_ids is not None else inputs_embeds.device) - position_ids = torch.arange(past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, - past_key_values_length) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient ' - 'checkpointing. 
Setting `use_cache=False`...') - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - past_key_value = past_key_values[ - idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += ( - layer_outputs[2 if output_attentions else 1], ) - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class BaiChuanForCausalLM(PreTrainedModel): - """This class extends the `PreTrainedModel` to enable causal language - modeling. - - It wraps the basic Baichuan model (`Model`) and includes a linear layer as - a language model head (`lm_head`). The purpose is to predict token - probabilities, given the previous tokens in the sequence. 
- """ - - def __init__(self, config): - super().__init__(config) - self.model = Model(config) - - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - """Get the token embedding layer.""" - return self.model.embed_tokens - - def set_input_embeddings(self, value): - """Set the token embedding layer.""" - self.model.embed_tokens = value - - def get_output_embeddings(self): - """Get the output embedding layer.""" - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - """Set the output embedding layer.""" - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - """Set the decoder model.""" - self.model = decoder - - def get_decoder(self): - """Get the decoder model.""" - return self.model - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" # noqa: E501 - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, ModelForCausalLM - - >>> model = ModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you consciours? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." 
- ```""" - - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # decoder outputs consists of - # (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): - """Prepare inputs for generating sequences using the model. - - Args: - input_ids (torch.Tensor): Input token ids. - past_key_values (list[torch.Tensor], optional): List of past key - and value states. - attention_mask (torch.Tensor, optional): Mask indicating which - tokens should be attended to. - inputs_embeds (torch.FloatTensor, optional): Optionally, - the input embeddings instead of token ids. - - Returns: - dict: Dictionary containing prepared inputs for model generation. - """ - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, - # we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - model_inputs = {'input_ids': input_ids} - - model_inputs.update({ - 'position_ids': position_ids, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Reorder cached past key-values during generation using beam search. - - This function reorders the cached past key-values according to the - given indices. It's useful in beam search where the order of hypotheses - can change from one time-step to another. 
- """ - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past diff --git a/lmdeploy/pytorch/modeling/modeling_internlm.py b/lmdeploy/pytorch/modeling/modeling_internlm.py deleted file mode 100644 index c64064113..000000000 --- a/lmdeploy/pytorch/modeling/modeling_internlm.py +++ /dev/null @@ -1,1171 +0,0 @@ -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch InternLM model.""" -import math -import queue -import threading -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.generation.streamers import BaseStreamer -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules -from lmdeploy.utils import get_logger - -from .configuration_internlm import InternLMConfig - -logger = get_logger('lmdeploy') - -_CONFIG_FOR_DOC = 'InternLMConfig' - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask(input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0): - """Make causal mask used for bi-directional self-attention.""" - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), - torch.tensor(torch.finfo(dtype).min, device=device), - device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device), - mask - ], - dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, - tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, - dtype: torch.dtype, - tgt_len: Optional[int] = None): - """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, - src_seq_len]`.""" - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, - src_len).to(dtype) - 
- inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), - torch.finfo(dtype).min) - - -class InternLMRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """InternLMRMSNorm is equivalent to T5LayerNorm.""" - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - variance = hidden_states.to(torch.float32).pow(2).mean(-1, - keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -class InternLMRotaryEmbedding(torch.nn.Module): - """RotaryEmbedding for InternLM Model. - - This module generates sine and cosine positional encodings based on - the paper "RoFormer: Enhanced Transformer with Rotary Position Embedding". - The purpose of this class is to provide positional embeddings to the - input tensors. It utilizes a cache mechanism to store precomputed - sine and cosine values for speedup. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. - If None, defaults to the device of the model. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None): - super().__init__() - index = (torch.arange(0, dim, 2).float().to(device) / dim) - inv_freq = 1.0 / (base**index) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, - device=self.inv_freq.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - - def forward(self, x, seq_len=None): - """Forward propagation method for the embedding layer. - - Generates positional embeddings for the given input tensor. - """ - # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in - # `__init__`. Keep the logic here just in case. 
- if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=x.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in - # order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Apply rotary positional embeddings to query and key tensors. - - This function applies the cosine and sine positional embeddings on the - input query (q) and key (k) tensors using element-wise multiplication and - addition. - """ - # The first two dimensions of cos and sin are always 1, so we can - # `squeeze` them. - cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class InternLMMLP(nn.Module): - """MLP for InternLM Model.""" - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): - super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -class InternLMAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, config: InternLMConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError('hidden_size must be divisible by num_heads ' - f'(got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - self.q_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=config.bias) - self.k_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=config.bias) - self.v_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=config.bias) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=config.bias) - self.rotary_emb = InternLMRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: 
Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Forward propagation method for the attention layer.""" - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states).view( - bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view( - bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view( - bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - 'Attention weights should be of size ' - f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}') - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError('Attention mask should be of size ' - f'{(bsz, 1, q_len, kv_seq_len)}, ' - f'but is {attention_mask.size()}') - attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - 'attn_output` should be of size ' - f'`{(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class InternLMDecoderLayer(nn.Module): - """Decoder layer for InternLM Model.""" - - def __init__(self, config: InternLMConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = InternLMAttention(config=config) - self.mlp = InternLMMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) - self.input_layernorm = InternLMRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = InternLMRMSNorm( - config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - 
torch.FloatTensor]]]: - """ # noqa: E501 - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -INTERNLM_START_DOCSTRING = r""" # noqa: E501 - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`InternLMConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - 'The bare InternLM Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - INTERNLM_START_DOCSTRING, -) -class InternLMPreTrainedModel(PreTrainedModel): - config_class = InternLMConfig - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['InternLMDecoderLayer'] - _keys_to_ignore_on_load_unexpected = [r'decoder\.version'] - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, InternLMModel): - module.gradient_checkpointing = value - - -INTERNLM_INPUTS_DOCSTRING = r""" # noqa: E501 - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" - - -@add_start_docstrings( - 'The bare InternLM Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - INTERNLM_START_DOCSTRING, -) -class InternLMModel(InternLMPreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. - Each layer is a [`InternLMDecoderLayer`] - - Args: - config: InternLMConfig - """ - _auto_class = 'AutoModel' - - def __init__(self, config: InternLMConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, - self.padding_idx) - self.layers = nn.ModuleList([ - InternLMDecoderLayer(config) - for _ in range(config.num_hidden_layers) - ]) - self.norm = InternLMRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder. - # prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to( - inputs_embeds.device) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask + - combined_attention_mask) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions or self.config.output_attentions - output_hidden_states = (output_hidden_states - or self.config.output_hidden_states) - use_cache = use_cache or self.config.use_cache - - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both decoder_input_ids ' - 'and decoder_inputs_embeds at the same time') - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError('You have to specify either decoder_input_ids ' - 'or decoder_inputs_embeds') - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = (seq_length_with_past + - 
past_key_values_length) - - if position_ids is None: - device = (input_ids.device - if input_ids is not None else inputs_embeds.device) - position_ids = torch.arange(past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, - past_key_values_length) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient ' - 'checkpointing. Setting `use_cache=False`...') - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - past_key_value = past_key_values[ - idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += ( - layer_outputs[2 if output_attentions else 1], ) - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class InternLMForCausalLM(InternLMPreTrainedModel): - """This class extends the `InternLMPreTrainedModel` to enable causal - language modeling. - - It wraps the basic InternLM model (`InternLMModel`) and includes a linear - layer as a language model head (`lm_head`). The purpose is to predict token - probabilities, given the previous tokens in the sequence. 
- """ - _auto_class = 'AutoModelForCausalLM' - - def __init__(self, config): - super().__init__(config) - self.model = InternLMModel(config) - - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - """Get the token embedding layer.""" - return self.model.embed_tokens - - def set_input_embeddings(self, value): - """Set the token embedding layer.""" - self.model.embed_tokens = value - - def get_output_embeddings(self): - """Get the output embedding layer.""" - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - """Set the output embedding layer.""" - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - """Set the decoder model.""" - self.model = decoder - - def get_decoder(self): - """Get the decoder model.""" - return self.model - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" # noqa: E501 - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, InternLMForCausalLM - - >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you consciours? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." 
- ```""" - - output_attentions = output_attentions or self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # decoder outputs consists of - # (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): - """Prepare inputs for generating sequences using the model. - - Args: - input_ids (torch.Tensor): Input token ids. - past_key_values (list[torch.Tensor], optional): List of past key - and value states. - attention_mask (torch.Tensor, optional): Mask indicating which - tokens should be attended to. - inputs_embeds (torch.FloatTensor, optional): Optionally, - the input embeddings instead of token ids. - - Returns: - dict: Dictionary containing prepared inputs for model generation. - """ - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, - # we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - model_inputs = {'input_ids': input_ids} - - model_inputs.update({ - 'position_ids': position_ids, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Reorder cached past key-values during generation using beam search. - - This function reorders the cached past key-values according to the - given indices. It's useful in beam search where the order of hypotheses - can change from one time-step to another. 
- """ - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - def build_inputs(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = []): - """Builds the input for the model.""" - prompt = '' - for record in history: - prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}\n""" # noqa: E501 - prompt += f"""<|User|>:{query}\n<|Bot|>:""" - return tokenizer([prompt], return_tensors='pt') - - @torch.no_grad() - def chat(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - streamer: Optional[BaseStreamer] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs): - """Provides a chatting functionality for the model.""" - inputs = self.build_inputs(tokenizer, query, history) - inputs = { - k: v.to(self.device) - for k, v in inputs.items() if torch.is_tensor(v) - } - outputs = self.generate(**inputs, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs) - outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):] - response = tokenizer.decode(outputs, skip_special_tokens=True) - response = response.split('')[0] - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs): - """Return a generator in format: (response, history) Eg. - - ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) ('你好,有什么可以帮助您的吗?', [('你好', - '你好,有什么可以帮助您的吗?')]) - """ - - response_queue = queue.Queue(maxsize=20) - - class ChatStreamer(BaseStreamer): - - def __init__(self, tokenizer) -> None: - super().__init__() - self.tokenizer = tokenizer - self.queue = response_queue - self.query = query - self.history = history - self.response = '' - self.received_inputs = False - self.queue.put( - (self.response, history + [(self.query, self.response)])) - - def put(self, value): - if len(value.shape) > 1 and value.shape[0] > 1: - raise ValueError('ChatStreamer only supports batch size 1') - elif len(value.shape) > 1: - value = value[0] - - if not self.received_inputs: - # The first received value is input_ids, ignore here - self.received_inputs = True - return - - token = self.tokenizer.decode([value[-1]], - skip_special_tokens=True) - if token.strip() != '': - self.response = self.response + token - history = self.history + [(self.query, self.response)] - self.queue.put((self.response, history)) - - def end(self): - self.queue.put(None) - - def stream_producer(): - return self.chat(tokenizer=tokenizer, - query=query, - streamer=ChatStreamer(tokenizer=tokenizer), - history=history, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs) - - def consumer(): - producer = threading.Thread(target=stream_producer) - producer.start() - while True: - res = response_queue.get() - if res is None: - return - yield res - - return consumer() - - -@add_start_docstrings( - """ # noqa: E501 - The InternLM Model transformer with a sequence classification head on top (linear layer). - - [`InternLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. 
- - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - INTERNLM_START_DOCSTRING, -) -class InternLMForSequenceClassification(InternLMPreTrainedModel): - _keys_to_ignore_on_load_missing = [r'lm_head.weight'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLMModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" # noqa: E501 - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError( - 'Cannot handle batch sizes > 1 if no padding token is defined.' 
- ) - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = ( - torch.ne(input_ids, self.config.pad_token_id).sum(-1) - - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), - sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), - labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits, ) + transformer_outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/lmdeploy/pytorch/modeling/modeling_internlm2.py b/lmdeploy/pytorch/modeling/modeling_internlm2.py deleted file mode 100644 index cb61df025..000000000 --- a/lmdeploy/pytorch/modeling/modeling_internlm2.py +++ /dev/null @@ -1,1940 +0,0 @@ -# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. -# -# This code is based on transformers/src/transformers/models/llama/modeling_llama.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
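# A minimal sketch of the last-token pooling that InternLMForSequenceClassification.forward
# above relies on, assuming a toy batch and an assumed pad_token_id of 0; the helper name
# `pool_last_token_logits` is hypothetical and only illustrates how the last non-padding
# position is located and used to pick one row of label logits per sequence.
import torch


def pool_last_token_logits(logits: torch.Tensor, input_ids: torch.Tensor,
                           pad_token_id: int) -> torch.Tensor:
    """Select the logits at the last non-padding position of each row."""
    # Count non-padding tokens per row, then step back one to index the last of them.
    sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
    batch_size = logits.shape[0]
    return logits[torch.arange(batch_size, device=logits.device), sequence_lengths]


if __name__ == '__main__':
    # Two right-padded toy sequences; logits shaped (batch, seq_len, num_labels).
    input_ids = torch.tensor([[5, 6, 7, 0, 0], [8, 9, 10, 11, 0]])
    logits = torch.randn(2, 5, 3)
    pooled = pool_last_token_logits(logits, input_ids, pad_token_id=0)
    print(pooled.shape)  # torch.Size([2, 3]): one row of label logits per sequence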
-"""PyTorch InternLM2 model.""" -import math -import queue -import threading -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from einops import rearrange -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache, StaticCache -from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - QuestionAnsweringModelOutput, - SequenceClassifierOutputWithPast, - TokenClassifierOutput) -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, logging, - replace_return_docstrings) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules - -try: - from transformers.generation.streamers import BaseStreamer -except Exception: - BaseStreamer = None - -from .configuration_internlm2 import InternLM2Config - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import (index_first_axis, pad_input, - unpad_input) - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = 'InternLM2Config' - - -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, - dtype=torch.int32), (1, 0)) # pylint: disable=E1102 - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -class InternLM2RMSNorm(nn.Module): - """InternLM2RMSNorm is equivalent to T5LayerNorm.""" - - def __init__(self, hidden_size, eps=1e-6): - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -ALL_LAYERNORM_LAYERS.append(InternLM2RMSNorm) - - -class InternLM2RotaryEmbedding(nn.Module): - """Rotary Position Embedding for the InternLM2 model. - - Credits to the Reddit user /u/lucidrains. 
- """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): - super().__init__() - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**(torch.arange( - 0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings - - @torch.no_grad() - def forward(self, x, position_ids): - # x: [bs, num_attention_heads, seq_len, head_size] - inv_freq_expanded = self.inv_freq[None, :, None].float().expand( - position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if isinstance( - device_type, str) and device_type != 'mps' else 'cpu' - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() - @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): - """InternLM2RotaryEmbedding extended with linear scaling. - - Credits to the Reddit user /u/kaiokendev - """ - - def forward(self, x, position_ids): - # difference to the original RoPE: a scaling factor is applied to the position ids - position_ids = position_ids.float() / self.scaling_factor - cos, sin = super().forward(x, position_ids) - return cos, sin - - -class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): - """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. - - Credits to the Reddit users /u/bloc97 and /u/emozilla - """ - - def forward(self, x, position_ids): - # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_position_embeddings: - base = self.base * ((self.scaling_factor * seq_len / - self.max_position_embeddings) - - (self.scaling_factor - 1))**(self.dim / - (self.dim - 2)) - inv_freq = 1.0 / (base**(torch.arange( - 0, self.dim, 2, dtype=torch.int64).float().to(x.device) / - self.dim)) - self.register_buffer( - 'inv_freq', inv_freq, - persistent=False) # TODO joao: this may break with compilation - - cos, sin = super().forward(x, position_ids) - return cos, sin - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): # pylint: disable=unused-argument - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. 
- unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class InternLM2MLP(nn.Module): - """MLP for InternLM2 model.""" - - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.w1 = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.w3 = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.w2 = nn.Linear(self.intermediate_size, - self.hidden_size, - bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """This is the equivalent of torch.repeat_interleave(x, dim=1, - repeats=n_rep). - - The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, - None, :, :].expand(batch, - num_key_value_heads, - n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, - head_dim) - - -class InternLM2Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, - config: InternLM2Config, - layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f'Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will ' - 'lead to errors during the forward call if caching is used. 
Please make sure to provide a `layer_idx` ' - 'when creating this class.') - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - - self.wqkv = nn.Linear( - self.hidden_size, - (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, - bias=config.bias, - ) - self.wo = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=config.bias) - - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = InternLM2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling['type'] - scaling_factor = self.config.rope_scaling['factor'] - if scaling_type == 'linear': - self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == 'dynamic': - self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f'Unknown RoPE scaling type {scaling_type}') - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, # pylint: disable=unused-argument - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - # split qkv_states by tp size - key_value_slicing = (self.num_key_value_heads * - self.head_dim) // self.config.pretraining_tp - qkv_slices = self.wqkv.weight.split(key_value_slicing, dim=0) - qkv_states = torch.cat( - [ - F.linear(hidden_states, qkv_slice) - for qkv_slice in qkv_slices - ], - dim=-1 # pylint: disable=E1102 - ) - else: - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., :self.num_key_value_groups, :] - query_states = rearrange(query_states, - 'b q h gs d -> b q (h gs) d').transpose(1, 2) - key_states = qkv_states[..., -2, :].transpose(1, 2) - value_states = qkv_states[..., -1, :].transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, 
cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // - self.config.pretraining_tp, - dim=2) - o_proj_slices = self.wo.weight.split(self.hidden_size // - self.config.pretraining_tp, - dim=1) - attn_output = sum([ - F.linear(attn_output[i], o_proj_slices[i]) # pylint: disable=E1102 - for i in range(self.config.pretraining_tp) - ]) - else: - attn_output = self.wo(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class InternLM2FlashAttention2(InternLM2Attention): - """InternLM2 flash attention module. - - This module inherits from `InternLM2Attention` as the weights of the module - stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal - with padding tokens in case the input contains any of them. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, - # that was made default for flash_attn>=2.1. This attribute is used to handle this difference. - # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) - # produces a wrong mask (top-left). 
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10( - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - if isinstance(past_key_value, StaticCache): - raise ValueError( - '`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` ' - 'make sure to use `sdpa` in the mean time, and open an issue at ' - 'https://github.com/huggingface/transformers') - - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., :self.num_key_value_groups, :] - query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') - key_states = qkv_states[..., -2, :] - value_states = qkv_states[..., -1, :] - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs) - - # TODO: These transpose are quite inefficient but Flash Attention requires the layout - # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache - # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # dropout_rate = self.attention_dropout if self.training else 0.0 - dropout_rate = 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. (InternLM2RMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, '_pre_quantization_dtype'): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.wqkv.weight.dtype - - logger.warning_once( - f'The input hidden states seems to be silently casted in float32, this might be related to' - f' the fact you have upcasted embedding or layer norm layers in float32. 
We will cast back the input in' - f' {target_dtype}.') - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward(query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate) - - attn_output = attn_output.reshape(bsz, q_len, - self.hidden_size).contiguous() - attn_output = self.wo(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value # pylint: disable=E0606 - - def _flash_attention_forward(self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. - # For details, please see the comment in InternLM2FlashAttention2 __init__. 
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, - query_length) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( # pylint: disable=E0606 - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, - query_length) # pylint: disable=E0606 - else: - attn_output = flash_attn_func( # pylint: disable=E0606 - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, - query_length): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( - attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( # pylint: disable=E0606 - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - value_layer = index_first_axis( # pylint: disable=E0606 - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - if query_length == kv_seq_len: - query_layer = index_first_axis( # pylint: disable=E0606 - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, - head_dim), indices_k) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( # pylint: disable=E0606 - query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LllamaSdpaAttention with Llama->InternLM2 -class InternLM2SdpaAttention(InternLM2Attention): - """InternLM2 attention module using - torch.nn.functional.scaled_dot_product_attention. - - This module inherits from `InternLM2Attention` as the weights of the module - stays untouched. The only changes are on the forward pass to adapt to SDPA - API. - """ - - # Adapted from InternLM2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. 
`model.config.attn_implementation = "manual"` - # once this is implemented. - logger.warning_once( - 'InternLM2Model uses InternLM2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` ' - 'does not support `output_attentions=True`. Falling back to the manual attention implementation, ' - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. ' - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - - bsz, q_len, _ = hidden_states.size() - - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., :self.num_key_value_groups, :] - query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') - key_states = qkv_states[..., -2, :] - value_states = qkv_states[..., -1, :] - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - causal_mask = attention_mask - if attention_mask is not None: - causal_mask = causal_mask[:, :, :, :key_states.shape[-2]] - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with - # custom attn_mask, Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == 'cuda' and causal_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of - # an inline conditional assignment in SDPA to support both torch.compile's dynamic shapes and full graph - # options. An inline conditional prevents dynamic shapes from compiling. - is_causal = bool(causal_mask is None and q_len > 1) - - attn_output = torch.nn.functional.scaled_dot_product_attention( # pylint: disable=E1102 - query_states, - key_states, - value_states, - attn_mask=causal_mask, - dropout_p=0.0, - is_causal=is_causal, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) - - attn_output = self.wo(attn_output) - - return attn_output, None, past_key_value - - -INTERNLM2_ATTENTION_CLASSES = { - 'eager': InternLM2Attention, - 'flash_attention_2': InternLM2FlashAttention2, - 'sdpa': InternLM2SdpaAttention, -} - - -# Modified from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->InternLM2 -class InternLM2DecoderLayer(nn.Module): - """InternLM2 Decoder Layer. 
- - This module is a single layer of the InternLM2 model. - """ - - def __init__(self, config: InternLM2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - self.layer_idx = layer_idx - - self.attention = INTERNLM2_ATTENTION_CLASSES[ - config.attn_implementation](config=config, layer_idx=layer_idx) - - self.feed_forward = InternLM2MLP(config) - self.attention_norm = InternLM2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.ffn_norm = InternLM2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.attention_norm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.attention( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.ffn_norm(hidden_states) - hidden_states = self.feed_forward(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -InternLM2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`InternLM2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 -@add_start_docstrings( - 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', - InternLM2_START_DOCSTRING, -) -class InternLM2PreTrainedModel(PreTrainedModel): - """InternLM2 pretraiend model's base class.""" - - config_class = InternLM2Config - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['InternLM2DecoderLayer'] - _skip_keys_device_placement = ['past_key_values'] - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - _supports_quantized_cache = True - _supports_static_cache = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -InternLM2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. 
If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, - this tensor is not affected by padding. It is used to update the cache in the correct position and to infer - the complete sequence length. -""" - - -# Modified from transformers.models.llama.modeling_llama.LlamaModel with Llama->InternLM2 -@add_start_docstrings( - 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', - InternLM2_START_DOCSTRING, -) -class InternLM2Model(InternLM2PreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. 
- Each layer is a [`InternLM2DecoderLayer`] - - Args: - config: InternLM2Config - """ - - _auto_class = 'AutoModel' - - def __init__(self, config: InternLM2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.config = config - - self.tok_embeddings = nn.Embedding(config.vocab_size, - config.hidden_size, - self.padding_idx) - - self.layers = nn.ModuleList([ - InternLM2DecoderLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ]) - self.norm = InternLM2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.tok_embeddings - - def set_input_embeddings(self, value): - self.tok_embeddings = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one' - ) - - if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.' 
- ) - use_cache = False - - if inputs_embeds is None: - inputs_embeds = self.tok_embeddings(input_ids) - - return_legacy_cache = False - if use_cache and not isinstance( - past_key_values, - Cache): # kept for BC (non `Cache` `past_key_values` inputs) - return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - - if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length( - ) if past_key_values is not None else 0 - cache_position = torch.arange(past_seen_tokens, - past_seen_tokens + - inputs_embeds.shape[1], - device=inputs_embeds.device) - if position_ids is None: - position_ids = cache_position.unsqueeze(0) - - causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, - cache_position, past_key_values, - output_attentions) - - # embed positions - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - causal_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - cache_position, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[ - 2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if return_legacy_cache: - next_cache = next_cache.to_legacy_cache() - - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - def _update_causal_mask( - self, - attention_mask: torch.Tensor, - input_tensor: torch.Tensor, - cache_position: torch.Tensor, - past_key_values: Cache, - output_attentions: bool, - ): - # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length - # even when the static KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at - # each decode steps due to the dynamic shapes. (`recording cudagraph tree for symint key 13`, etc.), which is - # VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using `fullgraph=True`. - # See more context in https://github.com/huggingface/transformers/pull/29114 - - if self.config.attn_implementation == 'flash_attention_2': - if attention_mask is not None and 0.0 in attention_mask: - return attention_mask - return None - - # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in - # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail - # to infer the attention mask. 
- past_seen_tokens = past_key_values.get_seq_length( - ) if past_key_values is not None else 0 - using_static_cache = isinstance(past_key_values, StaticCache) - - # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config.attn_implementation == 'sdpa' and not using_static_cache and not output_attentions: - if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, - inputs_embeds=input_tensor, - past_key_values_length=past_seen_tokens, - is_training=self.training, - ): - return None - - dtype, device = input_tensor.dtype, input_tensor.device - min_dtype = torch.finfo(dtype).min - sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() - else: - target_length = (attention_mask.shape[-1] if isinstance( - attention_mask, torch.Tensor) else past_seen_tokens + - sequence_length + 1) - - if attention_mask is not None and attention_mask.dim() == 4: - # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing - if attention_mask.max() != 0: - raise ValueError( - 'Custom 4D attention mask should be passed in inverted form with max==0`' - ) - causal_mask = attention_mask - else: - causal_mask = torch.full((sequence_length, target_length), - fill_value=min_dtype, - dtype=dtype, - device=device) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange( - target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand( - input_tensor.shape[0], 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone( - ) # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, : - mask_length] + attention_mask[:, - None, - None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, : - mask_length] = causal_mask[:, :, :, : - mask_length].masked_fill( - padding_mask, - min_dtype) - if (self.config.attn_implementation == 'sdpa' - and attention_mask is not None - and attention_mask.device.type == 'cuda' - and not output_attentions): - # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when - # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
- # Details: https://github.com/pytorch/pytorch/issues/110213 - causal_mask = AttentionMaskConverter._unmask_unattended( - causal_mask, min_dtype) # pylint: disable=E1120 - - return causal_mask - - -# Modified from transformers.models.llama.modeling_llama.LlamaForCausalLM -class InternLM2ForCausalLM(InternLM2PreTrainedModel): - """Causal language model (CLM) for InternLM2.""" - - _auto_class = 'AutoModelForCausalLM' - _tied_weights_keys = ['output.weight'] - - def __init__(self, config): - super().__init__(config) - self.model = InternLM2Model(config) - self.vocab_size = config.vocab_size - self.output = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - return self.model.tok_embeddings - - def set_input_embeddings(self, value): - self.model.tok_embeddings = value - - def get_output_embeddings(self): - return self.output - - def set_output_embeddings(self, new_embeddings): - self.output = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, InternLM2ForCausalLM - - >>> model = InternLM2ForCausalLM.from_pretrained("meta-InternLM2/InternLM2-2-7b-hf") - >>> tokenizer = AutoTokenizer.from_pretrained("meta-InternLM2/InternLM2-2-7b-hf") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - ) - - hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - output_slices = self.output.weight.split( - self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [ - F.linear(hidden_states, output_slices[i]) # pylint: disable=not-callable - for i in range(self.config.pretraining_tp) - ] - logits = torch.cat(logits, dim=-1) - else: - logits = self.output(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - use_cache=True, - **kwargs, - ): - past_length = 0 - if past_key_values is not None: - if isinstance(past_key_values, Cache): - past_length = cache_position[ - 0] if cache_position is not None else past_key_values.get_seq_length( - ) - max_cache_length = (torch.tensor( - past_key_values.get_max_length(), device=input_ids.device) - if past_key_values.get_max_length() - is not None else None) - cache_length = past_length if max_cache_length is None else torch.min( - max_cache_length, past_length) - # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input) - if attention_mask is not None and attention_mask.shape[ - 1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - - past_length):] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
- - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if (max_cache_length is not None and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length): - attention_mask = attention_mask[:, -max_cache_length:] # pylint: disable=E1130 - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1]:] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise - # recompiles graphs as the stride of the inputs is a guard. - # Ref: https://github.com/huggingface/transformers/pull/29114 - # TODO: use `next_tokens` directly instead. - model_inputs = {'input_ids': input_ids.contiguous()} - - input_length = position_ids.shape[ - -1] if position_ids is not None else input_ids.shape[-1] - if cache_position is None: - cache_position = torch.arange(past_length, - past_length + input_length, - device=input_ids.device) - elif use_cache: - cache_position = cache_position[-input_length:] - - model_inputs.update({ - 'position_ids': position_ids, - 'cache_position': cache_position, - 'past_key_values': past_key_values, - 'use_cache': use_cache, - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past), ) - return reordered_past - - def build_inputs(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - meta_instruction=''): - if history is None: - history = [] - if tokenizer.add_bos_token: - prompt = '' - else: - prompt = tokenizer.bos_token - if meta_instruction: - prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" - for record in history: - prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" - prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" - return tokenizer([prompt], return_tensors='pt') - - @torch.no_grad() - def chat( - self, - tokenizer, - query: str, - history: Optional[List[Tuple[str, str]]] = None, - streamer: Optional[BaseStreamer] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - meta_instruction: - str = 'You are an AI assistant whose name is InternLM (书生·浦语).\n' - '- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory ' - '(上海人工智能实验室). 
It is designed to be helpful, honest, and harmless.\n' - '- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such ' - 'as English and 中文.', - **kwargs, - ): - if history is None: - history = [] - inputs = self.build_inputs(tokenizer, query, history, meta_instruction) - inputs = { - k: v.to(self.device) - for k, v in inputs.items() if torch.is_tensor(v) - } - # also add end-of-assistant token in eos token id to avoid unnecessary generation - eos_token_id = [ - tokenizer.eos_token_id, - tokenizer.convert_tokens_to_ids(['<|im_end|>'])[0] - ] - outputs = self.generate( - **inputs, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - eos_token_id=eos_token_id, - **kwargs, - ) - outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):] - response = tokenizer.decode(outputs, skip_special_tokens=True) - response = response.split('<|im_end|>')[0] - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat( - self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs, - ): - if history is None: - history = [] - """ - Return a generator in format: (response, history) - Eg. - ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) - ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) - """ - if BaseStreamer is None: - raise ModuleNotFoundError( - 'The version of `transformers` is too low. Please make sure ' - 'that you have installed `transformers>=4.28.0`.') - - response_queue = queue.Queue(maxsize=20) - - class ChatStreamer(BaseStreamer): - """Streamer used in generate to print words one by one.""" - - def __init__(self, tokenizer) -> None: - super().__init__() - self.tokenizer = tokenizer - self.queue = response_queue - self.query = query - self.history = history - self.response = '' - self.cache = [] - self.received_inputs = False - self.queue.put( - (self.response, history + [(self.query, self.response)])) - - def put(self, value): - if len(value.shape) > 1 and value.shape[0] > 1: - raise ValueError('ChatStreamer only supports batch size 1') - elif len(value.shape) > 1: - value = value[0] - - if not self.received_inputs: - # The first received value is input_ids, ignore here - self.received_inputs = True - return - - self.cache.extend(value.tolist()) - token = self.tokenizer.decode(self.cache, - skip_special_tokens=True) - if token.strip() != '<|im_end|>': - self.response = self.response + token - history = self.history + [(self.query, self.response)] - self.queue.put((self.response, history)) - self.cache = [] - else: - self.end() - - def end(self): - self.queue.put(None) - - def stream_producer(): - return self.chat( - tokenizer=tokenizer, - query=query, - streamer=ChatStreamer(tokenizer=tokenizer), - history=history, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs, - ) - - def consumer(): - producer = threading.Thread(target=stream_producer) - producer.start() - while True: - res = response_queue.get() - if res is None: - return - yield res - - return consumer() - - -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 -@add_start_docstrings( - """ - The InternLM2 Model transformer with a sequence classification head on top (linear layer). 
- - [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - InternLM2_START_DOCSTRING, -) -class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): - """Sequence Classification Head for InternLM2 Model.""" - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLM2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.tok_embeddings - - def set_input_embeddings(self, value): - self.model.tok_embeddings = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError( - 'Cannot handle batch sizes > 1 if no padding token is defined.' 
- ) - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq( - input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), - sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype - in (torch.long, torch.int)): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), - labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits, ) + transformer_outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaForQuestionAnswering with Llama->InternLM2 -@add_start_docstrings( - """ -The InternLM2 Model transformer with a span classification head on top for extractive question-answering tasks like -SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
- """, - InternLM2_START_DOCSTRING, -) -class InternLM2ForQuestionAnswering(InternLM2PreTrainedModel): - """Question Answering model for InternLM2.""" - - base_model_prefix = 'transformer' - - def __init__(self, config): - super().__init__(config) - self.transformer = InternLM2Model(config) - self.qa_outputs = nn.Linear(config.hidden_size, 2) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.transformer.embed_tokens - - def set_input_embeddings(self, value): - self.transformer.embed_tokens = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - end_positions: Optional[torch.LongTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1).to( - start_logits.device) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1).to(end_logits.device) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss, ) + - output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->InternLM2 -@add_start_docstrings( - """ - The InternLM2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states - output) e.g. for Named-Entity-Recognition (NER) tasks. 
- """, - InternLM2_START_DOCSTRING, -) -class InternLM2ForTokenClassification(InternLM2PreTrainedModel): - """Token classification model for InternLM2.""" - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLM2Model(config) - if getattr(config, 'classifier_dropout', None) is not None: - classifier_dropout = config.classifier_dropout - elif getattr(config, 'hidden_dropout', None) is not None: - classifier_dropout = config.hidden_dropout - else: - classifier_dropout = 0.1 - self.dropout = nn.Dropout(classifier_dropout) - self.score = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.score(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/lmdeploy/pytorch/modeling/modeling_llama.py b/lmdeploy/pytorch/modeling/modeling_llama.py deleted file mode 100644 index c37d1e9ae..000000000 --- a/lmdeploy/pytorch/modeling/modeling_llama.py +++ /dev/null @@ -1,1297 +0,0 @@ -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast) -from transformers.modeling_utils import PreTrainedModel -from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules -from lmdeploy.utils import get_logger - -logger = get_logger('lmdeploy') - -_CONFIG_FOR_DOC = 'LlamaConfig' - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask(input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0): - """Make causal mask used for bi-directional self-attention.""" - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), - torch.finfo(dtype).min, - device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device), - mask - ], - dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, - tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, - dtype: torch.dtype, - tgt_len: Optional[int] = None): - """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, - src_seq_len]`.""" - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, - src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), - torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """LlamaRMSNorm is equivalent to T5LayerNorm.""" - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - """RotaryEmbedding for Llama Model. - - This module generates sine and cosine positional encodings based on - the paper "RoFormer: Enhanced Transformer with Rotary Position Embedding". - The purpose of this class is to provide positional embeddings to the - input tensors. 
It utilizes a cache mechanism to store precomputed - sine and cosine values for speedup. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. - If None, defaults to the device of the model. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, - device=self.inv_freq.device, - dtype=torch.get_default_dtype()) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - """Sets the cached sine and cosine values for the specified sequence - length. - - Args: - seq_len (int): The sequence length for which to set the cache. - device (str): The device to use for computation. - dtype (torch.dtype): The data type to be used for tensors. - """ - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :].to(dtype), - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :].to(dtype), - persistent=False) - - def forward(self, x, seq_len=None): - """Forward propagation method for the embedding layer. Generates - positional embeddings for the given input tensor. - - If the sequence length is larger than the cache, it resets the cache. - - Args: - x (torch.Tensor): Input tensor of shape - [batch_size, num_attention_heads, seq_len, head_size]. - seq_len (int, optional): Sequence length. If None, it is obtained - from `x`. - - Returns: - tuple: Tuple containing cosine and sine positional embeddings. - """ - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, - device=x.device, - dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """This class extends the `LlamaRotaryEmbedding` with linear scaling. - - It provides a mechanism for adjusting the scale of the positional - embeddings by dividing the tensor generated by the range of sequence length - with a scaling factor. This is useful when dealing with sequences of - varying lengths. - - Credits to Reddit User /u/kaiokendev for this extension. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. If None, - defaults to the device of the model. 
- scaling_factor (float, optional): Scaling factor used in adjusting - the scale of positional embeddings. Default is 1.0. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - """Sets the cached sine and cosine values for the specified sequence - length. - - Args: - seq_len (int): The sequence length for which to set the cache. - device (str): The device to use for computation. - dtype (torch.dtype): The data type to use for tensors. - """ - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :].to(dtype), - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :].to(dtype), - persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. - - Credits to the Reddit users /u/bloc97 and /u/emozilla - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ((self.scaling_factor * seq_len / - self.max_position_embeddings) - - (self.scaling_factor - 1))**(self.dim / - (self.dim - 2)) - inv_freq = 1.0 / (base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :].to(dtype), - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :].to(dtype), - persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Apply rotary positional embeddings to query and key tensors. - - This function applies the cosine and sine positional embeddings on the - input query (q) and key (k) tensors using element-wise multiplication and - addition. - """ - # The first two dimensions of cos and sin are always 1, - # so we can `squeeze` them. 
- cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - """MLP for Llama Model.""" - - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.up_proj = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.down_proj = nn.Linear(self.intermediate_size, - self.hidden_size, - bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([ - F.linear(x, gate_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ], - dim=-1) - up_proj = torch.cat([ - F.linear(x, up_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ], - dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split( - slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj( - self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """This is the equivalent of torch.repeat_interleave(x, dim=1, - repeats=n_rep). 
- - The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, - None, :, :].expand(batch, - num_key_value_heads, - n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, - head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError('hidden_size must be divisible by num_heads ' - f'(got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - self.q_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=False) - self.k_proj = nn.Linear(self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False) - self.v_proj = nn.Linear(self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=False) - self._init_rope() - - def _init_rope(self): - """Initialize the Rotary Embedding Module.""" - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling['type'] - scaling_factor = self.config.rope_scaling['factor'] - if scaling_type == 'linear': - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == 'dynamic': - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f'Unknown RoPE scaling type {scaling_type}') - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Forward propagation method for the attention layer.""" - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * - self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, - dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [ - 
F.linear(hidden_states, query_slices[i]) - for i in range(self.config.pretraining_tp) - ] - query_states = torch.cat(query_states, dim=-1) - - key_states = [ - F.linear(hidden_states, key_slices[i]) - for i in range(self.config.pretraining_tp) - ] - key_states = torch.cat(key_states, dim=-1) - - value_states = [ - F.linear(hidden_states, value_slices[i]) - for i in range(self.config.pretraining_tp) - ] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - 'Attention weights should be of size ' - f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}') - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError('Attention mask should be of size ' - f'{(bsz, 1, q_len, kv_seq_len)}, ' - f'but is {attention_mask.size()}') - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - '`attn_output` should be of size ' - f'{(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // - self.config.pretraining_tp, - dim=2) - o_proj_slices = self.o_proj.weight.split( - self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([ - F.linear(attn_output[i], o_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - """Decoder layer for Llama Model.""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - 
self.input_layernorm = LlamaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape - `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask - of size `(batch, 1, tgt_len, src_len)` where padding elements - are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all - attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are - returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached - past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -LLAMA_START_DOCSTRING = r""" # noqa: E501 - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['LlamaDecoderLayer'] - _skip_keys_device_placement = 'past_key_values' - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" # noqa: E501 - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', # noqa: E501
-    LLAMA_START_DOCSTRING,
-)
-class LlamaModel(LlamaPreTrainedModel):
-    """Transformer decoder consisting of *config.num_hidden_layers* layers.
-    Each layer is a [`LlamaDecoderLayer`]
-
-    Args:
-        config: LlamaConfig
-    """
-
-    def __init__(self, config: LlamaConfig):
-        super().__init__(config)
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size,
-                                         self.padding_idx)
-        self.layers = nn.ModuleList([
-            LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)
-        ])
-        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.embed_tokens = value
-
-    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask # noqa
-    def _prepare_decoder_attention_mask(self, attention_mask, input_shape,
-                                        inputs_embeds, past_key_values_length):
-        # create causal mask
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        combined_attention_mask = None
-        if input_shape[-1] > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape,
-                inputs_embeds.dtype,
-                device=inputs_embeds.device,
-                past_key_values_length=past_key_values_length,
-            )
-
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            expanded_attn_mask = _expand_mask(attention_mask,
-                                              inputs_embeds.dtype,
-                                              tgt_len=input_shape[-1]).to(
-                                                  inputs_embeds.device)
-            combined_attention_mask = (expanded_attn_mask
-                                       if combined_attention_mask is None else
-                                       expanded_attn_mask +
-                                       combined_attention_mask)
-
-        return combined_attention_mask
-
-    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = (output_attentions if output_attentions is not None
-                             else self.config.output_attentions)
-        output_hidden_states = (output_hidden_states
-                                if output_hidden_states is not None else
-                                self.config.output_hidden_states)
-        use_cache = (use_cache
-                     if use_cache is not None else self.config.use_cache)
-
-        return_dict = (return_dict if return_dict is not None else
-                       self.config.use_return_dict)
-
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError('You cannot specify both decoder_input_ids '
-                             'and decoder_inputs_embeds at the same time')
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape
-        elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape
-        else:
-            raise ValueError('You have to specify either decoder_input_ids '
-                             'or decoder_inputs_embeds')
-
-        seq_length_with_past = seq_length
-        past_key_values_length = 0
-
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
-            seq_length_with_past = (seq_length_with_past +
-                                    past_key_values_length)
-
-        if position_ids is None:
-            device = (input_ids.device
-                      if input_ids is not None else inputs_embeds.device)
-            position_ids = torch.arange(past_key_values_length,
-                                        seq_length + past_key_values_length,
-                                        dtype=torch.long,
-                                        device=device)
-            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-        else:
-            position_ids = position_ids.view(-1, seq_length).long()
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-        # embed positions
-        if attention_mask is None:
-            attention_mask = torch.ones((batch_size, seq_length_with_past),
-                                        dtype=torch.bool,
-                                        device=inputs_embeds.device)
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, (batch_size, seq_length), inputs_embeds,
-            past_key_values_length)
-
-        hidden_states = inputs_embeds
-
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    '`use_cache=True` is incompatible with gradient'
-                    ' checkpointing. Setting `use_cache=False`...')
-                use_cache = False
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        next_decoder_cache = () if use_cache else None
-
-        for idx, decoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                all_hidden_states += (hidden_states, )
-
-            past_key_value = past_key_values[
-                idx] if past_key_values is not None else None
-
-            if self.gradient_checkpointing and self.training:
-
-                def create_custom_forward(module):
-
-                    def custom_forward(*inputs):
-                        # None for past_key_value
-                        return module(*inputs, past_key_value,
-                                      output_attentions)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(decoder_layer),
-                    hidden_states,
-                    attention_mask,
-                    position_ids,
-                )
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    position_ids=position_ids,
-                    past_key_value=past_key_value,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                )
-
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache += (
-                    layer_outputs[2 if output_attentions else 1], )
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1], )
-
-        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states, )
-
-        next_cache = next_decoder_cache if use_cache else None
-        if not return_dict:
-            return tuple(
-                v for v in
-                [hidden_states, next_cache, all_hidden_states, all_self_attns]
-                if v is not None)
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
-
-
-class LlamaForCausalLM(LlamaPreTrainedModel):
-    """This class extends the `LlamaPreTrainedModel` to enable causal language
-    modeling.
-
-    It wraps the basic Llama model (`LlamaModel`) and includes a linear layer
-    as a language model head (`lm_head`). The purpose is to predict token
-    probabilities, given the previous tokens in the sequence.
- """ - _tied_weights_keys = ['lm_head.weight'] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - """Get the token embedding layer.""" - return self.model.embed_tokens - - def set_input_embeddings(self, value): - """Set the token embedding layer.""" - self.model.embed_tokens = value - - def get_output_embeddings(self): - """Get the output embedding layer.""" - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - """Set the output embedding layer.""" - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - """Set the decoder model.""" - self.model = decoder - - def get_decoder(self): - """Get the decoder model.""" - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" # noqa: E501 - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
-        ```"""
-
-        output_attentions = (output_attentions if output_attentions is not None
-                             else self.config.output_attentions)
-        output_hidden_states = (output_hidden_states
-                                if output_hidden_states is not None else
-                                self.config.output_hidden_states)
-        return_dict = (return_dict if return_dict is not None else
-                       self.config.use_return_dict)
-
-        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) # noqa: E501
-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = outputs[0]
-        if self.config.pretraining_tp > 1:
-            lm_head_slices = self.lm_head.weight.split(
-                self.vocab_size // self.config.pretraining_tp, dim=0)
-            logits = [
-                F.linear(hidden_states, lm_head_slices[i])
-                for i in range(self.config.pretraining_tp)
-            ]
-            logits = torch.cat(logits, dim=-1)
-        else:
-            logits = self.lm_head(hidden_states)
-        logits = logits.float()
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels)
-
-        if not return_dict:
-            output = (logits, ) + outputs[1:]
-            return (loss, ) + output if loss is not None else output
-
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-    def prepare_inputs_for_generation(self,
-                                      input_ids,
-                                      past_key_values=None,
-                                      attention_mask=None,
-                                      inputs_embeds=None,
-                                      **kwargs):
-        """Prepare inputs for generating sequences using the model.
-
-        Args:
-            input_ids (torch.Tensor): Input token ids.
-            past_key_values (list[torch.Tensor], optional): List of past key
-                and value states.
-            attention_mask (torch.Tensor, optional): Mask indicating which
-                tokens should be attended to.
-            inputs_embeds (torch.FloatTensor, optional): Optionally,
-                the input embeddings instead of token ids.
-
-        Returns:
-            dict: Dictionary containing prepared inputs for model generation.
-        """
-        if past_key_values:
-            input_ids = input_ids[:, -1:]
-
-        position_ids = kwargs.get('position_ids', None)
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -1].unsqueeze(-1)
-
-        # if `inputs_embeds` are passed, we only want to use them
-        # in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {'inputs_embeds': inputs_embeds}
-        else:
-            model_inputs = {'input_ids': input_ids}
-
-        model_inputs.update({
-            'position_ids': position_ids,
-            'past_key_values': past_key_values,
-            'use_cache': kwargs.get('use_cache'),
-            'attention_mask': attention_mask,
-        })
-        return model_inputs
-
-    @staticmethod
-    def _reorder_cache(past_key_values, beam_idx):
-        """Reorder cached past key-values during generation using beam search.
-
-        This function reorders the cached past key-values according to the
-        given indices. It's useful in beam search where the order of hypotheses
-        can change from one time-step to another.
-        """
-        reordered_past = ()
-        for layer_past in past_key_values:
-            reordered_past += (tuple(
-                past_state.index_select(0, beam_idx.to(past_state.device))
-                for past_state in layer_past), )
-        return reordered_past
-
-
-@add_start_docstrings(
-    """ # noqa: E501
-    The LLaMa Model transformer with a sequence classification head on top (linear layer).
-
-    [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
-    (e.g. GPT-2) do.
-
-    Since it does classification on the last token, it requires to know the position of the last token. If a
-    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
-    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
-    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
-    each row of the batch).
-    """,
-    LLAMA_START_DOCSTRING,
-)
-class LlamaForSequenceClassification(LlamaPreTrainedModel):
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.model = LlamaModel(config)
-        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.model.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.model.embed_tokens = value
-
-    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
-        r""" # noqa: E501
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = (return_dict if return_dict is not None else
-                       self.config.use_return_dict)
-
-        transformer_outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = transformer_outputs[0]
-        logits = self.score(hidden_states)
-
-        if input_ids is not None:
-            batch_size = input_ids.shape[0]
-        else:
-            batch_size = inputs_embeds.shape[0]
-
-        if self.config.pad_token_id is None and batch_size != 1:
-            raise ValueError(
-                'Cannot handle batch sizes > 1 if no padding token is defined.'
-            )
-        if self.config.pad_token_id is None:
-            sequence_lengths = -1
-        else:
-            if input_ids is not None:
-                sequence_lengths = (torch.eq(
-                    input_ids, self.config.pad_token_id).long().argmax(-1) -
-                                    1).to(logits.device)
-            else:
-                sequence_lengths = -1
-
-        pooled_logits = logits[torch.arange(batch_size, device=logits.device),
-                               sequence_lengths]
-
-        loss = None
-        if labels is not None:
-            labels = labels.to(logits.device)
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = 'regression'
-                elif self.num_labels > 1 and (labels.dtype == torch.long
-                                              or labels.dtype == torch.int):
-                    self.config.problem_type = 'single_label_classification'
-                else:
-                    self.config.problem_type = 'multi_label_classification'
-
-            if self.config.problem_type == 'regression':
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(pooled_logits, labels)
-            elif self.config.problem_type == 'single_label_classification':
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(pooled_logits.view(-1, self.num_labels),
-                                labels.view(-1))
-            elif self.config.problem_type == 'multi_label_classification':
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(pooled_logits, labels)
-        if not return_dict:
-            output = (pooled_logits, ) + transformer_outputs[1:]
-            return ((loss, ) + output) if loss is not None else output
-
-        return SequenceClassifierOutputWithPast(
-            loss=loss,
-            logits=pooled_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-        )
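
For readers skimming this removed file, the last-token pooling that `LlamaForSequenceClassification` documents above can be sketched in isolation. This is a minimal, hypothetical example with toy tensors and a made-up `pad_token_id`; it is not part of the patch:

```python
import torch

# Toy inputs: batch of 2 sequences, right-padded with pad_token_id = 0.
pad_token_id = 0
input_ids = torch.tensor([[5, 7, 9, 0, 0, 0],
                          [4, 6, 8, 2, 3, 0]])
logits = torch.randn(2, 6, 3)  # (batch, seq_len, num_labels)

# Last non-padding position per row: first pad position minus one,
# the same argmax(-1) - 1 trick used in forward() above.
sequence_lengths = torch.eq(input_ids, pad_token_id).long().argmax(-1) - 1

# One pooled logit vector per sequence.
pooled_logits = logits[torch.arange(input_ids.shape[0]), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 3])
```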