From 64c5084a2b49c0e974131aeca3d72edd5f3d2ccb Mon Sep 17 00:00:00 2001 From: cookieyyds <126683903+cookieyyds@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:07:49 +0800 Subject: [PATCH 001/122] Add instruction for downloading models from openmind hub (#2577) --- README.md | 4 ++++ README_ja.md | 4 ++++ README_zh-CN.md | 4 ++++ lmdeploy/utils.py | 2 +- 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 26763b3c53..41ff9c0268 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,10 @@ print(response) > By default, LMDeploy downloads model from HuggingFace. If you would like to use models from ModelScope, please install ModelScope by `pip install modelscope` and set the environment variable: > > `export LMDEPLOY_USE_MODELSCOPE=True` +> +> If you would like to use models from openMind Hub, please install openMind Hub by `pip install openmind_hub` and set the environment variable: +> +> `export LMDEPLOY_USE_OPENMIND_HUB=True` For more information about inference pipeline, please refer to [here](docs/en/llm/pipeline.md). diff --git a/README_ja.md b/README_ja.md index 9a7419e7b3..ea4480e282 100644 --- a/README_ja.md +++ b/README_ja.md @@ -197,6 +197,10 @@ print(response) > デフォルトでは、LMDeployはHuggingFaceからモデルをダウンロードします。ModelScopeからモデルを使用する場合は、`pip install modelscope`コマンドでModelScopeをインストールし、環境変数を設定してください: > > `export LMDEPLOY_USE_MODELSCOPE=True` +> +> openMind Hubからモデルを使用する場合は、`pip install openmind_hub`コマンドでopenMind Hubをインストールし、環境変数を設定してください: +> +> `export LMDEPLOY_USE_OPENMIND_HUB=True` 推論パイプラインに関する詳細情報は[こちら](./docs/en/llm/pipeline.md)を参照してください。 diff --git a/README_zh-CN.md b/README_zh-CN.md index 4b9f85c735..cdddb64a22 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -199,6 +199,10 @@ print(response) > LMDeploy 默认从 HuggingFace 上面下载模型,如果要从 ModelScope 上面下载模型,请通过命令 `pip install modelscope` 安装ModelScope,并设置环境变量: > > `export LMDEPLOY_USE_MODELSCOPE=True` +> +> 如果要从 openMind Hub 上面下载模型,请通过命令 `pip install openmind_hub` 安装openMind Hub,并设置环境变量: +> +> `export LMDEPLOY_USE_OPENMIND_HUB=True` 关于 pipeline 的更多推理参数说明,请参考[这里](docs/zh_cn/llm/pipeline.md) diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py index 599af15900..a540b73916 100644 --- a/lmdeploy/utils.py +++ b/lmdeploy/utils.py @@ -186,7 +186,7 @@ def get_model(pretrained_model_name_or_path: str, download_dir: str = None, revision: str = None, token: str = None): - """Get model from huggingface or modelscope.""" + """Get model from huggingface, modelscope or openmind_hub.""" import os if os.getenv('LMDEPLOY_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download From 1d442df3076fa7227d89a98e47f49a6b75301469 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Sat, 12 Oct 2024 14:45:56 +0800 Subject: [PATCH 002/122] [Doc]: Lock sphinx version (#2594) * update * fix link --- docs/en/quantization/w4a16.md | 2 +- requirements/docs.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index 3adaf7a750..3a04cd7b05 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -69,7 +69,7 @@ lmdeploy serve gradio ./internlm2_5-7b-chat-4bit --server_name {ip_addr} --serve ## Evaluation -Please overview [this guide](https://opencompass.readthedocs.io/en/latest/advanced_guides/evaluation_turbomind.html) about model evaluation with LMDeploy. +Please refer to [OpenCompass](https://opencompass.readthedocs.io/en/latest/index.html) about model evaluation with LMDeploy. 
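The first patch in this series only extends the READMEs and the `get_model` docstring; the openMind Hub download path is selected purely through the environment variables documented above. For orientation, a minimal sketch of that kind of env-var driven hub selection is given below. It is modeled on the `LMDEPLOY_USE_MODELSCOPE` branch visible in the `lmdeploy/utils.py` hunk; the `openmind_hub` import and the exact `snapshot_download` keyword arguments are assumptions, not code taken from this diff.

```python
# Illustrative sketch of env-var based hub selection (not the repo's actual code).
import os


def download_model(model_id: str, download_dir: str = None, revision: str = None) -> str:
    """Resolve a model id to a local snapshot, honoring the LMDEPLOY_USE_* switches."""
    if os.getenv('LMDEPLOY_USE_MODELSCOPE', 'False').lower() == 'true':
        # enabled via `pip install modelscope` + LMDEPLOY_USE_MODELSCOPE=True
        from modelscope import snapshot_download
    elif os.getenv('LMDEPLOY_USE_OPENMIND_HUB', 'False').lower() == 'true':
        # enabled via `pip install openmind_hub` + LMDEPLOY_USE_OPENMIND_HUB=True
        from openmind_hub import snapshot_download
    else:
        from huggingface_hub import snapshot_download
    return snapshot_download(model_id, revision=revision, cache_dir=download_dir)
```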
## Inference diff --git a/requirements/docs.txt b/requirements/docs.txt index 4805d25fd6..47a4ce2e19 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,6 +1,6 @@ markdown>=3.4.0 myst-parser -sphinx +sphinx==8.0.2 sphinx-book-theme sphinx-copybutton sphinx-tabs From 9b52f8d5b5131dd0a7e8f2069a25719daec51dbc Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Sat, 12 Oct 2024 17:06:19 +0800 Subject: [PATCH 003/122] [ci] use local requirements for test workflow (#2569) * update * update --- .github/workflows/benchmark.yml | 8 +------- .github/workflows/daily_ete_test.yml | 16 +++++----------- .github/workflows/evaluate.yml | 8 +------- .github/workflows/stable.yml | 8 +------- 4 files changed, 8 insertions(+), 32 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index cf8283bbda..bd3876f9ed 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -23,11 +23,6 @@ on: description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' type: boolean default: false - dependency_pkgs: - required: true - description: 'Dependency packages, you can also set a specific version' - type: string - default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -35,7 +30,6 @@ env: OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}} FAIL_CONFIG: ${{ github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} jobs: @@ -115,7 +109,7 @@ jobs: python3 -m pip install -e /root/packages/AutoAWQ_kernels python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps - python3 -m pip install ${{env.dependency_pkgs}} + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 16202656ec..1cb242a74b 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -28,11 +28,6 @@ on: description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' type: boolean default: false - dependency_pkgs: - required: true - description: 'Dependency packages, you can also set a specific version' - type: string - default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq qwen_vl_utils mmengine-lite==0.10.5' regression_func: required: true description: 'regression functions' @@ -43,7 +38,6 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache - dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq qwen_vl_utils mmengine-lite==0.10.5'}} HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true @@ -132,7 +126,7 @@ 
jobs: python3 -m pip install -e /root/packages/AutoAWQ_kernels python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps - python3 -m pip install ${{env.dependency_pkgs}} + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | @@ -306,7 +300,7 @@ jobs: # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install ${{env.dependency_pkgs}} + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | @@ -410,7 +404,7 @@ jobs: # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install ${{env.dependency_pkgs}} + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | @@ -483,7 +477,7 @@ jobs: # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install ${{env.dependency_pkgs}} + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | @@ -559,7 +553,7 @@ jobs: # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps - python3 -m pip install ${{env.dependency_pkgs}} + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 433f432efa..1e8d78d143 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -43,15 +43,9 @@ on: description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' type: boolean default: false - dependency_pkgs: - required: true - description: 'Dependency packages, you can also set a specific version' - type: string - default: 'pynvml packaging protobuf transformers_stream_generator transformers human_eval mmengine-lite==0.10.5' env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - dependency_pkgs: ${{inputs.dependency_pkgs || 'pynvml packaging protobuf transformers_stream_generator transformers human_eval mmengine-lite==0.10.5'}} jobs: linux-build: @@ -136,7 +130,7 @@ jobs: # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps - python3 -m pip install ${{env.dependency_pkgs}} + python3 -m pip install -r /root/models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 97a9df7826..85daed8e2b 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -18,11 +18,6 @@ on: description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' type: boolean default: false - dependency_pkgs: - required: true - description: 'Dependency packages, you can also set a specific version' - type: string - default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath mmengine-lite==0.10.5' schedule: - cron: '00 8 * * 1' @@ -32,7 +27,6 @@ env: OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} REPORT_DIR: /nvme/qa_test_models/stable_reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath mmengine-lite==0.10.5'}} COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset jobs: @@ -110,7 +104,7 @@ jobs: # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps - python3 -m pip install ${{env.dependency_pkgs}} + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | From 88eccb20402ec93dc29074c9bbf13ea7b5ce6103 Mon Sep 17 00:00:00 2001 From: CyCle1024 Date: Mon, 14 Oct 2024 11:35:58 +0800 Subject: [PATCH 004/122] fix: make exit_flag verification for ascend more general (#2588) --- lmdeploy/pytorch/engine/model_agent.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 9649a61d2f..bba25213b6 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -726,8 +726,7 @@ def _exit_by_sending_exit_flag(rank: int, agent: TPModelAgent): return import sys - if agent.backend_config.device_type == 'ascend' \ - and 'uvicorn.server' in sys.modules: + if 'torch_npu' in sys.modules and 'uvicorn.server' in sys.modules: # Workaround for CLI serve mode with device_type ascend: # using uvicorn server causes ascend low-level backend of subprocesses # corrupted, and using _broadcast_inputs in this case leads to From 4126067ad89cd7c9bf7191f1483bcd69224041c1 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Mon, 14 Oct 2024 20:40:39 +0800 Subject: [PATCH 005/122] Support pytorch engine kv int4/int8 quantization (#2438) * Support pytorch engine kv int4/int8 quantization * add baichuan kernel * update all the models * refine int4 dequant and skip ut * no trans op enable reshape->view * update benchmark cli and UT * mv quant_policy to attn_meta * resolve comments * triton>=2.2.0 
check for int4 * better unpack method for int4 * cast inside _quant_int4 * add ascend device check * add short docstring for kernels * fix qwen2-vl and update documents * update documents --- benchmark/profile_throughput.py | 4 +- docs/en/supported_models/supported_models.md | 82 +- .../supported_models/supported_models.md | 82 +- lmdeploy/cli/cli.py | 6 +- lmdeploy/cli/serve.py | 5 +- lmdeploy/messages.py | 6 + lmdeploy/pytorch/backends/ascend/attention.py | 2 + lmdeploy/pytorch/backends/attention.py | 3 +- lmdeploy/pytorch/backends/cuda/attention.py | 12 + lmdeploy/pytorch/backends/cuda/op_backend.py | 1 + lmdeploy/pytorch/config.py | 3 +- lmdeploy/pytorch/engine/cache_engine.py | 92 +- lmdeploy/pytorch/engine/engine.py | 1 + lmdeploy/pytorch/engine/model_agent.py | 4 +- .../kernels/cuda/alibi_pagedattention.py | 644 ++++++++++++-- .../pytorch/kernels/cuda/fill_kv_cache.py | 366 +++++++- .../pytorch/kernels/cuda/pagedattention.py | 792 ++++++++++++++++-- lmdeploy/pytorch/model_inputs.py | 7 +- lmdeploy/pytorch/models/baichuan.py | 4 + lmdeploy/pytorch/models/chatglm2.py | 4 + lmdeploy/pytorch/models/cogvlm.py | 4 + lmdeploy/pytorch/models/dbrx.py | 4 + lmdeploy/pytorch/models/deepseek.py | 4 + lmdeploy/pytorch/models/deepseek_v2.py | 4 + lmdeploy/pytorch/models/falcon.py | 4 + lmdeploy/pytorch/models/gemma.py | 4 + lmdeploy/pytorch/models/internlm.py | 4 + lmdeploy/pytorch/models/internlm2.py | 4 + lmdeploy/pytorch/models/llama.py | 4 + lmdeploy/pytorch/models/mistral.py | 4 + lmdeploy/pytorch/models/mixtral.py | 4 + lmdeploy/pytorch/models/phi3.py | 4 + lmdeploy/pytorch/models/phi3_moe.py | 4 + lmdeploy/pytorch/models/qwen.py | 4 + lmdeploy/pytorch/models/qwen2.py | 4 + lmdeploy/pytorch/models/qwen2_moe.py | 4 + lmdeploy/pytorch/models/qwen2_vl.py | 4 + lmdeploy/pytorch/models/starcoder2.py | 4 + lmdeploy/pytorch/nn/attention.py | 4 + tests/pytorch/kernel/test_fill_kv_cache.py | 142 ++++ tests/pytorch/kernel/test_paged_attention.py | 154 ++++ 41 files changed, 2192 insertions(+), 300 deletions(-) diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 23fa317810..e6f461d97b 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -286,6 +286,7 @@ def parse_args(): cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group) cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) + quant_policy_act = ArgumentHelper.quant_policy(pt_group, default=0) # turbomind engine args tb_group = parser.add_argument_group('TurboMind engine argument') @@ -294,8 +295,8 @@ def parse_args(): tb_group._group_actions.append(cache_count_act) tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) + tb_group._group_actions.append(quant_policy_act) ArgumentHelper.model_format(tb_group, default='hf') - ArgumentHelper.quant_policy(tb_group, default=0) ArgumentHelper.num_tokens_per_iter(tb_group) ArgumentHelper.max_prefill_iters(tb_group) @@ -328,6 +329,7 @@ def main(): tp=args.tp, thread_safe=True, enable_prefix_caching=args.enable_prefix_caching, + quant_policy=args.quant_policy, ) engine = Engine(args.model_path, engine_config, csv=args.csv) diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 25010f63bd..8626164e66 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -43,47 +43,47 @@ The TurboMind 
engine doesn't support window attention. Therefore, for models tha ## PyTorchEngine on CUDA Platform -| Model | Size | Type | FP16/BF16 | KV INT8 | W8A8 | W4A16 | -| :------------: | :---------: | :--: | :-------: | :-----: | :--: | :---: | -| Llama | 7B - 65B | LLM | Yes | No | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | No | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | No | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | No | No | - | -| InternLM | 7B - 20B | LLM | Yes | No | Yes | - | -| InternLM2 | 7B - 20B | LLM | Yes | No | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | No | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | No | Yes | No | -| Baichuan2 | 13B | LLM | Yes | No | No | No | -| ChatGLM2 | 6B | LLM | Yes | No | No | No | -| Falcon | 7B - 180B | LLM | Yes | No | No | No | -| YI | 6B - 34B | LLM | Yes | No | No | Yes | -| Mistral | 7B | LLM | Yes | No | No | No | -| Mixtral | 8x7B | LLM | Yes | No | No | No | -| QWen | 1.8B - 72B | LLM | Yes | No | No | Yes | -| QWen1.5 | 0.5B - 110B | LLM | Yes | No | No | Yes | -| QWen1.5-MoE | A2.7B | LLM | Yes | No | No | No | -| QWen2 | 0.5B - 72B | LLM | Yes | No | No | Yes | -| QWen2-VL | 2B, 7B | MLLM | Yes | No | No | No | -| DeepSeek-MoE | 16B | LLM | Yes | No | No | No | -| DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | -| MiniCPM3 | 4B | LLM | Yes | No | No | No | -| Gemma | 2B-7B | LLM | Yes | No | No | No | -| Dbrx | 132B | LLM | Yes | No | No | No | -| StarCoder2 | 3B-15B | LLM | Yes | No | No | No | -| Phi-3-mini | 3.8B | LLM | Yes | No | No | Yes | -| Phi-3-vision | 4.2B | MLLM | Yes | No | No | - | -| CogVLM-Chat | 17B | MLLM | Yes | No | No | - | -| CogVLM2-Chat | 19B | MLLM | Yes | No | No | - | -| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | No | No | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | No | No | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | No | No | - | -| Gemma2 | 9B-27B | LLM | Yes | No | No | - | -| GLM4 | 9B | LLM | Yes | No | No | No | -| GLM-4V | 9B | MLLM | Yes | No | No | No | -| CodeGeeX4 | 9B | LLM | Yes | No | No | - | -| Phi-3.5-mini | 3.8B | LLM | Yes | No | No | - | -| Phi-3.5-MoE | 16x3.8B | LLM | Yes | No | No | - | -| Phi-3.5-vision | 4.2B | MLLM | Yes | No | No | - | +| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W8A8 | W4A16 | +| :------------: | :---------: | :--: | :-------: | :-----: | :-----: | :--: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | No | +| Baichuan2 | 13B | LLM | Yes | Yes | Yes | No | No | +| ChatGLM2 | 6B | LLM | Yes | Yes | Yes | No | No | +| Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | No | No | +| Mixtral | 8x7B | LLM | Yes | Yes | Yes | No | No | +| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | +| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | +| QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | +| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | No | Yes | +| QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | +| DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | +| DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No 
| No | +| MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | +| Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | +| Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | +| StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | +| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | No | Yes | +| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | No | - | +| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | No | - | +| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | No | - | +| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | No | - | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | +| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | +| GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | +| GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | No | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | No | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | ## PyTorchEngine on Huawei Ascend Platform diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 92fa39669e..c0cb6affb2 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -43,47 +43,47 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att ## PyTorchEngine CUDA 平台 -| Model | Size | Type | FP16/BF16 | KV INT8 | W8A8 | W4A16 | -| :------------: | :---------: | :--: | :-------: | :-----: | :--: | :---: | -| Llama | 7B - 65B | LLM | Yes | No | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | No | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | No | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | No | No | - | -| InternLM | 7B - 20B | LLM | Yes | No | Yes | - | -| InternLM2 | 7B - 20B | LLM | Yes | No | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | No | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | No | Yes | No | -| Baichuan2 | 13B | LLM | Yes | No | No | No | -| ChatGLM2 | 6B | LLM | Yes | No | No | No | -| Falcon | 7B - 180B | LLM | Yes | No | No | No | -| YI | 6B - 34B | LLM | Yes | No | No | Yes | -| Mistral | 7B | LLM | Yes | No | No | No | -| Mixtral | 8x7B | LLM | Yes | No | No | No | -| QWen | 1.8B - 72B | LLM | Yes | No | No | Yes | -| QWen1.5 | 0.5B - 110B | LLM | Yes | No | No | Yes | -| QWen1.5-MoE | A2.7B | LLM | Yes | No | No | No | -| QWen2 | 0.5B - 72B | LLM | Yes | No | No | Yes | -| QWen2-VL | 2B, 7B | MLLM | Yes | No | No | No | -| DeepSeek-MoE | 16B | LLM | Yes | No | No | No | -| DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | -| MiniCPM3 | 4B | LLM | Yes | No | No | No | -| Gemma | 2B-7B | LLM | Yes | No | No | No | -| Dbrx | 132B | LLM | Yes | No | No | No | -| StarCoder2 | 3B-15B | LLM | Yes | No | No | No | -| Phi-3-mini | 3.8B | LLM | Yes | No | No | Yes | -| Phi-3-vision | 4.2B | MLLM | Yes | No | No | - | -| CogVLM-Chat | 17B | MLLM | Yes | No | No | - | -| CogVLM2-Chat | 19B | MLLM | Yes | No | No | - | -| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | No | No | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | No | No | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | No | No | - | -| Gemma2 | 9B-27B | LLM | Yes | No | No | - | -| GLM4 | 9B | LLM | Yes | No | No | No | -| GLM-4V | 9B | MLLM | Yes | No | No | No | -| CodeGeeX4 | 9B | LLM | Yes | No | No | - | -| Phi-3.5-mini | 3.8B | LLM | Yes | No | No | - | -| Phi-3.5-MoE | 16x3.8B | LLM | Yes | No | No | - | -| Phi-3.5-vision | 4.2B | MLLM | Yes | No | No | - | +| Model | Size | Type | FP16/BF16 | 
KV INT8 | KV INT4 | W8A8 | W4A16 | +| :------------: | :---------: | :--: | :-------: | :-----: | :-----: | :--: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | No | +| Baichuan2 | 13B | LLM | Yes | Yes | Yes | No | No | +| ChatGLM2 | 6B | LLM | Yes | Yes | Yes | No | No | +| Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | No | No | +| Mixtral | 8x7B | LLM | Yes | Yes | Yes | No | No | +| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | +| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | +| QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | +| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | No | Yes | +| QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | +| DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | +| DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | +| MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | +| Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | +| Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | +| StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | +| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | No | Yes | +| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | No | - | +| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | No | - | +| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | No | - | +| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | No | - | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | +| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | +| GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | +| GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | No | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | No | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | ## PyTorchEngine 华为昇腾平台 diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py index 7eedd458f3..79ac2833cf 100644 --- a/lmdeploy/cli/cli.py +++ b/lmdeploy/cli/cli.py @@ -128,6 +128,7 @@ def add_parser_chat(): session_len_act = ArgumentHelper.session_len(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) + quant_policy = ArgumentHelper.quant_policy(pt_group) # turbomind args tb_group = parser.add_argument_group('TurboMind engine arguments') @@ -137,8 +138,8 @@ def add_parser_chat(): tb_group._group_actions.append(session_len_act) tb_group._group_actions.append(cache_max_entry_act) tb_group._group_actions.append(prefix_caching_act) + tb_group._group_actions.append(quant_policy) ArgumentHelper.model_format(tb_group) - ArgumentHelper.quant_policy(tb_group) ArgumentHelper.rope_scaling_factor(tb_group) @staticmethod @@ -263,7 +264,8 @@ def chat(args): cache_max_entry_count=args.cache_max_entry_count, adapters=adapters, enable_prefix_caching=args.enable_prefix_caching, - device_type=args.device) + device_type=args.device, + quant_policy=args.quant_policy) run_chat(args.model_path, engine_config, 
chat_template_config=chat_template_config) diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 00d16b6246..8007a96678 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -169,6 +169,7 @@ def add_parser_api_server(): prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num( pt_group) + quant_policy = ArgumentHelper.quant_policy(pt_group) # turbomind args tb_group = parser.add_argument_group('TurboMind engine arguments') # common engine args @@ -180,8 +181,8 @@ def add_parser_api_server(): tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) tb_group._group_actions.append(max_prefill_token_num_act) + tb_group._group_actions.append(quant_policy) ArgumentHelper.model_format(tb_group) - ArgumentHelper.quant_policy(tb_group) ArgumentHelper.rope_scaling_factor(tb_group) ArgumentHelper.num_tokens_per_iter(tb_group) ArgumentHelper.max_prefill_iters(tb_group) @@ -259,6 +260,7 @@ def gradio(args): session_len=args.session_len, enable_prefix_caching=args.enable_prefix_caching, device_type=args.device, + quant_policy=args.quant_policy, max_prefill_token_num=args.max_prefill_token_num) else: backend_config = TurbomindEngineConfig( @@ -308,6 +310,7 @@ def api_server(args): adapters=adapters, enable_prefix_caching=args.enable_prefix_caching, device_type=args.device, + quant_policy=args.quant_policy, max_prefill_token_num=args.max_prefill_token_num) else: from lmdeploy.messages import TurbomindEngineConfig diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 5b84fd0f2b..38c2153669 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -256,6 +256,8 @@ class PytorchEngineConfig: revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. + quant_policy (int): default to 0. When k/v is quantized into 4 or 8 + bit, set it to 4 or 8, respectively """ dtype: str = 'auto' tp: int = 1 @@ -275,6 +277,7 @@ class PytorchEngineConfig: custom_module_map: Dict[str, str] = None download_dir: str = None revision: str = None + quant_policy: Literal[0, 4, 8] = 0 def __post_init__(self): """Check input validation.""" @@ -286,9 +289,12 @@ def __post_init__(self): assert self.max_prefill_token_num >= 0, \ 'invalid max_prefill_token_num' assert self.num_gpu_blocks >= 0, 'invalid num_gpu_blocks' + assert self.quant_policy in (0, 4, 8), 'invalid quant_policy' assert self.device_type in [ 'cuda', 'ascend' ], (f'invalid device_type: {self.device_type}') + if self.quant_policy > 0 and self.device_type != 'cuda': + assert False, 'kv cache quantization only works for CUDA.' class ResponseType(enum.Enum): diff --git a/lmdeploy/pytorch/backends/ascend/attention.py b/lmdeploy/pytorch/backends/ascend/attention.py index b2e96329f3..1accbd3ecd 100644 --- a/lmdeploy/pytorch/backends/ascend/attention.py +++ b/lmdeploy/pytorch/backends/ascend/attention.py @@ -57,6 +57,8 @@ def forward( k_cache: Tensor, v_cache: Tensor, attn_metadata: AscendAttentionMetadata, + k_scales_zeros: Tensor = None, + v_scales_zeros: Tensor = None, inplace: bool = True, ) -> Tensor: """forward.""" diff --git a/lmdeploy/pytorch/backends/attention.py b/lmdeploy/pytorch/backends/attention.py index ef0ac0e9ab..fccb8f3c74 100644 --- a/lmdeploy/pytorch/backends/attention.py +++ b/lmdeploy/pytorch/backends/attention.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
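From a user's perspective, the new `quant_policy` field on `PytorchEngineConfig` (and the matching `--quant-policy` flag wired into the `chat`/`serve` parsers above) is the only switch needed to turn on kv int8/int4 for the PyTorch engine. A minimal usage sketch, with a placeholder model path:

```python
# Minimal sketch: enabling kv cache quantization on the PyTorch engine.
# quant_policy=8 -> kv int8, quant_policy=4 -> kv int4, 0 (default) -> fp16/bf16 cache.
# Per the validation added in messages.py above, this currently requires a CUDA device.
from lmdeploy import pipeline, PytorchEngineConfig

backend_config = PytorchEngineConfig(quant_policy=8)
pipe = pipeline('internlm/internlm2_5-7b-chat',  # placeholder model path
                backend_config=backend_config)
print(pipe(['Hi, please introduce yourself']))
```

The same option can be passed on the command line, e.g. `lmdeploy serve api_server <model_path> --backend pytorch --quant-policy 8`.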
from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Generic, TypeVar +from typing import Generic, Literal, TypeVar import torch @@ -14,6 +14,7 @@ class AttentionMetadata: q_start_loc: torch.Tensor = None q_seqlens: torch.Tensor = None kv_seqlens: torch.Tensor = None + quant_policy: Literal[0, 4, 8] = 0 T = TypeVar('T', bound=AttentionMetadata) diff --git a/lmdeploy/pytorch/backends/cuda/attention.py b/lmdeploy/pytorch/backends/cuda/attention.py index d54b04850f..b43dd3d20f 100644 --- a/lmdeploy/pytorch/backends/cuda/attention.py +++ b/lmdeploy/pytorch/backends/cuda/attention.py @@ -69,6 +69,8 @@ def forward( k_cache: torch.Tensor, v_cache: torch.Tensor, attn_metadata: TritonAttentionMetadata, + k_scales_zeros: torch.Tensor = None, + v_scales_zeros: torch.Tensor = None, inplace: bool = True, ) -> torch.Tensor: """forward.""" @@ -77,6 +79,7 @@ def forward( q_start_loc = attn_metadata.q_start_loc q_seqlens = attn_metadata.q_seqlens kv_seqlens = attn_metadata.kv_seqlens + quant_policy = attn_metadata.quant_policy max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2)) # fill kv cache @@ -90,6 +93,9 @@ def forward( kv_seq_length=kv_seqlens, max_q_seq_length=max_q_seqlen, block_offsets=block_offsets, + k_scales_zeros=k_scales_zeros, + v_scales_zeros=v_scales_zeros, + quant_policy=quant_policy, ) if inplace: @@ -110,6 +116,9 @@ def forward( q_seqlens=q_seqlens, kv_seqlens=kv_seqlens, max_seqlen=max_q_seqlen, + k_scales_zeros=k_scales_zeros, + v_scales_zeros=v_scales_zeros, + quant_policy=quant_policy, window_size=self.sliding_window, sm_scale=self.scale, logit_softcapping=self.logit_softcapping, @@ -127,6 +136,9 @@ def forward( max_input_len=max_q_seqlen, head_offset=self.alibi_head_offset, num_heads=self.alibi_num_heads, + k_scales_zeros=k_scales_zeros, + v_scales_zeros=v_scales_zeros, + quant_policy=quant_policy, ) return attn_output diff --git a/lmdeploy/pytorch/backends/cuda/op_backend.py b/lmdeploy/pytorch/backends/cuda/op_backend.py index 02e4eb66b0..af93aac5c9 100644 --- a/lmdeploy/pytorch/backends/cuda/op_backend.py +++ b/lmdeploy/pytorch/backends/cuda/op_backend.py @@ -110,6 +110,7 @@ def update_step_context(cls, step_context): q_start_loc=q_start_loc, q_seqlens=q_seqlens, kv_seqlens=step_context.kv_seqlens, + quant_policy=step_context.kv_quant_policy, ) step_context.attn_metadata = attn_metadata diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py index 2625de5dc1..c350f4b4cf 100644 --- a/lmdeploy/pytorch/config.py +++ b/lmdeploy/pytorch/config.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from dataclasses import dataclass -from typing import Any, Dict, List +from typing import Any, Dict, List, Literal import torch @@ -76,6 +76,7 @@ class CacheConfig: cache_max_entry_count: float = 0.8 max_prefill_token_num: int = 4096 enable_prefix_caching: bool = False + quant_policy: Literal[0, 4, 8] = 0 def __post_init__(self): """post init.""" diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py index a0689dab1a..8eaa563947 100644 --- a/lmdeploy/pytorch/engine/cache_engine.py +++ b/lmdeploy/pytorch/engine/cache_engine.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
# modify from: https://github.com/vllm-project/vllm -from typing import Dict, List, Tuple +from typing import Dict, List, Literal, Tuple import torch @@ -43,6 +43,8 @@ def __init__( self.block_size = cache_config.block_size self.num_layers = model_config.num_layers self.kv_cache_dtype = model_config.dtype + if cache_config.quant_policy > 0: + self.kv_cache_dtype = torch.uint8 # Initialize the cache. self.local_gpu_cache = self.allocate_gpu_cache() @@ -84,6 +86,7 @@ def _get_key_block_shape_impl(cls, block_size: int, head_size: int, world_size: int = 1, + quant_policy: Literal[0, 4, 8] = 0, local: bool = True): """get single block shape.""" attn_backend = get_backend() @@ -93,6 +96,10 @@ def _get_key_block_shape_impl(cls, assert num_heads % world_size == 0, \ f'num_heads: {num_heads}, world_size: {world_size}' num_heads = num_heads // world_size + if quant_policy == 4: # pack head_dim to uint8 + assert head_size % 2 == 0, \ + f'head_size: {head_size}, quant_policy: {quant_policy}' + head_size = head_size // 2 return attn_backend.get_k_block_shape(block_size, num_heads, head_size, dtype) @@ -102,6 +109,7 @@ def _get_value_block_shape_impl(cls, block_size: int, head_size: int, world_size: int = 1, + quant_policy: Literal[0, 4, 8] = 0, local: bool = True): """get single block shape.""" attn_backend = get_backend() @@ -111,6 +119,11 @@ def _get_value_block_shape_impl(cls, assert num_heads % world_size == 0, \ f'num_heads: {num_heads}, world_size: {world_size}' num_heads = num_heads // world_size + if quant_policy == 4: # pack head_dim to uint8 + assert head_size % 2 == 0, \ + f'head_size: {head_size}, quant_policy: {quant_policy}' + head_size = head_size // 2 + return attn_backend.get_v_block_shape(block_size, num_heads, head_size, dtype) @@ -124,6 +137,7 @@ def get_key_block_shape(self, local: bool = False) -> Tuple[int, int, int]: block_size=self.block_size, head_size=head_size, world_size=self.world_size, + quant_policy=self.cache_config.quant_policy, local=local, ) @@ -138,6 +152,7 @@ def get_value_block_shape(self, block_size=self.block_size, head_size=head_size, world_size=self.world_size, + quant_policy=self.cache_config.quant_policy, local=local, ) @@ -158,7 +173,21 @@ def allocate_gpu_cache(self): dtype=self.kv_cache_dtype, device='cuda', ) - gpu_cache.append((key_blocks, value_blocks)) + if self.cache_config.quant_policy in (4, 8): + key_scales_zeros = torch.empty( + size=(self.num_gpu_blocks, *key_block_shape[:-1], 2), + dtype=self.model_config.dtype, + device='cuda', + ) + value_scales_zeros = torch.empty( + size=(self.num_gpu_blocks, *value_block_shape[:-1], 2), + dtype=self.model_config.dtype, + device='cuda', + ) + gpu_cache.append((key_blocks, value_blocks, key_scales_zeros, + value_scales_zeros)) + else: + gpu_cache.append((key_blocks, value_blocks)) return gpu_cache @@ -182,7 +211,21 @@ def allocate_cpu_cache(self): dtype=self.kv_cache_dtype, pin_memory=pin_memory, ) - cpu_cache.append((key_blocks, value_blocks)) + if self.cache_config.quant_policy in (4, 8): + key_scales_zeros = torch.empty( + size=(self.num_cpu_blocks, *key_block_shape[:-1], 2), + dtype=self.model_config.dtype, + pin_memory=pin_memory, + ) + value_scales_zeros = torch.empty( + size=(self.num_cpu_blocks, *value_block_shape[:-1], 2), + dtype=self.model_config.dtype, + pin_memory=pin_memory, + ) + cpu_cache.append((key_blocks, value_blocks, key_scales_zeros, + value_scales_zeros)) + else: + cpu_cache.append((key_blocks, value_blocks)) return cpu_cache @torch.inference_mode() @@ -201,8 +244,9 @@ def _swap(self, 
src: List[KVCache], dst: List[KVCache], dst_key_cache, dst_value_cache = dst[i] for src_id, dst_id in src_to_dst.items(): - dst_key_cache[dst_id].copy_(src_key_cache[src_id]) - dst_value_cache[dst_id].copy_(src_value_cache[src_id]) + if isinstance(dst_key_cache[dst_id], torch.Tensor): + dst_key_cache[dst_id].copy_(src_key_cache[src_id]) + dst_value_cache[dst_id].copy_(src_value_cache[src_id]) event = self.events[i] event.record(stream=self.cache_stream) @@ -227,7 +271,8 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: def get_cache_block_size(cls, block_size: int, model_config: ModelConfig, - world_size: int = 1) -> int: + world_size: int = 1, + quant_policy: int = 0) -> int: """Get the required cache size of the model. Args: @@ -250,18 +295,43 @@ def get_cache_block_size(cls, head_size=key_head_size, world_size=world_size, local=True, + quant_policy=quant_policy, ) value_shape = cls._get_value_block_shape_impl( model_config, block_size=block_size, head_size=value_head_size, world_size=world_size, + quant_policy=quant_policy, local=True, ) - dtype = model_config.dtype - key_block = torch.empty(key_shape, dtype=dtype, device='meta') - value_block = torch.empty(value_shape, dtype=dtype, device='meta') - mem_key_block = key_block.numel() * key_block.element_size() - mem_value_block = value_block.numel() * value_block.element_size() + if quant_policy == 0: + dtype = model_config.dtype + key_block = torch.empty(key_shape, dtype=dtype, device='meta') + value_block = torch.empty(value_shape, dtype=dtype, device='meta') + mem_key_block = key_block.numel() * key_block.element_size() + mem_value_block = value_block.numel() * value_block.element_size() + elif quant_policy in (4, 8): + key_block = torch.empty(key_shape, + dtype=torch.uint8, + device='meta') + value_block = torch.empty(value_shape, + dtype=torch.uint8, + device='meta') + key_scale_zero_block = torch.empty((*key_shape[:-1], 2), + dtype=model_config.dtype, + device='meta') + value_scale_zero_block = torch.empty((*value_shape[:-1], 2), + dtype=model_config.dtype, + device='meta') + mem_key_block = key_block.numel() * key_block.element_size( + ) + key_scale_zero_block.numel( + ) * key_scale_zero_block.element_size() + mem_value_block = value_block.numel() * value_block.element_size( + ) + value_scale_zero_block.numel( + ) * value_scale_zero_block.element_size() + else: + raise ValueError(f'unsupported quant_policy {quant_policy}') + total = num_layers * (mem_key_block + mem_value_block) return total diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 5a898cdf4e..58d319ef8c 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -128,6 +128,7 @@ def __init__(self, cache_max_entry_count=engine_config.cache_max_entry_count, max_prefill_token_num=engine_config.max_prefill_token_num, enable_prefix_caching=engine_config.enable_prefix_caching, + quant_policy=engine_config.quant_policy, ) if not os.path.exists(model_path): diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index bba25213b6..4c902dbe2e 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -91,7 +91,8 @@ def __adjust_block_size(): __adjust_block_size() cache_block_size = CacheEngine.get_cache_block_size( - cache_config.block_size, model_config, world_size) + cache_config.block_size, model_config, world_size, + cache_config.quant_policy) gpu_mem = __get_free_gpu_mem_size(cache_block_size) cpu_mem = host_mem_size if 
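The quantized branch of `get_cache_block_size` above trades fp16/bf16 kv elements for uint8 (or packed uint4) storage plus a per-token, per-head `(scale, zero)` pair kept in the model dtype. A back-of-the-envelope check of what that means per cache block, using example shapes rather than any specific model:

```python
# Example-only arithmetic mirroring CacheEngine.get_cache_block_size.
# 32 layers, 8 kv heads, head_dim 128, block_size 64, 2-byte model dtype.
num_layers, num_kv_heads, head_dim, block_size = 32, 8, 128, 64
elem_fp16, elem_u8 = 2, 1

# quant_policy = 0: k and v blocks kept in the model dtype
fp16_block = 2 * num_layers * block_size * num_kv_heads * head_dim * elem_fp16

# quant_policy = 8: uint8 k/v plus a (scale, zero) pair per token per head
int8_block = 2 * num_layers * block_size * num_kv_heads * (
    head_dim * elem_u8 + 2 * elem_fp16)

# quant_policy = 4: head_dim packed two values per uint8, same (scale, zero) pairs
int4_block = 2 * num_layers * block_size * num_kv_heads * (
    head_dim // 2 * elem_u8 + 2 * elem_fp16)

print(fp16_block, int8_block, int4_block)  # 8388608 4325376 2228224 bytes
```

With these example shapes a kv block shrinks to roughly 52% (int8) or 27% (int4) of its fp16 size, which is what lets `num_gpu_blocks` grow for the same `cache_max_entry_count`.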
cache_config.num_cpu_blocks == 0: @@ -142,6 +143,7 @@ def model_forward( inputs=inputs, world_size=world_size, kv_caches=cache_engine.gpu_cache, + kv_quant_policy=cache_engine.cache_config.quant_policy, ) with ctx_mgr.context(context): input_dict = model.prepare_inputs_for_generation( diff --git a/lmdeploy/pytorch/kernels/cuda/alibi_pagedattention.py b/lmdeploy/pytorch/kernels/cuda/alibi_pagedattention.py index 936c3dab01..1e54b5c134 100644 --- a/lmdeploy/pytorch/kernels/cuda/alibi_pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/alibi_pagedattention.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. # modify from: https://github.com/ModelTC/lightllm import math +from typing import Literal import torch import triton @@ -387,18 +388,406 @@ def _fwd_kernel( tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) -def alibi_paged_attention_fwd(q: Tensor, - k: Tensor, - v: Tensor, - o: Tensor, - block_offsets: Tensor, - b_start_loc: Tensor, - b_seq_len: Tensor, - b_kv_seq_len: Tensor, - max_input_len: int, - head_offset: int = 0, - num_heads: int = -1, - alibi_scale: float = 1.0): +@wrap_jit_func +@triton.jit +def _fwd_split_kernel_quant( + Q, + K, + V, + KScalesZeros, + VScalesZeros, + sm_scale, + alibi_scale, + B_kvlen, + Block_offsets, + Acc_out, + stride_qbs, + stride_qh, + stride_qd, + stride_kbs, + stride_kh, + stride_kd, + stride_vbs, + stride_vh, + stride_vd, + stride_kszbs: tl.constexpr, + stride_kszh: tl.constexpr, + stride_kszd: tl.constexpr, + stride_vszbs: tl.constexpr, + stride_vszh: tl.constexpr, + stride_vszd: tl.constexpr, + quant_policy: tl.constexpr, + stride_ok, + stride_obs, + stride_oh, + stride_od, + stride_boffb, + head_offset, + num_heads, + kv_group_num, + block_per_cta, + num_sub_blocks: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + """first step kernel of split k attention with dequant fused. 
+ + Args: + stride_xbs: stride of block size dim + stride_h: stride of head num dim + stride_d: stride of head size dim + """ + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + split_k_id = tl.program_id(2) + + cur_kv_head = cur_head // kv_group_num + + cur_batch_seq_len = 1 + cur_batch_kv_len = tl.load(B_kvlen + cur_batch) + history_len = cur_batch_kv_len - cur_batch_seq_len + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dsz = tl.arange(0, 1) + off_q = (cur_batch * stride_qbs + cur_head * stride_qh + + offs_d * stride_qd) + if quant_policy == 4: + shift_d = offs_d // (BLOCK_DMODEL // 2) * 4 + off_k = (cur_kv_head * stride_kh + + (offs_d % (BLOCK_DMODEL // 2))[None, :] * stride_kd) + off_v = (cur_kv_head * stride_vh + + (offs_d % (BLOCK_DMODEL // 2))[None, :] * stride_vd) + else: + off_k = (cur_kv_head * stride_kh + offs_d[None, :] * stride_kd) + off_v = (cur_kv_head * stride_vh + offs_d[None, :] * stride_vd) + off_ksz = (cur_kv_head * stride_kszh + offs_dsz[None, :] * stride_kszd) + off_vsz = (cur_kv_head * stride_vszh + offs_dsz[None, :] * stride_vszd) + + q = tl.load(Q + off_q).to(tl.float32) + + k_ptrs = K + off_k + v_ptrs = V + off_v + ksz_ptrs = KScalesZeros + off_ksz + vsz_ptrs = VScalesZeros + off_vsz + + block_offset_ptrs = Block_offsets + cur_batch * stride_boffb + head_slope = get_slope( + cur_head.to(tl.float32) + head_offset, num_heads.to(tl.float32)) + + # initialize pointer to m and l + m_i = -float('inf') + l_i = float(0) + acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32) + + kv_len_per_prog = block_per_cta * BLOCK_N + loop_start = kv_len_per_prog * split_k_id + loop_end = tl.minimum(loop_start + kv_len_per_prog, cur_batch_kv_len) + + # load block offset + start_block_id = loop_start // BLOCK_N + b_offset = _load_block_offsets(block_offset_ptrs, start_block_id, + num_sub_blocks, BLOCK_N) + + for start_n in range(loop_start, loop_end, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + + mask = (start_n + offs_n[:, None]) < cur_batch_kv_len + + # -- compute qk ---- + k = tl.load( + k_ptrs + b_offset[:, None] * stride_kbs, + mask=mask, + other=0.0, + ) + if quant_policy == 4: + k = (k >> shift_d) & 0x0F + ks = tl.load( + ksz_ptrs + b_offset[:, None] * stride_kszbs, + mask=mask, + other=0.0, + ) + kz = tl.load( + ksz_ptrs + b_offset[:, None] * stride_kszbs + 1, + mask=mask, + other=0.0, + ) + + v = tl.load( + v_ptrs + b_offset[:, None] * stride_vbs, + mask=mask, + other=0.0, + ) + if quant_policy == 4: + v = (v >> shift_d) & 0x0F + vs = tl.load( + vsz_ptrs + b_offset[:, None] * stride_vszbs, + mask=mask, + other=0.0, + ) + vz = tl.load( + vsz_ptrs + b_offset[:, None] * stride_vszbs + 1, + mask=mask, + other=0.0, + ) + + k = (k - kz) * ks + v = (v - vz) * vs + # prefetch b_offset + if start_n + BLOCK_N < loop_end: + start_block_id += 1 + b_offset = _load_block_offsets(block_offset_ptrs, start_block_id, + num_sub_blocks, BLOCK_N) + + qk = tl.sum(q[None, :] * k, 1) + qk *= sm_scale + + mask = start_n + offs_n + bias = mask.to(tl.float32) * (head_slope * alibi_scale) + qk += bias + + # NOTE: inf - inf = nan, and nan will leads to error + qk = tl.where( + history_len >= (start_n + offs_n), + qk, + -float('inf'), + ) + + # -- compute p, m_i and l_i + m_i_new = tl.maximum(m_i, tl.max(qk, 0)) + p = tl.exp(qk - m_i_new) + alpha = tl.exp(m_i - m_i_new) + l_i_new = alpha * l_i + tl.sum(p, 0) + + # -- update output accumulator -- + # scale acc + acc = acc * alpha + + # update acc + p_new = p.to(v.dtype) + acc += 
tl.sum(p_new[:, None] * v, 0) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + # initialize pointers to output + off_acc = (cur_batch * stride_obs + split_k_id * stride_ok + + cur_head * stride_oh + offs_d * stride_od) + tl.store(Acc_out + off_acc, acc) + + off_meta = (cur_batch * stride_obs + split_k_id * stride_ok + + cur_head * stride_oh + BLOCK_DMODEL) + tl.store(Acc_out + off_meta + tl.arange(0, 1), m_i) + tl.store(Acc_out + off_meta + 1 + tl.arange(0, 1), l_i) + + +@wrap_jit_func +@triton.jit +def _fwd_kernel_quant( + Q, + K, + V, + KScalesZeros, + VScalesZeros, + sm_scale, + alibi_scale, + B_Start_Loc, + B_Seqlen, + B_kvlen, + Block_offsets, + Out, + stride_qbs, + stride_qh, + stride_qd, + stride_kbs, + stride_kh, + stride_kd, + stride_vbs, + stride_vh, + stride_vd, + stride_kszbs: tl.constexpr, + stride_kszh: tl.constexpr, + stride_kszd: tl.constexpr, + stride_vszbs: tl.constexpr, + stride_vszh: tl.constexpr, + stride_vszd: tl.constexpr, + quant_policy: tl.constexpr, + stride_obs, + stride_oh, + stride_od, + stride_boffb, + head_offset, + num_heads, + kv_group_num, + num_sub_blocks: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + """forward kernel with dequant fused. + + Args: + stride_xbs: stride of block size dim + stride_h: stride of head num dim + stride_d: stride of head size dim + """ + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // kv_group_num + + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_kv_len = tl.load(B_kvlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + history_len = cur_batch_kv_len - cur_batch_seq_len + + block_start_loc = BLOCK_M * start_m + head_slope = get_slope( + cur_head.to(tl.float32) + head_offset, num_heads.to(tl.float32)) + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dsz = tl.arange(0, 1) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d[None, :] * stride_qd) + if quant_policy == 4: + shift_kd = (offs_d // (BLOCK_DMODEL // 2) * 4)[:, None] + shift_vd = (offs_d // (BLOCK_DMODEL // 2) * 4)[None, :] + off_k = (cur_kv_head * stride_kh + + (offs_d % (BLOCK_DMODEL // 2))[:, None] * stride_kd) + off_v = (cur_kv_head * stride_vh + + (offs_d % (BLOCK_DMODEL // 2))[None, :] * stride_vd) + else: + off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd) + off_v = (cur_kv_head * stride_vh + offs_d[None, :] * stride_vd) + off_ksz = (cur_kv_head * stride_kszh + offs_dsz[:, None] * stride_kszd) + off_vsz = (cur_kv_head * stride_vszh + offs_dsz[None, :] * stride_vszd) + + q = tl.load(Q + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) + + k_ptrs = K + off_k + v_ptrs = V + off_v + ksz_ptrs = KScalesZeros + off_ksz + vsz_ptrs = VScalesZeros + off_vsz + + block_offset_ptrs = Block_offsets + cur_batch * stride_boffb + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + + block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) + + b_offset = _load_block_offsets(block_offset_ptrs, 0, num_sub_blocks, + BLOCK_N) + for start_n in range(0, block_mask * cur_batch_kv_len, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + + # -- compute qk ---- + k = 
tl.load( + k_ptrs + b_offset[None, :] * stride_kbs, + mask=(start_n + offs_n[None, :]) < cur_batch_kv_len, + other=0.0, + ) + if quant_policy == 4: + k = (k >> shift_kd) & 0x0F + ks = tl.load( + ksz_ptrs + b_offset[None, :] * stride_kszbs, + mask=(start_n + offs_n[None, :]) < cur_batch_kv_len, + other=0.0, + ) + kz = tl.load( + ksz_ptrs + b_offset[None, :] * stride_kszbs + 1, + mask=(start_n + offs_n[None, :]) < cur_batch_kv_len, + other=0.0, + ) + + v = tl.load( + v_ptrs + b_offset[:, None] * stride_vbs, + mask=(start_n + offs_n[:, None]) < cur_batch_kv_len, + other=0.0, + ) + if quant_policy == 4: + v = (v >> shift_vd) & 0x0F + vs = tl.load( + vsz_ptrs + b_offset[:, None] * stride_vszbs, + mask=(start_n + offs_n[:, None]) < cur_batch_kv_len, + other=0.0, + ) + vz = tl.load( + vsz_ptrs + b_offset[:, None] * stride_vszbs + 1, + mask=(start_n + offs_n[:, None]) < cur_batch_kv_len, + other=0.0, + ) + + v = ((v - vz) * vs).to(q.dtype) + k = ((k - kz) * ks).to(q.dtype) + if start_n + BLOCK_N < cur_batch_kv_len: + start_block_id = start_n // BLOCK_N + 1 + b_offset = _load_block_offsets(block_offset_ptrs, start_block_id, + num_sub_blocks, BLOCK_N) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk *= sm_scale + + mask = start_n + offs_n[None, :] + bias = mask.to(tl.float32) * (head_slope * alibi_scale) + qk += bias + + # NOTE: inf - inf = nan, and nan will leads to error + qk = tl.where( + (history_len + offs_m[:, None]) >= mask, + qk, + float(-1e30), + ) + + # -- compute p, m_i and l_i + m_i_new = tl.maximum(m_i, tl.max(qk, 1)) + p = tl.exp(qk - m_i_new[:, None]) + alpha = tl.exp(m_i - m_i_new) + l_i_new = alpha * l_i + tl.sum(p, 1) + # -- update output accumulator -- + # scale acc + acc = acc * alpha[:, None] + + # update acc + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + acc = acc / l_i[:, None] + # initialize pointers to output + off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + offs_d[None, :] * stride_od) + out_ptrs = Out + off_o + tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) + + +def alibi_paged_attention_fwd( + q: Tensor, + k: Tensor, + v: Tensor, + o: Tensor, + block_offsets: Tensor, + b_start_loc: Tensor, + b_seq_len: Tensor, + b_kv_seq_len: Tensor, + max_input_len: int, + head_offset: int = 0, + num_heads: int = -1, + alibi_scale: float = 1.0, + k_scales_zeros: Tensor = None, + v_scales_zeros: Tensor = None, + quant_policy: Literal[0, 4, 8] = 0, +): """Paged attention forward with alibi bias. 
Args: @@ -420,8 +809,12 @@ def alibi_paged_attention_fwd(q: Tensor, # shape constraints Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk and Lk == Lv - assert Lk in {16, 32, 64, 128} + if quant_policy == 4: + assert Lq == Lk * 2 and Lk == Lv + assert Lk in {8, 16, 32, 64} + else: + assert Lq == Lk and Lk == Lv + assert Lk in {16, 32, 64, 128} sm_scale = 1.0 / (Lq**0.5) # 计算scale系数 batch, head = b_seq_len.shape[0], q.shape[-2] @@ -434,80 +827,169 @@ def alibi_paged_attention_fwd(q: Tensor, grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, - num_warps = 4 if Lk <= 64 else 8 + num_warps = 4 if Lq <= 64 else 8 kernel_meta = get_kernel_meta(q) is_decoding = q.shape[-3] == b_seq_len.size(0) if not is_decoding: - _fwd_kernel[grid](q, - k, - v, - sm_scale, - alibi_scale, - b_start_loc, - b_seq_len, - b_kv_seq_len, - block_offsets, - o, - q.stride(-3), - q.stride(-2), - q.stride(-1), - k.stride(-3), - k.stride(-2), - k.stride(-1), - v.stride(-3), - v.stride(-2), - v.stride(-1), - o.stride(-3), - o.stride(-2), - o.stride(-1), - block_offsets.stride(0), - head_offset=head_offset, - num_heads=num_heads, - kv_group_num=kv_group_num, - num_sub_blocks=num_sub_blocks, - BLOCK_M=BLOCK, - BLOCK_DMODEL=Lk, - BLOCK_N=BLOCK, - num_warps=num_warps, - num_stages=1, - **kernel_meta) + if quant_policy > 0: + _fwd_kernel_quant[grid](q, + k, + v, + k_scales_zeros, + v_scales_zeros, + sm_scale, + alibi_scale, + b_start_loc, + b_seq_len, + b_kv_seq_len, + block_offsets, + o, + q.stride(-3), + q.stride(-2), + q.stride(-1), + k.stride(-3), + k.stride(-2), + k.stride(-1), + v.stride(-3), + v.stride(-2), + v.stride(-1), + k_scales_zeros.stride(-3), + k_scales_zeros.stride(-2), + k_scales_zeros.stride(-1), + v_scales_zeros.stride(-3), + v_scales_zeros.stride(-2), + v_scales_zeros.stride(-1), + quant_policy, + o.stride(-3), + o.stride(-2), + o.stride(-1), + block_offsets.stride(0), + head_offset=head_offset, + num_heads=num_heads, + kv_group_num=kv_group_num, + num_sub_blocks=num_sub_blocks, + BLOCK_M=BLOCK, + BLOCK_DMODEL=Lq, + BLOCK_N=BLOCK, + num_warps=num_warps, + num_stages=1, + **kernel_meta) + else: + _fwd_kernel[grid](q, + k, + v, + sm_scale, + alibi_scale, + b_start_loc, + b_seq_len, + b_kv_seq_len, + block_offsets, + o, + q.stride(-3), + q.stride(-2), + q.stride(-1), + k.stride(-3), + k.stride(-2), + k.stride(-1), + v.stride(-3), + v.stride(-2), + v.stride(-1), + o.stride(-3), + o.stride(-2), + o.stride(-1), + block_offsets.stride(0), + head_offset=head_offset, + num_heads=num_heads, + kv_group_num=kv_group_num, + num_sub_blocks=num_sub_blocks, + BLOCK_M=BLOCK, + BLOCK_DMODEL=Lq, + BLOCK_N=BLOCK, + num_warps=num_warps, + num_stages=1, + **kernel_meta) else: SPLIT_K = 4 grid = (batch, head, SPLIT_K) block_per_cta = triton.cdiv(block_offsets.size(-1), SPLIT_K) acc = q.new_empty(batch, head, SPLIT_K, Lq + 2, dtype=torch.float32) - _fwd_split_kernel[grid](q, - k, - v, - sm_scale, - alibi_scale, - b_kv_seq_len, - block_offsets, - acc, - stride_qbs=q.stride(-3), - stride_qh=q.stride(-2), - stride_qd=q.stride(-1), - stride_kbs=k.stride(-3), - stride_kh=k.stride(-2), - stride_kd=k.stride(-1), - stride_vbs=v.stride(-3), - stride_vh=v.stride(-2), - stride_vd=v.stride(-1), - stride_ok=acc.stride(-2), - stride_obs=acc.stride(-4), - stride_oh=acc.stride(-3), - stride_od=acc.stride(-1), - stride_boffb=block_offsets.stride(0), - head_offset=head_offset, - num_heads=num_heads, - kv_group_num=kv_group_num, - block_per_cta=block_per_cta, - num_sub_blocks=num_sub_blocks, - BLOCK_DMODEL=Lk, - 
BLOCK_N=BLOCK, - num_warps=4, - num_stages=1, - **kernel_meta) + if quant_policy > 0: + _fwd_split_kernel_quant[grid]( + q, + k, + v, + k_scales_zeros, + v_scales_zeros, + sm_scale, + alibi_scale, + b_kv_seq_len, + block_offsets, + acc, + stride_qbs=q.stride(-3), + stride_qh=q.stride(-2), + stride_qd=q.stride(-1), + stride_kbs=k.stride(-3), + stride_kh=k.stride(-2), + stride_kd=k.stride(-1), + stride_vbs=v.stride(-3), + stride_vh=v.stride(-2), + stride_vd=v.stride(-1), + stride_kszbs=k_scales_zeros.stride(-3), + stride_kszh=k_scales_zeros.stride(-2), + stride_kszd=k_scales_zeros.stride(-1), + stride_vszbs=v_scales_zeros.stride(-3), + stride_vszh=v_scales_zeros.stride(-2), + stride_vszd=v_scales_zeros.stride(-1), + quant_policy=quant_policy, + stride_ok=acc.stride(-2), + stride_obs=acc.stride(-4), + stride_oh=acc.stride(-3), + stride_od=acc.stride(-1), + stride_boffb=block_offsets.stride(0), + head_offset=head_offset, + num_heads=num_heads, + kv_group_num=kv_group_num, + block_per_cta=block_per_cta, + num_sub_blocks=num_sub_blocks, + BLOCK_DMODEL=Lq, + BLOCK_N=BLOCK, + num_warps=4, + num_stages=1, + **kernel_meta) + + else: + _fwd_split_kernel[grid](q, + k, + v, + sm_scale, + alibi_scale, + b_kv_seq_len, + block_offsets, + acc, + stride_qbs=q.stride(-3), + stride_qh=q.stride(-2), + stride_qd=q.stride(-1), + stride_kbs=k.stride(-3), + stride_kh=k.stride(-2), + stride_kd=k.stride(-1), + stride_vbs=v.stride(-3), + stride_vh=v.stride(-2), + stride_vd=v.stride(-1), + stride_ok=acc.stride(-2), + stride_obs=acc.stride(-4), + stride_oh=acc.stride(-3), + stride_od=acc.stride(-1), + stride_boffb=block_offsets.stride(0), + head_offset=head_offset, + num_heads=num_heads, + kv_group_num=kv_group_num, + block_per_cta=block_per_cta, + num_sub_blocks=num_sub_blocks, + BLOCK_DMODEL=Lq, + BLOCK_N=BLOCK, + num_warps=4, + num_stages=1, + **kernel_meta) grid = (batch, head) _reduce_split_kernel[grid](acc, @@ -520,7 +1002,7 @@ def alibi_paged_attention_fwd(q: Tensor, stride_oh=o.stride(-2), stride_od=o.stride(-1), SPLIT_K=SPLIT_K, - BLOCK_DMODEL=Lk, + BLOCK_DMODEL=Lq, num_warps=num_warps, num_stages=1, **kernel_meta) diff --git a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py index 69c18f5230..a9a6cab010 100644 --- a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py +++ b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
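The quantizing fill-kernel added below stores the KV cache as uint8 and keeps per-token, per-head (scale, zero) pairs next to it. As a rough orientation only (not part of the patch; names are illustrative), the cache shapes implied by `quant_policy` in the kernels and tests of this series look like the sketch below — int4 packs two values per byte, which is why the head dim is halved:

# Illustrative sketch, inferred from the kernels/tests in this series; not patch code.
def kv_cache_shapes(num_blocks: int, block_size: int, num_heads: int,
                    head_dim: int, quant_policy: int = 0):
    if quant_policy == 0:
        # unquantized cache keeps the model dtype; no scales/zeros tensor
        return (num_blocks, block_size, num_heads, head_dim), None
    # int8: one uint8 per element; int4: two elements packed per uint8
    packed_dim = head_dim // 2 if quant_policy == 4 else head_dim
    cache_shape = (num_blocks, block_size, num_heads, packed_dim)
    # (scale, zero) stored per token and head, concatenated on the last dim
    scales_zeros_shape = (num_blocks, block_size, num_heads, 2)
    return cache_shape, scales_zeros_shape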
+from typing import Literal
+
 import torch
 import triton
 import triton.language as tl
@@ -12,6 +14,30 @@ def _div_up(val, other):
     return (val + other - 1) // other
 
 
+@triton.jit
+def _quant_int8(val):
+    val_min = tl.min(val, 1)
+    val_max = tl.max(val, 1)
+    scales = (val_max - val_min) / 255
+    zeros = -val_min / scales
+    q_val = (val / scales[:, None] + zeros[:, None] + 0.5).to(tl.uint8)
+    return q_val, scales, zeros
+
+
+@triton.jit
+def _quant_int4(val1, val2):
+    val1 = val1.to(tl.float32)
+    val2 = val2.to(tl.float32)
+    val_min = tl.min(tl.minimum(val1, val2), 1)
+    val_max = tl.max(tl.maximum(val1, val2), 1)
+    scales = (val_max - val_min) / 15
+    zeros = -val_min / scales
+    q_val1 = (val1 / scales[:, None] + zeros[:, None] + 0.5).to(tl.uint8)
+    q_val2 = (val2 / scales[:, None] + zeros[:, None] + 0.5).to(tl.uint8)
+    q_val = q_val1 + q_val2 * 16
+    return q_val, scales, zeros
+
+
 @wrap_jit_func(type_hint=dict(
     KStates=Tensor,
     VStates=Tensor,
@@ -135,10 +161,224 @@ def _fill_kv_cache_kernel(
                 mask=maskv)
 
 
-def fill_kv_cache(k_states: Tensor, v_states: Tensor, k_caches: Tensor,
-                  v_caches: Tensor, q_start_loc: Tensor, q_seq_length: Tensor,
-                  kv_seq_length: Tensor, max_q_seq_length: int,
-                  block_offsets: Tensor):
+@wrap_jit_func(type_hint=dict(
+    KStates=Tensor,
+    VStates=Tensor,
+    KCaches=Tensor,
+    VCaches=Tensor,
+    KScalesZeros=Tensor,
+    VScalesZeros=Tensor,
+    QStartLoc=Tensor,
+    QSeqLens=Tensor,
+    KVSeqLens=Tensor,
+    BlockOffsets=Tensor,
+    num_heads=torch.int32,
+    head_dim=torch.int32,
+    stride_kss=int,
+    stride_ksh=int,
+    stride_ksd=int,
+    stride_vss=int,
+    stride_vsh=int,
+    stride_vsd=int,
+    stride_kcn=int,
+    stride_kcb=int,
+    stride_kch=int,
+    stride_kcd=int,
+    stride_vcn=int,
+    stride_vcb=int,
+    stride_vch=int,
+    stride_vcd=int,
+    stride_kszn=int,
+    stride_kszb=int,
+    stride_kszh=int,
+    stride_kszd=int,
+    stride_vszn=int,
+    stride_vszb=int,
+    stride_vszh=int,
+    stride_vszd=int,
+    stride_boff=int,
+    BLOCK=torch.int32,
+    BLOCK_D=torch.int32,
+    BLOCK_DV=torch.int32,
+    BLOCK_H=torch.int32,
+))
+@triton.jit
+def _fill_kv_cache_quant_kernel(
+    KStates,
+    VStates,
+    KCaches,
+    VCaches,
+    KScalesZeros,
+    VScalesZeros,
+    QStartLoc,
+    QSeqLens,
+    KVSeqLens,
+    BlockOffsets,
+    num_heads: tl.constexpr,
+    head_dim: tl.constexpr,
+    head_dim_v: tl.constexpr,
+    stride_kss,
+    stride_ksh,
+    stride_ksd,
+    stride_vss,
+    stride_vsh,
+    stride_vsd,
+    stride_kcn: tl.constexpr,
+    stride_kcb: tl.constexpr,
+    stride_kch: tl.constexpr,
+    stride_kcd: tl.constexpr,
+    stride_vcn: tl.constexpr,
+    stride_vcb: tl.constexpr,
+    stride_vch: tl.constexpr,
+    stride_vcd: tl.constexpr,
+    stride_kszn: tl.constexpr,
+    stride_kszb: tl.constexpr,
+    stride_kszh: tl.constexpr,
+    stride_kszd: tl.constexpr,
+    stride_vszn: tl.constexpr,
+    stride_vszb: tl.constexpr,
+    stride_vszh: tl.constexpr,
+    stride_vszd: tl.constexpr,
+    quant_policy: tl.constexpr,
+    stride_boff,
+    BLOCK: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+):
+    """fill kv cache kernel with int4 and int8 quant fused.
+ + Args: + stride_xss: stride of sequence length dim of key or value states + stride_xsh: stride of head_num dim of key or value states + stride_xsh: stride of head_size dim of key or value states + stride_xn: stride of page num dim + stride_xb: stride of block size dim + stride_xh: stride of head_num dim + stride_xd: stride of head_size dim + """ + batch_id = tl.program_id(0) + block_id = tl.program_id(1) + d_off = tl.arange(0, BLOCK_D) + + # initialize + h_off = tl.arange(0, BLOCK_H) + szd_off = tl.arange(0, 2) + + q_startloc = tl.load(QStartLoc + batch_id) + q_seqlen = tl.load(QSeqLens + batch_id) + kv_seqlen = tl.load(KVSeqLens + batch_id) + history_seqlen = kv_seqlen - q_seqlen + + block0_first_tokenloc = history_seqlen % BLOCK + + state_token_offset = tl.maximum(block_id * BLOCK - block0_first_tokenloc, + 0) + kv_block_id = _div_up(history_seqlen + 1, BLOCK) - 1 + block_id + kv_block_id = min(kv_block_id, stride_boff - 1) + block_off = tl.load(BlockOffsets + batch_id * stride_boff + kv_block_id) + + cur_startloc = q_startloc + state_token_offset + ks_ptr = KStates + cur_startloc * stride_kss + vs_ptr = VStates + cur_startloc * stride_vss + + kc_ptr = KCaches + block_off * stride_kcn + vc_ptr = VCaches + block_off * stride_vcn + + ksz_ptr = KScalesZeros + block_off * stride_kszn + vsz_ptr = VScalesZeros + block_off * stride_vszn + + c_first_tokenloc = block0_first_tokenloc + if block_id != 0: + c_first_tokenloc *= 0 + c_last_tokenloc = tl.minimum( + BLOCK, q_seqlen + block0_first_tokenloc - block_id * BLOCK) + + for bidx in range(c_first_tokenloc, c_last_tokenloc): + sidx = bidx - c_first_tokenloc + mask = (h_off[:, None] < num_heads) & (d_off[None, :] < head_dim) + if quant_policy == 4: + k1 = tl.load(ks_ptr + sidx * stride_kss + + h_off[:, None] * stride_ksh + + d_off[None, :] * stride_ksd, + mask=mask) + k2 = tl.load(ks_ptr + sidx * stride_kss + + h_off[:, None] * stride_ksh + + d_off[None, :] * stride_ksd + head_dim * stride_ksd, + mask=mask) + q_k, k_scales, k_zeros = _quant_int4(k1, k2) + else: + k = tl.load(ks_ptr + sidx * stride_kss + + h_off[:, None] * stride_ksh + + d_off[None, :] * stride_ksd, + mask=mask) + q_k, k_scales, k_zeros = _quant_int8(k) + tl.store(kc_ptr + bidx * stride_kcb + h_off[:, None] * stride_kch + + d_off[None, :] * stride_kcd, + q_k, + mask=mask) + tl.store(ksz_ptr + bidx * stride_kszb + h_off[:, None] * stride_kszh + + szd_off[None, :] * stride_kszd, + k_scales[:, None], + mask=(h_off[:, None] < num_heads) & (szd_off[None, :] < 1)) + tl.store(ksz_ptr + bidx * stride_kszb + h_off[:, None] * stride_kszh + + szd_off[None, :] * stride_kszd, + k_zeros[:, None], + mask=(h_off[:, None] < num_heads) & (szd_off[None, :] == 1)) + + if BLOCK_DV > 0: + if quant_policy == 4: + dv_off = tl.arange(0, BLOCK_DV // + 2) # int4 pack, half the head_dim + maskv = (h_off[:, None] < num_heads) & (dv_off[None, :] < + head_dim_v // 2) + v1 = tl.load(vs_ptr + sidx * stride_vss + + h_off[:, None] * stride_vsh + + dv_off[None, :] * stride_vsd, + mask=maskv) + v2 = tl.load(vs_ptr + sidx * stride_vss + + h_off[:, None] * stride_vsh + + dv_off[None, :] * stride_vsd + + head_dim_v // 2 * stride_vsd, + mask=maskv) + q_v, v_scales, v_zeros = _quant_int4(v1, v2) + else: + dv_off = tl.arange(0, BLOCK_DV) + maskv = (h_off[:, None] < num_heads) & (dv_off[None, :] < + head_dim_v) + v = tl.load(vs_ptr + sidx * stride_vss + + h_off[:, None] * stride_vsh + + dv_off[None, :] * stride_vsd, + mask=maskv) + q_v, v_scales, v_zeros = _quant_int8(v) + tl.store(vc_ptr + bidx * stride_vcb + h_off[:, 
None] * stride_vch + + dv_off[None, :] * stride_vcd, + q_v, + mask=maskv) + tl.store( + vsz_ptr + bidx * stride_vszb + h_off[:, None] * stride_vszh + + szd_off[None, :] * stride_vszd, + v_scales[:, None], + mask=(h_off[:, None] < num_heads) & (szd_off[None, :] < 1)) + tl.store( + vsz_ptr + bidx * stride_vszb + h_off[:, None] * stride_vszh + + szd_off[None, :] * stride_vszd, + v_zeros[:, None], + mask=(h_off[:, None] < num_heads) & (szd_off[None, :] == 1)) + + +def fill_kv_cache(k_states: Tensor, + v_states: Tensor, + k_caches: Tensor, + v_caches: Tensor, + q_start_loc: Tensor, + q_seq_length: Tensor, + kv_seq_length: Tensor, + max_q_seq_length: int, + block_offsets: Tensor, + k_scales_zeros: Tensor = None, + v_scales_zeros: Tensor = None, + quant_policy: Literal[0, 4, 8] = 0): """fill key/value state to cache for paged attention.""" block_offsets = block_offsets.contiguous() @@ -153,38 +393,86 @@ def fill_kv_cache(k_states: Tensor, v_states: Tensor, k_caches: Tensor, BLOCK_DV = triton.next_power_of_2(head_dim_v) grid = [batch_size, max_num_blocks] kernel_meta = get_kernel_meta(k_states) - _fill_kv_cache_kernel[grid]( - k_states, - v_states, - k_caches, - v_caches, - q_start_loc, - q_seq_length, - kv_seq_length, - block_offsets, - num_heads=num_heads, - head_dim=head_dim, - head_dim_v=head_dim_v, - stride_kss=k_states.stride(-3), - stride_ksh=k_states.stride(-2), - stride_ksd=k_states.stride(-1), - stride_vss=v_states.stride(-3), - stride_vsh=v_states.stride(-2), - stride_vsd=v_states.stride(-1), - stride_kcn=k_caches.stride(0), - stride_kcb=k_caches.stride(1), - stride_kch=k_caches.stride(2), - stride_kcd=k_caches.stride(3), - stride_vcn=v_caches.stride(0), - stride_vcb=v_caches.stride(1), - stride_vch=v_caches.stride(2), - stride_vcd=v_caches.stride(3), - stride_boff=block_offsets.stride(0), - BLOCK=BLOCK, - BLOCK_D=BLOCK_D, - BLOCK_DV=BLOCK_DV, - BLOCK_H=BLOCK_H, - num_warps=4, - num_stages=3, - **kernel_meta, - ) + if quant_policy == 0: + _fill_kv_cache_kernel[grid]( + k_states, + v_states, + k_caches, + v_caches, + q_start_loc, + q_seq_length, + kv_seq_length, + block_offsets, + num_heads=num_heads, + head_dim=head_dim, + head_dim_v=head_dim_v, + stride_kss=k_states.stride(-3), + stride_ksh=k_states.stride(-2), + stride_ksd=k_states.stride(-1), + stride_vss=v_states.stride(-3), + stride_vsh=v_states.stride(-2), + stride_vsd=v_states.stride(-1), + stride_kcn=k_caches.stride(0), + stride_kcb=k_caches.stride(1), + stride_kch=k_caches.stride(2), + stride_kcd=k_caches.stride(3), + stride_vcn=v_caches.stride(0), + stride_vcb=v_caches.stride(1), + stride_vch=v_caches.stride(2), + stride_vcd=v_caches.stride(3), + stride_boff=block_offsets.stride(0), + BLOCK=BLOCK, + BLOCK_D=BLOCK_D, + BLOCK_DV=BLOCK_DV, + BLOCK_H=BLOCK_H, + num_warps=4, + num_stages=3, + **kernel_meta, + ) + else: + _fill_kv_cache_quant_kernel[grid]( + k_states, + v_states, + k_caches, + v_caches, + k_scales_zeros, + v_scales_zeros, + q_start_loc, + q_seq_length, + kv_seq_length, + block_offsets, + num_heads=num_heads, + head_dim=head_dim, + head_dim_v=head_dim_v, + stride_kss=k_states.stride(-3), + stride_ksh=k_states.stride(-2), + stride_ksd=k_states.stride(-1), + stride_vss=v_states.stride(-3), + stride_vsh=v_states.stride(-2), + stride_vsd=v_states.stride(-1), + stride_kcn=k_caches.stride(0), + stride_kcb=k_caches.stride(1), + stride_kch=k_caches.stride(2), + stride_kcd=k_caches.stride(3), + stride_vcn=v_caches.stride(0), + stride_vcb=v_caches.stride(1), + stride_vch=v_caches.stride(2), + stride_vcd=v_caches.stride(3), + 
stride_kszn=k_scales_zeros.stride(0), + stride_kszb=k_scales_zeros.stride(1), + stride_kszh=k_scales_zeros.stride(2), + stride_kszd=k_scales_zeros.stride(3), + stride_vszn=v_scales_zeros.stride(0), + stride_vszb=v_scales_zeros.stride(1), + stride_vszh=v_scales_zeros.stride(2), + stride_vszd=v_scales_zeros.stride(3), + quant_policy=quant_policy, + stride_boff=block_offsets.stride(0), + BLOCK=BLOCK, + BLOCK_D=BLOCK_D, + BLOCK_DV=BLOCK_DV, + BLOCK_H=BLOCK_H, + num_warps=4, + num_stages=3, + **kernel_meta, + ) diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index 5f1661d8a2..aa363d2bd4 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. # modify from: https://github.com/ModelTC/lightllm +from typing import Literal + import torch import triton import triton.language as tl @@ -243,6 +245,290 @@ def _fwd_grouped_split_kernel( tl.store(Acc_out + off_meta + 1, l_i, mask=mask_h) +@triton.autotune(configs=[ + triton.Config({}, num_stages=2, num_warps=16), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=2, num_warps=4), +], + key=['BLOCK_H', 'BLOCK_N', 'BLOCK_DMODEL', 'BLOCK_DV']) +@wrap_jit_func(type_hint=dict( + Q=torch.Tensor, + K=torch.Tensor, + V=torch.Tensor, + KScalesZeros=torch.Tensor, + VScalesZeros=torch.Tensor, + sm_scale=float, + KV_seqlens=torch.Tensor, + Block_offsets=torch.Tensor, + Acc_out=torch.Tensor, + stride_qbs=int, + stride_qh=int, + stride_qd=int, + stride_kbs=int, + stride_kh=int, + stride_kd=int, + stride_vbs=int, + stride_vh=int, + stride_vd=int, + stride_kszp=int, + stride_kszbs=int, + stride_kszh=int, + stride_kszd=int, + stride_vszp=int, + stride_vszbs=int, + stride_vszh=int, + stride_vszd=int, + quant_policy=int, + stride_ok=int, + stride_obs=int, + stride_oh=int, + stride_od=int, + stride_boffb=int, + kv_group_num=torch.int32, + block_per_cta=torch.int32, + window_size=torch.int32, + head_size=torch.int32, + head_size_v=torch.int32, + BLOCK_DMODEL=torch.int32, + BLOCK_DV=torch.int32, + BLOCK_N=torch.int32, + BLOCK_H=torch.int32, + BLOCK_DMODEL1=torch.int32, +)) +@triton.jit +def _fwd_grouped_split_quant_kernel( + Q, + K, + V, + KScalesZeros, + VScalesZeros, + sm_scale, + KV_seqlens, + Block_offsets, + Acc_out, + stride_qbs: tl.constexpr, + stride_qh: tl.constexpr, + stride_qd: tl.constexpr, + stride_kp: tl.constexpr, + stride_kbs: tl.constexpr, + stride_kh: tl.constexpr, + stride_kd: tl.constexpr, + stride_vp: tl.constexpr, + stride_vbs: tl.constexpr, + stride_vh: tl.constexpr, + stride_vd: tl.constexpr, + stride_kszp: tl.constexpr, + stride_kszbs: tl.constexpr, + stride_kszh: tl.constexpr, + stride_kszd: tl.constexpr, + stride_vszp: tl.constexpr, + stride_vszbs: tl.constexpr, + stride_vszh: tl.constexpr, + stride_vszd: tl.constexpr, + quant_policy: tl.constexpr, + stride_ok: tl.constexpr, + stride_obs: tl.constexpr, + stride_oh: tl.constexpr, + stride_od: tl.constexpr, + stride_boffb, + kv_group_num: tl.constexpr, + window_size: tl.constexpr, + head_size: tl.constexpr, + head_size_v: tl.constexpr, + num_heads_q: tl.constexpr, + logit_softcapping: tl.constexpr, + SPLIT_K: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DV: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_H: tl.constexpr, + BLOCK_DMODEL1: tl.constexpr, +): + """first step kernel of split k attention. 
+ + Args: + stride_xp: stride of page num dim + stride_xbs: stride of block size dim + stride_h: stride of head num dim + stride_d: stride of head size dim + """ + cur_batch = tl.program_id(2) + cur_kv_head = tl.program_id(0) + split_k_id = tl.program_id(1) + + if BLOCK_H < kv_group_num: + HEAD_PER_CTA: tl.constexpr = BLOCK_H + else: + HEAD_PER_CTA: tl.constexpr = kv_group_num + cur_head = cur_kv_head * HEAD_PER_CTA + tl.arange(0, BLOCK_H) + mask_h = cur_head < cur_kv_head * HEAD_PER_CTA + HEAD_PER_CTA + mask_h = mask_h & (cur_head < num_heads_q) + + q_seqlen = 1 + kv_seqlen = tl.load(KV_seqlens + cur_batch) + if kv_seqlen <= 0: + return + history_len = kv_seqlen - q_seqlen + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dsz = tl.arange(0, 1) + mask_d = offs_d < head_size + offs_d = offs_d % head_size + offs_dv = tl.arange(0, BLOCK_DV) + mask_dv = offs_dv < head_size_v + offs_dv = offs_dv % head_size_v + off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + + offs_n[:, None] * stride_vbs) + off_ksz = (cur_kv_head * stride_kszh + offs_dsz[:, None] * stride_kszd + + offs_n[None, :] * stride_kszbs) + off_vsz = (cur_kv_head * stride_vszh + offs_dsz[None, :] * stride_vszd + + offs_n[:, None] * stride_vszbs) + + off_q = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh + + offs_d[None, :] * stride_qd) + q = tl.load(Q + off_q, mask=mask_h[:, None] & mask_d[None, :], other=0) + + ksz_ptrs = KScalesZeros + off_ksz + vsz_ptrs = VScalesZeros + off_vsz + + if BLOCK_DMODEL1 != 0: + offs_d1 = BLOCK_DMODEL + tl.arange(0, BLOCK_DMODEL1) + mask_d1 = offs_d1 < head_size + offs_d1 = offs_d1 % head_size + off_q1 = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh + + offs_d1[None, :] * stride_qd) + q1 = tl.load(Q + off_q1, + mask=mask_h[:, None] & mask_d1[None, :], + other=0) + off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + + block_offset_ptrs = Block_offsets + cur_batch * stride_boffb + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_H], dtype=tl.float32) - float('inf') + l_i = tl.zeros([BLOCK_H], dtype=tl.float32) + if quant_policy == 4: + if BLOCK_DMODEL1 != 0: + offs_d1 = BLOCK_DMODEL // 2 + tl.arange(0, BLOCK_DMODEL1) + shift_k1d = (offs_d1 // (head_size // 2) * 4)[:, None] + offs_d1 = offs_d1 % (head_size // 2) + off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + offs_d = tl.arange(0, BLOCK_DMODEL) % (head_size // 2) + shift_kd = (tl.arange(0, BLOCK_DMODEL) // (head_size // 2) * 4)[:, + None] + off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + offs_dv = tl.arange(0, BLOCK_DV * 2) % head_size_v + shift_vd = (tl.arange(0, BLOCK_DV * 2) // head_size_v * 4) + off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + + offs_n[:, None] * stride_vbs) + acc = tl.zeros([BLOCK_H, BLOCK_DV * 2], + dtype=tl.float32) # v head_dim packed + mask_dv = tl.arange(0, BLOCK_DV * 2) < (head_size_v * 2) + offs_dv = tl.arange(0, BLOCK_DV * 2) % (head_size_v * 2) + else: + acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32) + + num_total_blocks = tl.cdiv(kv_seqlen, BLOCK_N) + BLOCK_PER_CTA = tl.cdiv(num_total_blocks, SPLIT_K) + kv_len_per_prog = BLOCK_PER_CTA * BLOCK_N + loop_start = kv_len_per_prog * split_k_id + loop_end = tl.minimum(loop_start + kv_len_per_prog, kv_seqlen) + + # 
load block offset + # dirty + start_block_id = loop_start // BLOCK_N + if window_size > 0: + start_block_id = tl.maximum(history_len - window_size, + loop_start) // BLOCK_N + kv_min_loc = tl.maximum(history_len - window_size, 0) + + loop_start = start_block_id * BLOCK_N + for start_n in range(loop_start, loop_end, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + b_offset = tl.load(block_offset_ptrs + start_n // BLOCK_N) + + # -- compute qk ---- + # k = tl.load(k_ptrs + b_offset * stride_kp) + k = tl.load(K + off_k + b_offset * stride_kp) + if quant_policy == 4: + k = (k >> shift_kd) & 0x0F + ks = tl.load(ksz_ptrs + b_offset * stride_kszp) + kz = tl.load(ksz_ptrs + b_offset * stride_kszp + 1) + if BLOCK_DMODEL1 != 0: + k1 = tl.load(K + off_k1 + b_offset * stride_kp) + if quant_policy == 4: + k1 = (k1 >> shift_k1d) & 0x0F + k1 = ((k1 - kz) * ks).to(q.dtype) + + if quant_policy == 4: + v = tl.load(V + off_v + b_offset * stride_vp) + v = (v >> shift_vd) & 0x0F + else: + v = tl.load(V + off_v + b_offset * stride_vp) + vs = tl.load(vsz_ptrs + b_offset * stride_vszp) + vz = tl.load(vsz_ptrs + b_offset * stride_vszp + 1) + + k = ((k - kz) * ks).to(q.dtype) + v = ((v - vz) * vs).to(q.dtype) + qk = tl.zeros([BLOCK_H, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + if BLOCK_DMODEL1 != 0: + qk += tl.dot(q1, k1) + qk *= sm_scale + if logit_softcapping > 0.0: + qk = qk / logit_softcapping + qk = tanh(qk) + qk = qk * logit_softcapping + # NOTE: inf - inf = nan, and nan will leads to error + if start_n + BLOCK_N > history_len or window_size > 0: + qk_mask = history_len >= (start_n + offs_n) + if window_size > 0: + qk_mask = qk_mask and ((start_n + offs_n) >= kv_min_loc) + qk = tl.where( + qk_mask[None, :], + qk, + -float('inf'), + ) + + # -- compute p, m_i and l_i + m_i_new = tl.maximum(m_i, tl.max(qk, 1)) + p = fast_expf(qk - m_i_new[:, None]) + alpha = fast_expf(m_i - m_i_new) + l_i_new = alpha * l_i + tl.sum(p, 1) + + # -- update output accumulator -- + # scale acc + acc = acc * alpha[:, None] + + # update acc + p, v = _convert_pv(p, v) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + # initialize pointers to output + off_acc = (cur_batch * stride_obs + split_k_id * stride_ok + + cur_head[:, None] * stride_oh + offs_dv[None, :] * stride_od) + tl.store(Acc_out + off_acc, acc, mask=mask_h[:, None] & mask_dv[None, :]) + + if quant_policy == 4: + off_meta = (cur_batch * stride_obs + split_k_id * stride_ok + + cur_head * stride_oh + head_size_v * 2) + else: + off_meta = (cur_batch * stride_obs + split_k_id * stride_ok + + cur_head * stride_oh + head_size_v) + tl.store(Acc_out + off_meta, m_i, mask=mask_h) + tl.store(Acc_out + off_meta + 1, l_i, mask=mask_h) + + @wrap_jit_func(type_hint=dict( Acc=torch.Tensor, Out=torch.Tensor, @@ -494,6 +780,232 @@ def _fwd_kernel( mask=(offs_m[:, None] < q_seqlen) & mask_dv[None, :]) +# TODO: how to support inplace autotune? 
+# @triton.autotune(configs=[ +# triton.Config({}, num_stages=1, num_warps=16), +# triton.Config({}, num_stages=1, num_warps=8), +# triton.Config({}, num_stages=1, num_warps=4), +# ], +# key=['BLOCK_M', 'BLOCK_N', 'BLOCK_DMODEL', 'BLOCK_DV']) +@wrap_jit_func +@triton.jit +def _fwd_kernel_quant( + Q, + K, + V, + KScalesZeros, + VScalesZeros, + sm_scale, + Q_start_loc, + Q_seqlens, + KV_seqlens, + Block_offsets, + Out, + stride_qbs: tl.constexpr, + stride_qh: tl.constexpr, + stride_qd: tl.constexpr, + stride_kp: tl.constexpr, + stride_kbs: tl.constexpr, + stride_kh: tl.constexpr, + stride_kd: tl.constexpr, + stride_vp: tl.constexpr, + stride_vbs: tl.constexpr, + stride_vh: tl.constexpr, + stride_vd: tl.constexpr, + stride_kszp: tl.constexpr, + stride_kszbs: tl.constexpr, + stride_kszh: tl.constexpr, + stride_kszd: tl.constexpr, + stride_vszp: tl.constexpr, + stride_vszbs: tl.constexpr, + stride_vszh: tl.constexpr, + stride_vszd: tl.constexpr, + quant_policy: tl.constexpr, + stride_obs: tl.constexpr, + stride_oh: tl.constexpr, + stride_od: tl.constexpr, + stride_boffb, + kv_group_num: tl.constexpr, + window_size: tl.constexpr, + head_size: tl.constexpr, + head_size_v: tl.constexpr, + logit_softcapping: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DV: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_DMODEL1: tl.constexpr, +): + """paged attention kernel with dequant fused. + + Args: + stride_xp: stride of page num dim + stride_xbs: stride of block size dim + stride_h: stride of head num dim + stride_d: stride of head size dim + """ + cur_batch = tl.program_id(2) + cur_head = tl.program_id(1) + start_m = tl.program_id(0) + + cur_kv_head = cur_head // kv_group_num + + q_seqlen = tl.load(Q_seqlens + cur_batch) + kv_seqlen = tl.load(KV_seqlens + cur_batch) + q_start_loc = tl.load(Q_start_loc + cur_batch) + history_len = kv_seqlen - q_seqlen + + block_start_loc = BLOCK_M * start_m + if block_start_loc >= q_seqlen: + return + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dv = tl.arange(0, BLOCK_DV) + offs_dsz = tl.arange(0, 1) + mask_d = offs_d < head_size + offs_d = offs_d % head_size + mask_dv = offs_dv < head_size_v + offs_dv = offs_dv % head_size_v + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ((q_start_loc + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d[None, :] * stride_qd) + off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + + offs_n[:, None] * stride_vbs) + off_ksz = (cur_kv_head * stride_kszh + offs_dsz[:, None] * stride_kszd + + offs_n[None, :] * stride_kszbs) + off_vsz = (cur_kv_head * stride_vszh + offs_dsz[None, :] * stride_vszd + + offs_n[:, None] * stride_vszbs) + + q = tl.load(Q + off_q, + mask=(offs_m[:, None] < q_seqlen) & mask_d[None, :], + other=0.0) + + ksz_ptrs = KScalesZeros + off_ksz + vsz_ptrs = VScalesZeros + off_vsz + + if BLOCK_DMODEL1 != 0: + offs_d1 = BLOCK_DMODEL + tl.arange(0, BLOCK_DMODEL1) + mask_d1 = offs_d1 < head_size + offs_d1 = offs_d1 % head_size + off_q1 = ((q_start_loc + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d1[None, :] * stride_qd) + q1 = tl.load(Q + off_q1, mask=(offs_m[:, None] < q_seqlen) & mask_d1) + off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + + block_offset_ptrs = Block_offsets + cur_batch * stride_boffb + + # initialize pointer to m and l + 
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + if quant_policy == 4: + offs_d = tl.arange(0, BLOCK_DMODEL) % (head_size // 2) + offs_dv = tl.arange(0, BLOCK_DV * 2) % (head_size_v) + shift_kd = (tl.arange(0, BLOCK_DMODEL) // (head_size // 2) * 4)[:, + None] + off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + shift_vd = (tl.arange(0, BLOCK_DV * 2) // head_size_v * 4) + off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + + offs_n[:, None] * stride_vbs) + if BLOCK_DMODEL1 != 0: + offs_d1 = BLOCK_DMODEL // 2 + tl.arange(0, BLOCK_DMODEL1) + shift_k1d = (offs_d1 // (head_size // 2) * 4)[:, None] + offs_d1 = offs_d1 % (head_size // 2) + off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + acc = tl.zeros([BLOCK_M, BLOCK_DV * 2], + dtype=tl.float32) # v head_dim packed + mask_dv = tl.arange(0, BLOCK_DV * 2) < (head_size_v * 2) + offs_dv = tl.arange(0, BLOCK_DV * 2) % (head_size_v * 2) + else: + acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32) + + kv_start_loc = 0 + if window_size > 0: + start_block_id = tl.maximum(history_len - window_size, 0) // BLOCK_N + kv_min_loc = tl.maximum(history_len + offs_m - window_size, 0) + kv_start_loc = start_block_id * BLOCK_N + block_offset_ptrs += start_block_id + for start_n in range(kv_start_loc, kv_seqlen, BLOCK_N): + b_offset = tl.load(block_offset_ptrs) + block_offset_ptrs += 1 + + # -- compute qk ---- + k = tl.load(K + off_k + b_offset * stride_kp) + if quant_policy == 4: + k = (k >> shift_kd) & 0x0F + ks = tl.load(ksz_ptrs + b_offset * stride_kszp) + kz = tl.load(ksz_ptrs + b_offset * stride_kszp + 1) + if BLOCK_DMODEL1 != 0: + k1 = tl.load(K + off_k1 + b_offset * stride_kp) + if quant_policy == 4: + k1 = (k1 >> shift_k1d) & 0x0F + k1 = ((k1 - kz) * ks).to(q.dtype) + + if quant_policy == 4: + v = tl.load(V + off_v + b_offset * stride_vp) + v = (v >> shift_vd) & 0x0F + else: + v = tl.load(V + off_v + b_offset * stride_vp) + vs = tl.load(vsz_ptrs + b_offset * stride_vszp) + vz = tl.load(vsz_ptrs + b_offset * stride_vszp + 1) + + # k = tl.view(k, (ks.shape[0], ks.shape[1])) + v = ((v - vz) * vs).to(q.dtype) + k = ((k - kz) * ks).to(q.dtype) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + if BLOCK_DMODEL1 != 0: + qk += tl.dot(q1, k1) + qk *= sm_scale + if logit_softcapping > 0.0: + qk = qk / logit_softcapping + qk = tanh(qk) + qk = qk * logit_softcapping + # NOTE: inf - inf = nan, and nan will leads to error + if start_n + BLOCK_N > history_len or window_size > 0: + qk_mask = (history_len + offs_m[:, None]) >= (start_n + + offs_n[None, :]) + if window_size > 0: + qk_mask = qk_mask and ( + (start_n + offs_n[None, :]) >= kv_min_loc[:, None]) + qk = tl.where( + qk_mask, + qk, + float(-1e30), + ) + + # -- compute p, m_i and l_i + m_i_new = tl.maximum(m_i, tl.max(qk, 1)) + p = fast_expf(qk - m_i_new[:, None]) + alpha = fast_expf(m_i - m_i_new) + l_i_new = alpha * l_i + tl.sum(p, 1) + # -- update output accumulator -- + # scale acc + acc = acc * alpha[:, None] + + # update acc + p, v = _convert_pv(p, v) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + acc = fast_dividef(acc, l_i[:, None]) + # initialize pointers to output + off_o = ((q_start_loc + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + offs_dv[None, :] * stride_od) + out_ptrs = Out + off_o + tl.store(out_ptrs, + acc, + mask=(offs_m[:, None] < q_seqlen) & 
mask_dv[None, :]) + + def paged_attention_fwd( q: Tensor, k: Tensor, @@ -504,6 +1016,9 @@ def paged_attention_fwd( q_seqlens: Tensor, kv_seqlens: Tensor, max_seqlen: int, + k_scales_zeros: Tensor = None, + v_scales_zeros: Tensor = None, + quant_policy: Literal[0, 4, 8] = 0, window_size: int = None, sm_scale: float = None, logit_softcapping: float = None, @@ -545,7 +1060,10 @@ def _get_block_d(Lk): # shape constraints Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk, Lv == o.shape[-1] + if quant_policy == 4: + assert Lq == Lk * 2 and Lv * 2 == o.shape[-1] + else: + assert Lq == Lk and Lv == o.shape[-1] if sm_scale is None: sm_scale = 1.0 / (Lq**0.5) @@ -554,60 +1072,121 @@ def _get_block_d(Lk): BLOCK = k.size(1) assert BLOCK >= 16 - if Lk > 512 and BLOCK > 32: - logger.warning(f'`head_dim={Lk}` and `block_size={BLOCK}` ' + if Lq > 512 and BLOCK > 32: + logger.warning(f'`head_dim={Lq}` and `block_size={BLOCK}` ' 'might leads to bad performance. ' 'Please reduce `block_size`.') kernel_meta = get_kernel_meta(q) is_decoding = q.shape[-3] == q_seqlens.size(0) if not is_decoding: - BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lk) + BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lq) BLOCK_M = max(16, min(BLOCK, 16384 // BLOCK_DMODEL)) num_warps = 4 num_stages = 2 grid = (triton.cdiv(max_seqlen, BLOCK_M), head, batch) - _fwd_kernel[grid](q, - k, - v, - sm_scale, - q_start_loc, - q_seqlens, - kv_seqlens, - block_offsets, - o, - stride_qbs=q.stride(-3), - stride_qh=q.stride(-2), - stride_qd=q.stride(-1), - stride_kp=k.stride(-4), - stride_kbs=k.stride(-3), - stride_kh=k.stride(-2), - stride_kd=k.stride(-1), - stride_vp=v.stride(-4), - stride_vbs=v.stride(-3), - stride_vh=v.stride(-2), - stride_vd=v.stride(-1), - stride_obs=o.stride(-3), - stride_oh=o.stride(-2), - stride_od=o.stride(-1), - stride_boffb=block_offsets.stride(0), - kv_group_num=kv_group_num, - window_size=window_size, - head_size=Lk, - head_size_v=Lv, - logit_softcapping=logit_softcapping, - BLOCK_M=BLOCK_M, - BLOCK_DMODEL=BLOCK_DMODEL, - BLOCK_DV=BLOCK_DV, - BLOCK_N=BLOCK, - BLOCK_DMODEL1=BLOCK_DMODEL1, - num_warps=num_warps, - num_stages=num_stages, - **kernel_meta) + if quant_policy > 0: + _fwd_kernel_quant[grid](q, + k, + v, + k_scales_zeros, + v_scales_zeros, + sm_scale, + q_start_loc, + q_seqlens, + kv_seqlens, + block_offsets, + o, + stride_qbs=q.stride(-3), + stride_qh=q.stride(-2), + stride_qd=q.stride(-1), + stride_kp=k.stride(-4), + stride_kbs=k.stride(-3), + stride_kh=k.stride(-2), + stride_kd=k.stride(-1), + stride_vp=v.stride(-4), + stride_vbs=v.stride(-3), + stride_vh=v.stride(-2), + stride_vd=v.stride(-1), + stride_kszp=k_scales_zeros.stride(-4), + stride_kszbs=k_scales_zeros.stride(-3), + stride_kszh=k_scales_zeros.stride(-2), + stride_kszd=k_scales_zeros.stride(-1), + stride_vszp=v_scales_zeros.stride(-4), + stride_vszbs=v_scales_zeros.stride(-3), + stride_vszh=v_scales_zeros.stride(-2), + stride_vszd=v_scales_zeros.stride(-1), + quant_policy=quant_policy, + stride_obs=o.stride(-3), + stride_oh=o.stride(-2), + stride_od=o.stride(-1), + stride_boffb=block_offsets.stride(0), + kv_group_num=kv_group_num, + window_size=window_size, + head_size=Lq, + head_size_v=Lv, + logit_softcapping=logit_softcapping, + BLOCK_M=BLOCK_M, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_DMODEL1=BLOCK_DMODEL1, + num_warps=num_warps, + num_stages=num_stages, + **kernel_meta) + else: + _fwd_kernel[grid](q, + k, + v, + sm_scale, + q_start_loc, + q_seqlens, + kv_seqlens, + 
block_offsets, + o, + stride_qbs=q.stride(-3), + stride_qh=q.stride(-2), + stride_qd=q.stride(-1), + stride_kp=k.stride(-4), + stride_kbs=k.stride(-3), + stride_kh=k.stride(-2), + stride_kd=k.stride(-1), + stride_vp=v.stride(-4), + stride_vbs=v.stride(-3), + stride_vh=v.stride(-2), + stride_vd=v.stride(-1), + stride_obs=o.stride(-3), + stride_oh=o.stride(-2), + stride_od=o.stride(-1), + stride_boffb=block_offsets.stride(0), + kv_group_num=kv_group_num, + window_size=window_size, + head_size=Lk, + head_size_v=Lv, + logit_softcapping=logit_softcapping, + BLOCK_M=BLOCK_M, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_DMODEL1=BLOCK_DMODEL1, + num_warps=num_warps, + num_stages=num_stages, + **kernel_meta) else: SPLIT_K = 4 - acc = q.new_empty(batch, head, SPLIT_K, Lv + 2, dtype=torch.float32) - BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lk) + if quant_policy != 4: + acc = q.new_empty(batch, + head, + SPLIT_K, + Lv + 2, + dtype=torch.float32) + else: + acc = q.new_empty(batch, + head, + SPLIT_K, + o.shape[-1] + 2, + dtype=torch.float32) + BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lq) p2_kv_group_num = triton.next_power_of_2(kv_group_num) BLOCK_H = max(16, min(BLOCK, p2_kv_group_num)) grid_1 = triton.cdiv(head, min(BLOCK_H, kv_group_num)) @@ -616,45 +1195,100 @@ def _get_block_d(Lk): SPLIT_K, batch, ) - _fwd_grouped_split_kernel[grid](q, - k, - v, - sm_scale, - kv_seqlens, - block_offsets, - acc, - stride_qbs=q.stride(-3), - stride_qh=q.stride(-2), - stride_qd=q.stride(-1), - stride_kp=k.stride(-4), - stride_kbs=k.stride(-3), - stride_kh=k.stride(-2), - stride_kd=k.stride(-1), - stride_vp=v.stride(-4), - stride_vbs=v.stride(-3), - stride_vh=v.stride(-2), - stride_vd=v.stride(-1), - stride_ok=acc.stride(-2), - stride_obs=acc.stride(-4), - stride_oh=acc.stride(-3), - stride_od=acc.stride(-1), - stride_boffb=block_offsets.stride(0), - kv_group_num=kv_group_num, - window_size=window_size, - head_size=Lk, - head_size_v=Lv, - num_heads_q=head, - logit_softcapping=logit_softcapping, - SPLIT_K=SPLIT_K, - BLOCK_DMODEL=BLOCK_DMODEL, - BLOCK_DV=BLOCK_DV, - BLOCK_N=BLOCK, - BLOCK_H=BLOCK_H, - BLOCK_DMODEL1=BLOCK_DMODEL1, - **kernel_meta) + if quant_policy > 0: + _fwd_grouped_split_quant_kernel[grid]( + q, + k, + v, + k_scales_zeros, + v_scales_zeros, + sm_scale, + kv_seqlens, + block_offsets, + acc, + stride_qbs=q.stride(-3), + stride_qh=q.stride(-2), + stride_qd=q.stride(-1), + stride_kp=k.stride(-4), + stride_kbs=k.stride(-3), + stride_kh=k.stride(-2), + stride_kd=k.stride(-1), + stride_vp=v.stride(-4), + stride_vbs=v.stride(-3), + stride_vh=v.stride(-2), + stride_vd=v.stride(-1), + stride_kszp=k_scales_zeros.stride(-4), + stride_kszbs=k_scales_zeros.stride(-3), + stride_kszh=k_scales_zeros.stride(-2), + stride_kszd=k_scales_zeros.stride(-1), + stride_vszp=v_scales_zeros.stride(-4), + stride_vszbs=v_scales_zeros.stride(-3), + stride_vszh=v_scales_zeros.stride(-2), + stride_vszd=v_scales_zeros.stride(-1), + quant_policy=quant_policy, + stride_ok=acc.stride(-2), + stride_obs=acc.stride(-4), + stride_oh=acc.stride(-3), + stride_od=acc.stride(-1), + stride_boffb=block_offsets.stride(0), + kv_group_num=kv_group_num, + window_size=window_size, + head_size=Lq, + head_size_v=Lv, + num_heads_q=head, + logit_softcapping=logit_softcapping, + SPLIT_K=SPLIT_K, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + BLOCK_DMODEL1=BLOCK_DMODEL1, + **kernel_meta) + + else: + _fwd_grouped_split_kernel[grid]( + q, + k, + v, + sm_scale, + 
kv_seqlens, + block_offsets, + acc, + stride_qbs=q.stride(-3), + stride_qh=q.stride(-2), + stride_qd=q.stride(-1), + stride_kp=k.stride(-4), + stride_kbs=k.stride(-3), + stride_kh=k.stride(-2), + stride_kd=k.stride(-1), + stride_vp=v.stride(-4), + stride_vbs=v.stride(-3), + stride_vh=v.stride(-2), + stride_vd=v.stride(-1), + stride_ok=acc.stride(-2), + stride_obs=acc.stride(-4), + stride_oh=acc.stride(-3), + stride_od=acc.stride(-1), + stride_boffb=block_offsets.stride(0), + kv_group_num=kv_group_num, + window_size=window_size, + head_size=Lk, + head_size_v=Lv, + num_heads_q=head, + logit_softcapping=logit_softcapping, + SPLIT_K=SPLIT_K, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + BLOCK_DMODEL1=BLOCK_DMODEL1, + **kernel_meta) num_warps = 4 grid = (batch, head) + if quant_policy == 4: + Lv *= 2 + BLOCK_DV *= 2 _reduce_split_kernel[grid](acc, o, stride_ak=acc.stride(-2), diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index 25731445ca..af36237a56 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from contextlib import contextmanager from dataclasses import dataclass, field, fields -from typing import Any, Dict, List +from typing import Any, Dict, List, Literal import torch @@ -210,6 +210,7 @@ class StepContext: vision_inputs: VisionModelInputs = None mrope_position_ids: torch.Tensor = None attn_metadata: Any = None + kv_quant_policy: Literal[0, 4, 8] = 0 _outputs: Dict = field(default_factory=dict) @@ -219,6 +220,7 @@ def new( inputs: ModelInputs, world_size: int = 1, kv_caches: List = None, + kv_quant_policy: Literal[0, 4, 8] = 0, ): """build step context. @@ -277,6 +279,7 @@ def new( local_adapter_ids=inputs.local_adapter_ids, vision_inputs=inputs.vision_inputs, mrope_position_ids=mrope_position_ids, + kv_quant_policy=kv_quant_policy, ) ret = get_backend().update_step_context(ret) @@ -307,12 +310,14 @@ def build_context( inputs: ModelInputs, world_size: int = 1, kv_caches: List = None, + kv_quant_policy: Literal[0, 4, 8] = 0, ): """build context.""" return StepContext.new( inputs, world_size, kv_caches, + kv_quant_policy, ) @contextmanager diff --git a/lmdeploy/pytorch/models/baichuan.py b/lmdeploy/pytorch/models/baichuan.py index 599846e8ed..6bd18d9e58 100644 --- a/lmdeploy/pytorch/models/baichuan.py +++ b/lmdeploy/pytorch/models/baichuan.py @@ -101,6 +101,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py index 81dbef74c1..efb44b2431 100644 --- a/lmdeploy/pytorch/models/chatglm2.py +++ b/lmdeploy/pytorch/models/chatglm2.py @@ -201,6 +201,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py index 7e6e9cd91b..5023dd8e81 100644 --- a/lmdeploy/pytorch/models/cogvlm.py +++ b/lmdeploy/pytorch/models/cogvlm.py @@ -152,6 +152,10 @@ def forward( 
past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/dbrx.py b/lmdeploy/pytorch/models/dbrx.py index 7d10f21cb2..dd1191625b 100644 --- a/lmdeploy/pytorch/models/dbrx.py +++ b/lmdeploy/pytorch/models/dbrx.py @@ -94,6 +94,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/deepseek.py b/lmdeploy/pytorch/models/deepseek.py index d4dd4e763b..5ae59316e2 100644 --- a/lmdeploy/pytorch/models/deepseek.py +++ b/lmdeploy/pytorch/models/deepseek.py @@ -108,6 +108,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py index 94151a0656..4e82a67abe 100644 --- a/lmdeploy/pytorch/models/deepseek_v2.py +++ b/lmdeploy/pytorch/models/deepseek_v2.py @@ -269,6 +269,10 @@ def forward( past_key_value[0], past_key_value[0][..., :nope_size], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_bmm_out = attn_output.new_empty(q_len, num_heads, self.v_head_dim) diff --git a/lmdeploy/pytorch/models/falcon.py b/lmdeploy/pytorch/models/falcon.py index 65e5deb7fe..e767d29849 100644 --- a/lmdeploy/pytorch/models/falcon.py +++ b/lmdeploy/pytorch/models/falcon.py @@ -104,6 +104,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/gemma.py b/lmdeploy/pytorch/models/gemma.py index 3c4149b2ca..2d9f85f2ca 100644 --- a/lmdeploy/pytorch/models/gemma.py +++ b/lmdeploy/pytorch/models/gemma.py @@ -103,6 +103,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/internlm.py b/lmdeploy/pytorch/models/internlm.py index 8537259efc..f8869543be 100644 --- a/lmdeploy/pytorch/models/internlm.py +++ b/lmdeploy/pytorch/models/internlm.py @@ -94,6 +94,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/internlm2.py b/lmdeploy/pytorch/models/internlm2.py index 
8176f3b212..497090afc5 100644 --- a/lmdeploy/pytorch/models/internlm2.py +++ b/lmdeploy/pytorch/models/internlm2.py @@ -93,6 +93,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/llama.py b/lmdeploy/pytorch/models/llama.py index a933e60825..91f9ec4cfc 100644 --- a/lmdeploy/pytorch/models/llama.py +++ b/lmdeploy/pytorch/models/llama.py @@ -95,6 +95,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/mistral.py b/lmdeploy/pytorch/models/mistral.py index 73897005b3..4c369b716b 100644 --- a/lmdeploy/pytorch/models/mistral.py +++ b/lmdeploy/pytorch/models/mistral.py @@ -95,6 +95,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/mixtral.py b/lmdeploy/pytorch/models/mixtral.py index a575af118a..677d82905b 100644 --- a/lmdeploy/pytorch/models/mixtral.py +++ b/lmdeploy/pytorch/models/mixtral.py @@ -91,6 +91,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/phi3.py b/lmdeploy/pytorch/models/phi3.py index 8581d2b6ec..a2859e3e3e 100644 --- a/lmdeploy/pytorch/models/phi3.py +++ b/lmdeploy/pytorch/models/phi3.py @@ -98,6 +98,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/phi3_moe.py b/lmdeploy/pytorch/models/phi3_moe.py index ed017130da..080f5e996c 100644 --- a/lmdeploy/pytorch/models/phi3_moe.py +++ b/lmdeploy/pytorch/models/phi3_moe.py @@ -146,6 +146,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/qwen.py b/lmdeploy/pytorch/models/qwen.py index 2bb9e53dff..50b9fd4ee8 100644 --- a/lmdeploy/pytorch/models/qwen.py +++ b/lmdeploy/pytorch/models/qwen.py @@ -99,6 +99,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git 
a/lmdeploy/pytorch/models/qwen2.py b/lmdeploy/pytorch/models/qwen2.py index f9fb80c18d..5ffa06665b 100644 --- a/lmdeploy/pytorch/models/qwen2.py +++ b/lmdeploy/pytorch/models/qwen2.py @@ -95,6 +95,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/qwen2_moe.py b/lmdeploy/pytorch/models/qwen2_moe.py index 1c461abb40..4f3406a5d8 100644 --- a/lmdeploy/pytorch/models/qwen2_moe.py +++ b/lmdeploy/pytorch/models/qwen2_moe.py @@ -111,6 +111,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/qwen2_vl.py b/lmdeploy/pytorch/models/qwen2_vl.py index b543234696..ced57f79f4 100644 --- a/lmdeploy/pytorch/models/qwen2_vl.py +++ b/lmdeploy/pytorch/models/qwen2_vl.py @@ -125,6 +125,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/models/starcoder2.py b/lmdeploy/pytorch/models/starcoder2.py index f0e4299d07..7498df606f 100644 --- a/lmdeploy/pytorch/models/starcoder2.py +++ b/lmdeploy/pytorch/models/starcoder2.py @@ -96,6 +96,10 @@ def forward( past_key_value[0], past_key_value[1], attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], inplace=True, ) attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) diff --git a/lmdeploy/pytorch/nn/attention.py b/lmdeploy/pytorch/nn/attention.py index 8f18da4dcb..860bcfaaec 100644 --- a/lmdeploy/pytorch/nn/attention.py +++ b/lmdeploy/pytorch/nn/attention.py @@ -59,6 +59,8 @@ def forward( k_cache: torch.Tensor, v_cache: torch.Tensor, attn_metadata: AttentionMetadata, + k_scales_zeros: torch.Tensor = None, + v_scales_zeros: torch.Tensor = None, inplace: bool = True, ) -> torch.Tensor: """forward.""" @@ -69,5 +71,7 @@ def forward( k_cache, v_cache, attn_metadata=attn_metadata, + k_scales_zeros=k_scales_zeros, + v_scales_zeros=v_scales_zeros, inplace=inplace, ) diff --git a/tests/pytorch/kernel/test_fill_kv_cache.py b/tests/pytorch/kernel/test_fill_kv_cache.py index 5ab97e6030..92aa8d7672 100644 --- a/tests/pytorch/kernel/test_fill_kv_cache.py +++ b/tests/pytorch/kernel/test_fill_kv_cache.py @@ -8,6 +8,23 @@ def _div_up(a, b): return (a + b - 1) // b +def precise_round(x: torch.Tensor): + return x.sign() * (x.abs() + 0.5).floor() + + +def quant(kv: torch.Tensor, nbits: int = 8): + """Quant kv on the head_dim.""" + amax = kv.amax(dim=-1, keepdim=True) + amin = kv.amin(dim=-1, keepdim=True) + scales = (amax - amin) / (2**nbits - 1) + zeros = -amin / scales + q_kv = precise_round((kv - amin) / scales).to(torch.uint8) + if nbits == 4: + q_kv1, q_kv2 = q_kv.split(q_kv.shape[-1] // 2, -1) + q_kv = q_kv1 + q_kv2 * 16 + return q_kv, torch.cat([scales, zeros], dim=-1) + + class TestFillKVCache: @pytest.fixture 
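The `quant` helper above is the reference used to build ground truth for the quantized cache; inside the kernels the stored values are recovered as `(q - zero) * scale`. A minimal dequantization sketch is shown here purely to make that convention explicit — it is not part of the patch, and it assumes the low/high-nibble int4 packing used by `quant`:

import torch


def dequant(q_kv: torch.Tensor, scales_zeros: torch.Tensor, nbits: int = 8):
    # `quant` concatenated scales and zeros on the last dim
    scales, zeros = scales_zeros.chunk(2, dim=-1)
    if nbits == 4:
        # low nibble holds the first half of head_dim, high nibble the second
        q_kv = torch.cat([q_kv & 0x0F, q_kv >> 4], dim=-1)
    # kv ~= (q - zero) * scale, mirroring the dequant done in the kernels
    return (q_kv.float() - zeros) * scales

Round-tripping `dequant(*quant(kv, nbits), nbits)` against the original `kv` gives a quick sanity check, with tolerances similar to the ones used in the attention tests below.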
@@ -141,3 +158,128 @@ def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, torch.testing.assert_close(k_caches, gt[0]) torch.testing.assert_close(v_caches, gt[1]) + + +class TestFillKVCacheInt8(TestFillKVCache): + + @pytest.fixture + def k_caches(self, batch_size, max_num_blocks, block_size, num_heads, + head_dim): + shape = (batch_size * max_num_blocks, block_size, num_heads, head_dim) + yield torch.full(shape, 0, dtype=torch.uint8).cuda() + + @pytest.fixture + def v_caches(self, k_caches): + yield torch.rand_like(k_caches.to(torch.float32)).to(torch.uint8) + + @pytest.fixture + def k_scales_zeros(self, batch_size, max_num_blocks, block_size, + num_heads): + shape = (batch_size * max_num_blocks, block_size, num_heads, 2) + yield torch.full(shape, 0.0).cuda() + + @pytest.fixture + def v_scales_zeros(self, k_scales_zeros): + yield torch.zeros_like(k_scales_zeros) + + @pytest.fixture + def nbits(self): + yield 8 + + @pytest.fixture + def gt(self, k_states, v_states, k_caches, v_caches, seq_lens, + history_lens, block_offsets, block_size, k_scales_zeros, + v_scales_zeros, nbits): + k_states, k_states_sz = quant(k_states, nbits) + v_states, v_states_sz = quant(v_states, nbits) + batch_size = len(seq_lens) + k_caches = k_caches.clone() + v_caches = v_caches.clone() + splited_k_states = k_states.split(seq_lens) + splited_v_states = v_states.split(seq_lens) + splited_k_states_sz = k_states_sz.split(seq_lens) + splited_v_states_sz = v_states_sz.split(seq_lens) + for bidx in range(batch_size): + k_state = splited_k_states[bidx] + v_state = splited_v_states[bidx] + k_state_sz = splited_k_states_sz[bidx] + v_state_sz = splited_v_states_sz[bidx] + h_len = history_lens[bidx] + b_offs = block_offsets[bidx] + block_id = _div_up(h_len + 1, block_size) - 1 + fill_start = h_len % block_size + fill_size = min(block_size - fill_start, k_state.size(0)) + while True: + boff = b_offs[block_id] + tmp_ks = k_state[:fill_size] + tmp_vs = v_state[:fill_size] + tmp_ks_sz = k_state_sz[:fill_size] + tmp_vs_sz = v_state_sz[:fill_size] + fill_end = fill_start + fill_size + k_caches[boff, fill_start:fill_end] = tmp_ks + v_caches[boff, fill_start:fill_end] = tmp_vs + k_scales_zeros[boff, fill_start:fill_end] = tmp_ks_sz + v_scales_zeros[boff, fill_start:fill_end] = tmp_vs_sz + k_state = k_state[fill_size:] + v_state = v_state[fill_size:] + k_state_sz = k_state_sz[fill_size:] + v_state_sz = v_state_sz[fill_size:] + block_id += 1 + fill_start = 0 + fill_size = min(block_size, k_state.size(0)) + if fill_size == 0: + break + + yield k_caches, v_caches, k_scales_zeros, v_scales_zeros + + @pytest.mark.parametrize(['seq_lens', 'history_lens'], [ + ((1, 1, 1, 1), (1, 16, 31, 24)), + ((1, 8, 16, 24), (1, 16, 31, 24)), + ], + indirect=True) + def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, + k_scales_zeros, v_scales_zeros, block_offsets, + q_start_loc, q_seq_length, kv_seq_length, + max_q_seq_length, gt): + fill_kv_cache(k_states, v_states, k_caches, v_caches, q_start_loc, + q_seq_length, kv_seq_length, max_q_seq_length, + block_offsets, k_scales_zeros, v_scales_zeros, 8) + + torch.testing.assert_close(k_caches, gt[0]) + torch.testing.assert_close(v_caches, gt[1]) + torch.testing.assert_close(k_scales_zeros, gt[2]) + torch.testing.assert_close(v_scales_zeros, gt[3]) + + +class TestFillKVCacheInt4(TestFillKVCacheInt8): + + @pytest.fixture + def k_caches(self, batch_size, max_num_blocks, block_size, num_heads, + head_dim): + shape = (batch_size * max_num_blocks, block_size, num_heads, + head_dim // 
2) + yield torch.full(shape, 0, dtype=torch.uint8).cuda() + + @pytest.fixture + def nbits(self): + yield 4 + + @pytest.mark.parametrize(['seq_lens', 'history_lens'], [ + ((1, 1, 1, 1), (1, 16, 31, 24)), + ((1, 8, 16, 24), (1, 16, 31, 24)), + ], + indirect=True) + def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, + k_scales_zeros, v_scales_zeros, block_offsets, + q_start_loc, q_seq_length, kv_seq_length, + max_q_seq_length, gt, nbits): + k_scales_zeros = torch.zeros_like(k_scales_zeros) + v_scales_zeros = torch.zeros_like(v_scales_zeros) + fill_kv_cache(k_states, v_states, k_caches, v_caches, q_start_loc, + q_seq_length, kv_seq_length, max_q_seq_length, + block_offsets, k_scales_zeros, v_scales_zeros, nbits) + + torch.testing.assert_close(k_scales_zeros, gt[2]) + torch.testing.assert_close(v_scales_zeros, gt[3]) + torch.testing.assert_close(k_caches, gt[0]) + torch.testing.assert_close(v_caches, gt[1]) diff --git a/tests/pytorch/kernel/test_paged_attention.py b/tests/pytorch/kernel/test_paged_attention.py index 90dc153aeb..5d4b024199 100644 --- a/tests/pytorch/kernel/test_paged_attention.py +++ b/tests/pytorch/kernel/test_paged_attention.py @@ -302,3 +302,157 @@ def test_window_attention(self, conti_q, blocked_kv, block_offsets, max_seqlen=max_seq_len, window_size=win_size) torch.testing.assert_close(out, window_gt, atol=1e-3, rtol=1e-5) + + +def precise_round(x: torch.Tensor): + return x.sign() * (x.abs() + 0.5).floor() + + +def quant(kv: torch.Tensor, nbits: int = 8): + """Quant kv on the head_dim.""" + amax = kv.amax(dim=-1, keepdim=True) + amin = kv.amin(dim=-1, keepdim=True) + scales = (amax - amin) / (2**nbits - 1) + zeros = -amin / scales + q_kv = precise_round((kv - amin) / scales).to(torch.uint8) + if nbits == 4: + q_kv1, q_kv2 = q_kv.split(q_kv.shape[-1] // 2, -1) + q_kv = q_kv1 + q_kv2 * 16 + return q_kv, torch.cat([scales, zeros], dim=-1) + + +def _make_blocked_cache_quant(batched_k, batched_v, seq_lens, history_lens, + block_offsets, block_size, num_heads_k, feat_dim, + feat_dim_v, nbits): + max_blocks_nums = block_offsets.max() + 1 + full_seq_lens = seq_lens + history_lens + batched_k, k_scales_zeros = quant(batched_k, nbits) + batched_v, v_scales_zeros = quant(batched_v, nbits) + if nbits == 4: + feat_dim //= 2 + feat_dim_v //= 2 + blocked_k = batched_k.new_zeros(max_blocks_nums, block_size, num_heads_k, + feat_dim) + blocked_v = batched_v.new_zeros(max_blocks_nums, block_size, num_heads_k, + feat_dim_v) + blocked_ksz = k_scales_zeros.new_zeros(max_blocks_nums, block_size, + num_heads_k, 2) + blocked_vsz = v_scales_zeros.new_zeros(max_blocks_nums, block_size, + num_heads_k, 2) + + for batch_id, offset in enumerate(block_offsets): + ori_k = batched_k[batch_id] + ori_v = batched_v[batch_id] + ori_ksz = k_scales_zeros[batch_id] + ori_vsz = v_scales_zeros[batch_id] + seq_len = full_seq_lens[batch_id] + for block_id, block_start in enumerate(range(0, seq_len, block_size)): + block_off = offset[block_id] + tmp_k = ori_k[block_start:block_start + block_size] + tmp_v = ori_v[block_start:block_start + block_size] + tmp_ksz = ori_ksz[block_start:block_start + block_size] + tmp_vsz = ori_vsz[block_start:block_start + block_size] + size = tmp_k.size(0) + blocked_k[block_off, :size] = tmp_k + blocked_v[block_off, :size] = tmp_v + blocked_ksz[block_off, :size] = tmp_ksz + blocked_vsz[block_off, :size] = tmp_vsz + + return blocked_k, blocked_v, blocked_ksz, blocked_vsz + + +class TestPagedAttentionInt8(TestPagedAttention): + + @pytest.fixture + def nbits(self): + yield 8 
+ + @pytest.fixture + def blocked_kv(self, batched_kv, seq_lens, history_lens, block_offsets, + block_size, num_heads_k, feat_dim, feat_dim_v, nbits): + batched_k, batched_v = batched_kv + yield _make_blocked_cache_quant(batched_k, batched_v, seq_lens, + history_lens, block_offsets, + block_size, num_heads_k, feat_dim, + feat_dim_v, nbits) + + @pytest.mark.parametrize('feat_dim', [48, 32], indirect=True) + @pytest.mark.parametrize('feat_dim_v', [32], indirect=True) + @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(8, 2), (2, 2)], + indirect=True) + @pytest.mark.parametrize(['seq_lens', 'history_lens'], + [([30, 50, 70, 90], [50, 40, 30, 20]), + ([1, 1, 1, 1], [50, 40, 30, 20])], + indirect=True) + @pytest.mark.parametrize('block_size', [16], indirect=True) + def test_paged_attention(self, conti_q, blocked_kv, block_offsets, + start_loc, seq_lens, history_lens, feat_dim_v, + conti_gt, nbits): + from lmdeploy.pytorch.kernels import paged_attention_fwd + kv_seq_lens = seq_lens + history_lens + max_seq_len = seq_lens.max().item() + + blocked_k, blocked_v, blocked_ksz, blocked_vsz = blocked_kv + out = conti_q.new_empty(*conti_q.shape[:-1], feat_dim_v) + + paged_attention_fwd(conti_q, + blocked_k, + blocked_v, + out, + k_scales_zeros=blocked_ksz, + v_scales_zeros=blocked_vsz, + quant_policy=nbits, + block_offsets=block_offsets, + q_start_loc=start_loc, + q_seqlens=seq_lens, + kv_seqlens=kv_seq_lens, + max_seqlen=max_seq_len) + if nbits == 4: + torch.testing.assert_close(out, conti_gt, atol=0.05, rtol=0.01) + else: + torch.testing.assert_close(out, conti_gt, atol=1e-3, rtol=1e-5) + + @pytest.mark.parametrize('feat_dim', [16], indirect=True) + @pytest.mark.parametrize('feat_dim_v', [16], indirect=True) + @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)], + indirect=True) + @pytest.mark.parametrize(['seq_lens', 'history_lens'], [ + ([30, 50, 70, 90], [50, 40, 30, 20]), + ([1, 1, 1, 1], [50, 40, 30, 20]), + ], + indirect=True) + @pytest.mark.parametrize('win_size', (32, ), indirect=True) + @pytest.mark.parametrize('block_size', [16], indirect=True) + def test_window_attention(self, conti_q, blocked_kv, block_offsets, + start_loc, seq_lens, history_lens, feat_dim_v, + win_size, window_gt, nbits): + from lmdeploy.pytorch.kernels import paged_attention_fwd + kv_seq_lens = seq_lens + history_lens + max_seq_len = seq_lens.max().item() + + blocked_k, blocked_v, blocked_ksz, blocked_vsz = blocked_kv + out = conti_q.new_empty(*conti_q.shape[:-1], feat_dim_v) + paged_attention_fwd(conti_q, + blocked_k, + blocked_v, + out, + k_scales_zeros=blocked_ksz, + v_scales_zeros=blocked_vsz, + quant_policy=nbits, + block_offsets=block_offsets, + q_start_loc=start_loc, + q_seqlens=seq_lens, + kv_seqlens=kv_seq_lens, + max_seqlen=max_seq_len, + window_size=win_size) + if nbits == 4: + torch.testing.assert_close(out, window_gt, atol=0.05, rtol=0.01) + else: + torch.testing.assert_close(out, window_gt, atol=1e-3, rtol=1e-5) + + +class TestPagedAttentionInt4(TestPagedAttentionInt8): + + @pytest.fixture + def nbits(self): + yield 4 From ba3701b57548e81aec88e2d5861a9915a5463ca7 Mon Sep 17 00:00:00 2001 From: Superskyyy Date: Mon, 14 Oct 2024 23:30:22 -0400 Subject: [PATCH 006/122] Fix spacing in ascend user guide(#2601) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 41ff9c0268..91ac1d4134 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ ______________________________________________________________________
2024 -- \[2024/09\] LMDeploy PyTorchEngine adds support for[Huawei Ascend](./docs/en/get_started/ascend/get_started.md). See supported models [here](docs/en/supported_models/supported_models.md) +- \[2024/09\] LMDeploy PyTorchEngine adds support for [Huawei Ascend](./docs/en/get_started/ascend/get_started.md). See supported models [here](docs/en/supported_models/supported_models.md) - \[2024/09\] LMDeploy PyTorchEngine achieves 1.3x faster on Llama3-8B inference by introducing CUDA graph - \[2024/08\] LMDeploy is integrated into [modelscope/swift](https://github.com/modelscope/swift) as the default accelerator for VLMs inference - \[2024/07\] Support Llama3.1 8B, 70B and its TOOLS CALLING From b689cbc31a5a414566f30f3f70b8ff05669668f4 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 16 Oct 2024 17:28:07 +0800 Subject: [PATCH 007/122] [ci] add pytorch kvint testcase into function regresstion (#2584) * update * update * update * update * update * update * update * update * update * update * update * update * update * updata * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * Update config.yaml * update * update kvint testcase for vl model * update * update * update * updaste * update * update * update --- autotest/config.yaml | 83 +++------- .../chat/test_command_chat_hf_pytorch.py | 48 ++++++ .../pipeline/test_pipeline_chat_pytorch.py | 113 +++++++++++-- .../pipeline/test_pipeline_chat_turbomind.py | 68 ++++++-- .../test_pipeline_chat_turbomind_vl.py | 54 +++++++ .../restful/test_restful_chat_hf_pytorch.py | 81 +++++++++- .../restful/test_restful_chat_hf_turbomind.py | 58 +++++-- .../test_restful_chat_hf_turbomind_vl.py | 63 +++++++- autotest/utils/config_utils.py | 152 ++++++++++-------- autotest/utils/pipeline_chat.py | 13 +- autotest/utils/run_restful_chat.py | 2 +- 11 files changed, 560 insertions(+), 175 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 07505718c6..c0db4a71fc 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -57,10 +57,13 @@ turbomind_chat_model: - deepseek-ai/deepseek-coder-1.3b-instruct - codellama/CodeLlama-7b-Instruct-hf - THUDM/glm-4-9b-chat + - openbmb/MiniCPM-Llama3-V-2_5 + - openbmb/MiniCPM-V-2_6 pytorch_chat_model: - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Llama-3.2-1B-Instruct - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat @@ -71,6 +74,7 @@ pytorch_chat_model: - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B - OpenGVLab/InternVL2-40B + - OpenGVLab/InternVL-Chat-V1-5 - baichuan-inc/Baichuan2-7B-Chat - baichuan-inc/Baichuan2-13B-Chat - 01-ai/Yi-6B-Chat @@ -94,9 +98,9 @@ pytorch_chat_model: - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - THUDM/glm-4-9b-chat + - THUDM/cogvlm-chat-hf - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct - - bigcode/starcoder2-7b turbomind_base_model: - internlm/internlm2_5-7b @@ -109,6 +113,7 @@ pytorch_base_model: - internlm/internlm2_5-7b - internlm/internlm2_5-1_8b - internlm/internlm2_5-20b + - bigcode/starcoder2-7b vl_model: - Qwen/Qwen-VL-Chat @@ -125,81 +130,27 @@ vl_model: - OpenGVLab/InternVL2-40B - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - internlm/internlm-xcomposer2-vl-7b - internlm/internlm-xcomposer2d5-7b - internlm/internlm-xcomposer2-4khd-7b - THUDM/cogvlm-chat-hf - 
THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b + - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct - openbmb/MiniCPM-Llama3-V-2_5 - openbmb/MiniCPM-V-2_6 turbomind_quatization: - awq: - - meta-llama/Meta-Llama-3-1-8B-Instruct - - meta-llama/Meta-Llama-3-8B-Instruct - - meta-llama/Llama-2-7b-chat-hf - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-7b - - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-20b - - internlm/internlm2_5-20b - - internlm/internlm-chat-20b - - internlm/internlm-xcomposer2-4khd-7b - - internlm/internlm-xcomposer2d5-7b - - OpenGVLab/InternVL-Chat-V1-5 - - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 - - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-8B - - OpenGVLab/InternVL2-26B - - OpenGVLab/InternVL2-40B - - Qwen/Qwen1.5-7B-Chat - - Qwen/Qwen2-7B-Instruct - - Qwen/Qwen2-1.5B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen-VL-Chat - - liuhaotian/llava-v1.5-13b - - liuhaotian/llava-v1.6-vicuna-7b - - 01-ai/Yi-VL-6B - - 01-ai/Yi-6B-Chat - - deepseek-ai/deepseek-vl-1.3b-chat - - baichuan-inc/Baichuan2-7B-Chat - - codellama/CodeLlama-7b-hf - - openbmb/MiniCPM-Llama3-V-2_5 - - THUDM/glm-4-9b-chat - gptq: - - internlm/internlm2_5-7b-chat - kvint: - - meta-llama/Meta-Llama-3-1-8B-Instruct - - meta-llama/Meta-Llama-3-8B-Instruct - - meta-llama/Llama-2-7b-chat-hf - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - internlm/internlm2-chat-20b - - internlm/internlm2-chat-20b-4bits - - internlm/internlm-chat-20b - - internlm/internlm-xcomposer2-4khd-7b - - internlm/internlm-xcomposer2d5-7b - - OpenGVLab/InternVL-Chat-V1-5 - - Qwen/Qwen2-7B-Instruct - - Qwen/Qwen2-7B-Instruct-AWQ - - Qwen/Qwen2-1.5B-Instruct - - Qwen/Qwen1.5-7B-Chat - - Qwen/Qwen1.5-4B-Chat-AWQ - - Qwen/Qwen-VL-Chat + no_awq: + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 - - lmdeploy/llama2-chat-7b-w4 - - baichuan-inc/Baichuan2-7B-Chat - - 01-ai/Yi-6B-Chat - - 01-ai/Yi-VL-6B - - liuhaotian/llava-v1.5-13b - - liuhaotian/llava-v1.6-vicuna-7b - - deepseek-ai/deepseek-vl-1.3b-chat - deepseek-ai/deepseek-coder-1.3b-instruct - codellama/CodeLlama-7b-Instruct-hf - - THUDM/glm-4-9b-chat + gptq: + - internlm/internlm2_5-7b-chat pytorch_quatization: awq: @@ -211,6 +162,7 @@ pytorch_quatization: - internlm/internlm2-chat-20b - OpenGVLab/InternVL-Chat-V1-5 - 01-ai/Yi-6B-Chat + - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2-1.5B-Instruct - microsoft/Phi-3-mini-4k-instruct @@ -223,6 +175,13 @@ pytorch_quatization: - 01-ai/Yi-6B-Chat - internlm/internlm2_5-20b - internlm/internlm2_5-7b + no_kvint4: + - OpenGVLab/InternVL2-4B + - deepseek-ai/DeepSeek-V2-Lite-Chat + - microsoft/Phi-3-mini-4k-instruct + - microsoft/Phi-3-vision-128k-instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat longtext_model: diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 642f87ec28..1ae3be338b 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -51,6 +51,54 @@ def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id): assert result, msg +@pytest.mark.order(10) +@pytest.mark.usefixtures('cli_case_config') +@pytest.mark.hf_turbomind_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, + model_type='base_model')) +def 
test_hf_pytorch_base_tp1(config, model, cli_case_config, worker_id): + usercase = 'base_testcase' + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'pytorch', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) + + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + + assert result, msg + + +@pytest.mark.order(10) +@pytest.mark.usefixtures('cli_case_config') +@pytest.mark.hf_turbomind_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=2, + model_type='base_model')) +def test_hf_pytorch_base_tp2(config, model, cli_case_config, worker_id): + usercase = 'base_testcase' + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'pytorch', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2)) + + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + + assert result, msg + + @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py index 8f56225ebc..270d4b6831 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py @@ -7,19 +7,13 @@ run_pipeline_chat_test) -def getModelList(tp_num): - return [ - item for item in get_torch_model_list(tp_num) - if 'falcon' not in item.lower() and 'chatglm2' not in item.lower() - ] - - @pytest.mark.order(6) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('model', getModelList(tp_num=1)) +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, exclude_dup=True)) def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: @@ -39,7 +33,8 @@ def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, @pytest.mark.pipeline_chat_pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('model', getModelList(tp_num=2)) +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=2, exclude_dup=True)) def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: @@ -57,6 +52,106 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, worker_id) +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, + quant_policy=4, + exclude_dup=True)) +def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, + worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'pytorch-kvint', + worker_id, { + 'quant_policy': 4 + })) + p.start() + p.join() + assert_pipeline_chat_log(config, common_case_config, model, + 'pytorch-kvint', worker_id) + + +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', + 
get_torch_model_list(tp_num=2, + quant_policy=4, + exclude_dup=True)) +def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, + worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'pytorch-kvint', + worker_id, { + 'quant_policy': 4 + })) + p.start() + p.join() + assert_pipeline_chat_log(config, common_case_config, model, + 'pytorch-kvint', worker_id) + + +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, + quant_policy=8, + exclude_dup=True)) +def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, + worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'pytorch-kvint', + worker_id, { + 'quant_policy': 8 + })) + p.start() + p.join() + assert_pipeline_chat_log(config, common_case_config, model, + 'pytorch-kvint', worker_id) + + +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=2, + quant_policy=8, + exclude_dup=True)) +def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, + worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'pytorch-kvint', + worker_id, { + 'quant_policy': 8 + })) + p.start() + p.join() + assert_pipeline_chat_log(config, common_case_config, model, + 'pytorch-kvint', worker_id) + + @pytest.mark.order(6) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py index d92af06ecb..d67b5d27b3 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py @@ -2,8 +2,7 @@ from multiprocessing import Process import pytest -from utils.config_utils import (get_all_model_list, get_cuda_id_by_workerid, - get_kvint_model_list) +from utils.config_utils import get_all_model_list, get_cuda_id_by_workerid from utils.pipeline_chat import (assert_pipeline_chat_log, run_pipeline_chat_test) @@ -50,18 +49,17 @@ def test_pipeline_chat_tp2(config, common_case_config, model, worker_id): @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('model', get_kvint_model_list(tp_num=1)) -@pytest.mark.parametrize('quant_policy', (4, 8)) -def test_pipeline_chat_kvint_tp1(config, common_case_config, model, - quant_policy, worker_id): - if quant_policy == 4 and 'Qwen2' in model: +@pytest.mark.parametrize('model', get_all_model_list(tp_num=1, quant_policy=4)) +def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, + worker_id): + if 'Qwen2' in model: return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'turbomind-kvint', worker_id, { - 
'quant_policy': quant_policy + 'quant_policy': 4 })) p.start() p.join() @@ -74,19 +72,59 @@ def test_pipeline_chat_kvint_tp1(config, common_case_config, model, @pytest.mark.pipeline_chat @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('model', get_kvint_model_list(tp_num=2)) -@pytest.mark.parametrize('quant_policy', (4, 8)) -def test_pipeline_chat_kvint_tp2(config, common_case_config, model, - quant_policy, worker_id): - if quant_policy == 4 and 'Qwen2' in model: - return # kvint4 for qwen2 is not support +@pytest.mark.parametrize('model', get_all_model_list(tp_num=2, quant_policy=4)) +def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, + worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'turbomind-kvint', + worker_id, { + 'quant_policy': 4 + })) + p.start() + p.join() + assert_pipeline_chat_log(config, common_case_config, model, + 'turbomind-kvint', worker_id) + + +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', get_all_model_list(tp_num=1, quant_policy=8)) +def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, + worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'turbomind-kvint', + worker_id, { + 'quant_policy': 8 + })) + p.start() + p.join() + assert_pipeline_chat_log(config, common_case_config, model, + 'turbomind-kvint', worker_id) + + +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', get_all_model_list(tp_num=2, quant_policy=8)) +def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, + worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'turbomind-kvint', worker_id, { - 'quant_policy': quant_policy + 'quant_policy': 8 })) p.start() p.join() diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py index b02aa21fd3..cee08308ff 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py @@ -34,6 +34,60 @@ def test_pipeline_chat_tp2(config, model, worker_id): assert_pipeline_vl_chat_log(config, model) +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', get_vl_model_list(tp_num=1, quant_policy=4)) +def test_pipeline_chat_kvint4_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', get_vl_model_list(tp_num=2, quant_policy=4)) +def test_pipeline_chat_kvint4_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = 
get_cuda_id_by_workerid(worker_id, + tp_num=2) + p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', get_vl_model_list(tp_num=1, quant_policy=8)) +def test_pipeline_chat_kvint8_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', get_vl_model_list(tp_num=2, quant_policy=8)) +def test_pipeline_chat_kvint8_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model) + + @pytest.mark.pipeline_chat @pytest.mark.gpu_num_1 @pytest.mark.pr_test diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch.py b/autotest/tools/restful/test_restful_chat_hf_pytorch.py index aef836812a..ab1f5595ae 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch.py @@ -23,7 +23,7 @@ def getModelList(tp_num): 'model': item, 'cuda_prefix': None, 'tp_num': tp_num - } for item in get_torch_model_list(tp_num)] + } for item in get_torch_model_list(tp_num, exclude_dup=True)] @pytest.mark.order(7) @@ -60,6 +60,85 @@ def test_restful_chat_tp2(config, common_case_config, worker_id): port=DEFAULT_PORT + get_workerid(worker_id)) +def getKvintModelList(tp_num, quant_policy): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + 'extra': f'--quant-policy {quant_policy}' + } for item in get_torch_model_list( + tp_num, quant_policy=quant_policy, exclude_dup=True) + if 'qwen2' not in item.lower() or quant_policy == 8] + + +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp2(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, 
common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp2(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + @pytest.mark.order(7) @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind.py b/autotest/tools/restful/test_restful_chat_hf_turbomind.py index c9fade16a4..91e65ee51a 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind.py @@ -1,6 +1,5 @@ import pytest -from utils.config_utils import (get_all_model_list, get_kvint_model_list, - get_workerid) +from utils.config_utils import get_all_model_list, get_workerid from utils.run_restful_chat import (run_all_step, start_restful_api, stop_restful_api) @@ -61,19 +60,14 @@ def test_restful_chat_tp2(config, common_case_config, worker_id): port=DEFAULT_PORT + get_workerid(worker_id)) -def getKvintModelList(tp_num): +def getKvintModelList(tp_num, quant_policy): return [{ 'model': item, 'cuda_prefix': None, 'tp_num': tp_num, - 'extra': '--quant-policy 4' - } for item in get_kvint_model_list(tp_num) - if 'qwen2' not in item.lower()] + [{ - 'model': item, - 'cuda_prefix': None, - 'tp_num': tp_num, - 'extra': '--quant-policy 8' - } for item in get_kvint_model_list(tp_num)] + 'extra': f'--quant-policy {quant_policy}' + } for item in get_all_model_list(tp_num, quant_policy=quant_policy) + if 'qwen2' not in item.lower() or quant_policy == 8] @pytest.mark.order(7) @@ -81,9 +75,9 @@ def getKvintModelList(tp_num): @pytest.mark.restful_api @pytest.mark.gpu_num_1 @pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=1), + getKvintModelList(tp_num=1, quant_policy=4), indirect=True) -def test_restful_chat_kvint_tp1(config, common_case_config, worker_id): +def test_restful_chat_kvint4_tp1(config, common_case_config, worker_id): if get_workerid(worker_id) is None: run_all_step(config, common_case_config) else: @@ -98,9 +92,43 @@ def test_restful_chat_kvint_tp1(config, common_case_config, worker_id): @pytest.mark.restful_api @pytest.mark.gpu_num_2 @pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=2), + getKvintModelList(tp_num=2, quant_policy=4), indirect=True) -def test_restful_chat_kvint_tp2(config, common_case_config, worker_id): +def test_restful_chat_kvint4_tp2(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp1(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + 
run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp2(config, common_case_config, worker_id): if get_workerid(worker_id) is None: run_all_step(config, common_case_config) else: diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py index 0046d84eef..68c254d6b8 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py @@ -28,7 +28,7 @@ def getModelList(tp_num): return [{ 'model': item, 'cuda_prefix': None, - 'tp_num': tp_num + 'tp_num': tp_num, } for item in get_vl_model_list(tp_num)] @@ -58,6 +58,67 @@ def test_restful_chat_tp2(config, worker_id): run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) +def getKvintModelList(tp_num, quant_policy: int = None): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + 'extra': f'--quant-policy {quant_policy}' + } for item in get_vl_model_list(tp_num, quant_policy)] + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config) + else: + run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config) + else: + run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config) + else: + run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config) + else: + run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + PIC = 'https://raw.githubusercontent.com/' + \ 'open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index c8ff08ad91..1f2c72b26b 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -1,3 +1,4 @@ +import copy import os import yaml @@ -8,10 +9,11 @@ def get_turbomind_model_list(tp_num: int = None, model_type: str = 'chat_model'): config = get_config() - case_list = config.get('turbomind_' + model_type) + case_list = copy.deepcopy(config.get('turbomind_' + model_type)) quatization_case_config = config.get('turbomind_quatization') - for key in 
quatization_case_config.get('awq'): - if key in case_list: + for key in config.get('turbomind_' + model_type): + if key not in quatization_case_config.get( + 'no_awq') and not is_quantization_model(key): case_list.append(key + '-inner-4bits') for key in quatization_case_config.get('gptq'): if key in case_list: @@ -25,10 +27,36 @@ def get_turbomind_model_list(tp_num: int = None, return case_list -def get_torch_model_list(tp_num: int = None, model_type: str = 'chat_model'): +def get_torch_model_list(tp_num: int = None, + model_type: str = 'chat_model', + exclude_dup: bool = False, + quant_policy: int = None): config = get_config() + exclude_dup = False + + if exclude_dup: + if quant_policy is None: + case_list = [ + x for x in config.get('pytorch_' + model_type) + if x in config.get('turbomind_' + model_type) + ] + else: + case_list = [ + x for x in config.get('pytorch_' + model_type) + if x in config.get('turbomind_' + model_type) and x not in + config.get('pytorch_quatization').get('no_kvint' + + str(quant_policy)) + ] + else: + if quant_policy is None: + case_list = config.get('pytorch_' + model_type) + else: + case_list = [ + x for x in config.get('pytorch_' + model_type) + if x not in config.get('pytorch_quatization').get( + 'no_kvint' + str(quant_policy)) + ] - case_list = config.get('pytorch_' + model_type) quatization_case_config = config.get('pytorch_quatization') for key in quatization_case_config.get('w8a8'): if key in case_list: @@ -45,67 +73,31 @@ def get_torch_model_list(tp_num: int = None, model_type: str = 'chat_model'): return case_list -def get_all_model_list(tp_num: int = None, model_type: str = 'chat_model'): - config = get_config() - - case_list = config.get('turbomind_' + model_type) - for key in config.get('pytorch_' + model_type): - if key not in case_list: - case_list.append(key) - turbomind_quantization_config = config.get('turbomind_quatization') - pytorch_quantization_config = config.get('pytorch_quatization') - for key in turbomind_quantization_config.get( - 'awq') + pytorch_quantization_config.get( - 'awq') + turbomind_quantization_config.get('gptq'): - if key in case_list and key + '-inner-4bits' not in case_list: - case_list.append(key + '-inner-4bits') - - if tp_num is not None: - return [ - item for item in case_list if get_tp_num(config, item) == tp_num - ] - else: - return case_list - - -def get_kvint_model_list(tp_num: int = None, model_type: str = 'chat_model'): - config = get_config() - - case_list_base = config.get('turbomind_' + model_type) - for key in config.get('pytorch_' + model_type): - if key not in case_list_base: - case_list_base.append(key) - - case_list = [] - for key in config.get('turbomind_quatization').get('kvint'): - if key in case_list_base: - case_list.append(key) - - for key in config.get('turbomind_quatization').get('awq'): - if key in case_list_base and key in case_list: - case_list.append(key + '-inner-4bits') - for key in config.get('turbomind_quatization').get('gptq'): - if key in case_list_base and key in case_list: - case_list.append(key + '-inner-gptq') - - if tp_num is not None: - return [ - item for item in case_list if get_tp_num(config, item) == tp_num - ] - else: - return case_list +def get_all_model_list(tp_num: int = None, + quant_policy: int = None, + model_type: str = 'chat_model'): + case_list = get_turbomind_model_list(tp_num=tp_num, model_type=model_type) + for case in get_torch_model_list(tp_num=tp_num, + quant_policy=quant_policy, + model_type=model_type): + if case not in case_list: + case_list.append(case) + 
return [x for x in case_list if 'w8a8' not in x] def get_quantization_model_list(type): config = get_config() if type == 'awq': - case_list = config.get('turbomind_quatization').get('awq') + case_list = [ + x for x in config.get('turbomind_chat_model') + + config.get('turbomind_base_model') + if x not in config.get('turbomind_quatization').get('no_awq') + and not is_quantization_model(x) + ] for key in config.get('pytorch_quatization').get('awq'): if key not in case_list: case_list.append(key) return case_list - if type == 'kvint': - return config.get('turbomind_quatization').get(type) if type == 'gptq': return config.get('turbomind_quatization').get(type) if type == 'w8a8': @@ -113,13 +105,32 @@ def get_quantization_model_list(type): return [] -def get_vl_model_list(tp_num: int = None): +def get_vl_model_list(tp_num: int = None, quant_policy: int = None): config = get_config() - case_list = config.get('vl_model') + if quant_policy is None: + case_list = copy.deepcopy(config.get('vl_model')) + else: + case_list = [ + x for x in config.get('vl_model') + if x in config.get('turbomind_chat_model') or ( + x in config.get('pytorch_chat_model') and x not in config.get( + 'pytorch_quatization').get('no_kvint' + str(quant_policy))) + ] - for key in config.get('turbomind_quatization').get('awq'): - if key in case_list: + for key in config.get('vl_model'): + if key in config.get('turbomind_chat_model') and key not in config.get( + 'turbomind_quatization').get( + 'no_awq') and not is_quantization_model( + key) and key + '-inner-4bits' not in case_list: + case_list.append(key + '-inner-4bits') + if key in config.get('pytorch_chat_model') and key in config.get( + 'pytorch_quatization' + ).get('awq') and not is_quantization_model( + key) and key + '-inner-4bits' not in case_list and ( + quant_policy is not None + and key not in config.get('pytorch_quatization').get( + 'no_kvint' + str(quant_policy))): case_list.append(key + '-inner-4bits') if tp_num is not None: @@ -169,12 +180,16 @@ def get_benchmark_model_list(tp_num, kvint_list: list = []): config = get_config() if is_longtext: - case_list = [item for item in config.get('longtext_model')] + case_list_base = [item for item in config.get('longtext_model')] else: - case_list = config.get('benchmark_model') + case_list_base = config.get('benchmark_model') quatization_case_config = config.get('turbomind_quatization') - for key in quatization_case_config.get('awq'): - if key in case_list: + + case_list = copy.deepcopy(case_list_base) + for key in case_list_base: + if key in config.get('turbomind_chat_model' + ) and key not in quatization_case_config.get( + 'no_awq') and not is_quantization_model(key): case_list.append(key + '-inner-4bits') model_list = [ @@ -203,7 +218,7 @@ def get_benchmark_model_list(tp_num, 'quant_policy': kvint, 'tp_num': tp_num } for item in model_list if item.replace('-inner-4bits', '') in - config.get('turbomind_quatization').get('kvint')] + config.get('turbomind_chat_model')] return result @@ -212,3 +227,8 @@ def get_workerid(worker_id): return None else: return int(worker_id.replace('gw', '')) + + +def is_quantization_model(name): + return 'awq' in name.lower() or '4bits' in name.lower( + ) or 'w4' in name.lower() or 'int4' in name.lower() diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index e94b331881..33d65448ab 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -30,14 +30,13 @@ def run_pipeline_chat_test(config, else: hf_path = model_case - if 'pytorch' == type: 
+ if 'pytorch' in type: backend_config = PytorchEngineConfig(tp=tp) - elif 'pytorch_lora' == type: - backend_config = PytorchEngineConfig(tp=tp, - adapters=extra.get('adapters')) else: backend_config = TurbomindEngineConfig(tp=tp) + if 'lora' in type: + backend_config.adapters = extra.get('adapters') if 'kvint' in type: backend_config.quant_policy = extra.get('quant_policy') @@ -277,7 +276,7 @@ def assert_pipeline_single_element(output, 'open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg' -def run_pipeline_vl_chat_test(config, model_case): +def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): log_path = config.get('log_path') tp = get_tp_num(config, model_case) model_path = config.get('model_path') @@ -291,6 +290,8 @@ def run_pipeline_vl_chat_test(config, model_case): backend_config = TurbomindEngineConfig(tp=tp, session_len=8192) if '4bit' in model_case.lower() or 'awq' in model_case.lower(): backend_config.model_format = 'awq' + if quant_policy is not None: + backend_config.quant_policy = quant_policy pipe = pipeline(hf_path, backend_config=backend_config) pipeline_chat_log = os.path.join( @@ -303,6 +304,8 @@ def run_pipeline_vl_chat_test(config, model_case): prompt = f'describe this image{IMAGE_TOKEN}' else: prompt = 'describe this image' + + file.writelines('engineconfig:' + str(backend_config)) response = pipe((prompt, image)) result = 'tiger' in response.text.lower() or '虎' in response.text.lower() file.writelines('result:' + str(result) + diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 6e60c53833..1eb84f1d93 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -62,7 +62,7 @@ def start_restful_api(config, param, model, model_path, backend_type, cmd += ' --backend pytorch' if 'llava' in model: cmd += ' --model-name vicuna' - if backend_type == 'turbomind' and 'quant_policy' in param.keys(): + if 'quant_policy' in param.keys() and param['quant_policy'] is not None: quant_policy = param['quant_policy'] cmd += f' --quant-policy {quant_policy}' From fec94c9c320fdb7f25abc643342606339a1debbe Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Thu, 17 Oct 2024 15:22:21 +0800 Subject: [PATCH 008/122] Add a workaround for saving internvl2 with latest transformers (#2583) * Add a workaround for saving internvl2 with latest transformers * fix lint * use arch --- lmdeploy/lite/apis/calibrate.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index c867f45962..b2fd8e3883 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -132,6 +132,24 @@ class name or the class type itself. 
print(f'Move {mod_name} to GPU.') +# TODO to be removed +def make_compatible_internvl_config(model_path): + """Patch model.config since after transformers v4.45.0, InternVL models + can't use `save_pretrained`""" + from lmdeploy.archs import get_model_arch + arch, _ = get_model_arch(model_path) + if arch == 'InternVLChatModel': + import transformers + from packaging import version + if version.parse(transformers.__version__) >= version.parse('4.45.0'): + + def _get_non_default_generation_parameters(self): + return {} + + from transformers import PretrainedConfig + PretrainedConfig._get_non_default_generation_parameters = _get_non_default_generation_parameters # noqa + + def calibrate(model: str, calib_dataset: str = 'ptb', calib_samples: int = 128, @@ -175,6 +193,7 @@ def calibrate(model: str, 'Support only `c4`, `ptb`, `wikitext2` or `pileval`.' model_type, _ = get_task(model) + make_compatible_internvl_config(model) if model_type == 'llm': # Load tokenizer and configuration tokenizer = AutoTokenizer.from_pretrained(model, From 7dc0a5c7772c35bbb166fa7d6d89caf65a0b86ff Mon Sep 17 00:00:00 2001 From: q yao Date: Fri, 18 Oct 2024 12:31:34 +0800 Subject: [PATCH 009/122] optimize paged attention on triton3 (#2553) * optimize paged attention on triton3 * fix w8a8 kernel * optimize prefill * optimize short decoding * optimize sm<8 * optimize short context * fix triton2.2.0 * recovery test * add ut for custom layout * update stride * update ut --- .github/workflows/unit-test.yml | 4 +- lmdeploy/pytorch/check_env/__init__.py | 18 +- lmdeploy/pytorch/engine/engine.py | 2 +- .../pytorch/kernels/cuda/fill_kv_cache.py | 61 ++-- .../pytorch/kernels/cuda/pagedattention.py | 273 +++++++++++------- .../kernels/cuda/w8a8_triton_kernels.py | 11 +- requirements/runtime.txt | 4 +- tests/pytorch/kernel/test_paged_attention.py | 37 ++- 8 files changed, 259 insertions(+), 151 deletions(-) diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 64b3acd52a..ec6db0682d 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -53,7 +53,7 @@ jobs: - name: Install pytorch run: | python3 -m pip cache dir - python3 -m pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118 + python3 -m pip install torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118 - name: Build lmdeploy run: | python3 -m pip install cmake @@ -77,7 +77,7 @@ jobs: run: | python3 -m pip install pynvml packaging protobuf transformers_stream_generator # manually install flash attn - python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl + python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp38-cp38-linux_x86_64.whl python3 -m pip install -r requirements.txt -r requirements/test.txt python3 -m pip install . 
- name: Check env diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index ea2dda8e8d..5ace70b53c 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -93,15 +93,13 @@ def check_env_triton(device: str): if device == 'cuda': device_cap = torch.cuda.get_device_capability() - TRITON_VER_220 = version.parse('2.2.0') TRITON_VER_231 = version.parse('2.3.1') if device_cap[0] <= 7: - if (triton_version >= TRITON_VER_220 - and triton_version <= TRITON_VER_231): + if triton_version <= TRITON_VER_231: err = RuntimeError( 'Attention triton kernel does not fully support ' - 'triton[2.2.0~2.3.1] on device with capability<8. ' + 'triton<3.0.0 on device with capability<8. ' 'Please upgrade your triton version.') _handle_exception(err, 'Triton', logger) @@ -142,7 +140,8 @@ def check_awq(hf_config): def check_transformers_version(model_path: str, - trust_remote_code: bool = True): + trust_remote_code: bool = True, + dtype: str = 'auto'): """check transformers version.""" from packaging import version logger = get_logger('lmdeploy') @@ -206,7 +205,8 @@ def __check_model_dtype_support(config): try: model_config = ModelConfig.from_hf_config(config, - model_path=model_path) + model_path=model_path, + dtype=dtype) if model_config.dtype == torch.bfloat16: assert torch.cuda.is_bf16_supported(), ( 'bf16 is not supported on your device') @@ -229,11 +229,13 @@ def __check_model_dtype_support(config): check_awq(config) -def check_model(model_path: str, trust_remote_code: bool = True): +def check_model(model_path: str, + trust_remote_code: bool = True, + dtype: str = 'auto'): """check model requirements.""" logger = get_logger('lmdeploy') logger.info('Checking model.') - check_transformers_version(model_path, trust_remote_code) + check_transformers_version(model_path, trust_remote_code, dtype) def check_adapter(path: str): diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 58d319ef8c..f6ce4c29a1 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -96,7 +96,7 @@ def __init__(self, else: engine_config = copy.deepcopy(engine_config) check_env(engine_config.device_type) - check_model(model_path, trust_remote_code) + check_model(model_path, trust_remote_code, engine_config.dtype) if engine_config.max_batch_size is None: engine_config.max_batch_size = get_max_batch_size( engine_config.device_type) diff --git a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py index a9a6cab010..9ef614fadd 100644 --- a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py +++ b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py @@ -378,12 +378,21 @@ def fill_kv_cache(k_states: Tensor, block_offsets: Tensor, k_scales_zeros: Tensor = None, v_scales_zeros: Tensor = None, - quant_policy: Literal[0, 4, 8] = 0): + quant_policy: Literal[0, 4, 8] = 0, + kv_layout: str = 'bshd'): """fill key/value state to cache for paged attention.""" + if kv_layout == 'bshd': + b_dim, s_dim, h_dim, d_dim = (0, 1, 2, 3) + elif kv_layout == 'bhsd': + b_dim, s_dim, h_dim, d_dim = (0, 2, 1, 3) + else: + raise RuntimeError('Unsupported layout.') block_offsets = block_offsets.contiguous() batch_size = block_offsets.size(0) - block_size, num_heads, head_dim = k_caches.size()[1:] + block_size = k_caches.size(s_dim) + num_heads = k_caches.size(h_dim) + head_dim = k_caches.size(d_dim) head_dim_v = v_states.size(-1) max_num_blocks = triton.cdiv(max_q_seq_length, block_size) + 1 @@ 
-412,14 +421,14 @@ def fill_kv_cache(k_states: Tensor, stride_vss=v_states.stride(-3), stride_vsh=v_states.stride(-2), stride_vsd=v_states.stride(-1), - stride_kcn=k_caches.stride(0), - stride_kcb=k_caches.stride(1), - stride_kch=k_caches.stride(2), - stride_kcd=k_caches.stride(3), - stride_vcn=v_caches.stride(0), - stride_vcb=v_caches.stride(1), - stride_vch=v_caches.stride(2), - stride_vcd=v_caches.stride(3), + stride_kcn=k_caches.stride(b_dim), + stride_kcb=k_caches.stride(s_dim), + stride_kch=k_caches.stride(h_dim), + stride_kcd=k_caches.stride(d_dim), + stride_vcn=v_caches.stride(b_dim), + stride_vcb=v_caches.stride(s_dim), + stride_vch=v_caches.stride(h_dim), + stride_vcd=v_caches.stride(d_dim), stride_boff=block_offsets.stride(0), BLOCK=BLOCK, BLOCK_D=BLOCK_D, @@ -450,22 +459,22 @@ def fill_kv_cache(k_states: Tensor, stride_vss=v_states.stride(-3), stride_vsh=v_states.stride(-2), stride_vsd=v_states.stride(-1), - stride_kcn=k_caches.stride(0), - stride_kcb=k_caches.stride(1), - stride_kch=k_caches.stride(2), - stride_kcd=k_caches.stride(3), - stride_vcn=v_caches.stride(0), - stride_vcb=v_caches.stride(1), - stride_vch=v_caches.stride(2), - stride_vcd=v_caches.stride(3), - stride_kszn=k_scales_zeros.stride(0), - stride_kszb=k_scales_zeros.stride(1), - stride_kszh=k_scales_zeros.stride(2), - stride_kszd=k_scales_zeros.stride(3), - stride_vszn=v_scales_zeros.stride(0), - stride_vszb=v_scales_zeros.stride(1), - stride_vszh=v_scales_zeros.stride(2), - stride_vszd=v_scales_zeros.stride(3), + stride_kcn=k_caches.stride(b_dim), + stride_kcb=k_caches.stride(s_dim), + stride_kch=k_caches.stride(h_dim), + stride_kcd=k_caches.stride(d_dim), + stride_vcn=v_caches.stride(b_dim), + stride_vcb=v_caches.stride(s_dim), + stride_vch=v_caches.stride(h_dim), + stride_vcd=v_caches.stride(d_dim), + stride_kszn=k_scales_zeros.stride(b_dim), + stride_kszb=k_scales_zeros.stride(s_dim), + stride_kszh=k_scales_zeros.stride(h_dim), + stride_kszd=k_scales_zeros.stride(d_dim), + stride_vszn=v_scales_zeros.stride(b_dim), + stride_vszb=v_scales_zeros.stride(s_dim), + stride_vszh=v_scales_zeros.stride(h_dim), + stride_vszd=v_scales_zeros.stride(d_dim), quant_policy=quant_policy, stride_boff=block_offsets.stride(0), BLOCK=BLOCK, diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index aa363d2bd4..d8e6ec5013 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -15,18 +15,15 @@ logger = get_logger('lmdeploy') TRITON_VERSION = version.parse(triton.__version__) +VERSION_300 = version.parse('3.0.0') -assert TRITON_VERSION >= version.parse('2.1.0') +assert TRITON_VERSION >= version.parse('2.2.0') -if TRITON_VERSION >= version.parse('3.0.0'): - - @triton.jit - def tanh(x): - """tanh.""" - return 2 * tl.sigmoid(2 * x) - 1 - - fast_expf = tl.math.exp - fast_dividef = tl.math.fdiv +# TODO: fast op might not work on non-nv device +if TRITON_VERSION >= VERSION_300: + tanh = tl.extra.cuda.libdevice.tanh + fast_expf = tl.extra.cuda.libdevice.fast_expf + fast_dividef = tl.extra.cuda.libdevice.fast_dividef else: tanh = tl.math.tanh fast_expf = tl.math.fast_expf @@ -38,7 +35,9 @@ def tanh(x): triton.Config({}, num_stages=2, num_warps=8), triton.Config({}, num_stages=2, num_warps=4), ], - key=['BLOCK_H', 'BLOCK_N', 'BLOCK_DMODEL', 'BLOCK_DV']) + key=['BLOCK_H', 'BLOCK_N', 'BLOCK_DMODEL', 'BLOCK_DV'], + warmup=10, + rep=25) @wrap_jit_func(type_hint=dict( Q=torch.Tensor, K=torch.Tensor, @@ -235,9 +234,13 @@ def 
_fwd_grouped_split_kernel( m_i = m_i_new # initialize pointers to output - off_acc = (cur_batch * stride_obs + split_k_id * stride_ok + - cur_head[:, None] * stride_oh + offs_dv[None, :] * stride_od) - tl.store(Acc_out + off_acc, acc, mask=mask_h[:, None] & mask_dv[None, :]) + if loop_end > loop_start: + off_acc = (cur_batch * stride_obs + split_k_id * stride_ok + + cur_head[:, None] * stride_oh + + offs_dv[None, :] * stride_od) + tl.store(Acc_out + off_acc, + acc, + mask=mask_h[:, None] & mask_dv[None, :]) off_meta = (cur_batch * stride_obs + split_k_id * stride_ok + cur_head * stride_oh + head_size_v) @@ -515,9 +518,13 @@ def _fwd_grouped_split_quant_kernel( m_i = m_i_new # initialize pointers to output - off_acc = (cur_batch * stride_obs + split_k_id * stride_ok + - cur_head[:, None] * stride_oh + offs_dv[None, :] * stride_od) - tl.store(Acc_out + off_acc, acc, mask=mask_h[:, None] & mask_dv[None, :]) + if loop_end > loop_start: + off_acc = (cur_batch * stride_obs + split_k_id * stride_ok + + cur_head[:, None] * stride_oh + + offs_dv[None, :] * stride_od) + tl.store(Acc_out + off_acc, + acc, + mask=mask_h[:, None] & mask_dv[None, :]) if quant_policy == 4: off_meta = (cur_batch * stride_obs + split_k_id * stride_ok + @@ -572,9 +579,11 @@ def _reduce_split_kernel( offs_mi = (cur_batch * stride_abs + cur_head * stride_ah + stride_ak * offs_k + head_size_v) - acc_k = tl.load(Acc + offs_acc, mask=mask_dv[None, :], other=0.0) m_k = tl.load(Acc + offs_mi) l_k = tl.load(Acc + offs_mi + 1) + acc_k = tl.load(Acc + offs_acc, + mask=mask_dv[None, :] & (m_k[:, None] > -float('inf')), + other=0.0) m_max = tl.max(m_k, 0) alpha = fast_expf(m_k - m_max) @@ -592,7 +601,8 @@ def _reduce_split_kernel( def _get_convert_pv(nv_capability): """lazy load convert_pv.""" - if nv_capability[0] >= 8: + global TRITON_VERSION, VERSION_300 + if TRITON_VERSION >= VERSION_300 or nv_capability[0] >= 8: @triton.jit def convert_pv(p, v): @@ -620,7 +630,6 @@ def convert_pv(p, v): # triton.Config({}, num_stages=1, num_warps=4), # ], # key=['BLOCK_M', 'BLOCK_N', 'BLOCK_DMODEL', 'BLOCK_DV']) -@wrap_jit_func @triton.jit def _fwd_kernel( Q, @@ -647,7 +656,7 @@ def _fwd_kernel( stride_oh: tl.constexpr, stride_od: tl.constexpr, stride_boffb, - kv_group_num: tl.constexpr, + kv_group_num, window_size: tl.constexpr, head_size: tl.constexpr, head_size_v: tl.constexpr, @@ -660,18 +669,16 @@ def _fwd_kernel( ): """paged attention kernel.""" cur_batch = tl.program_id(2) - cur_head = tl.program_id(1) + cur_kv_head = tl.program_id(1) start_m = tl.program_id(0) - cur_kv_head = cur_head // kv_group_num - q_seqlen = tl.load(Q_seqlens + cur_batch) kv_seqlen = tl.load(KV_seqlens + cur_batch) q_start_loc = tl.load(Q_start_loc + cur_batch) history_len = kv_seqlen - q_seqlen block_start_loc = BLOCK_M * start_m - if block_start_loc >= q_seqlen: + if block_start_loc >= q_seqlen * kv_group_num: return # initialize offsets @@ -682,17 +689,22 @@ def _fwd_kernel( offs_d = offs_d % head_size mask_dv = offs_dv < head_size_v offs_dv = offs_dv % head_size_v - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_mh = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_m = offs_mh // kv_group_num + cur_head = offs_mh % kv_group_num + cur_kv_head * kv_group_num off_q = ((q_start_loc + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) + cur_head[:, None] * stride_qh + offs_d[None, :] * stride_qd) off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + offs_n[None, :] * stride_kbs) off_v = (cur_kv_head * stride_vh + 
offs_dv[None, :] * stride_vd + offs_n[:, None] * stride_vbs) - q = tl.load(Q + off_q, - mask=(offs_m[:, None] < q_seqlen) & mask_d[None, :], - other=0.0) + q = tl.load( + Q + off_q, + mask=(offs_m[:, None] < q_seqlen) & mask_d[None, :], + other=0.0, + eviction_policy='evict_first', + ) k_ptrs = K + off_k v_ptrs = V + off_v @@ -702,7 +714,7 @@ def _fwd_kernel( mask_d1 = offs_d1 < head_size offs_d1 = offs_d1 % head_size off_q1 = ((q_start_loc + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d1[None, :] * stride_qd) + cur_head[:, None] * stride_qh + offs_d1[None, :] * stride_qd) q1 = tl.load(Q + off_q1, mask=(offs_m[:, None] < q_seqlen) & mask_d1) off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + offs_n[None, :] * stride_kbs) @@ -722,7 +734,56 @@ def _fwd_kernel( kv_start_loc = start_block_id * BLOCK_N block_offset_ptrs += start_block_id - for start_n in range(kv_start_loc, kv_seqlen, BLOCK_N): + loop_start = kv_start_loc + loop_end = history_len // BLOCK_N * BLOCK_N + for start_n in range(loop_start, loop_end, BLOCK_N): + b_offset = tl.load(block_offset_ptrs) + block_offset_ptrs += 1 + + # -- compute qk ---- + k = tl.load(k_ptrs + b_offset * stride_kp) + if BLOCK_DMODEL1 != 0: + k1 = tl.load(k1_ptrs + b_offset * stride_kp) + + v = tl.load(v_ptrs + b_offset * stride_vp) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + if BLOCK_DMODEL1 != 0: + qk += tl.dot(q1, k1) + qk *= sm_scale + if logit_softcapping > 0.0: + qk = qk / logit_softcapping + qk = tanh(qk) + qk = qk * logit_softcapping + # NOTE: inf - inf = nan, and nan will leads to error + if window_size > 0: + qk_mask = ((start_n + offs_n[None, :]) >= kv_min_loc[:, None]) + qk = tl.where( + qk_mask, + qk, + float(-1e30), + ) + + # -- compute p, m_i and l_i + m_i_new = tl.maximum(m_i, tl.max(qk, 1)) + p = fast_expf(qk - m_i_new[:, None]) + alpha = fast_expf(m_i - m_i_new) + l_i_new = alpha * l_i + tl.sum(p, 1) + # -- update output accumulator -- + # scale acc + acc = acc * alpha[:, None] + + # update acc + p, v = _convert_pv(p, v) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + loop_start = loop_end + loop_end = kv_seqlen + for start_n in range(loop_start, loop_end, BLOCK_N): b_offset = tl.load(block_offset_ptrs) block_offset_ptrs += 1 @@ -773,7 +834,7 @@ def _fwd_kernel( acc = fast_dividef(acc, l_i[:, None]) # initialize pointers to output off_o = ((q_start_loc + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_dv[None, :] * stride_od) + cur_head[:, None] * stride_oh + offs_dv[None, :] * stride_od) out_ptrs = Out + off_o tl.store(out_ptrs, acc, @@ -825,7 +886,7 @@ def _fwd_kernel_quant( stride_oh: tl.constexpr, stride_od: tl.constexpr, stride_boffb, - kv_group_num: tl.constexpr, + kv_group_num, window_size: tl.constexpr, head_size: tl.constexpr, head_size_v: tl.constexpr, @@ -845,18 +906,16 @@ def _fwd_kernel_quant( stride_d: stride of head size dim """ cur_batch = tl.program_id(2) - cur_head = tl.program_id(1) + cur_kv_head = tl.program_id(1) start_m = tl.program_id(0) - cur_kv_head = cur_head // kv_group_num - q_seqlen = tl.load(Q_seqlens + cur_batch) kv_seqlen = tl.load(KV_seqlens + cur_batch) q_start_loc = tl.load(Q_start_loc + cur_batch) history_len = kv_seqlen - q_seqlen block_start_loc = BLOCK_M * start_m - if block_start_loc >= q_seqlen: + if block_start_loc >= q_seqlen * kv_group_num: return # initialize offsets @@ -868,9 +927,11 @@ def _fwd_kernel_quant( offs_d = offs_d % head_size mask_dv = offs_dv < head_size_v offs_dv = 
offs_dv % head_size_v - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_mh = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_m = offs_mh // kv_group_num + cur_head = offs_mh % kv_group_num + cur_kv_head * kv_group_num off_q = ((q_start_loc + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) + cur_head[:, None] * stride_qh + offs_d[None, :] * stride_qd) off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + offs_n[None, :] * stride_kbs) off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + @@ -892,7 +953,7 @@ def _fwd_kernel_quant( mask_d1 = offs_d1 < head_size offs_d1 = offs_d1 % head_size off_q1 = ((q_start_loc + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d1[None, :] * stride_qd) + cur_head[:, None] * stride_qh + offs_d1[None, :] * stride_qd) q1 = tl.load(Q + off_q1, mask=(offs_m[:, None] < q_seqlen) & mask_d1) off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + offs_n[None, :] * stride_kbs) @@ -999,7 +1060,7 @@ def _fwd_kernel_quant( acc = fast_dividef(acc, l_i[:, None]) # initialize pointers to output off_o = ((q_start_loc + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_dv[None, :] * stride_od) + cur_head[:, None] * stride_oh + offs_dv[None, :] * stride_od) out_ptrs = Out + off_o tl.store(out_ptrs, acc, @@ -1022,6 +1083,7 @@ def paged_attention_fwd( window_size: int = None, sm_scale: float = None, logit_softcapping: float = None, + kv_layout: str = 'bshd', ): """Paged Attention forward. @@ -1042,6 +1104,13 @@ def paged_attention_fwd( nv_cap = torch.cuda.get_device_capability() _convert_pv = _get_convert_pv(nv_cap) + if kv_layout == 'bshd': + b_dim, s_dim, h_dim, d_dim = (0, 1, 2, 3) + elif kv_layout == 'bhsd': + b_dim, s_dim, h_dim, d_dim = (0, 2, 1, 3) + else: + raise RuntimeError('Unsupported layout.') + if window_size is None: window_size = -1 @@ -1059,7 +1128,7 @@ def _get_block_d(Lk): return BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV # shape constraints - Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] + Lq, Lk, Lv = q.shape[-1], k.shape[d_dim], v.shape[d_dim] if quant_policy == 4: assert Lq == Lk * 2 and Lv * 2 == o.shape[-1] else: @@ -1068,9 +1137,9 @@ def _get_block_d(Lk): if sm_scale is None: sm_scale = 1.0 / (Lq**0.5) batch, head = q_seqlens.shape[0], q.shape[-2] - kv_group_num = q.shape[-2] // k.shape[-2] + kv_group_num = q.shape[-2] // k.shape[h_dim] - BLOCK = k.size(1) + BLOCK = k.size(s_dim) assert BLOCK >= 16 if Lq > 512 and BLOCK > 32: logger.warning(f'`head_dim={Lq}` and `block_size={BLOCK}` ' @@ -1084,7 +1153,9 @@ def _get_block_d(Lk): BLOCK_M = max(16, min(BLOCK, 16384 // BLOCK_DMODEL)) num_warps = 4 num_stages = 2 - grid = (triton.cdiv(max_seqlen, BLOCK_M), head, batch) + kv_head = k.shape[h_dim] + grid = (triton.cdiv(max_seqlen * kv_group_num, + BLOCK_M), kv_head, batch) if quant_policy > 0: _fwd_kernel_quant[grid](q, k, @@ -1100,22 +1171,22 @@ def _get_block_d(Lk): stride_qbs=q.stride(-3), stride_qh=q.stride(-2), stride_qd=q.stride(-1), - stride_kp=k.stride(-4), - stride_kbs=k.stride(-3), - stride_kh=k.stride(-2), - stride_kd=k.stride(-1), - stride_vp=v.stride(-4), - stride_vbs=v.stride(-3), - stride_vh=v.stride(-2), - stride_vd=v.stride(-1), - stride_kszp=k_scales_zeros.stride(-4), - stride_kszbs=k_scales_zeros.stride(-3), - stride_kszh=k_scales_zeros.stride(-2), - stride_kszd=k_scales_zeros.stride(-1), - stride_vszp=v_scales_zeros.stride(-4), - stride_vszbs=v_scales_zeros.stride(-3), - stride_vszh=v_scales_zeros.stride(-2), - 
stride_vszd=v_scales_zeros.stride(-1), + stride_kp=k.stride(b_dim), + stride_kbs=k.stride(s_dim), + stride_kh=k.stride(h_dim), + stride_kd=k.stride(d_dim), + stride_vp=v.stride(b_dim), + stride_vbs=v.stride(s_dim), + stride_vh=v.stride(h_dim), + stride_vd=v.stride(d_dim), + stride_kszp=k_scales_zeros.stride(b_dim), + stride_kszbs=k_scales_zeros.stride(s_dim), + stride_kszh=k_scales_zeros.stride(h_dim), + stride_kszd=k_scales_zeros.stride(d_dim), + stride_vszp=v_scales_zeros.stride(b_dim), + stride_vszbs=v_scales_zeros.stride(s_dim), + stride_vszh=v_scales_zeros.stride(h_dim), + stride_vszd=v_scales_zeros.stride(d_dim), quant_policy=quant_policy, stride_obs=o.stride(-3), stride_oh=o.stride(-2), @@ -1147,14 +1218,14 @@ def _get_block_d(Lk): stride_qbs=q.stride(-3), stride_qh=q.stride(-2), stride_qd=q.stride(-1), - stride_kp=k.stride(-4), - stride_kbs=k.stride(-3), - stride_kh=k.stride(-2), - stride_kd=k.stride(-1), - stride_vp=v.stride(-4), - stride_vbs=v.stride(-3), - stride_vh=v.stride(-2), - stride_vd=v.stride(-1), + stride_kp=k.stride(b_dim), + stride_kbs=k.stride(s_dim), + stride_kh=k.stride(h_dim), + stride_kd=k.stride(d_dim), + stride_vp=v.stride(b_dim), + stride_vbs=v.stride(s_dim), + stride_vh=v.stride(h_dim), + stride_vd=v.stride(d_dim), stride_obs=o.stride(-3), stride_oh=o.stride(-2), stride_od=o.stride(-1), @@ -1209,22 +1280,22 @@ def _get_block_d(Lk): stride_qbs=q.stride(-3), stride_qh=q.stride(-2), stride_qd=q.stride(-1), - stride_kp=k.stride(-4), - stride_kbs=k.stride(-3), - stride_kh=k.stride(-2), - stride_kd=k.stride(-1), - stride_vp=v.stride(-4), - stride_vbs=v.stride(-3), - stride_vh=v.stride(-2), - stride_vd=v.stride(-1), - stride_kszp=k_scales_zeros.stride(-4), - stride_kszbs=k_scales_zeros.stride(-3), - stride_kszh=k_scales_zeros.stride(-2), - stride_kszd=k_scales_zeros.stride(-1), - stride_vszp=v_scales_zeros.stride(-4), - stride_vszbs=v_scales_zeros.stride(-3), - stride_vszh=v_scales_zeros.stride(-2), - stride_vszd=v_scales_zeros.stride(-1), + stride_kp=k.stride(b_dim), + stride_kbs=k.stride(s_dim), + stride_kh=k.stride(h_dim), + stride_kd=k.stride(d_dim), + stride_vp=v.stride(b_dim), + stride_vbs=v.stride(s_dim), + stride_vh=v.stride(h_dim), + stride_vd=v.stride(d_dim), + stride_kszp=k_scales_zeros.stride(b_dim), + stride_kszbs=k_scales_zeros.stride(s_dim), + stride_kszh=k_scales_zeros.stride(h_dim), + stride_kszd=k_scales_zeros.stride(d_dim), + stride_vszp=v_scales_zeros.stride(b_dim), + stride_vszbs=v_scales_zeros.stride(s_dim), + stride_vszh=v_scales_zeros.stride(h_dim), + stride_vszd=v_scales_zeros.stride(d_dim), quant_policy=quant_policy, stride_ok=acc.stride(-2), stride_obs=acc.stride(-4), @@ -1257,14 +1328,14 @@ def _get_block_d(Lk): stride_qbs=q.stride(-3), stride_qh=q.stride(-2), stride_qd=q.stride(-1), - stride_kp=k.stride(-4), - stride_kbs=k.stride(-3), - stride_kh=k.stride(-2), - stride_kd=k.stride(-1), - stride_vp=v.stride(-4), - stride_vbs=v.stride(-3), - stride_vh=v.stride(-2), - stride_vd=v.stride(-1), + stride_kp=k.stride(b_dim), + stride_kbs=k.stride(s_dim), + stride_kh=k.stride(h_dim), + stride_kd=k.stride(d_dim), + stride_vp=v.stride(b_dim), + stride_vbs=v.stride(s_dim), + stride_vh=v.stride(h_dim), + stride_vd=v.stride(d_dim), stride_ok=acc.stride(-2), stride_obs=acc.stride(-4), stride_oh=acc.stride(-3), @@ -1291,13 +1362,13 @@ def _get_block_d(Lk): BLOCK_DV *= 2 _reduce_split_kernel[grid](acc, o, - stride_ak=acc.stride(-2), - stride_abs=acc.stride(-4), - stride_ah=acc.stride(-3), - stride_ad=acc.stride(-1), - stride_obs=o.stride(-3), - 
stride_oh=o.stride(-2), - stride_od=o.stride(-1), + stride_ak=acc.stride(2), + stride_abs=acc.stride(0), + stride_ah=acc.stride(1), + stride_ad=acc.stride(3), + stride_obs=o.stride(0), + stride_oh=o.stride(1), + stride_od=o.stride(2), SPLIT_K=SPLIT_K, head_size_v=Lv, BLOCK_DV=BLOCK_DV, diff --git a/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py b/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py index 5fdaa5802c..0d0e10ec83 100644 --- a/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py +++ b/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py @@ -3,9 +3,16 @@ import torch.nn.functional as F import triton import triton.language as tl +from packaging import version from .triton_utils import get_kernel_meta +TRITON_VERSION = version.parse(triton.__version__) +if TRITON_VERSION >= version.parse('3.0.0'): + tl_round = tl.extra.cuda.libdevice.round +else: + tl_round = tl.math.round + def per_channel_quant(x, n_bits, dtype): """Quantize the input tensor 'x' channel-wise using the given number of @@ -305,7 +312,7 @@ def _per_token_quant_int8( # Quant _absmax = tl.maximum(tl.max(tl.abs(y)), eps) y_s = _absmax / 127 - y_q = tl.math.round(y / y_s).to(tl.int8) + y_q = tl_round(y / y_s).to(tl.int8) tl.store(y_q_ptr + cols, y_q, mask=mask) tl.store(y_s_ptr, y_s) @@ -373,7 +380,7 @@ def _rms_norm_fwd_fused_dynamic_symmetric( scale = tl.max(tl.abs(y)).to(tl.float32) / 127 tl.store(Scale + row, scale) - y = tl.math.round(y / scale) + y = tl_round(y / scale) y = tl.minimum(y, 127) y = tl.maximum(y, -128) tl.store(Y + cols, y, mask=mask) diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 7e5058c17b..7fb2491014 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -15,8 +15,8 @@ safetensors sentencepiece shortuuid tiktoken -torch<=2.3.1,>=2.0.0 +torch<=2.4.0,>=2.0.0 torchvision<=0.18.1,>=0.15.0 transformers -triton>=2.1.0,<=3.0.0; sys_platform == "linux" +triton>=2.2.0,<=3.0.0; sys_platform == "linux" uvicorn diff --git a/tests/pytorch/kernel/test_paged_attention.py b/tests/pytorch/kernel/test_paged_attention.py index 5d4b024199..7f63b281c5 100644 --- a/tests/pytorch/kernel/test_paged_attention.py +++ b/tests/pytorch/kernel/test_paged_attention.py @@ -26,9 +26,16 @@ def _make_bias(seq_lens, history_lens, neg_val): return mask.float() * neg_val -def _make_blocked_cache(batched_k, batched_v, seq_lens, history_lens, - block_offsets, block_size, num_heads_k, feat_dim, - feat_dim_v): +def _make_blocked_cache(batched_k, + batched_v, + seq_lens, + history_lens, + block_offsets, + block_size, + num_heads_k, + feat_dim, + feat_dim_v, + layout: str = 'bshd'): max_blocks_nums = block_offsets.max() + 1 full_seq_lens = seq_lens + history_lens blocked_k = batched_k.new_zeros(max_blocks_nums, block_size, num_heads_k, @@ -48,6 +55,10 @@ def _make_blocked_cache(batched_k, batched_v, seq_lens, history_lens, blocked_k[block_off, :size] = tmp_k blocked_v[block_off, :size] = tmp_v + if layout == 'bhsd': + blocked_k = blocked_k.transpose(1, 2).contiguous() + blocked_v = blocked_v.transpose(1, 2).contiguous() + return blocked_k, blocked_v @@ -129,6 +140,10 @@ def num_heads_k(self, request): def block_size(self, request): yield request.param + @pytest.fixture + def layout(self, request): + yield request.param + @pytest.fixture def seq_lens(self, request): yield torch.tensor(request.param, device='cuda') @@ -208,11 +223,11 @@ def conti_kv(self, batched_kv, seq_lens, history_lens): @pytest.fixture def blocked_kv(self, batched_kv, seq_lens, history_lens, block_offsets, - block_size, 
num_heads_k, feat_dim, feat_dim_v): + block_size, num_heads_k, feat_dim, feat_dim_v, layout): batched_k, batched_v = batched_kv yield _make_blocked_cache(batched_k, batched_v, seq_lens, history_lens, block_offsets, block_size, num_heads_k, - feat_dim, feat_dim_v) + feat_dim, feat_dim_v, layout) @pytest.fixture def mask(self, seq_lens, history_lens): @@ -236,9 +251,10 @@ def conti_gt(self, gt, seq_lens): ([1, 1, 1, 1], [50, 40, 30, 20])], indirect=True) @pytest.mark.parametrize('block_size', [16], indirect=True) + @pytest.mark.parametrize('layout', ['bshd', 'bhsd'], indirect=True) def test_paged_attention(self, conti_q, blocked_kv, block_offsets, start_loc, seq_lens, history_lens, feat_dim_v, - conti_gt): + layout, conti_gt): from lmdeploy.pytorch.kernels import paged_attention_fwd kv_seq_lens = seq_lens + history_lens max_seq_len = seq_lens.max().item() @@ -254,7 +270,8 @@ def test_paged_attention(self, conti_q, blocked_kv, block_offsets, q_start_loc=start_loc, q_seqlens=seq_lens, kv_seqlens=kv_seq_lens, - max_seqlen=max_seq_len) + max_seqlen=max_seq_len, + kv_layout=layout) torch.testing.assert_close(out, conti_gt, atol=1e-3, rtol=1e-5) @pytest.fixture @@ -282,9 +299,10 @@ def window_gt(self, conti_q, conti_kv, seq_lens, history_lens, win_size): indirect=True) @pytest.mark.parametrize('win_size', (32, ), indirect=True) @pytest.mark.parametrize('block_size', [16], indirect=True) + @pytest.mark.parametrize('layout', ['bshd'], indirect=True) def test_window_attention(self, conti_q, blocked_kv, block_offsets, start_loc, seq_lens, history_lens, feat_dim_v, - win_size, window_gt): + win_size, layout, window_gt): from lmdeploy.pytorch.kernels import paged_attention_fwd kv_seq_lens = seq_lens + history_lens max_seq_len = seq_lens.max().item() @@ -300,7 +318,8 @@ def test_window_attention(self, conti_q, blocked_kv, block_offsets, q_seqlens=seq_lens, kv_seqlens=kv_seq_lens, max_seqlen=max_seq_len, - window_size=win_size) + window_size=win_size, + kv_layout=layout) torch.testing.assert_close(out, window_gt, atol=1e-3, rtol=1e-5) From e98ed5b41ac1cbf9f746b8865f17bf4a61ddb6fb Mon Sep 17 00:00:00 2001 From: CyCle1024 Date: Fri, 18 Oct 2024 17:16:49 +0800 Subject: [PATCH 010/122] refactor for multiple devices in dlinfer (#2619) --- lmdeploy/pytorch/backends/dlinfer/__init__.py | 2 + .../{ascend => dlinfer}/apply_rotary_emb.py | 8 +- .../backends/{ => dlinfer}/ascend/__init__.py | 0 .../{ => dlinfer}/ascend/op_backend.py | 33 +------- .../backends/{ascend => dlinfer}/attention.py | 36 ++++----- .../backends/{ascend => dlinfer}/moe.py | 22 +++--- .../backends/{ascend => dlinfer}/norm.py | 12 +-- .../pytorch/backends/dlinfer/op_backend.py | 79 +++++++++++++++++++ lmdeploy/pytorch/backends/selector.py | 2 +- .../kernels/{ascend => dlinfer}/__init__.py | 0 .../apply_rotary_pos_emb.py | 0 .../{ascend => dlinfer}/fill_kv_cache.py | 0 .../{ascend => dlinfer}/fused_rotary_emb.py | 0 .../moe_gating_topk_softmax.py | 0 .../{ascend => dlinfer}/pagedattention.py | 0 .../kernels/{ascend => dlinfer}/rms_norm.py | 0 16 files changed, 123 insertions(+), 71 deletions(-) create mode 100644 lmdeploy/pytorch/backends/dlinfer/__init__.py rename lmdeploy/pytorch/backends/{ascend => dlinfer}/apply_rotary_emb.py (78%) rename lmdeploy/pytorch/backends/{ => dlinfer}/ascend/__init__.py (100%) rename lmdeploy/pytorch/backends/{ => dlinfer}/ascend/op_backend.py (75%) rename lmdeploy/pytorch/backends/{ascend => dlinfer}/attention.py (76%) rename lmdeploy/pytorch/backends/{ascend => dlinfer}/moe.py (77%) rename 
lmdeploy/pytorch/backends/{ascend => dlinfer}/norm.py (71%) create mode 100644 lmdeploy/pytorch/backends/dlinfer/op_backend.py rename lmdeploy/pytorch/kernels/{ascend => dlinfer}/__init__.py (100%) rename lmdeploy/pytorch/kernels/{ascend => dlinfer}/apply_rotary_pos_emb.py (100%) rename lmdeploy/pytorch/kernels/{ascend => dlinfer}/fill_kv_cache.py (100%) rename lmdeploy/pytorch/kernels/{ascend => dlinfer}/fused_rotary_emb.py (100%) rename lmdeploy/pytorch/kernels/{ascend => dlinfer}/moe_gating_topk_softmax.py (100%) rename lmdeploy/pytorch/kernels/{ascend => dlinfer}/pagedattention.py (100%) rename lmdeploy/pytorch/kernels/{ascend => dlinfer}/rms_norm.py (100%) diff --git a/lmdeploy/pytorch/backends/dlinfer/__init__.py b/lmdeploy/pytorch/backends/dlinfer/__init__.py new file mode 100644 index 0000000000..947e66e0ce --- /dev/null +++ b/lmdeploy/pytorch/backends/dlinfer/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ascend import AscendOpsBackend # noqa: F401 diff --git a/lmdeploy/pytorch/backends/ascend/apply_rotary_emb.py b/lmdeploy/pytorch/backends/dlinfer/apply_rotary_emb.py similarity index 78% rename from lmdeploy/pytorch/backends/ascend/apply_rotary_emb.py rename to lmdeploy/pytorch/backends/dlinfer/apply_rotary_emb.py index 2f3dce560c..c2bc1f7dce 100644 --- a/lmdeploy/pytorch/backends/ascend/apply_rotary_emb.py +++ b/lmdeploy/pytorch/backends/dlinfer/apply_rotary_emb.py @@ -2,12 +2,12 @@ import torch from torch import Tensor -from lmdeploy.pytorch.kernels.ascend import apply_rotary_pos_emb +from lmdeploy.pytorch.kernels.dlinfer import apply_rotary_pos_emb from ..apply_rotary_emb import ApplyRotaryEmbBuilder, ApplyRotaryEmbImpl -class AscendApplyRotaryEmbImpl(ApplyRotaryEmbImpl): +class DlinferApplyRotaryEmbImpl(ApplyRotaryEmbImpl): """Apply rotary embedding implementation.""" def forward(self, @@ -26,10 +26,10 @@ def forward(self, return apply_rotary_pos_emb(query, key, cos, sin, q_embed, k_embed) -class AscendApplyRotaryEmbBuilder(ApplyRotaryEmbBuilder): +class DlinferApplyRotaryEmbBuilder(ApplyRotaryEmbBuilder): """Apply rotary embedding implementation builder.""" @staticmethod def build(): """build implementation.""" - return AscendApplyRotaryEmbImpl() + return DlinferApplyRotaryEmbImpl() diff --git a/lmdeploy/pytorch/backends/ascend/__init__.py b/lmdeploy/pytorch/backends/dlinfer/ascend/__init__.py similarity index 100% rename from lmdeploy/pytorch/backends/ascend/__init__.py rename to lmdeploy/pytorch/backends/dlinfer/ascend/__init__.py diff --git a/lmdeploy/pytorch/backends/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py similarity index 75% rename from lmdeploy/pytorch/backends/ascend/op_backend.py rename to lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 45918febe2..53eea622ea 100644 --- a/lmdeploy/pytorch/backends/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -5,13 +5,12 @@ from lmdeploy.utils import get_logger -from ..base import OpType -from ..default import DefaultOpsBackend +from ..op_backend import DlinferOpsBackend logger = get_logger('lmdeploy') -class AscendOpsBackend(DefaultOpsBackend): +class AscendOpsBackend(DlinferOpsBackend): """ascend layer backend.""" @staticmethod @@ -19,34 +18,6 @@ def get_name() -> str: """backend name.""" return 'ascend' - @classmethod - def get_layer_impl_builder(cls, layer_type: OpType): - """get ascend layer builder.""" - if layer_type == OpType.Attention: - from .attention import AscendAttentionBuilder - return 
AscendAttentionBuilder - elif layer_type == OpType.ApplyRotaryEmb: - from .apply_rotary_emb import AscendApplyRotaryEmbBuilder - return AscendApplyRotaryEmbBuilder - elif layer_type == OpType.RMSNorm: - from .norm import AscendRMSNormBuilder - return AscendRMSNormBuilder - elif layer_type == OpType.SoftmaxTopK: - from .moe import AscendSoftmaxTopKBuilder - return AscendSoftmaxTopKBuilder - elif layer_type == OpType.FusedMoE: - from .moe import AscendFusedMoEBuilder - return AscendFusedMoEBuilder - else: - logger.debug( - f'Op {layer_type} fallback to default implementation.') - return super().get_layer_impl_builder(layer_type) - - @staticmethod - def get_attention_metadata_cls(): - from .attention import AscendAttentionMetadata - return AscendAttentionMetadata - @staticmethod def get_k_block_shape( block_size: int, diff --git a/lmdeploy/pytorch/backends/ascend/attention.py b/lmdeploy/pytorch/backends/dlinfer/attention.py similarity index 76% rename from lmdeploy/pytorch/backends/ascend/attention.py rename to lmdeploy/pytorch/backends/dlinfer/attention.py index 1accbd3ecd..0d666c9130 100644 --- a/lmdeploy/pytorch/backends/ascend/attention.py +++ b/lmdeploy/pytorch/backends/dlinfer/attention.py @@ -8,7 +8,7 @@ @dataclass -class AscendAttentionMetadata(AttentionMetadata): +class DlinferAttentionMetadata(AttentionMetadata): kv_start_indices: Optional[Tensor] = None block_size: int = 64 attention_mask: Sequence[Tensor] = tuple() @@ -17,8 +17,8 @@ class AscendAttentionMetadata(AttentionMetadata): max_kv_seq_len: int = 1 -class AscendAttentionImpl(AttentionImpl[AscendAttentionMetadata]): - """ascend attention implementation.""" +class DlinferAttentionImpl(AttentionImpl[DlinferAttentionMetadata]): + """dlinfer attention implementation.""" def __init__( self, @@ -44,8 +44,8 @@ def __init__( **kwargs, ) - from lmdeploy.pytorch.kernels.ascend import (fill_kv_cache, - paged_attention_fwd) + from lmdeploy.pytorch.kernels.dlinfer import (fill_kv_cache, + paged_attention_fwd) self.fill_kv_cache = fill_kv_cache self.paged_attention_fwd = paged_attention_fwd @@ -56,7 +56,7 @@ def forward( value: Tensor, k_cache: Tensor, v_cache: Tensor, - attn_metadata: AscendAttentionMetadata, + attn_metadata: DlinferAttentionMetadata, k_scales_zeros: Tensor = None, v_scales_zeros: Tensor = None, inplace: bool = True, @@ -108,8 +108,8 @@ def forward( return attn_output -class AscendAttentionBuilder(AttentionBuilder[AscendAttentionMetadata]): - """ascend attention builder.""" +class DlinferAttentionBuilder(AttentionBuilder[DlinferAttentionMetadata]): + """dlinfer attention builder.""" @staticmethod def build( @@ -122,14 +122,14 @@ def build( sliding_window: int = None, logical_softcapping: float = None, **kwargs, - ) -> AscendAttentionImpl: + ) -> DlinferAttentionImpl: """build.""" - return AscendAttentionImpl(num_heads, - head_size, - scale=scale, - num_kv_heads=num_kv_heads, - v_head_size=v_head_size, - alibi_scale=alibi_scale, - sliding_window=sliding_window, - logical_softcapping=logical_softcapping, - **kwargs) + return DlinferAttentionImpl(num_heads, + head_size, + scale=scale, + num_kv_heads=num_kv_heads, + v_head_size=v_head_size, + alibi_scale=alibi_scale, + sliding_window=sliding_window, + logical_softcapping=logical_softcapping, + **kwargs) diff --git a/lmdeploy/pytorch/backends/ascend/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py similarity index 77% rename from lmdeploy/pytorch/backends/ascend/moe.py rename to lmdeploy/pytorch/backends/dlinfer/moe.py index 8f36be53af..a242e30417 100644 --- 
a/lmdeploy/pytorch/backends/ascend/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -2,14 +2,14 @@ import torch -from lmdeploy.pytorch.kernels.ascend import moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import moe_gating_topk_softmax from ..moe import (FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl) -class AscendSoftmaxTopKImpl(SoftmaxTopKImpl): - """ascend softmax topk implementation.""" +class DlinferSoftmaxTopKImpl(SoftmaxTopKImpl): + """dlinfer softmax topk implementation.""" def __init__(self, top_k: int, dim: int = -1): self.top_k = top_k @@ -22,17 +22,17 @@ def forward(self, x: torch.Tensor): torch.int64) -class AscendSoftmaxTopKBuilder(SoftmaxTopKBuilder): - """ascend softmax topk implementation builder.""" +class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder): + """dlinfer softmax topk implementation builder.""" @staticmethod def build(top_k: int, dim: int = -1): """build.""" - return AscendSoftmaxTopKImpl(top_k, dim) + return DlinferSoftmaxTopKImpl(top_k, dim) -class AscendFusedMoEImpl(FusedMoEImpl): - """ascend fused moe implementation.""" +class DlinferFusedMoEImpl(FusedMoEImpl): + """dlinfer fused moe implementation.""" def __init__(self, top_k: int, renormalize: bool = False): self.top_k = top_k @@ -68,10 +68,10 @@ def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, return moe_output -class AscendFusedMoEBuilder(FusedMoEBuilder): - """ascend fused moe builder.""" +class DlinferFusedMoEBuilder(FusedMoEBuilder): + """dlinfer fused moe builder.""" @staticmethod def build(top_k: int, renormalize: bool = False): """build from mlp.""" - return AscendFusedMoEImpl(top_k=top_k, renormalize=renormalize) + return DlinferFusedMoEImpl(top_k=top_k, renormalize=renormalize) diff --git a/lmdeploy/pytorch/backends/ascend/norm.py b/lmdeploy/pytorch/backends/dlinfer/norm.py similarity index 71% rename from lmdeploy/pytorch/backends/ascend/norm.py rename to lmdeploy/pytorch/backends/dlinfer/norm.py index e6ee3963e3..3659f50c85 100644 --- a/lmdeploy/pytorch/backends/ascend/norm.py +++ b/lmdeploy/pytorch/backends/dlinfer/norm.py @@ -1,13 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch -from lmdeploy.pytorch.kernels.ascend import rms_norm +from lmdeploy.pytorch.kernels.dlinfer import rms_norm from ..norm import RMSNormBuilder, RMSNormImpl -class AscendRMSNormImpl(RMSNormImpl): - """ascend RMS norm implementation.""" +class DlinferRMSNormImpl(RMSNormImpl): + """dlinfer RMS norm implementation.""" def __init__(self, hidden_size: int, eps: float = 1e-6): self.hidden_size = hidden_size @@ -26,10 +26,10 @@ def forward(self, return x, residual -class AscendRMSNormBuilder(RMSNormBuilder): - """ascend RMS norm implementation builder.""" +class DlinferRMSNormBuilder(RMSNormBuilder): + """dlinfer RMS norm implementation builder.""" @staticmethod def build(weight: torch.Tensor, eps: float = 1e-6): """build.""" - return AscendRMSNormImpl(weight, eps) + return DlinferRMSNormImpl(weight, eps) diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py new file mode 100644 index 0000000000..9ee1dd4773 --- /dev/null +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Tuple + +import torch + +from lmdeploy.utils import get_logger + +from ..base import OpType +from ..default import DefaultOpsBackend + +logger = get_logger('lmdeploy') + + +class DlinferOpsBackend(DefaultOpsBackend): + """dlinfer layer backend.""" + + @staticmethod + def get_name() -> str: + """backend name.""" + return 'dlinfer' + + @classmethod + def get_layer_impl_builder(cls, layer_type: OpType): + """get dlinfer layer builder.""" + if layer_type == OpType.Attention: + from .attention import DlinferAttentionBuilder + return DlinferAttentionBuilder + elif layer_type == OpType.ApplyRotaryEmb: + from .apply_rotary_emb import DlinferApplyRotaryEmbBuilder + return DlinferApplyRotaryEmbBuilder + elif layer_type == OpType.RMSNorm: + from .norm import DlinferRMSNormBuilder + return DlinferRMSNormBuilder + elif layer_type == OpType.SoftmaxTopK: + from .moe import DlinferSoftmaxTopKBuilder + return DlinferSoftmaxTopKBuilder + elif layer_type == OpType.FusedMoE: + from .moe import DlinferFusedMoEBuilder + return DlinferFusedMoEBuilder + else: + logger.debug( + f'Op {layer_type} fallback to default implementation.') + return super().get_layer_impl_builder(layer_type) + + @staticmethod + def get_attention_metadata_cls(): + from .attention import DlinferAttentionMetadata + return DlinferAttentionMetadata + + @staticmethod + def get_k_block_shape( + block_size: int, + num_heads: int, + head_size: int, + dtype: torch.dtype, + ) -> Tuple[int, ...]: + return ( + block_size, + num_heads, + head_size, + ) + + @staticmethod + def get_v_block_shape( + block_size: int, + num_heads: int, + head_size: int, + dtype: torch.dtype, + ) -> Tuple[int, ...]: + return ( + block_size, + num_heads, + head_size, + ) + + @classmethod + def update_step_context(cls, step_context): + """update step context.""" + raise NotImplementedError diff --git a/lmdeploy/pytorch/backends/selector.py b/lmdeploy/pytorch/backends/selector.py index 181dd15358..1ac85de0cb 100644 --- a/lmdeploy/pytorch/backends/selector.py +++ b/lmdeploy/pytorch/backends/selector.py @@ -13,7 +13,7 @@ def get_backend(): from .cuda import CudaOpsBackend return CudaOpsBackend if device_type == 'ascend': - from .ascend import AscendOpsBackend + from .dlinfer import AscendOpsBackend return AscendOpsBackend else: raise RuntimeError(f'Unsupported device type: {device_type}') diff --git a/lmdeploy/pytorch/kernels/ascend/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py similarity index 100% rename from lmdeploy/pytorch/kernels/ascend/__init__.py rename to lmdeploy/pytorch/kernels/dlinfer/__init__.py diff --git a/lmdeploy/pytorch/kernels/ascend/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py similarity index 100% rename from lmdeploy/pytorch/kernels/ascend/apply_rotary_pos_emb.py rename to lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py diff --git a/lmdeploy/pytorch/kernels/ascend/fill_kv_cache.py b/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py similarity index 100% rename from lmdeploy/pytorch/kernels/ascend/fill_kv_cache.py rename to lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py diff --git a/lmdeploy/pytorch/kernels/ascend/fused_rotary_emb.py b/lmdeploy/pytorch/kernels/dlinfer/fused_rotary_emb.py similarity index 100% rename from lmdeploy/pytorch/kernels/ascend/fused_rotary_emb.py rename to lmdeploy/pytorch/kernels/dlinfer/fused_rotary_emb.py diff --git a/lmdeploy/pytorch/kernels/ascend/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py similarity 
index 100% rename from lmdeploy/pytorch/kernels/ascend/moe_gating_topk_softmax.py rename to lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py diff --git a/lmdeploy/pytorch/kernels/ascend/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py similarity index 100% rename from lmdeploy/pytorch/kernels/ascend/pagedattention.py rename to lmdeploy/pytorch/kernels/dlinfer/pagedattention.py diff --git a/lmdeploy/pytorch/kernels/ascend/rms_norm.py b/lmdeploy/pytorch/kernels/dlinfer/rms_norm.py similarity index 100% rename from lmdeploy/pytorch/kernels/ascend/rms_norm.py rename to lmdeploy/pytorch/kernels/dlinfer/rms_norm.py From d00933586f0f91fc5c16530f69ee278696479f7e Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Sun, 20 Oct 2024 11:17:36 +0800 Subject: [PATCH 011/122] Copy sglang/bench_serving.py to lmdeploy as serving benchmark script (#2620) * Copy sglang/bench_serving.py to lmdeploy as serving benchmark script * rollback filename * update --- .github/workflows/stable.yml | 12 +- autotest/utils/benchmark_utils.py | 2 +- benchmark/README.md | 20 +- benchmark/profile_restful_api.py | 1305 ++++++++++++++++---- docs/en/advance/debug_turbomind.md | 2 +- docs/en/benchmark/profile_api_server.md | 43 +- docs/zh_cn/advance/debug_turbomind.md | 2 +- docs/zh_cn/benchmark/profile_api_server.md | 42 +- 8 files changed, 1093 insertions(+), 335 deletions(-) diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 85daed8e2b..98faf2ffa4 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -143,12 +143,12 @@ jobs: opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-3 - name: Test lmdeploy - restful api run: | - python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-3.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-4.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > 
${{env.REPORT_DIR}}/stable-internal-5.log + python3 benchmark/profile_restful_api.py --port 23344 --dataset-path /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10000 > ${{env.REPORT_DIR}}/stable.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-1.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-2.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-3.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-4.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-5.log - name: Attach result if: always() run: | diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 3da375ccb5..24eb6c8f1c 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -168,7 +168,7 @@ def restful_test(config, if not health_check(http_url): return False, 'server not start' - command = f'python3 benchmark/profile_restful_api.py localhost:{port} {model_path} {dataset_path} --stream-output True ' # noqa: F401, E501 + command = f'python3 benchmark/profile_restful_api.py --port {port} --tokenizer {model_path} --dataset-path {dataset_path}' # noqa: F401, E501 if is_smoke: command += ' --num-prompts 200' else: diff --git a/benchmark/README.md b/benchmark/README.md index 057d38bb11..9e56768640 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -33,20 +33,6 @@ python profile_generation.py \ --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512 ``` -## profile serving - -Tools above profile models with Python API. `profile_serving.py` is used to do benchmark on serving. - -```bash -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -python profile_serving.py \ - ${TritonServerAddress} \ - /path/to/tokenizer \ # ends with .model for most models. Otherwise, please pass model_path/triton_models/tokenizer. - ShareGPT_V3_unfiltered_cleaned_split.json \ - --concurrency 64 -``` - ## profile restful api `profile_restful_api.py` is used to do benchmark on api server. @@ -54,9 +40,5 @@ python profile_serving.py \ ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python profile_restful_api.py \ - ${ServerAddress} \ - /path/to/tokenizer \ # ends with .model for most models. Otherwise, please pass model_path/triton_models/tokenizer. 
- ShareGPT_V3_unfiltered_cleaned_split.json \ - --concurrency 64 +python3 profile_restful_api.py --backend lmdeploy --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json ``` diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py index 524b302906..963c00d490 100644 --- a/benchmark/profile_restful_api.py +++ b/benchmark/profile_restful_api.py @@ -1,24 +1,434 @@ -import csv +# Modify from https://github.com/sgl-project/sglang/blob/main/python/sglang/bench_serving.py # noqa +# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py # noqa +# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py # noqa +"""Benchmark online serving with dynamic requests. + +Usage: +python3 -m sglang.bench_serving --backend sglang --num-prompt 10 + +python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5 +python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi +""" # noqa + +import argparse +import asyncio import json +import os import random +import resource +import sys import time -from queue import Queue -from threading import Thread -from typing import List, Optional, Tuple +import traceback +import warnings +from argparse import ArgumentParser +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union -import fire +import aiohttp import numpy as np -from tqdm import tqdm -from transformers import AutoTokenizer +import requests +from tqdm.asyncio import tqdm +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerBase, PreTrainedTokenizerFast) + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + +global args + + +@dataclass +class RequestFuncInput: + prompt: str + api_url: str + prompt_len: int + output_len: int + model: str + extra_request_body: Dict[str, Any] + + +@dataclass +class RequestFuncOutput: + generated_text: str = '' + success: bool = False + latency: float = 0.0 + ttft: float = 0.0 # Time to first token + itl: List[float] = field( + default_factory=list) # List of inter-token latencies + prompt_len: int = 0 + error: str = '' + output_len: int = 0 + + +def remove_prefix(text: str, prefix: str) -> str: + return text[len(prefix):] if text.startswith(prefix) else text + + +# trt llm not support ignore_eos +# https://github.com/triton-inference-server/tensorrtllm_backend/issues/505 +async def async_request_trt_llm( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith('generate_stream') + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + payload = { + 'accumulate_tokens': True, + 'text_input': request_func_input.prompt, + 'temperature': 0.000001, + 'top_p': 1.0, + 'max_tokens': request_func_input.output_len, + 'stream': True, + 'min_length': request_func_input.output_len, + 'end_id': 1048576, + **request_func_input.extra_request_body, + } + if args.disable_ignore_eos: + del payload['min_length'] + del payload['end_id'] + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + ttft = 0.0 + st = time.perf_counter() + 
most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode('utf-8'), + 'data:') + + data = json.loads(chunk) + output.generated_text += data['text_output'] + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + + output.latency = most_recent_timestamp - st + output.success = True + output.output_len = request_func_input.output_len + + else: + output.error = response.reason or '' + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = ''.join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +# set ignore_eos True by default +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + 'completions' + ), "OpenAI Completions API URL must end with 'completions'." + + prompt = request_func_input.prompt + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + payload = { + 'model': request_func_input.model, + 'prompt': prompt, + 'temperature': 0.0, + 'best_of': 1, + 'max_tokens': request_func_input.output_len, + 'stream': not args.disable_stream, + 'ignore_eos': not args.disable_ignore_eos, + **request_func_input.extra_request_body, + } + headers = { + 'Authorization': f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = '' + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode('utf-8'), + 'data: ') + latency = time.perf_counter() - st + if chunk == '[DONE]': + pass + else: + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if data['choices'][0]['text']: + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += data['choices'][0]['text'] + + output.generated_text = generated_text + output.success = True + output.latency = latency + output.output_len = request_func_input.output_len + else: + output.error = response.reason or '' + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = ''.join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_sglang_generate( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + prompt = request_func_input.prompt + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as 
session: + payload = { + 'text': prompt, + 'sampling_params': { + 'temperature': 0.0, + 'max_new_tokens': request_func_input.output_len, + 'ignore_eos': not args.disable_ignore_eos, + }, + 'stream': not args.disable_stream, + **request_func_input.extra_request_body, + } + headers = {} + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = '' + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + # print(chunk_bytes) + + chunk = remove_prefix(chunk_bytes.decode('utf-8'), + 'data: ') + latency = time.perf_counter() - st + if chunk == '[DONE]': + pass + else: + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if data['text']: + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text = data['text'] + + output.generated_text = generated_text + output.success = True + output.latency = latency + output.output_len = request_func_input.output_len + else: + output.error = response.reason or '' + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = ''.join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_gserver( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + raise NotImplementedError() + + +def get_model(pretrained_model_name_or_path: str) -> str: + if os.getenv('SGLANG_USE_MODELSCOPE', 'False').lower() == 'true': + import huggingface_hub.constants + from modelscope import snapshot_download + + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=['.*.pt', '.*.safetensors', '.*.bin'], + ) + + return model_path + return pretrained_model_name_or_path + + +def get_tokenizer( + pretrained_model_name_or_path: str, +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path.endswith( + '.json') or pretrained_model_name_or_path.endswith('.model'): + from sglang.srt.hf_transformers_utils import get_tokenizer + + return get_tokenizer(pretrained_model_name_or_path) + + if pretrained_model_name_or_path is not None and not os.path.exists( + pretrained_model_name_or_path): + pretrained_model_name_or_path = get_model( + pretrained_model_name_or_path) + return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, + trust_remote_code=True) + + +ASYNC_REQUEST_FUNCS = { + 'sglang': async_request_sglang_generate, + 'sglang-native': async_request_sglang_generate, + 'sglang-oai': async_request_openai_completions, + 'vllm': async_request_openai_completions, + 'lmdeploy': async_request_openai_completions, + 'trt': async_request_trt_llm, + 'gserver': async_request_gserver, +} + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + total_output_retokenized: int + request_throughput: float + input_throughput: float + output_throughput: float + 
output_throughput_retokenized: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + p99_ttft_ms: float + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + p99_tpot_ms: float + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + p99_itl_ms: float + mean_e2e_latency_ms: float + median_e2e_latency_ms: float + + +SHAREGPT_URL = 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json' # noqa + + +def download_and_cache_file(url: str, filename: Optional[str] = None): + """Read and cache a file from a url.""" + if filename is None: + filename = os.path.join('/tmp', url.split('/')[-1]) + + # Check if the cache file already exists + if os.path.exists(filename): + return filename + + print(f'Downloading from {url} to {filename}') + + # Stream the response to show the progress bar + response = requests.get(url, stream=True) + response.raise_for_status() # Check for request errors + + # Total size of the file in bytes + total_size = int(response.headers.get('content-length', 0)) + chunk_size = 1024 # Download in chunks of 1KB + + # Use tqdm to display the progress bar + with open(filename, 'wb') as f, tqdm( + desc=filename, + total=total_size, + unit='B', + unit_scale=True, + unit_divisor=1024, + ) as bar: + for chunk in response.iter_content(chunk_size=chunk_size): + f.write(chunk) + bar.update(len(chunk)) -from lmdeploy.serve.openai.api_client import APIClient + return filename -def sample_requests( +def sample_sharegpt_requests( dataset_path: str, num_requests: int, - tokenizer: AutoTokenizer, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, int, int]]: + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError('output_len too small') + + # Download sharegpt if necessary + if not os.path.isfile(dataset_path): + dataset_path = download_and_cache_file(SHAREGPT_URL) + # Load the dataset. with open(dataset_path) as f: dataset = json.load(f) @@ -28,238 +438,673 @@ def sample_requests( dataset = [(data['conversations'][0]['value'], data['conversations'][1]['value']) for data in dataset] - # pre-sample to avoid go through all the dataset - dataset = random.sample(dataset, max(int(num_requests * 1.2), 1000)) + # Shuffle the dataset. + random.shuffle(dataset) - # Tokenize the prompts and completions. - prompts = [prompt for prompt, _ in dataset] - prompt_token_ids = tokenizer(prompts).input_ids - completions = [completion for _, completion in dataset] - completion_token_ids = tokenizer(completions).input_ids - tokenized_dataset = [] + # Filter out sequences that are too long or too short + filtered_dataset: List[Tuple[str, int, int]] = [] for i in range(len(dataset)): - output_len = len(completion_token_ids[i]) - tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + if len(filtered_dataset) == num_requests: + break - # Filter out too long sequences. - filtered_dataset: List[Tuple[str, int, int]] = [] - for prompt, prompt_token_ids, output_len in tokenized_dataset: + # Tokenize the prompts and completions. + prompt = dataset[i][0] + prompt_token_ids = tokenizer.encode(prompt) + completion = dataset[i][1] + completion_token_ids = tokenizer.encode(completion) prompt_len = len(prompt_token_ids) + output_len = (len(completion_token_ids) + if fixed_output_len is None else fixed_output_len) if prompt_len < 4 or output_len < 4: # Prune too short sequences. 
continue - if prompt_len > 1024 or prompt_len + output_len > 2048: + if prompt_len > 1024 or (prompt_len + output_len > 2048 + and fixed_output_len is None): # Prune too long sequences. continue filtered_dataset.append((prompt, prompt_len, output_len)) - # Sample the requests. - sampled_requests = random.sample(filtered_dataset, num_requests) - return sampled_requests - - -class Engine: - - def __init__(self, - server_addr: str, - tokenzier_path: str, - temperature: float = 0.8, - top_p: float = 1.0, - csv: str = '', - api_key: Optional[str] = None, - model_name: Optional[str] = None, - **kwargs): - self.tokenizer = AutoTokenizer.from_pretrained(tokenzier_path, - trust_remote_code=True) - self.server_addr = server_addr - self.temperature = temperature - self.top_p = top_p - self.csv = csv - self.api_key = api_key - client = APIClient(self.server_addr, api_key=self.api_key) - if model_name is None: - self.model_name = client.available_models[0] - print(f'using model: {self.model_name}\n') - else: - self.model_name = model_name - self.pbar = None - - def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, - stream_output: bool): - - stats = [] - client = APIClient(self.server_addr, api_key=self.api_key) - - for prompt, input_seqlen, output_seqlen in iter( - req_queue.get, [None, None, None]): - timestamps = [] - timestamps.append(time.perf_counter()) - for output in client.chat_completions_v1( - model=self.model_name, - messages=prompt, - temperature=self.temperature, - top_p=self.top_p, - n=1, - max_tokens=output_seqlen, - stream=stream_output, - session_id=session_id, - ignore_eos=True): - timestamps.append(time.perf_counter()) - - first_token_latency = np.round(timestamps[1] - timestamps[0], 3) - token_latency = np.round(timestamps[-1] - timestamps[0], 3) - # assert output.pop('finish_reason') == 'length', \ - # f'Error. session_id({session_id}) request {output_seqlen} ' \ - # f'tokens, but `finish_reason` is not `length`' - total_tokens = input_seqlen + output_seqlen - stats.append([ - first_token_latency, output_seqlen, output_seqlen, - total_tokens, token_latency + print(f'#Input tokens: {np.sum([x[1] for x in filtered_dataset])}') + print(f'#Output tokens: {np.sum([x[2] for x in filtered_dataset])}') + return filtered_dataset + + +def sample_random_requests( + input_len: int, + output_len: int, + num_prompts: int, + range_ratio: float, + tokenizer: PreTrainedTokenizerBase, + dataset_path: str, +) -> List[Tuple[str, int, int]]: + + input_lens = np.random.randint( + max(int(input_len * range_ratio), 1), + input_len + 1, + size=num_prompts, + ) + output_lens = np.random.randint( + int(output_len * range_ratio), + output_len + 1, + size=num_prompts, + ) + + if True: + # Sample token ids from ShareGPT and repeat/truncate them to + # satisfy the input_lens + + # Download sharegpt if necessary + if not os.path.isfile(dataset_path): + dataset_path = download_and_cache_file(SHAREGPT_URL) + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data['conversations']) >= 2] + # Only keep the first two turns of each conversation. + dataset = [(data['conversations'][0]['value'], + data['conversations'][1]['value']) for data in dataset] + + # Shuffle the dataset. 
+ random.shuffle(dataset) + + # Filter out sequences that are too long or too short + input_requests: List[Tuple[str, int, int]] = [] + for i in range(num_prompts): + # Tokenize the prompts and completions. + prompt = dataset[i][0] + prompt_token_ids = tokenizer.encode(prompt) + prompt_len = len(prompt_token_ids) + + if prompt_len > input_lens[i]: + input_ids = prompt_token_ids[:input_lens[i]] + else: + ratio = (input_lens[i] + prompt_len - 1) // prompt_len + input_ids = (prompt_token_ids * ratio)[:input_lens[i]] + prompt = tokenizer.decode(input_ids) + input_requests.append( + (prompt, int(input_lens[i]), int(output_lens[i]))) + else: + # Sample token ids from random integers. + # This can cause some NaN issues. + offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) + input_requests = [] + for i in range(num_prompts): + prompt = tokenizer.decode([ + (offsets[i] + i + j) % tokenizer.vocab_size + for j in range(input_lens[i]) ]) - self.pbar.update(1) - - res_queue.put((session_id, stats)) - - def process_request(self, - requests, - concurrency: int = 1, - stream_output: bool = False): - res_queue = Queue() - req_queue = Queue() - threads = [] - - self.pbar = tqdm(total=len(requests)) - - # feed request to q - for req in requests: - req_queue.put(req) - for i in range(concurrency): - req_queue.put([None, None, None]) - - start = time.time() - - # start threads - for i in range(concurrency): - t = Thread(target=self._inference, - args=(req_queue, res_queue, i, stream_output)) - t.start() - threads.append(t) - - # wait for finish - for t in threads: - t.join() - - elapsed_time = time.time() - start - - stats = [] - while not res_queue.empty(): - session_id, _stats = res_queue.get() - if len(_stats) != 0: - stats.append(np.array(_stats)) - - stats = np.concatenate(stats).reshape(-1, 5) - - first_token_latency_min = np.min(stats[:, 0], axis=0) - first_token_latency_max = np.max(stats[:, 0], axis=0) - first_token_latency_ave = np.mean(stats[:, 0], axis=0) - completion_tokens = np.sum(stats[:, 1], axis=0) - request_output_tokens = np.sum(stats[:, 2], axis=0) - total_tokens = np.sum(stats[:, 3], axis=0) - prompt_tokens = total_tokens - completion_tokens - completion_token_throughput = completion_tokens / elapsed_time - total_token_throughput = total_tokens / elapsed_time - rps = len(requests) / elapsed_time - rpm = rps * 60 - - if (np.abs(stats[:, 1] - stats[:, 2]) <= 1).min() is False: - print(f'Did not generate requested number of tokens. 
' - f'Request {request_output_tokens:.0f}, ' - f'but got {completion_tokens:.0f}') - - print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.3f}s\n') - if stream_output: - print(f'first_token latency(min, max, ave): ' - f'{first_token_latency_min:.3f}s, ' - f'{first_token_latency_max:.3f}s, ' - f'{first_token_latency_ave:.3f}s\n') - print( - f'number of prompt tokens: {prompt_tokens:.0f}\n' - f'number of completion tokens: {completion_tokens:.0f}\n' - f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa - f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa - f'RPS (request per second): {rps:.3f} req/s\n' - f'RPM (request per minute): {rpm:.3f} req/min\n' - f'{"-" * 50}\n') - - if self.csv: - with open(self.csv, 'w') as csvfile: - writer = csv.writer(csvfile) - writer.writerow([ - 'batch', 'num_prompts', 'RPS', 'RPM', 'FTL(ave)(s)', - 'FTL(min)(s)', 'FTL(max)(s)', 'throughput(out tok/s)', - 'throughput(total tok/s)' - ]) - writer.writerow([ - concurrency, - len(requests), f'{rps:.3f}', f'{rpm:.3f}', - f'{first_token_latency_ave:.3f}' if stream_output else '-', - f'{first_token_latency_min:.3f}' if stream_output else '-', - f'{first_token_latency_max:.3f}' if stream_output else '-', - f'{completion_token_throughput:.3f}', - f'{total_token_throughput:.3f}' - ]) - - -def main(server_addr: str, - tokenizer_path: str, - dataset: str, - api_key: Optional[str] = None, - model_name: Optional[str] = None, - concurrency: int = 128, - num_prompts: int = 5000, - top_p: float = 1.0, - temperature: float = 1.0, - stream_output: bool = False, - csv: str = './profile_api_server.csv', - seed: int = 0): - """Benchmark the request througput of api server. - - Args: - server_addr (str): http url of api_server with format http://0.0.0.0:0 - tokenizer_path (str): Path to the tokenizer model in localhost - dataset (str): Path to the dataset - concurrency (int, optional): Number of working threads to process the sampled prompts. - Defaults to 128. - num_prompts (int, optional): Number of prompts to process. Defaults to 5000. - top_p (float, optional): the set of most probable tokens with - probabilities that add up to top_p or higher - are kept for generation. Defaults to 1.0. - temperature (float, optional): The value used to modulate the next token probabilities. - Defaults to 1.0. - stream_output (bool, optional): Indicator for streaming output. Defaults to False. - csv (str, optional): The path to save the result. - seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0. 
- """ # noqa - if not server_addr.startswith('http://'): - print(f'[WARNING] server_addr of the api_server should ' - f'start with "http://", but got "{server_addr}"') - server_addr = 'http://' + server_addr.strip() - - random.seed(seed) - - engine = Engine(server_addr, - tokenizer_path, - top_p=top_p, - temperature=temperature, - csv=csv, - api_key=api_key, - model_name=model_name) - - requests = sample_requests(dataset, num_prompts, engine.tokenizer) - - engine.process_request(requests, concurrency, stream_output) + input_requests.append( + (prompt, int(input_lens[i]), int(output_lens[i]))) + + print(f'#Input tokens: {np.sum(input_lens)}') + print(f'#Output tokens: {np.sum(output_lens)}') + return input_requests + + +async def get_request( + input_requests: List[Tuple[str, int, int]], + request_rate: float, +) -> AsyncGenerator[Tuple[str, int, int], None]: + input_requests = iter(input_requests) + for request in input_requests: + yield request + + if request_rate == float('inf'): + # If the request rate is infinity, then we don't need to wait. + continue + + # Sample the request interval from the exponential distribution. + interval = np.random.exponential(1.0 / request_rate) + # The next request will be sent after the interval. + await asyncio.sleep(interval) + + +def calculate_metrics( + input_requests: List[Tuple[str, int, int]], + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + backend: str, +) -> Tuple[BenchmarkMetrics, List[int]]: + output_lens: List[int] = [] + retokenized_output_lens: List[int] = [] + total_input = 0 + completed = 0 + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] + e2e_latencies: List[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + output_len = outputs[i].output_len + output_lens.append(output_len) + retokenized_output_len = len( + tokenizer.encode(outputs[i].generated_text, + add_special_tokens=False)) + retokenized_output_lens.append(retokenized_output_len) + total_input += input_requests[i][1] + if output_len > 1: + tpots.append( + (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + + e2e_latencies.append(outputs[i].latency) + + completed += 1 + else: + output_lens.append(0) + retokenized_output_lens.append(0) + + if completed == 0: + warnings.warn( + 'All requests failed. 
This is likely due to a misconfiguration ' + 'on the benchmark arguments.', + stacklevel=2, + ) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(output_lens), + total_output_retokenized=sum(retokenized_output_lens), + request_throughput=completed / dur_s, + input_throughput=total_input / dur_s, + output_throughput=sum(output_lens) / dur_s, + output_throughput_retokenized=sum(retokenized_output_lens) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by backend + median_ttft_ms=np.median(ttfts or 0) * 1000, + std_ttft_ms=np.std(ttfts or 0) * 1000, + p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, + mean_tpot_ms=np.mean(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, + mean_itl_ms=np.mean(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + p99_itl_ms=np.percentile(itls or 0, 99) * 1000, + mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000, + median_e2e_latency_ms=np.median(e2e_latencies) * 1000, + ) + + return metrics, output_lens + + +async def benchmark( + backend: str, + api_url: str, + model_id: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: List[Tuple[str, int, int]], + request_rate: float, + disable_tqdm: bool, + extra_request_body: Dict[str, Any], +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f'Unknown backend: {backend}') + + print('Starting initial single prompt test run...') + test_prompt, test_prompt_len, test_output_len = input_requests[0] + test_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=api_url, + prompt_len=test_prompt_len, + output_len=test_output_len, + extra_request_body=extra_request_body, + ) + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + 'Initial test run failed - Please make sure benchmark arguments ' + f'are correctly specified. Error: {test_output.error}') + else: + print('Initial test run completed. 
Starting main benchmark run...') + + time.sleep(1.5) + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + benchmark_start_time = time.perf_counter() + tasks: List[asyncio.Task] = [] + async for request in get_request(input_requests, request_rate): + prompt, prompt_len, output_len = request + request_func_input = RequestFuncInput( + model=model_id, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + extra_request_body=extra_request_body, + ) + tasks.append( + asyncio.create_task( + request_func(request_func_input=request_func_input, + pbar=pbar))) + outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + backend=backend, + ) + + print('\n{s:{c}^{n}}'.format(s=' Serving Benchmark Result ', n=50, c='=')) + print('{:<40} {:<10}'.format('Backend:', backend)) + print('{:<40} {:<10}'.format('Traffic request rate:', request_rate)) + print('{:<40} {:<10}'.format('Successful requests:', metrics.completed)) + print('{:<40} {:<10.2f}'.format('Benchmark duration (s):', + benchmark_duration)) + print('{:<40} {:<10}'.format('Total input tokens:', metrics.total_input)) + print('{:<40} {:<10}'.format('Total generated tokens:', + metrics.total_output)) + print('{:<40} {:<10}'.format('Total generated tokens (retokenized):', + metrics.total_output_retokenized)) + print('{:<40} {:<10.2f}'.format('Request throughput (req/s):', + metrics.request_throughput)) + print('{:<40} {:<10.2f}'.format('Input token throughput (tok/s):', + metrics.input_throughput)) + print('{:<40} {:<10.2f}'.format('Output token throughput (tok/s):', + metrics.output_throughput)) + print('{s:{c}^{n}}'.format(s='End-to-End Latency', n=50, c='-')) + print('{:<40} {:<10.2f}'.format('Mean E2E Latency (ms):', + metrics.mean_e2e_latency_ms)) + print('{:<40} {:<10.2f}'.format('Median E2E Latency (ms):', + metrics.median_e2e_latency_ms)) + print('{s:{c}^{n}}'.format(s='Time to First Token', n=50, c='-')) + print('{:<40} {:<10.2f}'.format('Mean TTFT (ms):', metrics.mean_ttft_ms)) + print('{:<40} {:<10.2f}'.format('Median TTFT (ms):', + metrics.median_ttft_ms)) + print('{:<40} {:<10.2f}'.format('P99 TTFT (ms):', metrics.p99_ttft_ms)) + print('{s:{c}^{n}}'.format(s='Time per Output Token (excl. 
1st token)', + n=50, + c='-')) + print('{:<40} {:<10.2f}'.format('Mean TPOT (ms):', metrics.mean_tpot_ms)) + print('{:<40} {:<10.2f}'.format('Median TPOT (ms):', + metrics.median_tpot_ms)) + print('{:<40} {:<10.2f}'.format('P99 TPOT (ms):', metrics.p99_tpot_ms)) + print('{s:{c}^{n}}'.format(s='Inter-token Latency', n=50, c='-')) + print('{:<40} {:<10.2f}'.format('Mean ITL (ms):', metrics.mean_itl_ms)) + print('{:<40} {:<10.2f}'.format('Median ITL (ms):', metrics.median_itl_ms)) + print('{:<40} {:<10.2f}'.format('P99 ITL (ms):', metrics.p99_itl_ms)) + print('=' * 50) + + if (metrics.median_ttft_ms is not None and metrics.mean_itl_ms is not None + and metrics.output_throughput is not None): + result = { + 'backend': args.backend, + 'dataset_name': args.dataset_name, + 'request_rate': request_rate, + 'total_input_tokens': metrics.total_input, + 'total_output_tokens': metrics.total_output, + 'total_output_tokens_retokenized': + metrics.total_output_retokenized, + 'mean_e2e_latency_ms': metrics.mean_e2e_latency_ms, + 'median_e2e_latency_ms': metrics.median_e2e_latency_ms, + 'median_ttft_ms': metrics.median_ttft_ms, + 'median_itl_ms': metrics.median_itl_ms, + 'output_throughput': metrics.output_throughput, + 'sharegpt_output_len': args.sharegpt_output_len, + 'random_input_len': args.random_input_len, + 'random_output_len': args.random_output_len, + 'random_range_ratio': args.random_range_ratio, + 'duration': benchmark_duration, + 'completed': metrics.completed, + } + else: + print(f'Error running benchmark for request rate: {request_rate}') + print('-' * 30) + + # Determine output file name + if args.output_file: + output_file_name = args.output_file + else: + now = datetime.now().strftime('%m%d') + if args.dataset_name == 'random': + output_file_name = f'{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl' # noqa + else: + output_file_name = f'{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl' # noqa + + # Append results to a JSONL file + with open(output_file_name, 'a') as file: + file.write(json.dumps(result) + '\n') + + result = { + 'duration': benchmark_duration, + 'completed': metrics.completed, + 'total_input_tokens': metrics.total_input, + 'total_output_tokens': metrics.total_output, + 'total_output_tokens_retokenized': metrics.total_output_retokenized, + 'request_throughput': metrics.request_throughput, + 'input_throughput': metrics.input_throughput, + 'output_throughput': metrics.output_throughput, + 'mean_ttft_ms': metrics.mean_ttft_ms, + 'median_ttft_ms': metrics.median_ttft_ms, + 'std_ttft_ms': metrics.std_ttft_ms, + 'p99_ttft_ms': metrics.p99_ttft_ms, + 'mean_tpot_ms': metrics.mean_tpot_ms, + 'median_tpot_ms': metrics.median_tpot_ms, + 'std_tpot_ms': metrics.std_tpot_ms, + 'p99_tpot_ms': metrics.p99_tpot_ms, + 'mean_itl_ms': metrics.mean_itl_ms, + 'median_itl_ms': metrics.median_itl_ms, + 'std_itl_ms': metrics.std_itl_ms, + 'p99_itl_ms': metrics.p99_itl_ms, + 'input_lens': [output.prompt_len for output in outputs], + 'output_lens': output_lens, + 'ttfts': [output.ttft for output in outputs], + 'itls': [output.itl for output in outputs], + 'generated_texts': [output.generated_text for output in outputs], + 'errors': [output.error for output in outputs], + 'mean_e2e_latency_ms': metrics.mean_e2e_latency_ms, + 'median_e2e_latency_ms': metrics.median_e2e_latency_ms, + } + return result + + +def parse_request_rate_range(request_rate_range): + if len(request_rate_range.split(',')) == 3: + start, stop, step = map(int, 
request_rate_range.split(',')) + return list(range(start, stop, step)) + else: + return list(map(int, request_rate_range.split(','))) + + +def check_chat_template(model_path): + try: + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + return 'chat_template' in tokenizer.init_kwargs + except Exception as e: + print(f'Fail to load tokenizer config with error={e}') + return False + + +def run_benchmark(args_: argparse.Namespace): + global args + args = args_ + + # Set global environments + set_ulimit() + random.seed(args.seed) + np.random.seed(args.seed) + + extra_request_body = {} + if args.extra_request_body: + extra_request_body = json.loads(args.extra_request_body) + + # Set url + if args.port is None: + args.port = { + 'sglang': 30000, + 'sglang-native': 30000, + 'sglang-oai': 30000, + 'lmdeploy': 23333, + 'vllm': 8000, + 'trt': 8000, + 'gserver': 9988, + }.get(args.backend, 30000) + + model_url = (f'{args.base_url}/v1/models' if args.base_url else + f'http://{args.host}:{args.port}/v1/models') + + if args.backend in ['sglang', 'sglang-native']: + api_url = (f'{args.base_url}/generate' if args.base_url else + f'http://{args.host}:{args.port}/generate') + elif args.backend in ['sglang-oai', 'vllm', 'lmdeploy']: + api_url = (f'{args.base_url}/v1/completions' if args.base_url else + f'http://{args.host}:{args.port}/v1/completions') + elif args.backend == 'trt': + api_url = ( + f'{args.base_url}/v2/models/ensemble/generate_stream' + if args.base_url else + f'http://{args.host}:{args.port}/v2/models/ensemble/generate_stream' # noqa + ) + if args.model is None: + print('Please provide a model using `--model` when using ' + '`trt` backend.') + sys.exit(1) + elif args.backend == 'gserver': + api_url = args.base_url if args.base_url else \ + f'{args.host}:{args.port}' + args.model = args.model or 'default' + + # Get model name + if args.model is None: + try: + response = requests.get(model_url) + model_list = response.json().get('data', []) + args.model = model_list[0]['id'] if model_list else None + except Exception as e: + print(f'Failed to fetch model from {model_url}. Error: {e}') + print('Please specify the correct host and port using ' + '`--host` and `--port`.') + sys.exit(1) + + if args.model is None: + print('No model specified or found. 
Please provide a model ' + 'using `--model`.') + sys.exit(1) + + if not check_chat_template(args.model): + print('\nWARNING It is recommended to use the `Chat` or `Instruct` ' + 'model for benchmarking.\n' + 'Because when the tokenizer counts the output tokens, if ' + 'there is gibberish, it might count incorrectly.\n') + + print(f'{args}\n') + + # Read dataset + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + + tokenizer = get_tokenizer(tokenizer_id) + + if args.dataset_name == 'sharegpt': + assert args.random_input_len is None and args.random_output_len is None + input_requests = sample_sharegpt_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + tokenizer=tokenizer, + fixed_output_len=args.sharegpt_output_len, + ) + elif args.dataset_name == 'random': + assert args.random_input_len is not None and \ + args.random_output_len is not None + input_requests = sample_random_requests( + input_len=args.random_input_len, + output_len=args.random_output_len, + num_prompts=args.num_prompts, + range_ratio=args.random_range_ratio, + tokenizer=tokenizer, + dataset_path=args.dataset_path, + ) + else: + raise ValueError(f'Unknown dataset: {args.dataset_name}') + + if not args.multi: + return asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + request_rate=args.request_rate, + disable_tqdm=args.disable_tqdm, + extra_request_body=extra_request_body, + )) + else: + # Benchmark multiple rps. + # TODO: use a fixed duration to compute num_prompts + request_rates = parse_request_rate_range(args.request_rate_range) + + for rate in request_rates: + asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + request_rate=rate, + disable_tqdm=args.disable_tqdm, + extra_request_body=extra_request_body, + )) + + +def set_ulimit(target_soft_limit=65535): + resource_type = resource.RLIMIT_NOFILE + current_soft, current_hard = resource.getrlimit(resource_type) + + if current_soft < target_soft_limit: + try: + resource.setrlimit(resource_type, + (target_soft_limit, current_hard)) + except ValueError as e: + print(f'Fail to set RLIMIT_NOFILE: {e}') if __name__ == '__main__': - fire.Fire(main) + parser = ArgumentParser( + description='Benchmark the online serving throughput.') + parser.add_argument( + '--backend', + type=str, + choices=list(ASYNC_REQUEST_FUNCS.keys()), + default='sglang', + help='Must specify a backend, depending on the LLM Inference Engine.', + ) + parser.add_argument( + '--base-url', + type=str, + default=None, + help='Server or API base url if not using http host and port.', + ) + parser.add_argument('--host', + type=str, + default='0.0.0.0', + help='Default host is 0.0.0.0.') + parser.add_argument( + '--port', + type=int, + help='If not set, the default port is configured according to its ' + 'default value for different LLM Inference Engines.', + ) + parser.add_argument( + '--dataset-name', + type=str, + default='sharegpt', + choices=['sharegpt', 'random'], + help='Name of the dataset to benchmark on.', + ) + parser.add_argument('--dataset-path', + type=str, + default='', + help='Path to the dataset.') + parser.add_argument( + '--model', + type=str, + help='Name or path of the model. 
If not set, the default model will ' + 'request /v1/models for conf.', + ) + parser.add_argument( + '--tokenizer', + type=str, + help='Name or path of the tokenizer. If not set, using the model ' + 'conf.', + ) + parser.add_argument( + '--num-prompts', + type=int, + default=1000, + help='Number of prompts to process. Default is 1000.', + ) + parser.add_argument( + '--sharegpt-output-len', + type=int, + default=None, + help='Output length for each request. Overrides the output length ' + 'from the ShareGPT dataset.', + ) + parser.add_argument( + '--random-input-len', + type=int, + help='Number of input tokens per request, used only for random ' + 'dataset.', + ) + parser.add_argument( + '--random-output-len', + type=int, + help='Number of output tokens per request, used only for random ' + 'dataset.', + ) + parser.add_argument( + '--random-range-ratio', + type=float, + default=0.0, + help='Range of sampled ratio of input/output length, ' + 'used only for random dataset.', + ) + parser.add_argument( + '--request-rate', + type=float, + default=float('inf'), + help='Number of requests per second. If this is inf, then all the ' + 'requests are sent at time 0. Otherwise, we use Poisson process to ' + 'synthesize the request arrival times. Default is inf.', + ) + parser.add_argument('--seed', type=int, default=1, help='The random seed.') + parser.add_argument( + '--multi', + action='store_true', + help='Use request rate range rather than single value.', + ) + parser.add_argument( + '--request-rate-range', + type=str, + default='2,34,2', + help='Range of request rates in the format start,stop,step. Default ' + 'is 2,34,2. It also supports a list of request rates, requiring ' + 'the parameters to not equal three.', + ) + parser.add_argument('--output-file', + type=str, + help='Output JSONL file name.') + parser.add_argument( + '--disable-tqdm', + action='store_true', + help='Specify to disable tqdm progress bar.', + ) + parser.add_argument( + '--disable-stream', + action='store_true', + help='Disable streaming mode.', + ) + parser.add_argument( + '--disable-ignore-eos', + action='store_true', + help='Disable ignoring EOS.', + ) + parser.add_argument( + '--extra-request-body', + metavar='{"key1": "value1", "key2": "value2"}', + type=str, + help='Append given JSON object to the request payload. You can use ' + 'this to specify additional generate params like sampling params.', + ) + args = parser.parse_args() + run_benchmark(args) diff --git a/docs/en/advance/debug_turbomind.md b/docs/en/advance/debug_turbomind.md index 91733ce2a5..d38b548a95 100644 --- a/docs/en/advance/debug_turbomind.md +++ b/docs/en/advance/debug_turbomind.md @@ -129,7 +129,7 @@ Reading symbols from python3... 
 # (Optional) Use https://github.com/InternLM/lmdeploy/blob/main/benchmark/profile_restful_api.py to send a request
 
-python3 profile_restful_api.py --server_addr 127.0.0.1:23333 --tokenizer_path /workdir/Llama-2-13b-chat-hf --dataset /workdir/ShareGPT_V3_unfiltered_cleaned_split.json --concurrency 1 --num_prompts 1
+python3 profile_restful_api.py --backend lmdeploy --dataset-path /workdir/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1
 ````
 
 ## Using GDB
diff --git a/docs/en/benchmark/profile_api_server.md b/docs/en/benchmark/profile_api_server.md
index 07dfc49007..c8b626af36 100644
--- a/docs/en/benchmark/profile_api_server.md
+++ b/docs/en/benchmark/profile_api_server.md
@@ -1,52 +1,17 @@
 # Profile API Server
 
-The way to profiling `api_server` performance is similar to the method for [profiling throughput](./profile_throughput.md). The difference is `api_server` should be launched successfully before testing.
-
-The profiling script is `profile_restful_api.py`. Before running it, please install the lmdeploy precompiled package, download the script and the test dataset:
+Before benchmarking the api_server, please install the lmdeploy precompiled package and download the script and the test dataset:
 
 ```shell
 pip install lmdeploy
 git clone --depth=1 https://github.com/InternLM/lmdeploy
 cd lmdeploy/benchmark
-wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-```
-
-## Metrics
-
-LMDeploy records the performance metrics like first token latency, token throughput (tokens/s) and request throughput (RPM)
-
-`first_token_latency` is only reported in the case of streaming inference.
-
-The formula for calculating `token throughput` is:
-
-$$
-TokenThroughput = Number\\ of\\ generated\\ tokens/TotalTime
-$$
-
-And the formula for calculating `request throughput` is:
-
-$$
-RPM(request\\ per\\ minute)=Number\\ of\\ prompts/TotalTime * 60
-$$
-
-Total time includes prefill time.
-
-## Profile
-
-In this section, we take [internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b) as an example to show the benchmark procedure.
-
-### Launch api_server
-
-```shell
-lmdeploy serve api_server internlm/internlm-7b
-```
+```
 
-If you would like to change the server's port or other parameters, such as inference engine, max batch size and etc., please run `lmdeploy serve api_server -h` or read [this](../llm/api_server.md) guide to get the detailed explanation.
-
-### Profile
+Launch the server first (you may refer to [this guide](../llm/api_server.md)) and run the following command:
 
 ```shell
-python3 profile_restful_api.py http://0.0.0.0:23333 internlm/internlm-7b ./ShareGPT_V3_unfiltered_cleaned_split.json
+python3 benchmark/profile_restful_api.py --backend lmdeploy --num-prompts 5000 --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json
 ```
 
-For detailed argument specification of `profile_restful_api.py`, such as request concurrency, sampling parameters an so on, please run the help command `python3 profile_restful_api.py -h`.
+For detailed argument specification of `profile_restful_api.py`, please run the help command `python3 benchmark/profile_restful_api.py -h`.
diff --git a/docs/zh_cn/advance/debug_turbomind.md b/docs/zh_cn/advance/debug_turbomind.md
index 3c3b75421d..7c00e2f9d6 100644
--- a/docs/zh_cn/advance/debug_turbomind.md
+++ b/docs/zh_cn/advance/debug_turbomind.md
@@ -129,7 +129,7 @@ Reading symbols from python3...
# (可选) 使用 https://github.com/InternLM/lmdeploy/blob/main/benchmark/profile_restful_api.py 发送请求 -python3 profile_restful_api.py --server_addr 127.0.0.1:23333 --tokenizer_path /workdir/Llama-2-13b-chat-hf --dataset /workdir/ShareGPT_V3_unfiltered_cleaned_split.json --concurrency 1 --num_prompts 1 +python3 profile_restful_api.py --backend lmdeploy --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json ```` ## 使用 GDB diff --git a/docs/zh_cn/benchmark/profile_api_server.md b/docs/zh_cn/benchmark/profile_api_server.md index c872820040..687b22ad19 100644 --- a/docs/zh_cn/benchmark/profile_api_server.md +++ b/docs/zh_cn/benchmark/profile_api_server.md @@ -1,8 +1,6 @@ # api_server 性能测试 -api_server 的测试方式与[求吞吐量测试方法](./profile_throughput.md)类似。不同的是,在测试前,需要先启动 api_server,然后再通过测试脚本发送请求进行测试。 - -测试脚本是 `profile_restful_api.py`。测试之前,请安装 lmdeploy 预编译包,并下载评测脚本和测试数据集。 +测试之前,请安装 lmdeploy 预编译包,并下载测试脚本和数据。 ```shell pip install lmdeploy @@ -11,42 +9,10 @@ cd lmdeploy/benchmark wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` -## 测量指标 - -LMDeploy 统计首token延时(first_token_latency)、token吞吐量(tokens/s)和请求吞吐量(RPM)。 - -`first_token_latency` 只有在流式推理的情况下才会输出。 - -token吞吐量的计算公式为: - -$$ -吞吐量 = 生成的token数量 / 总时间 -$$ - -请求吞吐量的计算公式为: - -$$ -吞吐量 = 请求数量 / 总时间 -$$ - -总时间包括 prefill 时间 - -## 测量方法 - -我们以 [internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b) 为例,展示 api_server 的性能测试流程 - -### 启动服务 - -```shell -lmdeploy serve api_server internlm/internlm-7b -``` - -如果你想改变 server 的端口,或者诸如推理引擎、最大批处理值等参数,请运行 `lmdeploy serve api_server -h` 或者阅读[这篇文档](../llm/api_server.md),查看详细的参数说明。 - -### 测速 +然后,启动模型服务(可以参考[这里](../llm/api_server.md))。接着,使用下面的命令: ```shell -python3 profile_restful_api.py http://0.0.0.0:23333 internlm/internlm-7b ./ShareGPT_V3_unfiltered_cleaned_split.json +python3 profile_restful_api.py --backend lmdeploy --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json ``` -关于 `profile_restful_api.py` 脚本中的参数,比如请求并发数、采样参数等等,可以通过运行命令 `python3 profile_restful_api.py -h` 查阅。 +关于 `profile_restful_api.py`的帮助信息,可以通过`python3 profile_restful_api.py -h`查阅 From a465e60ed504eb9094a83a4769cdc7f41438da8c Mon Sep 17 00:00:00 2001 From: q yao Date: Mon, 21 Oct 2024 10:20:29 +0800 Subject: [PATCH 012/122] set capture mode thread_local (#2560) --- lmdeploy/pytorch/backends/cuda/graph_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/backends/cuda/graph_runner.py b/lmdeploy/pytorch/backends/cuda/graph_runner.py index 059a44a17c..33b1c85280 100644 --- a/lmdeploy/pytorch/backends/cuda/graph_runner.py +++ b/lmdeploy/pytorch/backends/cuda/graph_runner.py @@ -77,9 +77,12 @@ def capture(self, **kwargs): self.model(**padded_kwargs) self._graph = torch.cuda.CUDAGraph() + # unsafe kernel call in other thread might invalid the capture + # so we set thread_safe capture mode here. with torch.cuda.graph(self._graph, pool=self.pool, - stream=current_stream): + stream=current_stream, + capture_error_mode='thread_local'): output = self.model(**padded_kwargs) output_buffers = dict(logits=output) From c9186691b5b14edbe70405a20738ecff4a230e9f Mon Sep 17 00:00:00 2001 From: q yao Date: Mon, 21 Oct 2024 10:59:32 +0800 Subject: [PATCH 013/122] Add barrier to prevent TP nccl kernel waiting. 
(#2607) * add mp.barrier * remove old exit mechanism of exit_flag (#4) * fix exit problem on ascend platform * remove exit_flag in tp exit * set log level --------- Co-authored-by: CyCle1024 --- lmdeploy/pytorch/engine/model_agent.py | 106 ++++++++++++------------- 1 file changed, 52 insertions(+), 54 deletions(-) diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 4c902dbe2e..1daf614c8f 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -2,6 +2,7 @@ import asyncio import atexit import os +import threading from datetime import timedelta from typing import Any, Callable, Dict, List @@ -400,7 +401,7 @@ def _broadcast_inputs(rank: int, inputs: Any, stream: torch.cuda.Stream): """get input tensor parallel.""" # broadcast meta info if rank != 0: - inputs = [None, None, None, None] + inputs = [None, None, None] with torch.cuda.stream(stream): dist.broadcast_object_list(inputs) @@ -415,6 +416,7 @@ def _tp_model_loop( backend_config: BackendConfig, adapters: Dict[str, str], world_size: int, + barrier: mp.Barrier, ): """Start model loops for tensor parallel model inference. @@ -438,12 +440,10 @@ def _tp_model_loop( world_size=world_size) while True: - inputs, swap_in_map, swap_out_map, exit_flag = _broadcast_inputs( + barrier.wait() + inputs, swap_in_map, swap_out_map = _broadcast_inputs( rank, None, stream) - if exit_flag: - break - cache_swapping(cache_engine, swap_in_map=swap_in_map, swap_out_map=swap_out_map) @@ -460,6 +460,7 @@ def _tp_model_loop( def _start_tp_process(proc_id: int, world_size: int, func: Callable, + log_level: int, device_context: DeviceContext, args: List = None, kwargs: Dict = None): @@ -473,6 +474,7 @@ def _start_tp_process(proc_id: int, kwargs (Dict): The keyword arguments of the func. """ rank = proc_id + 1 + logger.setLevel(log_level) try: from lmdeploy.pytorch.check_env import check_env_deeplink check_env_deeplink(device_context.device_type) @@ -499,14 +501,16 @@ def _check_context_alive(mp_context: mp.ProcessContext): """check context alive.""" procs: List[mp.Process] = mp_context.processes failed_ranks = list(idx for idx, p in enumerate(procs) if not p.is_alive()) - if len(failed_ranks) > 0: - for p in procs: - if p.is_alive(): - p.terminate() - else: - p.close() - logger.error(f'TP process Rank{failed_ranks} failed.') - exit(1) + if len(failed_ranks) == 0: + return + for p in procs: + if p.is_alive(): + p.terminate() + else: + p.close() + logger.error(f'TP process {failed_ranks} failed.') + # TODO: not safe exit. + os._exit(1) def _find_available_port() -> bool: @@ -561,13 +565,14 @@ def __signal_term_handler(sig, frame): self.world_size = world_size self.backend_config = backend_config + self.mp_bar = self.mp_ctx.Barrier(world_size) self._start_sub_process(model_path, model_config=model_config, cache_config=cache_config, backend_config=backend_config, adapters=adapters, world_size=world_size, - trust_remote_code=trust_remote_code) + barrier=self.mp_bar) model, cache_engine, cache_config = self._build_model( model_path=model_path, @@ -575,18 +580,29 @@ def __signal_term_handler(sig, frame): cache_config=cache_config, backend_config=backend_config, adapters=adapters, - world_size=world_size, - ) + world_size=world_size) self.patched_model = model self.cache_config = cache_config self.cache_engine = cache_engine self.stream = torch.cuda.Stream() + def _mp_watchdog(self, mp_context: mp.ProcessContext, timeout: int = 1): + """watch dog of mp context. 
+ + Args: + mp_context: context of multiprocess. + timeout: timeout + """ + import time + while True: + _check_context_alive(mp_context) + time.sleep(timeout) + def _start_sub_process(self, model_path: str, model_config: ModelConfig, cache_config: CacheConfig, backend_config: BackendConfig, adapters: Dict[str, str], - world_size: int, trust_remote_code: bool): + world_size: int, barrier: mp.Barrier): """Start tensor parallel sub process.""" port = _find_available_port() os.environ.setdefault('MASTER_ADDR', '127.0.0.1') @@ -601,19 +617,27 @@ def _start_sub_process(self, model_path: str, model_config: ModelConfig, args=( world_size, _tp_model_loop, + logger.level, device_context, (model_path, ), - dict(model_config=model_config, - cache_config=cache_config, - backend_config=backend_config, - adapters=adapters, - world_size=world_size), + dict( + model_config=model_config, + cache_config=cache_config, + backend_config=backend_config, + adapters=adapters, + world_size=world_size, + barrier=barrier, + ), ), nprocs=world_size - 1, join=False, daemon=True, ) - _check_context_alive(self.mp_context) + + t_watchdog = threading.Thread(target=self._mp_watchdog, + args=[self.mp_context, 1.0], + daemon=True) + t_watchdog.start() rank = 0 try: @@ -628,8 +652,7 @@ def _start_sub_process(self, model_path: str, model_config: ModelConfig, if dist.is_initialized(): dist.destroy_process_group() raise e - # Please see Note [Exit By Sending Exit Flag] - atexit.register(_exit_by_sending_exit_flag, rank, self) + atexit.register(_exit_handler, self) @torch.inference_mode() def _build_model( @@ -642,7 +665,6 @@ def _build_model( world_size: int, ): """build model.""" - _check_context_alive(self.mp_context) rank = 0 model, cache_engine, cache_config = _tp_build_model( rank, @@ -664,10 +686,9 @@ def get_block_numel(self): def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """forward impl.""" - _check_context_alive(self.mp_context) + self.mp_bar.wait() rank = 0 - exit_flag = False - _broadcast_inputs(rank, [inputs, swap_in_map, swap_out_map, exit_flag], + _broadcast_inputs(rank, [inputs, swap_in_map, swap_out_map], self.stream) cache_swapping(self.cache_engine, swap_in_map=swap_in_map, @@ -717,32 +738,9 @@ def get_logits(self, hidden_states: torch.Tensor): return self.patched_model.get_logits(hidden_states) -def _exit_by_sending_exit_flag(rank: int, agent: TPModelAgent): - """[Note] Exit By Sending Exit Flag: the registration to `atexit` of this - function should be called after importing torch.multiprocessing and the - initialization of distributed process group.""" - if not hasattr(agent, 'stream'): - # agent is not initialized, just exits normally - if hasattr(agent, 'patched_model'): - del agent.patched_model - return - - import sys - if 'torch_npu' in sys.modules and 'uvicorn.server' in sys.modules: - # Workaround for CLI serve mode with device_type ascend: - # using uvicorn server causes ascend low-level backend of subprocesses - # corrupted, and using _broadcast_inputs in this case leads to - # main process hanging, just exits normally +def _exit_handler(agent: TPModelAgent): + if hasattr(agent, 'patched_model'): del agent.patched_model - return - - # send exit_flag to all subprocess relying on all subprocess are alive - # and wait at _broadcast_inputs - exit_flag = True - _broadcast_inputs(rank, [None, None, None, exit_flag], agent.stream) - agent.stream.synchronize() - - del agent.patched_model def build_model_agent(model_path: str, From 
77be205df347f77f249afd8e2a83c5e0d71ca7f0 Mon Sep 17 00:00:00 2001 From: yaofengchen <67218893+yao-fengchen@users.noreply.github.com> Date: Mon, 21 Oct 2024 11:03:55 +0800 Subject: [PATCH 014/122] refactor fused_moe on ascend platform (#2613) --- lmdeploy/pytorch/backends/dlinfer/moe.py | 28 ++----------------- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 2 ++ lmdeploy/pytorch/kernels/dlinfer/fused_moe.py | 16 +++++++++++ 3 files changed, 21 insertions(+), 25 deletions(-) create mode 100644 lmdeploy/pytorch/kernels/dlinfer/fused_moe.py diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index a242e30417..eb8b1e591e 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -2,7 +2,7 @@ import torch -from lmdeploy.pytorch.kernels.dlinfer import moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax from ..moe import (FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl) @@ -42,30 +42,8 @@ def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, down_weights: torch.Tensor): """forward.""" - seq_length = hidden_states.size(0) - moe_output = torch.zeros_like(hidden_states) - - for i in range(seq_length): - current_hidden_state = hidden_states[i] - - # faster than remove the for loop - for j in range(self.top_k): - expert_id = topk_ids[i][j] - weight = topk_weights[i][j] - - up_weight = gate_up_weights[expert_id] - up_proj = torch.matmul(up_weight, current_hidden_state) - - gate_cache, up_cache = up_proj.chunk(2, -1) - gate_cache = torch.nn.functional.silu(gate_cache, - inplace=True) * up_cache - - down_weight = down_weights[expert_id] - down_proj = torch.matmul(down_weight, gate_cache) - - moe_output[i] += weight * down_proj - - return moe_output + return fused_moe(hidden_states, self.top_k, topk_ids, topk_weights, + gate_up_weights, down_weights) class DlinferFusedMoEBuilder(FusedMoEBuilder): diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index a8216e70ae..4d30e3b842 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -2,6 +2,7 @@ from ..default import multinomial_sampling from .apply_rotary_pos_emb import apply_rotary_pos_emb from .fill_kv_cache import fill_kv_cache +from .fused_moe import fused_moe from .moe_gating_topk_softmax import moe_gating_topk_softmax from .pagedattention import paged_attention_fwd from .rms_norm import rms_norm @@ -10,6 +11,7 @@ 'rms_norm', 'apply_rotary_pos_emb', 'fill_kv_cache', + 'fused_moe', 'paged_attention_fwd', 'moe_gating_topk_softmax', 'multinomial_sampling', diff --git a/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py new file mode 100644 index 0000000000..72bab2d720 --- /dev/null +++ b/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import dlinfer.ops as ext_ops +from torch import Tensor + + +def fused_moe( + hidden_states: Tensor, + top_k: int, + topk_ids: Tensor, + topk_weights: Tensor, + gate_up_weights: Tensor, + down_weights: Tensor, +): + """ascend fused moe.""" + return ext_ops.fused_moe(hidden_states, top_k, topk_ids, topk_weights, + gate_up_weights, down_weights) From 48dcd215b8be97f57efa3e0b883cc2d6b82ea3e0 Mon Sep 17 00:00:00 2001 From: yaofengchen <67218893+yao-fengchen@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:04:17 +0800 Subject: [PATCH 015/122] [ascend] support paged_prefill_attn when batch > 1 (#2612) --- .../backends/dlinfer/ascend/op_backend.py | 73 ++++++++++++------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 53eea622ea..065e39b421 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -50,50 +50,73 @@ def update_step_context(cls, step_context): device = step_context.block_offsets.device is_unpaged_prefill = False - q_start_loc_cpu = step_context.q_start_loc.cpu() - q_seqlens_cpu = step_context.q_seqlens.cpu() - kv_seqlens_cpu = step_context.kv_seqlens.cpu() - max_q_seq_len = torch.max(q_seqlens_cpu).item() - max_kv_seq_len = torch.max(kv_seqlens_cpu).item() - if not step_context.is_decoding: is_unpaged_prefill = \ all((step_context.q_seqlens == step_context.kv_seqlens).tolist()) - if is_unpaged_prefill: - single_attention_mask = torch.logical_not( - torch.tril( - torch.ones(max_q_seq_len, - max_kv_seq_len, - dtype=torch.bool).cuda(), - diagonal=max_kv_seq_len - max_q_seq_len, - )) - attention_mask.append(single_attention_mask) + total_slots = torch.arange(block_num * block_size, dtype=torch.long, device=device) total_slots = total_slots.view(block_num, block_size) + + q_seqlens_list = step_context.q_seqlens.tolist() + kv_seqlens_list = step_context.kv_seqlens.tolist() + max_q_seq_len = max(q_seqlens_list) + max_kv_seq_len = max(kv_seqlens_list) + for i in range(step_context.q_start_loc.size(0)): - q_seq_len = int(step_context.q_seqlens[i]) - kv_seq_len = int(step_context.kv_seqlens[i]) + q_seq_len = q_seqlens_list[i] + kv_seq_len = kv_seqlens_list[i] + + # collect kv start indices. + history_length = kv_seq_len - q_seq_len + slot_tables = total_slots[step_context.block_offsets[i]].flatten() + slot_indices = [p for p in range(history_length, kv_seq_len)] + slots = slot_tables[slot_indices].reshape((-1, 1)) + kv_start_indices.append(slots) + + # collect attention mask of paged_prefill attention stage. if not (step_context.is_decoding or is_unpaged_prefill): single_attention_mask = torch.logical_not( torch.tril( - torch.ones(step_context.q_seqlens[i], + torch.ones(q_seq_len, step_context.block_offsets.shape[1] * block_size, dtype=torch.bool).cuda(), - diagonal=step_context.kv_seqlens[i] - - step_context.q_seqlens[i], + diagonal=kv_seq_len - q_seq_len, )) attention_mask.append(single_attention_mask) - history_length = kv_seq_len - q_seq_len - slot_tables = total_slots[step_context.block_offsets[i]].flatten() - slot_indices = [p for p in range(history_length, kv_seq_len)] - slots = slot_tables[slot_indices].reshape((-1, 1)) - kv_start_indices.append(slots) + kv_start_indices = torch.cat(kv_start_indices) + if step_context.is_decoding: + # prepare somae params of paged_decode attention stage. 
+
+            q_start_loc_cpu, q_seqlens_cpu = None, None
+            kv_seqlens_cpu = step_context.kv_seqlens.cpu()
+        elif is_unpaged_prefill:
+            # prepare some params of unpaged_prefill attention stage.
+            q_start_loc_cpu, kv_seqlens_cpu = None, None
+            q_seqlens_cpu = step_context.q_seqlens.cpu()
+            single_attention_mask = torch.logical_not(
+                torch.tril(
+                    torch.ones(max_q_seq_len, max_kv_seq_len,
+                               dtype=torch.bool).cuda(),
+                    diagonal=max_kv_seq_len - max_q_seq_len,
+                ))
+            attention_mask.append(single_attention_mask)
+        else:
+            # prepare some params of paged_prefill attention stage.
+            q_start_loc_cpu, q_seqlens_cpu = None, None
+            kv_seqlens_cpu = step_context.kv_seqlens.repeat_interleave(
+                step_context.q_seqlens, 0).cpu()
+            block_offsets_int32 = step_context.block_offsets.to(torch.int32)
+            step_context.block_offsets = block_offsets_int32.repeat_interleave(
+                step_context.q_seqlens, 0)
+            attention_mask = [
+                torch.cat([mask for mask in attention_mask]).unsqueeze(1)
+            ]
+
         attn_meta_cls = cls.get_attention_metadata_cls()
         attn_metadata = attn_meta_cls(
             step_context.is_decoding,

From b232c904c7c703603e348f57a5879462af965dbe Mon Sep 17 00:00:00 2001
From: AllentDan <41138331+AllentDan@users.noreply.github.com>
Date: Tue, 22 Oct 2024 18:38:52 +0800
Subject: [PATCH 016/122] Raise error if user used base template during chatting (#2618)

---
 lmdeploy/serve/async_engine.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
index ff13b79083..598977747c 100644
--- a/lmdeploy/serve/async_engine.py
+++ b/lmdeploy/serve/async_engine.py
@@ -455,6 +455,10 @@ async def _get_prompt_input(self,
             prompt = chat_template.messages2prompt(prompt,
                                                    sequence_start,
                                                    tools=tools)
+            if prompt is None:
+                raise ValueError(
+                    f'You are using base template to handle chat task. Please specify a `--chat-template` name chosen from `lmdeploy list` if you want to use OpenAI messages input.'
# noqa + ) input_ids = self.tokenizer.encode(prompt, add_bos=sequence_start) return {'prompt': prompt, 'input_ids': input_ids} From ffcc6104565e03a9c650c5cd66ef50ae49ce3107 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Wed, 23 Oct 2024 11:53:17 +0800 Subject: [PATCH 017/122] refine pre-post-process (#2632) * refine pre-post-process * pytorch engine uses modelagent stream * put num_appendable_ids.cuda() back --- lmdeploy/pytorch/engine/engine.py | 4 ++-- lmdeploy/pytorch/engine/logits_process.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index f6ce4c29a1..7ba33d73c4 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -162,7 +162,7 @@ def __init__(self, self.scheduler_config = scheduler_config self.cache_config = cache_config self.backend_config = backend_config - self.stream = torch.cuda.Stream() + self.stream = self.model_agent.stream self.req_manager = self._bind_request_manager() @@ -526,7 +526,7 @@ def __get_last_logits(): last_idx = seq_length.cumsum(-1) - 1 return logits[last_idx, :] - split_logits = __get_last_logits().cuda() + split_logits = __get_last_logits() logits_processor = FusedLogitsProcessor(sampling_inputs, ignore_eos, self.tokenizer.model.model) logits = logits_processor(all_ids, guided_input_ids, split_logits) diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py index 806ad91b86..2ee2eaced2 100644 --- a/lmdeploy/pytorch/engine/logits_process.py +++ b/lmdeploy/pytorch/engine/logits_process.py @@ -308,7 +308,6 @@ def __call__(self, all_ids: torch.LongTensor, """ sampling_inputs = self.sampling_inputs - scores = scores.clone() custom_logits_processors = self.sampling_inputs.logits_processors if any(custom_logits_processors): From 1530afe7bafb961ca258a7f563d1036b66cd64f6 Mon Sep 17 00:00:00 2001 From: yaofengchen <67218893+yao-fengchen@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:57:42 +0800 Subject: [PATCH 018/122] feat(ascend): support w4a16 (#2587) * feat(ascend): support w4a16 * refactor ascend awq_linear * update doc * refine code * update code * rebase main --- docs/en/supported_models/supported_models.md | 26 +++++------ .../supported_models/supported_models.md | 26 +++++------ lmdeploy/cli/lite.py | 5 +++ lmdeploy/lite/apis/auto_awq.py | 2 + .../pytorch/backends/dlinfer/awq_modules.py | 45 +++++++++++++++++++ .../pytorch/backends/dlinfer/op_backend.py | 3 ++ lmdeploy/pytorch/check_env/__init__.py | 45 ++++++++++--------- lmdeploy/pytorch/engine/engine.py | 3 +- lmdeploy/pytorch/kernels/dlinfer/__init__.py | 2 + .../pytorch/kernels/dlinfer/awq_kernels.py | 21 +++++++++ 10 files changed, 131 insertions(+), 47 deletions(-) create mode 100644 lmdeploy/pytorch/backends/dlinfer/awq_modules.py create mode 100644 lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 8626164e66..7a498247f3 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -87,16 +87,16 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha ## PyTorchEngine on Huawei Ascend Platform -| Model | Size | Type | FP16/BF16 | -| :------------: | :------: | :--: | :-------: | -| Llama2 | 7B - 70B | LLM | Yes | -| Llama3 | 8B | LLM | Yes | -| Llama3.1 | 8B | LLM | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | -| InternLM2.5 | 7B - 20B | LLM | Yes | -| Mixtral | 8x7B | LLM | Yes | -| QWen1.5-MoE | A2.7B | LLM | Yes | -| QWen2 | 7B | LLM | Yes | -| QWen2-MoE | A14.57B | LLM | Yes | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | +| Model | Size | Type | FP16/BF16 | W4A16 | +| :------------: | :------: | :--: | :-------: | :---: | +| Llama2 | 7B - 70B | LLM | Yes | Yes | +| Llama3 | 8B | LLM | Yes | Yes | +| Llama3.1 | 8B | LLM | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | +| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | +| Mixtral | 8x7B | LLM | Yes | No | +| QWen1.5-MoE | A2.7B | LLM | Yes | No | +| QWen2 | 7B | LLM | Yes | No | +| QWen2-MoE | A14.57B | LLM | Yes | No | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index c0cb6affb2..b362aa2050 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -87,16 +87,16 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att ## PyTorchEngine 华为昇腾平台 -| Model | Size | Type | FP16/BF16 | -| :------------: | :------: | :--: | :-------: | -| Llama2 | 7B - 70B | LLM | Yes | -| Llama3 | 8B | LLM | Yes | -| Llama3.1 | 8B | LLM | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | -| InternLM2.5 | 7B - 20B | LLM | Yes | -| Mixtral | 8x7B | LLM | Yes | -| QWen1.5-MoE | A2.7B | LLM | Yes | -| QWen2 | 7B | LLM | Yes | -| QWen2-MoE | A14.57B | LLM | Yes | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | +| Model | Size | Type | FP16/BF16 | W4A16 | +| :------------: | :------: | :--: | :-------: | :---: | +| Llama2 | 7B - 70B | LLM | Yes | Yes | +| Llama3 | 8B | LLM | Yes | Yes | +| Llama3.1 | 8B | LLM | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | +| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | +| Mixtral | 8x7B | LLM | Yes | No | +| QWen1.5-MoE | A2.7B | LLM | Yes | No | +| QWen2 | 7B | LLM | Yes | No | +| QWen2-MoE | A14.57B | LLM | Yes | No | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py index 1239f9d365..d76d6a5f34 100644 --- a/lmdeploy/cli/lite.py +++ b/lmdeploy/cli/lite.py @@ -35,6 +35,11 @@ def add_parser_auto_awq(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) + parser.add_argument( + '--device', + type=str, + default='cuda', + help='Device for weight quantization (cuda or npu)') parser.add_argument('--w-bits', type=int, default=4, diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index 6cd9b1fd42..d7d6a5560e 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -10,6 +10,7 @@ awq_layers, quant_weights, smooth_layers) from lmdeploy.lite.utils import collect_target_modules +from lmdeploy.pytorch.check_env import try_import_deeplink from .calibrate import LAYER_TYPE_MAP, NORM_TYPE_MAP, calibrate @@ -79,6 +80,7 @@ def auto_awq(model: str, download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface. 
""" + try_import_deeplink(device) if not osp.exists(model): print(f'can\'t find model from local_path {model}, ' 'try to download from remote') diff --git a/lmdeploy/pytorch/backends/dlinfer/awq_modules.py b/lmdeploy/pytorch/backends/dlinfer/awq_modules.py new file mode 100644 index 0000000000..7eb767f62f --- /dev/null +++ b/lmdeploy/pytorch/backends/dlinfer/awq_modules.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch + +from lmdeploy.pytorch.kernels.dlinfer import awq_linear + +from ..awq_modules import LinearW4A16Builder, LinearW4A16Impl + + +class AwqLinearW4A16Impl(LinearW4A16Impl): + """awq kernel linear.""" + + def __init__(self, in_features: int, out_features: int, w_bit: int, + group_size: int): + self.in_features = in_features + self.out_features = out_features + self.w_bit = w_bit + self.group_size = group_size + + def forward(self, + x, + qweight: torch.Tensor, + scales: torch.Tensor, + qzeros: torch.Tensor, + bias: Optional[torch.Tensor] = None, + all_reduce: bool = False): + """forward.""" + out = awq_linear(x, qweight, scales, qzeros, bias, all_reduce, + self.group_size) + return out + + +class AwqLinearW4A16Builder(LinearW4A16Builder): + """awq linear builder.""" + + @staticmethod + def build(in_features: int, + out_features: int, + w_bit: int, + group_size: int, + bias: bool = False, + dtype: torch.dtype = None): + """build.""" + return AwqLinearW4A16Impl(in_features, out_features, w_bit, group_size) diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py index 9ee1dd4773..124633f857 100644 --- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -37,6 +37,9 @@ def get_layer_impl_builder(cls, layer_type: OpType): elif layer_type == OpType.FusedMoE: from .moe import DlinferFusedMoEBuilder return DlinferFusedMoEBuilder + elif layer_type == OpType.LinearW4A16: + from .awq_modules import AwqLinearW4A16Builder + return AwqLinearW4A16Builder else: logger.debug( f'Op {layer_type} fallback to default implementation.') diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index 5ace70b53c..f5c7ea8d0b 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -31,6 +31,7 @@ def try_import_deeplink(device_type: str): """import dlinfer if specific device_type is set.""" deeplink_device_type_list = [ 'ascend', + 'npu', ] if device_type in deeplink_device_type_list: logger = get_logger('lmdeploy') @@ -118,30 +119,32 @@ def check_env(device_type: str): MAX_TRANSFORMERS_VERSION = '4.44.1' -def check_awq(hf_config): +def check_awq(hf_config, device_type): """check awq support.""" logger = get_logger('lmdeploy') - quantization_config = getattr(hf_config, 'quantization_config', dict()) - quant_method = quantization_config.get('quant_method', None) - if quant_method != 'awq': - return - try: - import awq # noqa - except Exception as e: - _handle_exception(e, 'autoawq', logger) + if device_type == 'cuda': + quantization_config = getattr(hf_config, 'quantization_config', dict()) + quant_method = quantization_config.get('quant_method', None) + if quant_method != 'awq': + return + try: + import awq # noqa + except Exception as e: + _handle_exception(e, 'autoawq', logger) - try: - import awq_ext # noqa - except Exception: - logger.debug('Exception:', exc_info=1) - logger.warning('Failed to import `awq_ext`. 
' - 'Try reinstall it from source: ' - 'https://github.com/casper-hansen/AutoAWQ_kernels') + try: + import awq_ext # noqa + except Exception: + logger.debug('Exception:', exc_info=1) + logger.warning('Failed to import `awq_ext`. ' + 'Try reinstall it from source: ' + 'https://github.com/casper-hansen/AutoAWQ_kernels') def check_transformers_version(model_path: str, trust_remote_code: bool = True, - dtype: str = 'auto'): + dtype: str = 'auto', + device_type: str = 'cuda'): """check transformers version.""" from packaging import version logger = get_logger('lmdeploy') @@ -226,16 +229,18 @@ def __check_model_dtype_support(config): config = __check_config(trans_version) __check_model_transformers_version(config, trans_version) __check_model_dtype_support(config) - check_awq(config) + check_awq(config, device_type) def check_model(model_path: str, trust_remote_code: bool = True, - dtype: str = 'auto'): + dtype: str = 'auto', + device_type: str = 'cuda'): """check model requirements.""" logger = get_logger('lmdeploy') logger.info('Checking model.') - check_transformers_version(model_path, trust_remote_code, dtype) + check_transformers_version(model_path, trust_remote_code, dtype, + device_type) def check_adapter(path: str): diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 7ba33d73c4..0ebb6f6310 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -96,7 +96,8 @@ def __init__(self, else: engine_config = copy.deepcopy(engine_config) check_env(engine_config.device_type) - check_model(model_path, trust_remote_code, engine_config.dtype) + check_model(model_path, trust_remote_code, engine_config.dtype, + engine_config.device_type) if engine_config.max_batch_size is None: engine_config.max_batch_size = get_max_batch_size( engine_config.device_type) diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 4d30e3b842..4d678bfe68 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from ..default import multinomial_sampling from .apply_rotary_pos_emb import apply_rotary_pos_emb +from .awq_kernels import awq_linear from .fill_kv_cache import fill_kv_cache from .fused_moe import fused_moe from .moe_gating_topk_softmax import moe_gating_topk_softmax @@ -10,6 +11,7 @@ __all__ = [ 'rms_norm', 'apply_rotary_pos_emb', + 'awq_linear', 'fill_kv_cache', 'fused_moe', 'paged_attention_fwd', diff --git a/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py b/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py new file mode 100644 index 0000000000..473e0404c9 --- /dev/null +++ b/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import dlinfer.ops as ext_ops +from torch import Tensor + + +def awq_linear(x: Tensor, + qweight: Tensor, + scales: Tensor, + qzeros: Tensor, + bias: Optional[Tensor] = None, + all_reduce: bool = False, + group_size: int = 0): + return ext_ops.weight_quant_matmul(x.squeeze(0), + qweight, + scales, + offset=qzeros, + bias=bias, + all_reduce=all_reduce, + group_size=group_size).unsqueeze(0) From 1bb7a9e960edc9f93d812fa2d52727435b103276 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 23 Oct 2024 13:44:43 +0800 Subject: [PATCH 019/122] [ci] React dailytest workflow (#2617) * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * Update daily_ete_test.yml * update --- .github/workflows/daily_ete_test.yml | 391 +++++++++--------- autotest/config.yaml | 7 + .../test_pipeline_chat_turbomind_vl.py | 4 + .../test_restful_chat_hf_turbomind_vl.py | 3 +- autotest/utils/config_utils.py | 36 +- 5 files changed, 224 insertions(+), 217 deletions(-) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 1cb242a74b..229fdd6ca6 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -17,12 +17,12 @@ on: required: true description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' type: string - default: "['turbomind', 'pytorch', 'turbomind-vl']" + default: "['turbomind', 'pytorch', 'turbomind_vl']" model: required: true description: 'Set testcase module filter: chat, restful, pipeline, quantization. 
Default contains all models' type: string - default: "['quantization','convert','pipeline','restful','chat','local_case']" + default: "['pipeline','restful','chat']" offline_mode: required: true description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' @@ -32,7 +32,7 @@ on: required: true description: 'regression functions' type: string - default: "['tools','restful','pipeline','benchmark','evaluation']" + default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" schedule: - cron: '00 16 * * 0-4' @@ -44,6 +44,10 @@ env: REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL jobs: linux-build: @@ -81,27 +85,16 @@ jobs: name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - test_tools: + download_pkgs: needs: linux-build - if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + if: ${{!cancelled()}} runs-on: [self-hosted, linux-a100] - timeout-minutes: 450 - env: - PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA - MODELSCOPE_CACHE: /root/modelscope_hub - MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + timeout-minutes: 50 container: image: openmmlab/lmdeploy:latest-cu11 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - - /nvme/github-actions/modelscope_hub:/root/modelscope_hub - - /nvme/github-actions/modelscope_modules:/root/modelscope_modules - - /nvme/github-actions/resources/lora:/root/lora - /nvme/qa_test_models:/nvme/qa_test_models - - /mnt/shared:/mnt/shared - - /nvme/qa_test_models/lmdeploy/autotest:/local_case - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Clone repository @@ -110,14 +103,48 @@ jobs: with: repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} - name: Copy repository - offline if: ${{inputs.offline_mode}} - run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. . + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} - name: Download Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 120 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu11 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | # manually install flash attn @@ -126,25 +153,18 @@ jobs: python3 -m pip install -e /root/packages/AutoAWQ_kernels python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps - python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - pip install /nvme/qa_test_models/offline_pkg/DeepSeek-VL --no-deps - - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} - run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - pip install /nvme/qa_test_models/offline_pkg/DeepSeek-VL --no-deps + pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | + pip install transformers pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env - cp -r /root/lora . 
rm -rf allure-results # remove tmp log in testcase rm -rf /nvme/qa_test_models/autotest_model/log/* @@ -152,104 +172,135 @@ jobs: ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - quantization w4a16 continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'quantization')) + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') run: | pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - convert continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'convert')) + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + needs: test_quantization + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 150 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend) || fromJSON('["turbomind", "pytorch", "turbomind_vl"]')}} + model: ${{ fromJSON(inputs.model) || fromJSON('["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind_vl + model: chat + include: + - backend: turbomind + model: local_case + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu11 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/github-actions/resources/lora:/root/lora + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install -e /root/packages/AutoAWQ_kernels + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + - name: Check env + run: | + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat workspace continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) + if: matrix.backend == 'turbomind' && matrix.model == 'chat' run: | pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - chat hf turbomind - continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) - run: | - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - chat hf torch - continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'chat')) - run: | - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline turbomind + - name: Test lmdeploy - chat continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' run: | - pytest 
autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline turbomind vl + - name: Test lmdeploy - pipeline continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + if: matrix.model == 'pipeline' run: | - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful turbomind + - name: Test lmdeploy - restful continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) + if: matrix.model == 'restful' run: | - pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful workspace continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) + if: matrix.backend == 'turbomind' && matrix.model == 'restful' run: | pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || 
true pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful turbomind vl - continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'restful')) - run: | - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline torch - continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline')) - run: | - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful torch - continue-on-error: true - if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'restful')) - run: | - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - local testcase - if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.model), 'local_case') + if: matrix.backend == 'turbomind' && matrix.model == 'local_case' run: | pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') @@ -266,12 +317,12 @@ jobs: test_restful: if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} runs-on: [self-hosted, linux-a100] - needs: test_tools + needs: test_quantization strategy: fail-fast: false matrix: backend: ['turbomind', 'pytorch'] - timeout-minutes: 300 + timeout-minutes: 60 container: image: openmmlab/lmdeploy:latest-cu11 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -281,42 +332,30 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ 
github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. . - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | # manually install flash attn # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt + python3 -m pip install -e /root/packages/AutoAWQ_kernels + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} - run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* mkdir ${{env.REPORT_DIR}}/.pytest_cache -p ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Start restful api turbomind @@ -374,8 +413,8 @@ jobs: test_pipeline: if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} runs-on: [self-hosted, linux-a100] - needs: test_tools - timeout-minutes: 300 + needs: test_quantization + timeout-minutes: 120 container: image: openmmlab/lmdeploy:latest-cu11 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -385,42 +424,30 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. . - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | # manually install flash attn # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt + python3 -m pip install -e /root/packages/AutoAWQ_kernels + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} - run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* mkdir ${{env.REPORT_DIR}}/.pytest_cache -p ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - interface pipeline case @@ -447,8 +474,8 @@ jobs: test_benchmark: if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} runs-on: [self-hosted, linux-a100] - needs: test_tools - timeout-minutes: 300 + needs: test_quantization + timeout-minutes: 120 container: image: openmmlab/lmdeploy:latest-cu11 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -458,41 +485,30 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. . - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | # manually install flash attn # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt + python3 -m pip install -e /root/packages/AutoAWQ_kernels + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} - run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Check env run: | pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* mkdir ${{env.REPORT_DIR}}/.pytest_cache -p ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test benchmark script @@ -513,8 +529,8 @@ jobs: test_evaluation: if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} runs-on: [self-hosted, linux-a100] - needs: test_tools - timeout-minutes: 300 # 5hours + needs: test_quantization + timeout-minutes: 120 # 5hours container: image: openmmlab/lmdeploy:latest-cu11 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -524,46 +540,26 @@ jobs: - /nvme/github-actions/resources:/root/resources - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - - /nvme/qa_test_models:/root/models - - /nvme/qa_test_models/offline_pkg:/nvme/qa_test_models/offline_pkg + - /nvme/qa_test_models:/nvme/qa_test_models - /mnt/shared:/mnt/shared - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - - name: Setup systems - run: | - export TIME_STAMP="$(date +'%Y%m%d-%H%M%S')" - echo "TIME_STAMP=$TIME_STAMP" >> $GITHUB_ENV - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. . - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy - dependency run: | # manually install flash attn # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install -e /root/packages/AutoAWQ_kernels + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps - python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - name: Install lmdeploy - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} - run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps - name: Install opencompass run: | git clone --depth=1 https://github.com/open-compass/opencompass.git @@ -575,12 +571,15 @@ jobs: pip uninstall -y nvidia-nccl-cu11 python3 -m pip list lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* mkdir ${{env.REPORT_DIR}}/.pytest_cache -p ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Setup paths for evaluation run: | ln -s /root/opencompass-data ./data - python3 .github/scripts/action_tools.py create_model_links /root/models . + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . - name: Evaluate models run: | export LMDEPLOY_DIR=$(pwd) @@ -629,30 +628,12 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. . - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . 
- name: Install lmdeploy - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} - run: | - python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - name: Get coverage report run: | pip install coverage diff --git a/autotest/config.yaml b/autotest/config.yaml index c0db4a71fc..4e4b20f206 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -12,6 +12,7 @@ tp_config: Qwen-VL-Chat: 2 llava-v1.5-13b: 2 internlm2_5-20b-chat: 2 + internlm2_5-20b: 2 Meta-Llama-3-1-70B-Instruct: 4 internlm2_5-7b-chat-1m: 4 Qwen2-7B-Instruct-GPTQ-Int4: 2 @@ -89,6 +90,7 @@ pytorch_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mixtral-8x7B-Instruct-v0.1 + - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-7b-it - google/gemma-2-9b-it - deepseek-ai/deepseek-moe-16b-chat @@ -151,6 +153,10 @@ turbomind_quatization: - codellama/CodeLlama-7b-Instruct-hf gptq: - internlm/internlm2_5-7b-chat + no_kvint4: + - openbmb/MiniCPM-V-2_6 + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat pytorch_quatization: awq: @@ -180,6 +186,7 @@ pytorch_quatization: - deepseek-ai/DeepSeek-V2-Lite-Chat - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct + - openbmb/MiniCPM-V-2_6 no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py index cee08308ff..3279495493 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py @@ -39,6 +39,8 @@ def test_pipeline_chat_tp2(config, model, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.parametrize('model', get_vl_model_list(tp_num=1, quant_policy=4)) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 4)) @@ -52,6 +54,8 @@ def test_pipeline_chat_kvint4_tp1(config, model, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.parametrize('model', get_vl_model_list(tp_num=2, quant_policy=4)) def test_pipeline_chat_kvint4_tp2(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py index 68c254d6b8..1248ec3d50 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py @@ -64,7 +64,8 @@ def getKvintModelList(tp_num, quant_policy: int = None): 'cuda_prefix': None, 'tp_num': tp_num, 'extra': f'--quant-policy {quant_policy}' - } for item in get_vl_model_list(tp_num, quant_policy)] + } for item in get_vl_model_list(tp_num, quant_policy) + if 'qwen2' not in item.lower() or quant_policy == 8] @pytest.mark.order(7) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 1f2c72b26b..ca041dc9a1 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py 
@@ -6,10 +6,19 @@ def get_turbomind_model_list(tp_num: int = None, - model_type: str = 'chat_model'): + model_type: str = 'chat_model', + quant_policy: int = None): config = get_config() - case_list = copy.deepcopy(config.get('turbomind_' + model_type)) + if quant_policy is None: + case_list = copy.deepcopy(config.get('turbomind_' + model_type)) + else: + case_list = [ + x for x in config.get('turbomind_' + model_type) + if x not in config.get('turbomind_quatization').get( + 'no_kvint' + str(quant_policy)) + ] + quatization_case_config = config.get('turbomind_quatization') for key in config.get('turbomind_' + model_type): if key not in quatization_case_config.get( @@ -76,7 +85,9 @@ def get_torch_model_list(tp_num: int = None, def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: str = 'chat_model'): - case_list = get_turbomind_model_list(tp_num=tp_num, model_type=model_type) + case_list = get_turbomind_model_list(tp_num=tp_num, + model_type=model_type, + quant_policy=quant_policy) for case in get_torch_model_list(tp_num=tp_num, quant_policy=quant_policy, model_type=model_type): @@ -107,22 +118,26 @@ def get_quantization_model_list(type): def get_vl_model_list(tp_num: int = None, quant_policy: int = None): config = get_config() - if quant_policy is None: case_list = copy.deepcopy(config.get('vl_model')) else: case_list = [ x for x in config.get('vl_model') - if x in config.get('turbomind_chat_model') or ( - x in config.get('pytorch_chat_model') and x not in config.get( - 'pytorch_quatization').get('no_kvint' + str(quant_policy))) + if (x in config.get('turbomind_chat_model') and x not in + config.get('turbomind_quatization').get('no_kvint' + + str(quant_policy))) or + (x in config.get('pytorch_chat_model') and x not in config.get( + 'pytorch_quatization').get('no_kvint' + str(quant_policy))) ] for key in config.get('vl_model'): if key in config.get('turbomind_chat_model') and key not in config.get( - 'turbomind_quatization').get( - 'no_awq') and not is_quantization_model( - key) and key + '-inner-4bits' not in case_list: + 'turbomind_quatization' + ).get('no_awq') and not is_quantization_model( + key) and key + '-inner-4bits' not in case_list and ( + quant_policy is not None + and key not in config.get('turbomind_quatization').get( + 'no_kvint' + str(quant_policy))): case_list.append(key + '-inner-4bits') if key in config.get('pytorch_chat_model') and key in config.get( 'pytorch_quatization' @@ -132,7 +147,6 @@ def get_vl_model_list(tp_num: int = None, quant_policy: int = None): and key not in config.get('pytorch_quatization').get( 'no_kvint' + str(quant_policy))): case_list.append(key + '-inner-4bits') - if tp_num is not None: return [ item for item in case_list if get_tp_num(config, item) == tp_num From a50555b18471cbc7f6ff2015ff7e33ba3c7f5ad9 Mon Sep 17 00:00:00 2001 From: q yao Date: Wed, 23 Oct 2024 15:51:52 +0800 Subject: [PATCH 020/122] small block_m for sm7.x (#2626) * small block_m for sm7.x * fix alibi --- lmdeploy/pytorch/kernels/cuda/alibi_pagedattention.py | 2 +- lmdeploy/pytorch/kernels/cuda/pagedattention.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/kernels/cuda/alibi_pagedattention.py b/lmdeploy/pytorch/kernels/cuda/alibi_pagedattention.py index 1e54b5c134..66a8442039 100644 --- a/lmdeploy/pytorch/kernels/cuda/alibi_pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/alibi_pagedattention.py @@ -12,7 +12,7 @@ assert triton.__version__ >= '2.1.0' -LOG2 = math.log(2) +LOG2: tl.constexpr = 
math.log(2) @triton.jit diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index d8e6ec5013..7790a44b19 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -621,6 +621,7 @@ def convert_pv(p, v): _convert_pv = None +_nv_cap = None # TODO: how to support inplace autotune? @@ -1099,9 +1100,10 @@ def paged_attention_fwd( max_seqlen (int): The max input length. BLOCK (int): The kernel block size. """ - global _convert_pv + global _convert_pv, _nv_cap if _convert_pv is None: nv_cap = torch.cuda.get_device_capability() + _nv_cap = nv_cap _convert_pv = _get_convert_pv(nv_cap) if kv_layout == 'bshd': @@ -1150,7 +1152,10 @@ def _get_block_d(Lk): is_decoding = q.shape[-3] == q_seqlens.size(0) if not is_decoding: BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lq) - BLOCK_M = max(16, min(BLOCK, 16384 // BLOCK_DMODEL)) + if _nv_cap[0] < 8: + BLOCK_M = max(16, min(BLOCK, 8192 // BLOCK_DMODEL)) + else: + BLOCK_M = max(16, min(BLOCK, 16384 // BLOCK_DMODEL)) num_warps = 4 num_stages = 2 kv_head = k.shape[h_dim] From cca7d363b65a264915af356d50d5eb948d0e5f56 Mon Sep 17 00:00:00 2001 From: q yao Date: Wed, 23 Oct 2024 15:52:58 +0800 Subject: [PATCH 021/122] Add distributed context in pytorch engine to support torchrun (#2615) --- lmdeploy/pytorch/backends/cuda/attention.py | 15 +---- lmdeploy/pytorch/distributed.py | 66 +++++++++++++++++++ lmdeploy/pytorch/engine/model_agent.py | 64 ++++++++++-------- lmdeploy/pytorch/models/cogvlm.py | 14 +--- lmdeploy/pytorch/models/deepseek.py | 14 +--- lmdeploy/pytorch/models/deepseek_v2.py | 18 +---- lmdeploy/pytorch/models/minicpm3.py | 6 +- lmdeploy/pytorch/models/qwen2_moe.py | 14 +--- lmdeploy/pytorch/nn/attention.py | 4 +- lmdeploy/pytorch/nn/linear.py | 3 +- lmdeploy/pytorch/nn/moe.py | 3 +- lmdeploy/pytorch/nn/utils.py | 16 ----- .../weight_loader/model_weight_loader.py | 14 +--- 13 files changed, 119 insertions(+), 132 deletions(-) create mode 100644 lmdeploy/pytorch/distributed.py diff --git a/lmdeploy/pytorch/backends/cuda/attention.py b/lmdeploy/pytorch/backends/cuda/attention.py index b43dd3d20f..de5c039b2d 100644 --- a/lmdeploy/pytorch/backends/cuda/attention.py +++ b/lmdeploy/pytorch/backends/cuda/attention.py @@ -1,20 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch -import torch.distributed as dist -from ..attention import AttentionBuilder, AttentionImpl, AttentionMetadata - - -def get_world_rank(): - """get current world size and rank.""" - world_size = 1 - rank = 0 +from lmdeploy.pytorch.distributed import get_world_rank - if dist.is_initialized(): - world_size = dist.get_world_size() - rank = dist.get_rank() - - return world_size, rank +from ..attention import AttentionBuilder, AttentionImpl, AttentionMetadata class TritonAttentionMetadata(AttentionMetadata): diff --git a/lmdeploy/pytorch/distributed.py b/lmdeploy/pytorch/distributed.py new file mode 100644 index 0000000000..f732a816ad --- /dev/null +++ b/lmdeploy/pytorch/distributed.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import threading +from contextlib import contextmanager +from dataclasses import dataclass + +from torch import distributed as dist + + +@dataclass +class DistContext: + rank: int = 0 + world_size: int = 1 + dist_group: dist.ProcessGroup = None + + +DefaultContext = DistContext() + + +class DistManager: + """distributed context manager.""" + + def __init__(self): + self.t_local = threading.local() + self.t_local.device_context = DefaultContext + + def current_context(self) -> DistContext: + """get current context.""" + return getattr(self.t_local, 'device_context', DefaultContext) + + def set_context(self, context: DistContext): + """set current context.""" + self.t_local.device_context = context + + @contextmanager + def context(self, context: DistContext): + """context manager.""" + origin_context = self.current_context() + self.set_context(context) + yield self + self.set_context(origin_context) + + +_DIST_MANAGER: DistManager = None + + +def get_dist_manager(): + """get device manager.""" + global _DIST_MANAGER + if _DIST_MANAGER is None: + _DIST_MANAGER = DistManager() + return _DIST_MANAGER + + +def get_world_rank(): + """get distributed world size and rank.""" + ctx = get_dist_manager().current_context() + world_size = ctx.world_size + rank = ctx.rank + + return world_size, rank + + +def get_process_group(): + """get process group.""" + ctx = get_dist_manager().current_context() + return ctx.dist_group diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 1daf614c8f..918e64e782 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -15,6 +15,7 @@ from ..backends import get_backend from ..config import BackendConfig, CacheConfig, ModelConfig from ..devices import DeviceContext, get_device_manager +from ..distributed import DistContext, get_dist_manager, get_world_rank from ..model_inputs import ModelInputs from ..models.patch import (add_adapters, build_patched_model, update_custom_module_map) @@ -81,9 +82,7 @@ def __adjust_block_size(): # TODO: support kernel with both large head dim and large block size. 
if model_config.k_head_dim >= 512 and cache_config.block_size > 32: cache_config.block_size = 32 - rank = 0 - if dist.is_initialized(): - rank = dist.get_rank() + _, rank = get_world_rank() if rank == 0: logger.warning( f'Update `block_size={cache_config.block_size}`' @@ -482,9 +481,11 @@ def _start_tp_process(proc_id: int, rank=rank, world_size=world_size, timeout=timedelta(days=35600)) + dist_ctx = DistContext(rank=rank, world_size=world_size) torch.cuda.set_device(rank) - with get_device_manager().context( - device_context), torch.inference_mode(): + with (get_dist_manager().context(dist_ctx), + get_device_manager().context(device_context), + torch.inference_mode()): args = args or tuple() kwargs = kwargs or dict() func(rank, *args, **kwargs) @@ -565,6 +566,7 @@ def __signal_term_handler(sig, frame): self.world_size = world_size self.backend_config = backend_config + self._dist_ctx = None self.mp_bar = self.mp_ctx.Barrier(world_size) self._start_sub_process(model_path, model_config=model_config, @@ -645,6 +647,8 @@ def _start_sub_process(self, model_path: str, model_config: ModelConfig, rank=rank, world_size=world_size, timeout=timedelta(days=35600)) + dist_ctx = DistContext(rank=rank, world_size=world_size) + self._dist_ctx = dist_ctx except Exception as e: from traceback import print_exc logger.error(f'Rank[{rank}] failed.') @@ -665,16 +669,17 @@ def _build_model( world_size: int, ): """build model.""" - rank = 0 - model, cache_engine, cache_config = _tp_build_model( - rank, - model_path=model_path, - model_config=model_config, - cache_config=cache_config, - backend_config=backend_config, - adapters=adapters, - world_size=world_size, - ) + with get_dist_manager().context(self._dist_ctx): + rank = 0 + model, cache_engine, cache_config = _tp_build_model( + rank, + model_path=model_path, + model_config=model_config, + cache_config=cache_config, + backend_config=backend_config, + adapters=adapters, + world_size=world_size, + ) return model, cache_engine, cache_config @@ -686,20 +691,21 @@ def get_block_numel(self): def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """forward impl.""" - self.mp_bar.wait() - rank = 0 - _broadcast_inputs(rank, [inputs, swap_in_map, swap_out_map], - self.stream) - cache_swapping(self.cache_engine, - swap_in_map=swap_in_map, - swap_out_map=swap_out_map) - output = model_forward( - self.patched_model, - inputs, - self.cache_engine, - world_size=1, - stream=self.stream, - ) + with get_dist_manager().context(self._dist_ctx): + self.mp_bar.wait() + rank = 0 + _broadcast_inputs(rank, [inputs, swap_in_map, swap_out_map], + self.stream) + cache_swapping(self.cache_engine, + swap_in_map=swap_in_map, + swap_out_map=swap_out_map) + output = model_forward( + self.patched_model, + inputs, + self.cache_engine, + world_size=1, + stream=self.stream, + ) return output def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py index 5023dd8e81..093b367ce2 100644 --- a/lmdeploy/pytorch/models/cogvlm.py +++ b/lmdeploy/pytorch/models/cogvlm.py @@ -6,6 +6,7 @@ from torch import nn from transformers.configuration_utils import PretrainedConfig +from lmdeploy.pytorch.distributed import get_world_rank from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding) @@ -16,19 +17,6 @@ from .utils.cudagraph import CudaGraphMixin -def 
get_world_rank(): - """get current world size and rank.""" - import torch.distributed as dist - world_size = 1 - rank = 0 - - if dist.is_initialized(): - world_size = dist.get_world_size() - rank = dist.get_rank() - - return world_size, rank - - class VisionExpertAttention(nn.Module): """Rewrite module of VisionExpertAttention.""" diff --git a/lmdeploy/pytorch/models/deepseek.py b/lmdeploy/pytorch/models/deepseek.py index 5ae59316e2..f4e80fb048 100644 --- a/lmdeploy/pytorch/models/deepseek.py +++ b/lmdeploy/pytorch/models/deepseek.py @@ -6,6 +6,7 @@ from torch import nn from transformers.configuration_utils import PretrainedConfig +from lmdeploy.pytorch.distributed import get_world_rank from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding) @@ -17,19 +18,6 @@ from .utils.cudagraph import CudaGraphMixin -def get_world_rank(): - """get current world size and rank.""" - import torch.distributed as dist - world_size = 1 - rank = 0 - - if dist.is_initialized(): - world_size = dist.get_world_size() - rank = dist.get_rank() - - return world_size, rank - - class DeepseekAttention(nn.Module): """Rewrite module of MistralAttention.""" diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py index 4e82a67abe..34debae229 100644 --- a/lmdeploy/pytorch/models/deepseek_v2.py +++ b/lmdeploy/pytorch/models/deepseek_v2.py @@ -6,6 +6,7 @@ import torch.distributed as dist from torch import nn +from lmdeploy.pytorch.distributed import get_world_rank from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding) @@ -25,19 +26,6 @@ def yarn_get_mscale(scale=1, mscale=1): return 0.1 * mscale * math.log(scale) + 1.0 -def get_world_rank(): - """get current world size and rank.""" - import torch.distributed as dist - world_size = 1 - rank = 0 - - if dist.is_initialized(): - world_size = dist.get_world_size() - rank = dist.get_rank() - - return world_size, rank - - class DeepseekV2BMM(nn.Module): """wrapped bmm.""" @@ -240,9 +228,7 @@ def forward( attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" - world_size = 1 - if dist.is_initialized(): - world_size = dist.get_world_size() + world_size, _ = get_world_rank() num_heads = self.num_heads // world_size nope_size = self.kv_lora_rank q_len = hidden_states.size(1) diff --git a/lmdeploy/pytorch/models/minicpm3.py b/lmdeploy/pytorch/models/minicpm3.py index 937a499ebd..56a1c4edf1 100644 --- a/lmdeploy/pytorch/models/minicpm3.py +++ b/lmdeploy/pytorch/models/minicpm3.py @@ -3,10 +3,10 @@ from typing import Any, Iterable, List, Optional, Tuple import torch -import torch.distributed as dist from torch import nn from transformers.configuration_utils import PretrainedConfig +from lmdeploy.pytorch.distributed import get_world_rank from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import (Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding) @@ -118,9 +118,7 @@ def forward( attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" - world_size = 1 - if dist.is_initialized(): - world_size = dist.get_world_size() + world_size, _ = get_world_rank() num_heads = self.num_heads // world_size bsz, q_len, _ = hidden_states.size() diff --git a/lmdeploy/pytorch/models/qwen2_moe.py 
b/lmdeploy/pytorch/models/qwen2_moe.py index 4f3406a5d8..fdaff8e0cc 100644 --- a/lmdeploy/pytorch/models/qwen2_moe.py +++ b/lmdeploy/pytorch/models/qwen2_moe.py @@ -7,6 +7,7 @@ from torch import nn from transformers.configuration_utils import PretrainedConfig +from lmdeploy.pytorch.distributed import get_world_rank from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding) @@ -18,19 +19,6 @@ from .utils.cudagraph import CudaGraphMixin -def get_world_rank(): - """get current world size and rank.""" - import torch.distributed as dist - world_size = 1 - rank = 0 - - if dist.is_initialized(): - world_size = dist.get_world_size() - rank = dist.get_rank() - - return world_size, rank - - class Qwen2MoeAttention(nn.Module): """Rewrite module of Qwen2MoeAttention.""" diff --git a/lmdeploy/pytorch/nn/attention.py b/lmdeploy/pytorch/nn/attention.py index 860bcfaaec..26f1034d36 100644 --- a/lmdeploy/pytorch/nn/attention.py +++ b/lmdeploy/pytorch/nn/attention.py @@ -2,9 +2,11 @@ import torch from torch import nn +from lmdeploy.pytorch.distributed import get_world_rank + from ..backends import OpType, get_backend from ..backends.attention import AttentionMetadata -from .utils import get_distribute_size, get_world_rank +from .utils import get_distribute_size class Attention(nn.Module): diff --git a/lmdeploy/pytorch/nn/linear.py b/lmdeploy/pytorch/nn/linear.py index 98d2fea741..08040ee00c 100644 --- a/lmdeploy/pytorch/nn/linear.py +++ b/lmdeploy/pytorch/nn/linear.py @@ -5,13 +5,14 @@ import torch.distributed as dist from torch import nn +from lmdeploy.pytorch.distributed import get_world_rank from lmdeploy.pytorch.weight_loader.model_weight_loader import \ default_weight_loader from lmdeploy.utils import get_logger from ..backends import OpType, get_backend from ..backends.lora import AdapterInfo -from .utils import div_up, get_distribute_size, get_world_rank +from .utils import div_up, get_distribute_size logger = get_logger('lmdeploy') diff --git a/lmdeploy/pytorch/nn/moe.py b/lmdeploy/pytorch/nn/moe.py index 378e8561d1..6467a6de08 100644 --- a/lmdeploy/pytorch/nn/moe.py +++ b/lmdeploy/pytorch/nn/moe.py @@ -5,8 +5,9 @@ import torch.distributed as dist from torch import nn +from lmdeploy.pytorch.distributed import get_world_rank + from ..backends import OpType, get_backend -from .utils import get_world_rank class SoftmaxTopK(nn.Module): diff --git a/lmdeploy/pytorch/nn/utils.py b/lmdeploy/pytorch/nn/utils.py index ad2fde818f..3289f858a7 100644 --- a/lmdeploy/pytorch/nn/utils.py +++ b/lmdeploy/pytorch/nn/utils.py @@ -1,25 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
- -import torch.distributed as dist - - def div_up(a: int, b: int): """div up.""" return (a + b - 1) // b -def get_world_rank(): - """get current world size and rank.""" - world_size = 1 - rank = 0 - - if dist.is_initialized(): - world_size = dist.get_world_size() - rank = dist.get_rank() - - return world_size, rank - - def get_distribute_size(feature_size: int, world_size: int, rank: int, diff --git a/lmdeploy/pytorch/weight_loader/model_weight_loader.py b/lmdeploy/pytorch/weight_loader/model_weight_loader.py index 76f7c741c5..cb548614c7 100644 --- a/lmdeploy/pytorch/weight_loader/model_weight_loader.py +++ b/lmdeploy/pytorch/weight_loader/model_weight_loader.py @@ -3,26 +3,16 @@ import os.path as osp import torch -import torch.distributed as dist from transformers.modeling_utils import load_state_dict from transformers.utils import (SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME) +from lmdeploy.pytorch.distributed import get_world_rank from lmdeploy.utils import get_logger logger = get_logger('lmdeploy') -def _get_world_rank(): - """get rank.""" - rank = 0 - world_size = 1 - if dist.is_initialized(): - rank = dist.get_rank() - world_size = dist.get_world_size() - return world_size, rank - - def load_weight(param: torch.nn.Parameter, loaded_weight: torch.Tensor, **kwargs): """load weight.""" @@ -141,7 +131,7 @@ def load_model_weights( """load model weights implementation.""" assert hasattr(model, 'load_weights') paths = self._shard_paths - world_size, rank = _get_world_rank() + world_size, rank = get_world_rank() for path in paths: # log From c887adc6b4738ee25c2ee3594a30754763661041 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:57:24 +0800 Subject: [PATCH 022/122] [ci] fix restful script (#2635) * updaste * update * update * Update stable.yml * Update stable.yml * update * update --- .github/scripts/action_tools.py | 25 ++++++++ .github/scripts/eval_stable_object_config.py | 58 ++++++++++++++----- .github/scripts/eval_stable_subject_config.py | 8 +-- .github/workflows/stable.yml | 13 +++-- autotest/utils/benchmark_utils.py | 2 +- 5 files changed, 76 insertions(+), 30 deletions(-) diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py index 589c5f1e33..e91e12dbb8 100644 --- a/.github/scripts/action_tools.py +++ b/.github/scripts/action_tools.py @@ -269,5 +269,30 @@ def generate_benchmark_report(report_path: str): _append_summary('## Benchmark Results End') +def generate_csv_from_profile_result(file_path: str, out_path: str): + with open(file_path, 'r') as f: + data = f.readlines() + data = [json.loads(line) for line in data] + + data_csv = [] + for item in data: + row = [ + item.get('request_rate'), + item.get('completed'), + round(item.get('completed') / item.get('duration'), 3), + round(item.get('median_ttft_ms'), 3), + round(item.get('output_throughput'), 3) + ] + data_csv.append(row) + import csv + with open(out_path, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'request_rate', 'completed', 'RPM', 'median_ttft_ms', + 'output_throughput' + ]) + writer.writerows(data_csv) + + if __name__ == '__main__': fire.Fire() diff --git a/.github/scripts/eval_stable_object_config.py b/.github/scripts/eval_stable_object_config.py index 53e46b87ae..2ce0fc827d 100644 --- a/.github/scripts/eval_stable_object_config.py +++ b/.github/scripts/eval_stable_object_config.py @@ -3,36 +3,62 @@ with read_base(): # choose a list of datasets - from 
opencompass.configs.datasets.bbh.bbh_gen_2879b0 import \ + from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \ + ARC_c_datasets # noqa: F401, E501 + from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \ bbh_datasets # noqa: F401, E501 - from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \ - ceval_datasets # noqa: F401, E501 - from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \ + from opencompass.configs.datasets.CHARM.charm_reason_cot_only_gen_f7b7d3 import \ + charm_reason_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \ cmmlu_datasets # noqa: F401, E501 - from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ - GaokaoBench_datasets # noqa: F401, E501 - from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import \ + from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \ + drop_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \ + ds1000_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ gpqa_datasets # noqa: F401, E501 - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \ + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ gsm8k_datasets # noqa: F401, E501 from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ hellaswag_datasets # noqa: F401, E501 - from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \ + from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \ humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import \ + humanevalx_datasets # noqa: F401, E501 from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ ifeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.LCBench.lcbench_gen_5ff288 import \ + LCBench_datasets # noqa: F401, E501 from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ math_datasets # noqa: F401, E501 - from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import \ + from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \ + mathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \ sanitized_mbpp_datasets # noqa: F401, E501 - from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \ + from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \ mmlu_datasets # noqa: F401, E501 - from opencompass.configs.datasets.nq.nq_gen_3dcea1 import \ - nq_datasets # noqa: F401, E501 - from opencompass.configs.datasets.race.race_gen_69ee4f import \ + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \ + mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_cot_gen_d95929 import \ race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \ + SciCode_datasets # noqa: F401, E501 + from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \ + BoolQ_datasets # noqa: F401, E501 + from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \ + teval_datasets as teval_en_datasets # noqa: F401, E501 + from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \ + teval_datasets as teval_zh_datasets # 
noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 + from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \ + wikibench_datasets # noqa: F401, E501 -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') + and 'scicode' not in k.lower() and 'teval' not in k), []) +datasets += teval_en_datasets +datasets += teval_zh_datasets +datasets += SciCode_datasets api_meta_template = dict( round=[ @@ -52,7 +78,7 @@ tokenizer_path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', rpm_verbose=True, meta_template=api_meta_template, - query_per_second=50, + query_per_second=100, max_out_len=1024, max_seq_len=4096, temperature=0.01, diff --git a/.github/scripts/eval_stable_subject_config.py b/.github/scripts/eval_stable_subject_config.py index abcfba3db4..4ab7e620a5 100644 --- a/.github/scripts/eval_stable_subject_config.py +++ b/.github/scripts/eval_stable_subject_config.py @@ -20,12 +20,6 @@ mtbench101_datasets # noqa: F401, E501 from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ wildbench_datasets # noqa: F401, E501 - from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ - TheoremQA_datasets # noqa: F401, E501 # noqa: F401, E501 - from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \ - triviaqa_datasets # noqa: F401, E501 # noqa: F401, E501 - from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ - winogrande_datasets # noqa: F401, E501 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') and 'wildbench' not in k), []) @@ -49,7 +43,7 @@ tokenizer_path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', rpm_verbose=True, meta_template=api_meta_template, - query_per_second=50, + query_per_second=100, max_out_len=1024, max_seq_len=4096, temperature=0.01, diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 98faf2ffa4..af5effec49 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -143,15 +143,16 @@ jobs: opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-3 - name: Test lmdeploy - restful api run: | - python3 benchmark/profile_restful_api.py --port 23344 --dataset-path /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10000 > ${{env.REPORT_DIR}}/stable.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-1.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-2.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-3.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-4.log - python3 
/nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 > ${{env.REPORT_DIR}}/stable-internal-5.log + python3 benchmark/profile_restful_api.py --backend lmdeploy --base-url http://localhost:23344 --dataset-path /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10000 --output-file ${{env.REPORT_DIR}}/stable.jsonl > ${{env.REPORT_DIR}}/stable.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-3.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-4.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-5.log - name: Attach result if: always() run: | + python3 .github/scripts/action_tools.py generate_csv_from_profile_result ${{env.REPORT_DIR}}/stable.jsonl ${{env.REPORT_DIR}}/stable.csv python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable.csv python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-1.csv python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-2.csv diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 24eb6c8f1c..0ea643524f 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -168,7 +168,7 @@ def restful_test(config, if not health_check(http_url): return False, 'server not start' - command = f'python3 benchmark/profile_restful_api.py --port {port} --tokenizer {model_path} --dataset-path {dataset_path}' # noqa: F401, E501 + command = f'python3 /nvme/qa_test_models/offline_pkg/profile_restful_api.py localhost:{port} {model_path} {dataset_path} --stream-output True ' # noqa: F401, E501 if is_smoke: command += ' --num-prompts 200' else: From 4492df812363d112aed9151a3ff7e26a654d36ea Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:01:03 +0800 Subject: [PATCH 023/122] [ci] add internlm2_5_7b_batch_1 into evaluation testcase (#2631) --- .github/scripts/eval_base_config.py | 6 ++++++ .github/workflows/daily_ete_test.yml | 13 ++++++++++++- 
.github/workflows/evaluate.yml | 25 ++++++++++++++----------- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 845286b87c..dc31293603 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -66,3 +66,9 @@ turbomind_qwen1_5_7b = deepcopy(*lmdeploy_qwen1_5_7b) turbomind_qwen2_7b = deepcopy(*lmdeploy_qwen2_7b) turbomind_internlm2_5_7b = deepcopy(*lmdeploy_internlm2_5_7b) +turbomind_internlm2_5_7b_batch1 = deepcopy(*lmdeploy_internlm2_5_7b) + +turbomind_internlm2_5_7b_batch1[ + 'abbr'] = turbomind_internlm2_5_7b_batch1['abbr'] + '_batch1' +turbomind_internlm2_5_7b_batch1['engine_config']['max_batch_size'] = 1 +turbomind_internlm2_5_7b_batch1['batch_size'] = 1 diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 229fdd6ca6..84fcaf5034 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -530,7 +530,11 @@ jobs: if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} runs-on: [self-hosted, linux-a100] needs: test_quantization - timeout-minutes: 120 # 5hours + timeout-minutes: 120 # 2hours + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] container: image: openmmlab/lmdeploy:latest-cu11 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -581,10 +585,17 @@ jobs: ln -s /root/opencompass-data ./data python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . - name: Evaluate models + if: matrix.evaluate_type == 'chat' run: | export LMDEPLOY_DIR=$(pwd) python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true - name: Clear workspace if: always() run: | diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 1e8d78d143..6b91cd2746 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -25,19 +25,24 @@ on: default: '[*mmlu_datasets, *gsm8k_datasets]' base_models: required: true - description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_qwen2_7b]' + description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]' type: string - default: '[turbomind_internlm2_5_7b, turbomind_qwen2_7b]' + default: '[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]' baes_datasets: required: true description: 'Tested datasets list. 
eg. [*mmlu_datasets, *gsm8k_datasets]' type: string default: '[*mmlu_datasets, *gsm8k_datasets]' - local_config: - required: true - description: 'Whether use local eval config' - type: boolean - default: false + oc_repo_org: + required: false + description: 'Tested repository organization name. Default is open-compass/opencompass' + type: string + default: 'open-compass/opencompass' + oc_repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' offline_mode: required: true description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' @@ -143,8 +148,9 @@ jobs: python3 -m pip install -r requirements/test.txt - name: Install opencompass run: | - git clone --depth=1 https://github.com/open-compass/opencompass.git + git clone https://github.com/${{ github.event.inputs.oc_repo_org}}.git cd opencompass + git checkout ${{ github.event.inputs.oc_repo_ref}} python3 -m pip install -e . echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - name: Check env @@ -156,9 +162,6 @@ jobs: run: | ln -s /root/opencompass-data ./data python3 .github/scripts/action_tools.py create_model_links /root/models . - - name: Use local config - if: ${{inputs.local_config}} - run: cp /root/models/offline_pkg/eval_config.py .github/scripts/eval_opencompass_config.py - name: Evaluate chat models if: matrix.evaluate_type == 'chat' run: | From 321b9406041ae18db3f9bec7f81d89410097534e Mon Sep 17 00:00:00 2001 From: zhoushenglong <87467364+Reinerzhou@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:17:09 +0800 Subject: [PATCH 024/122] [maca] add maca backend support. (#2636) * refactor: add maca backend. * adjust prefill_attention paras. * set a bigger bs for maca. * fix update_step_context. --- lmdeploy/cli/utils.py | 2 +- lmdeploy/messages.py | 2 +- lmdeploy/pytorch/backends/dlinfer/__init__.py | 1 + .../pytorch/backends/dlinfer/maca/__init__.py | 2 + .../backends/dlinfer/maca/op_backend.py | 111 ++++++++++++++++++ lmdeploy/pytorch/backends/selector.py | 3 + lmdeploy/pytorch/check_env/__init__.py | 1 + lmdeploy/pytorch/models/module_map.py | 4 +- lmdeploy/utils.py | 4 +- 9 files changed, 126 insertions(+), 4 deletions(-) create mode 100644 lmdeploy/pytorch/backends/dlinfer/maca/__init__.py create mode 100644 lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index baf2b3dc8e..44ce718b53 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -367,7 +367,7 @@ def calib_search_scale(parser): @staticmethod def device(parser, default: str = 'cuda', - choices: List[str] = ['cuda', 'ascend']): + choices: List[str] = ['cuda', 'ascend', 'maca']): """Add argument device to parser.""" return parser.add_argument('--device', diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 38c2153669..90823598ea 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -291,7 +291,7 @@ def __post_init__(self): assert self.num_gpu_blocks >= 0, 'invalid num_gpu_blocks' assert self.quant_policy in (0, 4, 8), 'invalid quant_policy' assert self.device_type in [ - 'cuda', 'ascend' + 'cuda', 'ascend', 'maca' ], (f'invalid device_type: {self.device_type}') if self.quant_policy > 0 and self.device_type != 'cuda': assert False, 'kv cache quantization only works for CUDA.' 
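The hunks above only whitelist `maca` as an accepted `device_type` string in the CLI and in the engine config check; the actual kernels come from the dlinfer backend added in the files that follow. Below is a minimal usage sketch (not part of this patch) of how the new value would be selected from the Python API, assuming the public `pipeline`/`PytorchEngineConfig` entry points that the `--device` flag above maps onto; the model name is only a placeholder.

```python
# Hypothetical sketch: run the PyTorch engine on a MACA device.
# 'maca' is the device_type value whitelisted in lmdeploy/messages.py above;
# the model path below is just an example and not prescribed by this patch.
from lmdeploy import PytorchEngineConfig, pipeline

if __name__ == '__main__':
    backend_config = PytorchEngineConfig(device_type='maca')
    pipe = pipeline('internlm/internlm2_5-7b-chat',
                    backend_config=backend_config)
    # pipeline() returns a list of Response objects, one per prompt
    print(pipe(['Hi, please introduce yourself']))
```

Note that, per the assertion kept in `messages.py` above, `quant_policy` must stay at its default of 0 on non-CUDA devices, so no kv-cache quantization options are passed here.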
diff --git a/lmdeploy/pytorch/backends/dlinfer/__init__.py b/lmdeploy/pytorch/backends/dlinfer/__init__.py index 947e66e0ce..af3ccff085 100644 --- a/lmdeploy/pytorch/backends/dlinfer/__init__.py +++ b/lmdeploy/pytorch/backends/dlinfer/__init__.py @@ -1,2 +1,3 @@ # Copyright (c) OpenMMLab. All rights reserved. from .ascend import AscendOpsBackend # noqa: F401 +from .maca import MacaOpsBackend # noqa: F401 diff --git a/lmdeploy/pytorch/backends/dlinfer/maca/__init__.py b/lmdeploy/pytorch/backends/dlinfer/maca/__init__.py new file mode 100644 index 0000000000..844b45cedf --- /dev/null +++ b/lmdeploy/pytorch/backends/dlinfer/maca/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .op_backend import MacaOpsBackend # noqa: F401 diff --git a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py new file mode 100644 index 0000000000..084cae1bfe --- /dev/null +++ b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch + +from lmdeploy.utils import get_logger + +from ..op_backend import DlinferOpsBackend + +logger = get_logger('lmdeploy') + + +class MacaOpsBackend(DlinferOpsBackend): + """maca layer backend.""" + + @staticmethod + def get_name() -> str: + """backend name.""" + return 'maca' + + @staticmethod + def get_k_block_shape( + block_size: int, + num_heads: int, + head_size: int, + dtype: torch.dtype, + ) -> Tuple[int, ...]: + if head_size == 576: + x = 16 + return (num_heads, head_size // x, block_size, x) + return (num_heads, block_size, head_size) + + @staticmethod + def get_v_block_shape( + block_size: int, + num_heads: int, + head_size: int, + dtype: torch.dtype, + ) -> Tuple[int, ...]: + return (num_heads, block_size, head_size) + + @classmethod + def update_step_context(cls, step_context): + """update step context.""" + kv_start_indices, attention_mask = [], [] + block_num, _, block_size, _ = step_context.kv_caches[0][0].shape + device = step_context.block_offsets.device + + is_unpaged_prefill = False + q_start_loc = torch.cat((torch.tensor([0], device=device), + step_context.q_seqlens.cumsum(0))).int() + q_seqlens = step_context.q_seqlens.int() + kv_seqlens = step_context.kv_seqlens.int() + max_q_seq_len = torch.max(q_seqlens).item() + max_kv_seq_len = torch.max(kv_seqlens).item() + + if not step_context.is_decoding: + is_unpaged_prefill = \ + all((step_context.q_seqlens == + step_context.kv_seqlens).tolist()) + if is_unpaged_prefill: + single_attention_mask = torch.logical_not( + torch.tril( + torch.ones(max_q_seq_len, + max_kv_seq_len, + dtype=torch.bool).cuda(), + diagonal=max_kv_seq_len - max_q_seq_len, + )) + attention_mask.append(single_attention_mask) + total_slots = torch.arange(block_num * block_size, + dtype=torch.long, + device=device) + total_slots = total_slots.view(block_num, block_size) + for i in range(step_context.q_start_loc.size(0)): + q_seq_len = int(step_context.q_seqlens[i]) + kv_seq_len = int(step_context.kv_seqlens[i]) + if not (step_context.is_decoding or is_unpaged_prefill): + single_attention_mask = torch.logical_not( + torch.tril( + torch.ones(step_context.q_seqlens[i], + step_context.block_offsets.shape[1] * + block_size, + dtype=torch.bool).cuda(), + diagonal=step_context.kv_seqlens[i] - + step_context.q_seqlens[i], + )) + attention_mask.append(single_attention_mask) + history_length = kv_seq_len - q_seq_len + slot_tables = 
total_slots[step_context.block_offsets[i]].flatten() + slot_indices = [p for p in range(history_length, kv_seq_len)] + slots = slot_tables[slot_indices].reshape((-1, 1)) + kv_start_indices.append(slots) + kv_start_indices = torch.cat(kv_start_indices) + + attn_meta_cls = cls.get_attention_metadata_cls() + attn_metadata = attn_meta_cls( + step_context.is_decoding, + step_context.block_offsets.int(), + q_start_loc=q_start_loc, + q_seqlens=q_seqlens, + kv_seqlens=kv_seqlens, + kv_start_indices=kv_start_indices, + block_size=block_size, + attention_mask=attention_mask, + is_unpaged_prefill=is_unpaged_prefill, + max_q_seq_len=max_q_seq_len, + max_kv_seq_len=max_kv_seq_len, + ) + + step_context.attn_metadata = attn_metadata + return step_context diff --git a/lmdeploy/pytorch/backends/selector.py b/lmdeploy/pytorch/backends/selector.py index 1ac85de0cb..987730a981 100644 --- a/lmdeploy/pytorch/backends/selector.py +++ b/lmdeploy/pytorch/backends/selector.py @@ -15,5 +15,8 @@ def get_backend(): if device_type == 'ascend': from .dlinfer import AscendOpsBackend return AscendOpsBackend + if device_type == 'maca': + from .dlinfer import MacaOpsBackend + return MacaOpsBackend else: raise RuntimeError(f'Unsupported device type: {device_type}') diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index f5c7ea8d0b..2b4b3cc521 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -32,6 +32,7 @@ def try_import_deeplink(device_type: str): deeplink_device_type_list = [ 'ascend', 'npu', + 'maca', ] if device_type in deeplink_device_type_list: logger = get_logger('lmdeploy') diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py index 8d29b51efb..71c7bca238 100644 --- a/lmdeploy/pytorch/models/module_map.py +++ b/lmdeploy/pytorch/models/module_map.py @@ -5,8 +5,10 @@ # ascend module MODULE_MAP = dict() ASCEND_MODULE_MAP = dict() +MACA_MODULE_MAP = dict() -DEVICE_SPECIAL_MODULE_MAP = dict(ascend=ASCEND_MODULE_MAP) +DEVICE_SPECIAL_MODULE_MAP = dict(ascend=ASCEND_MODULE_MAP, + maca=MACA_MODULE_MAP) # llama MODULE_MAP.update({ diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py index a540b73916..affbac8073 100644 --- a/lmdeploy/utils.py +++ b/lmdeploy/utils.py @@ -332,7 +332,7 @@ def get_max_batch_size(device_type: str): Args: device_type (str): the type of device """ - assert device_type in ['cuda', 'ascend'] + assert device_type in ['cuda', 'ascend', 'maca'] if device_type == 'cuda': max_batch_size_map = { 'a100': 256, @@ -350,3 +350,5 @@ def get_max_batch_size(device_type: str): return 128 elif device_type == 'ascend': return 16 + elif device_type == 'maca': + return 128 From f4e0343de28c2e21995762a0c809fda118af4188 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:17:44 +0800 Subject: [PATCH 025/122] Support mllama for pytorch engine (#2605) * support cross-cache * Support mllama in pytorch engine * add rewrite to support accelerate pp * update cross kv lens accordingly * fix ut and fill cache index * remove mllama.py * fix pure text input error * another cat for tp * fix * add no split module for 90B * refine * update supported_models * strict check * update image inputs * handle image with shape 1x1 * use config device --------- Co-authored-by: grimoire --- docs/en/multi_modal/index.rst | 1 + docs/en/multi_modal/mllama.md | 67 ++ docs/en/supported_models/supported_models.md | 1 + docs/zh_cn/multi_modal/index.rst | 1 + 
docs/zh_cn/multi_modal/mllama.md | 66 ++ .../supported_models/supported_models.md | 1 + lmdeploy/archs.py | 2 +- lmdeploy/model.py | 3 + lmdeploy/pytorch/backends/attention.py | 1 + lmdeploy/pytorch/backends/cuda/attention.py | 36 +- lmdeploy/pytorch/backends/cuda/op_backend.py | 18 + lmdeploy/pytorch/configurations/mllama.py | 18 + lmdeploy/pytorch/engine/engine.py | 17 +- lmdeploy/pytorch/engine/engine_instance.py | 3 +- lmdeploy/pytorch/messages.py | 38 +- lmdeploy/pytorch/model_inputs.py | 10 + lmdeploy/pytorch/models/mllama.py | 758 ++++++++++++++++++ lmdeploy/pytorch/models/module_map.py | 6 + lmdeploy/pytorch/models/utils/cudagraph.py | 9 + .../block_manager/base_block_manager.py | 5 - .../block_manager/default_block_manager.py | 15 +- lmdeploy/pytorch/supported_models.py | 2 + lmdeploy/serve/vl_async_engine.py | 14 +- lmdeploy/utils.py | 2 +- lmdeploy/vl/model/builder.py | 1 + lmdeploy/vl/model/mllama.py | 286 +++++++ lmdeploy/vl/templates.py | 10 + tests/pytorch/paging/test_block_manager.py | 20 + 28 files changed, 1367 insertions(+), 44 deletions(-) create mode 100644 docs/en/multi_modal/mllama.md create mode 100644 docs/zh_cn/multi_modal/mllama.md create mode 100644 lmdeploy/pytorch/configurations/mllama.py create mode 100644 lmdeploy/pytorch/models/mllama.py create mode 100644 lmdeploy/vl/model/mllama.py diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst index 4218e5c2eb..62f724070f 100644 --- a/docs/en/multi_modal/index.rst +++ b/docs/en/multi_modal/index.rst @@ -11,3 +11,4 @@ Vision-Language Models cogvlm.md minicpmv.md phi3.md + mllama.md diff --git a/docs/en/multi_modal/mllama.md b/docs/en/multi_modal/mllama.md new file mode 100644 index 0000000000..103aaed2ff --- /dev/null +++ b/docs/en/multi_modal/mllama.md @@ -0,0 +1,67 @@ +# Mllama + +## Introduction + +[Llama3.2-VL](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf) is a family of large language and multi-modal models from Meta. + +We will demonstrate how to deploy an Llama3.2-VL model using LMDeploy, with [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) as an example. + +## Installation + +Please install LMDeploy by following the [installation guide](../get_started/installation.md). + +## Offline inference + +The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('meta-llama/Llama-3.2-11B-Vision-Instruct') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` + +## Online serving + +### Launch Service + +You can launch the server by the `lmdeploy serve api_server` CLI: + +```shell +lmdeploy serve api_server meta-llama/Llama-3.2-11B-Vision-Instruct +``` + +### Integrate with `OpenAI` + +Here is an example of interaction with the endpoint `v1/chat/completions` service via the openai package. 
+Before running it, please install the openai package by `pip install openai` + +```python +from openai import OpenAI + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': + 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg', + }, + }], + }], + temperature=0.8, + top_p=0.8) +print(response) +``` diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 7a498247f3..c992f730c8 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -49,6 +49,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | +| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | diff --git a/docs/zh_cn/multi_modal/index.rst b/docs/zh_cn/multi_modal/index.rst index 2307127e75..0942d8d31c 100644 --- a/docs/zh_cn/multi_modal/index.rst +++ b/docs/zh_cn/multi_modal/index.rst @@ -11,3 +11,4 @@ cogvlm.md minicpmv.md phi3.md + mllama.md diff --git a/docs/zh_cn/multi_modal/mllama.md b/docs/zh_cn/multi_modal/mllama.md new file mode 100644 index 0000000000..31ee99754f --- /dev/null +++ b/docs/zh_cn/multi_modal/mllama.md @@ -0,0 +1,66 @@ +# Mllama + +## 简介 + +[Llama3.2-VL](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf) 是 Meta 发布的新视觉模型。 + +本文将以[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)为例,演示使用 LMDeploy 部署 Mllama 系列多模态模型的方法 + +## 安装 + +请参考[安装文档](../get_started/installation.md)安装 LMDeploy。 + +## 离线推理 pipeline + +以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('meta-llama/Llama-3.2-11B-Vision-Instruct') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` + +## 在线服务 + +### 服务启动 + +你可以通过 `lmdeploy serve api_server` CLI 工具启动服务: + +```shell +lmdeploy serve api_server meta-llama/Llama-3.2-11B-Vision-Instruct +``` + +### 使用 openai 接口 + +以下代码是通过 openai 包使用 `v1/chat/completions` 服务的例子。运行之前,请先安装 openai 包: `pip install openai`。 + +```python +from openai import OpenAI + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': + 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg', + }, + }], + }], + temperature=0.8, + top_p=0.8) +print(response) +``` diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index b362aa2050..695103b52e 
100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -49,6 +49,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | +| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py index 9b5beaecb7..8284c99741 100644 --- a/lmdeploy/archs.py +++ b/lmdeploy/archs.py @@ -121,7 +121,7 @@ def check_vl_llm(config: dict) -> bool: 'InternVLChatModel', 'MiniGeminiLlamaForCausalLM', 'MGMLlamaForCausalLM', 'MiniCPMV', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration', 'Phi3VForCausalLM', - 'Qwen2VLForConditionalGeneration' + 'Qwen2VLForConditionalGeneration', 'MllamaForConditionalGeneration' ]) if arch == 'QWenLMHeadModel' and 'visual' in config: return True diff --git a/lmdeploy/model.py b/lmdeploy/model.py index d5f47a2fc2..f251ca18d2 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -886,6 +886,9 @@ def match(cls, model_path: str) -> Optional[str]: if 'llama-3.1-' in model_path.lower( ) or 'llama3.1-' in model_path.lower(): return 'llama3_1' + if 'llama-3.2-' in model_path.lower( + ) or 'llama3.2-' in model_path.lower(): + return 'llama3_1' @MODELS.register_module(name='minicpmv-2d6') diff --git a/lmdeploy/pytorch/backends/attention.py b/lmdeploy/pytorch/backends/attention.py index fccb8f3c74..92a0befbf4 100644 --- a/lmdeploy/pytorch/backends/attention.py +++ b/lmdeploy/pytorch/backends/attention.py @@ -14,6 +14,7 @@ class AttentionMetadata: q_start_loc: torch.Tensor = None q_seqlens: torch.Tensor = None kv_seqlens: torch.Tensor = None + fill_seqlens: torch.Tensor = None quant_policy: Literal[0, 4, 8] = 0 diff --git a/lmdeploy/pytorch/backends/cuda/attention.py b/lmdeploy/pytorch/backends/cuda/attention.py index de5c039b2d..a0148c8782 100644 --- a/lmdeploy/pytorch/backends/cuda/attention.py +++ b/lmdeploy/pytorch/backends/cuda/attention.py @@ -66,26 +66,34 @@ def forward( block_offsets = attn_metadata.block_offsets q_start_loc = attn_metadata.q_start_loc + fill_q_start_loc = q_start_loc q_seqlens = attn_metadata.q_seqlens + fill_seqlens = q_seqlens kv_seqlens = attn_metadata.kv_seqlens quant_policy = attn_metadata.quant_policy max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2)) + fill_max_q_seqlen = max_q_seqlen + if attn_metadata.fill_seqlens is not None: + fill_seqlens = attn_metadata.fill_seqlens + fill_max_q_seqlen = key.numel() // (key.size(-1) * key.size(-2)) + fill_q_start_loc = fill_seqlens.cumsum(0) - fill_seqlens # fill kv cache - self.fill_kv_cache( - key, - value, - k_cache, - v_cache, - q_start_loc, - q_seqlens, - kv_seq_length=kv_seqlens, - max_q_seq_length=max_q_seqlen, - block_offsets=block_offsets, - k_scales_zeros=k_scales_zeros, - v_scales_zeros=v_scales_zeros, - quant_policy=quant_policy, - ) + if key is not None and value is not None: + self.fill_kv_cache( + key, + value, + k_cache, + v_cache, + fill_q_start_loc, + fill_seqlens, + kv_seq_length=kv_seqlens, + max_q_seq_length=fill_max_q_seqlen, + block_offsets=block_offsets, + k_scales_zeros=k_scales_zeros, + v_scales_zeros=v_scales_zeros, + quant_policy=quant_policy, + ) if inplace: attn_output = query[..., :self.v_head_size] diff 
--git a/lmdeploy/pytorch/backends/cuda/op_backend.py b/lmdeploy/pytorch/backends/cuda/op_backend.py index af93aac5c9..c01b5f093a 100644 --- a/lmdeploy/pytorch/backends/cuda/op_backend.py +++ b/lmdeploy/pytorch/backends/cuda/op_backend.py @@ -113,7 +113,25 @@ def update_step_context(cls, step_context): quant_policy=step_context.kv_quant_policy, ) + cross_attn_metadata = None + fill_seqlens = None + if step_context.cross_attention_states is not None: + fill_seqlens = torch.zeros_like(q_seqlens) + for idx, state in enumerate(step_context.cross_attention_states): + if state is not None: + fill_seqlens[idx] = state.shape[-2] + cross_attn_metadata = attn_meta_cls( + step_context.is_decoding, + step_context.block_offsets, + q_start_loc=q_start_loc, + q_seqlens=q_seqlens, + kv_seqlens=step_context.cross_kv_seqlens, + fill_seqlens=fill_seqlens, + quant_policy=step_context.kv_quant_policy, + ) + step_context.attn_metadata = attn_metadata + step_context.cross_attn_metadata = cross_attn_metadata return step_context @staticmethod diff --git a/lmdeploy/pytorch/configurations/mllama.py b/lmdeploy/pytorch/configurations/mllama.py new file mode 100644 index 0000000000..2383c92c50 --- /dev/null +++ b/lmdeploy/pytorch/configurations/mllama.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import AutoModelConfigBuilder +from .default import DefaultModelConfigBuilder + + +class MLlamaModelConfigBuilder(AutoModelConfigBuilder): + + @classmethod + def condition(cls, hf_config): + """config.""" + return hf_config.architectures[0] == 'MllamaForConditionalGeneration' + + @classmethod + def build(cls, hf_config, model_path: str = None): + """build llava hf.""" + cfg = DefaultModelConfigBuilder.build(hf_config.text_config) + cfg.hf_config = hf_config + return cfg diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 0ebb6f6310..cffe13bbdb 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -342,6 +342,8 @@ def __update_max_new_tokens(msg): input_embeddings=req.data.get('input_embeddings'), mrope_position_ids=req.data.get('mrope_position_ids'), mrope_position_delta=req.data.get('mrope_position_delta'), + cross_attention_states=req.data.get( + 'cross_attention_states'), ) msg = next(iter(sess.sequences.values())) __update_bad_words(msg) @@ -350,7 +352,8 @@ def __update_max_new_tokens(msg): else: msg = next(iter(sess.sequences.values())) msg.update_token_ids(req.data['token_ids'], - req.data.get('input_embeddings')) + req.data.get('input_embeddings'), + req.data.get('cross_attention_states')) msg.num_new_tokens = 0 msg.sampling_param = req.data['sampling_param'] msg.return_logits = req.data.get('return_logits', False) @@ -484,6 +487,16 @@ def __get_mrope_inputs(): input_embedding_indexing=input_embedding_indexing, input_embedding_ranges=input_embedding_ranges) + # only for mllama + cross_attention_states = None + history_cross_kv_seqlens = None + if any([msg.cross_attention_states is not None for msg in messages]): + cross_attention_states = [ + msg.cross_attention_states for msg in messages + ] + history_cross_kv_seqlens = torch.tensor( + [msg.history_cross_kv_seqlens for msg in messages]) + return ModelInputs( input_ids=input_ids, seq_length=seq_length, @@ -494,6 +507,8 @@ def __get_mrope_inputs(): local_adapter_ids=local_adapter_ids, vision_inputs=vision_embedding_inputs, mrope_inputs=mrope_inputs, + cross_attention_states=cross_attention_states, + history_cross_kv_seqlens=history_cross_kv_seqlens, ) def 
_batch_stopping_criteria(self, token_ids: torch.Tensor, diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index 0e8a1ff6b5..3e741c7ba2 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -168,7 +168,8 @@ async def async_stream_infer( adapter_name=adapter_name, input_embeddings=input_embeddings_new, mrope_position_ids=kwargs.get('mrope_position_ids'), - mrope_position_delta=kwargs.get('mrope_position_delta')) + mrope_position_delta=kwargs.get('mrope_position_delta'), + cross_attention_states=kwargs.get('cross_attention_states')) req_id = await self.req_sender.async_send_async( RequestType.ADD_MESSAGE, msg) diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py index e49918fef2..6331577548 100644 --- a/lmdeploy/pytorch/messages.py +++ b/lmdeploy/pytorch/messages.py @@ -200,15 +200,15 @@ def __init__(self, self.seq_manager = seq_manager def add_sequence( - self, - token_ids: Tensor, - sampling_param: SamplingParam = None, - adapter_name: str = None, - return_logits: bool = False, - input_embeddings: List[InputEmbeddings] = None, - mrope_position_ids: Tensor = None, - mrope_position_delta: Tensor = None, - ) -> 'SchedulerSequence': + self, + token_ids: Tensor, + sampling_param: SamplingParam = None, + adapter_name: str = None, + return_logits: bool = False, + input_embeddings: List[InputEmbeddings] = None, + mrope_position_ids: Tensor = None, + mrope_position_delta: Tensor = None, + cross_attention_states: Tensor = None) -> 'SchedulerSequence': """Add a new message.""" if isinstance(token_ids, Tensor): token_ids = token_ids.numpy() @@ -231,6 +231,7 @@ def add_sequence( return_logits=return_logits, mrope_position_ids=mrope_position_ids, mrope_position_delta=mrope_position_delta, + cross_attention_states=cross_attention_states, ) self.sequences[seq.seq_id] = seq if self.seq_manager is not None: @@ -383,6 +384,8 @@ class SchedulerSequence: num_ignored_history: int = 0 mrope_position_ids: Optional[Tensor] = None mrope_position_delta: Optional[int] = None + cross_attention_states: Optional[Tensor] = None + history_cross_kv_seqlens: Optional[int] = None def __post_init__(self): """post init.""" @@ -484,10 +487,25 @@ def num_all_tokens(self): """num all tokens.""" return self.num_all_ids + def num_all_cross_tokens(self): + """num of all cross tokens.""" + if self.history_cross_kv_seqlens is None: + if self.cross_attention_states is None: + self.history_cross_kv_seqlens = 0 + else: + self.history_cross_kv_seqlens = self.cross_attention_states.shape[ # noqa + -2] + return self.history_cross_kv_seqlens + def update_token_ids(self, token_ids: Tensor, - embeddings: List[InputEmbeddings] = None): + embeddings: List[InputEmbeddings] = None, + cross_attention_states: List[Tensor] = None): """Update token ids, old token ids will be added to history.""" + # cross attention + if cross_attention_states is not None: + self.history_cross_kv_seqlens += cross_attention_states.shape[-2] + self.cross_attention_states = cross_attention_states self._num_history_ids += self._num_token_ids # update history image nums self._num_history_images += self._num_images diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index af36237a56..669625d43d 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -120,6 +120,8 @@ class ModelInputs: local_adapter_ids: torch.LongTensor = None vision_inputs: VisionModelInputs = None mrope_inputs: 
MRopeModelInputs = None + cross_attention_states: torch.Tensor = None + history_cross_kv_seqlens: torch.LongTensor = None def update(self, input_ids: torch.LongTensor): """update input ids.""" @@ -164,6 +166,7 @@ def split(self, split_size: int, block_size: int): local_adapter_ids=self.local_adapter_ids, vision_inputs=self.vision_inputs, mrope_inputs=self.mrope_inputs, + cross_attention_states=self.cross_attention_states, ) ret.append(inp) block_start += num_blocks @@ -210,6 +213,9 @@ class StepContext: vision_inputs: VisionModelInputs = None mrope_position_ids: torch.Tensor = None attn_metadata: Any = None + cross_attn_metadata: Any = None + cross_attention_states: torch.Tensor = None + cross_kv_seqlens: torch.LongTensor = None kv_quant_policy: Literal[0, 4, 8] = 0 _outputs: Dict = field(default_factory=dict) @@ -246,9 +252,11 @@ def new( history_seqlens, q_seqlens) # kv_seqlens + cross_attention_states = inputs.cross_attention_states if inputs.is_decoding: attention_mask = torch.ones_like(q_seqlens)[:, None] position_ids = history_seqlens.unsqueeze(-1) + cross_attention_states = None else: max_q_seqlen = q_seqlens.max().item() mask_range = torch.arange(max_q_seqlen, device=device)[None, :] @@ -279,6 +287,8 @@ def new( local_adapter_ids=inputs.local_adapter_ids, vision_inputs=inputs.vision_inputs, mrope_position_ids=mrope_position_ids, + cross_attention_states=cross_attention_states, + cross_kv_seqlens=inputs.history_cross_kv_seqlens, kv_quant_policy=kv_quant_policy, ) diff --git a/lmdeploy/pytorch/models/mllama.py b/lmdeploy/pytorch/models/mllama.py new file mode 100644 index 0000000000..a16abd8b91 --- /dev/null +++ b/lmdeploy/pytorch/models/mllama.py @@ -0,0 +1,758 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers.models.llama import LlamaConfig +from transformers.models.mllama.modeling_mllama import MllamaTextConfig + +from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, + SiluAndMul, build_rotary_embedding) +from lmdeploy.pytorch.nn.linear import (build_merged_colwise_linear, + build_qkv_proj, build_rowwise_linear) +from lmdeploy.pytorch.nn.rotary_embedding import Llama3Parameters +from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight + +from .utils.cudagraph import CudaGraphMixin + +MLLAMA_IMAGE_TOKEN_ID = 128256 +MLLAMA_IMAGE_TOKEN = '<|image|>' + + +class LlamaAttention(nn.Module): + """Rewrite module of LlamaAttention.""" + + def __init__(self, + config: LlamaConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', None) + num_heads = config.num_attention_heads + num_key_value_heads = config.num_key_value_heads + hidden_size = config.hidden_size + head_dim = getattr(config, 'head_dim', hidden_size // num_heads) + + # packed qkv + self.qkv_proj = build_qkv_proj( + hidden_size, + num_q_heads=num_heads, + num_kv_heads=num_key_value_heads, + head_size=head_dim, + bias=getattr(config, 'attention_bias', False), + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + + # rotary embedding + self.apply_rotary_pos_emb = ApplyRotaryEmb() + + # attention + self.attn_fwd = Attention( + num_heads, + head_dim, + num_kv_heads=num_key_value_heads, + v_head_size=head_dim, + ) + + # o_proj + self.o_proj = build_rowwise_linear(num_heads * head_dim, + 
hidden_size, + bias=getattr( + config, 'attention_bias', + False), + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward( + self, + hidden_states: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attn_metadata: Any = None, + ): + """Rewrite of LlamaAttention.forward.""" + # qkv proj + qkv_states = self.qkv_proj(hidden_states) + # (-1, heads, head_dim) + qkv_states = qkv_states.flatten(0, -2) + query_states, key_states, value_states = self.qkv_proj.split_qkv( + qkv_states) + + # apply rotary embedding + cos, sin = rotary_pos_emb + query_states, key_states = self.apply_rotary_pos_emb( + query_states, + key_states, + cos, + sin, + inplace=True, + ) + + # attention + attn_output = self.attn_fwd( + query_states, + key_states, + value_states, + past_key_value[0], + past_key_value[1], + attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], + inplace=True, + ) + attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) + + # o proj + attn_output = self.o_proj(attn_output) + return attn_output + + +class MllamaTextCrossAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper.""" + + def __init__(self, + config: Optional[MllamaTextConfig] = None, + layer_idx: Optional[int] = None, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + quantization_config = getattr(config, 'quantization_config', None) + self.num_heads = self.config.num_attention_heads + self.num_key_value_heads = self.config.num_key_value_heads + self.hidden_size = config.hidden_size + self.head_dim = config.hidden_size // self.num_heads + self.layer_idx = layer_idx + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + + # packed qkv + self.qkv_proj = build_qkv_proj( + self.hidden_size, + num_q_heads=self.num_heads, + num_kv_heads=self.num_key_value_heads, + head_size=self.head_dim, + bias=getattr(config, 'attention_bias', False), + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + self.o_proj = build_rowwise_linear(self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + # attention + self.attn_fwd = Attention( + self.num_heads, + self.head_dim, + num_kv_heads=self.num_key_value_heads, + v_head_size=self.head_dim, + ) + + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + cross_attention_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + cross_attn_metadata: Any = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel.""" + qkv_states = self.qkv_proj(hidden_states) + qkv_states = qkv_states.flatten(0, -2) + query_states, _, _ = self.qkv_proj.split_qkv(qkv_states) + query_states = query_states.view(-1, query_states.shape[-2], + self.head_dim) + query_states = self.q_norm(query_states) + + if cross_attention_states is not None: + qkv_states = self.qkv_proj(cross_attention_states) + qkv_states = qkv_states.flatten(0, -2) + _, key_states, value_states = 
self.qkv_proj.split_qkv(qkv_states) + key_states = key_states.view(-1, key_states.shape[-2], + self.head_dim) + value_states = value_states.view(-1, value_states.shape[-2], + self.head_dim) + key_states = self.k_norm(key_states) + else: + key_states = None + value_states = None + + # attention + attn_output = self.attn_fwd( + query_states, + key_states, + value_states, + past_key_value[0], + past_key_value[1], + cross_attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], + inplace=True, + ) + attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) + + # o proj + attn_output = self.o_proj(attn_output) + return attn_output + + +class LlamaMLP(nn.Module): + """llama mlp.""" + + def __init__(self, + config: LlamaConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', None) + # gate up + self.gate_up_proj = build_merged_colwise_linear( + config.hidden_size, + [config.intermediate_size, config.intermediate_size], + bias=False, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + + # silu and mul + self.act_fn = SiluAndMul(inplace=True) + + # down + self.down_proj = build_rowwise_linear(config.intermediate_size, + config.hidden_size, + bias=False, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward(self, x): + """forward.""" + gate_up = self.gate_up_proj(x) + act = self.act_fn(gate_up) + return self.down_proj(act) + + +class MllamaSelfAttentionDecoderLayer(nn.Module): + """llama decoder layer.""" + + def __init__(self, + config: LlamaConfig, + layer_idx: int, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.layer_idx = layer_idx + quantization_config = getattr(config, 'quantization_config', None) + + # build attention layer + self.self_attn = LlamaAttention(config, dtype=dtype, device=device) + + # builf MLP + self.mlp = LlamaMLP(config, dtype=dtype, device=device) + + # build input layer norm + self.input_layernorm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + # build attention layer norm + self.post_attention_layernorm = RMSNorm( + config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + def forward(self, + hidden_states: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: Optional[List[torch.FloatTensor]], + cross_attention_states: Optional[torch.FloatTensor] = None, + full_text_row_masked_out_mask: Optional[torch.Tensor] = None, + residual: Optional[torch.Tensor] = None, + attn_metadata: Any = None, + cross_attn_metadata: Any = None): + + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + # Self Attention + hidden_states = self.self_attn( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + past_key_value=past_key_value, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + + outputs = (hidden_states, residual) + return outputs + + +class MllamaCrossAttentionDecoderLayer(nn.Module): + """llama decoder 
layer.""" + + def __init__(self, + config: LlamaConfig, + layer_idx: int, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.layer_idx = layer_idx + quantization_config = getattr(config, 'quantization_config', None) + + # build attention layer + self.cross_attn = MllamaTextCrossAttention(config, + dtype=dtype, + device=device) + + # builf MLP + self.mlp = LlamaMLP(config, dtype=dtype, device=device) + + # build input layer norm + self.input_layernorm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + # build attention layer norm + self.post_attention_layernorm = RMSNorm( + config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + self.cross_attn_attn_gate = torch.nn.Parameter( + torch.zeros(1, dtype=dtype)) + self.cross_attn_mlp_gate = torch.nn.Parameter( + torch.zeros(1, dtype=dtype)) + + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: torch.Tensor, + full_text_row_masked_out_mask: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: Optional[List[torch.FloatTensor]], + residual: Optional[torch.Tensor] = None, + attn_metadata: Any = None, + cross_attn_metadata: Any = None, + ): + + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + # Cross Attention + hidden_states = self.cross_attn( + hidden_states=hidden_states, + cross_attention_states=cross_attention_states, + rotary_pos_emb=rotary_pos_emb, + past_key_value=past_key_value, + cross_attn_metadata=cross_attn_metadata, + ) + hidden_states = residual + self.cross_attn_attn_gate.tanh( + ) * hidden_states + residual = hidden_states + + # Fully Connected + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = full_text_row_masked_out_mask * hidden_states + hidden_states = residual + self.cross_attn_mlp_gate.tanh( + ) * hidden_states + + outputs = (hidden_states, None) + return outputs + + +class MllamaTextModel(nn.Module): + """llama model.""" + + def __init__(self, + config: LlamaConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size + 8, + config.hidden_size, + self.padding_idx, + dtype=dtype, + device=device) + self.cross_attention_layers = config.cross_attention_layers + + # build all decode layers + layers = [] + for layer_idx in range(config.num_hidden_layers): + if layer_idx in self.cross_attention_layers: + layers.append( + MllamaCrossAttentionDecoderLayer(config, + layer_idx, + dtype=dtype, + device=device)) + else: + layers.append( + MllamaSelfAttentionDecoderLayer(config, + layer_idx, + dtype=dtype, + device=device)) + self.layers = nn.ModuleList(layers) + + # build norm + self.norm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + dtype=dtype, + device=device) + + # build rotary embedding in LlamaModel + rope_dim = config.hidden_size // config.num_attention_heads + rope_max_pos_emb = config.max_position_embeddings + rope_base = config.rope_theta + scaling_factor = 1.0 + llama3_params = None + rope_scaling = config.rope_scaling + if rope_scaling is None: + emb_type = RopeType.LinearScaling + else: + if 'scaling_factor' in 
rope_scaling: + scaling_factor = rope_scaling['scaling_factor'] + elif 'factor' in rope_scaling: + scaling_factor = rope_scaling['factor'] + + rope_type = rope_scaling['rope_type'] + if rope_type == 'dynamic': + emb_type = RopeType.DynamicNTKScaling + elif rope_type == 'linear': + emb_type = RopeType.LinearScaling + elif rope_type == 'llama3': + emb_type = RopeType.Llama3 + low_freq_factor = rope_scaling.get('low_freq_factor', 1.0) + high_freq_factor = rope_scaling.get('high_freq_factor', 1.0) + llama3_params = Llama3Parameters(low_freq_factor, + high_freq_factor) + else: + raise RuntimeError(f'Unsupported rope type: {rope_type}') + + self.rotary_emb = build_rotary_embedding( + rope_dim, + rope_max_pos_emb, + rope_base, + scaling_factor, + llama3_params=llama3_params, + emb_type=emb_type, + ) + + def forward( + self, + input_ids: torch.LongTensor = None, + cross_attention_states: Optional[torch.FloatTensor] = None, + full_text_row_masked_out_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + attn_metadata: Any = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cross_attn_metadata: Any = None, + ): + """Rewrite of LlamaModel.forward.""" + + # token embedding + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + # rotary embedding + cos, sin = self.rotary_emb(hidden_states, position_ids) + cos, sin = cos[0], sin[0] + rotary_pos_emb = (cos, sin) + + # decoding + residual = None + for idx, decoder_layer in enumerate(self.layers): + if full_text_row_masked_out_mask is None and idx in self.cross_attention_layers: # noqa + continue + past_key_value = past_key_values[idx] + hidden_states, residual = decoder_layer( + hidden_states, + cross_attention_states=cross_attention_states, + full_text_row_masked_out_mask=full_text_row_masked_out_mask, + rotary_pos_emb=rotary_pos_emb, + past_key_value=past_key_value, + residual=residual, + attn_metadata=attn_metadata, + cross_attn_metadata=cross_attn_metadata, + ) + + # norm + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + def get_input_embeddings(self): + """get input embeddings.""" + return self.embed_tokens + + +class MllamaForCausalLM(nn.Module): + """llama model.""" + + def __init__(self, + config: MllamaTextConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.text_config = config.get_text_config() + self.vocab_size = self.text_config.vocab_size + + self.model = MllamaTextModel(config, dtype=dtype, device=device) + # build lm_head + self.lm_head = build_rowwise_linear(config.hidden_size, + config.vocab_size, + bias=False, + dtype=dtype, + device=device) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + past_key_values: List[List[torch.Tensor]], + cross_attention_states: Optional[torch.Tensor] = None, + full_text_row_masked_out_mask: Optional[torch.Tensor] = None, + attn_metadata: Any = None, + inputs_embeds: torch.Tensor = None, + cross_attn_metadata: Any = None, + **kwargs, + ): + """model forward, return logits.""" + hidden_states = self.model( + input_ids=input_ids, + cross_attention_states=cross_attention_states, + full_text_row_masked_out_mask=full_text_row_masked_out_mask, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + cross_attn_metadata=cross_attn_metadata, + ) + return hidden_states + + 
def get_logits(self, hidden_states: torch.Tensor): + """compute logits of the model output.""" + return self.lm_head(hidden_states) + + +class MllamaForConditionalGeneration(nn.Module, CudaGraphMixin): + """rewrote model of MllamaForConditionalGeneration.""" + + packed_modules_mapping = { + 'qkv_proj': [ + 'q_proj', + 'k_proj', + 'v_proj', + ], + 'gate_up_proj': [ + 'gate_proj', + 'up_proj', + ], + } + + def __init__(self, + config: LlamaConfig, + ctx_mgr: StepContextManager, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.ctx_mgr = ctx_mgr + # build MllamaForCausalLM + self.language_model = MllamaForCausalLM(config.text_config, + dtype=dtype, + device=device) + self.dtype = dtype + + def flat_encoder_result(self, cross_attention_states: torch.Tensor, + attn_metadata: Any, input_ids: torch.LongTensor): + # since every state share the same shape + cross_attention_states = torch.cat(cross_attention_states, 0) + full_text_row_masked_out_mask = torch.ones( + (attn_metadata.q_seqlens.sum(), 1), dtype=torch.bool) + start_pos = 0 + img_idx = torch.where(input_ids == MLLAMA_IMAGE_TOKEN_ID)[1] + for img_id, q_seq_len in zip(img_idx.cpu(), + attn_metadata.q_seqlens.cpu()): + full_text_row_masked_out_mask[start_pos:img_id] = False + start_pos += q_seq_len + full_text_row_masked_out_mask = full_text_row_masked_out_mask.to( + cross_attention_states.device) + + return cross_attention_states, full_text_row_masked_out_mask + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + past_key_values: List[List[torch.Tensor]], + cross_attention_states: Optional[torch.Tensor] = None, + attn_metadata: Any = None, + inputs_embeds: torch.Tensor = None, + cross_attn_metadata: Any = None, + **kwargs, + ): + """model forward, return logits.""" + if cross_attn_metadata is None: + full_text_row_masked_out_mask = None + # FIXME basically, we want to inference + # text requests and image requests separately + elif cross_attention_states is None and ( + cross_attn_metadata.kv_seqlens is None + or int(cross_attn_metadata.kv_seqlens.sum()) == 0): + full_text_row_masked_out_mask = None + elif cross_attn_metadata.is_decoding: + cross_attention_states = None + full_text_row_masked_out_mask = torch.ones( + (attn_metadata.q_seqlens.sum(), 1), + dtype=torch.bool, + device=input_ids.device) + else: + cross_attention_states, full_text_row_masked_out_mask = \ + self.flat_encoder_result(cross_attention_states, cross_attn_metadata, input_ids) # noqa + hidden_states = self.language_model( + input_ids=input_ids, + position_ids=position_ids, + cross_attention_states=cross_attention_states, + full_text_row_masked_out_mask=full_text_row_masked_out_mask, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + cross_attn_metadata=cross_attn_metadata, + ) + return hidden_states + + def get_logits(self, hidden_states: torch.Tensor): + """compute logits of the model output.""" + return self.language_model.get_logits(hidden_states) + + def support_cuda_graph( + self, + input_ids: torch.Tensor, + **kwargs, + ): + """support cudagraph.""" + + return False + + def get_input_embeddings(self): + """get input embeddings.""" + return self.language_model.model.get_input_embeddings() + + def prepare_inputs_for_generation( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: StepContext = None, + ): + """prepare input.""" + # get input_ids, position_ids and attention 
metadatas + input_ids = context.input_ids + position_ids = context.position_ids + attn_metadata = context.attn_metadata + cross_attention_states = context.cross_attention_states + if cross_attention_states is not None: + cross_attention_states = [ + t.to(input_ids.device) for t in cross_attention_states + if t is not None + ] + cross_attn_metadata = context.cross_attn_metadata + + # process vision embeddings + vision_embeddings = context.input_embeddings + vision_embedding_indexing = context.input_embedding_indexing + if vision_embeddings is not None and len(vision_embeddings) > 0: + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds[:, + vision_embedding_indexing, :] = vision_embeddings.to( + inputs_embeds) + + # inputs of forward + return dict( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + cross_attention_states=cross_attention_states, + cross_attn_metadata=cross_attn_metadata, + ) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + """load weights.""" + # modify from vllm + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ('.qkv_proj', '.q_proj', 'q'), + ('.qkv_proj', '.k_proj', 'k'), + ('.qkv_proj', '.v_proj', 'v'), + ('.gate_up_proj', '.gate_proj', 0), + ('.gate_up_proj', '.up_proj', 1), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if 'rotary_emb.inv_freq' in name: + continue + if ('rotary_emb.cos_cached' in name + or 'rotary_emb.sin_cached' in name): + continue + if 'vision_model' in name or 'multi_modal_projector' in name: + continue + if self.config.text_config.tie_word_embeddings and 'lm_head.weight' in name: # noqa + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + load_weight(param, loaded_weight, shard_id=shard_id) + break + else: + param = params_dict[name] + load_weight(param, loaded_weight) diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py index 71c7bca238..bc6385d8b2 100644 --- a/lmdeploy/pytorch/models/module_map.py +++ b/lmdeploy/pytorch/models/module_map.py @@ -167,4 +167,10 @@ f'{LMDEPLOY_PYTORCH_MODEL_PATH}.minicpm3.MiniCPM3ForCausalLM', }) +# mllama +MODULE_MAP.update({ + 'MllamaForConditionalGeneration': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.mllama.MllamaForConditionalGeneration', +}) + CUSTOM_MODULE_MAP = dict() diff --git a/lmdeploy/pytorch/models/utils/cudagraph.py b/lmdeploy/pytorch/models/utils/cudagraph.py index d8bf055af6..f56899be89 100644 --- a/lmdeploy/pytorch/models/utils/cudagraph.py +++ b/lmdeploy/pytorch/models/utils/cudagraph.py @@ -83,6 +83,10 @@ def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, input_buffers['local_adapter_ids'] = torch.zeros(max_batches, dtype=torch.int64, device=device) + # create buffer for cross_attn_metadata here + input_buffers['fill_seqlens'] = torch.zeros(max_batches, + dtype=torch.int64, + device=device) return input_buffers @@ -137,6 +141,11 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, attn_metadata=attn_metadata, ) + cross_attn_metadata = kwargs.get('cross_attn_metadata', None) + if cross_attn_metadata is not None: + # TODO: update cross_attn_metadata here + new_inputs['cross_attn_metadata'] = cross_attn_metadata + if is_decoding: new_inputs['input_ids'] = input_buffers[ 
'input_ids'][:, :new_batch_size] diff --git a/lmdeploy/pytorch/paging/block_manager/base_block_manager.py b/lmdeploy/pytorch/paging/block_manager/base_block_manager.py index ef6709624b..b8d131a556 100644 --- a/lmdeploy/pytorch/paging/block_manager/base_block_manager.py +++ b/lmdeploy/pytorch/paging/block_manager/base_block_manager.py @@ -250,11 +250,6 @@ def num_required_blocks(cls, """get num required blocks.""" raise NotImplementedError('Not implemented.') - @classmethod - def last_block_size(cls, seq: SchedulerSequence) -> int: - """get last block size.""" - raise NotImplementedError('Not implemented.') - def can_allocate(self, msg: SchedulerSequence, prealloc_size: int = 0): """Return if physical block can be allocated for given message.""" raise NotImplementedError('Not implemented.') diff --git a/lmdeploy/pytorch/paging/block_manager/default_block_manager.py b/lmdeploy/pytorch/paging/block_manager/default_block_manager.py index 9a5ff0136d..6159c9f3f6 100644 --- a/lmdeploy/pytorch/paging/block_manager/default_block_manager.py +++ b/lmdeploy/pytorch/paging/block_manager/default_block_manager.py @@ -28,19 +28,14 @@ def num_required_blocks(cls, prealloc_size: int = 0): """get num required blocks.""" num_tokens = obj.num_all_tokens() + prealloc_size + + # cross tokens + num_cross = obj.num_all_cross_tokens() + num_tokens = max(num_tokens, num_cross) + num_all_blocks = _div_up(num_tokens, obj.block_size) return max(0, num_all_blocks - len(obj.logical_blocks)) - @classmethod - def last_block_size(cls, seq: SchedulerSequence) -> int: - """get last block size.""" - num_blocks = len(seq.logical_blocks) - if num_blocks == 0: - return 0 - elif num_blocks * seq.block_size < seq.history_len: - return seq.block_size - return seq.history_len % seq.block_size - def can_allocate(self, msg: SchedulerSequence, prealloc_size: int = 0): """Return if physical block can be allocated for given message.""" num_required_blocks = self.num_required_blocks(msg, prealloc_size) diff --git a/lmdeploy/pytorch/supported_models.py b/lmdeploy/pytorch/supported_models.py index e6a544b5b9..3a5baf8fc6 100644 --- a/lmdeploy/pytorch/supported_models.py +++ b/lmdeploy/pytorch/supported_models.py @@ -66,6 +66,8 @@ Gemma2ForCausalLM=True, # phi3.5-moe PhiMoEForCausalLM=True, + # mllama + MllamaForConditionalGeneration=True, ) diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py index 3971e068c0..fd0b0bb5e4 100644 --- a/lmdeploy/serve/vl_async_engine.py +++ b/lmdeploy/serve/vl_async_engine.py @@ -63,7 +63,8 @@ async def _get_prompt_input(self, results = {} input_ids = [] - from lmdeploy.vl.templates import Qwen2VLChatTemplateWrapper + from lmdeploy.vl.templates import (MllamaTempateWrapper, + Qwen2VLChatTemplateWrapper) ranges = None grid_thws = None if len(segs) > 1: @@ -87,6 +88,17 @@ async def _get_prompt_input(self, grid_thws = [x['grid_thw'] for x in features] features = [x['embeddings'] for x in features] + if isinstance(self.vl_prompt_template, MllamaTempateWrapper): + # llama3.2 just encode <|image|> and inference + decorated = decorated.replace(IMAGE_TOKEN, '<|image|>') + input_ids = self.tokenizer.encode(decorated, + add_bos=sequence_start) + results['input_ids'] = input_ids + results['prompt'] = decorated + assert len(features) + results['cross_attention_states'] = features[0] + return results + features = [x.cpu().numpy() for x in features] input_ids = [] begins = [] diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py index affbac8073..1f069b6807 100644 --- a/lmdeploy/utils.py +++ 
b/lmdeploy/utils.py @@ -265,7 +265,7 @@ def _get_and_verify_max_len( return max_model_len if max_model_len else session_len # vl configs hide session-len inside llm configs - llm_keys = ['language_config', 'llm_config'] + llm_keys = ['language_config', 'llm_config', 'text_config'] for key in llm_keys: hf_tm_config = getattr(hf_tm_config, key, hf_tm_config) diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/vl/model/builder.py index 5556ffe0f8..9e71f7d1c0 100644 --- a/lmdeploy/vl/model/builder.py +++ b/lmdeploy/vl/model/builder.py @@ -17,6 +17,7 @@ from .llava_next import LlavaNextVisionModel # noqa F401 from .mini_gemeni import MiniGeminiVisionModel # noqa F401 from .minicpmv import MiniCPMVModel # noqa F401 +from .mllama import MllamaVLModel # noqa F401 from .phi3_vision import Phi3VisionModel # noqa F401 from .qwen import QwenVisionModel # noqa F401 from .qwen2 import Qwen2VLModel # noqa F401 diff --git a/lmdeploy/vl/model/mllama.py b/lmdeploy/vl/model/mllama.py new file mode 100644 index 0000000000..db0a0e9cbf --- /dev/null +++ b/lmdeploy/vl/model/mllama.py @@ -0,0 +1,286 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import Dict, List + +import torch +import torch.nn.functional as F +from PIL.Image import Image +from transformers.modeling_outputs import BaseModelOutput +from transformers.models.mllama.modeling_mllama import MllamaPreTrainedModel + +from lmdeploy.vl.model.base import VISION_MODELS, VisonModel +from lmdeploy.vl.model.utils import disable_logging + + +class MllamaVisionModelPatch(MllamaPreTrainedModel): + + def apply_class_embedding(self, + hidden_state: torch.Tensor) -> torch.Tensor: + batch_size, _, hidden_size = hidden_state.shape + class_embedding = self.class_embedding.expand(batch_size, 1, + hidden_size) + class_embedding = class_embedding.to(hidden_state.device) + hidden_state = torch.cat([class_embedding, hidden_state], dim=1) + return hidden_state + + def forward( + self, + pixel_values: torch.Tensor, + aspect_ratio_ids: torch.Tensor, + aspect_ratio_mask: torch.Tensor, + output_attentions: bool = None, + output_hidden_states: bool = None, + return_dict: bool = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions # noqa + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # noqa + + batch_size, num_concurrent_media, num_tiles, num_channels, height, width = pixel_values.shape # noqa + + pixel_values = pixel_values.reshape( + batch_size * num_concurrent_media * num_tiles, num_channels, + height, width) + aspect_ratio_ids = aspect_ratio_ids.reshape( + batch_size * num_concurrent_media, -1) + + # Patch embedding + patch_embeds = self.patch_embedding( + pixel_values.to(self.dtype).to(self.device)) + hidden_state = patch_embeds.flatten(2).transpose(1, 2) + + # Tile embeddings + _, num_patches, dim = hidden_state.shape + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, + num_tiles, -1, dim) + hidden_state = self.pre_tile_positional_embedding( + hidden_state, aspect_ratio_ids) + + # Add cls token + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media * num_tiles, num_patches, dim) + hidden_state = self.apply_class_embedding(hidden_state) + num_patches += 1 + + # Position embeddings + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, + num_tiles, num_patches, dim) + 
hidden_state = self.gated_positional_embedding(hidden_state, + aspect_ratio_ids) + + hidden_state = self.layernorm_pre(hidden_state) + + # Compute the number of tokens to pad + num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 + # Compute padding tuple for pad function + padding = ( + 0, 0, 0, num_padding_patches + ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) + # Pad the tensor + hidden_state = F.pad(hidden_state, padding, mode='constant', value=0) + slice_index = -num_padding_patches if num_padding_patches > 0 else None + + # Prepare attention mask + attention_mask = aspect_ratio_mask.reshape( + batch_size * num_concurrent_media, -1) + from transformers.models.mllama.modeling_mllama import \ + _prepare_aspect_ratio_attention_mask + attention_mask = _prepare_aspect_ratio_attention_mask( + aspect_ratio_mask=attention_mask, + num_patches=self.num_patches, + target_length=hidden_state.shape[2], + dtype=self.dtype, + ) + + # Apply encoder + hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, + dim) + output = self.transformer( + hidden_state, + attention_mask=attention_mask, + output_hidden_states=True, + output_attentions=output_attentions, + ) + hidden_state = output[0] + + hidden_state = self.layernorm_post(hidden_state) + + # Apply global encoder + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + dim) + hidden_state = self.post_tile_positional_embedding( + hidden_state, aspect_ratio_ids) + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, + num_tiles * (num_patches + num_padding_patches), dim) + global_output = self.global_transformer( + hidden_state, + attention_mask=attention_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + ) + hidden_state = global_output[0] + + # Remove padding form hidden state + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + dim) + hidden_state = hidden_state[:, :, :slice_index] + hidden_state = hidden_state.reshape(batch_size, num_concurrent_media, + num_tiles, num_patches, dim) + + # Collect intermediate layer outputs from encoder output + all_intermediate_hidden_states = output[1] + # rewrite to sync device during accelerate pipeline parallel + device = hidden_state.device + all_intermediate_hidden_states = [ + s.to(device) for s in all_intermediate_hidden_states + ] + intermediate_hidden_states = torch.stack( + all_intermediate_hidden_states, dim=-1) + intermediate_hidden_states = intermediate_hidden_states[ + ..., self.intermediate_layers_indices] + + # Remove padding from intermediate hidden states + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size * num_concurrent_media, num_tiles, + num_patches + num_padding_patches, -1) + intermediate_hidden_states = intermediate_hidden_states[:, :, : + slice_index] + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size, num_concurrent_media, num_tiles, num_patches, -1) + + # Concatenate final hidden state and intermediate hidden states + hidden_state = torch.cat([hidden_state, intermediate_hidden_states], + dim=-1) + + if output_hidden_states: + hidden_states = tuple(all_intermediate_hidden_states) + tuple( + global_output[1]) + else: + hidden_states = None + + if output_attentions: + # global transformer in contrast to `self.transformer` doesn't + # always return hidden states so we might go index 
out-of-range + global_attn = tuple( + global_output[2]) if output_hidden_states else tuple( + global_output[1]) + attentions = tuple(output[2]) + global_attn + else: + attentions = None + + if not return_dict: + return tuple(v for v in [hidden_state, hidden_states, attentions] + if v is not None) + + return BaseModelOutput( + last_hidden_state=hidden_state, + hidden_states=hidden_states, + attentions=attentions, + ) + + +def check_transformers(): + """check qwen_vl_utils.""" + try: + from transformers import MllamaForConditionalGeneration # noqa: F401 + except ImportError: + raise ImportError( + 'please install latest transformers by ' + 'pip install git+https://github.com/huggingface/transformers.git') + + +@VISION_MODELS.register_module() +class MllamaVLModel(VisonModel): + """llama3.2 model.""" + + _arch = 'MllamaForConditionalGeneration' + + def build_model(self): + check_transformers() + + from transformers.models.mllama.modeling_mllama import \ + MllamaVisionModel + MllamaVisionModel.forward = MllamaVisionModelPatch.forward + MllamaVisionModel.apply_class_embedding = MllamaVisionModelPatch.apply_class_embedding # noqa + from accelerate import init_empty_weights + with init_empty_weights(): + config = self.hf_config + config.quantization_config = {} # disable vision part quantization + # disable accelerate check_tied_parameters_in_config + config.tie_word_embeddings = False + from transformers import MllamaForConditionalGeneration + model = MllamaForConditionalGeneration._from_config(config) + if not self.with_llm: + del model.language_model + else: + self.vl_model = model + + from accelerate import load_checkpoint_and_dispatch + with disable_logging(): + load_checkpoint_and_dispatch( + model=model, + checkpoint=self.model_path, + device_map='auto' if not self.with_llm else {'': 'cpu'}, + max_memory=self.max_memory, + no_split_module_classes=[ + 'MllamaPrecomputedPositionEmbedding', + 'MllamaPrecomputedAspectRatioEmbedding', + 'MllamaVisionEncoderLayer' + ], + dtype=config.torch_dtype) + + self.model = model.eval() + + # processor + from transformers import AutoProcessor + self.processor = AutoProcessor.from_pretrained(self.model_path) + self.image_token_id = 128256 + + @torch.no_grad() + def forward(self, + images: List[Image], + params: List[Dict] = None) -> List[torch.Tensor]: + """forward.""" + # only support image input + if params is not None: + assert len(images) == len( + params), 'different length of images and params' + else: + params = [{}] * len(images) + # resize images with abnormal shape + # TODO try catch image feature extraction in pipeline and + # throw error back to users + for i, image in enumerate(images): + size = image.size + if any([s < 3 for s in size]): + images[i] = image.resize([s * 3 for s in size]) + image_inputs = self.processor.image_processor(images=images, + return_tensors='pt') + pixel_values = image_inputs['pixel_values'].to( + self.model.vision_model.device) + pixel_values = pixel_values.type(self.model.vision_model.dtype) + aspect_ratio_ids = image_inputs['aspect_ratio_ids'].to( + self.model.vision_model.device) + aspect_ratio_mask = image_inputs['aspect_ratio_mask'].to( + self.model.vision_model.device) + vision_outputs = self.model.vision_model( + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + output_hidden_states=False, + output_attentions=False, + return_dict=True) + cross_attention_states = vision_outputs[0] + cross_attention_states = self.model.multi_modal_projector( + 
cross_attention_states) + _, bsz, _, _, image_token_dim = tuple(cross_attention_states.shape) + cross_attention_states = cross_attention_states.view( + bsz, -1, image_token_dim).split([1] * len(images)) + return cross_attention_states diff --git a/lmdeploy/vl/templates.py b/lmdeploy/vl/templates.py index f445c729bd..45e457ad2c 100644 --- a/lmdeploy/vl/templates.py +++ b/lmdeploy/vl/templates.py @@ -352,6 +352,14 @@ def append_image_token(self, prompt, num_images: int): return res +class MllamaTempateWrapper(VLChatTemplateWrapper): + """Mllama chat template.""" + + def append_image_token(self, prompt, num_images: int): + """append image tokens to user prompt.""" + return f'{IMAGE_TOKEN * num_images}{prompt}' + + class MiniCPMVTempateWrapper(VLChatTemplateWrapper): """MiniCPM-Llama3-V-2_5 chat template.""" @@ -438,6 +446,8 @@ def get_vl_prompt_template(model_path: str, chat_template: BaseModel, return LlavaVLChatTemplateWrapper(chat_template) elif arch == 'MultiModalityCausalLM': # deepseek-vl return DeepSeekVLChatTemplateWrapper(chat_template) + elif arch == 'MllamaForConditionalGeneration': # llama 3.2 + return MllamaTempateWrapper(chat_template) elif arch == 'CogVLMForCausalLM': return CogVLMChatTemplateWrapper(chat_template) elif arch in ['InternLMXComposer2ForCausalLM', 'InternLM2ForCausalLM']: diff --git a/tests/pytorch/paging/test_block_manager.py b/tests/pytorch/paging/test_block_manager.py index 52c011296d..8be904be0c 100644 --- a/tests/pytorch/paging/test_block_manager.py +++ b/tests/pytorch/paging/test_block_manager.py @@ -114,6 +114,26 @@ def test_alloc(self, block_mgr, block_size, num_gpu_blocks): msg = sess.add_sequence(token_ids) assert not block_mgr.can_allocate(msg) + def test_num_required_blocks(self, block_mgr, block_size, num_gpu_blocks): + from lmdeploy.pytorch.messages import InputEmbeddings + sess = SchedulerSession(0, block_size) + + token_ids = torch.tensor([1]) + msg = sess.add_sequence(token_ids) + num_required = block_mgr.num_required_blocks(msg) + assert num_required == 1 + + embedding = InputEmbeddings(None, 0, block_size * 2) + msg = sess.add_sequence(token_ids, input_embeddings=[embedding]) + num_required = block_mgr.num_required_blocks(msg) + assert num_required == 1 + + token_ids = torch.tensor([1] * block_size * 3) + embedding = InputEmbeddings(None, 0, block_size * 2) + msg = sess.add_sequence(token_ids, input_embeddings=[embedding]) + num_required = block_mgr.num_required_blocks(msg) + assert num_required == 3 + def test_append_slot(self, block_mgr, block_size, num_gpu_blocks): sess = SchedulerSession(0, block_size) From eaa4e6f7f2772f5557c1dbeec919b9f95e27edca Mon Sep 17 00:00:00 2001 From: q yao Date: Thu, 24 Oct 2024 12:59:02 +0800 Subject: [PATCH 026/122] update check for triton (#2641) --- lmdeploy/pytorch/check_env/__init__.py | 8 +++++--- lmdeploy/pytorch/check_env/triton_custom_add.py | 8 ++++++++ lmdeploy/pytorch/kernels/cuda/pagedattention.py | 4 ++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index 2b4b3cc521..291b1afb35 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -66,6 +66,10 @@ def check_env_triton(device: str): from packaging import version logger = get_logger('lmdeploy') + msg = ( + 'Please ensure that your device is functioning properly with .\n' # noqa: E501 + 'You can verify your environment by running ' + '`python -m lmdeploy.pytorch.check_env.triton_custom_add`.') try: 
logger.debug('Checking environment.') import torch @@ -87,11 +91,9 @@ def check_env_triton(device: str): 'This Error might caused by mismatching between NVIDIA Driver and nvcc compiler. \n' # noqa: E501 'Try solution https://github.com/triton-lang/triton/issues/1955#issuecomment-1929908209' # noqa: E501 ' or reinstall the driver.') - else: - msg = None _handle_exception(e, 'Triton', logger, msg) except Exception as e: - _handle_exception(e, 'Triton', logger) + _handle_exception(e, 'Triton', logger, msg) if device == 'cuda': device_cap = torch.cuda.get_device_capability() diff --git a/lmdeploy/pytorch/check_env/triton_custom_add.py b/lmdeploy/pytorch/check_env/triton_custom_add.py index ef77fb8105..077359110b 100644 --- a/lmdeploy/pytorch/check_env/triton_custom_add.py +++ b/lmdeploy/pytorch/check_env/triton_custom_add.py @@ -23,3 +23,11 @@ def custom_add(a, b): grid = (triton.cdiv(size, BLOCK), ) _add_kernel[grid](a, b, c, size, BLOCK=BLOCK) return c + + +if __name__ == '__main__': + a = torch.tensor([1, 2], device='cuda') + b = a.new_tensor([3, 4], device='cuda') + c = custom_add(a, b) + torch.testing.assert_close(c, a + b) + print('Done.') diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index 7790a44b19..e15ab911fc 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -1153,9 +1153,9 @@ def _get_block_d(Lk): if not is_decoding: BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lq) if _nv_cap[0] < 8: - BLOCK_M = max(16, min(BLOCK, 8192 // BLOCK_DMODEL)) + BLOCK_M = max(16, 8192 // BLOCK_DMODEL) else: - BLOCK_M = max(16, min(BLOCK, 16384 // BLOCK_DMODEL)) + BLOCK_M = max(16, 16384 // BLOCK_DMODEL) num_warps = 4 num_stages = 2 kv_head = k.shape[h_dim] From d00e4703821161c53b14dfd4d892740663d4d749 Mon Sep 17 00:00:00 2001 From: zhoushenglong <87467364+Reinerzhou@users.noreply.github.com> Date: Thu, 24 Oct 2024 17:33:06 +0800 Subject: [PATCH 027/122] fix error in python3.8. 
(#2646) --- lmdeploy/pytorch/engine/model_agent.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 918e64e782..84e3fba8fb 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -483,9 +483,8 @@ def _start_tp_process(proc_id: int, timeout=timedelta(days=35600)) dist_ctx = DistContext(rank=rank, world_size=world_size) torch.cuda.set_device(rank) - with (get_dist_manager().context(dist_ctx), - get_device_manager().context(device_context), - torch.inference_mode()): + with get_dist_manager().context(dist_ctx), get_device_manager( + ).context(device_context), torch.inference_mode(): args = args or tuple() kwargs = kwargs or dict() func(rank, *args, **kwargs) From cd3e7918876a8caa8faacee6744b04f600a9ee84 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Thu, 24 Oct 2024 18:32:37 +0800 Subject: [PATCH 028/122] Support llama3.2 LLM models in turbomind engine(#2596) * update * update doc * fix typo * update --- README.md | 1 + README_ja.md | 1 + README_zh-CN.md | 1 + docs/en/quantization/w4a16.md | 2 +- docs/en/supported_models/supported_models.md | 3 ++- docs/zh_cn/quantization/w4a16.md | 2 +- docs/zh_cn/supported_models/supported_models.md | 3 ++- lmdeploy/model.py | 2 +- lmdeploy/turbomind/supported_models.py | 6 +++--- 9 files changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 91ac1d4134..61c0eba45b 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
  • Llama2 (7B - 70B)
  • Llama3 (8B, 70B)
  • Llama3.1 (8B, 70B)
  • + Llama3.2 (1B, 3B)
  • InternLM (7B - 20B)
  • InternLM2 (7B - 20B)
  • InternLM2.5 (7B)
diff --git a/README_ja.md b/README_ja.md index ea4480e282..999ebc9f0b 100644 --- a/README_ja.md +++ b/README_ja.md @@ -114,6 +114,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • Llama2 (7B - 70B)
  • Llama3 (8B, 70B)
  • Llama3.1 (8B, 70B)
  • + Llama3.2 (1B, 3B)
  • InternLM (7B - 20B)
  • InternLM2 (7B - 20B)
  • InternLM2.5 (7B)
diff --git a/README_zh-CN.md b/README_zh-CN.md index cdddb64a22..f002899c60 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -116,6 +116,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • Llama2 (7B - 70B)
  • Llama3 (8B, 70B)
  • Llama3.1 (8B, 70B)
  • + Llama3.2 (1B, 3B)
  • InternLM (7B - 20B)
  • InternLM2 (7B - 20B)
  • InternLM2.5 (7B)
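The three README hunks above only add Llama3.2 (1B, 3B) to the supported-model lists. As a quick way to exercise the new TurboMind support, a pipeline call along the lines of the sketch below should work; the `meta-llama/Llama-3.2-3B-Instruct` model id and the prompt are illustrative assumptions rather than part of this patch, and per the docs change later in this commit only the 3B variant is expected to run on TurboMind (the 1B variant falls back to the PyTorch engine because of its 64-dim attention heads).

```python
from lmdeploy import pipeline, TurbomindEngineConfig

# Assumed Hugging Face model id; a local copy of a Llama3.2-3B chat
# checkpoint can be passed in the same way.
pipe = pipeline('meta-llama/Llama-3.2-3B-Instruct',
                backend_config=TurbomindEngineConfig(tp=1))
response = pipe(['Please introduce the Llama 3.2 model family.'])
print(response)
```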
  • diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index 3a04cd7b05..32dfe18d80 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -69,7 +69,7 @@ lmdeploy serve gradio ./internlm2_5-7b-chat-4bit --server_name {ip_addr} --serve ## Evaluation -Please refer to [OpenCompass](https://opencompass.readthedocs.io/en/latest/index.html) about model evaluation with LMDeploy. +Please refer to [OpenCompass](https://opencompass.readthedocs.io/en/latest/index.html) about model evaluation with LMDeploy. Here is the [guide](https://opencompass.readthedocs.io/en/latest/advanced_guides/evaluation_lmdeploy.html) ## Inference diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index c992f730c8..260120efe0 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -10,6 +10,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | @@ -20,7 +21,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | | Mistral | 7B | LLM | Yes | Yes | Yes | - | | Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen2-VL | 2B, 7B, 72B | MLLM | Yes | Yes | Yes | - | | DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | | Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | | Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | @@ -49,6 +49,7 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | | Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md index b61b894781..d50e464af3 100644 --- a/docs/zh_cn/quantization/w4a16.md +++ b/docs/zh_cn/quantization/w4a16.md @@ -72,7 +72,7 @@ lmdeploy serve gradio ./internlm2_5-7b-chat-4bit --server-name {ip_addr} --serve ## 模型评测 -我们使用 [OpenCompass](https://opencompass.readthedocs.io/zh-cn/latest/index.html) 评测量化模型在各个维度上的能力 +我们使用 [OpenCompass](https://opencompass.readthedocs.io/zh-cn/latest/index.html) 评测量化模型在各个维度上的能力。方法请参考[此处](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/evaluation_lmdeploy.html) ## 模型推理 diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 695103b52e..26930cf3ce 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -10,6 +10,7 @@ | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | @@ -20,7 +21,6 @@ | Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | | Mistral | 7B | LLM | Yes | Yes | Yes | - | | Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen2-VL | 2B, 7B, 72B | MLLM | Yes | Yes | Yes | - | | DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | | Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | | Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | @@ -49,6 +49,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | | Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | diff --git a/lmdeploy/model.py b/lmdeploy/model.py index f251ca18d2..26ab856bc2 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -772,7 +772,7 @@ def match(cls, model_path: str) -> Optional[str]: return 'llama3' -@MODELS.register_module(name='llama3_1') +@MODELS.register_module(name=['llama3_1', 'llama3_2']) class Llama3_1(Llama3): """Chat template of LLaMA3.1 model.""" diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 8ebb93fdf2..bdf129b019 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -84,9 +84,9 @@ def _is_head_dim_128(cfg): if num_attn_head == 40: # baichuan-13B, baichuan2-13B not supported by turbomind support_by_turbomind = False - elif arch == 'Qwen2ForCausalLM': - # qwen2 0.5b size_per_head is 64, which hasn't been supported - # by turbomind yet + elif arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: + # the head_dim of qwen2 0.5b and llama3.2-1b is 64, which + # hasn't been supported by turbomind yet 
support_by_turbomind = _is_head_dim_128(cfg) elif arch in ('ChatGLMModel', 'ChatGLMForConditionalGeneration'): # chatglm1/2/3 is not working yet From 4958071b8930b32ae539ad189a9dc1c6c618fa65 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Thu, 24 Oct 2024 19:44:37 +0800 Subject: [PATCH 029/122] add eager-mode to cli (#2645) --- benchmark/profile_generation.py | 3 +++ benchmark/profile_pipeline_api.py | 3 +++ benchmark/profile_throughput.py | 3 +++ lmdeploy/cli/cli.py | 3 +++ lmdeploy/cli/serve.py | 7 ++++++- lmdeploy/cli/utils.py | 10 ++++++++++ 6 files changed, 28 insertions(+), 1 deletion(-) diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index 89c07fb196..b28937dd4c 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -341,6 +341,8 @@ def parse_args(): ArgumentHelper.backend(parser) # pytorch engine args pt_group = parser.add_argument_group('PyTorch engine arguments') + ArgumentHelper.eager_mode(pt_group) + tp_act = ArgumentHelper.tp(pt_group) cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group) cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) @@ -422,6 +424,7 @@ def main(): session_len=session_len, tp=args.tp, thread_safe=True, + eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, ) gen_config = GenerationConfig(top_k=args.top_k, diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index cdffe870d0..764f78399c 100644 --- a/benchmark/profile_pipeline_api.py +++ b/benchmark/profile_pipeline_api.py @@ -194,6 +194,8 @@ def parse_args(): # pytorch engine args pt_group = parser.add_argument_group('PyTorch engine arguments') + ArgumentHelper.eager_mode(pt_group) + tp_act = ArgumentHelper.tp(pt_group) session_len_act = ArgumentHelper.session_len(pt_group, default=4096) cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group) @@ -241,6 +243,7 @@ def main(): max_batch_size=args.concurrency, tp=args.tp, thread_safe=False, + eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, ) diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index e6f461d97b..9d573d51b1 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -281,6 +281,8 @@ def parse_args(): # pytorch engine args pt_group = parser.add_argument_group('PyTorch engine arguments') + ArgumentHelper.eager_mode(pt_group) + tp_act = ArgumentHelper.tp(pt_group) session_len_act = ArgumentHelper.session_len(pt_group, default=4096) cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group) @@ -328,6 +330,7 @@ def main(): max_batch_size=args.concurrency, tp=args.tp, thread_safe=True, + eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, quant_policy=args.quant_policy, ) diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py index 79ac2833cf..a2083c6e64 100644 --- a/lmdeploy/cli/cli.py +++ b/lmdeploy/cli/cli.py @@ -122,6 +122,7 @@ def add_parser_chat(): pt_group = parser.add_argument_group('PyTorch engine arguments') ArgumentHelper.adapters(pt_group) ArgumentHelper.device(pt_group) + ArgumentHelper.eager_mode(pt_group) # common engine args dtype_act = ArgumentHelper.dtype(pt_group) tp_act = ArgumentHelper.tp(pt_group) @@ -265,6 +266,7 @@ def chat(args): adapters=adapters, enable_prefix_caching=args.enable_prefix_caching, device_type=args.device, + eager_mode=args.eager_mode, quant_policy=args.quant_policy) run_chat(args.model_path, engine_config, @@ -275,6 +277,7 @@ def chat(args): 
kwargs.pop('chat_template') kwargs.pop('backend') kwargs.pop('device') + kwargs.pop('eager_mode') kwargs['chat_template_config'] = chat_template_config run_chat(**kwargs) diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 8007a96678..68f9de8c15 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -59,11 +59,12 @@ def add_parser_gradio(): # pytorch engine args pt_group = parser.add_argument_group('PyTorch engine arguments') + ArgumentHelper.device(pt_group) + ArgumentHelper.eager_mode(pt_group) # common engine args dtype_act = ArgumentHelper.dtype(pt_group) tp_act = ArgumentHelper.tp(pt_group) - ArgumentHelper.device(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) max_batch_size_act = ArgumentHelper.max_batch_size(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) @@ -159,6 +160,8 @@ def add_parser_api_server(): ArgumentHelper.adapters(pt_group) ArgumentHelper.device(pt_group) + ArgumentHelper.eager_mode(pt_group) + # common engine args dtype_act = ArgumentHelper.dtype(pt_group) tp_act = ArgumentHelper.tp(pt_group) @@ -261,6 +264,7 @@ def gradio(args): enable_prefix_caching=args.enable_prefix_caching, device_type=args.device, quant_policy=args.quant_policy, + eager_mode=args.eager_mode, max_prefill_token_num=args.max_prefill_token_num) else: backend_config = TurbomindEngineConfig( @@ -311,6 +315,7 @@ def api_server(args): enable_prefix_caching=args.enable_prefix_caching, device_type=args.device, quant_policy=args.quant_policy, + eager_mode=args.eager_mode, max_prefill_token_num=args.max_prefill_token_num) else: from lmdeploy.messages import TurbomindEngineConfig diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 44ce718b53..ad7a058c8f 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -495,3 +495,13 @@ def disable_fastapi_docs(parser): default=False, help="Disable FastAPI's OpenAPI schema," ' Swagger UI, and ReDoc endpoint') + + @staticmethod + def eager_mode(parser): + """Add argument eager_mode to parser.""" + + return parser.add_argument('--eager-mode', + action='store_true', + default=False, + help='Whether to enable eager mode. 
' + 'If True, cuda graph would be disabled') From 89f52bca7f84f4b84ee7336d5d8665fbc0962c9b Mon Sep 17 00:00:00 2001 From: q yao Date: Thu, 24 Oct 2024 21:36:49 +0800 Subject: [PATCH 030/122] match torch and torch_vision version (#2649) --- requirements/runtime.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 7fb2491014..400c492b09 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -16,7 +16,7 @@ sentencepiece shortuuid tiktoken torch<=2.4.0,>=2.0.0 -torchvision<=0.18.1,>=0.15.0 +torchvision<=0.19.0,>=0.15.0 transformers triton>=2.2.0,<=3.0.0; sys_platform == "linux" uvicorn From d8f9e3597954edb74b72f9bf8458404a70c5c323 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Fri, 25 Oct 2024 14:19:18 +0800 Subject: [PATCH 031/122] Check whether device support bfloat16 (#2653) * fallback to float16 if torch.cuda.is_bf16_supported is False * fix coverting qwen2-awq failed * update * update * rollback load.py * update --- lmdeploy/pytorch/check_env/__init__.py | 11 ++++-- lmdeploy/pytorch/configurations/cogvlm.py | 5 +-- lmdeploy/pytorch/configurations/qwen.py | 7 ++-- lmdeploy/turbomind/deploy/converter.py | 8 +++- lmdeploy/utils.py | 37 +++++++++++++++++++ tests/pytorch/kernel/test_apply_rotary.py | 3 +- .../kernel/test_multinomial_sampling.py | 3 +- tests/pytorch/kernel/test_rms_norm.py | 4 +- 8 files changed, 62 insertions(+), 16 deletions(-) diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index 291b1afb35..6250997253 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -208,18 +208,21 @@ def __check_model_dtype_support(config): import torch from lmdeploy.pytorch.config import ModelConfig + from lmdeploy.utils import is_bf16_supported try: model_config = ModelConfig.from_hf_config(config, model_path=model_path, dtype=dtype) if model_config.dtype == torch.bfloat16: - assert torch.cuda.is_bf16_supported(), ( + assert is_bf16_supported(), ( 'bf16 is not supported on your device') except AssertionError as e: - message = (f'Your device does not support `{model_config.dtype}`. ' - 'Try edit `torch_dtype` in `config.json`.\n' - 'Note that this might have negative effect!') + message = ( + f'Your device does not support `{model_config.dtype}`. 
' + 'You can set `dtype` to float16 in PyTorchEngineConfig or ' + '`--dtype float16` to api_server.\n' + 'Note that this might have negative effect!') _handle_exception(e, 'Model', logger, message=message) except Exception as e: message = (f'Checking failed with error {e}', diff --git a/lmdeploy/pytorch/configurations/cogvlm.py b/lmdeploy/pytorch/configurations/cogvlm.py index f5fe5695ef..b24d92d794 100644 --- a/lmdeploy/pytorch/configurations/cogvlm.py +++ b/lmdeploy/pytorch/configurations/cogvlm.py @@ -14,12 +14,11 @@ def condition(cls, hf_config): @classmethod def build(cls, hf_config, model_path: str = None): """build.""" - import torch + from lmdeploy.utils import is_bf16_supported cfg = DefaultModelConfigBuilder.build(hf_config) if getattr(hf_config, 'num_multi_query_heads', None): cfg.num_key_value_heads = hf_config.num_multi_query_heads cfg.cogvlm_style = True - torch_dtype = 'bfloat16' if torch.cuda.is_bf16_supported( - ) else 'float16' + torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16' hf_config.torch_dtype = torch_dtype return cfg diff --git a/lmdeploy/pytorch/configurations/qwen.py b/lmdeploy/pytorch/configurations/qwen.py index f8fbd0731b..05ac77c1d1 100644 --- a/lmdeploy/pytorch/configurations/qwen.py +++ b/lmdeploy/pytorch/configurations/qwen.py @@ -13,16 +13,15 @@ def condition(cls, hf_config): @classmethod def build(cls, hf_config, model_path: str = None): """build.""" - import torch + from lmdeploy.utils import is_bf16_supported cfg = DefaultModelConfigBuilder.build(hf_config) if cfg.bos_token_id is None: cfg.bos_token_id = 151644 if cfg.eos_token_id is None: cfg.eos_token_id = 151645 - is_bf16_supported = torch.cuda.is_bf16_supported() - torch_dtype = 'bfloat16' if is_bf16_supported else 'float16' - if hf_config.bf16 and is_bf16_supported: + torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16' + if hf_config.bf16 and is_bf16_supported(): torch_dtype = 'bfloat16' elif hf_config.fp16: torch_dtype = 'float16' diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index a444501b3b..14787c7812 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -11,7 +11,7 @@ from lmdeploy.model import MODELS, best_match_model from lmdeploy.utils import get_logger, get_model -from ...utils import _get_and_verify_max_len +from ...utils import _get_and_verify_max_len, is_bf16_supported from ..supported_models import SUPPORTED_ARCHS, is_supported from .config import TurbomindModelConfig from .exporter import get_exporter_factory @@ -138,6 +138,10 @@ def get_output_model_registered_name_and_config(model_path: str, else: assert 0, f'unsupported specified data type {dtype}' + if weight_type == 'bfloat16' and not is_bf16_supported(): + logger.warn('data type fallback to float16 since ' + 'torch.cuda.is_bf16_supported is False') + weight_type = 'float16' config.model_config.model_arch = model_arch config.model_config.weight_type = weight_type config.model_config.model_format = model_format @@ -226,7 +230,7 @@ def get_tm_model(model_path, f'mismatched quant method: user input ' \ f'"{engine_config.model_format}" ' \ f'vs model quant_config "{quant_method}"' - assert group_size is None or group_size == _group_size, \ + assert not group_size or group_size == _group_size, \ f'mismatched quant group size: user input "{group_size}" ' \ f'vs model quant_config "{_group_size}"' diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py index 1f069b6807..fbdd374f80 100644 --- a/lmdeploy/utils.py +++ 
b/lmdeploy/utils.py @@ -352,3 +352,40 @@ def get_max_batch_size(device_type: str): return 16 elif device_type == 'maca': return 128 + + +def is_bf16_supported(device_type: str = 'cuda'): + """Check if device support bfloat16. + + Args: + device_type (str): the type of device + """ + + if device_type == 'cuda': + import torch + device = torch.cuda.current_device() + + # Check for CUDA version and device compute capability. + # This is a fast way to check for it. + cuda_version = torch.version.cuda + if (cuda_version is not None and int(cuda_version.split('.')[0]) >= 11 + and torch.cuda.get_device_properties(device).major >= 8): + return True + else: + return False + elif device_type == 'ascend': + # The following API doesn't work somehow in multi-npu devices. Due to + # the `ascend910` device's capability to support bfloat16, we are + # returning true as a workaround + return True + # import torch_npu + # device_name = torch_npu.npu.get_device_name(0)[:10] + # device_name = device_name.lower() + # if device_name.startwith('ascend910'): + # return True + # else: + # return False + elif device_type == 'maca': + return True + else: + return False diff --git a/tests/pytorch/kernel/test_apply_rotary.py b/tests/pytorch/kernel/test_apply_rotary.py index c8ca1dd77c..e9d8c07bec 100644 --- a/tests/pytorch/kernel/test_apply_rotary.py +++ b/tests/pytorch/kernel/test_apply_rotary.py @@ -2,6 +2,7 @@ import torch from lmdeploy.pytorch.kernels import apply_rotary_pos_emb +from lmdeploy.utils import is_bf16_supported def _rotate_half(x): @@ -12,7 +13,7 @@ def _rotate_half(x): def _bf16_mark(): - return pytest.mark.skipif(not torch.cuda.is_bf16_supported(), + return pytest.mark.skipif(not is_bf16_supported(), reason='bf16 not supported.') diff --git a/tests/pytorch/kernel/test_multinomial_sampling.py b/tests/pytorch/kernel/test_multinomial_sampling.py index 9636fa5d3f..b9e9264fa5 100644 --- a/tests/pytorch/kernel/test_multinomial_sampling.py +++ b/tests/pytorch/kernel/test_multinomial_sampling.py @@ -2,10 +2,11 @@ import torch from lmdeploy.pytorch.kernels import multinomial_sampling +from lmdeploy.utils import is_bf16_supported def _bf16_mark(): - return pytest.mark.skipif(not torch.cuda.is_bf16_supported(), + return pytest.mark.skipif(not is_bf16_supported(), reason='bf16 not supported.') diff --git a/tests/pytorch/kernel/test_rms_norm.py b/tests/pytorch/kernel/test_rms_norm.py index b731f372de..802a7a578b 100644 --- a/tests/pytorch/kernel/test_rms_norm.py +++ b/tests/pytorch/kernel/test_rms_norm.py @@ -1,9 +1,11 @@ import pytest import torch +from lmdeploy.utils import is_bf16_supported + def _bf16_mark(): - return pytest.mark.skipif(not torch.cuda.is_bf16_supported(), + return pytest.mark.skipif(not is_bf16_supported(), reason='bf16 not supported.') From 8e794b70ac778341424e37a2e13498354b9fb807 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Fri, 25 Oct 2024 14:26:26 +0800 Subject: [PATCH 032/122] Align UT with triton fill_kv_cache_quant kernel (#2644) --- tests/pytorch/kernel/test_fill_kv_cache.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/pytorch/kernel/test_fill_kv_cache.py b/tests/pytorch/kernel/test_fill_kv_cache.py index 92aa8d7672..20ead6e3ac 100644 --- a/tests/pytorch/kernel/test_fill_kv_cache.py +++ b/tests/pytorch/kernel/test_fill_kv_cache.py @@ -8,17 +8,13 @@ def _div_up(a, b): return (a + b - 1) // b -def precise_round(x: torch.Tensor): - return x.sign() * (x.abs() + 0.5).floor() - - def quant(kv: torch.Tensor, 
nbits: int = 8): """Quant kv on the head_dim.""" amax = kv.amax(dim=-1, keepdim=True) amin = kv.amin(dim=-1, keepdim=True) scales = (amax - amin) / (2**nbits - 1) zeros = -amin / scales - q_kv = precise_round((kv - amin) / scales).to(torch.uint8) + q_kv = (kv / scales + zeros + 0.5).to(torch.uint8) if nbits == 4: q_kv1, q_kv2 = q_kv.split(q_kv.shape[-1] // 2, -1) q_kv = q_kv1 + q_kv2 * 16 From fafd6664704a98547a8769cccff0e6c96f601ca6 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Fri, 25 Oct 2024 14:45:00 +0800 Subject: [PATCH 033/122] Update get_started tutorial about deploying on ascend platform (#2655) * Update get_started.md * Update get_started.md * Update get_started.md * Update get_started.md * Update get_started.md * Update get_started.md * Update get_started.md * lint * Update get_started.md * Update get_started.md * lint --------- Co-authored-by: yaofengchen --- docs/en/get_started/ascend/get_started.md | 41 ++++++++++++++++---- docs/zh_cn/get_started/ascend/get_started.md | 38 ++++++++++++++---- 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index a60d85f4d7..fdc0267503 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -18,13 +18,16 @@ cd lmdeploy The Docker version is supposed to be no less than `18.03`. And `Ascend Docker Runtime` should be installed by following [the official guide](https://www.hiascend.com/document/detail/zh/mindx-dl/60rc2/clusterscheduling/clusterschedulingig/.clusterschedulingig/dlug_installation_012.html). +> \[!CAUTION\] +> If error message `libascend_hal.so: cannot open shared object file` shows, that means **Ascend Docker Runtime** is not installed correctly! + #### Ascend Drivers, Firmware and CANN The target machine needs to install the Huawei driver and firmware version 23.0.3, refer to [CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) -and [download resources](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC3.alpha001&driver=1.0.0.2.alpha). +and [download resources](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha). -And the CANN (version 8.0.RC3.alpha001) software packages should also be downloaded from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC3.alpha001) themselves. Make sure to place the `Ascend-cann-kernels-910b*.run` and `Ascend-cann-toolkit*-aarch64.run` under the root directory of lmdeploy source code +And the CANN (version 8.0.RC2.beta1) software packages should also be downloaded from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26) themselves. Make sure to place the `Ascend-cann-kernels-910b*.run` and `Ascend-cann-toolkit*-aarch64.run` under the root directory of lmdeploy source code #### Build Docker Image @@ -45,6 +48,11 @@ For more information about running the Docker client on Ascend devices, please r ## Offline batch inference +> \[!TIP\] +> Graph mode has been supported on Atlas 800T A2. Currently, InternLM2-7B/LLaMa2-7B/Qwen2-7B are tested on graph mode. +> Users can set `eager_mode=False` to enable graph mode, or, set `eager_mode=True` to disable graph mode. 
+> (Please source `/usr/local/Ascend/nnal/atb/set_env.sh` before enabling graph mode) + ### LLM inference Set `device_type="ascend"` in the `PytorchEngineConfig`: @@ -54,7 +62,7 @@ from lmdeploy import pipeline from lmdeploy import PytorchEngineConfig if __name__ == "__main__":     pipe = pipeline("internlm/internlm2_5-7b-chat", -     backend_config = PytorchEngineConfig(tp=1, device_type="ascend")) +     backend_config=PytorchEngineConfig(tp=1, device_type="ascend", eager_mode=True))     question = ["Shanghai is", "Please introduce China", "How are you?"]     response = pipe(question)     print(response) @@ -69,7 +77,7 @@ from lmdeploy import pipeline, PytorchEngineConfig from lmdeploy.vl import load_image if __name__ == "__main__": pipe = pipeline('OpenGVLab/InternVL2-2B', -     backend_config=PytorchEngineConfig(tp=1, device_type='ascend')) +     backend_config=PytorchEngineConfig(tp=1, device_type='ascend', eager_mode=True))     image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')     response = pipe(('describe this image', image))     print(response) @@ -77,12 +85,17 @@ if __name__ == "__main__": ## Online serving +> \[!TIP\] +> Graph mode has been supported on Atlas 800T A2. Currently, InternLM2-7B/LLaMa2-7B/Qwen2-7B are tested on graph mode. +> Graph mode is default enabled in online serving. Users can add `--eager-mode` to disable graph mode. +> (Please source `/usr/local/Ascend/nnal/atb/set_env.sh` before enabling graph mode) + ### Serve a LLM model Add `--device ascend` in the serve command. ```bash -lmdeploy serve api_server --backend pytorch --device ascend internlm/internlm2_5-7b-chat +lmdeploy serve api_server --backend pytorch --device ascend --eager-mode internlm/internlm2_5-7b-chat ``` ### Serve a VLM model @@ -90,7 +103,7 @@ lmdeploy serve api_server --backend pytorch --device ascend internlm/internlm2_5 Add `--device ascend` in the serve command ```bash -lmdeploy serve api_server --backend pytorch --device ascend OpenGVLab/InternVL2-2B +lmdeploy serve api_server --backend pytorch --device ascend --eager-mode OpenGVLab/InternVL2-2B ``` ## Inference with Command line Interface @@ -98,12 +111,24 @@ lmdeploy serve api_server --backend pytorch --device ascend OpenGVLab/InternVL2 Add `--device ascend` in the serve command. ```bash -lmdeploy chat internlm/internlm2_5-7b-chat --backend pytorch --device ascend +lmdeploy chat internlm/internlm2_5-7b-chat --backend pytorch --device ascend --eager-mode ``` Run the following commands to launch lmdeploy chatting after starting container: ```bash docker exec -it lmdeploy_ascend_demo \ -    bash -i -c "lmdeploy chat --backend pytorch --device ascend internlm/internlm2_5-7b-chat" +    bash -i -c "lmdeploy chat --backend pytorch --device ascend --eager-mode internlm/internlm2_5-7b-chat" +``` + +## Quantization + +### w4a16 AWQ + +Run the following commands to quantize weights on Atlas 800T A2. + +```bash +lmdeploy lite auto_awq $HF_MODEL --work-dir $WORK_DIR --device npu ``` + +Please check [supported_models](../../supported_models/supported_models.md) before use this feature. 
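The English guide above stops after producing the AWQ weights. For completeness, a minimal sketch of loading the quantized output with the PyTorch engine on Ascend follows; the `./internlm2_5-7b-chat-w4a16` work dir is an assumed stand-in for `$WORK_DIR`, and whether a given quantized model loads this way should be confirmed against the supported-models page referenced above.

```python
from lmdeploy import pipeline, PytorchEngineConfig

# Assumed --work-dir written by `lmdeploy lite auto_awq ... --device npu` above.
work_dir = './internlm2_5-7b-chat-w4a16'
pipe = pipeline(work_dir,
                backend_config=PytorchEngineConfig(tp=1,
                                                   device_type='ascend',
                                                   eager_mode=True))
print(pipe(['Hello, please introduce yourself.']))
```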
diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md index ad9ff791ff..2e5632311a 100644 --- a/docs/zh_cn/get_started/ascend/get_started.md +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -17,13 +17,16 @@ cd lmdeploy Docker 版本应不低于 18.03。并且需按照[官方指南](https://www.hiascend.com/document/detail/zh/mindx-dl/60rc2/clusterscheduling/clusterschedulingig/clusterschedulingig/dlug_installation_012.html)安装 Ascend Docker Runtime。 +> \[!CAUTION\] +> 如果在后续容器内出现`libascend_hal.so: cannot open shared object file`错误,说明Ascend Docker Runtime没有被正确安装。 + #### Drivers,Firmware 和 CANN 目标机器需安装华为驱动程序和固件版本 23.0.3,请参考 [CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) -和[下载资源](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC3.alpha001&driver=1.0.0.2.alpha)。 +和[下载资源](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha)。 -另外,`docker/Dockerfile_aarch64_ascend`没有提供CANN 安装包,用户需要自己从[昇腾资源下载中心](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC3.alpha001)下载CANN(8.0.RC3.alpha001)软件包。 +另外,`docker/Dockerfile_aarch64_ascend`没有提供CANN 安装包,用户需要自己从[昇腾资源下载中心](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26)下载CANN(version 8.0.RC2.beta1)软件包。 并将``` Ascend-cann-kernels-910b*.run`` 和 ```Ascend-cann-toolkit\*-aarch64.run\`\` 放在 lmdeploy 源码根目录下。 #### 构建镜像 @@ -45,6 +48,9 @@ docker run -e ASCEND_VISIBLE_DEVICES=0 --rm --name lmdeploy -t lmdeploy-aarch64- ## 离线批处理 +> \[!TIP\] +> 图模式已经支持了Atlas 800T A2。目前,单卡下的InternLM2-7B/LLaMa2-7B/Qwen2-7B已经通过测试。用户可以设定`eager_mode=False`来开启图模式,或者设定`eager_mode=True`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) + ### LLM 推理 将`device_type="ascend"`加入`PytorchEngineConfig`的参数中。 @@ -54,7 +60,7 @@ from lmdeploy import pipeline from lmdeploy import PytorchEngineConfig if __name__ == "__main__":     pipe = pipeline("internlm/internlm2_5-7b-chat", -     backend_config = PytorchEngineConfig(tp=1, device_type="ascend")) +     backend_config=PytorchEngineConfig(tp=1, device_type="ascend", eager_mode=True))     question = ["Shanghai is", "Please introduce China", "How are you?"]     response = pipe(question)     print(response) @@ -69,7 +75,7 @@ from lmdeploy import pipeline, PytorchEngineConfig from lmdeploy.vl import load_image if __name__ == "__main__":     pipe = pipeline('OpenGVLab/InternVL2-2B', - backend_config=PytorchEngineConfig(tp=1, device_type='ascend')) + backend_config=PytorchEngineConfig(tp=1, device_type='ascend', eager_mode=True))     image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')     response = pipe(('describe this image', image))     print(response) @@ -77,12 +83,16 @@ if __name__ == "__main__": ## 在线服务 +> \[!TIP\] +> 图模式已经支持Atlas 800T A2。目前,单卡下的InternLM2-7B/LLaMa2-7B/Qwen2-7B已经通过测试。 +> 在线服务时,图模式默认开启,用户可以添加`--eager-mode`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) + ### LLM 模型服务 将`--device ascend`加入到服务启动命令中。 ```bash -lmdeploy serve api_server --backend pytorch --device ascend internlm/internlm2_5-7b-chat +lmdeploy serve api_server --backend pytorch --device ascend --eager-mode internlm/internlm2_5-7b-chat ``` ### VLM 模型服务 @@ -90,7 +100,7 @@ lmdeploy serve api_server --backend pytorch --device ascend internlm/internlm2_5 将`--device ascend`加入到服务启动命令中。 ```bash -lmdeploy serve 
api_server --backend pytorch --device ascend OpenGVLab/InternVL2-2B +lmdeploy serve api_server --backend pytorch --device ascend --eager-mode OpenGVLab/InternVL2-2B ``` ## 使用命令行与LLM模型对话 @@ -98,12 +108,24 @@ lmdeploy serve api_server --backend pytorch --device ascend OpenGVLab/InternVL2 将`--device ascend`加入到服务启动命令中。 ```bash -lmdeploy chat internlm/internlm2_5-7b-chat --backend pytorch --device ascend +lmdeploy chat internlm/internlm2_5-7b-chat --backend pytorch --device ascend --eager-mode ``` 也可以运行以下命令使启动容器后开启lmdeploy聊天 ```bash docker exec -it lmdeploy_ascend_demo \ -    bash -i -c "lmdeploy chat --backend pytorch --device ascend internlm/internlm2_5-7b-chat" +    bash -i -c "lmdeploy chat --backend pytorch --device ascend --eager-mode internlm/internlm2_5-7b-chat" +``` + +## 量化 + +### w4a16 AWQ + +运行下面的代码可以在Atlas 800T A2上对权重进行W4A16量化。 + +```bash +lmdeploy lite auto_awq $HF_MODEL --work-dir $WORK_DIR --device npu ``` + +支持的模型列表请参考[支持的模型](../../supported_models/supported_models.md)。 From c25520a8e1ea49066ef8e9bc4b451bf00823c035 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Fri, 25 Oct 2024 15:24:07 +0800 Subject: [PATCH 034/122] Add warning message about `do_sample` to alert BC (#2654) * add warning message for do_sample * update * update --- lmdeploy/api.py | 2 +- lmdeploy/serve/async_engine.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lmdeploy/api.py b/lmdeploy/api.py index 939065622a..e66d73754a 100644 --- a/lmdeploy/api.py +++ b/lmdeploy/api.py @@ -11,7 +11,7 @@ def pipeline(model_path: str, backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None, chat_template_config: Optional[ChatTemplateConfig] = None, - log_level: str = 'ERROR', + log_level: str = 'WARNING', max_log_len: int = None, **kwargs): """ diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 598977747c..6b7da04a99 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -501,6 +501,12 @@ async def generate( if gen_config.stop_token_ids is None: gen_config.stop_token_ids = self.stop_words if not gen_config.do_sample: + logger.warn(f'GenerationConfig: {gen_config}') + logger.warn( + 'Since v0.6.0, lmdeploy add `do_sample` in ' + 'GenerationConfig. It defaults to False, meaning greedy ' + 'decoding. 
Please set `do_sample=True` if sampling ' + ' decoding is needed') # greedy decode gen_config.top_k = 1 # avoid unnecessary process From 44a0cd31ec504d14fdb6f440368f885c1134e93a Mon Sep 17 00:00:00 2001 From: CyCle1024 Date: Fri, 25 Oct 2024 16:32:58 +0800 Subject: [PATCH 035/122] [ascend] add ascend graph mode (#2647) * [pytorch] ascend enable atbgraph * add paged prefill attention * refine ascend-update-step-ctx (#26) refine ascend-update-step-ctx --------- Co-authored-by: CyCle1024 * fix: rewrite enable graph for ascend * fix backend error due to folder refactor * remove unnecessary comment * fix rotary_embedding (#27) --------- Co-authored-by: jinminxi104 Co-authored-by: tangzhiyi11 --- .../pytorch/backends/dlinfer/activation.py | 21 ++++ .../backends/dlinfer/ascend/graph_runner.py | 116 ++++++++++++++++++ .../backends/dlinfer/ascend/op_backend.py | 99 +++++++++++---- .../pytorch/backends/dlinfer/op_backend.py | 6 + .../backends/dlinfer/rotary_embedding.py | 84 +++++++++++++ lmdeploy/pytorch/engine/logits_process.py | 1 + .../pytorch/kernels/dlinfer/activation.py | 7 ++ .../kernels/dlinfer/apply_rotary_pos_emb.py | 8 +- .../pytorch/kernels/dlinfer/pagedattention.py | 2 +- 9 files changed, 315 insertions(+), 29 deletions(-) create mode 100644 lmdeploy/pytorch/backends/dlinfer/activation.py create mode 100644 lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py create mode 100644 lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py create mode 100644 lmdeploy/pytorch/kernels/dlinfer/activation.py diff --git a/lmdeploy/pytorch/backends/dlinfer/activation.py b/lmdeploy/pytorch/backends/dlinfer/activation.py new file mode 100644 index 0000000000..566fe11621 --- /dev/null +++ b/lmdeploy/pytorch/backends/dlinfer/activation.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from lmdeploy.pytorch.kernels.dlinfer.activation import silu_and_mul + +from ..activation import SiluAndMulBuilder, SiluAndMulImpl + + +class DlinferSiluAndMulImpl(SiluAndMulImpl): + """silu + multiple fused implementation.""" + + def forward(self, x): + """forward.""" + return silu_and_mul(x) + + +class DlinferSiluAndMulBuilder(SiluAndMulBuilder): + """silu and mul implementation builder.""" + + @staticmethod + def build(inplace: bool = False): + """build.""" + return DlinferSiluAndMulImpl() diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py new file mode 100644 index 0000000000..3ecc4223bd --- /dev/null +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
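+# Graph runner for the Ascend backend: when graph mode is enabled it
+# registers the dlinfer kernels as torch custom ops, marks the allocated
+# KV-cache blocks as static shapes, and compiles the model with the
+# 'atbgraph' torch.compile backend. Graph mode is skipped when eager_mode
+# is requested or when tensor parallelism is initialized.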
+import warnings +from importlib import import_module + +import torch +import torch.distributed + +from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig +from lmdeploy.utils import get_logger + +from ...graph_runner import GraphRunner + +logger = get_logger('lmdeploy') + + +class AscendGraphRunner(GraphRunner): + """ascend graph runner.""" + + def __init__(self, model: torch.nn.Module, model_config: ModelConfig, + cache_config: CacheConfig, backend_config: BackendConfig, + device: torch.device): + super().__init__(model, model_config, cache_config, backend_config, + device) + + self.enable_graph = self.check_enable_graph() + if self.enable_graph: + import dlinfer.graph + dlinfer.graph.config.enable_graph_mode = True + self.patch_kernels_custom_op() + self.patch_kvcache_static_shape() + self.model = torch.compile(self.model, + fullgraph=True, + dynamic=True, + backend='atbgraph') + + def check_enable_graph(self): + """check enable graph.""" + # eager_mode + if self.backend_config.eager_mode: + return False + # tp + if torch.distributed.is_initialized(): + warnings.warn( + "Graph mode of device_type 'ascend' only supports tp=1 " + 'for now, fallback to eager mode', RuntimeWarning) + return False + # model support + self.supported_model = { + 'Llama2': 'LlamaConfig', + 'InternLM2': 'InternLM2Config', + 'Qwen2': 'Qwen2Config', + } + is_model_support = True + model_config_name = str(type(self.model_config.hf_config).__name__) + if model_config_name not in self.supported_model.values(): + is_model_support = False + if not is_model_support: + warnings.warn( + "Graph mode of device_type 'ascend' only supports models: " + f"{', '.join(self.supported_model.keys())} when tp=1 for now", + RuntimeWarning) + return True + + def patch_kernels_custom_op(self): + from dlinfer.graph.custom_op import register_custom_op + dlinfer_kernels_module = import_module( + 'lmdeploy.pytorch.kernels.dlinfer') + dlinfer_backends_module = import_module( + 'lmdeploy.pytorch.backends.dlinfer') + + # prefill_attention + module_str = 'pagedattention' + paged_attn_module = getattr(dlinfer_kernels_module, module_str) + func_str = 'prefill_attention' + prefill_attn_origin = getattr(paged_attn_module, func_str) + prefill_attn_registered = register_custom_op( + f'lmdeploy::{func_str}', ['attn_output'])(prefill_attn_origin) + setattr(paged_attn_module, func_str, prefill_attn_registered) + + # apply_rotary_pos_emb + def apply_rotary_emb_abstract_impl(q, k, cos, sin, q_out, k_out): + result = [q, k] + if q_out is not None: + result[0] = q_out + if k_out is not None: + result[1] = k_out + return tuple(result) + + module_str = 'apply_rotary_emb' + apply_rotary_emb_module = getattr(dlinfer_backends_module, module_str) + func_str = 'apply_rotary_pos_emb' + apply_rotary_pos_emb_origin = getattr(apply_rotary_emb_module, + func_str) + apply_rotary_pos_emb_registered = register_custom_op( + f'lmdeploy::{func_str}', + impl_abstract_func=apply_rotary_emb_abstract_impl)( + apply_rotary_pos_emb_origin) + setattr(apply_rotary_emb_module, func_str, + apply_rotary_pos_emb_registered) + + def patch_kvcache_static_shape(self): + import torch._dynamo as dynamo + from torch.utils._pytree import tree_map + cache_engine_module = import_module( + 'lmdeploy.pytorch.engine.cache_engine') + class_str = 'CacheEngine' + cache_engine_class = getattr(cache_engine_module, class_str) + func_str = 'allocate_gpu_cache' + allocate_gpu_cache_origin = getattr(cache_engine_class, func_str) + + def allocate_gpu_cache_mark_static(self): + gpu_cache = 
allocate_gpu_cache_origin(self) + tree_map(lambda x: dynamo.mark_static(x), gpu_cache) + return gpu_cache + + setattr(cache_engine_class, func_str, allocate_gpu_cache_mark_static) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 065e39b421..79e5288364 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -3,6 +3,7 @@ import torch +from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig from lmdeploy.utils import get_logger from ..op_backend import DlinferOpsBackend @@ -12,6 +13,9 @@ class AscendOpsBackend(DlinferOpsBackend): """ascend layer backend.""" + enable_graph = False + half_negative_inf = torch.finfo(torch.float16).min + total_slots = None @staticmethod def get_name() -> str: @@ -45,21 +49,23 @@ def get_v_block_shape( @classmethod def update_step_context(cls, step_context): """update step context.""" + + def get_total_slots(): + if cls.total_slots is None: + cls.total_slots = torch.arange( + block_num * block_size, + dtype=torch.long, + device=step_context.block_offsets.device) + cls.total_slots = cls.total_slots.view(block_num, block_size) + return cls.total_slots + kv_start_indices, attention_mask = [], [] block_num, block_size, _ = step_context.kv_caches[0][0].shape - device = step_context.block_offsets.device - is_unpaged_prefill = False if not step_context.is_decoding: is_unpaged_prefill = \ all((step_context.q_seqlens == step_context.kv_seqlens).tolist()) - - total_slots = torch.arange(block_num * block_size, - dtype=torch.long, - device=device) - total_slots = total_slots.view(block_num, block_size) - q_seqlens_list = step_context.q_seqlens.tolist() kv_seqlens_list = step_context.kv_seqlens.tolist() max_q_seq_len = max(q_seqlens_list) @@ -71,9 +77,9 @@ def update_step_context(cls, step_context): # collect kv start indices. history_length = kv_seq_len - q_seq_len - slot_tables = total_slots[step_context.block_offsets[i]].flatten() - slot_indices = [p for p in range(history_length, kv_seq_len)] - slots = slot_tables[slot_indices].reshape((-1, 1)) + total_slots = get_total_slots() + slot_tables = total_slots[step_context.block_offsets[i]].view(-1) + slots = slot_tables[history_length:kv_seq_len] kv_start_indices.append(slots) # collect attention mask of paged_prefill attention stage. @@ -83,7 +89,8 @@ def update_step_context(cls, step_context): torch.ones(q_seq_len, step_context.block_offsets.shape[1] * block_size, - dtype=torch.bool).cuda(), + dtype=torch.bool, + device=step_context.block_offsets.device), diagonal=kv_seq_len - q_seq_len, )) attention_mask.append(single_attention_mask) @@ -91,11 +98,10 @@ def update_step_context(cls, step_context): kv_start_indices = torch.cat(kv_start_indices) if step_context.is_decoding: - # prepare somae params of paged_decode attention stage. + # prepare some params of paged_decode attention stage. q_start_loc_cpu, q_seqlens_cpu = None, None - kv_seqlens_cpu = step_context.kv_seqlens.cpu() elif is_unpaged_prefill: - # prepare somae params of unpaged_prefill attention stage. + # prepare some params of unpaged_prefill attention stage. q_start_loc_cpu, kv_seqlens_cpu = None, None q_seqlens_cpu = step_context.q_seqlens.cpu() single_attention_mask = torch.logical_not( @@ -106,16 +112,46 @@ def update_step_context(cls, step_context): )) attention_mask.append(single_attention_mask) else: - # prepare somae params of paged_prefill attention stage. 
+ # prepare some params of paged_prefill attention stage. q_start_loc_cpu, q_seqlens_cpu = None, None - kv_seqlens_cpu = step_context.kv_seqlens.repeat_interleave( - step_context.q_seqlens, 0).cpu() - block_offsets_int32 = step_context.block_offsets.to(torch.int32) - step_context.block_offsets = block_offsets_int32.repeat_interleave( - step_context.q_seqlens, 0) - attention_mask = [ - torch.cat([mask for mask in attention_mask]).unsqueeze(1) - ] + attention_mask = [torch.cat([mask for mask in attention_mask])] + + if cls.enable_graph: + kv_start_indices = kv_start_indices.view(-1).to(torch.int32) + import torch._dynamo as dynamo + if not is_unpaged_prefill: + step_context.block_offsets = step_context.block_offsets.to( + torch.int32) + if not step_context.is_decoding: + step_context.block_offsets = step_context.block_offsets\ + .repeat_interleave(step_context.q_seqlens, 0) + dynamo.mark_dynamic(step_context.block_offsets, [0, 1]) + kv_seqlens = step_context.kv_seqlens.to(torch.int32) + if not step_context.is_decoding: + if is_unpaged_prefill: + attention_mask = [mask.half() for mask in attention_mask] + else: + attention_mask = [ + torch.cat([ + mask.half() * cls.half_negative_inf + for mask in attention_mask + ]).unsqueeze(1) + ] + kv_seqlens = kv_seqlens.repeat_interleave( + step_context.q_seqlens, 0) + else: + if step_context.is_decoding: + kv_seqlens_cpu = step_context.kv_seqlens.cpu() + elif is_unpaged_prefill: + pass + else: + kv_seqlens_cpu = step_context.kv_seqlens.repeat_interleave( + step_context.q_seqlens, 0).cpu() + block_offsets_int32 = step_context.block_offsets.to( + torch.int32) + step_context.block_offsets = block_offsets_int32\ + .repeat_interleave(step_context.q_seqlens, 0) + kv_seqlens = kv_seqlens_cpu attn_meta_cls = cls.get_attention_metadata_cls() attn_metadata = attn_meta_cls( @@ -123,7 +159,7 @@ def update_step_context(cls, step_context): step_context.block_offsets, q_start_loc=q_start_loc_cpu, q_seqlens=q_seqlens_cpu, - kv_seqlens=kv_seqlens_cpu, + kv_seqlens=kv_seqlens, kv_start_indices=kv_start_indices, block_size=block_size, attention_mask=attention_mask, @@ -134,3 +170,16 @@ def update_step_context(cls, step_context): step_context.attn_metadata = attn_metadata return step_context + + @staticmethod + def build_graph_runner(model: torch.nn.Module, model_config: ModelConfig, + cache_config: CacheConfig, + backend_config: BackendConfig, + device: torch.device): + """build graph runner.""" + from .graph_runner import AscendGraphRunner + ascend_graph_runner = AscendGraphRunner(model, model_config, + cache_config, backend_config, + device) + AscendOpsBackend.enable_graph = ascend_graph_runner.enable_graph + return ascend_graph_runner diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py index 124633f857..031f51fdca 100644 --- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -28,6 +28,9 @@ def get_layer_impl_builder(cls, layer_type: OpType): elif layer_type == OpType.ApplyRotaryEmb: from .apply_rotary_emb import DlinferApplyRotaryEmbBuilder return DlinferApplyRotaryEmbBuilder + elif layer_type == OpType.SiluAndMul: + from .activation import DlinferSiluAndMulBuilder + return DlinferSiluAndMulBuilder elif layer_type == OpType.RMSNorm: from .norm import DlinferRMSNormBuilder return DlinferRMSNormBuilder @@ -40,6 +43,9 @@ def get_layer_impl_builder(cls, layer_type: OpType): elif layer_type == OpType.LinearW4A16: from .awq_modules import AwqLinearW4A16Builder 
return AwqLinearW4A16Builder + elif layer_type == OpType.RotaryEmbedding: + from .rotary_embedding import DlinferRotaryEmbeddingBuilder + return DlinferRotaryEmbeddingBuilder else: logger.debug( f'Op {layer_type} fallback to default implementation.') diff --git a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py new file mode 100644 index 0000000000..e97c9d1338 --- /dev/null +++ b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py @@ -0,0 +1,84 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn + +from ..default.rotary_embedding import (Llama3RotaryEmbeddingImpl, + LlamaDynamicNTKScalingRotaryEmbedding) +from ..rotary_embedding import (Llama3Parameters, LongRoPEScalingParameters, + RopeType, RotaryEmbeddingBuilder, + RotaryEmbeddingImpl, YarnParameters) + + +class DlinferRotaryEmbeddingImpl(RotaryEmbeddingImpl, nn.Module): + """base rotary embedding.""" + + def __init__(self, + dim: int, + base: int = 10000, + scaling_factor: float = 1.0): + super().__init__() + self.scaling_factor = scaling_factor + self.dim = dim + self.base = base + inv_freq = 1.0 / (self.base**( + torch.arange(0, self.dim, 2, dtype=torch.int64).float() / + self.dim)).float().cuda() + self.register_buffer('inv_freq', inv_freq, persistent=False) + + def forward(self, x, position_ids): + """forward.""" + # x: [bs, num_attention_heads, seq_len, head_size] + if self.inv_freq.device != x.device: + self.inv_freq = self.inv_freq.to(x.device) + + if self.scaling_factor != 1.0: + position_ids = position_ids.float() / self.scaling_factor + else: + position_ids = position_ids.float() + + inv_freq_expanded = self.inv_freq.view(1, -1, 1) + position_ids_expanded = position_ids.unsqueeze(1) + + # # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance( + device_type, str) and device_type != 'mps' else 'cpu' + inv_freq_expanded = inv_freq_expanded + position_ids_expanded = position_ids_expanded + tmp = torch.bmm(inv_freq_expanded, position_ids_expanded) + freqs = tmp.transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class DlinferRotaryEmbeddingBuilder(RotaryEmbeddingBuilder): + """rotary embedding builder.""" + + @staticmethod + def build( + dim: int, + max_position_embeddings: int = 2048, + base: int = 10000, + scaling_factor: float = 1.0, + yarn_params: YarnParameters = None, + longrope_params: LongRoPEScalingParameters = None, + llama3_params: Llama3Parameters = None, + emb_type: RopeType = RopeType.Default, + ): + """build.""" + if emb_type in (RopeType.Default, RopeType.LinearScaling): + return DlinferRotaryEmbeddingImpl(dim, base, scaling_factor) + elif emb_type == RopeType.DynamicNTKScaling: + return LlamaDynamicNTKScalingRotaryEmbedding( + dim, base, scaling_factor, max_position_embeddings) + elif emb_type == RopeType.Llama3: + return Llama3RotaryEmbeddingImpl(dim, base, scaling_factor, + llama3_params.low_freq_factor, + llama3_params.high_freq_factor, + max_position_embeddings) + else: + raise NotImplementedError( + f'Unsupported embedding type: {emb_type}') diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py index 2ee2eaced2..44eb25a8c5 100644 --- a/lmdeploy/pytorch/engine/logits_process.py +++ b/lmdeploy/pytorch/engine/logits_process.py @@ 
-336,6 +336,7 @@ def __call__(self, all_ids: torch.LongTensor, guided_input_ids, self.tokenizer) return scores + @torch.inference_mode() def sampling(self, logits: torch.Tensor): """sampling.""" sampling_inputs = self.sampling_inputs diff --git a/lmdeploy/pytorch/kernels/dlinfer/activation.py b/lmdeploy/pytorch/kernels/dlinfer/activation.py new file mode 100644 index 0000000000..b862fdfb86 --- /dev/null +++ b/lmdeploy/pytorch/kernels/dlinfer/activation.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import dlinfer.ops as ext_ops +from torch import Tensor + + +def silu_and_mul(input_tensor: Tensor, ) -> Tensor: + return ext_ops.silu_and_mul(input_tensor) diff --git a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py index e67cfda232..0f13f3f38c 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py +++ b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + import dlinfer.ops as ext_ops from torch import Tensor @@ -8,9 +10,9 @@ def apply_rotary_pos_emb( key_states: Tensor, cos: Tensor, sin: Tensor, - q_embed: Tensor = None, - k_embed: Tensor = None, -): + q_embed: Optional[Tensor], + k_embed: Optional[Tensor], +) -> Tuple[Tensor, Tensor]: query_states = query_states.contiguous() key_states = key_states.contiguous() bs = query_states.shape[0] diff --git a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py index c8fc4e90e2..21c72074a4 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py @@ -19,7 +19,7 @@ def prefill_attention( block_size: int, attn_mask: Sequence[Optional[Tensor]], is_unpaged_prefill: Optional[bool], -): +) -> Tensor: num_q_heads = query_states.shape[1] num_kv_heads = value_states.shape[1] From c1819879bce8ac39d6368c866d53530f0418a95a Mon Sep 17 00:00:00 2001 From: CyCle1024 Date: Fri, 25 Oct 2024 17:46:12 +0800 Subject: [PATCH 036/122] update ascend dockerfile (#2661) * update ascend dockerfile * update nnal source in dockerfile * add torchvision version in Dockerfile_aarch64_ascend --- docker/Dockerfile_aarch64_ascend | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile_aarch64_ascend b/docker/Dockerfile_aarch64_ascend index fe7f0c8e2a..f9c07cb69c 100644 --- a/docker/Dockerfile_aarch64_ascend +++ b/docker/Dockerfile_aarch64_ascend @@ -73,6 +73,7 @@ $LD_LIBRARY_PATH ARG CHIP=all ARG TOOLKIT_PKG=Ascend-cann-toolkit_*.run ARG KERNELS_PKG=Ascend-cann-kernels-*.run +ARG NNAL_PKG=Ascend-nnal_*.run RUN --mount=type=cache,target=/tmp,from=build_temp,source=/tmp \ umask 0022 && \ @@ -83,10 +84,11 @@ RUN --mount=type=cache,target=/tmp,from=build_temp,source=/tmp \ else \ CHIPOPTION=""; \ fi && \ - chmod +x $TOOLKIT_PKG $KERNELS_PKG && \ + chmod +x $TOOLKIT_PKG $KERNELS_PKG $NNAL_PKG && \ ./$TOOLKIT_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all $CHIPOPTION && \ ./$KERNELS_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all && \ - rm -f $TOOLKIT_PKG $KERNELS_PKG + ./$NNAL_PKG --quiet --install --install-path=$ASCEND_BASE && + rm -f $TOOLKIT_PKG $KERNELS_PKG $NNAL_PKG ENV GLOG_v=2 \ LD_LIBRARY_PATH=$TOOLKIT_PATH/lib64:$LD_LIBRARY_PATH \ @@ -99,14 +101,15 @@ ENV PYTHONPATH=$TBE_IMPL_PATH:$PYTHONPATH SHELL ["/bin/bash", "-c"] RUN echo "source 
/usr/local/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc && \ + echo "source /usr/local/Ascend/nnal/atb/set_env.sh --cxx_abi=0" >> ~/.bashrc && \ . ~/.bashrc # dlinfer -# transformers>=4.41.0 is required for internlm2 model # timm is required for internvl2 model RUN --mount=type=cache,target=/root/.cache/pip \ - pip3 install transformers>=4.41.0 timm && \ - pip3 install dlinfer-ascend==0.1.0.post1 + pip3 install torch==2.3.1 torchvision==0.18.1 torch-npu==2.3.1 && \ + pip3 install transformers timm && \ + pip3 install dlinfer-ascend==0.1.1 # lmdeploy FROM build_temp as copy_temp From 1a76efbe24e5f951628bbd34e3b85615a3f0924e Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Fri, 25 Oct 2024 17:46:57 +0800 Subject: [PATCH 037/122] Update ascend get_started tutorial about installing nnal (#2662) * Update get_started.md * Update get_started.md --- docs/en/get_started/ascend/get_started.md | 2 +- docs/zh_cn/get_started/ascend/get_started.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index fdc0267503..c737185420 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -27,7 +27,7 @@ The target machine needs to install the Huawei driver and firmware version 23.0. [CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) and [download resources](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha). -And the CANN (version 8.0.RC2.beta1) software packages should also be downloaded from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26) themselves. Make sure to place the `Ascend-cann-kernels-910b*.run` and `Ascend-cann-toolkit*-aarch64.run` under the root directory of lmdeploy source code +And the CANN (version 8.0.RC2.beta1) software packages should also be downloaded from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26) themselves. 
Make sure to place the `Ascend-cann-kernels-910b*.run`, `Ascend-cann-nnal_*.run` and `Ascend-cann-toolkit*-aarch64.run` under the root directory of lmdeploy source code #### Build Docker Image diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md index 2e5632311a..cde7409a36 100644 --- a/docs/zh_cn/get_started/ascend/get_started.md +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -27,7 +27,7 @@ Docker 版本应不低于 18.03。并且需按照[官方指南](https://www.hias 和[下载资源](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha)。 另外,`docker/Dockerfile_aarch64_ascend`没有提供CANN 安装包,用户需要自己从[昇腾资源下载中心](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26)下载CANN(version 8.0.RC2.beta1)软件包。 -并将``` Ascend-cann-kernels-910b*.run`` 和 ```Ascend-cann-toolkit\*-aarch64.run\`\` 放在 lmdeploy 源码根目录下。 +并将`Ascend-cann-kernels-910b*.run`,`Ascend-cann-nnal_*.run`和`Ascend-cann-toolkit*.run` 放在 lmdeploy 源码根目录下。 #### 构建镜像 From 962e76070b5a1a1c6c61443b095d7db4b31bcd93 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 25 Oct 2024 18:43:31 +0800 Subject: [PATCH 038/122] MoE support for turbomind (#2621) * initial moe support * dynamic grouped gemm * benchmark * moe benchmark * moe sampling * split-k * refactor tuning * simplify * n-major weight * add `num` for `MatrixLayout` * packed rows * packed cols * dispatch for packed rows * w4a16 moe * refactor model loading * fix pytorch loader * refactor * dispatch w4a16 moe * fix loader * add comment * fix msvc build * fix msvc build * fix msvc build * fix ut * fix ut * fix p-lora * add all support arches * minor * fix lint * fix lint * fix lint * fix ut * bf16 support * minor * refactor * fix lint * fix ut * minor * minor * minor * fix inter_size config * load with non-standard filenames * fix loader * fix missing default param * defer the loading of misc weights for safetensors * fix conversion * fix deepseek-vl * verify model config * pad inter size by group size and tp * fix minicpm attn bias & ignore un-needed bias * set `attn_bias` based on minicpm version --- .pre-commit-config.yaml | 2 +- lmdeploy/turbomind/deploy/config.py | 12 +- lmdeploy/turbomind/deploy/converter.py | 14 +- lmdeploy/turbomind/deploy/exporter.py | 211 ------ lmdeploy/turbomind/deploy/loader.py | 160 +++++ lmdeploy/turbomind/deploy/module.py | 262 ++++++++ lmdeploy/turbomind/deploy/parameter.py | 86 +++ lmdeploy/turbomind/deploy/policy.py | 6 +- .../turbomind/deploy/source_model/__init__.py | 1 + .../turbomind/deploy/source_model/base.py | 141 +--- .../deploy/source_model/deepseek_vl.py | 6 +- .../turbomind/deploy/source_model/glm4.py | 8 + .../deploy/source_model/internlm2.py | 3 + .../turbomind/deploy/source_model/internvl.py | 4 + .../turbomind/deploy/source_model/llama.py | 126 +--- .../turbomind/deploy/source_model/minicpmv.py | 11 + .../turbomind/deploy/source_model/mixtral.py | 36 ++ .../turbomind/deploy/source_model/qwen.py | 13 + .../deploy/source_model/xcomposer2.py | 28 +- .../turbomind/deploy/target_model/base.py | 69 +- lmdeploy/turbomind/deploy/target_model/fp.py | 36 +- lmdeploy/turbomind/supported_models.py | 4 +- src/turbomind/kernels/core/math.h | 41 ++ src/turbomind/kernels/core/sub_byte_ptr.h | 10 +- src/turbomind/kernels/gemm/CMakeLists.txt | 18 +- src/turbomind/kernels/gemm/arch/config_simt.h | 22 +- .../kernels/gemm/arch/config_sm70_s884.h | 22 +- .../kernels/gemm/arch/config_sm75_s16816.h | 24 +- 
.../kernels/gemm/arch/config_sm80_s16816.h | 17 +- src/turbomind/kernels/gemm/arch/mma_sm80.h | 12 +- .../kernels/gemm/arch/operand_sm80_s16816.h | 10 +- src/turbomind/kernels/gemm/context.cu | 603 ++++++++++++++++++ src/turbomind/kernels/gemm/context.h | 177 +++++ src/turbomind/kernels/gemm/convert_v2.cu | 89 ++- src/turbomind/kernels/gemm/convert_v2.h | 35 +- src/turbomind/kernels/gemm/cta_map.h | 250 ++++++-- src/turbomind/kernels/gemm/desc.h | 15 +- src/turbomind/kernels/gemm/dispatch_cache.cu | 50 +- src/turbomind/kernels/gemm/epilogue.h | 250 ++++---- src/turbomind/kernels/gemm/gemm.cu | 170 ++--- src/turbomind/kernels/gemm/gemm.h | 8 +- src/turbomind/kernels/gemm/gemm_universal.h | 130 ++-- src/turbomind/kernels/gemm/iterator.h | 15 +- src/turbomind/kernels/gemm/iterator_sm70.h | 106 +-- src/turbomind/kernels/gemm/iterator_sm80.h | 83 ++- src/turbomind/kernels/gemm/kernel.cu | 118 ++-- src/turbomind/kernels/gemm/kernel.h | 10 +- .../kernel/f16_u4g128_f16_tnt_sm70_s884.cu | 6 +- .../kernel/f16_u4g128_f16_tnt_sm75_s16816.cu | 9 +- .../kernel/f16_u4g128_f16_tnt_sm75_simt.cu | 6 +- .../kernel/f16_u4g128_f16_tnt_sm80_s16816.cu | 25 +- .../kernel/f16_u4g128_f16_tnt_sm90_s16816.cu | 21 +- .../kernels/gemm/kernel/sm70_s884_dynamic.cu | 77 +++ .../gemm/kernel/sm75_s16816_dynamic.cu | 73 +++ .../gemm/kernel/sm80_s16816_dynamic.cu | 123 ++++ .../gemm/kernel/sm90_s16816_dynamic.cu | 123 ++++ .../kernel/u4g128_f16_f16_nnn_sm80_s16816.cu | 26 +- src/turbomind/kernels/gemm/kernel_impl.h | 236 ++++--- src/turbomind/kernels/gemm/matrix_ptr.h | 112 ++++ src/turbomind/kernels/gemm/moe_utils_v2.cu | 475 ++++++++++++++ src/turbomind/kernels/gemm/moe_utils_v2.h | 62 ++ src/turbomind/kernels/gemm/registry.cu | 33 +- src/turbomind/kernels/gemm/registry.h | 7 + src/turbomind/kernels/gemm/test/gemm_bench.cu | 22 +- src/turbomind/kernels/gemm/test/gemm_test.cu | 24 +- src/turbomind/kernels/gemm/test/models.h | 12 +- src/turbomind/kernels/gemm/test/reference.cu | 3 + .../kernels/gemm/test/test_moe_utils.cu | 373 +++++++++++ src/turbomind/kernels/gemm/test/test_utils.h | 2 + src/turbomind/kernels/gemm/test/testbed.h | 439 +++++++++++-- src/turbomind/kernels/gemm/tuner/measurer.cu | 5 + src/turbomind/kernels/gemm/types.h | 58 ++ src/turbomind/kernels/gemm/utils.h | 16 + src/turbomind/kernels/gpt_kernels.cu | 3 + src/turbomind/models/llama/CMakeLists.txt | 1 + src/turbomind/models/llama/LlamaBatch.cc | 127 +++- src/turbomind/models/llama/LlamaBatch.h | 2 + .../models/llama/LlamaDecoderLayerWeight.cc | 353 +++++----- .../models/llama/LlamaDecoderLayerWeight.h | 4 + src/turbomind/models/llama/LlamaDenseWeight.h | 113 +++- src/turbomind/models/llama/LlamaFfnLayer.cc | 2 +- src/turbomind/models/llama/LlamaFfnLayer.h | 7 +- .../models/llama/LlamaInstanceComm.h | 34 - src/turbomind/models/llama/LlamaLinear.cu | 89 +++ src/turbomind/models/llama/LlamaLinear.h | 9 + src/turbomind/models/llama/LlamaV2.cc | 108 +--- src/turbomind/models/llama/LlamaV2.h | 3 +- src/turbomind/models/llama/LlamaWeight.cc | 14 +- src/turbomind/models/llama/LlamaWeight.h | 1 + src/turbomind/models/llama/context.h | 66 ++ src/turbomind/models/llama/llama_params.h | 11 + src/turbomind/models/llama/llama_utils.cu | 6 + src/turbomind/models/llama/llama_utils.h | 2 + src/turbomind/models/llama/moe_ffn_layer.cc | 293 +++++++++ src/turbomind/models/llama/moe_ffn_layer.h | 92 +++ .../models/llama/unified_attention_layer.cc | 9 +- .../models/llama/unified_attention_layer.h | 3 + src/turbomind/models/llama/unified_decoder.cc | 33 +- 
src/turbomind/models/llama/unified_decoder.h | 3 + .../triton_backend/llama/LlamaTritonModel.cc | 121 ++-- .../triton_backend/llama/LlamaTritonModel.h | 1 + src/turbomind/utils/cublasMMWrapper.cc | 1 - src/turbomind/utils/monotonic.h | 43 ++ tests/test_lmdeploy/test_auto_backend.py | 21 +- .../test_turbomind/test_converter.py | 8 +- 105 files changed, 5703 insertions(+), 1772 deletions(-) delete mode 100644 lmdeploy/turbomind/deploy/exporter.py create mode 100644 lmdeploy/turbomind/deploy/loader.py create mode 100644 lmdeploy/turbomind/deploy/module.py create mode 100644 lmdeploy/turbomind/deploy/parameter.py create mode 100644 lmdeploy/turbomind/deploy/source_model/mixtral.py create mode 100644 src/turbomind/kernels/gemm/context.cu create mode 100644 src/turbomind/kernels/gemm/context.h create mode 100644 src/turbomind/kernels/gemm/kernel/sm70_s884_dynamic.cu create mode 100644 src/turbomind/kernels/gemm/kernel/sm75_s16816_dynamic.cu create mode 100644 src/turbomind/kernels/gemm/kernel/sm80_s16816_dynamic.cu create mode 100644 src/turbomind/kernels/gemm/kernel/sm90_s16816_dynamic.cu create mode 100644 src/turbomind/kernels/gemm/matrix_ptr.h create mode 100644 src/turbomind/kernels/gemm/moe_utils_v2.cu create mode 100644 src/turbomind/kernels/gemm/moe_utils_v2.h create mode 100644 src/turbomind/kernels/gemm/test/test_moe_utils.cu delete mode 100644 src/turbomind/models/llama/LlamaInstanceComm.h create mode 100644 src/turbomind/models/llama/moe_ffn_layer.cc create mode 100644 src/turbomind/models/llama/moe_ffn_layer.h create mode 100644 src/turbomind/utils/monotonic.h diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62f19298d2..83d077f782 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,7 +44,7 @@ repos: rev: v2.1.0 hooks: - id: codespell - args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h,docker/Dockerfile_aarch64_ascend,docs/en/get_started/ascend/get_started.md,docs/zh_cn/get_started/ascend/get_started.md"] + args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/*,docker/Dockerfile_aarch64_ascend,docs/en/get_started/ascend/get_started.md,docs/zh_cn/get_started/ascend/get_started.md"] - repo: https://github.com/myint/docformatter diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 4ee464e46d..6652650949 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -38,7 +38,7 @@ class ModelConfig: num_layer: int = None inter_size: int = None norm_eps: float = None - attn_bias: int = None + attn_bias: int = 0 start_id: int = None end_id: int = None size_per_head: int = 128 @@ -47,6 +47,16 @@ class ModelConfig: session_len: int = None tp: int = 1 model_format: str = 'hf' + expert_num: int = 0 + expert_inter_size: int = 0 + experts_per_token: int = 0 + + def verify(self): + invalid = {} + for k, v in self.__dict__.items(): + if v is None: + invalid[k] = v + assert not invalid, f'incomplete model config: {invalid}' @dataclass diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 14787c7812..1c847ede01 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -14,7 +14,7 @@ from ...utils import _get_and_verify_max_len, is_bf16_supported from ..supported_models import SUPPORTED_ARCHS, is_supported from .config import TurbomindModelConfig -from .exporter import get_exporter_factory +from .module import Transformer from .policy import get_input_policy from 
.source_model.base import INPUT_MODELS from .target_model.base import OUTPUT_MODELS @@ -99,7 +99,6 @@ def get_output_model_registered_name_and_config(model_path: str, group_size (int): the size of group used by awq model """ register_name = 'tm' - turbomind_model_arch = 'llama' weight_type = 'float16' config = TurbomindModelConfig.from_dict() @@ -108,7 +107,6 @@ def get_output_model_registered_name_and_config(model_path: str, session_len = 2048 else: # hf, awq, None model_arch, model_config = get_model_arch(model_path) - turbomind_model_arch = SUPPORTED_ARCHS[model_arch] session_len = _get_and_verify_max_len(model_config, None) if model_format in ['awq', 'gptq']: weight_type = 'int4' @@ -148,11 +146,7 @@ def get_output_model_registered_name_and_config(model_path: str, config.model_config.group_size = group_size config.model_config.session_len = session_len - lora_type = 'plora' if turbomind_model_arch == 'xcomposer2' else '' - - exporter_factory = get_exporter_factory(weight_type, lora_type) - - return register_name, config, exporter_factory + return register_name, config def pack_model_repository(workspace_path: str): @@ -264,7 +258,7 @@ def get_tm_model(model_path, tokenizer_path=model_path, input_policy=input_policy) - output_model_name, tm_cfg, exporter_factory = \ + output_model_name, tm_cfg = \ get_output_model_registered_name_and_config( model_path=model_path, model_format=engine_config.model_format, @@ -278,7 +272,7 @@ def get_tm_model(model_path, output_model = OUTPUT_MODELS.get(output_model_name)( input_model=input_model, cfg=tm_cfg, - exporter_factory=exporter_factory, + model_cls=Transformer, out_dir=out_dir) return output_model diff --git a/lmdeploy/turbomind/deploy/exporter.py b/lmdeploy/turbomind/deploy/exporter.py deleted file mode 100644 index 9667d34583..0000000000 --- a/lmdeploy/turbomind/deploy/exporter.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from abc import ABC, abstractmethod - -import torch - -from .target_model.base import BaseOutputModel, BaseReader - - -def permute_v2(x: torch.Tensor, size_per_head: int = 128): - """ - Contract: x.size(-1) is output dims - """ - - assert x.size(-1) > 1 - - output_dims = x.size(-1) - head_num = output_dims // size_per_head - - return x.view(-1, head_num, 2, - size_per_head // 2).transpose(2, 3).reshape(x.shape) - - -def merge_qkv_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int): - """ - Contract: x.size(-1) is output dims - """ - - def reshape(x): - return x.view(x.size(0), tp, -1) if q.dim() == 2 else x.view(tp, -1) - - qkv = torch.cat(tuple(map(reshape, (q, k, v))), dim=-1) - - qkv = qkv.view(-1, qkv.size(-1) * tp) - if q.dim() == 1: - qkv.squeeze_() - - return qkv - - -def identity(x): - return x - - -def transpose(x): - return x.t() if x is not None else x - - -def pack_u4_row(x: torch.Tensor) -> torch.Tensor: - assert x.dtype == torch.uint8 - xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) - a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) - for t in reversed(xs): - a = (a << 4) | t - return a.squeeze(dim=-1) - - -def pad_out_dims(x: torch.Tensor, dims: int): - pad = dims - x.size(-1) - assert pad >= 0 - return torch.nn.functional.pad(x, (0, pad), 'constant', 0) - - -def pad_in_dims(x: torch.Tensor, dims: int): - pad = dims - x.size(0) - assert x.dim() == 2 - assert pad >= 0 - return torch.nn.functional.pad(x, (0, 0, 0, pad), 'constant', 0) - - -class BaseExporter(ABC): - - _attn = 'layers.{0}.attention.{1}.{2}' - _ffn = 'layers.{0}.feed_forward.{1}.{2}' - - def __init__(self, model: BaseOutputModel): - self.model = model - self.tp = model.tensor_para_size - self.head_dim = model.model_config.size_per_head - self.inter_size = model.model_config.inter_size - - def export_attn(self, idx: int, qkvo, kind: str, pack_fn=identity): - if all(x is None for x in qkvo): - return - is_lora_a, is_lora_b = self.get_lora_flags(kind) - q, k, v, o = map(transpose, qkvo) - if self.model.permute_qk: - q = permute_v2(q, self.head_dim) - k = permute_v2(k, self.head_dim) - qkv = merge_qkv_v2(q, k, v, self.tp) - if o is None and q.dim() == 1: - o = torch.zeros_like(q) - qkv = pack_fn(qkv) - o = pack_fn(o) - self.model.save_split(qkv, - self._attn.format(idx, 'w_qkv', kind), - split_dim=-1, - copy=is_lora_a) - self.model.save_split(o, - self._attn.format(idx, 'wo', kind), - split_dim=0, - copy=is_lora_b) - - def export_ffn(self, idx: int, w123, kind: str, pack_fn=identity, g=1): - is_lora_a, is_lora_b = self.get_lora_flags(kind) - w1, w2, w3 = map(transpose, w123) - - if not is_lora_a: - w1 = pad_out_dims(w1, self.inter_size) - w3 = pad_out_dims(w3, self.inter_size) - if not is_lora_b: - w2 = pad_in_dims(w2, self.inter_size // g) - - w1, w2, w3 = map(pack_fn, (w1, w2, w3)) - self.model.save_split(w1, - self._ffn.format(idx, 'w1', kind), - split_dim=-1, - copy=is_lora_a) - self.model.save_split(w3, - self._ffn.format(idx, 'w3', kind), - split_dim=-1, - copy=is_lora_a) - self.model.save_split(w2, - self._ffn.format(idx, 'w2', kind), - split_dim=0, - copy=is_lora_b) - - # split out dims -> copy A, split-out-dims B (qkv, w1, w3) - # split in dims -> split-in-dims A, copy B ( o, w2) - def get_lora_flags(self, kind: str): - return ('lora_a' in kind, 'lora_b' in kind) - - @abstractmethod - def export(self, r: BaseReader, idx: int): - pass - - -class WeightExporter(BaseExporter): - - def export(self, r: BaseReader, i: int): - self.export_attn(i, r.attn(i), 'weight') - 
self.export_attn(i, r.attn_bias(i), 'bias') - self.export_ffn(i, r.ffn(i), 'weight') - - -class LayerNormExporter(BaseExporter): - - def export(self, r: BaseReader, i: int): - attn_norm = r.attn_norm(i) - ffn_norm = r.ffn_norm(i) - self.model.save_split(attn_norm, f'layers.{i}.attention_norm.weight') - self.model.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') - - -class QuantWeightExporter(BaseExporter): - - def __init__(self, model: BaseOutputModel, pack_fn): - super().__init__(model) - self.pack_fn = pack_fn - self.group_size = model.tm_config.group_size - - def export(self, r: BaseReader, i: int): - - def to_half(x: torch.Tensor): - return x.to(torch.half) - - self.export_attn(i, r.attn(i), 'qweight', self.pack_fn) - self.export_attn(i, r.attn_bias(i), 'bias', to_half) - self.export_attn(i, r.attn_scale(i), 'scales', to_half) - self.export_attn(i, r.attn_zero(i), 'zeros', to_half) - self.export_ffn(i, r.ffn(i), 'qweight', self.pack_fn) - self.export_ffn(i, r.ffn_scale(i), 'scales', to_half, self.group_size) - self.export_ffn(i, r.ffn_zero(i), 'zeros', to_half, self.group_size) - - -class PLoraExporter(BaseExporter): - - def export_attn_lora_a(self, idx: int, ws, kind: str): - is_lora_a, is_lora_b = self.get_lora_flags(kind) - qkv, o = map(transpose, ws) - self.model.save_split(qkv, - self._attn.format(idx, 'w_qkv', kind), - split_dim=-1, - copy=is_lora_a) - self.model.save_split(o, - self._attn.format(idx, 'wo', kind), - split_dim=0, - copy=is_lora_b) - - def export(self, r: BaseReader, i: int): - self.export_attn_lora_a(i, r.attn_lora_a(i), 'lora_a.weight') - self.export_attn(i, r.attn_lora_b(i), 'lora_b.weight') - self.export_ffn(i, r.ffn_lora_a(i), 'lora_a.weight') - self.export_ffn(i, r.ffn_lora_b(i), 'lora_b.weight') - - -def get_exporter_factory(weight_type, lora_type): - - def get_exporters(model: BaseOutputModel): - exporters = [LayerNormExporter(model)] - - if weight_type == 'int4': - exporters.append(QuantWeightExporter(model, pack_u4_row)) - else: - exporters.append(WeightExporter(model)) - - if lora_type == 'plora': - exporters.append(PLoraExporter(model)) - - return exporters - - return get_exporters diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py new file mode 100644 index 0000000000..e3d79b164a --- /dev/null +++ b/lmdeploy/turbomind/deploy/loader.py @@ -0,0 +1,160 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
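+# Shard-aware weight loaders: each loader enumerates the checkpoint files
+# (safetensors or pytorch bins), groups tensors by decoder layer index and
+# yields (layer_index, tensors) pairs; index -1 carries the non-layer
+# (misc) weights.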
+import json +import os.path as osp +import re +from abc import ABC, abstractmethod +from collections import defaultdict +from functools import partial +from glob import glob +from typing import Iterator, Tuple + +import torch +from safetensors import safe_open + +# https://github.com/huggingface/transformers/blob/53fad641cfdb5105e2470bcf3ef17ea8e25cc300/src/transformers/modeling_utils.py#L372 +WEIGHT_INDEX_NAME = 'pytorch_model.bin.index.json' +WEIGHT_PATTERN = 'pytorch_model*.bin' +SAFE_WEIGHT_INDEX_NAME = 'model.safetensors.index.json' +SAFE_WEIGHT_PATTERN = 'model*.safetensors' +EXTRA_WEIGHT_PATTERNS = ['*.pt', '*.bin'] +EXTRA_SAFE_WEIGHT_PATTERN = '*.safetensors' + + +class BaseLoader(ABC): + + def __init__(self, model_path: str, pattern): + self.model_path = model_path + self.pattern = pattern + self.item_count = defaultdict(int) + + def get_index(self, index_name: str, + file_pattern: str) -> Tuple[dict, list]: + """get shards and weight map (if possible) for the model.""" + get_path = partial(osp.join, self.model_path) + shards = [] + if index_name: + with open(get_path(index_name), 'r') as f: + index = json.load(f) + index = index['weight_map'] + shards = list(map(get_path, set(index.values()))) + else: + index = {} + shards = glob(get_path(file_pattern)) + if not shards: + raise RuntimeError( + f'failed to locate weight files for {self.model_path}') + return sorted(shards), index + + @abstractmethod + def items(self) -> Iterator[Tuple[int, dict]]: + pass + + +class SafetensorsLoader(BaseLoader): + + def __init__(self, + model_path: str, + pattern: str, + index_name=None, + file_pattern=None): + super().__init__(model_path, pattern) + self.shards, index = self.get_index(index_name, file_pattern) + if not index: + for shard in self.shards: + with safe_open(shard, 'pt') as f: + index.update({k: shard for k in f.keys()}) + # count layer-wise parameters + for k in index.keys(): + match = re.findall(self.pattern, k) + if match: + self.item_count[int(match[0])] += 1 + + def items(self): + params = defaultdict(dict) + for shard in self.shards: + with safe_open(shard, 'pt') as f: + misc = [] + for k in f.keys(): + match = re.findall(self.pattern, k) + if not match: + misc.append(k) + else: + idx = int(match[0]) + param = params[idx] + param[k] = f.get_tensor(k) + if len(param) == self.item_count[idx]: + yield (idx, params.pop(idx)) + if misc: + yield (-1, {k: f.get_tensor(k) for k in misc}) + assert not params + + +class PytorchLoader(BaseLoader): + + def __init__(self, + model_path: str, + pattern: str, + index_name=None, + file_pattern=None): + super().__init__(model_path, pattern) + self.shards, index = self.get_index(index_name, file_pattern) + for k in index.keys(): + match = re.findall(self.pattern, k) + if match: + self.item_count[int(match[0])] += 1 + + def items(self): + params = defaultdict(dict) + for shard in self.shards: + misc = {} + tmp = torch.load(shard, map_location='cpu') + for k, v in tmp.items(): + match = re.findall(self.pattern, k) + if not match: + misc[k] = v + else: + idx = int(match[0]) + params[idx][k] = v + del tmp + if misc: + yield (-1, misc) + misc.clear() + ready = [] + if self.item_count: + for idx, param in params.items(): + if len(param) == self.item_count[idx]: + ready.append(idx) + else: + ready = sorted(params.keys())[:-1] + for idx in ready: + yield (idx, params.pop(idx)) + idxs = sorted(params.keys()) + for idx in idxs: + yield (idx, params.pop(idx)) + + +def create_loader(model_path: str, pattern: str) -> BaseLoader: + args = (model_path, pattern) 
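+    # Probe the checkpoint layout in priority order: indexed safetensors,
+    # plain safetensors, indexed pytorch bins, plain bins, and finally
+    # non-standard file names (*.safetensors, *.pt, *.bin).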
+ + if osp.exists(osp.join(model_path, SAFE_WEIGHT_INDEX_NAME)): + return SafetensorsLoader(*args, index_name=SAFE_WEIGHT_INDEX_NAME) + + if glob(osp.join(model_path, SAFE_WEIGHT_PATTERN)): + return SafetensorsLoader(*args, file_pattern=SAFE_WEIGHT_PATTERN) + + if osp.exists(osp.join(model_path, WEIGHT_INDEX_NAME)): + return PytorchLoader(*args, index_name=WEIGHT_INDEX_NAME) + + if glob(osp.join(model_path, WEIGHT_PATTERN)): + return PytorchLoader(*args, file_pattern=WEIGHT_PATTERN) + + # non-standard safetensors model (*.safetensors) + if glob(osp.join(model_path, EXTRA_SAFE_WEIGHT_PATTERN)): + return SafetensorsLoader(*args, file_pattern=EXTRA_SAFE_WEIGHT_PATTERN) + + # non-standard pytorch model (*.bin, *.pt) + for p in EXTRA_WEIGHT_PATTERNS: + if glob(osp.join(model_path, p)): + return PytorchLoader(*args, file_pattern=p) + + raise RuntimeError(f'Failed to find valid loader for {model_path}') diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py new file mode 100644 index 0000000000..a9f7385376 --- /dev/null +++ b/lmdeploy/turbomind/deploy/module.py @@ -0,0 +1,262 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABC, abstractmethod +from functools import partial + +import torch + +from .parameter import get_params +from .source_model.base import BaseReader +from .target_model.base import BaseOutputModel + + +def permute_v2(x: torch.Tensor, size_per_head: int = 128): + """ + Contract: x.size(-1) is output dims + """ + + assert x.size(-1) > 1 + + output_dims = x.size(-1) + head_num = output_dims // size_per_head + + return x.view(-1, head_num, 2, + size_per_head // 2).transpose(2, 3).reshape(x.shape) + + +def merge_qkv_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int): + """ + Contract: x.size(-1) is output dims + """ + + def reshape(x): + return x.view(x.size(0), tp, -1) if q.dim() == 2 else x.view(tp, -1) + + qkv = torch.cat(tuple(map(reshape, (q, k, v))), dim=-1) + + qkv = qkv.view(-1, qkv.size(-1) * tp) + if q.dim() == 1: + qkv.squeeze_() + + return qkv + + +def transpose(x): + return x.t() if x is not None else x + + +def pad_out_dims(x: torch.Tensor, dims: int): + pad = dims - x.size(-1) + assert pad >= 0 + return torch.nn.functional.pad(x, (0, pad), 'constant', 0) + + +def pad_in_dims(x: torch.Tensor, dims: int): + pad = dims - x.size(0) + assert x.dim() == 2 + assert pad >= 0 + return torch.nn.functional.pad(x, (0, 0, 0, pad), 'constant', 0) + + +# split out dims -> copy A, split-out-dims B (qkv, w1, w3) +# split in dims -> split-in-dims A, copy B ( o, w2) +def get_lora_flags(kind: str): + return ('lora_a' in kind, 'lora_b' in kind) + + +class Module(ABC): + + def __init__(self, model: BaseOutputModel): + self.model = model + + def __call__(self, *args, **kwargs): + return self.apply(*args, **kwargs) + + @abstractmethod + def apply(self, idx: int, r: BaseReader): + pass + + +class LayerNorm(Module): + + def apply(self, i: int, r: BaseReader): + attn_norm = r.attn_norm(i) + ffn_norm = r.ffn_norm(i) + self.model.save_split(attn_norm, f'layers.{i}.attention_norm.weight') + self.model.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') + + +class Ffn(Module): + """ + requires: + r.ffn(i, kind) + """ + + _ffn = 'layers.{0}.feed_forward.{1}.{2}' + + def __init__(self, model: BaseOutputModel): + self.model = model + self.tp = model.tensor_para_size + self.inter_size = model.model_config.inter_size + self.group_size = max(1, model.model_config.group_size) + + def _export(self, + fmt: str, + idx: int, + w123, + kind: str, + 
pack_fn, + apply_gs=False): + is_lora_a, is_lora_b = get_lora_flags(kind) + w1, w2, w3 = map(transpose, w123) + + if not is_lora_a: + w1 = pad_out_dims(w1, self.inter_size) + w3 = pad_out_dims(w3, self.inter_size) + if not is_lora_b: + group_size = self.group_size if apply_gs else 1 + w2 = pad_in_dims(w2, self.inter_size // group_size) + + w1, w2, w3 = map(pack_fn, (w1, w2, w3)) + self.model.save_split(w1, + fmt.format(idx, 'w1', kind), + split_dim=-1, + copy=is_lora_a) + self.model.save_split(w3, + fmt.format(idx, 'w3', kind), + split_dim=-1, + copy=is_lora_a) + self.model.save_split(w2, + fmt.format(idx, 'w2', kind), + split_dim=0, + copy=is_lora_b) + + def apply(self, i: int, r: BaseReader): + for e in get_params(r.ffn(i, None)): + e(partial(self._export, self._ffn), partial(r.ffn, i), i) + + +class MoeFfn(Ffn): + """ + requires: + r.moe_ffn_expert(e, i, kind) + r.moe_ffn_gate(i) + """ + + _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}' + _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}' + + def __init__(self, model: BaseOutputModel): + super().__init__(model) + self.expert_num = model.model_config.expert_num + + def apply(self, i: int, r: BaseReader): + for p in get_params(r.moe_ffn_expert()): + for e in range(self.expert_num): + fmt = self._moe_ffn_expert.replace('E', str(e)) + p(partial(self._export, fmt), partial(r.moe_ffn_expert, e, i), + i) + + gate = transpose(r.moe_ffn_gate(i)) + self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight')) + + +class Attn(Module): + """ + requires: + r.attn(i, kind) + """ + + _attn = 'layers.{0}.attention.{1}.{2}' + + def __init__(self, model: BaseOutputModel): + self.model = model + self.tp = model.tensor_para_size + self.head_dim = model.model_config.size_per_head + self.attn_bias = model.model_config.attn_bias + + def _reorder_and_merge(self, qkvo): + q, k, v, o = map(transpose, qkvo) + # reorder output dim for tm's rotary embedding layout + if self.model.permute_qk: + q = permute_v2(q, self.head_dim) + k = permute_v2(k, self.head_dim) + qkv = merge_qkv_v2(q, k, v, self.tp) + # zero bias for `wo` when `w_qkv` has bias but `wo` doesn't + if o is None and q.dim() == 1: + o = torch.zeros_like(q) + return qkv, o + + def _export(self, idx: int, qkvo, kind: str, pack_fn, **kwargs): + if all(x is None for x in qkvo): + return + is_lora_a, is_lora_b = get_lora_flags(kind) + if is_lora_a: + qkv, o = map(transpose, qkvo) + else: + qkv, o = self._reorder_and_merge(qkvo) + self.model.save_split(pack_fn(qkv), + self._attn.format(idx, 'w_qkv', kind), + split_dim=-1, + copy=is_lora_a) + self.model.save_split(pack_fn(o), + self._attn.format(idx, 'wo', kind), + split_dim=0, + copy=is_lora_b) + + def apply(self, i: int, r: BaseReader): + for e in get_params(r.attn(i, None), bias=self.attn_bias): + e(self._export, partial(r.attn, i), i) + + +class Misc(Module): + """ + requires: + r.tok_embeddings() + r.norm_weight() + r.output_weight() + """ + + def apply(self, i: int, r: BaseReader): + """Export embedding, norm, output weight.""" + emb = r.tok_embeddings() + norm_weight = r.norm_weight() + output_weight = r.output_weight() + + def pad_weight(tensor): + pad_size = None + vocab_size = self.model.model_config.vocab_size + tp = self.model.tensor_para_size + if vocab_size % tp != 0: + pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size + + if pad_size is None: + return tensor + return torch.nn.functional.pad(tensor, (0, 0, 0, pad_size), + 'constant', 0) + + if emb is not None: + emb = pad_weight(emb) + self.model.save_split(emb, 
'tok_embeddings.weight', split_dim=1) + if norm_weight is not None: + self.model.export_weight(norm_weight, 'norm.weight') + if output_weight is not None: + output_weight = pad_weight(output_weight) + self.model.save_split(output_weight, 'output.weight', split_dim=0) + + +class Transformer: + + def __init__(self, model: BaseOutputModel): + self.model = model + ffn = MoeFfn if model.model_config.expert_num else Ffn + modules = [Attn, LayerNorm, ffn] + self.modules = [c(model) for c in modules] + self.misc = Misc(model) + + def __call__(self, i: int, r: BaseReader): + if i >= 0: + for m in self.modules: + m(i, r) + return 1 + else: + self.misc(i, r) diff --git a/lmdeploy/turbomind/deploy/parameter.py b/lmdeploy/turbomind/deploy/parameter.py new file mode 100644 index 0000000000..82a95c3bfb --- /dev/null +++ b/lmdeploy/turbomind/deploy/parameter.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod +from typing import List + +import torch + + +def identity(x): + return x + + +def to_half(x: torch.Tensor): + return x.to(torch.half) + + +def pack_u4_row(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.uint8 + xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) + a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) + for t in reversed(xs): + a = (a << 4) | t + return a.squeeze(dim=-1) + + +class Parameter: + KEY = () + + @classmethod + def take(cls, keys: List[str]): + if not any(k.endswith(cls.KEYS[0]) for k in keys): + return False + xs = [] + for k in keys: + if any(k.endswith(p) for p in cls.KEYS): + xs.append(k) + for x in xs: + keys.remove(x) + return True + + @abstractmethod + def __call__(cls, f, g, i): + pass + + +class QuantWeightOnly(Parameter): + KEYS = '.qweight', '.scales', '.qzeros' + + def __call__(self, f, g, i): + f(i, g('qweight'), 'qweight', pack_u4_row) + f(i, g('scales'), 'scales', to_half, apply_gs=True) + f(i, g('qzeros'), 'zeros', to_half, apply_gs=True) + + +class Weight(Parameter): + KEYS = '.weight', + + def __call__(self, f, g, i): + f(i, g('weight'), 'weight', identity) + + +class Bias(Parameter): + KEYS = '.bias', + + def __call__(self, f, g, i): + f(i, g('bias'), 'bias', identity) + + +class PLora(Parameter): + KEYS = '.Plora_A.weight', '.Plora_B.weight' + + def __call__(self, f, g, i): + f(i, g('Plora_A.weight'), 'lora_a.weight', identity) + f(i, g('Plora_B.weight'), 'lora_b.weight', identity) + + +def get_params(keys: List[str], bias=0): + ps = [] + if PLora.take(keys): + ps.append(PLora()) + if QuantWeightOnly.take(keys): + ps.append(QuantWeightOnly()) + if Weight.take(keys): + ps.append(Weight()) + if bias and Bias.take(keys): + ps.append(Bias()) + return ps diff --git a/lmdeploy/turbomind/deploy/policy.py b/lmdeploy/turbomind/deploy/policy.py index bc4db7ddae..111e7eca57 100644 --- a/lmdeploy/turbomind/deploy/policy.py +++ b/lmdeploy/turbomind/deploy/policy.py @@ -48,8 +48,8 @@ def process_gptq(x: torch.Tensor, kind: str): def get_input_policy(model_format): if model_format == 'awq': - return ('qweight', process_awq_gemm) + return process_awq_gemm elif model_format == 'gptq': - return ('qweight', process_gptq) + return process_gptq else: - return ('weight', to_cuda) + return to_cuda diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index 9ca06b0cc9..a36102e1c6 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -7,5 +7,6 @@ from .llama import LlamaModel # 
noqa: F401 from .meta_llama import MetaLlamaModel # noqa: F401 from .minicpmv import MiniCPMVModel # noqa: F401 +from .mixtral import MixtralModel # noqa: F401 from .qwen import QwenModel # noqa: F401 from .xcomposer2 import Xcomposer2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/base.py b/lmdeploy/turbomind/deploy/source_model/base.py index c0e468d645..93740c8ff4 100644 --- a/lmdeploy/turbomind/deploy/source_model/base.py +++ b/lmdeploy/turbomind/deploy/source_model/base.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -import re from abc import ABC, abstractmethod -from typing import Dict, Iterator, Tuple, Union +from typing import Dict, Iterator, Union import torch from mmengine import Registry @@ -11,69 +10,11 @@ class BaseReader(ABC): - """Base checkpoint manager.""" + """Mapping between TM modules and source modules.""" def __init__(self): pass - @property - @abstractmethod - def start_layer_id(self) -> int: - """Get the start transformer layer number.""" - pass - - @property - @abstractmethod - def end_layer_id(self) -> int: - """Get the end transformer layer number.""" - pass - - @abstractmethod - def init_layer_id(self) -> None: - """Get start and end transformer layer number.""" - self._start_layer_id = -1 - self._end_layer_id = -1 - layer_count = {} - for key in self.params: - layer_id = re.findall(self.attn_layer_patten, key) - if len(layer_id) == 0: - continue - layer_id = int(layer_id[0]) - if layer_id not in layer_count: - layer_count[layer_id] = 0 - layer_count[layer_id] += 1 - if len(layer_count) == 0: - return - if not (len(layer_count) > 1 or self.last_bin): - return - max_count = max([layer_count[layer_id] for layer_id in layer_count]) - valid_layer_id = [ - layer_id for layer_id in layer_count - if layer_count[layer_id] == max_count - ] - self._start_layer_id = min(valid_layer_id) - self._end_layer_id = max(valid_layer_id) + 1 - - @abstractmethod - def clean_up(self, last: bool) -> None: - """Clean up unused params.""" - if last: - self.params.clear() - else: - to_remove = [] - for key in self.params: - layer_id = re.findall(self.attn_layer_patten, key) - if len(layer_id) == 0: - # tok, norm, output - to_remove.append(key) - else: - layer_id = int(layer_id[0]) - if layer_id < self.end_layer_id: - to_remove.append(key) - for key in to_remove: - self.params.pop(key, None) - torch.cuda.empty_cache() - def transform(self, x: Union[torch.Tensor, None], kind: str) -> Union[torch.Tensor, None]: return None if x is None else self._transform(x, kind) @@ -83,66 +24,6 @@ def _transform(self, x: torch.Tensor, kind: str): """Transform x.""" pass - @abstractmethod - def tok_embeddings(self) -> Union[torch.Tensor, None]: - """Get embeddings.""" - pass - - @abstractmethod - def norm_weight(self) -> Union[torch.Tensor, None]: - """Get norm.""" - pass - - @abstractmethod - def output_weight(self) -> Union[torch.Tensor, None]: - """Get output.""" - pass - - @abstractmethod - def attn(self, i: int) -> Tuple[torch.Tensor]: - """Get q, k, v, o weight for layer i.""" - pass - - @abstractmethod - def attn_bias(self, i: int) -> Tuple[torch.Tensor, None]: - """Get q, k, v, o bias for layer i.""" - pass - - @abstractmethod - def attn_zero(self, i: int) -> Tuple[torch.Tensor, None]: - """Get q, k, v, o zero point for layer i.""" - pass - - @abstractmethod - def attn_scale(self, i: int) -> Tuple[torch.Tensor, None]: - """Get q, k, v, o scale for layer i.""" - pass - - @abstractmethod - def attn_norm(self, i: int) -> torch.Tensor: - """Get attn norm for 
layer i.""" - pass - - @abstractmethod - def ffn(self, i: int) -> Tuple[torch.Tensor]: - """Get ffn weight for layer i.""" - pass - - @abstractmethod - def ffn_zero(self, i: int) -> Tuple[torch.Tensor, None]: - """Get ffn zero point for layer i.""" - pass - - @abstractmethod - def ffn_scale(self, i: int) -> Tuple[torch.Tensor, None]: - """Get ffn scale for layer i.""" - pass - - @abstractmethod - def ffn_norm(self, i: int) -> torch.Tensor: - """Get ffn norm for layer i.""" - pass - class BaseInputModel(ABC): """Base class for input model.""" @@ -157,17 +38,6 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs): self.model_path = model_path self.tokenizer_path = tokenizer_path - @property - @abstractmethod - def nmgrs(self) -> int: - """Get number of checkpoint.""" - pass - - @abstractmethod - def get_mgrs(self) -> Iterator[BaseReader]: - """Conctruct all BaseReader.""" - pass - @abstractmethod def tokenizer_info(self): """Read tokenizer info.""" @@ -178,7 +48,6 @@ def model_info(self) -> Dict: """Read model info.""" pass - def bins(self) -> Iterator[BaseReader]: - """Get Reader.""" - for mgr in self.get_mgrs(): - yield mgr + @abstractmethod + def readers(self) -> Iterator[BaseReader]: + pass diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py b/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py index 2b60454767..7710fb6168 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py @@ -47,7 +47,9 @@ def model_info(self): 'language_config'].get('model_type', None) == 'llama': model_arg = model_arg['language_config'] # depseek-vl num_layer = model_arg['num_hidden_layers'] - hidden_units = model_arg['hidden_size'] + hidden_units = model_arg.get('hidden_size', 4096) + inter_size = model_arg.get('intermediate_size', 11008) + vocab_size = model_arg.get('vocab_size', 102400) norm_eps = model_arg.get('rms_norm_eps', 1e-06) attn_head_num = model_arg.get('num_attention_heads', 32) if 'num_key_value_heads' in model_arg: @@ -71,6 +73,8 @@ def model_info(self): head_num=attn_head_num, kv_head_num=kv_head_num, hidden_units=hidden_units, + inter_size=inter_size, + vocab_size=vocab_size, rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, use_dynamic_ntk=use_dynamic_ntk, diff --git a/lmdeploy/turbomind/deploy/source_model/glm4.py b/lmdeploy/turbomind/deploy/source_model/glm4.py index 1c26e0649a..dd27c837af 100644 --- a/lmdeploy/turbomind/deploy/source_model/glm4.py +++ b/lmdeploy/turbomind/deploy/source_model/glm4.py @@ -17,6 +17,8 @@ class Glm4Reader(LlamaReader): norm_weight_key = 'transformer.encoder.final_layernorm.weight' output_weight_key = 'transformer.output_layer.weight' + attn_pattern = r'self_attention' + def _attn(self, i: int, kind: str): """Get q, k, v, o kind for layer i.""" qkv = self.params[f'transformer.encoder.layers.{i}' @@ -94,6 +96,9 @@ def model_info(self): rope_theta *= rope_ratio attn_head_num = config['num_attention_heads'] kv_head_num = attn_head_num + inter_size = config['ffn_hidden_size'] + vocab_size = config['padded_vocab_size'] + attn_bias = config['add_qkv_bias'] if config['multi_query_attention']: kv_head_num = config['multi_query_group_num'] seq_length = config['seq_length'] @@ -102,6 +107,9 @@ def model_info(self): head_num=attn_head_num, kv_head_num=kv_head_num, hidden_units=hidden_units, + attn_bias=int(attn_bias), + inter_size=inter_size, + vocab_size=vocab_size, rope_theta=rope_theta, max_position_embeddings=seq_length, rotary_embedding=64, diff 
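Note on the GLM4 reader: q, k and v stay fused in a single wqkv tensor (multi-query attention), and model_info now reports head_num/kv_head_num so the fused projection can be split downstream. A hedged illustration of that split, independent of the exact Glm4Reader code:

    import torch

    def split_fused_qkv(qkv: torch.Tensor, head_num: int, kv_head_num: int):
        # qkv: [(head_num + 2 * kv_head_num) * head_dim, hidden_size]
        head_dim = qkv.shape[0] // (head_num + 2 * kv_head_num)
        sections = [head_num * head_dim,
                    kv_head_num * head_dim,
                    kv_head_num * head_dim]
        return torch.split(qkv, sections, dim=0)

    qkv = torch.randn((8 + 2 * 2) * 64, 512)
    q, k, v = split_fused_qkv(qkv, head_num=8, kv_head_num=2)
    assert q.shape[0] == 8 * 64 and k.shape[0] == v.shape[0] == 2 * 64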
--git a/lmdeploy/turbomind/deploy/source_model/internlm2.py b/lmdeploy/turbomind/deploy/source_model/internlm2.py index 71f17517b3..0d66775519 100644 --- a/lmdeploy/turbomind/deploy/source_model/internlm2.py +++ b/lmdeploy/turbomind/deploy/source_model/internlm2.py @@ -15,6 +15,9 @@ class InternLM2Reader(LlamaReader): norm_weight_key = 'model.norm.weight' output_weight_key = 'output.weight' + attn_pattern = r'attention' + ffn_pattern = r'feed_forward' + def _attn(self, i: int, kind: str): """Get q, k, v, o kind for layer i.""" q, k, v = (None, ) * 3 diff --git a/lmdeploy/turbomind/deploy/source_model/internvl.py b/lmdeploy/turbomind/deploy/source_model/internvl.py index 83161adb15..51082fb3a1 100644 --- a/lmdeploy/turbomind/deploy/source_model/internvl.py +++ b/lmdeploy/turbomind/deploy/source_model/internvl.py @@ -63,6 +63,8 @@ def model_info(self): norm_eps = model_arg['rms_norm_eps'] hidden_units = model_arg['hidden_size'] attn_head_num = model_arg['num_attention_heads'] + vocab_size = model_arg['vocab_size'] + inter_size = model_arg['intermediate_size'] if 'num_key_value_heads' in model_arg: kv_head_num = model_arg['num_key_value_heads'] else: @@ -82,6 +84,8 @@ def model_info(self): return dict(num_layer=num_layer, norm_eps=norm_eps, hidden_units=hidden_units, + inter_size=inter_size, + vocab_size=vocab_size, head_num=attn_head_num, kv_head_num=kv_head_num, rope_theta=rope_theta, diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py index a67e3ee4e4..d61d1906e1 100644 --- a/lmdeploy/turbomind/deploy/source_model/llama.py +++ b/lmdeploy/turbomind/deploy/source_model/llama.py @@ -1,15 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. import json -import os import os.path as osp -from glob import glob +import re import torch -from safetensors.torch import load_file from lmdeploy.archs import get_model_arch from lmdeploy.tokenizer import Tokenizer +from ..loader import create_loader from .base import INPUT_MODELS, BaseInputModel, BaseReader @@ -22,6 +21,9 @@ class LlamaReader(BaseReader): norm_weight_key = 'model.norm.weight' output_weight_key = 'lm_head.weight' + attn_pattern = r'self_attn' + ffn_pattern = r'mlp' + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, policy): super().__init__() @@ -32,26 +34,14 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False) if tie_word_embeddings: self.output_weight_key = self.tok_embeddings_key - self.weight_suffix, self.processor = policy - self.init_layer_id() - - def init_layer_id(self): - """Get start/end transformer layer id.""" - super().init_layer_id() - - def clean_up(self, last: bool) -> None: - """Clean up unused params.""" - super().clean_up(last) + self.processor = policy - @property - def start_layer_id(self): - """Get start transformer layer id.""" - return self._start_layer_id - - @property - def end_layer_id(self): - """Get end transformer layer id.""" - return self._end_layer_id + def filter(self, pattern: str): + params = [] + for k in self.params.keys(): + if re.search(pattern, k): + params.append(k) + return params def tok_embeddings(self): """Get embeddings.""" @@ -78,21 +68,10 @@ def _attn(self, i: int, kind: str): result.append(tensor) return (*result, ) - def attn(self, i: int): - """Get q, k, v, o weight for layer i.""" - return self._attn(i, self.weight_suffix) - - def attn_bias(self, i: int): - """Get q, k, v, o bias for layer 
i.""" - return self._attn(i, 'bias') - - def attn_zero(self, i: int): - """Get q, k, v, o zero point for layer i.""" - return self._attn(i, 'qzeros') - - def attn_scale(self, i: int): - """Get q, k, v, o scale for layer i.""" - return self._attn(i, 'scales') + def attn(self, i: int, kind: str): + if not kind: + return self.filter(self.attn_pattern) + return self._attn(i, kind) def attn_norm(self, i: int): """Get attn norm for layer i.""" @@ -101,6 +80,8 @@ def attn_norm(self, i: int): def _ffn(self, i: int, kind: str): """Get ffn kind for layer i.""" + if not kind: + return self.filter(self.ffn_pattern) result = [] for key in ['gate', 'down', 'up']: tensor = self.params[ @@ -109,17 +90,10 @@ def _ffn(self, i: int, kind: str): result.append(tensor) return (*result, ) - def ffn(self, i: int): - """Get ffn weight for layer i.""" - return self._ffn(i, self.weight_suffix) - - def ffn_zero(self, i: int): - """Get ffn zero point for layer i.""" - return self._ffn(i, 'qzeros') - - def ffn_scale(self, i: int): - """Get ffn scale for layer i.""" - return self._ffn(i, 'scales') + def ffn(self, i: int, kind: str): + if not kind: + return self.filter(self.ffn_pattern) + return self._ffn(i, kind) def ffn_norm(self, i: int): """Get ffn norm for layer i.""" @@ -135,54 +109,18 @@ class LlamaModel(BaseInputModel): def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict): super().__init__(model_path, tokenizer_path) - ckpt_path = kwargs.get('ckpt_path') self.policy = kwargs.get('input_policy') - if ckpt_path is None: - ckpt_path = model_path - self.ckpt_path = ckpt_path - self.ckpt_files = self.get_ckpt() _, self.model_config = get_model_arch(model_path) self.model_config = self.model_config.to_dict() - def get_ckpt(self): - """Get weight files.""" - patterns = ['*.safetensors', 'pytorch_model*.bin'] - files = [] - for pattern in patterns: - files = glob(os.path.join(self.ckpt_path, pattern)) - files = [os.path.basename(file) for file in files] - if len(files) > 0: - break - files = sorted(files) - return files - - @property - def nmgrs(self): - """Get number of checkpoint.""" - return len(self.ckpt_files) - - def get_mgrs(self): - """Conctruct all Reader.""" - assert self.nmgrs > 0, \ - f'could not find checkpoints in {self.ckpt_path}' - unused_params = {} - try: - for i, ckpt in enumerate(self.ckpt_files): - is_last_bin = i == len(self.ckpt_files) - 1 - if ckpt.endswith('.bin'): - new_params = torch.load(osp.join(self.ckpt_path, ckpt), - map_location='cpu') - else: - new_params = load_file(osp.join(self.ckpt_path, ckpt)) - ret = self.Reader(new_params, - unused_params, - i == self.nmgrs - 1, - self.model_config, - policy=self.policy) - yield ret - ret.clean_up(is_last_bin) - except GeneratorExit: - ret.clean_up(True) + def readers(self): + loader = create_loader(self.model_path, self.Reader.attn_layer_patten) + for i, param in loader.items(): + reader = self.Reader(param, {}, + False, + self.model_config, + policy=self.policy) + yield i, reader def tokenizer_info(self): """Read tokenizer info.""" @@ -203,6 +141,8 @@ def model_info(self): num_layer = model_arg['num_hidden_layers'] norm_eps = model_arg['rms_norm_eps'] attn_head_num = model_arg['num_attention_heads'] + vocab_size = model_arg['vocab_size'] + inter_size = model_arg['intermediate_size'] if 'num_key_value_heads' in model_arg: kv_head_num = model_arg['num_key_value_heads'] else: @@ -243,6 +183,8 @@ def model_info(self): head_num=attn_head_num, kv_head_num=kv_head_num, hidden_units=hidden_units, + inter_size=inter_size, + 
vocab_size=vocab_size, rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, original_max_position_embeddings=original_max_position_embeddings, diff --git a/lmdeploy/turbomind/deploy/source_model/minicpmv.py b/lmdeploy/turbomind/deploy/source_model/minicpmv.py index a45469a470..0259aa2b22 100644 --- a/lmdeploy/turbomind/deploy/source_model/minicpmv.py +++ b/lmdeploy/turbomind/deploy/source_model/minicpmv.py @@ -1,5 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp + from .base import INPUT_MODELS from .llama import LlamaModel, LlamaReader @@ -18,3 +21,11 @@ class MiniCPMVReader(LlamaReader): class MiniCPMVModel(LlamaModel): """MiniCPMV model in hf format.""" Reader = MiniCPMVReader + + def model_info(self): + info = super().model_info() + with open(osp.join(self.model_path, 'config.json')) as f: + config = json.load(f) + if str(config.get('version')) == '2.6': + info['attn_bias'] = True + return info diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py new file mode 100644 index 0000000000..102ede29f2 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class MixtralReader(LlamaReader): + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for x in ['w1', 'w2', 'w3']: + name = f'model.layers.{i}.block_sparse_moe.experts.{e}.{x}.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_gate(self, i): + return self.params.get( + f'model.layers.{i}.block_sparse_moe.gate.weight') + + +@INPUT_MODELS.register_module(name='mixtral') +class MixtralModel(LlamaModel): + + Reader = MixtralReader + + def model_info(self): + cfg = self.model_config + info = super().model_info() + info['expert_num'] = cfg['num_local_experts'] + info['expert_inter_size'] = cfg['intermediate_size'] + info['experts_per_token'] = cfg['num_experts_per_tok'] + return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 4e87057e62..0ec0586a37 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -16,6 +16,9 @@ class QwenReader(LlamaReader): norm_weight_key = 'transformer.ln_f.weight' output_weight_key = 'lm_head.weight' + attn_pattern = r'attn' + ffn_pattern = r'mlp' + def _attn(self, i: int, kind: str): """Get q, k, v, o kind for layer i.""" q, k, v, o = (None, ) * 4 @@ -77,11 +80,16 @@ def model_info(self): seq_length = config['seq_length'] use_dynamic_ntk = int(config['use_dynamic_ntk']) use_logn_attn = int(config['use_logn_attn']) + vocab_size = config['vocab_size'] + inter_size = config['intermediate_size'] return dict(num_layer=num_layer, norm_eps=norm_eps, hidden_units=hidden_units, head_num=attn_head_num, kv_head_num=kv_head_num, + vocab_size=vocab_size, + inter_size=inter_size, + attn_bias=1, rope_theta=rope_theta, max_position_embeddings=seq_length, use_dynamic_ntk=int(use_dynamic_ntk), @@ -107,3 +115,8 @@ def tokenizer_info(self): bos_id = 151643 eos_id = 151645 return n_words, bos_id, eos_id + + def model_info(self): + cfg = super().model_info() + cfg['attn_bias'] = 1 + return cfg diff --git a/lmdeploy/turbomind/deploy/source_model/xcomposer2.py 
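Note on the new Mixtral support: per-expert FFN weights are addressed through their block_sparse_moe names and the MoE sizes come straight from the HF config. A small sketch of the naming scheme and the extra model_info fields:

    def moe_expert_keys(layer: int, expert: int, kind: str = 'weight'):
        base = f'model.layers.{layer}.block_sparse_moe.experts.{expert}'
        return [f'{base}.{w}.{kind}' for w in ('w1', 'w2', 'w3')]

    def moe_info(cfg: dict) -> dict:
        return dict(expert_num=cfg['num_local_experts'],
                    expert_inter_size=cfg['intermediate_size'],
                    experts_per_token=cfg['num_experts_per_tok'])

    assert moe_expert_keys(0, 3)[0] == \
        'model.layers.0.block_sparse_moe.experts.3.w1.weight'
    print(moe_info({'num_local_experts': 8, 'intermediate_size': 14336,
                    'num_experts_per_tok': 2}))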
b/lmdeploy/turbomind/deploy/source_model/xcomposer2.py index 5564c4b937..d4943debb5 100644 --- a/lmdeploy/turbomind/deploy/source_model/xcomposer2.py +++ b/lmdeploy/turbomind/deploy/source_model/xcomposer2.py @@ -7,23 +7,17 @@ class Xcomposer2Reader(InternLM2Reader): """Xcomposer2 model reader.""" - def attn_lora_a(self, i): - """Get attn lora_a.""" - qkv = self.params[f'model.layers.{i}.attention.wqkv.Plora_A.weight'] - o = self.params[f'model.layers.{i}.attention.wo.Plora_A.weight'] - return qkv, o - - def attn_lora_b(self, i): - """Get attn lora_b.""" - return self._attn(i, 'Plora_B.weight') - - def ffn_lora_a(self, i: int): - """Get ffn lora_a weight for layer i.""" - return self._ffn(i, 'Plora_A.weight') - - def ffn_lora_b(self, i: int): - """Get fnn lora_b weight for layer i.""" - return self._ffn(i, 'Plora_B.weight') + # include only Plora and ignore other lora weights + attn_pattern = r'attention.\w+(.Plora_[AB])?.\w+$' + ffn_pattern = r'feed_forward.\w+(.Plora_[AB])?.\w+$' + + def _attn(self, i, kind): + if 'Plora_A' in kind: + qkv = self.params[ + f'model.layers.{i}.attention.wqkv.Plora_A.weight'] + o = self.params[f'model.layers.{i}.attention.wo.Plora_A.weight'] + return qkv, o + return super()._attn(i, kind) @INPUT_MODELS.register_module(name='xcomposer2') diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index b2da4b441f..4750cde850 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp -from abc import ABC, abstractmethod +from abc import ABC import torch import tqdm @@ -9,7 +9,7 @@ from ..config import (AttentionConfig, LoraConfig, ModelConfig, TurbomindModelConfig, config_from_dict, config_to_dict) -from ..source_model.base import BaseInputModel, BaseReader +from ..source_model.base import BaseInputModel OUTPUT_MODELS = Registry( 'target model', locations=['lmdeploy.turbomind.deploy.target_model.base']) @@ -36,13 +36,20 @@ def _weight_dtype_map(weight_type: str, default=None): return _WEIGHT_DTYPE_MAP.get(weight_type, default) +def _pad_inter_size(inter_size: int, group_size: int, tp: int): + group_size = max(1, group_size) + groups_per_rank = (inter_size // group_size + tp - 1) // tp + inter_size_padded = groups_per_rank * group_size * tp + return inter_size_padded + + class BaseOutputModel(ABC): """Base output model.""" def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, - exporter_factory, + model_cls, out_dir: str = ''): super().__init__() self.input_model = input_model @@ -61,14 +68,21 @@ def __init__(self, self.permute_qk = self.input_model_info.get('permute_qk', True) self.update_model_config() + self.model_config.inter_size = _pad_inter_size( + self.model_config.inter_size, self.model_config.group_size, + self.tensor_para_size) + if self.model_config.expert_num: + self.model_config.expert_inter_size = _pad_inter_size( + self.model_config.expert_inter_size, + self.model_config.group_size, self.tensor_para_size) + self.model_config.verify() assert self.model_config.kv_head_num % self.tensor_para_size == 0 self.update_attention_config() self.update_lora_config() # ! 
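Note on _pad_inter_size above: the intermediate size is rounded up so that every tensor-parallel rank owns a whole number of quantization groups. A worked check of the rule (the sizes below are illustrative):

    def pad_inter_size(inter_size: int, group_size: int, tp: int) -> int:
        group_size = max(1, group_size)
        groups_per_rank = (inter_size // group_size + tp - 1) // tp
        return groups_per_rank * group_size * tp

    assert pad_inter_size(11008, 128, 2) == 11008   # 86 groups split evenly
    assert pad_inter_size(11008, 128, 3) == 11136   # padded to 29 groups per rank
    assert pad_inter_size(11008, 0, 2) == 11008     # group_size clamped to 1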
Dependency on `self` - self.exporters = exporter_factory(self) + self.model = model_cls(self) - @abstractmethod def update_model_config(self): """Update `self.model_config` according to the input_model's `tokenizer_info` and `model_info`""" @@ -78,13 +92,6 @@ def update_model_config(self): final_cfg.update(dict(start_id=bos_id, end_id=eos_id)) final_cfg.update(self.input_model_info) - # get vocab_size - for bin in self.input_model.bins(): - emb = bin.tok_embeddings() - if emb is not None: - _vocab_size, _ = emb.shape - break - final_cfg.update(dict(vocab_size=_vocab_size)) self.model_config = config_from_dict(ModelConfig, final_cfg) def update_attention_config(self): @@ -196,10 +203,8 @@ def export(self) -> None: desc='Convert to turbomind format', leave=self.to_file) self.export_config() - for bin in self.input_model.bins(): - self.export_misc(bin) - for i in range(bin.start_layer_id, bin.end_layer_id): - self.export_transformer_block(bin, i) + for i, reader in self.input_model.readers(): + if self.model(i, reader): pbar.update(1) pbar.close() # manually clean up meta reader @@ -208,38 +213,6 @@ def export(self) -> None: del self.input_model.meta_reader torch.cuda.empty_cache() - def export_misc(self, bin: BaseReader) -> None: - """Export embedding, norm, output weight.""" - emb = bin.tok_embeddings() - norm_weight = bin.norm_weight() - output_weight = bin.output_weight() - - def pad_weight(tensor): - pad_size = None - vocab_size = self.model_config.vocab_size - tp = self.tensor_para_size - if vocab_size % tp != 0: - pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size - - if pad_size is None: - return tensor - return torch.nn.functional.pad(tensor, (0, 0, 0, pad_size), - 'constant', 0) - - if emb is not None: - emb = pad_weight(emb) - self.save_split(emb, 'tok_embeddings.weight', 1) - if norm_weight is not None: - self.export_weight(norm_weight, 'norm.weight') - if output_weight is not None: - output_weight = pad_weight(output_weight) - self.save_split(output_weight, 'output.weight', 0) - - def export_transformer_block(self, bin: BaseReader, i: int) -> None: - """Export transformer block.""" - for e in self.exporters: - e.export(bin, i) - @property def tm_config(self): return TurbomindModelConfig(model_config=self.model_config, diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py index 14e1115b20..11f1f78170 100644 --- a/lmdeploy/turbomind/deploy/target_model/fp.py +++ b/lmdeploy/turbomind/deploy/target_model/fp.py @@ -1,43 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -from ..config import ModelConfig, config_from_dict, config_to_dict from .base import OUTPUT_MODELS, BaseOutputModel @OUTPUT_MODELS.register_module(name='tm') class TurbomindModel(BaseOutputModel): """Export to turbomind fp16 format.""" - - def update_model_config(self): - """Update `self.model_config`. - - Firstly, call `update_model_config` of the superclass. 
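Note on the new export loop: BaseOutputModel no longer special-cases embeddings and per-layer tensors itself; it walks input_model.readers() and hands each (index, reader) pair to the module tree, where a non-negative index is a transformer block and a negative index carries the misc weights. A hedged sketch of that contract:

    class TinyTransformer:
        """Stand-in for the Transformer module built by model_cls(self)."""

        def __call__(self, i, reader):
            if i >= 0:
                print(f'export transformer block {i}')
                return 1                    # advances the progress bar
            print('export tok_embeddings / norm / output')

    def readers():                          # stand-in for BaseInputModel.readers()
        yield 0, object()
        yield 1, object()
        yield -1, object()

    exported = sum(TinyTransformer()(i, r) or 0 for i, r in readers())
    assert exported == 2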
Then update - `inter_size` and `attn_bias` that are indicates from the input_model's - weight files - """ - super().update_model_config() - final_cfg = config_to_dict(self.model_config) - # get attn_bias, inter_size - visit = False - attn_bias = 0 - for bin in self.input_model.bins(): - for i in range(bin.start_layer_id, bin.end_layer_id): - visit = True - w1, w2, w3 = bin.ffn(i) - inter_size = w2.size(-1) - qb, _, _, _ = bin.attn_bias(i) - if qb is not None: - attn_bias = 1 - break - if visit: - break - inter_size = self._pad_inter_size(inter_size) - final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size)) - self.model_config = config_from_dict(ModelConfig, final_cfg) - - def _pad_inter_size(self, inter_size: int): - group_size = max(1, self.model_config.group_size) - tp = self.tensor_para_size - groups_per_rank = (inter_size // group_size + tp - 1) // tp - inter_size_padded = groups_per_rank * group_size * tp - return inter_size_padded + pass diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index bdf129b019..8a1f5e7315 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -38,7 +38,9 @@ MiniGeminiLlamaForCausalLM='llama', # chatglm2/3, glm4 ChatGLMModel='glm4', - ChatGLMForConditionalGeneration='glm4') + ChatGLMForConditionalGeneration='glm4', + # mixtral + MixtralForCausalLM='mixtral') def is_supported(model_path: str): diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h index e71d04c39b..a708a34985 100644 --- a/src/turbomind/kernels/core/math.h +++ b/src/turbomind/kernels/core/math.h @@ -4,6 +4,7 @@ #include "src/turbomind/kernels/core/common.h" #include +#include namespace turbomind { @@ -13,6 +14,12 @@ TM_HOST_DEVICE constexpr T ceil_div(T a, T b) return (a + b - 1) / b; } +template +TM_HOST_DEVICE constexpr T cdiv(T a, T b) +{ + return (a + b - 1) / b; +} + template TM_HOST_DEVICE constexpr T round_up(T a, T b) { @@ -34,4 +41,38 @@ TM_HOST_DEVICE constexpr T log2(T x) // static_assert(log2(32) == 5); // static_assert(log2(1) == 0); +// https://arxiv.org/abs/1902.01961 +template +struct FastDivMod { +}; + +template<> +struct FastDivMod { + uint32_t c_; // cdiv(2^32,d) = (2^32+d-1)/d = (2^32-1)/d+1 + uint32_t d_; + + TM_HOST_DEVICE constexpr FastDivMod(uint16_t d): c_{0xFFFFFFFF / d + 1}, d_{d} {} + + template + TM_HOST_DEVICE friend constexpr uint16_t operator/(T a, FastDivMod b) + { + return (a * (uint64_t)b.c_) >> 32; + } + + template + TM_HOST_DEVICE friend constexpr uint16_t operator%(T a, FastDivMod b) + { + uint64_t lowbits = (a * (uint64_t)b.c_) & 0xFFFFFFFF; + return (lowbits * b.d_) >> 32; + } + + TM_HOST_DEVICE constexpr operator uint16_t() const noexcept + { + return d_; + } +}; + +static_assert(32 / FastDivMod{5} == 6); +static_assert(32 % FastDivMod{5} == 2); + } // namespace turbomind diff --git a/src/turbomind/kernels/core/sub_byte_ptr.h b/src/turbomind/kernels/core/sub_byte_ptr.h index da2e6c525a..0be32fc54f 100644 --- a/src/turbomind/kernels/core/sub_byte_ptr.h +++ b/src/turbomind/kernels/core/sub_byte_ptr.h @@ -15,27 +15,27 @@ struct SubBytePtr { constexpr __host__ __device__ SubBytePtr(char* ptr): ptr_(ptr) {} - __device__ T& operator[](int i) + __host__ __device__ T& operator[](int i) { return *reinterpret_cast(ptr_ + i * bitsof / bitsof); } - friend __device__ SubBytePtr operator+(const SubBytePtr a, int n) + friend __host__ __device__ SubBytePtr operator+(const SubBytePtr a, int n) { return SubBytePtr{a.ptr_ + n * bitsof / bitsof}; } 
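Note on FastDivMod in math.h: following arXiv:1902.01961, division by a runtime divisor d is replaced with one multiply by c = floor(2^32 / d) + 1 and a shift, and the remainder is recovered from the low 32 bits; the trick is exact when both operands fit in 16 bits. A plain-Python check of the arithmetic (not the C++ template itself):

    def fast_divmod(a: int, d: int):
        c = 0xFFFFFFFF // d + 1
        q = (a * c) >> 32
        lowbits = (a * c) & 0xFFFFFFFF
        r = (lowbits * d) >> 32
        return q, r

    for d in (2, 3, 5, 7, 640, 65535):
        for a in (0, 1, 31, 32, 1000, 65535):
            assert fast_divmod(a, d) == (a // d, a % d)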
- friend __device__ SubBytePtr operator+(int n, const SubBytePtr a) + friend __host__ __device__ SubBytePtr operator+(int n, const SubBytePtr a) { return a + n; } - friend __device__ bool operator==(const SubBytePtr& a, const SubBytePtr& b) + friend __host__ __device__ bool operator==(const SubBytePtr& a, const SubBytePtr& b) { return a.ptr_ == b.ptr_; } - __device__ explicit operator T*() const + __host__ __device__ explicit operator T*() const { return (T*)ptr_; } diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt index 6fc634dba0..4e398e9e25 100644 --- a/src/turbomind/kernels/gemm/CMakeLists.txt +++ b/src/turbomind/kernels/gemm/CMakeLists.txt @@ -9,6 +9,7 @@ add_library(gemm2 convert_v2.cu cast.cu unpack.cu + context.cu tuner/cache_utils.cu tuner/measurer.cu tuner/sampler.cu @@ -19,7 +20,13 @@ add_library(gemm2 kernel/f16_u4g128_f16_tnt_sm75_s16816.cu kernel/f16_u4g128_f16_tnt_sm70_s884.cu kernel/f16_u4g128_f16_tnt_sm75_simt.cu - kernel/u4g128_f16_f16_nnn_sm80_s16816.cu + # kernel/u4g128_f16_f16_nnn_sm80_s16816.cu + kernel/sm70_s884_dynamic.cu + kernel/sm75_s16816_dynamic.cu + kernel/sm80_s16816_dynamic.cu + kernel/sm90_s16816_dynamic.cu + moe_utils_v2.cu + test/test_utils.cu ) target_link_libraries(gemm2 PRIVATE parser) @@ -29,7 +36,7 @@ target_compile_options(gemm2 PRIVATE $<$: -Xptxas=-v --generate-line-info - --threads 8> + --threads 16> ) set_property(TARGET gemm2 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) @@ -37,11 +44,14 @@ set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) if (BUILD_TEST) add_executable(gemm_test test/gemm_test.cu - test/test_utils.cu + # test/test_utils.cu test/quantization.cu test/reference.cu) target_link_libraries(gemm_test PRIVATE gemm2 cublas) + add_executable(test_moe_utils test/test_moe_utils.cu test/test_utils.cu) + target_link_libraries(test_moe_utils PRIVATE gemm2 cublas) + if (NOT MSVC) FetchContent_Declare( repo-nvbench @@ -56,7 +66,7 @@ if (BUILD_TEST) add_executable(gemm_bench test/gemm_bench.cu - test/test_utils.cu + # test/test_utils.cu test/quantization.cu test/reference.cu) target_link_libraries(gemm_bench PRIVATE gemm2 nvbench::nvbench cublas) diff --git a/src/turbomind/kernels/gemm/arch/config_simt.h b/src/turbomind/kernels/gemm/arch/config_simt.h index 5652da6c53..8abfe26c96 100644 --- a/src/turbomind/kernels/gemm/arch/config_simt.h +++ b/src/turbomind/kernels/gemm/arch/config_simt.h @@ -17,7 +17,18 @@ namespace turbomind::gemm { namespace simt { -template +template struct Sm75_Simt { static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); @@ -58,12 +69,12 @@ struct Sm75_Simt { using Mainloop = MainloopSm70, + IteratorSm70, TransformA, U, GroupSizeU, B, - IteratorSm70, + IteratorSm70, TransformB, V, GroupSizeV, @@ -80,10 +91,11 @@ struct Sm75_Simt { TILE_C_N, MMA::kThreadCount, Rearrange, - Operand_C, + Operand_C, + mode_C, SplitK>; - using Kernel = GemmUniversal; + using Kernel = GemmUniversal; }; }; diff --git a/src/turbomind/kernels/gemm/arch/config_sm70_s884.h b/src/turbomind/kernels/gemm/arch/config_sm70_s884.h index b7b239162f..3f3cb8e074 100644 --- a/src/turbomind/kernels/gemm/arch/config_sm70_s884.h +++ b/src/turbomind/kernels/gemm/arch/config_sm70_s884.h @@ -16,7 +16,18 @@ namespace turbomind::gemm::sm70_s884 { -template +template struct Sm70_s884 { static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); @@ -51,12 +62,12 @@ struct Sm70_s884 { using Mainloop = MainloopSm70, + IteratorSm70, TransformA, U, 
GroupSizeU, B, - IteratorSm70, + IteratorSm70, TransformB, V, GroupSizeV, @@ -73,10 +84,11 @@ struct Sm70_s884 { TILE_C_N, MMA::kThreadCount, Rearrange, - Operand_C, + Operand_C, + mode_C, SplitK>; - using Kernel = GemmUniversal; + using Kernel = GemmUniversal; }; }; diff --git a/src/turbomind/kernels/gemm/arch/config_sm75_s16816.h b/src/turbomind/kernels/gemm/arch/config_sm75_s16816.h index 0dd643d4e8..aced3b7bb7 100644 --- a/src/turbomind/kernels/gemm/arch/config_sm75_s16816.h +++ b/src/turbomind/kernels/gemm/arch/config_sm75_s16816.h @@ -18,7 +18,18 @@ namespace sm75_s16816 { using namespace sm80_s16816; -template +template struct Sm75_s16816 { static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); @@ -45,16 +56,16 @@ struct Sm75_s16816 { // Raked partition dont support `Pack_M > 1` using Partition = Blocked; using MMA_Map = MMA_Map; - using MMA = Tiled_MMA_v2; + using MMA = Tiled_MMA_v2, MMA_Map>; using Mainloop = MainloopSm70, + IteratorSm70, TransformA, U, GroupSizeU, B, - IteratorSm70, + IteratorSm70, TransformB, V, GroupSizeV, @@ -71,10 +82,11 @@ struct Sm75_s16816 { TILE_C_N, MMA::kThreadCount, Rearrange, - Operand_C, + Operand_C, + mode_C, SplitK>; - using Kernel = GemmUniversal; + using Kernel = GemmUniversal; }; }; diff --git a/src/turbomind/kernels/gemm/arch/config_sm80_s16816.h b/src/turbomind/kernels/gemm/arch/config_sm80_s16816.h index c12f556b01..e5707c14ae 100644 --- a/src/turbomind/kernels/gemm/arch/config_sm80_s16816.h +++ b/src/turbomind/kernels/gemm/arch/config_sm80_s16816.h @@ -17,15 +17,19 @@ namespace turbomind::gemm::sm80_s16816 { template + Striding mode_A, + Striding mode_B, + Striding mode_C, + class CtaMap_> struct Sm80_s16816 { static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); @@ -55,16 +59,16 @@ struct Sm80_s16816 { // Raked partition dont support `Pack_M > 1` using Partition = Blocked; using MMA_Map = MMA_Map; - using MMA = Tiled_MMA_v2; + using MMA = Tiled_MMA_v2, MMA_Map>; using Mainloop = MainloopSm80_v2, + IteratorSm80, TransformA, U, GroupSizeU, B, - IteratorSm80, + IteratorSm80, TransformB, V, GroupSizeV, @@ -81,7 +85,8 @@ struct Sm80_s16816 { TILE_C_N, MMA::kThreadCount, Rearrange, - Operand_C, + Operand_C, + mode_C, SplitK>; using Kernel = GemmUniversal; diff --git a/src/turbomind/kernels/gemm/arch/mma_sm80.h b/src/turbomind/kernels/gemm/arch/mma_sm80.h index c78ba0209a..8a197a1617 100644 --- a/src/turbomind/kernels/gemm/arch/mma_sm80.h +++ b/src/turbomind/kernels/gemm/arch/mma_sm80.h @@ -9,6 +9,7 @@ namespace turbomind::gemm { +template struct SM80_MMA_16x8x16_F32_F16_F16_F32_TN { static constexpr int M = 16; static constexpr int N = 8; @@ -18,8 +19,8 @@ struct SM80_MMA_16x8x16_F32_F16_F16_F32_TN { static constexpr auto kOpClass = OpClass::kMMA_s16816; - using FragA = Array; - using FragB = Array; + using FragA = Array; + using FragB = Array; using FragC = Array; using OffsetC = Array; // (m, n) @@ -56,13 +57,14 @@ struct SM80_MMA_16x8x16_F32_F16_F16_F32_TN { }; // This is not used yet -struct SM75_MMA_16x8x8_F32_F16_F16_F32_TN: SM80_MMA_16x8x16_F32_F16_F16_F32_TN { +template +struct SM75_MMA_16x8x8_F32_F16_F16_F32_TN: SM80_MMA_16x8x16_F32_F16_F16_F32_TN { static constexpr int M = 16; static constexpr int N = 8; static constexpr int K = 8; - using FragA = Array; - using FragB = Array; + using FragA = Array; + using FragB = Array; using FragC = Array; __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c) diff --git a/src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h 
b/src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h index 7b36aa795a..83fb060dca 100644 --- a/src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h +++ b/src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h @@ -65,7 +65,7 @@ struct Operand_A { }; // (n, k) -template +template struct Operand_B { using Dtype = T; @@ -156,11 +156,11 @@ struct GetSmemLayout_Pack { } }; -template +template struct Operand_A_Pack { using Dtype = T; - static constexpr int Pack_M = 2; + static constexpr int Pack_M = Pack_M_; static constexpr Pack kPack = HMMA_16816 | OPERAND_A | Pack_M; static constexpr Order kOrder = order; @@ -173,11 +173,11 @@ struct Operand_A_Pack { using GetGmemIter = GetGmemIter; }; -template +template struct Operand_B_Pack { using Dtype = T; - static constexpr int Pack_M = 2; + static constexpr int Pack_M = Pack_M_; static constexpr Pack kPack = HMMA_16816 | OPERAND_B | Pack_M; static constexpr Order kOrder = order; diff --git a/src/turbomind/kernels/gemm/context.cu b/src/turbomind/kernels/gemm/context.cu new file mode 100644 index 0000000000..1b1ea1a2c3 --- /dev/null +++ b/src/turbomind/kernels/gemm/context.cu @@ -0,0 +1,603 @@ + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/context.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/moe_utils_v2.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" +#include "src/turbomind/utils/monotonic.h" +#include +#include +#include +#include + +namespace turbomind::gemm { + +static std::optional get_gemm_desc(const Operation& operation, + const MatrixLayout& Adesc, + const MatrixLayout& Udesc, + const MatrixLayout& Bdesc, + const MatrixLayout& Vdesc, + const MatrixLayout& Cdesc, + const MatrixLayout& Ddesc, + int arch) +{ + + // Constant dimensions are set to the exact value + // Variable dimensions are set to sum of the values + + const int m0 = Adesc.rows, k0 = Adesc.cols; + const int k1 = Bdesc.rows, n0 = Bdesc.cols; + const int m1 = Ddesc.rows, n1 = Ddesc.cols; + + const int l0 = Adesc.num, l1 = Bdesc.num, l2 = Ddesc.num; + + if (m0 != m1 || n0 != n1 || k0 != k1 || l0 != l1 || l0 != l2) { + fprintf(stderr, "%d %d %d %d %d %d %d %d %d\n", m0, m1, n0, n1, k0, k1, l0, l1, l2); + return {}; + } + + GemmDesc desc{arch, + Adesc.type, + Bdesc.type, + Ddesc.type, + Adesc.order, + Bdesc.order, + Ddesc.order, + get_mode(Adesc), + get_mode(Bdesc), + get_mode(Ddesc), + Adesc.pack, + Bdesc.pack, + Udesc.pack, + Vdesc.pack, + operation.quant_a, + operation.quant_b, + operation.epilogue, + operation.batch_dim, + operation.context && operation.context->is_dynamic_sched()}; + + desc.m = m0; + desc.n = n0; + desc.k = k0; + desc.num = l0; + + return desc; +} + +std::vector get_swizzle(const int4& shape, const LaunchSpec& spec, const std::vector& swizzle) +{ + const auto [m, n, k, _] = shape; + std::vector vec; + for (const auto& s : swizzle) { + auto x = spec.kernel->GetSwizzle(m, n, k, spec.splits, s); + if (std::find(vec.begin(), vec.end(), x) == vec.end()) { + vec.push_back(x); + } + } + std::vector ret; + for (const auto& s : vec) { + auto tmp = spec; + tmp.swizzle = s; + ret.push_back(tmp); + } + return ret; +} + +std::vector filter_by_batch_size(std::vector kernels, const GemmDesc& desc, int batch_size) +{ + auto get_batch_dim = [idx = desc.batch_dim](const Kernel* k) { + return idx == 0 ? 
k->desc().cta_tile.x : k->desc().cta_tile.y; + }; + + int max_batch_size = 0; + for (const auto& k : kernels) { + max_batch_size = std::max(get_batch_dim(k), max_batch_size); + } + for (const auto& k : kernels) { + const auto x = get_batch_dim(k); + if (x >= batch_size) { + max_batch_size = std::min(max_batch_size, x); + } + } + const auto pred = [&](auto k) { return get_batch_dim(k) > max_batch_size; }; + kernels.erase(std::remove_if(kernels.begin(), kernels.end(), pred), kernels.end()); + + return kernels; +} + +Context::Context(const cudaDeviceProp& prop) +{ + arch_ = prop.major * 100 + prop.minor * 10; + sm_count_ = prop.multiProcessorCount; +} + +StaticGemmContext::StaticGemmContext(const cudaDeviceProp& prop): Context{prop} {} + +std::optional StaticGemmContext::Init(const Operation& operation, + const MatrixLayout& Adesc, + const MatrixLayout& Udesc, + const MatrixLayout& Bdesc, + const MatrixLayout& Vdesc, + const MatrixLayout& Cdesc, + const MatrixLayout& Ddesc) +{ + + desc_ = get_gemm_desc(operation, Adesc, Udesc, Bdesc, Vdesc, Cdesc, Ddesc, arch_); + return desc_; +} + +std::vector StaticGemmContext::Filter(const std::vector& kernels) const +{ + return filter_by_batch_size(kernels, *desc_, desc_->batch_dim == 0 ? desc_->m : desc_->n); +} + +std::vector StaticGemmContext::Populate(const Kernel& kernel, const PopulateParam& param) const +{ + const int m = desc_->m, n = desc_->n, k = desc_->k; + + const auto& desc = kernel.desc(); + + const int64_t tiled_shape_m = cdiv(m, desc.cta_tile.x); + const int64_t tiled_shape_n = cdiv(n, desc.cta_tile.y); + const int chunk_cnt_k = cdiv(k, kernel.chunk_size_k()); + + // Despite we only have sm_count * constant tensor cores, this is the granularity for scheduling + const int concurrency = sm_count_ * kernel.desc().max_active_ctas; + const float waves_per_split = float(tiled_shape_m * tiled_shape_n) / concurrency; + const float splits_per_wave = 1.f / waves_per_split; + + // Tile quantization + const int64_t ceil_m = tiled_shape_m * desc.cta_tile.x; + const int64_t ceil_n = tiled_shape_n * desc.cta_tile.y; + + // int max_splits = kernel.GetMaxSplits(m, n, k, param.barriers_size, param.partials_size); + int max_splits = + kernel.GetMaxSplits({m, n, k, 1}, tiled_shape_m * tiled_shape_n, param.barriers_size, param.partials_size); + + // std::cout << "max_splits: " << max_splits << std::endl; + + max_splits = std::min(param.max_splits, max_splits); + + std::vector specs; + + for (int splits = 1; splits <= max_splits; ++splits) { + // Split quantization, penalize uneven splits + const int64_t split_ceil_k = cdiv(chunk_cnt_k, splits) * kernel.chunk_size_k(); + // Footprint for single split + const int64_t split_mma_cost = ceil_m * ceil_n * split_ceil_k; + // Footprint for single wave + const int64_t wave_mma_cost = split_mma_cost * splits_per_wave; + + // Wave quantization + // const int waves = (int)std::ceil(wave_per_split * splits); + + // Bold simulation of thread block scheduling + const int grid_size = tiled_shape_m * tiled_shape_n * splits; + const int full_waves = grid_size / concurrency; + const int residue = grid_size % concurrency; + const float partial_wave = (float)cdiv(residue, sm_count_) / desc.max_active_ctas; + const float waves = full_waves + partial_wave; + + if (splits > 1 && waves > param.max_waves) { + break; + } + // ceil(tiled_mn / C * splits) * C / tiled_mn * ceil_m * ceil_n * split_ceil_k + const int64_t mma_cost = wave_mma_cost * waves; + + // IO has less severe quantization effect + const int64_t mio_cost_a = 
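Note on filter_by_batch_size(): among the registered kernels it keeps every kernel whose batch-dimension CTA tile is no larger than the smallest tile that still covers the requested batch size. The same selection rule restated in plain Python:

    def filter_by_batch_size(tile_sizes, batch_size):
        max_tile = max(tile_sizes)
        for t in tile_sizes:
            if t >= batch_size:
                max_tile = min(max_tile, t)
        return [t for t in tile_sizes if t <= max_tile]

    assert filter_by_batch_size([8, 16, 32, 64, 128], 20) == [8, 16, 32]
    assert filter_by_batch_size([8, 16, 32], 1000) == [8, 16, 32]   # nothing covers it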
get_size(desc.type_a, tiled_shape_n * m * split_ceil_k) * splits; + const int64_t mio_cost_b = get_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * splits; + /// TODO: read type from `desc_.accum` when added + const int64_t mio_cost_c = get_size(DataType::F32, (int64_t)m * n) * (splits - 1) * 2; + const int64_t mio_cost = mio_cost_a + mio_cost_b + mio_cost_c; + + // std::cout << name() << " " << splits << " " << waves << " " << (float)mio_cost << " " << (float)mma_cost + // << "\n"; + + // metrics.emplace_back(splits, KernelMetric{mio_cost, mma_cost}); + + LaunchSpec spec{}; + spec.kernel = const_cast(&kernel); + spec.splits = splits; + spec.swizzle = param.swizzle; + spec.estimated = {mio_cost, mma_cost}; + specs.push_back(spec); + } + + return specs; +} + +std::vector StaticGemmContext::Swizzle(const LaunchSpec& spec, const std::vector& swizzle) const +{ + return get_swizzle({desc_->m, desc_->n, desc_->k, desc_->num}, spec, swizzle); +} + +static void release(Tape& tape, cudaStream_t st) +{ + if (tape.buffer) { + cudaFreeAsync(tape.buffer, st); + } + tape = {}; +} + +static void resize(Tape& tape, int ctas, int num, cudaStream_t st) +{ + auto allocate = [&](void* base) { + Monotonic alloc{base}; + alloc(&tape.tile_offsets, ctas); + alloc(&tape.iter_k_ranges, ctas); + alloc(&tape.tile_ids, ctas); + alloc(&tape.gemm_shapes, num); + alloc(&tape.tiled_shapes, num); + return (char*)alloc.ptr() - (char*)base; + }; + if (tape.max_ctas < ctas || tape.max_num < num) { + release(tape, st); + const auto size = allocate(0); + cudaMallocAsync(&tape.buffer, size, st); + allocate(tape.buffer); + tape.max_ctas = ctas; + tape.max_num = num; + } + tape.ctas = ctas; +} + +template +__global__ void schedule_gemm_split_k(Tape tape, GemmScheduler sched, dim3 grid) +{ + const int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= tape.max_ctas) { + return; + } + + int idx = tid; + const int block_idx_x = idx % grid.x; + idx /= grid.x; + const int block_idx_y = idx % grid.y; + const int block_idx_z = idx / grid.y; + + sched.init(block_idx_x, block_idx_y, block_idx_z); + + if (tid == 0) { + tape.gemm_shapes[tid] = sched.gemm_shape(); + tape.tiled_shapes[tid] = sched.tiled_shape(); + } + + tape.tile_offsets[tid] = sched.tile_offset(); + tape.iter_k_ranges[tid] = sched.iter_k_range(); + tape.tile_ids[tid] = sched.tile_id(); +} + +DynamicGemmContext::DynamicGemmContext(const cudaDeviceProp& prop, cudaStream_t stream): + StaticGemmContext{prop}, stream_{stream}, tape_{} +{ +} + +DynamicGemmContext::~DynamicGemmContext() +{ + release(tape_, stream_); +} + +static inline bool operator==(int4 a, int4 b) +{ + return std::tie(a.x, a.y, a.z, a.w) == std::tie(b.x, b.y, b.z, b.w); +} + +static inline bool operator==(LaunchSpec a, LaunchSpec b) +{ + return std::tie(a.kernel, a.splits, a.swizzle) == std::tie(b.kernel, b.splits, b.swizzle); +} + +Tape DynamicGemmContext::Schedule(const LaunchSpec& spec) +{ + const int4 shape{desc_->m, desc_->n, desc_->k, desc_->num}; + if (shape == last_shape_ && spec == last_spec_) { + return tape_; + } + + const auto cta_tile = spec.kernel->cta_tile_size(); + const auto chunk_k_size = spec.kernel->chunk_size_k(); + + const int2 tiled_mn = get_tiled_shape(shape.x, shape.y, cta_tile.x, cta_tile.y); + + GemmScheduler sched{shape, tiled_mn, spec.splits, spec.swizzle, cta_tile.z, chunk_k_size}; + + const dim3 grid = sched.get_grid_shape(); + const int ctas = grid.z * grid.y * grid.x; + + // std::cout << grid.x << " " << grid.y << " " << grid.x << "\n"; + + resize(tape_, ctas, 1, 
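Note on the split-k enumeration in Populate(): each candidate split count is scored by simulating thread-block scheduling (full waves plus a partial wave) and is dropped once the wave count exceeds max_waves, while the rounded-up k chunk penalizes uneven splits. A plain-Python restatement of the wave estimate (the sample numbers are arbitrary):

    def cdiv(a, b):
        return (a + b - 1) // b

    def split_k_waves(m, n, k, cta_m, cta_n, chunk_k, sm_count, max_active_ctas,
                      max_splits=8, max_waves=10.0):
        tiled_mn = cdiv(m, cta_m) * cdiv(n, cta_n)
        concurrency = sm_count * max_active_ctas
        candidates = []
        for splits in range(1, max_splits + 1):
            # k is cut into `splits` parts, each rounded up to whole chunk_k units
            split_ceil_k = cdiv(cdiv(k, chunk_k), splits) * chunk_k
            grid = tiled_mn * splits
            full, residue = divmod(grid, concurrency)
            waves = full + cdiv(residue, sm_count) / max_active_ctas
            if splits > 1 and waves > max_waves:
                break
            candidates.append((splits, split_ceil_k, round(waves, 3)))
        return candidates

    print(split_k_waves(m=16, n=4096, k=4096, cta_m=64, cta_n=128,
                        chunk_k=32, sm_count=80, max_active_ctas=2))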
stream_); + + constexpr int threads = 256; + const int blocks = cdiv(ctas, threads); + + schedule_gemm_split_k<<>>(tape_, sched, grid); + + last_shape_ = shape; + last_spec_ = spec; + + return tape_; +} + +MoeGemmContext::MoeGemmContext(int expert_num, int experts_per_token, const cudaDeviceProp& prop, cudaStream_t stream): + Context{prop}, + expert_num_{expert_num}, + experts_per_token_{experts_per_token}, + stream_{stream}, + tokens_{}, + offsets_{}, + tape_{} +{ + resize(tape_, 256 << 10, expert_num_, stream_); +} + +MoeGemmContext::~MoeGemmContext() +{ + release(tape_, stream_); +} + +std::optional MoeGemmContext::Init(const Operation& operation, + const MatrixLayout& Adesc, + const MatrixLayout& Udesc, + const MatrixLayout& Bdesc, + const MatrixLayout& Vdesc, + const MatrixLayout& Cdesc, + const MatrixLayout& Ddesc) +{ + + desc_ = get_gemm_desc(operation, Adesc, Udesc, Bdesc, Vdesc, Cdesc, Ddesc, arch_); + + if (!desc_) { + return {}; + } + + // fprintf( + // stderr, "%d %d %d vs %d %d %d\n", desc_->n, desc_->k, desc_->m, output_dim_, input_dim_, experts_per_token_); + + if (desc_->m % experts_per_token_ != 0 || desc_->num != expert_num_) { + fprintf(stderr, "Context shape mismatch\n"); + return {}; + } + + output_dim_ = desc_->n; + input_dim_ = desc_->k; + + // desc_->align_m = 1; // gcd([m]) + // desc_->num = expert_num_; + + tokens_ = desc_->m / experts_per_token_; + + // printf("tokens = %d, num = %d\n", tokens_, desc_->num); + + return desc_; +} + +std::vector MoeGemmContext::Populate(const Kernel& kernel, const PopulateParam& param) const +{ + const int n = output_dim_, k = input_dim_; + + const KernelDesc& desc = kernel.desc(); + + // Note: cdiv(t * e, E) * E >= t * e + const int batch_size = ceil_div(tokens_ * experts_per_token_, expert_num_); + const int num = std::min(tokens_ * experts_per_token_, expert_num_); + + const int64_t tiled_shape_m = cdiv(batch_size, desc.cta_tile.x); + const int64_t tiled_shape_n = cdiv(n, desc.cta_tile.y); + const int64_t tiled_shape_mn = tiled_shape_m * tiled_shape_n; + const int chunk_cnt_k = cdiv(k, kernel.chunk_size_k()); + + // Despite we only have sm_count * constant tensor cores, this is the granularity for scheduling + const int concurrency = sm_count_ * kernel.desc().max_active_ctas; + const float waves_per_split = float(tiled_shape_m * tiled_shape_n) / concurrency; + const float splits_per_wave = 1.f / waves_per_split; + + // Tile quantization + const int64_t ceil_m = tiled_shape_m * desc.cta_tile.x; + const int64_t ceil_n = tiled_shape_n * desc.cta_tile.y; + + int max_splits = + kernel.GetMaxSplits({batch_size, n, k, num}, tiled_shape_mn, param.barriers_size, param.partials_size); + + max_splits = std::min(param.max_splits, max_splits); + // std::cout << "max_splits: " << max_splits << "\n"; + + std::vector specs; + + for (int splits = 1; splits <= max_splits; ++splits) { + // Split quantization, penalize uneven splits + const int64_t split_ceil_k = cdiv(chunk_cnt_k, splits) * kernel.chunk_size_k(); + // Footprint for single split + const int64_t split_mma_cost = ceil_m * ceil_n * split_ceil_k; + // Footprint for single wave + const int64_t wave_mma_cost = split_mma_cost * splits_per_wave; + + // Wave quantization + // const int waves = (int)std::ceil(wave_per_split * splits); + + // Bold simulation of thread block scheduling + const int grid_size = tiled_shape_m * tiled_shape_n * splits * num; + const int full_waves = grid_size / concurrency; + const int residue = grid_size % concurrency; + const float partial_wave = 
(float)cdiv(residue, sm_count_) / desc.max_active_ctas; + const float waves = full_waves + partial_wave; + + // std::cout << splits << " " << grid_size << " " << concurrency << " " << waves << std::endl; + + if (splits > 1 && waves > param.max_waves) { + break; + } + // ceil(tiled_mn / C * splits) * C / tiled_mn * ceil_m * ceil_n * split_ceil_k + const int64_t mma_cost = wave_mma_cost * waves; + + // IO has less severe quantization effect + const int64_t mio_cost_a = get_size(desc.type_a, tiled_shape_n * batch_size * split_ceil_k) * num * splits; + const int64_t mio_cost_b = get_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * num * splits; + /// TODO: read type from `desc_.accum` when added + const int64_t mio_cost_c = get_size(DataType::F32, (int64_t)batch_size * n) * num * (splits - 1) * 2; + const int64_t mio_cost = mio_cost_a + mio_cost_b + mio_cost_c; + + LaunchSpec spec{}; + spec.kernel = const_cast(&kernel); + spec.splits = splits; + spec.swizzle = param.swizzle; + spec.estimated = {mio_cost, mma_cost}; + specs.push_back(spec); + } + + return specs; +} + +template +__global__ void schedule_gemm_moe(Tape tape, + const int* offsets, + Sched sched, + int3 cta_tile, + int log_tile, + int expert_num, + int output_dims, + int input_dims, + int max_ctas) +{ + const int e = blockIdx.x; + + __shared__ int shared_sum_tiles; + __shared__ int shared_tiles; + __shared__ int4 shared_grid; + + { + const int tokens = threadIdx.x <= e ? offsets[threadIdx.x + 1] - offsets[threadIdx.x] : 0; + + // Update tiled shape according to actual batch size + auto tiled_shape = sched.tiled_shape(); + tiled_shape.x = cdiv(tokens, cta_tile.x); + + const dim3 grid = sched.get_grid_shape(tiled_shape, log_tile); + + // Num of tiles for the expert + const int tiles = grid.x * grid.y; + + using BlockReduce = cub::BlockReduce; + + __shared__ typename BlockReduce::TempStorage temp_storage; + + auto plus = [](int2 a, int2 b) -> int2 { return {a.x + b.x, a.y + b.y}; }; + + // Sum tokens/cta_m in [0, e) range (exclusive) + // Not sure if `num_valid = 0` works (as no init value is supplied). Conditioning is used instead. + const int2 sum_tokens_tiles = + BlockReduce{temp_storage}.Reduce(threadIdx.x < e ? 
int2{tokens, tiles} : int2{}, plus); + + // Only thread-0 has the reduced value + if (threadIdx.x == 0) { + shared_sum_tiles = sum_tokens_tiles.y; + } + + // Shared properties of current expert + if (threadIdx.x == e) { + shared_grid = {(int)grid.x, (int)grid.y, (int)grid.z, 1}; + + shared_tiles = tiles; + + tape.gemm_shapes[e] = {tokens, output_dims, input_dims, 1}; + tape.tiled_shapes[e] = tiled_shape; + } + } + + __syncthreads(); + + const int4 grid = shared_grid; + const int ctas = shared_tiles * grid.z; + const int tiles = shared_sum_tiles; + const int offset = shared_sum_tiles * grid.z; + + for (int i = threadIdx.x; i < ctas; i += block_dim) { + int idx = i; + + // We need fast div-mod + const int block_idx_x = idx % grid.x; + + idx = idx / grid.x; + + const int block_idx_y = idx % grid.y; + const int block_idx_z = idx / grid.y; + + sched.init(block_idx_x, block_idx_y, block_idx_z); + + auto tile_offset = sched.tile_offset(); + tile_offset.w = e; + + tape.tile_offsets[offset + i] = tile_offset; + tape.iter_k_ranges[offset + i] = sched.iter_k_range(); + tape.tile_ids[offset + i] = tiles + sched.tile_id(); + } + + if (e == expert_num - 1) { + for (int i = threadIdx.x + offset + ctas; i < max_ctas; i += block_dim) { + tape.tile_offsets[i] = int4{-1, -1, -1, -1}; + tape.iter_k_ranges[i] = int2{-1, -1}; + tape.tile_ids[i] = -1; + } + } +} + +Tape MoeGemmContext::Schedule(const LaunchSpec& spec) +{ + const int3 cta_tile = spec.kernel->cta_tile_size(); + + const int sum_m = tokens_ * experts_per_token_; + + const int max_m_tiles = sum_m / cta_tile.x + std::min(sum_m, expert_num_); + + const int proxy_m = max_m_tiles * cta_tile.x; + + using Sched = GemmScheduler; + + const int4 gemm_shape{proxy_m, output_dim_, input_dim_, 1}; + const int2 tiled_mn = get_tiled_shape(proxy_m, output_dim_, cta_tile.x, cta_tile.y); + + const int4 tiled_shape{tiled_mn.x, tiled_mn.y, spec.splits, 1}; + const dim3 grid = Sched::get_grid_shape(tiled_shape, spec.swizzle); + + // std::cout << "splits: " << spec.splits << std::endl; + // std::cout << tiled_mn.x << " " << tiled_mn.y << " " << grid.x << " " << grid.y << " " << grid.z << std::endl; + + const int ctas = grid.x * grid.y * grid.z; + + resize(tape_, ctas, expert_num_, stream_); + + Sched sched{gemm_shape, tiled_mn, spec.splits, spec.swizzle, cta_tile.z, spec.kernel->chunk_size_k()}; + + constexpr int threads = 256; + const int blocks = expert_num_; + schedule_gemm_moe<<>>(tape_, // + offsets_, + sched, + cta_tile, + spec.swizzle, + expert_num_, + output_dim_, + input_dim_, + ctas); + + return tape_; +} + +std::vector MoeGemmContext::Filter(const std::vector& kernels) const +{ + // const int avg_m = cdiv(tokens_ * experts_per_token_, expert_num_); + const int max_m = cdiv(tokens_ * experts_per_token_, 1); + return filter_by_batch_size(kernels, *desc_, desc_->batch_dim == 0 ? 
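Note on schedule_gemm_moe: each expert gets its own tiled grid sized from its routed token count (offsets[e+1] - offsets[e]), and all experts' tiles are packed back to back into one tape of CTA work items, with unused trailing slots filled with -1. A simplified host-side sketch of the tape layout (the real kernel also records per-expert GEMM shapes and k ranges):

    def cdiv(a, b):
        return (a + b - 1) // b

    def build_moe_tape(offsets, cta_m, tiled_n, splits=1):
        tape = []                            # (expert, tile_m, tile_n, split) per CTA
        for e in range(len(offsets) - 1):
            tokens = offsets[e + 1] - offsets[e]
            for z in range(splits):
                for y in range(tiled_n):
                    for x in range(cdiv(tokens, cta_m)):
                        tape.append((e, x, y, z))
        return tape

    # 3 experts receiving 5, 0 and 9 routed tokens; cta_m=4, 2 tiles along n
    tape = build_moe_tape([0, 5, 5, 14], cta_m=4, tiled_n=2)
    assert len(tape) == (2 + 0 + 3) * 2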
max_m : desc_->n); +} + +std::vector MoeGemmContext::Swizzle(const LaunchSpec& spec, const std::vector& swizzle) const +{ + const int avg_m = cdiv(tokens_ * experts_per_token_, expert_num_); + return get_swizzle({avg_m, desc_->n, desc_->k, desc_->num}, spec, swizzle); +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/context.h b/src/turbomind/kernels/gemm/context.h new file mode 100644 index 0000000000..4fec5b732f --- /dev/null +++ b/src/turbomind/kernels/gemm/context.h @@ -0,0 +1,177 @@ +#pragma once + +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/types.h" +#include + +namespace turbomind::gemm { + +struct PopulateParam { + int max_splits; + int max_waves; + int swizzle; + size_t barriers_size; + size_t partials_size; +}; + +class Context { +public: + virtual ~Context() = default; + + explicit Context(const cudaDeviceProp& prop); + + virtual std::optional Init(const Operation& operation, + const MatrixLayout& Adesc, + const MatrixLayout& Udesc, + const MatrixLayout& Bdesc, + const MatrixLayout& Vdesc, + const MatrixLayout& Cdesc, + const MatrixLayout& Ddesc) = 0; + + virtual std::vector Filter(const std::vector& kernels) const = 0; + + virtual std::vector Populate(const Kernel& kernel, const PopulateParam& param) const = 0; + + virtual std::vector Swizzle(const LaunchSpec& spec, const std::vector& swizzle) const = 0; + + virtual Tape Schedule(const LaunchSpec& spec) = 0; + + virtual bool is_dynamic_sched() const noexcept = 0; + + // Alignment + // (align_m, align_n, align_k) -> is_aligned + // gcd_mnk need to be part of gemm desc + + // Max splits + // (max_mn_tiles, max_k_tiles) -> max_splits + + // CTA Swizzling + // - GemmScheduler: return get_log_tile + // - DynamicScheduler: bypass + + // Cost estimation + // + +protected: + int arch_{}; + int sm_count_{}; + + std::optional desc_{}; +}; + +class StaticGemmContext: public Context { +public: + explicit StaticGemmContext(const cudaDeviceProp& prop); + + std::optional Init(const Operation& operation, + const MatrixLayout& Adesc, + const MatrixLayout& Udesc, + const MatrixLayout& Bdesc, + const MatrixLayout& Vdesc, + const MatrixLayout& Cdesc, + const MatrixLayout& Ddesc) override; + + std::vector Filter(const std::vector& kernels) const override; + + std::vector Populate(const Kernel& kernel, const PopulateParam& param) const override; + + std::vector Swizzle(const LaunchSpec& spec, const std::vector& swizzle) const override; + + Tape Schedule(const LaunchSpec&) override + { + return {}; + } + + bool is_dynamic_sched() const noexcept override + { + return false; + } + +protected: +}; + +class DynamicGemmContext: public StaticGemmContext { +public: + DynamicGemmContext(const cudaDeviceProp& prop, cudaStream_t stream); + + ~DynamicGemmContext() override; + + Tape Schedule(const LaunchSpec&) override; + + bool is_dynamic_sched() const noexcept override + { + return true; + } + +protected: + cudaStream_t stream_; + Tape tape_; + int4 last_shape_{}; + LaunchSpec last_spec_{}; +}; + +class MoeGemmContext: public Context { +public: + MoeGemmContext(int experts, + int experts_per_token, + // int output_dims, + // int input_dims, + const cudaDeviceProp& prop, + cudaStream_t stream); + + ~MoeGemmContext() override; + + std::optional Init(const Operation& operation, + const MatrixLayout& Adesc, + const MatrixLayout& Udesc, + const MatrixLayout& Bdesc, + const MatrixLayout& Vdesc, + const MatrixLayout& Cdesc, + const MatrixLayout& Ddesc) 
override; + + std::vector Filter(const std::vector& kernels) const override; + + // batch size + // m: cdiv(exp_per_tok * tokens, experts) + + // FMA_all: + // m: exp_per_tok * tokens + // n: output_dims + // k: input_dims + + // MIO: + // A: exp_per_tok * tokens * input_dims + // C: exp_per_tok * tokens * output_dims + // B: experts * output_dims * input_dims + + std::vector Populate(const Kernel& kernel, const PopulateParam& param) const override; + + std::vector Swizzle(const LaunchSpec& spec, const std::vector& swizzle) const override; + + bool is_dynamic_sched() const noexcept override + { + return true; + } + + Tape Schedule(const LaunchSpec&) override; + + void set_offsets(const int* offsets) + { + offsets_ = offsets; + } + +protected: + int expert_num_; + int experts_per_token_; + + cudaStream_t stream_; + + int output_dim_; + int input_dim_; + int tokens_; + const int* offsets_; + Tape tape_; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index 7c26a10036..ed8b2ee2ff 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -9,6 +9,7 @@ #include "src/turbomind/kernels/gemm/convert_v2.h" #include "src/turbomind/kernels/gemm/format.h" #include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/matrix_ptr.h" #include "src/turbomind/kernels/gemm/operand.h" #include "src/turbomind/kernels/gemm/types.h" @@ -83,8 +84,7 @@ void Convert_v2_Impl(const void* S, const MatrixLayout& Sdesc, void* D, const Ma cudaFuncSetAttribute(convert_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize); } - using PointerD = typename Kernel::PtrD; - typename Kernel::Param param{Sdesc.rows, Sdesc.cols, (const Stype*)S, Sdesc.ld, PointerD{(Dtype*)D}, Ddesc.ld}; + typename Kernel::Param param{Sdesc.rows, Sdesc.cols, to_param((void*)S, Sdesc), to_param((void*)D, Ddesc)}; constexpr int threads = Config::BLOCK_SIZE; const int blocks = ceil_div(Sdesc.rows, CTA_M); @@ -99,7 +99,7 @@ void Convert_v2_Impl(const void* S, const MatrixLayout& Sdesc, void* D, const Ma int Convert(const void* S, // const MatrixLayout& _Sdesc, void* D, - const MatrixLayout& _Ddesc, + MatrixLayout& _Ddesc, cudaStream_t stream) { const Op_Tag op_tag = get_operand_tag(_Ddesc.pack); @@ -122,9 +122,15 @@ int Convert(const void* S, // static constexpr int kPackSize = Operand::SmemCopyAtom::Frag::size() * pack_num_tag; static constexpr bool kIsValid = kPackSize % unit_size(type_c) == 0; + constexpr Pack pack = mma | operand | pack_num; if constexpr (kIsValid) { + // Launch conversion kernel Convert_v2_Impl>(S, Sdesc, D, Ddesc, stream); + // Set leading dimension for destination + _Ddesc.ld = mk2cs(Packing_v2::apply({Sdesc.rows, Sdesc.cols})).x; + // _Ddesc.ld = mk2cs(Packing_v2::apply(cs2mk(_Ddesc.ld, 0))).x; + return true; } @@ -152,6 +158,7 @@ int Convert(const void* S, // if constexpr (is_AB(operand)) { switch (Ddesc.type) { case DataType::F16: + case DataType::BF16: return dispatch_4(mma, operand, order, type_c, type_c); case DataType::U8: return dispatch_4(mma, operand, order, type_c, type_c); @@ -217,25 +224,73 @@ int Convert(const void* S, // return dispatch() - 1; } -std::tuple get_weight_and_scales_layout(int sm, bool force_simt) +std::tuple +get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool force_simt) { - if (force_simt) { - return {kColMajor, HMMA_SIMT | OPERAND_B | 1, kRowMajor, HMMA_SIMT | OPERAND_V | 1}; - } - if (sm >= 80) { - return 
{kRowMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; - } - else if (sm == 75) { - return {kRowMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; - } - else if (sm == 70) { - return {kColMajor, HMMA_884 | OPERAND_B | 1, kRowMajor, HMMA_884 | OPERAND_V | 1}; + if (is_fused_moe) { + if (dtype == DataType::BF16 && sm >= 80) { + return {kColMajor, HMMA_16816 | OPERAND_B | 1, {}, {}}; + } + + if (dtype == DataType::F16) { + if (sm >= 80) { + return {kColMajor, HMMA_16816 | OPERAND_B | 1, {}, {}}; + } + else if (sm == 75) { + return {kColMajor, HMMA_16816 | OPERAND_B | 1, {}, {}}; + } + else if (sm == 70) { + return {kColMajor, HMMA_884 | OPERAND_B | 1, {}, {}}; + } + } + else if (dtype == DataType::U4) { + if (sm >= 80) { + return {kColMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; + } + else if (sm == 75) { + return {kColMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; + } + else if (sm == 70) { + return {kColMajor, HMMA_884 | OPERAND_B | 1, kRowMajor, HMMA_884 | OPERAND_V | 1}; + } + } } else { - std::cerr << "not implemented: sm_" << sm << std::endl; - std::abort(); + if (dtype == DataType::U4) { + if (force_simt) { + return {kColMajor, HMMA_SIMT | OPERAND_B | 1, kRowMajor, HMMA_SIMT | OPERAND_V | 1}; + } + if (sm >= 80) { + return {kRowMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; + } + else if (sm == 75) { + return {kRowMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; + } + else if (sm == 70) { + return {kColMajor, HMMA_884 | OPERAND_B | 1, kRowMajor, HMMA_884 | OPERAND_V | 1}; + } + } } + + std::cerr << "not implemented: dtype=" << to_string(dtype) << ", is_fused_moe=" << is_fused_moe << ", sm=" << sm + << std::endl; + std::abort(); + return {}; } +void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +{ + std::vector tmp; + for (const auto& [p, s] : ptrs) { + tmp.push_back({p, s}); + } + StridedPtr* ptr{}; + cudaMallocAsync(&ptr, sizeof(StridedPtr) * ptrs.size(), stream); + cudaMemcpyAsync(ptr, tmp.data(), sizeof(StridedPtr) * ptrs.size(), cudaMemcpyDefault, stream); + // Sync before tmp can be destructed + cudaStreamSynchronize(stream); + return ptr; +} + } // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/convert_v2.h b/src/turbomind/kernels/gemm/convert_v2.h index 7819517fbd..eb379085e3 100644 --- a/src/turbomind/kernels/gemm/convert_v2.h +++ b/src/turbomind/kernels/gemm/convert_v2.h @@ -2,9 +2,11 @@ #pragma once +#include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/gemm/cp_async.h" #include "src/turbomind/kernels/gemm/iterator_sm70.h" +#include "src/turbomind/kernels/gemm/matrix_ptr.h" #include "src/turbomind/kernels/gemm/operand.h" #include "src/turbomind/kernels/gemm/smem_copy.h" #include "src/turbomind/kernels/gemm/types.h" @@ -27,7 +29,7 @@ struct ConvertOperand { static constexpr int M = M_; static constexpr int K = K_; - using Operand = MakeOperand, M_, K_, 1>; + using Operand = MakeOperand, M_, K_, 1>; using Ts = typename Operand::Dtype; using SmemLayout = typename Operand::SmemLayout; @@ -49,12 +51,10 @@ struct ConvertOperand { using PtrD = get_pointer_type; struct Param { - int m; - int k; - const Ts* src; - int lds; - PtrD dst; - int ldd; + int m; + int k; + MatrixParam src; + MatrixParam dst; }; using SharedStorage = Array; @@ -114,18 +114,25 @@ struct ConvertOperand { const int cta_offset_k = (cta_cnt_k - 1) 
* K; const int residue_k = min(K, param.k - cta_offset_k); + const auto mat_S = resolve(param.src, 0); + const auto mat_D = resolve(param.dst, 0); + // Handle residue k first - GmemIter gmem{(Ts*)param.src, param.lds, {cta_offset_m, cta_offset_k}, {residue_m, residue_k}}; + GmemIter gmem{mat_S, {cta_offset_m, cta_offset_k}, {residue_m, residue_k}}; gmem.smem_data_ = smem; gmem.ClearSmem(); __syncthreads(); - gmem.Prefetch(true); + // gmem.Prefetch(true); + + typename GmemIter::Fragments fragments{}; + gmem.Fetch(fragments, true); + gmem.Store(fragments); // Rest full k tiles - gmem = GmemIter{(Ts*)param.src, param.lds, {cta_offset_m, 0}, {residue_m, K}}; + gmem = GmemIter{mat_S, {cta_offset_m, 0}, {residue_m, K}}; gmem.smem_data_ = smem; SmemCopy smem_copy({warp_offset_m, 0}); @@ -133,6 +140,8 @@ struct ConvertOperand { // last, 0, 1, 2, 3, ..., last - 1 int cta_idx_k = cta_cnt_k - 1; + get_pointer_type mat_D_ptr{(Td*)mat_D.ptr.ptr}; + for (int k_stage = 0; k_stage < cta_cnt_k; ++k_stage) { __syncthreads(); @@ -159,7 +168,7 @@ struct ConvertOperand { auto [unique_id, repeat_id] = Atom::unique(threadIdx.x, pack_index); // Store in [pack_id, lane_id], static cast is needed to decay SubBytePtr to T* - auto dst_ptr = static_cast(param.dst + unique_id * kPackSize); + auto dst_ptr = static_cast(mat_D_ptr + unique_id * kPackSize); if (pack_idx_m < pack_cnt_m && pack_idx_k < pack_cnt_k && repeat_id == 0) { Store(dst_ptr, packed); @@ -173,7 +182,9 @@ struct ConvertOperand { break; } - gmem.Prefetch(true); + // gmem.Prefetch(true); + gmem.Fetch(fragments, true); + gmem.Store(fragments); gmem.Advance(); cta_idx_k = k_stage; diff --git a/src/turbomind/kernels/gemm/cta_map.h b/src/turbomind/kernels/gemm/cta_map.h index d73c3142d0..0d9fb3e8bd 100644 --- a/src/turbomind/kernels/gemm/cta_map.h +++ b/src/turbomind/kernels/gemm/cta_map.h @@ -4,30 +4,40 @@ #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/types.h" namespace turbomind::gemm { -struct CtaMap { +TM_HOST_DEVICE constexpr int get_log_tile(int size, int tile_size) +{ + if (tile_size >= 32 && size >= 24) + return 5; + if (tile_size >= 16 && size >= 12) + return 4; + if (tile_size >= 8 && size >= 6) + return 3; + if (tile_size >= 4 && size >= 3) + return 2; + if (tile_size >= 2 && size >= 2) + return 1; + return 0; +} + +TM_HOST_DEVICE constexpr int2 get_tiled_shape(int m, int n, int cta_m, int cta_n) +{ + return {ceil_div(m, cta_m), ceil_div(n, cta_n)}; +} + +struct CtaMap_ { TM_HOST_DEVICE static int3 get_tiled_shape(int m, int n, int k, int cta_m, int cta_n, int split_cnt) { return {(m + cta_m - 1) / cta_m, (n + cta_n - 1) / cta_n, split_cnt}; } - TM_HOST_DEVICE static int get_log_tile(int3 tiled_shape, int N) + TM_HOST_DEVICE static int get_log_tile(int2 tiled_mn, int N) { - auto n = tiled_shape.y; - if (N >= 32 && n >= 24) - return 5; - if (N >= 16 && n >= 12) - return 4; - if (N >= 8 && n >= 6) - return 3; - if (N >= 4 && n >= 3) - return 2; - if (N >= 2 && n >= 2) - return 1; - return 0; + return gemm::get_log_tile(tiled_mn.y, N); } TM_HOST_DEVICE static dim3 get_grid_shape(int3 tiled_shape, int log_tile) @@ -49,38 +59,196 @@ struct CtaMap { } }; -struct CtaMapN: public CtaMap { - TM_HOST_DEVICE static dim3 get_grid_shape(int3 tiled_shape, int log_tile) +template +class GemmScheduler { + int4 gemm_shape_; + int4 tiled_shape_; + int log_tile_; + + int chunk_offset_; + int chunks_per_split_; + int iter_k_per_chunk_; + + int4 tile_offset_; + int2 iter_k_range_; + 
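A minimal standalone sketch of the split-K chunk arithmetic used by the GemmScheduler constructor and init() below (names local to the sketch, values chosen only for illustration): 10 K-chunks over 3 splits give the ranges [0,3), [3,6), [6,10), with the trailing splits absorbing the remainder chunks.

#include <algorithm>
#include <cstdio>

int main()
{
    const int chunk_cnt = 10, splits = 3;
    const int chunks_per_split = chunk_cnt / splits;           // 3
    const int chunk_offset     = splits - chunk_cnt % splits;  // 2: splits with id >= 2 get one extra chunk
    for (int z = 0; z < splits; ++z) {
        const int beg = z * chunks_per_split + std::max(z - chunk_offset, 0);
        const int cnt = chunks_per_split + (z >= chunk_offset ? 1 : 0);
        std::printf("split %d -> chunks [%d, %d)\n", z, beg, beg + cnt);
    }
    // Each chunk then expands to chunk_size / cta_k K-iterations, which is what iter_k_range_ stores.
    return 0;
}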
+public: + TM_HOST_DEVICE + GemmScheduler(int4 gemm_shape, int2 tiled_mn, int splits, int log_tile, int cta_k, int chunk_size): + gemm_shape_{gemm_shape}, tiled_shape_{tiled_mn.x, tiled_mn.y, splits}, log_tile_{log_tile} { - int tile = 1 << log_tile; - return {static_cast(tiled_shape.y * tile), // n * tile - static_cast((tiled_shape.x + tile - 1) / tile), // m / tile - static_cast(tiled_shape.z)}; + const int chunk_cnt = cdiv(gemm_shape_.z, chunk_size); + + iter_k_per_chunk_ = chunk_size / cta_k; + chunks_per_split_ = chunk_cnt / splits; + chunk_offset_ = splits - chunk_cnt % splits; } - TM_HOST_DEVICE static int get_log_tile(int3 tiled_shape, int M) - { - auto m = tiled_shape.x; - if (M >= 32 && m >= 24) - return 5; - if (M >= 16 && m >= 12) - return 4; - if (M >= 8 && m >= 6) - return 3; - if (M >= 4 && m >= 3) - return 2; - if (M >= 2 && m >= 2) - return 1; - return 0; + + TM_HOST_DEVICE static int get_log_tile(int2 tiled_mn, int tile_size) + { + return gemm::get_log_tile(order == kColMajor ? tiled_mn.y : tiled_mn.x, tile_size); } - TM_DEVICE static int3 get_tile_offset(int log_tile) + + TM_HOST_DEVICE static dim3 get_grid_shape(int4 tiled_shape, int log_tile) { - int block_idx_x = blockIdx.x; - int block_idx_y = blockIdx.y; - int block_idx_z = blockIdx.z; - return {(block_idx_y << log_tile) + (block_idx_x & ((1 << log_tile) - 1)), // - (block_idx_x >> log_tile), - block_idx_z}; + const int tile = 1 << log_tile; + if constexpr (order == kColMajor) { + return {(unsigned)(tiled_shape.x * tile), (unsigned)(cdiv(tiled_shape.y, tile)), (unsigned)(tiled_shape.z)}; + } + else { + return {(unsigned)(tiled_shape.y * tile), (unsigned)(cdiv(tiled_shape.x, tile)), (unsigned)(tiled_shape.z)}; + } + } + + TM_HOST_DEVICE dim3 get_grid_shape() const + { + return get_grid_shape(tiled_shape_, log_tile_); + } + + TM_HOST_DEVICE std::true_type init(int block_idx_x, int block_idx_y, int block_idx_z) + { + if constexpr (order == kColMajor) { + tile_offset_ = {(block_idx_x >> log_tile_), + (block_idx_y << log_tile_) + (block_idx_x & ((1 << log_tile_) - 1)), + (block_idx_z)}; + } + else { + tile_offset_ = {(block_idx_y << log_tile_) + (block_idx_x & ((1 << log_tile_) - 1)), + (block_idx_x >> log_tile_), + (block_idx_z)}; + } + tile_offset_.w = 0; + const int chunk_id = tile_offset_.z * chunks_per_split_ + max(tile_offset_.z - chunk_offset_, 0); + const int iter_k_beg = chunk_id * iter_k_per_chunk_; + const int iter_k_cnt = (chunks_per_split_ + int(tile_offset_.z >= chunk_offset_)) * iter_k_per_chunk_; + iter_k_range_ = {iter_k_beg, iter_k_beg + iter_k_cnt}; + + return {}; + } + + TM_DEVICE std::true_type init() + { + return init(blockIdx.x, blockIdx.y, blockIdx.z); + } + + TM_DEVICE int4 gemm_shape() const + { + return gemm_shape_; + } + + TM_DEVICE int4 tiled_shape() const + { + return tiled_shape_; + } + + TM_DEVICE int4 tile_offset() const + { + return tile_offset_; + } + + TM_DEVICE int2 iter_k_range() const + { + return iter_k_range_; + } + + TM_DEVICE int tile_id() const + { + return tile_offset_.x * tiled_shape_.y + tile_offset_.y; } }; +template +class DynamicScheduler { + + int ctas_; + + const int4* __restrict__ gemm_shapes_; // [group_num] + const int4* __restrict__ tiled_shapes_; // [group_num] + const int2* __restrict__ offsets_mn_; // [group_num] + const int4* __restrict__ tile_offsets_; // [ctas] + const int2* __restrict__ iter_k_ranges_; // [ctas] + const int* __restrict__ tile_ids_; // [ctas] + + int4 gemm_shape_; + int4 tiled_shape_; + int4 tile_offset_; + int2 iter_k_range_; + int2 base_mn_; 
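A minimal host-side sketch (illustrative sizes only) of the CTA swizzle that GemmScheduler::get_grid_shape and init above implement for kColMajor: consecutive blockIdx.x values stay inside a band of 1 << log_tile output-tile columns, improving L2 reuse of the A and B tiles shared within that band.

#include <cstdio>

int main()
{
    const int log_tile = 2;                  // from get_log_tile: bands of 1 << 2 = 4 tiles
    const int tile     = 1 << log_tile;
    const int tiled_m = 8, tiled_n = 6;      // output tiles along M and N
    for (int by = 0; by < (tiled_n + tile - 1) / tile; ++by) {   // gridDim.y = cdiv(tiled_n, tile)
        for (int bx = 0; bx < tiled_m * tile; ++bx) {            // gridDim.x = tiled_m * tile
            const int m_id = bx >> log_tile;
            const int n_id = (by << log_tile) + (bx & (tile - 1));
            if (n_id < tiled_n)              // tiles past N are skipped by the kernel's empty-tile check
                std::printf("block (%2d,%d) -> tile (m=%d, n=%d)\n", bx, by, m_id, n_id);
        }
    }
    return 0;
}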
+ +public: + DynamicScheduler(const Tape& tape): + ctas_{tape.ctas}, + gemm_shapes_{tape.gemm_shapes}, + tiled_shapes_{tape.tiled_shapes}, + tile_offsets_{tape.tile_offsets}, + iter_k_ranges_{tape.iter_k_ranges}, + tile_ids_{tape.tile_ids} + { + } + + TM_HOST_DEVICE static int get_log_tile(int2 tiled_mn, int tile_size) + { + return gemm::get_log_tile(order == kColMajor ? tiled_mn.y : tiled_mn.x, tile_size); + } + + TM_HOST_DEVICE dim3 get_grid_shape() + { + return {(unsigned)ctas_, 1, 1}; + } + + TM_DEVICE bool init() + { + const int block_idx = blockIdx.x; + + const auto [cta_m_id, cta_n_id, cta_k_id, group_id] = __ldg(tile_offsets_ + block_idx); + + if (group_id < 0) { + return false; + } + + gemm_shape_ = __ldg(gemm_shapes_ + group_id); + tiled_shape_ = __ldg(tiled_shapes_ + group_id); + base_mn_ = __ldg(offsets_mn_ + group_id); + + tile_offset_ = {cta_m_id, cta_n_id, cta_k_id, group_id}; + + iter_k_range_ = __ldg(iter_k_ranges_ + block_idx); + + return true; + } + + TM_DEVICE int4 gemm_shape() const + { + return gemm_shape_; + } + + TM_DEVICE int4 tiled_shape() const + { + return tiled_shape_; + } + + TM_DEVICE int4 tile_offset() const + { + return tile_offset_; + } + + TM_DEVICE int2 iter_k_range() const + { + return iter_k_range_; + } + + TM_DEVICE int tile_id() const + { + return tile_ids_[blockIdx.x]; + } +}; + +template +struct is_dynamic_scheduler: std::false_type { +}; + +template +struct is_dynamic_scheduler>: std::true_type { +}; + } // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/desc.h b/src/turbomind/kernels/gemm/desc.h index 933a4b0201..0299235076 100644 --- a/src/turbomind/kernels/gemm/desc.h +++ b/src/turbomind/kernels/gemm/desc.h @@ -4,6 +4,7 @@ #include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/gemm/types.h" +#include namespace turbomind::gemm { @@ -16,6 +17,9 @@ struct GemmDesc { Order order_a; Order order_b; Order order_c; + Striding striding_a; + Striding striding_b; + Striding striding_c; Pack pack_a; Pack pack_b; Pack pack_u; @@ -23,10 +27,12 @@ struct GemmDesc { QuantDesc quant_a; QuantDesc quant_b; Epilogue epilogue; + int batch_dim; + int sched; int m; int n; int k; - int batch_dim; + int num; }; enum class OpClass @@ -60,6 +66,9 @@ struct KernelDesc { Order order_a; Order order_b; Order order_c; + Striding striding_a; + Striding striding_b; + Striding striding_c; Pack pack_a; Pack pack_b; Pack pack_u; @@ -74,6 +83,7 @@ struct KernelDesc { int2 c_tile; int stages; bool split_k; + int sched; // set by `KernelImpl` int max_active_ctas; @@ -85,8 +95,9 @@ struct LaunchSpec { Kernel* kernel; int swizzle; int splits; - float estimated; float measured; + + std::array estimated; }; } // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/dispatch_cache.cu b/src/turbomind/kernels/gemm/dispatch_cache.cu index 850cf3a51f..719ea4f987 100644 --- a/src/turbomind/kernels/gemm/dispatch_cache.cu +++ b/src/turbomind/kernels/gemm/dispatch_cache.cu @@ -34,6 +34,9 @@ static inline decltype(auto) as_tuple(const KernelDesc& d) d.order_a, d.order_b, d.order_c, + d.striding_a, + d.striding_b, + d.striding_c, d.pack_a, d.pack_b, d.pack_u, @@ -47,7 +50,8 @@ static inline decltype(auto) as_tuple(const KernelDesc& d) d.align, d.c_tile, d.stages, - d.split_k); + d.split_k, + d.sched); } static inline bool operator==(const QuantDesc& a, const QuantDesc& b) @@ -92,6 +96,9 @@ void ExportDispatchCache(std::ostream& os, const std::vectordesc(); export_impl(os, k.arch, k.op_class, + k.striding_a, + k.striding_b, + k.striding_c, 
k.cta_tile.x, k.cta_tile.y, k.cta_tile.z, @@ -148,6 +160,9 @@ void ImportDispatchCache(std::istream& is, g.order_a, g.order_b, g.order_c, + g.striding_a, + g.striding_b, + g.striding_c, g.pack_a, g.pack_b, g.pack_u, @@ -157,10 +172,12 @@ void ImportDispatchCache(std::istream& is, g.quant_b.type, g.quant_b.group_size, g.epilogue, + g.batch_dim, + g.sched, g.m, g.n, g.k, - g.batch_dim); + g.num); KernelDesc k{}; k.type_a = g.type_a; k.type_b = g.type_b; @@ -174,9 +191,13 @@ void ImportDispatchCache(std::istream& is, k.order_c = g.order_c; k.quant_a = g.quant_a; k.quant_b = g.quant_b; + k.sched = g.sched; import_impl(ss, k.arch, k.op_class, + k.striding_a, + k.striding_b, + k.striding_c, k.cta_tile.x, k.cta_tile.y, k.cta_tile.z, @@ -220,6 +241,9 @@ inline decltype(auto) as_tuple(const GemmDesc& d) d.order_a, d.order_b, d.order_c, + d.striding_a, + d.striding_b, + d.striding_c, d.pack_a, d.pack_b, d.pack_u, @@ -228,11 +252,13 @@ inline decltype(auto) as_tuple(const GemmDesc& d) d.quant_a.group_size, d.quant_b.type, d.quant_b.group_size, + d.batch_dim, + d.sched, d.m, d.n, d.k, - d.batch_dim); - // d.epilogue + d.num); + // Note: `d.epilogue` is not used yet } } // namespace @@ -267,7 +293,8 @@ struct DispatchCache::Impl { std::optional Find(GemmDesc desc, bool exact) const { const int batch_size = extract_batch_size(desc); - // std::cerr << batch_size << " " << desc.m << " " << desc.n << " " << desc.k << "\n"; + // std::cerr << batch_size << " " << desc.m << " " << desc.n << " " << desc.k << " " << std::boolalpha << exact + // << "\n"; const auto it = cache_.find(desc); if (it != cache_.end()) { const auto& [idxs, specs] = it->second; @@ -276,8 +303,9 @@ struct DispatchCache::Impl { std::lower_bound(idxs.begin(), idxs.end(), std::make_pair(batch_size, 0), [](auto& a, auto& b) { // return a.first < b.first; }); - // std::cerr << p->first << " " << p->second << "\n"; + // std::cout << it->second.specs.size() << std::endl; if (p != idxs.end() && (!exact || p->first == batch_size)) { + // std::cerr << p->first << " " << p->second << "\n"; return specs[p->second]; } } @@ -342,11 +370,9 @@ struct DispatchCache::Impl { // Sort indices and deduplicate for (auto& [desc, flat] : cache_) { auto& [idxs, specs] = flat; - const auto cmp = [](auto& a, auto& b) { // - return a.first < b.first; - }; - std::stable_sort(idxs.begin(), idxs.end(), cmp); - idxs.erase(std::unique(idxs.begin(), idxs.end(), cmp), idxs.end()); + std::stable_sort(idxs.begin(), idxs.end(), [](auto a, auto b) { return a.first < b.first; }); + idxs.erase(std::unique(idxs.begin(), idxs.end(), [](auto a, auto b) { return a.first == b.first; }), + idxs.end()); // Remove unreferenced specs and update spec indices std::vector tmp; for (auto& [key, val] : idxs) { diff --git a/src/turbomind/kernels/gemm/epilogue.h b/src/turbomind/kernels/gemm/epilogue.h index c2c5ff54ce..5980892107 100644 --- a/src/turbomind/kernels/gemm/epilogue.h +++ b/src/turbomind/kernels/gemm/epilogue.h @@ -7,59 +7,14 @@ #include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/core/meta.h" #include "src/turbomind/kernels/core/sync.h" -#include "src/turbomind/kernels/gemm/iterator_sm80.h" +#include "src/turbomind/kernels/gemm/matrix_ptr.h" +#include "src/turbomind/kernels/gemm/predicate.h" #include "src/turbomind/kernels/gemm/smem_copy.h" #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/kernels/gemm/utils.h" namespace turbomind::gemm { -template -struct ChannelCombination_v2 { - const Tc* __restrict__ scale_ptr; - const Tc* 
__restrict__ bias_ptr; - - template - __device__ void operator()(Array (&x)[S][C], int2 cta_cs, int2 thr_cs, int2 delta_cs, int2 end_cs) const - { - // T scale[S]; - - Array scale; - fill(scale, T(1)); - - if (scale_ptr) { - PRAGMA_UNROLL - for (int s = 0; s < S; ++s) { - const int ss = thr_cs.y + s * delta_cs.y; - if (ss < end_cs.y) { - scale[s] = static_cast(__ldg(scale_ptr + ss + cta_cs.y)); - } - } - } - - T bias[S]{}; - - if (bias_ptr) { - PRAGMA_UNROLL - for (int s = 0; s < S; ++s) { - const int ss = thr_cs.y + s * delta_cs.y; - if (ss < end_cs.y) { - bias[s] = static_cast(__ldg(bias_ptr + ss + cta_cs.y)); - } - } - } - - PRAGMA_UNROLL - for (int s = 0; s < S; ++s) { - PRAGMA_UNROLL - for (int c = 0; c < C; ++c) { - using namespace ops; - x[s][c] = x[s][c] * scale[s] + bias[s]; - } - } - } -}; - template struct ChannelCombination_v3 { const Tc* __restrict__ scale_bias_ptr; @@ -92,64 +47,86 @@ struct ChannelCombination_v3 { } }; -template -struct MatrixCombination_v2 { - float alpha; - float beta; - - const Tc* C_ptr; // can't `__restrict__` since it may be alias of `D` - int64_t ldc; - - template - __device__ void operator()(Array (&x)[S][C], int2 cs0, pair, Pred& pred) const - { - Array frag[S][C]{}; - if (beta) { - constexpr int dc = sizeof(Tc) * delta_c; - const int ds = sizeof(Tc) * delta_s * ldc; - auto ptr = reinterpret_cast(C_ptr + cs2idx(cs0, (int64_t)ldc)); +template +__device__ void Scale(pair, + pair, + pair, + Array (&x)[S][C], + const MatrixParam& param_S, + const MatrixParam& param_C, + int gemm_id, + int2 cs0, + Pred& pred) +{ + if (scale_S && param_S.ptr) { + const auto mat = resolve(param_S, gemm_id); + const T* ptr = (const T*)mat.ptr.ptr; + T param[S]; + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + const int ss = cs0.y + s * delta_S; + const int idx = mat.idxs ? 
__ldg(mat.idxs + ss) : ss; + if (pred(s, 0)) { + param[s] = __ldg((const T*)(ptr + idx)); + } PRAGMA_UNROLL - for (int s = 0; s < S; ++s) { - PRAGMA_UNROLL - for (int c = 0; c < C; ++c) { - if (pred(s, c)) { - Load(frag[s][c], reinterpret_cast(ptr)); - } - ptr += dc; - } - ptr -= dc * C; - ptr += ds; + for (int c = 0; c < C; ++c) { + using namespace ops; + x[s][c] = x[s][c] * param[s]; } } + } + if (scale_C && param_C.ptr) { + const T* ptr = (const T*)resolve(param_C, gemm_id).ptr.ptr + cs0.x; + constexpr int dc = sizeof(Array) * delta_C; + Array param[C]; PRAGMA_UNROLL - for (int s = 0; s < S; ++s) { + for (int c = 0; c < C; ++c) { + if (pred(0, c)) { + Ldg(param[c], (const T*)(ptr + dc * c)); + } PRAGMA_UNROLL - for (int c = 0; c < C; ++c) { + for (int s = 0; s < S; ++s) { using namespace ops; - x[s][c] = x[s][c] * alpha + cast(frag[s][c]) * beta; + x[s][c] = x[s][c] * param[c]; } } } -}; +} -template struct MatrixCombination_v3 { - float alpha; - float beta; - const Tc* C_ptr; // can't `__restrict__` since it may be alias of `D` - int64_t ldc; - - template - __device__ void operator()(Array (&x)[S][C], int2 cs0, pair, Pred& pred) const + MatrixParam param_c; + float alpha; + float beta; + + template + __device__ void operator()(Tc*, // + constant, + Array (&x)[S][C], + int2 cs0, + int gemm_id, + pair, + Pred& pred) const { - if (beta) { + const auto c = resolve(param_c, gemm_id); + Array frag[S][C]; constexpr int dc = sizeof(Tc) * delta_c; - const int ds = sizeof(Tc) * delta_s * ldc; - auto ptr = reinterpret_cast(C_ptr + cs2idx(cs0, (int64_t)ldc)); + const int ds = sizeof(Tc) * delta_s * c.ptr.stride; + const char* ptr = (const char*)c.ptr.ptr + sizeof(Tc) * dot(cs0, long2{1, c.ptr.stride}); PRAGMA_UNROLL for (int s = 0; s < S; ++s) { PRAGMA_UNROLL @@ -198,30 +175,36 @@ struct Silu { } }; -template struct EpilogueParam { - int m; - int n; - Tc* C; - int ldc; + MatrixParam c; + MatrixParam partials; + int* locks; - float* partial_C; - int partial_C_ld; + // MatrixParam scale_S; + // MatrixParam scale_C; - int* locks; // (m/cta_m, n/cta_n, k) + MatrixCombination_v3 combine_mat; - // ChannelCombination_v3 combine_chn; - MatrixCombination_v3 combine_mat; - bool silu_act; + bool silu_act; }; -template +template struct Epilogue_ { using Dtype = typename OperandC::Dtype; static constexpr auto kOrder = OperandC::kOrder; - static constexpr auto SplitK = SplitK_; + static constexpr auto kMode = mode_C; + static constexpr bool SplitK = SplitK_; using Tc = Tc_; @@ -301,12 +284,12 @@ struct Epilogue_ { } } - template - __device__ void StoreC(const VecC& vec_C, T* data_C, int ldc, int2 cs0, Pred& pred) + template + __device__ void StoreC(const VecC& vec_C, const MatrixData& c, int2 cs0, Pred& pred) { constexpr int dc = sizeof(T) * Map::kDeltaC; - const int ds = sizeof(T) * Map::kDeltaS * ldc; - auto ptr = reinterpret_cast(data_C + cs2idx(cs0, (int64_t)ldc)); + const int ds = sizeof(T) * Map::kDeltaS * c.ptr.stride; + char* ptr = (char*)c.ptr.ptr + sizeof(T) * dot(cs0, long2{1, c.ptr.stride}); PRAGMA_UNROLL for (int s = 0; s < S; ++s) { PRAGMA_UNROLL @@ -322,9 +305,10 @@ struct Epilogue_ { } } +#if 0 template - __device__ void Reduce( - FragC& frag_C, int splits, int64_t split_size, const int2& cta_cs, Pred& pred, const EpilogueParam& param) + __device__ void + Reduce(FragC& frag_C, int splits, int64_t split_size, const int2& cta_cs, Pred& pred, const EpilogueParam& param) { using Vec = OutputC; const int2 thr_cs = Map::get_offset(threadIdx.x / WARP_SIZE, threadIdx.x % WARP_SIZE); @@ -346,20 +330,19 
@@ struct Epilogue_ { } } } +#endif template - __device__ void - Reduce_v2(FragC& frag_C, int split_id, bool is_last, int2 cs0, Pred& pred, const EpilogueParam& param) + __device__ void Reduce(FragC& frag_C, const MatrixData& p, bool is_first, bool is_last, int2 cs0, Pred& pred) { constexpr int dc = sizeof(Dtype) * Map::kDeltaC; - const int ds = sizeof(Dtype) * Map::kDeltaS * param.partial_C_ld; + const int ds = sizeof(Dtype) * Map::kDeltaS * p.ptr.stride; - const auto ptr0 = reinterpret_cast(param.partial_C + cs2idx(cs0, (int64_t)param.partial_C_ld)); + char* ptr = (char*)p.ptr.ptr + sizeof(Dtype) * dot(cs0, long2{1, p.ptr.stride}); - Pred ld_mask = split_id == 0 ? Pred{} : pred; + Pred ld_mask = is_first ? Pred{} : pred; Pred st_mask = is_last ? Pred{} : pred; - auto ptr = ptr0; PRAGMA_UNROLL for (int s = 0; s < S; ++s) { PRAGMA_UNROLL @@ -383,17 +366,17 @@ struct Epilogue_ { } template - __device__ void operator()(FragC& frag_C, - const int3& tile_offset, - const int3& tiled_shape, - int end_m, - int end_n, - bool is_last_split, - const EpilogueParam& param, - SharedStorage& storage) + __device__ void operator()(FragC& frag_C, + const int4& tile_offset, + const int4& tiled_shape, + const int2& extents, + int tile_id, + bool is_last, + const EpilogueParam& param, + SharedStorage& storage) { const int2 cta_cs = mk2cs(tile_offset.x * M, tile_offset.y * N); - const int2 end_cs = mk2cs(end_m, end_n); + const int2 end_cs = mk2cs(extents); OutputC tmp_C[S][C]; @@ -417,30 +400,35 @@ struct Epilogue_ { } if (SplitK_ && tiled_shape.z > 1) { - int* barrier = ¶m.locks[tile_offset.x * tiled_shape.y + tile_offset.y]; + int* barrier = ¶m.locks[tile_id]; sem_wait(barrier, tile_offset.z, threadIdx.x == 0); - Reduce_v2(tmp_C, tile_offset.z, is_last_split, cs0, pred, param); + const MatrixData p = resolve(param.partials, tile_offset.w); - const int post_id = is_last_split ? 0 : tile_offset.z + 1; + Reduce(tmp_C, p, tile_offset.z == 0, is_last, cs0, pred); + + const int post_id = is_last ? 0 : tile_offset.z + 1; sem_post(barrier, post_id, threadIdx.x == 0); - if (!is_last_split) { + if (!is_last) { return; } } constexpr pair delta_cs{}; - // param.combine_chn(tmp_C, cs0, delta_cs, pred); + // opt-in scaling + // Scale(scale_SC{}, mode_SC{}, delta_cs, tmp_C, param.scale_S, param.scale_C, tile_offset.w, cs0, pred); + + param.combine_mat((Tc*)0, constant{}, tmp_C, cs0, tile_offset.w, delta_cs, pred); - param.combine_mat(tmp_C, cs0, delta_cs, pred); + const MatrixData c = resolve(param.c, tile_offset.w); if (param.silu_act) { constexpr int dc = sizeof(Tc) * Map::kDeltaC / 2; - const int ds = sizeof(Tc) * Map::kDeltaS * param.ldc; - auto ptr = reinterpret_cast(param.C + cs2idx({cs0.x / 2, cs0.y}, (int64_t)param.ldc)); + const int ds = sizeof(Tc) * Map::kDeltaS * c.ptr.stride; + auto ptr = (char*)c.ptr.ptr + sizeof(Tc) * dot({cs0.x / 2, cs0.y}, long2{1, c.ptr.stride}); PRAGMA_UNROLL for (int s = 0; s < S; ++s) { PRAGMA_UNROLL @@ -457,7 +445,7 @@ struct Epilogue_ { } } else { - StoreC(tmp_C, param.C, param.ldc, cs0, pred); + StoreC(tmp_C, c, cs0, pred); } } }; diff --git a/src/turbomind/kernels/gemm/gemm.cu b/src/turbomind/kernels/gemm/gemm.cu index 59cd148182..4177efadf4 100644 --- a/src/turbomind/kernels/gemm/gemm.cu +++ b/src/turbomind/kernels/gemm/gemm.cu @@ -1,5 +1,6 @@ // Copyright (c) OpenMMLab. All rights reserved. 
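The sem_wait/sem_post pair in Epilogue_::operator() above serializes the split-K partial sums per output tile. A minimal host-side sketch of that ticket protocol, using a single-threaded stand-in for the per-tile lock rather than the actual device code:

#include <cstdio>

int main()
{
    const int splits = 3;
    int lock = 0;                          // param.locks[tile_id] starts at zero
    for (int z = 0; z < splits; ++z) {     // on the GPU the splits can arrive in any order;
        while (lock != z) { }              // sem_wait admits them one by one, in split order
        const bool is_last = (z == splits - 1);
        std::printf("split %d takes its turn at the partials buffer, is_last=%d\n", z, (int)is_last);
        lock = is_last ? 0 : z + 1;        // sem_post: the last split resets the lock for reuse
    }
    // Only the last split continues past the reduction and writes the final D tile.
    return 0;
}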
+#include "src/turbomind/kernels/gemm/context.h" #include "src/turbomind/kernels/gemm/desc.h" #include "src/turbomind/kernels/gemm/dispatch_cache.h" #include "src/turbomind/kernels/gemm/gemm.h" @@ -34,16 +35,6 @@ std::vector ArgSort(size_t size, const Cmp& cmp) return idxs; } -inline int get_batch_dim(const GemmDesc& desc) -{ - return desc.batch_dim == 0 ? desc.m : desc.n; -} - -inline int get_batch_dim(const KernelDesc& desc, int batch_dim) -{ - return batch_dim == 0 ? desc.cta_tile.x : desc.cta_tile.y; -} - } // namespace struct Gemm::Impl { @@ -52,7 +43,8 @@ struct Gemm::Impl { props_{GetCudaDeviceProps()}, arch_{props_->major * 100 + props_->minor * 10}, registry_{props_}, - cache_{registry_.kernels()} + cache_{registry_.kernels()}, + default_ctx_(*props_) { if (auto str = std::getenv("TM_GEMM_TUNE")) { try { @@ -67,7 +59,7 @@ struct Gemm::Impl { } // find launch spec in dispatch cache, dispatch by heuristic on cache miss - LaunchSpec Dispatch(DispatchPolicy policy, GemmDesc desc, size_t barriers_size, size_t partials_size) + LaunchSpec Dispatch(Context& ctx, DispatchPolicy policy, GemmDesc desc, size_t barriers_size, size_t partials_size) { if (policy & DispatchPolicy::kReuse) { if (auto spec = cache_.LowerBound(desc)) { @@ -80,7 +72,7 @@ struct Gemm::Impl { return *spec; } - auto specs = Find(desc, barriers_size, partials_size, 1); + auto specs = Find(ctx, desc, barriers_size, partials_size, 1); if (!specs.empty()) { cache_.Insert(desc, specs.front()); return specs.front(); @@ -88,7 +80,8 @@ struct Gemm::Impl { return {}; } - std::vector Find(const GemmDesc& desc, size_t barrier_size, size_t partials_size, int top_k) + std::vector + Find(Context& ctx, const GemmDesc& desc, size_t barrier_size, size_t partials_size, int top_k) { std::vector feasible; std::copy_if(registry_.kernels().begin(), registry_.kernels().end(), std::back_inserter(feasible), [&](auto p) { @@ -98,21 +91,7 @@ struct Gemm::Impl { return {}; } - if (1) { - int max_batch_size = 0; - for (const auto& k : feasible) { - max_batch_size = std::max(get_batch_dim(k->desc(), desc.batch_dim), max_batch_size); - } - const int batch_size = get_batch_dim(desc); - for (const auto& k : feasible) { - const auto x = get_batch_dim(k->desc(), desc.batch_dim); - if (x >= batch_size) { - max_batch_size = std::min(max_batch_size, x); - } - } - auto pred = [&](auto k) { return get_batch_dim(k->desc(), desc.batch_dim) > max_batch_size; }; - feasible.erase(std::remove_if(feasible.begin(), feasible.end(), pred), feasible.end()); - } + feasible = ctx.Filter(feasible); std::vector> clusters; { @@ -131,46 +110,44 @@ struct Gemm::Impl { proxies.push_back(c.front().kernel); } - // cluster_id, splits, metrics - std::vector> metrics; + std::vector> specs; - for (int cluster_id = 0; cluster_id < (int)proxies.size(); ++cluster_id) { - auto& kernel = *proxies[cluster_id]; - const int max_splits = kernel.GetMaxSplits(desc.m, desc.n, desc.k, barrier_size, partials_size); + PopulateParam param{}; + param.max_splits = tuning_.max_splits; + param.max_waves = tuning_.max_waves; + param.swizzle = tuning_.swizzle.at(0); + param.barriers_size = barrier_size; + param.partials_size = partials_size; - auto ms = kernel.Estimate_v2({desc.m, desc.n, desc.k}, // - std::min(max_splits, tuning_.max_splits), - tuning_.max_waves, - props_->multiProcessorCount); + for (int cluster_id = 0; cluster_id < (int)proxies.size(); ++cluster_id) { + auto& kernel = *proxies[cluster_id]; - for (const auto& [splits, metric] : ms) { - metrics.emplace_back(cluster_id, splits, metric); + 
auto tmp = ctx.Populate(kernel, param); + for (const auto& s : tmp) { + specs.emplace_back(cluster_id, s); } } // std::cerr << "#kernel: " << kernels.size() << ", #cluster: " << clusters.size() // << ", #metric: " << metrics.size() << "\n"; - std::vector mio_cost; - std::vector mma_cost; - for (const auto& [_, s, m] : metrics) { - mio_cost.push_back(m.mio_cost); - mma_cost.push_back(m.mma_cost); + int64_t mio_max = 0; + int64_t mma_max = 0; + for (const auto& [_, s] : specs) { + auto& [mio, mma] = s.estimated; + mio_max = std::max(mio_max, mio); + mma_max = std::max(mma_max, mma); } - - const auto mio_max = *std::max_element(mio_cost.begin(), mio_cost.end()); - const auto mma_max = *std::max_element(mma_cost.begin(), mma_cost.end()); - std::vector mio_ratio; std::vector mma_ratio; std::vector avg_ratio; - for (size_t i = 0; i < metrics.size(); ++i) { - mio_ratio.push_back(static_cast(mio_cost[i]) / mio_max); - mma_ratio.push_back(static_cast(mma_cost[i]) / mma_max); - avg_ratio.push_back(.5f * (mio_ratio.back() + mma_ratio.back())); + for (const auto& [_, s] : specs) { + auto& [mio, mma] = s.estimated; + mio_ratio.push_back((float)mio / mio_max); + mma_ratio.push_back((float)mma / mma_max); + avg_ratio.push_back(.5 * (mio_ratio.back() + mma_ratio.back())); } - - auto idxs = ArgSort(metrics.size(), [&](int i, int j) { // + auto idxs = ArgSort(specs.size(), [&](int i, int j) { // return avg_ratio[i] < avg_ratio[j]; }); @@ -185,10 +162,12 @@ struct Gemm::Impl { std::vector ret; ret.reserve(top_k); for (int i = 0; i < top_k; ++i) { - const auto& [cluster_id, splits, cost] = metrics[idxs[i]]; + const auto& [cluster_id, spec] = specs[idxs[i]]; // Apply `splits` to all kernels in the cluster for (const auto& s : clusters[cluster_id]) { - ret.push_back(LaunchSpec{s.kernel, tuning_.swizzle.at(0), splits}); + auto tmp = spec; + tmp.kernel = s.kernel; + ret.push_back(tmp); } } @@ -196,7 +175,8 @@ struct Gemm::Impl { } template - int Measure(const GemmDesc& desc, + int Measure(Context& ctx, + const GemmDesc& desc, size_t barriers_size, size_t partials_size, int top_k, @@ -209,16 +189,13 @@ struct Gemm::Impl { } // std::cerr << "GEMM: " << desc.m << "x" << desc.n << "x" << desc.k << "\n"; - const auto tmp = Find(desc, barriers_size, partials_size, tuning_.top_k); + const auto tmp = Find(ctx, desc, barriers_size, partials_size, tuning_.top_k); std::vector specs; for (const auto& spec : tmp) { // populate swizzle parameters - const auto swis = FilterSwizzleParam(*spec.kernel, desc.m, desc.n, desc.k, tuning_.swizzle); - for (const auto& swi : swis) { - specs.push_back(spec); - specs.back().swizzle = swi; - } + const auto swis = ctx.Swizzle(spec, tuning_.swizzle); + specs.insert(specs.end(), swis.begin(), swis.end()); } specs = Sampler{*measurer_, tuning_.clusters}.Run(specs, launch_func, st); @@ -241,30 +218,6 @@ struct Gemm::Impl { return 0; } - std::vector FilterSwizzleParam(Kernel& kernel, int m, int n, int k, const std::vector& swis) - { - std::vector swizzles; - for (const auto& swi : swis) { - // To use splits=1 here, swizzling must not depends on split count - swizzles.push_back(kernel.GetSwizzle(m, n, k, 1, swi)); - } - if (swizzles.size() == 1) { - return swizzles; - } - - // De-duplicate possible swizzles while keep the order - std::sort(swizzles.begin(), swizzles.end()); - swizzles.erase(std::unique(swizzles.begin(), swizzles.end()), swizzles.end()); - - std::vector tmp; - std::copy_if(swis.begin(), swis.end(), std::back_inserter(tmp), [&](int swi) { - return std::find(swizzles.begin(), 
swizzles.end(), swi) != swizzles.end(); - }); - tmp.swap(swizzles); - - return swizzles; - } - /// TODO: move to cuda utils static std::unique_ptr GetCudaDeviceProps() { @@ -286,6 +239,8 @@ struct Gemm::Impl { std::optional measurer_; DispatchCache cache_; + + StaticGemmContext default_ctx_; }; // implementation of GEMM interfaces @@ -313,34 +268,15 @@ int Gemm::Run(const Operation& operation, cudaStream_t stream) { - if (Adesc.rows != Ddesc.rows || Bdesc.cols != Ddesc.cols || Adesc.cols != Bdesc.rows) { + Context& context = operation.context ? *operation.context : (Context&)impl_->default_ctx_; + + const auto desc = context.Init(operation, Adesc, Udesc, Bdesc, Vdesc, Cdesc, Ddesc); + + if (!desc) { + fprintf(stderr, "invalid argument.\n"); return -1; } - const int m = Ddesc.rows; - const int n = Ddesc.cols; - const int k = Adesc.cols; - - const GemmDesc desc{ - impl_->arch_, - Adesc.type, - Bdesc.type, - Ddesc.type, - Adesc.order, - Bdesc.order, - Ddesc.order, - Adesc.pack, - Bdesc.pack, - Udesc.pack, - Vdesc.pack, - operation.quant_a, - operation.quant_b, - operation.epilogue, - m, - n, - k, - }; - const auto launch = [=](LaunchSpec spec, cudaStream_t st) { auto _workspace = workspace; return spec.kernel->Launch(operation, @@ -365,7 +301,7 @@ int Gemm::Run(const Operation& operation, }; if (operation.reserved) { - auto specs = impl_->Find(desc, workspace.barriers_size, workspace.partials_size, 0); + auto specs = impl_->Find(context, *desc, workspace.barriers_size, workspace.partials_size, 0); auto cases = (std::vector>*)operation.reserved; for (const auto& spec : specs) { cases->push_back([=] { @@ -379,10 +315,10 @@ int Gemm::Run(const Operation& operation, LaunchSpec spec{}; if (operation.dispatch & DispatchPolicy::kMeasure) { - impl_->Measure(desc, workspace.barriers_size, workspace.partials_size, 1, launch, stream); + impl_->Measure(context, *desc, workspace.barriers_size, workspace.partials_size, 1, launch, stream); } - spec = impl_->Dispatch(operation.dispatch, desc, workspace.barriers_size, workspace.partials_size); + spec = impl_->Dispatch(context, operation.dispatch, *desc, workspace.barriers_size, workspace.partials_size); if (spec.kernel) { // std::cout << "[Gemm] dispatch: " << spec.kernel->name() // @@ -391,7 +327,7 @@ int Gemm::Run(const Operation& operation, return launch(spec, stream); } - printf("No feasible kernel found for the problem.\n"); + fprintf(stderr, "No feasible kernel found for the problem.\n"); return -1; } diff --git a/src/turbomind/kernels/gemm/gemm.h b/src/turbomind/kernels/gemm/gemm.h index 5a23486b1b..faf340f154 100644 --- a/src/turbomind/kernels/gemm/gemm.h +++ b/src/turbomind/kernels/gemm/gemm.h @@ -47,9 +47,11 @@ class Gemm { std::unique_ptr impl_; }; -[[nodiscard]] int -Convert(const void* S, const MatrixLayout& Sdesc, void* D, const MatrixLayout& Ddesc, cudaStream_t stream); +[[nodiscard]] int Convert(const void* S, const MatrixLayout& Sdesc, void* D, MatrixLayout& Ddesc, cudaStream_t stream); -std::tuple get_weight_and_scales_layout(int sm, bool force_simt); +std::tuple +get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool force_simt); + +void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream); } // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/gemm_universal.h b/src/turbomind/kernels/gemm/gemm_universal.h index 9375dcb6fa..3ae0effdad 100644 --- a/src/turbomind/kernels/gemm/gemm_universal.h +++ b/src/turbomind/kernels/gemm/gemm_universal.h @@ -8,6 +8,7 @@ #include 
"src/turbomind/kernels/core/layout.h" #include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/cta_map.h" #include "src/turbomind/kernels/gemm/desc.h" #include "src/turbomind/kernels/gemm/epilogue.h" #include "src/turbomind/kernels/gemm/thread_map.h" @@ -16,30 +17,19 @@ namespace turbomind::gemm { -template -struct GemmParams { - int m; - int n; - int k; - - PtrA A; - int lda; - PtrU U; - int ldu; - PtrB B; - int ldb; - PtrV V; - int ldv; - - int log_tile; - int3 tiled_shape; - - int chunk_per_split; - int chunk_offset; // splits - chunk_cnt % splits - - EpilogueParam epilogue; +struct GemmParam { + MatrixParam a; + MatrixParam b; + MatrixParam u; + MatrixParam v; }; +template +__inline__ __device__ MatrixData resolve_op(const MatrixParam& param, int gemm_id) +{ + return resolve(param, gemm_id); +} + template struct GemmUniversal { @@ -66,7 +56,8 @@ struct GemmUniversal { static constexpr int CTA_N = Impl::CTA_N; static constexpr int CTA_K = Impl::CTA_K; - static constexpr bool SplitK = Epilogue::SplitK; + static constexpr bool kDynamicSched = is_dynamic_scheduler::value; + static constexpr bool kSplitK = Epilogue::SplitK; using FragC = typename Impl::FragC; @@ -79,8 +70,8 @@ struct GemmUniversal { static constexpr int kChunkSizeK = std::max(CTA_K, std::max(OperandU::kGroupSize, OperandV::kGroupSize)); - static constexpr int kGroupSizeU = OperandU::kGroupSize; - static constexpr int kGroupSizeV = OperandV::kGroupSize; + static constexpr int kGSizeU = OperandU::kGroupSize; + static constexpr int kGSizeV = OperandV::kGroupSize; union SharedStorage { typename Mainloop::SharedStorage mainloop; @@ -95,78 +86,91 @@ struct GemmUniversal { static constexpr Pack kPackA = OperandA::kPack; static constexpr Pack kPackB = OperandB::kPack; - using PtrA = get_pointer_type; - using PtrB = get_pointer_type; - using PtrU = get_pointer_type; - using PtrV = get_pointer_type; - - using Param = GemmParams; + using Param = GemmParam; - __device__ void operator()(const Param& param, const CtaMap& cta_map, char* smem_buf) + __device__ void operator()(const Param& param, const EpilogueParam& epi_param, CtaMap& cta_map, char* smem_buf) { - const auto tile_offset = CtaMap::get_tile_offset(param.log_tile); - - const auto& tiled_shape = param.tiled_shape; + if (!cta_map.init()) { + return; + } - // Sub-optimal when the split is uneven - // e.g. 
ceil_div(10, 3) = 4 -> [4, 4, 2], however [3, 3, 4] is better in every aspect - // const int chunk_cnt = (param.k + kChunkSizeK - 1) / kChunkSizeK; - // const int chunk_per_split = (chunk_cnt + tiled_shape.z - 1) / tiled_shape.z; - // const int offset_k = chunk_per_split * kChunkSizeK * tile_offset.z; - // const int gemm_k_size = std::min(offset_k + chunk_per_split * kChunkSizeK, param.k) - offset_k; + const auto [M, N, K, L] = cta_map.gemm_shape(); + const auto tile_offset = cta_map.tile_offset(); - int chunk_id = tile_offset.z * param.chunk_per_split + max(tile_offset.z - param.chunk_offset, 0); - int offset_k = chunk_id * kChunkSizeK; - int gemm_k_size = (param.chunk_per_split + int(tile_offset.z >= param.chunk_offset)) * kChunkSizeK; - gemm_k_size = min(offset_k + gemm_k_size, param.k) - offset_k; + const auto [iter_k_beg, iter_k_end] = cta_map.iter_k_range(); const int offset_m = tile_offset.x * CTA_M; const int offset_n = tile_offset.y * CTA_N; + const int offset_k = iter_k_beg * CTA_K; - if (offset_m >= param.m || offset_n >= param.n || offset_k >= param.k) { // empty tile + if (offset_m >= M || offset_n >= N || offset_k >= K) { // empty tile return; } - const int end_m = min(CTA_M, param.m - offset_m); - const int end_n = min(CTA_N, param.n - offset_n); + const int extent_m = min(CTA_M, M - offset_m); + const int extent_n = min(CTA_N, N - offset_n); SharedStorage& storage = *reinterpret_cast(smem_buf); // Is 8 enough? __align__(8) FragC frag_C{}; - int tile_iter = (gemm_k_size + CTA_K - 1) / CTA_K; + // int tile_iter = (gemm_k_size + CTA_K - 1) / CTA_K; + int tile_iter = iter_k_end - iter_k_beg; - typename OperandA::GmemIter gmem_A{param.A, param.lda, {offset_m, offset_k}, {end_m, CTA_K}}; - typename OperandB::GmemIter gmem_B{param.B, param.ldb, {offset_n, offset_k}, {end_n, CTA_K}}; + const int g = tile_offset.w; - /// TODO: move `ceil_div` into `GmemIter` - typename OperandU::GmemIter gmem_U{ - param.U, param.ldu, {offset_m, ceil_div(offset_k, kGroupSizeU)}, {end_m, ceil_div(CTA_K, kGroupSizeU)}}; - typename OperandV::GmemIter gmem_V{ - param.V, param.ldv, {offset_n, ceil_div(offset_k, kGroupSizeV)}, {end_n, ceil_div(CTA_K, kGroupSizeV)}}; + const auto mat_A = resolve_op(param.a, g); + const auto mat_B = resolve_op(param.b, g); + const auto mat_U = resolve_op(param.u, g); + const auto mat_V = resolve_op(param.v, g); - Mainloop mainloop{}; + typename OperandA::GmemIter gmem_A{mat_A, {offset_m, offset_k}, {extent_m, CTA_K}}; + typename OperandB::GmemIter gmem_B{mat_B, {offset_n, offset_k}, {extent_n, CTA_K}}; + + const int2 offset_U{offset_m, cdiv(offset_k, kGSizeU)}, extent_U{extent_m, cdiv(CTA_K, kGSizeU)}; + typename OperandU::GmemIter gmem_U{mat_U, offset_U, extent_U}; + const int2 offset_V{offset_n, cdiv(offset_k, kGSizeV)}, extent_V{extent_n, cdiv(CTA_K, kGSizeV)}; + typename OperandV::GmemIter gmem_V{mat_V, offset_V, extent_V}; + + Mainloop mainloop{}; mainloop(gmem_A, gmem_B, gmem_U, gmem_V, frag_C, tile_iter, storage.mainloop); - Epilogue epilogue{}; + { + cta_map.init(); - const bool is_primary = offset_k + gemm_k_size == param.k; + const auto [M, N, K, L] = cta_map.gemm_shape(); - epilogue(frag_C, tile_offset, tiled_shape, end_m, end_n, is_primary, param.epilogue, storage.epilogue); + const auto tiled_shape = cta_map.tiled_shape(); + const auto tile_offset = cta_map.tile_offset(); + + const int2 extents = {min(CTA_M, M - tile_offset.x * CTA_M), min(CTA_N, N - tile_offset.y * CTA_N)}; + + const bool is_last = cta_map.iter_k_range().y * CTA_K == K; + + Epilogue epilogue{}; 
+ epilogue(frag_C, // + tile_offset, + tiled_shape, + extents, + cta_map.tile_id(), + is_last, + epi_param, + storage.epilogue); + } } }; extern __shared__ char smem_buf[]; -template -__global__ void gemm_kernel(Params params, CtaMap cta_map) +template +__global__ void gemm_kernel(Param param, EpilogueParam epi_param, CtaMap cta_map) { #if __CUDA_ARCH__ if constexpr (Kernel::Arch::is_compatible(__CUDA_ARCH__)) { Kernel kernel; - kernel(params, cta_map, smem_buf); + kernel(param, epi_param, cta_map, smem_buf); } #endif } diff --git a/src/turbomind/kernels/gemm/iterator.h b/src/turbomind/kernels/gemm/iterator.h index 71152c11fe..ae137a2648 100644 --- a/src/turbomind/kernels/gemm/iterator.h +++ b/src/turbomind/kernels/gemm/iterator.h @@ -5,18 +5,18 @@ #include "src/turbomind/kernels/core/array.h" #include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/matrix_ptr.h" #include "src/turbomind/kernels/gemm/thread_map.h" +#include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/kernels/gemm/utils.h" namespace turbomind::gemm { struct VoidGmemIter { - static constexpr int ITER_S = 0; - using Fragments = int; - template - __device__ VoidGmemIter(P, int, int2, int2) - { - } + static constexpr int ITER_S = 0; + static constexpr auto kMode = Striding::kFlat; + using Fragments = int; + __device__ VoidGmemIter(...) {} __device__ void ClearSmem() {} __device__ void Prefetch(int, int, bool) {} __device__ void Prefetch(bool) {} @@ -40,13 +40,14 @@ struct GetGmemIter { constexpr int2 kAligned = mk2cs(0, 1); constexpr int2 kCS = mk2cs(M, K); +#if 0 constexpr int kMaxThrS = std::min(WARP_SIZE, ceil_div(kCS.y, WARPS)); constexpr int kMaxThrC = std::min(WARP_SIZE, ceil_div(kCS.x, kAccessSize)); constexpr int kTgtThrC = ceil_div(256, sizeof(Array)); constexpr int kWarpThrC = std::min(kMaxThrC, std::max(WARP_SIZE / kMaxThrS, kTgtThrC)); - +#endif using GmemIter = typename Iterator::template Type, SmemLayout, diff --git a/src/turbomind/kernels/gemm/iterator_sm70.h b/src/turbomind/kernels/gemm/iterator_sm70.h index 4f32777da0..fc52463377 100644 --- a/src/turbomind/kernels/gemm/iterator_sm70.h +++ b/src/turbomind/kernels/gemm/iterator_sm70.h @@ -7,6 +7,7 @@ #include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/core/layout.h" #include "src/turbomind/kernels/gemm/cp_async.h" +#include "src/turbomind/kernels/gemm/matrix_ptr.h" #include "src/turbomind/kernels/gemm/predicate.h" #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/kernels/gemm/utils.h" @@ -34,7 +35,15 @@ inline __device__ void _Ld(Array& dst, const T* src) } } -template +template struct GmemIteratorSm70 { using ThreadMap = Map; @@ -47,6 +56,9 @@ struct GmemIteratorSm70 { static constexpr int ITER_S = Map::kIterS; static constexpr int ITER_C = Map::kIterC; + static constexpr Striding kMode = mode; + static constexpr bool is_indexed = mode == Striding::kIndexed; + const char* src_data_; int src_offset_; @@ -72,6 +84,8 @@ struct GmemIteratorSm70 { int phases_[kPeriodS][kPeriodC]; + const char* src_data_vec_[ITER_S]; + using Fragments = AccessType[Map::kIterS][Map::kIterC]; __device__ static constexpr int2 pack(int2 mk) @@ -86,23 +100,23 @@ struct GmemIteratorSm70 { __device__ GmemIteratorSm70(): smem_data_{Pointer{nullptr}} {}; - __device__ GmemIteratorSm70(Pointer data, int stride_s, int2 offset, int2 extent): smem_data_{Pointer{(T*)nullptr}} + __device__ GmemIteratorSm70(const MatrixData& mat, int2 offset, int2 extent): 
smem_data_{Pointer{(T*)nullptr}} { const int warp_id = threadIdx.x / WARP_SIZE; const int lane_id = threadIdx.x % WARP_SIZE; - data = data + cs2idx(to_cs(pack(offset)), stride_s); - extent = to_cs(pack(extent)); + const Pointer data{(T*)mat.ptr.ptr}; + const int ld = mat.ptr.stride; - int2 offsets = Map::get_offset(warp_id, lane_id); - int src_offset = offsets.x + offsets.y * stride_s; + const int2 offsets = Map::get_offset(warp_id, lane_id); offset_c_ = offsets.x; offset_s_ = offsets.y; - auto src_ptr = reinterpret_cast((T*)data); + // auto src_ptr = reinterpret_cast((T*)data); if constexpr (pred_.is_active) { + extent = to_cs(pack(extent)); PRAGMA_UNROLL for (int s = 0; s < Map::kIterS; ++s) { PRAGMA_UNROLL @@ -124,15 +138,29 @@ struct GmemIteratorSm70 { } } + const int src_offset = is_indexed ? offsets.x : offsets.x + offsets.y * ld; + src_offset_ = src_offset * bitsof / bitsof; src_step_c_ = bitsof * Map::kDeltaC / bitsof; - src_step_s_ = bitsof * Map::kDeltaS * stride_s / bitsof; + src_step_s_ = bitsof * Map::kDeltaS * ld / bitsof; - src_step_k_ = bitsof * cs2mk(Map::kDimC, Map::kDimS * stride_s).y / bitsof; + src_step_k_ = bitsof * cs2mk(Map::kDimC, Map::kDimS * ld).y / bitsof; // initialize for the first tile - src_data_ = src_ptr + src_offset_; + if constexpr (is_indexed) { + const int2 cta_cs = to_cs(offset); + for (int s = 0; s < ITER_S; ++s) { + const int ss = cta_cs.y + offset_s_ + s * Map::kDeltaS; + const int idx = (mat.idxs && pred_(s, 0)) ? __ldg(mat.idxs + ss) : ss; + const auto tmp = data + cs2idx({cta_cs.x, idx}, ld); + src_data_vec_[s] = reinterpret_cast((T*)tmp) + src_offset_; + } + } + else { + auto src_data = data + cs2idx(to_cs(pack(offset)), ld); + src_data_ = reinterpret_cast((T*)src_data) + src_offset_; + } } __device__ constexpr int _src_step_k() const @@ -156,39 +184,12 @@ struct GmemIteratorSm70 { } } - __device__ void Prefetch(int begin, int count, bool tile_mask) - { - PRAGMA_UNROLL - for (int s = begin; s < begin + count && s < Map::kIterS; ++s) { - PRAGMA_UNROLL - for (int c = 0; c < Map::kIterC; ++c) { - // auto dst = &smem_data_(offset_s_ + s * Map::kDeltaS, offset_c_ + c * Map::kDeltaC); - - const int i0 = SmemLayout::apply( // - s / kPeriodS * kPeriodS * Map::kDeltaS, - c / kPeriodC * kPeriodC * Map::kDeltaC); - const int i1 = phases_[s % kPeriodS][c % kPeriodC]; - auto dst = &smem_data_.ptr_[i0 + i1]; - - Copy(std::true_type{}, dst, src_data_ + src_step_c_ * c, tile_mask && g_mask && pred_(s, c)); - } - src_data_ += src_step_s_; - if (s == Map::kIterS - 1) { - src_data_ -= src_step_s_ * Map::kIterS; - src_data_ += _src_step_k(); - } - } - } - - __device__ void Prefetch(bool tile_mask) - { - Prefetch(0, Map::kIterS, tile_mask); - } - __device__ void Advance() { - if (!g_mask) { - src_data_ -= _src_step_k(); + if constexpr (!is_indexed) { + if (!g_mask) { + src_data_ -= _src_step_k(); + } } } @@ -210,14 +211,25 @@ struct GmemIteratorSm70 { { PRAGMA_UNROLL for (int s = 0; s < Map::kIterS; ++s) { + + if constexpr (is_indexed) { + src_data_ = src_data_vec_[s]; + } + PRAGMA_UNROLL for (int c = 0; c < Map::kIterC; ++c) { Copy2(frags[s][c], src_data_ + src_step_c_ * c, tile_mask && g_mask && pred_(s, c)); } - src_data_ += src_step_s_; - if (s == Map::kIterS - 1) { - src_data_ -= src_step_s_ * Map::kIterS; - src_data_ += _src_step_k(); + + if constexpr (is_indexed) { + src_data_vec_[s] += _src_step_k(); + } + else { + src_data_ += src_step_s_; + if (s == Map::kIterS - 1) { + src_data_ -= src_step_s_ * Map::kIterS; + src_data_ += _src_step_k(); + } } } } @@ 
-256,10 +268,10 @@ struct GmemIteratorSm70 { } }; -template +template struct IteratorSm70 { template - using Type = GmemIteratorSm70; + using Type = GmemIteratorSm70; }; } // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/iterator_sm80.h b/src/turbomind/kernels/gemm/iterator_sm80.h index eab85e67b7..a625c68b65 100644 --- a/src/turbomind/kernels/gemm/iterator_sm80.h +++ b/src/turbomind/kernels/gemm/iterator_sm80.h @@ -8,6 +8,7 @@ #include "src/turbomind/kernels/core/layout.h" #include "src/turbomind/kernels/core/smem.h" #include "src/turbomind/kernels/gemm/cp_async.h" +#include "src/turbomind/kernels/gemm/matrix_ptr.h" #include "src/turbomind/kernels/gemm/predicate.h" #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/kernels/gemm/utils.h" @@ -16,7 +17,15 @@ namespace turbomind::gemm { -template +template struct GmemIteratorSm80 { using ThreadMap = Map; @@ -29,6 +38,9 @@ struct GmemIteratorSm80 { static constexpr int ITER_S = Map::kIterS; static constexpr int ITER_C = Map::kIterC; + static constexpr Striding kMode = mode; + static constexpr bool is_indexed = mode == Striding::kIndexed; + const char* src_data_; int src_offset_; @@ -54,6 +66,8 @@ struct GmemIteratorSm80 { int phases_[kPeriodS][kPeriodC]; + const char* src_data_vec_[ITER_S]; + uint64_t cache_policy_{}; __device__ static constexpr int2 pack(int2 mk) @@ -68,23 +82,21 @@ struct GmemIteratorSm80 { __device__ GmemIteratorSm80(): smem_data_{Pointer{nullptr}} {}; - __device__ GmemIteratorSm80(Pointer data, int stride_s, int2 offset, int2 extent): smem_data_{Pointer{(T*)nullptr}} + __device__ GmemIteratorSm80(const MatrixData& mat, int2 offset, int2 extent): smem_data_{Pointer{(T*)nullptr}} { const int warp_id = threadIdx.x / WARP_SIZE; const int lane_id = threadIdx.x % WARP_SIZE; - data = data + cs2idx(to_cs(pack(offset)), stride_s); - extent = to_cs(pack(extent)); + const Pointer data{(T*)mat.ptr.ptr}; + const int ld = mat.ptr.stride; - int2 offsets = Map::get_offset(warp_id, lane_id); - int src_offset = offsets.x + offsets.y * stride_s; + const int2 offsets = Map::get_offset(warp_id, lane_id); offset_c_ = offsets.x; offset_s_ = offsets.y; - auto src_ptr = reinterpret_cast((T*)data); - if constexpr (pred_.is_active) { + extent = to_cs(pack(extent)); PRAGMA_UNROLL for (int s = 0; s < Map::kIterS; ++s) { PRAGMA_UNROLL @@ -106,15 +118,29 @@ struct GmemIteratorSm80 { } } + const int src_offset = is_indexed ? offsets.x : offsets.x + offsets.y * ld; + src_offset_ = src_offset * bitsof / bitsof; src_step_c_ = bitsof * Map::kDeltaC / bitsof; - src_step_s_ = bitsof * Map::kDeltaS * stride_s / bitsof; - - src_step_k_ = bitsof * cs2mk(Map::kDimC, Map::kDimS * stride_s).y / bitsof; - - // initialize for the first tile - src_data_ = src_ptr + src_offset_; + src_step_s_ = bitsof * Map::kDeltaS * ld / bitsof; + + src_step_k_ = bitsof * cs2mk(Map::kDimC, Map::kDimS * ld).y / bitsof; + + // Initialize for the first tile + if constexpr (is_indexed) { + const int2 cta_cs = to_cs(offset); + for (int s = 0; s < ITER_S; ++s) { + const int ss = cta_cs.y + offset_s_ + s * Map::kDeltaS; + const int idx = (mat.idxs && pred_(s, 0)) ? 
__ldg(mat.idxs + ss) : ss; + const auto tmp = data + cs2idx({cta_cs.x, idx}, ld); + src_data_vec_[s] = reinterpret_cast((T*)tmp) + src_offset_; + } + } + else { + auto src_data = data + cs2idx(to_cs(pack(offset)), ld); + src_data_ = reinterpret_cast((T*)src_data) + src_offset_; + } #if TURBOMIND_ARCH_SM80 if constexpr (Policy::kEvictPolicy != EvictPolicy::kEvictNormal) { @@ -148,6 +174,11 @@ struct GmemIteratorSm80 { { PRAGMA_UNROLL for (int s = begin; s < begin + count && s < Map::kIterS; ++s) { + + if constexpr (is_indexed) { + src_data_ = src_data_vec_[s]; + } + PRAGMA_UNROLL for (int c = 0; c < Map::kIterC; ++c) { // auto dst = &smem_data_(offset_s_ + s * Map::kDeltaS, offset_c_ + c * Map::kDeltaC); @@ -160,10 +191,16 @@ struct GmemIteratorSm80 { CpAsync(std::true_type{}, dst, src_data_ + src_step_c_ * c, tile_mask && g_mask && pred_(s, c)); } - src_data_ += src_step_s_; - if (s == Map::kIterS - 1) { - src_data_ -= src_step_s_ * Map::kIterS; - src_data_ += _src_step_k(); + + if constexpr (is_indexed) { + src_data_vec_[s] += _src_step_k(); + } + else { + src_data_ += src_step_s_; + if (s == Map::kIterS - 1) { + src_data_ -= src_step_s_ * Map::kIterS; + src_data_ += _src_step_k(); + } } } } @@ -175,8 +212,10 @@ struct GmemIteratorSm80 { __device__ void Advance() { - if (!g_mask) { - src_data_ -= _src_step_k(); + if constexpr (!is_indexed) { + if (!g_mask) { + src_data_ -= _src_step_k(); + } } } @@ -204,10 +243,10 @@ struct GmemIteratorSm80 { } }; -template +template struct IteratorSm80 { template - using Type = GmemIteratorSm80; + using Type = GmemIteratorSm80; }; } // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel.cu b/src/turbomind/kernels/gemm/kernel.cu index 4b2eddc36a..b6849007c0 100644 --- a/src/turbomind/kernels/gemm/kernel.cu +++ b/src/turbomind/kernels/gemm/kernel.cu @@ -12,6 +12,32 @@ namespace turbomind::gemm { +bool accept(Striding a, Striding b) +{ + if (a == Striding::kBlocked) { + switch (b) { + case Striding::kBlocked: + case Striding::kFlat: + return true; + default: + return false; + } + } + else if (a == Striding::kIndexed) { + switch (b) { + case Striding::kFlat: + case Striding::kBlocked: + case Striding::kIndexed: + return true; + default: + return false; + } + } + else { + return a == b; + } +} + bool Kernel::is_feasible(const GemmDesc& desc) const noexcept { constexpr bool debug = 0; @@ -19,6 +45,8 @@ bool Kernel::is_feasible(const GemmDesc& desc) const noexcept if constexpr (debug) printf("S\n"); + // printf("%d %d\n", desc.arch, desc_.arch); + if (!is_arch_compatible(desc_.arch, desc.arch)) { return false; } @@ -30,6 +58,16 @@ bool Kernel::is_feasible(const GemmDesc& desc) const noexcept return false; } + if (desc.sched != desc_.sched) { + return false; + } + + if (!(accept(desc_.striding_a, desc.striding_a) // + && accept(desc_.striding_b, desc.striding_b) // + && accept(desc_.striding_c, desc.striding_c))) { + return false; + } + if constexpr (debug) printf("A\n"); @@ -82,64 +120,9 @@ bool Kernel::is_feasible(const GemmDesc& desc) const noexcept return true; } -std::vector> -Kernel::Estimate_v2(std::array size, int max_splits, int max_waves, int sm_count) const -{ - const auto [m, n, k] = size; - const int64_t tiled_shape_m = ceil_div(m, desc_.cta_tile.x); - const int64_t tiled_shape_n = ceil_div(n, desc_.cta_tile.y); - const int chunk_cnt_k = ceil_div(k, chunk_size_k_); - - // Despite we only have sm_count * constant tensor cores, this is the granularity for scheduling - const int concurrency = sm_count * desc_.max_active_ctas; - 
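A minimal sketch of what the Striding::kIndexed branches added to GmemIteratorSm70/Sm80 above amount to: when mat.idxs is set, each row of the CTA tile is gathered through the index array instead of being addressed at a dense row offset (the values below are made up for illustration).

#include <cstdio>

int main()
{
    const float src[6]   = {0.f, 10.f, 20.f, 30.f, 40.f, 50.f};  // 6 source rows, 1 column
    const int   owned[3] = {4, 1, 5};                            // rows this tile should read
    const int*  idxs     = owned;                                // mat.idxs; nullptr means the identity mapping
    for (int s = 0; s < 3; ++s) {
        const int row = idxs ? idxs[s] : s;
        std::printf("tile row %d gathers source row %d -> %g\n", s, row, src[row]);
    }
    return 0;
}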
const float waves_per_split = float(tiled_shape_m * tiled_shape_n) / concurrency; - const float splits_per_wave = 1.f / waves_per_split; - - // Tile quantization - const int64_t ceil_m = tiled_shape_m * desc_.cta_tile.x; - const int64_t ceil_n = tiled_shape_n * desc_.cta_tile.y; - - std::vector> metrics; - - for (int splits = 1; splits <= max_splits; ++splits) { - // Split quantization, penalize uneven splits - const int64_t split_ceil_k = ceil_div(chunk_cnt_k, splits) * chunk_size_k_; - // Footprint for single split - const int64_t split_mma_cost = ceil_m * ceil_n * split_ceil_k; - // Footprint for single wave - const int64_t wave_mma_cost = split_mma_cost * splits_per_wave; - - // Wave quantization - // const int waves = (int)std::ceil(wave_per_split * splits); - - // Bold simulation of thread block scheduling - const int grid_size = tiled_shape_m * tiled_shape_n * splits; - const int full_waves = grid_size / concurrency; - const int residue = grid_size % concurrency; - const float partial_wave = (float)ceil_div(residue, sm_count) / desc_.max_active_ctas; - const float waves = full_waves + partial_wave; - - if (splits > 1 && waves > max_waves) { - break; - } - // ceil(tiled_mn / C * splits) * C / tiled_mn * ceil_m * ceil_n * split_ceil_k - const int64_t mma_cost = wave_mma_cost * waves; - - // IO has less severe quantization effect - const int64_t mio_cost_a = get_size(desc_.type_a, tiled_shape_n * m * split_ceil_k) * splits; - const int64_t mio_cost_b = get_size(desc_.type_b, tiled_shape_m * n * split_ceil_k) * splits; - /// TODO: read type from `desc_.accum` when added - const int64_t mio_cost_c = get_size(DataType::F32, (int64_t)m * n) * (splits - 1) * 2; - const int64_t mio_cost = mio_cost_a + mio_cost_b + mio_cost_c; - - // std::cout << name() << " " << splits << " " << waves << " " << (float)mio_cost << " " << (float)mma_cost - // << "\n"; - - metrics.emplace_back(splits, KernelMetric{mio_cost, mma_cost}); - } - - return metrics; -} +// mm: m * n * k, m * k, n * k, m * n +// Bmm: b * m * n * k, b * m * k, b * n * k, b * m * n +// Gmm: S $ M * n * k, S $ M * k, S $ n * k, S $ M * n std::string Kernel::GetName() const { @@ -155,16 +138,21 @@ std::string Kernel::GetName() const ss << "g" << desc_.quant_b.group_size; } ss << "_" << to_string(desc_.type_c); - ss << "_" // - << (desc_.order_a == kColMajor ? 'n' : 't') // - << (desc_.order_b == kColMajor ? 'n' : 't') // - << (desc_.order_c == kColMajor ? 'n' : 't'); // + ss << "_" // + << (desc_.order_a == kColMajor ? 'n' : 't') // + << (desc_.order_b == kColMajor ? 'n' : 't') // + << (desc_.order_c == kColMajor ? 'n' : 't'); // + ss << "_" // + << to_string(desc_.striding_a) // + << to_string(desc_.striding_b) // + << to_string(desc_.striding_c); ss << "_" << desc_.cta_tile.x << "x" << desc_.cta_tile.y << "x" << desc_.cta_tile.z // << "_" << desc_.stages // << "_" << to_string(desc_.op_class) // - << "_" << desc_.mma_tile.x << "x" << desc_.mma_tile.y << "x" << desc_.mma_tile.z // - << "_c" << desc_.c_tile.x << "x" << desc_.c_tile.y // - << "_a" << desc_.align.x << "x" << desc_.align.y << "x" << desc_.align.z // + << "_" << desc_.mma_tile.x << "x" << desc_.mma_tile.y << "x" << desc_.mma_tile.z; + ss << (desc_.sched ? 
"_dynamic" : ""); + ss << "_c" << desc_.c_tile.x << "x" << desc_.c_tile.y // + << "_a" << desc_.align.x << "x" << desc_.align.y << "x" << desc_.align.z // << "_" << desc_.policy_a << desc_.policy_b; return ss.str(); diff --git a/src/turbomind/kernels/gemm/kernel.h b/src/turbomind/kernels/gemm/kernel.h index 34e6094887..d7f6f73b36 100644 --- a/src/turbomind/kernels/gemm/kernel.h +++ b/src/turbomind/kernels/gemm/kernel.h @@ -39,18 +39,14 @@ class Kernel { int swizzle, int splits, Workspace& workspace, - cudaStream_t stream) = 0; - - // virtual because different implementation may have different workspace requeirements - virtual int GetMaxSplits(int m, int n, int k, size_t barrier_size, size_t partials_size) = 0; + cudaStream_t stream) const = 0; // true if this kernel can be used to compute the gemm bool is_feasible(const GemmDesc& desc) const noexcept; - std::vector> - Estimate_v2(std::array size, int max_splits, int max_waves, int sm_count) const; + virtual int GetMaxSplits(const int4& shape, int64_t tiles, size_t bsize, size_t psize) const = 0; - virtual int GetSwizzle(int m, int n, int k, int splits, int swizzle) = 0; + virtual int GetSwizzle(int m, int n, int k, int splits, int swizzle) const = 0; const KernelDesc& desc() const noexcept { diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu index 3d678df08e..04cc5a49f6 100644 --- a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu @@ -19,7 +19,11 @@ void Registry::f16_u4g128_f16_tnt_sm70_s884() Transform_HMMA_SIMT_B, typename GetOperand::Operand, kRowMajor, - half>; + half, + Striding::kFlat, + Striding::kFlat, + Striding::kFlat, + GemmScheduler>; using namespace cache_policy; diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu index e2a0e3c4a5..93dde16b2a 100644 --- a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu @@ -1,6 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. 
#include "src/turbomind/kernels/gemm/arch/config_sm75_s16816.h" +#include "src/turbomind/kernels/gemm/cta_map.h" #include "src/turbomind/kernels/gemm/operand.h" #include "src/turbomind/kernels/gemm/registry.h" #include "src/turbomind/kernels/gemm/transform.h" @@ -16,11 +17,15 @@ void Registry::f16_u4g128_f16_tnt_sm75_s16816() using C = Sm75_s16816, Transform_Default, VoidOperand, - Operand_B_Pack, + Operand_B_Pack, Transform_HMMA_16816<1, 0>, Operand_UV_Pack, kRowMajor, - half>; + half, + Striding::kFlat, + Striding::kFlat, + Striding::kFlat, + GemmScheduler>; using S = cache_policy::Stream; using D = cache_policy::Default; diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu index a97919d20c..db73c713a5 100644 --- a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu @@ -27,7 +27,11 @@ void Registry::f16_u4g128_f16_tnt_sm75_simt() Transform_HMMA_SIMT_B, Operand_V, kRowMajor, - half>; + half, + Striding::kFlat, + Striding::kFlat, + Striding::kFlat, + GemmScheduler>; // clang-format off Add>(); diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu index 8b188a44d2..3f589a2389 100644 --- a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu @@ -1,6 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. #include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h" +#include "src/turbomind/kernels/gemm/cta_map.h" #include "src/turbomind/kernels/gemm/registry.h" #include "src/turbomind/kernels/gemm/transform.h" #include "src/turbomind/kernels/gemm/types.h" @@ -9,21 +10,26 @@ namespace turbomind::gemm { void Registry::f16_u4g128_f16_tnt_sm80_s16816() { +#if 1 using namespace sm80_s16816; using namespace cache_policy; using S = cache_policy::Stream; using D = cache_policy::Default; using C = Sm80_s16816, // A - Transform_Default, // tarnsform A - VoidOperand, // U - Operand_B_Pack, // B - Transform_HMMA_16816<1, 0>, // transform B - Operand_UV_Pack, // V - kRowMajor, // order_C - half>; // Tc - + half, + Operand_A, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_HMMA_16816<1, 0>, // transform B + Operand_UV_Pack, // V + kRowMajor, // order_C + half, // Tc + Striding::kFlat, + Striding::kFlat, + Striding::kFlat, + GemmScheduler>; // clang-format off // Add>(); // 0/0 Add>(); // 30/3 @@ -61,6 +67,7 @@ void Registry::f16_u4g128_f16_tnt_sm80_s16816() Add>(); Add>(); // clang-format on +#endif } // sm80_f16_u4g128_f16_ttt_128x256x32_4_s16816_1x8x1_c128x128_a1x32x32_00: 46 diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu index 908451ed00..c0ee423bfc 100644 --- a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu @@ -17,14 +17,19 @@ void Registry::f16_u4g128_f16_tnt_sm90_s16816() using D = cache_policy::Default; using C = Sm80_s16816, // A - Transform_Default, // tarnsform A - VoidOperand, // U - Operand_B_Pack, // B - Transform_HMMA_16816<1, 0>, // transform B - Operand_UV_Pack, // V - kRowMajor, // order_C - half>; // Tc + half, + Operand_A, // A + Transform_Default, // tarnsform A + VoidOperand, // U 
+ Operand_B_Pack, // B + Transform_HMMA_16816<1, 0>, // transform B + Operand_UV_Pack, // V + kRowMajor, // order_C + half, // Tc + Striding::kFlat, + Striding::kFlat, + Striding::kFlat, + GemmScheduler>; // clang-format off Add>(); diff --git a/src/turbomind/kernels/gemm/kernel/sm70_s884_dynamic.cu b/src/turbomind/kernels/gemm/kernel/sm70_s884_dynamic.cu new file mode 100644 index 0000000000..8b630b6d50 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/sm70_s884_dynamic.cu @@ -0,0 +1,77 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/arch/config_sm70_s884.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +using namespace sm70_s884; +using namespace cache_policy; +using S = cache_policy::Stream; +using D = cache_policy::Default; + +void Registry::sm70_s884_dynamic() +{ + if constexpr (1) { + using C = Sm70_s884, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_Default, // transform B + VoidOperand, // V + kRowMajor, // order_C + half, // Tc + Striding::kIndexed, // indexed input + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } + + if constexpr (1) { + using C = Sm70_s884, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_HMMA_SIMT_B, // transform B, + Operand_V_Pack, // V + kRowMajor, // order_C + half, // Tc + Striding::kIndexed, // indexed input + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/sm75_s16816_dynamic.cu b/src/turbomind/kernels/gemm/kernel/sm75_s16816_dynamic.cu new file mode 100644 index 0000000000..3c535b8223 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/sm75_s16816_dynamic.cu @@ -0,0 +1,73 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
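The configs registered above mark A as Striding::kIndexed ("indexed input") and B/C as Striding::kBlocked. The sketch below is a rough host-side illustration of the semantics these modes imply for a grouped problem; the buffer layouts, names and use of float are assumptions for illustration only, not the data path of the SM70/SM75 kernels.

#include <cstddef>
#include <vector>

// Reference semantics of a grouped GEMM with an indexed A operand and blocked B/C
// (row-major float buffers assumed purely for illustration):
//   A        : [tokens, k]      shared activation buffer, rows gathered via an index list
//   B[g]     : [k, n]           per-group (per-expert) weight matrix
//   C        : [total_rows, n]  blocked output, group g occupies rows [offsets[g], offsets[g+1])
//   idxs[r]  : source row of A feeding output row r
void grouped_gemm_ref(const float* A, const std::vector<const float*>& B, float* C,
                      const int* idxs, const int* offsets, int num_groups, int k, int n)
{
    for (int g = 0; g < num_groups; ++g) {
        for (int r = offsets[g]; r < offsets[g + 1]; ++r) {
            const float* a = A + (std::size_t)idxs[r] * k;  // indexed A: gather the row
            float*       c = C + (std::size_t)r * n;        // blocked C: contiguous per group
            for (int j = 0; j < n; ++j) {
                float acc = 0.f;
                for (int kk = 0; kk < k; ++kk) {
                    acc += a[kk] * B[g][(std::size_t)kk * n + j];
                }
                c[j] = acc;
            }
        }
    }
}

The `f2n` and `offsets` buffers produced by `invokeMoeGate_V2` later in this patch have exactly this shape of mapping (flattened slot to source row, plus per-expert prefix sums).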
+ +#include "src/turbomind/kernels/gemm/arch/config_sm75_s16816.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +using namespace sm75_s16816; +using namespace cache_policy; +using S = cache_policy::Stream; +using D = cache_policy::Default; + +void Registry::sm75_s16816_dynamic() +{ + if constexpr (1) { + using C = Sm75_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_Default, // transform B + VoidOperand, // V + kRowMajor, // order_C + half, // Tc + Striding::kIndexed, + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } + + if constexpr (1) { + using C = Sm75_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_HMMA_16816<1, 0>, // transform B, + Operand_UV_Pack, // V + kRowMajor, // order_C + half, // Tc + Striding::kIndexed, + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/sm80_s16816_dynamic.cu b/src/turbomind/kernels/gemm/kernel/sm80_s16816_dynamic.cu new file mode 100644 index 0000000000..16b4139b50 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/sm80_s16816_dynamic.cu @@ -0,0 +1,123 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +using namespace sm80_s16816; +using namespace cache_policy; +using S = cache_policy::Stream; +using D = cache_policy::Default; + +template +void Registry::sm80_s16816_dynamic() +{ + if constexpr (std::is_same_v) { + using C = Sm80_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_Default, // transform B + VoidOperand, // V + kRowMajor, // order_C + half, // Tc + Striding::kIndexed, // indexed input + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); + Add>(); // 10 + Add>(); + Add>(); // 6 + Add>(); + Add>(); + Add>(); // 2 + Add>(); + Add>(); // * + Add>(); + Add>(); // 4 + Add>(); + Add>(); + Add>(); // 10 + Add>(); + // clang-format on + } + else if constexpr (std::is_same_v) { + using C = Sm80_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_Default, // transform B + VoidOperand, // V + kRowMajor, // order_C + nv_bfloat16, // Tc + Striding::kIndexed, // indexed input + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); + Add>(); // 10 + Add>(); + Add>(); // 6 + Add>(); + Add>(); + Add>(); // 2 + Add>(); + Add>(); // * + Add>(); + Add>(); // 4 + Add>(); + Add>(); + Add>(); // 10 + Add>(); + // clang-format on + } + + if constexpr (std::is_same_v) { + using C = Sm80_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_HMMA_16816<1, 0>, 
// transform B, + Operand_UV_Pack, // V + kRowMajor, // order_C + half, // Tc + Striding::kIndexed, // indexed input + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); // 10 + 5 + 4 + 10 + 10, 37 + Add>(); // 1 + 6 + 4 + 4 + 2, 3 + Add>(); // 7 + 4 + 6 + 2 + 4, 26 + Add>(); // 18 + Add>(); // 2 + Add>(); // 1 + 2 + 2 + 2 + 2, 2 + Add>(); // 9 + Add>(); // 22 + Add>(); // 8 + Add>(); // 1 + 13 + 9 + 13 + 7, 7 + Add>(); // 12 + 2 + 6 + 2 + 8, 42 + // clang-format on + } +} + +template void Registry::sm80_s16816_dynamic(); +template void Registry::sm80_s16816_dynamic(); + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/sm90_s16816_dynamic.cu b/src/turbomind/kernels/gemm/kernel/sm90_s16816_dynamic.cu new file mode 100644 index 0000000000..378c9caa93 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/sm90_s16816_dynamic.cu @@ -0,0 +1,123 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +using namespace sm80_s16816; +using namespace cache_policy; +using S = cache_policy::Stream; +using D = cache_policy::Default; + +template +void Registry::sm90_s16816_dynamic() +{ + if constexpr (std::is_same_v) { + using C = Sm80_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_Default, // transform B + VoidOperand, // V + kRowMajor, // order_C + half, // Tc + Striding::kIndexed, // indexed input + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } + else if constexpr (std::is_same_v) { + using C = Sm80_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_Default, // transform B + VoidOperand, // V + kRowMajor, // order_C + nv_bfloat16, // Tc + Striding::kIndexed, // indexed input + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } + + if constexpr (std::is_same_v) { + using C = Sm80_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_HMMA_16816<1, 0>, // transform B, + Operand_UV_Pack, // V + kRowMajor, // order_C + half, // Tc + Striding::kIndexed, // indexed input + Striding::kBlocked, + Striding::kBlocked, + DynamicScheduler>; + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } +} + +template void Registry::sm90_s16816_dynamic(); +template void Registry::sm90_s16816_dynamic(); + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/u4g128_f16_f16_nnn_sm80_s16816.cu b/src/turbomind/kernels/gemm/kernel/u4g128_f16_f16_nnn_sm80_s16816.cu index 4e1a071ae7..f19f7abd36 100644 --- a/src/turbomind/kernels/gemm/kernel/u4g128_f16_f16_nnn_sm80_s16816.cu +++ b/src/turbomind/kernels/gemm/kernel/u4g128_f16_f16_nnn_sm80_s16816.cu @@ -9,23 +9,23 @@ 
namespace turbomind::gemm { -using namespace sm80_s16816; -template -using Config_ = Sm80_s16816, // A - Transform_HMMA_16816<0, 1>, // tarnsform A - Operand_UV_Pack, // U - Operand_B, // B - Transform_Default, // transform B - VoidOperand, // V - kColMajor, // order_C - half, // Tc - CtaMapN>; - void Registry::u4g128_f16_f16_nnn_sm80_s16816() { // ! Must be M-major MMA #if 0 + using namespace sm80_s16816; + template + using Config_ = Sm80_s16816, // A + Transform_HMMA_16816<0, 1>, // tarnsform A + Operand_UV_Pack, // U + Operand_B, // B + Transform_Default, // transform B + VoidOperand, // V + kColMajor, // order_C + half, // Tc + CtaMapN>; + using namespace cache_policy; using C16 = Config_<16>; diff --git a/src/turbomind/kernels/gemm/kernel_impl.h b/src/turbomind/kernels/gemm/kernel_impl.h index 171b0b6952..3980e1d222 100644 --- a/src/turbomind/kernels/gemm/kernel_impl.h +++ b/src/turbomind/kernels/gemm/kernel_impl.h @@ -4,8 +4,13 @@ #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/gemm/context.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/epilogue.h" #include "src/turbomind/kernels/gemm/gemm_universal.h" #include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/matrix_ptr.h" #include "src/turbomind/kernels/gemm/operand.h" #include "src/turbomind/kernels/gemm/thread_group_map.h" #include "src/turbomind/kernels/gemm/types.h" @@ -38,6 +43,13 @@ class KernelImpl: public Kernel { desc_.type_b = get_data_type_v; desc_.type_c = get_data_type_v; + using IterA = typename OpA::GmemIter; + using IterB = typename OpB::GmemIter; + + desc_.striding_a = IterA::kMode; + desc_.striding_b = IterB::kMode; + desc_.striding_c = Gemm::Epilogue::kMode; + desc_.pack_a = OpA::kPack; desc_.pack_b = OpB::kPack; desc_.pack_u = OpU::kPack; @@ -58,9 +70,6 @@ class KernelImpl: public Kernel { desc_.mma_tile = {Impl::MMA_Map::kGroupM, Impl::MMA_Map::kGroupN, Impl::MMA_Map::kGroupK}; chunk_size_k_ = Gemm::kChunkSizeK; - using IterA = typename OpA::GmemIter; - using IterB = typename OpB::GmemIter; - desc_.align.x = OpA::kOrder == kColMajor ? IterA::ThreadMap::kAccessC : 1; desc_.align.y = OpB::kOrder == kColMajor ? 
IterB::ThreadMap::kAccessC : 1; desc_.align.z = Gemm::CTA_K; @@ -73,21 +82,21 @@ class KernelImpl: public Kernel { smem_size_ = sizeof(typename Gemm::SharedStorage); desc_.stages = Impl::Stages; - desc_.split_k = Gemm::SplitK; + desc_.split_k = Gemm::kSplitK; + desc_.sched = Gemm::kDynamicSched; desc_.arch = Gemm::Arch::value; - using Params = typename Gemm::Param; using CtaMap = typename Gemm::CtaMap; - auto func = gemm_kernel; + auto func = gemm_kernel; if (smem_size_ > (48 << 10)) { cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_); } cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &desc_.max_active_ctas, gemm_kernel, Impl::WARPS * WARP_SIZE, smem_size_); + &desc_.max_active_ctas, func, Impl::WARPS * WARP_SIZE, smem_size_); cudaFuncGetAttributes(&desc_.attr, func); @@ -97,7 +106,7 @@ class KernelImpl: public Kernel { int Launch(const Operation& operation, float alpha, const void* A, - const MatrixLayout& Adesc, + const MatrixLayout& _Adesc, const void* U, const MatrixLayout& Udesc, const void* B, @@ -112,10 +121,12 @@ class KernelImpl: public Kernel { int swizzle, int splits, Workspace& workspace, - cudaStream_t stream) override + cudaStream_t stream) const override { using Map = typename Gemm::CtaMap; + MatrixLayout Adesc = _Adesc; + const int m = Ddesc.rows; const int n = Ddesc.cols; const int k = Adesc.cols; @@ -126,47 +137,36 @@ class KernelImpl: public Kernel { return x; }; - const MatrixLayout Bdesc = transpose(_Bdesc); - const MatrixLayout Vdesc = transpose(_Vdesc); - - const int chunk_cnt = ceil_div(k, Gemm::kChunkSizeK); - - // Limit splits by num of chunks to avoid chaos - splits = std::min(chunk_cnt, splits); - - auto tiles = Map::get_tiled_shape(m, n, k, CTA_M, CTA_N, splits); - - if (splits > 1) { - size_t bsize{}, psize{}; - GetWorkspaceSizes(m, n, tiles.x, tiles.y, splits, bsize, psize); - const int max_splits = GetMaxSplits(m, n, k, workspace.barriers_size, workspace.partials_size); - if (workspace.barriers_size < bsize || workspace.partials_size < psize) { - fprintf( - stderr, - "Problem size (%d, %d, %d), workspace size too small (%d, %d) vs required (%d, %d) for %d splits. 
Force `splits` = %d\n", - m, - n, - k, - (int)workspace.barriers_size, - (int)workspace.partials_size, - (int)bsize, - (int)psize, - splits, - max_splits); - splits = max_splits; - tiles = Map::get_tiled_shape(m, n, k, CTA_M, CTA_N, splits); + MatrixLayout Bdesc = transpose(_Bdesc); + MatrixLayout Vdesc = transpose(_Vdesc); + + auto sched = [&] { + if constexpr (Gemm::kDynamicSched) { + LaunchSpec spec{(Kernel*)this}; + spec.splits = splits; + spec.swizzle = swizzle; + return Map{operation.context->Schedule(spec)}; } - } + else { + const int chunk_cnt = ceil_div(k, Gemm::kChunkSizeK); + // Limit splits by num of chunks to avoid chaos + splits = std::min(chunk_cnt, splits); - swizzle = Map::get_log_tile(tiles, 1 << swizzle); + const int2 tiles = get_tiled_shape(m, n, CTA_M, CTA_N); + const int4 shape{m, n, k, 1}; - const auto grid = Map::get_grid_shape(tiles, swizzle); - const auto block = Gemm::Impl::WARPS * WARP_SIZE; + if (splits > 1) { + splits = FixSplits(shape, tiles, splits, workspace); + } + + swizzle = Map::get_log_tile(tiles, 1 << swizzle); + + return Map{shape, tiles, splits, swizzle, CTA_K, Gemm::kChunkSizeK}; + } + }(); using Ta = typename Gemm::Ta; using Tb = typename Gemm::Tb; - using Tu = typename Gemm::Tu; - using Tv = typename Gemm::Tv; using Tc = typename Gemm::Tc; if constexpr (0) { @@ -196,125 +196,107 @@ class KernelImpl: public Kernel { }(); } - int lda = Adesc.ld; - int ldb = Bdesc.ld; + const bool silu_act = ((int)operation.epilogue & (int)Epilogue::kGatedSilu); - if (Gemm::kPackA) { - lda = mk2cs(Packing_v2::apply({m, k})).x; - } - if (Gemm::kPackB) { - ldb = mk2cs(Packing_v2::apply({n, k})).x; - } + MatrixLayout Pdesc = Ddesc; + Pdesc.ld = mk2cs(Pdesc.rows, Pdesc.cols).x; - // std::cout << "lda=" << lda << ", ldb=" << ldb << ", ldc=" << Cdesc.ld << "\n"; + MatrixCombination_v3 combin_mat{to_param((void*)C, Cdesc), alpha, beta}; - // std::cout << "C: " << C << ", D: " << D << "\n"; + EpilogueParam epilogue{to_param((void*)D, Ddesc), + to_param((void*)workspace.partials, Pdesc), + (int*)workspace.barriers, + combin_mat, + silu_act}; - const bool silu_act = ((int)operation.epilogue & (int)Epilogue::kGatedSilu); + // std::cout << Adesc.offsets << " " << Adesc.idxs << "\n"; + + GemmParam param{ + to_param((void*)A, Adesc), + to_param((void*)B, Bdesc), + to_param((void*)U, Udesc), + to_param((void*)V, Vdesc), + }; - const int partial_C_ld = mk2cs(Ddesc.rows, Ddesc.cols).x; - - EpilogueParam epilogue{m, - n, - (Tc*)D, - Ddesc.ld, - (float*)workspace.partials, - partial_C_ld, - (int*)workspace.barriers, - {alpha, beta, (const Tc*)C, Cdesc.ld}, - silu_act}; - - const int chunk_per_split = chunk_cnt / splits; - const int chunk_remianing = chunk_cnt % splits; - const int chunk_offset = splits - chunk_remianing; - // chunk_id = z * chunk_per_split + max(z - (splits - chunk_remaining), 0); - // offset_k = chunk_id * kChunkSizeK; - // gemm_k_size = offset_k + (chunk_per_split + int(z > chunk_offset)) * kChunkSizeK - // gemm_k_size = std::min(gemm_k_size, k) - offset_k - - // std::cout << k << " " << Gemm::kChunkSizeK << " " << splits << " " << chunk_per_split << " " << - // chunk_remianing << " " << chunk_offset << "\n"; - - typename Gemm::Param param{m, - n, - k, - typename Gemm::PtrA{(Ta*)A}, - lda, - (Tu*)U, - Udesc.ld, - typename Gemm::PtrB{(Tb*)B}, - ldb, - (Tv*)V, - Vdesc.ld, - swizzle, - tiles, - chunk_per_split, - chunk_offset, - epilogue}; - - gemm_kernel<<>>(param, Map{}); + const auto grid = sched.get_grid_shape(); + const auto block = Gemm::Impl::WARPS * WARP_SIZE; 
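// Worked example of the static-scheduling branch above (placeholder sizes:
// CTA_M = CTA_N = 128 and Gemm::kChunkSizeK = 128 are assumptions for the
// arithmetic, not the parameters of any particular kernel in this patch):
//   m = n = k = 4096, requested splits = 8
//     chunk_cnt = ceil_div(4096, 128) = 32  ->  splits = min(32, 8) = 8
//     tiles     = get_tiled_shape(4096, 4096, 128, 128) = (32, 32)
//   FixSplits then re-checks the barrier/partials workspace and falls back to
//   GetMaxSplits(...) if it is too small (see the helpers later in this file),
//   and the swizzle is clamped via Map::get_log_tile(tiles, 1 << swizzle).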
- return 0; - } + // std::cout << grid.x << " " << grid.y << " " << grid.z << "\n"; - template - static auto _cast(T* p) - { - if constexpr (bitsof % 8 == 0) { - return p; - } - else { - return (char*)p; - } + gemm_kernel<<>>(param, epilogue, sched); + + return 0; } - // ! This assumes N results in 16 byte aligned partials - void - GetWorkspaceSizes(int m, int n, int tiled_m, int tiled_n, int splits, size_t& barriers_size, size_t& partials_size) + std::array GetWorkspaceSizesV2(const int4& shape, int tiles, int splits) const { static constexpr bool kSerial = true; - partials_size = sizeof(float) * m * n; - barriers_size = sizeof(int) * tiled_m * tiled_n; + const auto& [m, n, _, num] = shape; + + size_t barriers_size = sizeof(int) * tiles; + size_t partials_size = sizeof(float) * m * n * num; if constexpr (!kSerial) { - partials_size *= splits; barriers_size *= splits; + partials_size *= splits; } + + return {barriers_size, partials_size}; } - int GetMaxSplits(int m, int n, int k, size_t barrier_size, size_t partials_size) override + int GetMaxSplits(const int4& shape, int64_t tiles, size_t bsize, size_t psize) const override { - if (!Gemm::SplitK) { // kernel has no split-k support + if (!Gemm::kSplitK) { return 1; } - const int tiled_m = ceil_div(m, CTA_M); - const int tiled_n = ceil_div(n, CTA_N); - - size_t bsize_1split{}; - size_t psize_1split{}; + const auto& [m, n, k, _] = shape; - // workspace for 1 non-trival split - GetWorkspaceSizes(m, n, tiled_m, tiled_n, 1, bsize_1split, psize_1split); + const auto& [a, b] = GetWorkspaceSizesV2(shape, tiles, 1); - if (barrier_size >= bsize_1split && partials_size >= psize_1split) { + if (bsize >= a && psize >= b) { // Serial split-k requires workspace for 1 split only // But it can't exceed num of k chunks - const int chunk_cnt = ceil_div(k, Gemm::kChunkSizeK); - return std::min(chunk_cnt, 32); + return cdiv(k, Gemm::kChunkSizeK); } else { return 1; } } - int GetSwizzle(int m, int n, int k, int splits, int swizzle) override + int GetSwizzle(int m, int n, int k, int splits, int swizzle) const override { using Map = typename Gemm::CtaMap; - const auto tiles = Map::get_tiled_shape(m, n, k, CTA_M, CTA_N, splits); + const auto tiles = get_tiled_shape(m, n, CTA_M, CTA_N); return Map::get_log_tile(tiles, 1 << swizzle); } + + int FixSplits(const int4& shape, int2 tiled_mn, int splits, Workspace& ws) const + { + const int tiles = tiled_mn.x * tiled_mn.y; + const auto& [bsize, psize] = GetWorkspaceSizesV2(shape, tiles, splits); + + if (ws.barriers_size < bsize || ws.partials_size < psize) { + const int max_splits = GetMaxSplits(shape, tiles, ws.barriers_size, ws.partials_size); + const auto& [m, n, k, num] = shape; + fprintf( + stderr, + "Problem size (%d, %d, %d), workspace size too small (%d, %d) vs required (%d, %d) for %d splits. 
Force `splits` = %d\n", + m, + n, + k, + (int)ws.barriers_size, + (int)ws.partials_size, + (int)bsize, + (int)psize, + splits, + max_splits); + splits = max_splits; + } + + return splits; + } }; } // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/matrix_ptr.h b/src/turbomind/kernels/gemm/matrix_ptr.h new file mode 100644 index 0000000000..8b70de1e51 --- /dev/null +++ b/src/turbomind/kernels/gemm/matrix_ptr.h @@ -0,0 +1,112 @@ +#pragma once + +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +struct __align__(16) StridedPtr +{ + void* ptr; + int stride; +}; + +struct MatrixParam { + void* ptr; + int stride; + int* offsets; + int* idxs; +}; + +struct MatrixData { + StridedPtr ptr; + const int* idxs; +}; + +inline MatrixParam to_param(void* ptr, MatrixLayout layout) +{ + return {ptr, layout.ld, layout.offsets, layout.idxs}; +} + +#if 0 +template +__inline__ __device__ MatrixData resolve(const MatrixParam& param, int gemm_id) +{ + if constexpr (mode == Striding::kFlat) { + return {{param.ptr, param.stride}, nullptr}; + } + else if constexpr (mode == Striding::kBlocked) { + StridedPtr ptr{param.ptr, param.stride}; + if (param.stride == 0) { + (uint4&)ptr = __ldg((const uint4*)param.ptr + gemm_id); + } + return {ptr, nullptr}; + } + else if constexpr (mode == Striding::kIndexed) { + const uintptr_t idx = param.idxs ? __ldg((uintptr_t*)param.idxs + gemm_id) : 0; + StridedPtr ptr{param.ptr, param.stride}; + if (param.stride == 0) { + (uint4&)ptr = __ldg((const uint4*)param.ptr + gemm_id); + } + return {ptr, reinterpret_cast(idx)}; + } + else { + static_assert(mode != mode, "Not implemented."); + return {}; + } +} +#endif + +template +__inline__ __device__ MatrixData resolve(const MatrixParam& param, int g) +{ + StridedPtr ptr{param.ptr, param.stride}; + const int* idxs{}; + if constexpr (mode == Striding::kFlat) { + // pass + } + else if constexpr (mode == Striding::kBlocked) { + if (ptr.stride == 0) { + (uint4&)ptr = __ldg((const uint4*)param.ptr + g); + } // Post-condition: ptr.stride != 0 + if (param.offsets) { + ptr.ptr = (char*)ptr.ptr + __ldg(param.offsets + g) * (size_t)ptr.stride * bitsof / bitsof; + } + } + else if constexpr (mode == Striding::kIndexed) { + idxs = param.idxs; + if (ptr.stride == 0) { + (uint4&)ptr = __ldg((const uint4*)param.ptr + g); + idxs = idxs ? ((int**)idxs)[g] : nullptr; + } // Post-condition: ptr.stride != 0 + if (param.offsets) { + const int offset = __ldg(param.offsets + g); + if (idxs) { + idxs += offset; + } + else { + ptr.ptr = (char*)ptr.ptr + offset * (size_t)ptr.stride * bitsof / bitsof; + } + } + } + else { + static_assert(mode != mode, "Not implemented."); + } + return {ptr, idxs}; +} + +// p <- dat_ptrs[g] +// i <- idx_ptrs[g] + +// pitch offset idxs +// 1 0 0 -> {ptr, pitch} , 0 +// 1 0 1 -> {ptr, pitch} , idxs +// 1 1 0 -> {ptr, pitch} + o[g], 0 +// 1 1 1 -> {ptr, pitch} , idxs + o[g] +// 0 0 0 -> p , 0 +// 0 0 1 -> p , i +// 0 1 0 -> p + o[g], 0 +// 0 1 1 -> p , i + o[g] + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu new file mode 100644 index 0000000000..acf6355856 --- /dev/null +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -0,0 +1,475 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
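The mode table at the end of matrix_ptr.h is easier to read next to a host-side paraphrase of resolve(). The sketch below works in bytes instead of bits and uses plain loads instead of __ldg; the *_H names are invented here so they do not collide with the real types, and the real struct layouts are the ones defined in this header.

#include <cstddef>
#include <cstdint>

enum class StridingH { kFlat, kBlocked, kIndexed };

struct StridedPtrH  { void* ptr; int stride; };
struct MatrixParamH { void* ptr; int stride; const int* offsets; const int* idxs; };
struct MatrixDataH  { StridedPtrH ptr; const int* idxs; };

// For group `g`:
//  - stride != 0     : `ptr` is the matrix itself
//  - stride == 0     : `ptr` is an array of per-group {ptr, stride} descriptors
//  - offsets != null : group g starts offsets[g] rows into the buffer (blocked)
//                      or offsets[g] entries into its index list (indexed)
//  - idxs != null    : rows are gathered through an index list instead of being contiguous
template<StridingH mode>
MatrixDataH resolve_host(const MatrixParamH& p, int g, std::size_t elem_bytes)
{
    StridedPtrH ptr{p.ptr, p.stride};
    const int*  idxs{};
    if constexpr (mode == StridingH::kFlat) {
        // pass: plain pointer + stride
    }
    else if constexpr (mode == StridingH::kBlocked) {
        if (ptr.stride == 0) {                       // per-group descriptor array
            ptr = reinterpret_cast<const StridedPtrH*>(p.ptr)[g];
        }
        if (p.offsets) {                             // group g starts offsets[g] rows in
            ptr.ptr = (char*)ptr.ptr + (std::size_t)p.offsets[g] * ptr.stride * elem_bytes;
        }
    }
    else if constexpr (mode == StridingH::kIndexed) {
        idxs = p.idxs;
        if (ptr.stride == 0) {                       // per-group descriptors + per-group index lists
            ptr  = reinterpret_cast<const StridedPtrH*>(p.ptr)[g];
            idxs = idxs ? reinterpret_cast<const int* const*>(p.idxs)[g] : nullptr;
        }
        if (p.offsets) {
            if (idxs) { idxs += p.offsets[g]; }      // offset applies to the index list
            else      { ptr.ptr = (char*)ptr.ptr + (std::size_t)p.offsets[g] * ptr.stride * elem_bytes; }
        }
    }
    return {ptr, idxs};
}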
+ +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/moe_utils_v2.h" + +namespace turbomind { + +template +__global__ void MoeGateKernel_V2(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [E,n] + int log_tile, + int tiles, + int tokens, + int tokens_padded, + int experts) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + + // Brute-force per thread top-k using a flat thread mapping + const int ti = threadIdx.x + blockIdx.x * blockDim.x; + + // Clear masks + for (int e = 0; e < experts; ++e) { + if (ti < tokens_padded) { + masks[e * tokens_padded + ti] = -1; + } + } + + __shared__ int shared_accum[32][max_tiles]; + + for (int i = threadIdx.x; i < experts * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < experts && t < tiles) { + shared_accum[e][t] = 0; + } + } + + __syncthreads(); + + if (ti < tokens) { + + static_assert(top_k <= 32); + int mask = -1; + + float max_logit = 0.f; + + // Find top-k + PRAGMA_UNROLL + for (int k = 0; k < top_k; ++k) { + int max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + int bit = 1; + for (int e = 0; e < experts; ++e) { + const auto val = logits[ti * experts + e]; + // const auto val = logits[e * tokens + ti]; + if ((mask & bit) && val > max_val) { + max_bit = bit; + max_val = val; + } + bit *= 2; + } + mask -= max_bit; + if (k == 0) { + max_logit = max_val; + } + } + + mask = ~mask; + + Array top_val; + PRAGMA_UNROLL + for (int i = 0; i < top_k; ++i) { + const int lowbit = (mask & -mask); + const int e = 31 - __clz(lowbit); + + masks[e * tokens_padded + ti] = i; + atomicAdd(&shared_accum[e][ti >> log_tile], 1); + top_val[i] = logits[ti * experts + e]; + // top_val[i] = logits[e * tokens + ti]; + + mask -= lowbit; + } + + float prob_sum = 0.f; + PRAGMA_UNROLL + for (int i = 0; i < top_k; ++i) { + top_val[i] = expf(top_val[i] - max_logit); + prob_sum += top_val[i]; + } + + PRAGMA_UNROLL + for (int i = 0; i < top_k; ++i) { + scales[i * tokens + ti] = fdividef(top_val[i], prob_sum); + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < experts * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < experts && t < tiles) { + atomicAdd(accum + e * tiles + t, shared_accum[e][t]); + } + } +} + +template +__global__ void MoeScanKernel_V2(int* f2n, // [e*n] + int* en2f, // [e,n] + int* offsets, // [E+1] + int* masks, // [E,n], padded + const int* accum, // [E,tiles] + int log_tile, + int tiles, + int tokens, + int tokens_padded, + int experts) +{ + using BlockReduce = cub::BlockReduce; + using BlockScan = cub::BlockScan; + + __shared__ union TempStorage { + typename BlockReduce::TempStorage reduce; + typename BlockScan::TempStorage scan; + } temp_storage; + + constexpr int vec_size = kMoeGateVecSize; + + using Vec = Array; + + const int tile_id = blockIdx.x; + const int ei = blockIdx.y; + + const int global_tile_id = ei * tiles + tile_id; + + int vacc[4]{}; + { + int idx = threadIdx.x; + PRAGMA_UNROLL + for (int i = 0; i < 4; ++i) { + if (idx < global_tile_id) { + vacc[i] = accum[idx]; + } + idx += block_dim; + } + } + + int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); + + __shared__ int shared_offset; + + if (threadIdx.x == 0) { + shared_offset = offset; + if (tile_id == 0) { + 
offsets[ei] = offset; + } + } + + if (ei == experts) { + return; + } + + __syncthreads(); + + offset = shared_offset; + + const int token_vecs = tokens_padded / vec_size; + + const int tile_size = 1 << log_tile; + const int tile_vec_size = tile_size / vec_size; + + const int tile_vec_beg = tile_id * tile_vec_size; + const int tile_vec_end = std::min(tile_vec_beg + tile_vec_size, token_vecs); + const int tile_vec_padded = tile_vec_beg + round_up(tile_vec_size, block_dim); + + // if (threadIdx.x == 0) { + // printf("%d %d %d\n", tile_vec_beg, tile_vec_end, tile_vec_padded); + // } + + auto mask_ptr = (Vec*)masks + ei * token_vecs; + + for (int vi = tile_vec_beg + threadIdx.x; vi < tile_vec_padded; vi += block_dim) { + + const bool pred = vi < tile_vec_end; + + Vec data; + fill(data, -1); + if (pred) { + Ldg(data, mask_ptr[vi].data()); + } + + int prefix[vec_size]; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + prefix[i] = int(data[i] >= 0); + } + + int block_sum = 0; + + BlockScan{temp_storage.scan}.ExclusiveSum(prefix, prefix, block_sum); + __syncthreads(); + + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + if (pred && data[i] >= 0) { + const int flat_id = prefix[i] + offset; + const int ti = vi * vec_size + i; + f2n[flat_id] = ti; + // No ti is generated for padded tokens so we are safe + en2f[data[i] * tokens + ti] = flat_id; + } + } + + offset += block_sum; + } +} + +void invokeMoeGate_V2(int* f2n, // [e*n] -> n + int* en2f, // [e,n] -> n*e + int* offsets, // [E+1] + float* scales, // [e,n] + int* masks, // [E,n] + int* accum, // [E] + const float* logits, // [e,n] + int tokens, // n + int tokens_padded, // round_up(n, 4) + int experts, // E + int experts_per_token, + cudaStream_t st) +{ + constexpr int base_log_tile = 9; + + int log_tile = base_log_tile; + while (((tokens_padded + (1 << log_tile) - 1) >> log_tile) > kMoeGateMaxTiles) { + ++log_tile; + } + const int tiles = ceil_div(tokens_padded, 1 << log_tile); + + // std::cout << log_tile << " " << tiles << "\n"; + + { + constexpr int threads = 128; + const int blocks = ceil_div(tokens, threads); + + auto invoke = [&](auto e) { + static constexpr int top_k = decltype(e)::value; + MoeGateKernel_V2<<>>( // + scales, + masks, + accum, + logits, + log_tile, + tiles, + tokens, + tokens_padded, + experts); + }; + + switch (experts_per_token) { + case 2: + invoke(std::integral_constant{}); + break; + // case 4: + // invoke(std::integral_constant{}); + // break; + default: + std::cerr << __FILE__ << ":" << __LINE__ << " Not implemented. 
" << std::endl; + std::abort(); + } + } + + // return; + + { + // Check: tiles * experts <= threads + + constexpr int threads = (1 << base_log_tile) / kMoeGateVecSize; + const dim3 blocks(tiles, experts + 1); + MoeScanKernel_V2<<>>(f2n, // + en2f, + offsets, + masks, + accum, + log_tile, + tiles, + tokens, + tokens_padded, + experts); + } +} + +template +__global__ void MoeGatherKernel(T* dst, // [e*n, d] + const T* src, // [ n, d] + const int* f2n, // [e*n] :: e*n -> n + int dims) +{ + using Vec = Array; + const int64_t bi = blockIdx.x; + + auto src_ptr = (const Vec*)src + dims * f2n[bi]; + auto dst_ptr = (/* */ Vec*)dst + dims * bi; + for (int i = threadIdx.x; i < dims; i += block_dim) { + Vec v; + Ldg(v, src_ptr[i].data()); + Store(dst_ptr[i].data(), v); + } +} + +template +void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) +{ + constexpr int threads = 256; + constexpr int vec_size = 16 / sizeof(T); + MoeGatherKernel<<>>( // + dst, + src, + f2n, + dims / vec_size); +} + +template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); + +template +__global__ void MoeReduceKernel(T* dst, // [ n, d] + const T* src, // [e*n, d] + const float* scales, // [ e, n] + const int* en2f, // [ e, n] :: (e,n) -> e*n + int dims, + int tokens) +{ + using Vec = Array; + + const int64_t ti = blockIdx.x; + + auto dst_ptr = (Vec*)dst + dims * ti; + + // Should be warp uniforms + const Vec* src_ptr[exp_k]; + float scale[exp_k]; + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + src_ptr[e] = (const Vec*)src + dims * en2f[e * tokens + ti]; + scale[e] = scales ? scales[e * tokens + ti] : 1.f; + } + + for (int i = threadIdx.x; i < dims; i += block_dim) { + Array accum{}; + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + Vec v; + Ldg(v, src_ptr[e][i].data()); + using namespace ops; + const auto x = cast(v) * scale[e]; + accum = accum + x; + } + Store(dst_ptr[i].data(), cast(accum)); + } +} + +template +void invokeMoeReduce(T* dst, + const T* src, + const float* scales, + const int* en2f, + int tokens, + int experts_per_token, + int dims, + cudaStream_t st) +{ + // std::cout << __PRETTY_FUNCTION__ << std::endl; + + const auto invoke = [&](auto e) { + constexpr int threads = 256; + constexpr int vec_size = 16 / sizeof(T); + constexpr int exp_per_tok = decltype(e)::value; + MoeReduceKernel<<>>( // + dst, + src, + scales, + en2f, + dims / vec_size, + tokens); + }; + + switch (experts_per_token) { + case 1: + return invoke(std::integral_constant{}); + case 2: + return invoke(std::integral_constant{}); + // case 4: + // return invoke(std::integral_constant{}); + // case 6: + // return invoke(std::integral_constant{}); + default: + fprintf(stderr, "Unsupported experts_per_token %d\n", experts_per_token); + std::abort(); + } +} + +template void invokeMoeReduce(half*, const half*, const float*, const int*, int, int, int, cudaStream_t); +#ifdef ENABLE_BF16 +template void invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, int, int, int, cudaStream_t); +#endif + +std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) +{ + std::vector idxs((size_t)token_num * exp_per_tok); + std::vector r(expert_num); + std::iota(r.begin(), r.end(), 0); + auto it = idxs.begin(); + for (int i = 0; i < token_num; ++i) { + it = std::sample(r.cbegin(), r.cend(), it, exp_per_tok, g); + } + return idxs; +} + +std::vector SampleBalanced(int token_num, int expert_num, int 
exp_per_tok, std::mt19937& g) +{ + assert(exp_per_tok <= expert_num); + std::vector idxs((size_t)token_num * exp_per_tok); + std::vector q; + + std::vector r(expert_num); + std::iota(r.begin(), r.end(), 0); + + auto it = idxs.begin(); + for (int i = 0; i < token_num; ++i) { + if ((int)q.size() < exp_per_tok) { + const int k = q.size(); + // prepend the experts: [xxx] -> [yyy | xxx] + q.insert(q.begin(), r.cbegin(), r.cend()); + // move duplicated experts to the front: [yyy | xxx] -> [xxx' | yyy' | xxx] + int p = 0; + std::for_each(q.cend() - k, q.cend(), [&](auto x) { std::swap(q[p++], q[x]); }); + // shuffle unique experts yyy' + std::shuffle(q.begin() + p, q.end() - k, g); + } + it = std::copy(q.end() - exp_per_tok, q.end(), it); + // remove used experts [xxx' | yyy' | xxx ] -> [xxx' | zzz] + q.resize(q.size() - exp_per_tok); + // alias [xxx] <- [xxx' | zzz] + } + assert(it == idxs.end()); + + // shuffle to decorrelate adjacent tokens + r.resize(token_num); + std::iota(r.begin(), r.end(), 0); + std::shuffle(r.begin(), r.end(), g); + std::vector ret(idxs.size()); + it = ret.begin(); + for (const auto& i : r) { + it = std::copy_n(idxs.begin() + i * exp_per_tok, exp_per_tok, it); + } + assert(it == ret.end()); + return ret; +} + +} // namespace turbomind diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h new file mode 100644 index 0000000000..334e2de272 --- /dev/null +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -0,0 +1,62 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include +#include +#include +#include + +namespace turbomind { + +constexpr int kMoeGateMaxTiles = 16; +constexpr int kMoeGateVecSize = 4; + +void invokeMoeGate_V2(int* f2n, + int* en2f, + int* offsets, + float* scales, + int* masks, + int* accum, + const float* logits, + int tokens, + int tokens_padded, + int experts, + int exp_per_tok, + cudaStream_t st); + +template +void invokeMoeGather( + T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st); + +template +inline void +dispatchMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) +{ + const auto invoke = [&](auto type) { + using V = decltype(type); + invokeMoeGather((V*)dst, (const V*)src, f2n, tokens, experts_per_token, dims, st); + }; + + if constexpr (sizeof(T) == 2) { + invoke(uint16_t{}); + } + else { /// TODO: dispatch for more types + static_assert(sizeof(T) != sizeof(T), "Not implemented"); + } +} + +template +void invokeMoeReduce(T* dst, + const T* src, + const float* scales, + const int* en2f, + int tokens, + int experts_per_token, + int dims, + cudaStream_t st); + +// Sample `e` from `E` experts uniformly for every token +std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g); + +std::vector SampleBalanced(int token_num, int expert_num, int exp_per_tok, std::mt19937& g); + +} // namespace turbomind diff --git a/src/turbomind/kernels/gemm/registry.cu b/src/turbomind/kernels/gemm/registry.cu index da3d2923c5..53748707fc 100644 --- a/src/turbomind/kernels/gemm/registry.cu +++ b/src/turbomind/kernels/gemm/registry.cu @@ -14,25 +14,36 @@ Registry::Registry(std::shared_ptr device_prop): f16_u4g128_f16_tnt_sm80_s16816(); f16_u4g128_f16_tnt_sm90_s16816(); - u4g128_f16_f16_nnn_sm80_s16816(); + sm70_s884_dynamic(); + sm75_s16816_dynamic(); + sm80_s16816_dynamic(); + sm90_s16816_dynamic(); + sm80_s16816_dynamic(); + sm90_s16816_dynamic(); + + // 
u4g128_f16_f16_nnn_sm80_s16816(); } bool Registry::Add(std::unique_ptr kernel) { + bool is_valid = true; if (!is_arch_compatible(kernel->arch(), arch_)) { - return false; + is_valid = false; } if ((int)device_prop_->sharedMemPerBlockOptin < kernel->smem_size()) { - return false; + is_valid = false; + } + // if (is_valid) { + // std::cout << "register: " << kernel->name() // + // << ", shared: " << (kernel->smem_size() >> 10) << " KB" // + // << ", regs: " << kernel->desc().attr.numRegs // + // << ", local: " << (float)kernel->desc().attr.localSizeBytes << " bytes" // + // << ", max_active_ctas: " << kernel->desc().max_active_ctas * is_valid << " \n"; + // } + if (is_valid) { + kernels_.push_back(std::move(kernel)); + ptrs_.push_back(kernels_.back().get()); } - // std::cout << "register: " << kernel->name() // - // << ", shared: " << (kernel->smem_size() >> 10) << " KB" // - // << ", regs: " << kernel->desc().attr.numRegs // - // << ", local: " << (float)kernel->desc().attr.localSizeBytes << " bytes" // - // << ", max_active_ctas: " << kernel->desc().max_active_ctas << " \n"; - - kernels_.push_back(std::move(kernel)); - ptrs_.push_back(kernels_.back().get()); return true; } diff --git a/src/turbomind/kernels/gemm/registry.h b/src/turbomind/kernels/gemm/registry.h index 401325cdda..867a5f37d6 100644 --- a/src/turbomind/kernels/gemm/registry.h +++ b/src/turbomind/kernels/gemm/registry.h @@ -31,6 +31,13 @@ class Registry { void f16_u4g128_f16_tnt_sm80_s16816(); void f16_u4g128_f16_tnt_sm90_s16816(); + void sm70_s884_dynamic(); + void sm75_s16816_dynamic(); + template + void sm80_s16816_dynamic(); + template + void sm90_s16816_dynamic(); + void u4g128_f16_f16_nnn_sm80_s16816(); private: diff --git a/src/turbomind/kernels/gemm/test/gemm_bench.cu b/src/turbomind/kernels/gemm/test/gemm_bench.cu index 216d6d9d83..f6e2a624de 100644 --- a/src/turbomind/kernels/gemm/test/gemm_bench.cu +++ b/src/turbomind/kernels/gemm/test/gemm_bench.cu @@ -4,6 +4,7 @@ #include "src/turbomind/kernels/gemm/operand.h" #include "src/turbomind/kernels/gemm/test/models.h" #include "src/turbomind/kernels/gemm/test/testbed.h" +#include #include #include #include @@ -15,6 +16,9 @@ void gemm_bench(nvbench::state& state) const auto bs = state.get_int64("bs"); const auto tp = state.get_int64("tp"); + const auto expert_num = state.get_int64("e_num"); + const auto exp_per_tok = state.get_int64("e_tok"); + auto [output_dims, input_dims] = config[idx]; constexpr int group_size = 128; @@ -43,35 +47,41 @@ void gemm_bench(nvbench::state& state) std::swap(m, n); } std::cerr << "m" << m << "n" << n << "k" << k << "\n"; - get_test().Initialize(m, n, k, group_size, state.get_cuda_stream()); + + get_test().Initialize(m, n, k, group_size, expert_num, exp_per_tok, state.get_cuda_stream()); } - state.add_element_count((size_t)bs * output_dims * input_dims * 2); // mul + add + state.add_element_count(get_test().get_element_count()); // state.collect_dram_throughput(); // state.collect_l2_hit_rates(); if constexpr (1) { - state.add_global_memory_reads(get_test().global_memory_reads()); + state.add_global_memory_reads(get_test().get_global_memory_reads()); get_test().Run(); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // get_test().Run(); }); } else { - state.add_global_memory_reads(sizeof(half) * (bs * input_dims + output_dims * input_dims)); + state.add_global_memory_reads(get_test().get_ref_global_memory_reads()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // get_test().RunCublas(); }); } + + 
get_test().ctx_.reset(); } NVBENCH_BENCH(gemm_bench) .add_int64_axis("idx", nvbench::range(0, (int)config.size() - 1)) - .add_int64_power_of_two_axis("bs", nvbench::range(0, 10)) - .add_int64_axis("tp", {1, 2, 4}); + .add_int64_power_of_two_axis("bs", nvbench::range(0, 14)) + .add_int64_axis("tp", {1, 2, 4}) + .add_int64_axis("e_num", {0}) + .add_int64_axis("e_tok", {1}); int main(int argc, char* argv[]) { NVBENCH_MAIN_BODY(argc, argv); + return 0; } diff --git a/src/turbomind/kernels/gemm/test/gemm_test.cu b/src/turbomind/kernels/gemm/test/gemm_test.cu index f783e7d284..85944816a9 100644 --- a/src/turbomind/kernels/gemm/test/gemm_test.cu +++ b/src/turbomind/kernels/gemm/test/gemm_test.cu @@ -10,6 +10,7 @@ #include "src/turbomind/kernels/gemm/test/quantization.h" #include "src/turbomind/kernels/gemm/test/test_utils.h" #include "src/turbomind/kernels/gemm/test/testbed.h" +#include "src/turbomind/kernels/gemm/tuner/cache_utils.h" #include "src/turbomind/kernels/gemm/types.h" #include #include @@ -40,7 +41,7 @@ void ComputeRefCpu(half* C, const half* A, const half* B, int m, int n, int k) static int g_check = 0; -void Run(int batch_size, int output_dims, int input_dims, int g = 128) +void Run(int batch_size, int output_dims, int input_dims, int expert_num = 0, int top_e = 1, int g = 128) { auto& test = get_test(); int m = batch_size; @@ -49,15 +50,19 @@ void Run(int batch_size, int output_dims, int input_dims, int g = 128) if (get_test().kBatchDim == 1) { std::swap(m, n); } - std::cerr << "m" << m << "n" << n << "k" << k << "\n"; - test.Initialize(m, n, k, g, 0); + std::cerr << "m" << m << "n" << n << "k" << k << "_" + << "E" << expert_num << "e" << top_e << "\n"; + test.Initialize(m, n, k, g, expert_num, top_e, 0); if (g_check) { test.Check(); } else { - for (int i = 0; i < 10; ++i) { + for (int i = 0; i < 100; ++i) { + CacheFlushing::flush(); test.Run(); + CacheFlushing::flush(); + test.RunCublas(); } test.CompareC(); } @@ -66,7 +71,16 @@ void Run(int batch_size, int output_dims, int input_dims, int g = 128) int main(int argc, char* argv[]) { g_check = 0; - Run(16384, 16384, 16384); + // Run(8192, 14336 * 2, 4096); + + // Run(16384, 16384, 16384); + // Run(18, 14336, 4096, 8, 2); + + // Run(16, 4096, 7168, 8, 2); + Run(777, 14336, 4096, 8, 2); + + // Run(256, 14336 * 2, 4096); + // Run(16, 4096, 14336); // g_check = 1; // std::vector bsz(1024); diff --git a/src/turbomind/kernels/gemm/test/models.h b/src/turbomind/kernels/gemm/test/models.h index 8a6260fef8..b4fca0fbe9 100644 --- a/src/turbomind/kernels/gemm/test/models.h +++ b/src/turbomind/kernels/gemm/test/models.h @@ -3,6 +3,7 @@ #pragma once #include +#include #include #include @@ -14,6 +15,15 @@ static const std::vector> config{ {18944 * 2, 3584}, {3584, 18944}, {4608, 3584}, {3584, 3584}, // qwen2-7b {20480 * 2, 7168}, {7168, 20480}, {9216, 7168}, {7168, 7168}, // yi-34b {28672 * 2, 8192}, {8192, 28672}, {10240, 8192}, {8192, 8192}, // llama2-70b / llama3-70b - {29696 * 2, 8192}, {8192, 29696}, {10240, 8192}, {8192, 8192} // qwen2-72b-instruct-awq + {29696 * 2, 8192}, {8192, 29696}, {10240, 8192}, {8192, 8192}, // qwen2-72b-instruct-awq + {14336 * 2, 4096}, {4096, 14336}, {6144, 4096}, {4096, 4096}, // mixtral-8x7b, E8e2 + {16384 * 2, 6144}, {6144, 16384}, {0, 0}, {0, 0}, // mixtral-8x22b, E8e2 + {1536 * 2, 5120}, {5120, 1536}, {0, 0}, {0, 0}, // deepseek-v2, E160e6 + {1536 * 2, 2048}, {2048, 1536}, {0, 0}, {0, 0}, // deepseek-v2-lite, E64e6 + {2560 * 2, 3840}, {3840, 2560}, {0, 0}, {0, 0}, // qwen2-a14b, E64e8 + {6400 * 2, 4096}, 
{4096, 6400}, {0, 0}, {0, 0}, // phi-3.5-MoE, E16e2 }; + +// static const std::map> moe_config{{32, {8, 2}}, {33, {8, 2}}}; + // {29568 * 2, 8192}, {8192, 29568}, {10240, 8192}, {8192, 8192}, // qwen2-72b diff --git a/src/turbomind/kernels/gemm/test/reference.cu b/src/turbomind/kernels/gemm/test/reference.cu index 591d8e6bc6..d1f7f34f64 100644 --- a/src/turbomind/kernels/gemm/test/reference.cu +++ b/src/turbomind/kernels/gemm/test/reference.cu @@ -40,6 +40,9 @@ cudaDataType to_cuda_dtype(DataType dtype) Reference::Reference() { cublasCreate(&handle_); + + // cublasSetWorkspace(handle_, nullptr, 0); + cublasSetMathMode(handle_, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); } Reference::~Reference() diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu new file mode 100644 index 0000000000..a311162193 --- /dev/null +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -0,0 +1,373 @@ +#include "src/turbomind/kernels/gemm/moe_utils_v2.h" +#include "src/turbomind/kernels/gemm/test/test_utils.h" +#include "src/turbomind/kernels/gemm/tuner/cache_utils.h" +#include "src/turbomind/kernels/gemm/types.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace turbomind; + +template +void print_vecs(const T* data, int m, int k, std::string msg, int width = 4) +{ + if (!msg.empty()) { + std::cout << msg << ":\n"; + } + for (int mm = 0; mm < m; ++mm) { + for (int kk = 0; kk < k; ++kk) { + std::cout << std::setw(width) << data[mm * k + kk]; + } + std::cout << "\n"; + } +} + +#if 0 +void func() +{ + using thrust::universal_vector; + + // clang-format off + std::vector h_logits{ + 8, 5, 1, 4, 3, 6, 2, 7, + 50, 60, 90, 20, 70, 71, 72, 73, + 0, 1, 0, 0, 0, 1, 0, 1, + 0, 0, 0, 1, 0, 0, 0, 2}; + // clang-format on + + h_logits.resize(8); + + // auto tmp = h_logits; + // for (int i = 0; i < 127; ++i) { + // h_logits.insert(h_logits.end(), tmp.begin(), tmp.end()); + // } + + universal_vector logits(h_logits.begin(), h_logits.end()); + + const int E = 8; + const int n = h_logits.size() / E; + const int e = 2; + + const int n_padded = (n + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + + universal_vector f2n(e * n); + universal_vector en2f(e * n); + universal_vector offsets(E + 1); + universal_vector accum(E * kMoeGateMaxTiles); + universal_vector scales(n * e); + universal_vector masks(E * n_padded); + + for (int i = 0; i < 10; ++i) { + gemm::CacheFlushing::flush(0); + cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); + invokeMoeGate_V2(f2n.data().get(), + en2f.data().get(), + offsets.data().get(), + scales.data().get(), + masks.data().get(), + accum.data().get(), + logits.data().get(), + n, + n_padded, + E, + e, + 0); + } + + auto err = cudaDeviceSynchronize(); + if (err) { + std::cerr << cudaGetErrorString(err) << "\n"; + } + + print_vecs(scales.data().get(), e, n, "scales", 12); + print_vecs(masks.data().get(), E, n_padded, "tmp"); + print_vecs(accum.data().get(), E, 1, "accum"); + print_vecs(offsets.data().get(), 1, E + 1, "offsets"); + print_vecs(f2n.data().get(), n * e, 1, "f2n"); + print_vecs(en2f.data().get(), e, n, "en2f"); +} +#endif + +RNG& gRNG() +{ + static RNG inst{}; + return inst; +} + +using thrust::universal_vector; + +void moe_gate_ref(int tokens, + int expert_num, + int experts_per_token, + const universal_vector& logits, + universal_vector& offsets, + universal_vector& eids, + universal_vector& f2n, + universal_vector& en2f, + universal_vector& scales) +{ + 
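// Buffer semantics of this reference implementation (E = expert_num, e = experts_per_token):
//   eids   [e, tokens] : expert id of the i-th choice of each token, choices kept in ascending expert order
//   scales [e, tokens] : softmax over that token's selected logits only
//   f2n    [e * tokens]: flattened slot -> source token, slots grouped by expert, token order preserved
//   en2f   [e, tokens] : (choice i, token t) -> flattened slot
//   offsets[E + 1]     : exclusive prefix sum of the number of tokens routed to each expert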
std::vector eid_range(expert_num); + std::iota(eid_range.begin(), eid_range.end(), 0); + + for (int t = 0; t < tokens; ++t) { + const float* logit = logits.data().get() + expert_num * t; + const float max_val = *std::max_element(logit, logit + expert_num); + if constexpr (0) { + std::vector probs(logit, logit + expert_num); + float sum = 0; + for (auto& p : probs) { + p = std::exp(p - max_val); + sum += p; + } + for (auto& p : probs) { + p /= sum; + } + std::vector idxs = eid_range; + // Had to use stable sort since there is no `std::stable_nth_element` + std::stable_sort(idxs.begin(), idxs.end(), [&](int i, int j) { // + return probs[i] > probs[j]; + }); + // Recover natural order in top-k + std::sort(idxs.begin(), idxs.begin() + experts_per_token); + idxs.resize(experts_per_token); + sum = 0; + for (int e = 0; e < experts_per_token; ++e) { + eids[e * tokens + t] = idxs[e]; + sum += probs[idxs[e]]; + } + for (int e = 0; e < experts_per_token; ++e) { + scales[e * tokens + t] = probs[idxs[e]] / sum; + } + } + else { + std::vector idxs = eid_range; + // Had to use stable sort since there is no `std::stable_nth_element` + std::stable_sort(idxs.begin(), idxs.end(), [&](int i, int j) { // + return logit[i] > logit[j]; + }); + // Recover natural order in top-k + std::sort(idxs.begin(), idxs.begin() + experts_per_token); + idxs.resize(experts_per_token); + std::vector probs(experts_per_token); + float sum = 0; + for (int e = 0; e < experts_per_token; ++e) { + eids[e * tokens + t] = idxs[e]; + probs[e] = std::exp(logit[idxs[e]] - max_val); + sum += probs[e]; + } + for (int e = 0; e < experts_per_token; ++e) { + scales[e * tokens + t] = probs[e] / sum; + } + } + } + + // f2en + std::vector f2en(eids.size()); + std::iota(f2en.begin(), f2en.end(), 0); + + std::stable_sort(f2en.begin(), f2en.end(), [&](int i, int j) { // + if (eids[i] != eids[j]) { + return eids[i] < eids[j]; + } + return i % tokens < j % tokens; + }); + + std::fill_n(offsets.begin(), offsets.size(), 0); + std::vector accum(expert_num); + + for (size_t i = 0; i < f2en.size(); ++i) { + f2n[i] = f2en[i] % tokens; + en2f[f2en[i]] = i; + ++accum[eids[i]]; + } + + for (size_t i = 1; i < offsets.size(); ++i) { + offsets[i] = offsets[i - 1] + accum[i - 1]; + } +} + +void mask2eids(const universal_vector& masks, universal_vector& eids, int tokens, int expert_num) +{ + const int tokens_padded = masks.size() / expert_num; + // std::cout << eids.size() << std::endl; + for (int e = 0; e < expert_num; ++e) { + for (int t = 0; t < tokens_padded; ++t) { + if (auto v = masks[e * tokens_padded + t]; v >= 0) { + // if (v >= 2 || t >= 8193) { + // std::cerr << "FUCK " << v << " " << t << std::endl; + // } + eids[v * tokens + t] = e; + } + } + } +} + +struct Tiling { + int output_dims; + int input_dims; + int3 cta_tile; +}; + +bool test_moe_gate(int tokens, // + int expert_num, + int experts_per_token, + gemm::Tape& tape, + const Tiling& tiling, + universal_vector logits = {}) +{ + if (logits.empty()) { + logits.resize(tokens * expert_num); + gRNG().GenerateUniform(logits.data().get(), logits.size()); + } + assert(logits.size() == tokens * expert_num); + + const int tokens_padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + // const int max_coords = get_max_coords(tokens, expert_num, experts_per_token, tiling); + + universal_vector offsets(expert_num + 1); + universal_vector accum(expert_num * kMoeGateMaxTiles); + universal_vector masks(expert_num * tokens_padded); + universal_vector eids(experts_per_token * tokens); + 
universal_vector f2n(experts_per_token * tokens); + universal_vector en2f(experts_per_token * tokens); + universal_vector scales(experts_per_token * tokens); + // universal_vector coords(max_coords); + // thrust::fill(coords.begin(), coords.end(), int2{-1, 0}); + + auto offsets_ref = offsets; + auto eids_ref = eids; + auto f2n_ref = f2n; + auto en2f_ref = en2f; + auto scales_ref = scales; + + moe_gate_ref(tokens, expert_num, experts_per_token, logits, offsets_ref, eids_ref, f2n_ref, en2f_ref, scales_ref); + + for (int i = 0; i < 10; ++i) { + cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); + invokeMoeGate_V2(f2n.data().get(), + en2f.data().get(), + offsets.data().get(), + scales.data().get(), + masks.data().get(), + accum.data().get(), + logits.data().get(), + tokens, + tokens_padded, + expert_num, + experts_per_token, + 0); + } + + // invokeMoeTiling(coords.data().get(), offsets.data().get(), expert_num, coords.size(), &tiling, 1, 0); + + // gemm::scheduleGemmMoe(tape, + // offsets.data().get(), + // tokens, + // experts_per_token, + // expert_num, + // tiling.output_dims, + // tiling.input_dims, + // tiling.cta_tile, + // tiling.cta_tile.z, + // 1, + // 0, + // 0); + + if (auto err = cudaDeviceSynchronize(); err != cudaSuccess) { + std::cerr << cudaGetErrorString(err) << std::endl; + std::abort(); + } + + // print_vecs(masks.data().get(), expert_num, tokens_padded, "masks"); + mask2eids(masks, eids, tokens, expert_num); + + bool success = true; + + // success = offsets == offsets_ref && eids == eids_ref && f2n == f2n_ref && en2f == en2f_ref; + + if (offsets != offsets_ref) { + std::cerr << "offset\n"; + success = false; + } + if (eids != eids_ref) { + std::cerr << "eids\n"; + success = false; + } + if (f2n != f2n_ref) { + std::cerr << "f2n\n"; + success = false; + } + if (en2f != en2f_ref) { + std::cerr << "en2f\n"; + success = false; + } + + if (!success || false) { + print_vecs(offsets_ref.data().get(), 1, expert_num + 1, "offsets_ref"); + print_vecs(offsets.data().get(), 1, expert_num + 1, "offsets"); + + print_vecs(eids_ref.data().get(), experts_per_token, tokens, "eids_ref"); + print_vecs(eids.data().get(), experts_per_token, tokens, "eids"); + + print_vecs(f2n_ref.data().get(), 1, experts_per_token * tokens, "f2n_ref"); + print_vecs(f2n.data().get(), 1, experts_per_token * tokens, "f2n"); + + print_vecs(en2f_ref.data().get(), experts_per_token, tokens, "en2f_ref"); + print_vecs(en2f.data().get(), experts_per_token, tokens, "en2f"); + + print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); + print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); + + print_vecs(accum.data().get(), expert_num, 1, "accum"); + + // print_vecs(coords.data().get(), 1, max_coords, "coords"); + + thrust::host_vector tile_offsets(tape.max_ctas); + std::cout << tape.max_ctas << std::endl; + cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + + std::cout << "coords:\n"; + int last = -1; + for (int i = 0; i < tape.max_ctas; ++i) { + auto& c = tile_offsets[i]; + if (last >= 0 && c.w != last) { + std::cout << "\n"; + } + if (c.w == -1) { + std::cout << i << "\n"; + break; + } + last = c.w; + std::stringstream ss; + ss << c.x << "," << c.y; + std::cout << std::setw(6) << ss.str(); + } + std::cout << "\n"; + } + + return success; +} + +int main() +{ + gemm::Tape tape{}; + constexpr Tiling tiling{14336, 128, {128, 128, 32}}; + + test_moe_gate(8192, 8, 2, tape, tiling); + 
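+    // NOTE: the early `return 0;` below skips the exhaustive sweep that follows;
+    // remove it to run test_moe_gate for every token count in [1, 16384).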
return 0; + + for (int i = 1; i < 16384; ++i) { + // std::cerr << i << std::endl; + auto success = test_moe_gate(i, 8, 2, tape, tiling); + if (!success) { + std::cerr << i << std::endl; + // std::abort(); + } + // break; + } +} diff --git a/src/turbomind/kernels/gemm/test/test_utils.h b/src/turbomind/kernels/gemm/test/test_utils.h index 00401fe7ef..a01533feb5 100644 --- a/src/turbomind/kernels/gemm/test/test_utils.h +++ b/src/turbomind/kernels/gemm/test/test_utils.h @@ -4,7 +4,9 @@ #include "src/turbomind/macro.h" #include +#include #include +#include #include namespace turbomind { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 6e586a18b0..6b1ec88f58 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -3,19 +3,27 @@ #pragma once #include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/context.h" #include "src/turbomind/kernels/gemm/desc.h" #include "src/turbomind/kernels/gemm/gemm.h" #include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/matrix_ptr.h" +#include "src/turbomind/kernels/gemm/moe_utils_v2.h" #include "src/turbomind/kernels/gemm/test/quantization.h" #include "src/turbomind/kernels/gemm/test/reference.h" #include "src/turbomind/kernels/gemm/test/test_utils.h" #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/kernels/gemm/utils.h" #include +#include #include #include #include +#include +#include +#include #include #include @@ -76,35 +84,39 @@ class Testbed { } void Initialize(int m, int n, int k, int g, cudaStream_t stream) + { + Initialize(m, n, k, g, 1, 1, stream); + } + + void Initialize(int m, int n, int k, int g, int experts, int top_e, cudaStream_t stream) noexcept { rng_.set_stream(stream); reference_.set_stream(stream); stream_ = stream; + cudaGetDeviceProperties(&prop_, 0); + m_ = m; n_ = n; k_ = k; + batch_size_ = batch_dim == 0 ? 
m_ : n_; + input_dims_ = k; + output_dims_ = (size_t)m_ * n_ / batch_size_; + + const size_t E = std::max(1, experts); + a_.resize(m * k); - b_.resize(n * k); + b_.resize(n * k * E); c_.resize(m * n); - a_desc_ = MatrixLayout{get_data_type_v, order_a, m, k, mk2cs(m, k).x}; - b_desc_ = MatrixLayout{get_data_type_v, order_b, k, n, _kn2cs(k, n).x}; - c_desc_ = MatrixLayout{get_data_type_v, order_c, m, n, mk2cs(m, n).x}; + a_desc_ = MatrixLayout{get_data_type_v, order_a, m, k, mk2cs(m, k).x, 0}; + b_desc_ = MatrixLayout{get_data_type_v, order_b, k, n, _kn2cs(k, n).x, 0}; + c_desc_ = MatrixLayout{get_data_type_v, order_c, m, n, mk2cs(m, n).x, 0}; c_f_.resize(c_.size()); c_ref_.resize(c_.size()); - // a_q_.resize(a_.size()); - // b_q_.resize(b_.size()); - - // u_.resize(a_.size()); - // v_.resize(b_.size()); - - // a_f_.resize(a_.size()); - // b_f_.resize(b_.size()); - /// TODO: Revise packed format a_pack_.resize(a_.size() / kVecSize); b_pack_.resize(b_.size() / kVecSize); @@ -157,12 +169,21 @@ class Testbed { if constexpr (is_quant_b) { static_assert(pack_b && pack_v); constexpr Order _order_b = transpose(order_b); - Quantize(b_, n, k, _order_b, g, b_f_, b_q_, v_, stream); - v_pack_desc_ = v_desc_ = {DataType::U32, kRowMajor, ceil_div(k, g), n, n}; + Quantize(b_, n * E, k, _order_b, g, b_f_, b_q_, v_, stream); + quant_b_ = {QuantType::kDefault, g}; + + v_pack_desc_ = v_desc_ = {DataType::U32, kRowMajor, ceil_div(k, g), n, int(n * E)}; v_pack_desc_.pack = pack_v; v_pack_.resize(v_.size()); - CHECK(!Convert(v_.data().get(), v_desc_, v_pack_.data().get(), v_pack_desc_, stream_)); - quant_b_ = {QuantType::kDefault, g}; + auto v_src_data = (uint32_t*)v_.data().get(); + auto v_dst_data = (uint32_t*)v_pack_.data().get(); + std::cout << "pre-pack: " << v_pack_desc_.ld << "\n"; + for (size_t e = 0; e < E; ++e) { + CHECK(!Convert(v_src_data, v_desc_, v_dst_data, v_pack_desc_, stream_)); + v_src_data += n; + v_dst_data += (size_t)v_desc_.rows * v_desc_.cols; + } + std::cout << "post-pack: " << v_pack_desc_.ld << "\n"; // cudaDeviceSynchronize(); @@ -184,10 +205,26 @@ class Testbed { } if constexpr (pack_b) { + // CHECK(experts == 0); b_pack_desc_.type = get_data_type_v; b_pack_desc_.pack = pack_b; - const auto b_data = is_quant_b ? 
(void*)b_q_.data().get() : (void*)b_.data().get(); - CHECK(!Convert(b_data, b_desc_, b_pack_.data().get(), b_pack_desc_, stream_)); + // clang-format off + auto b_src_data = [&] { + // MSVC does not recognise `is_quant_b` as compile time constant + constexpr bool is_quant = !std::is_same_v; + if constexpr (is_quant) return b_q_.data().get(); else return b_.data().get(); + }(); + // clang-format on + get_pointer_type b_dst_data{(Tb*)b_pack_.data().get()}; + const size_t numel = (size_t)b_desc_.rows * b_desc_.cols; + std::cout << "pre-pack: " << b_pack_desc_.ld << "\n"; + for (size_t e = 0; e < E; ++e) { + CHECK(!Convert((Tb*)b_src_data, b_desc_, (Tb*)b_dst_data, b_pack_desc_, stream_)); + // NOTE: This is not correct when b is quantized in n-major + b_src_data = b_src_data + numel; + b_dst_data = b_dst_data + numel; + } + std::cout << "post-pack: " << b_pack_desc_.ld << "\n"; // { // cudaDeviceSynchronize(); @@ -208,6 +245,152 @@ class Testbed { cudaMemcpyAsync( (Tb*)b_pack_.data().get(), b_.data().get(), sizeof(Tb) * b_.size(), cudaMemcpyDefault, stream); } + + // ctx_ = std::make_unique(prop_, stream_); + + InitMoE(batch_size_, experts, top_e); + } + + void InitMoE(int batch_size, int experts, int top_e) + { + experts_ = experts; + exp_per_tok_ = top_e; + + if (experts == 0) { + return; + } + + ctx_ = std::make_unique(experts_, top_e, prop_, stream_); + + std::vector r(experts); + std::iota(r.begin(), r.end(), 0); + + // Sample `top_e` experts per token + std::mt19937 g{}; + expert_ids_ = SampleBalanced(batch_size_, experts_, top_e, g); + + std::uniform_real_distribution dist(1e-3, 1.f); + std::vector tmp(top_e); + moe_scales_.resize(top_e * batch_size_); + for (int i = 0; i < batch_size_; ++i) { + float inv{}; + for (auto& x : tmp) { + x = dist(g); + inv += x; + } + inv = 1.f / inv; + for (int e = 0; e < top_e; ++e) { + moe_scales_[e * batch_size_ + i] = tmp[e] * inv; + } + } + + moe_cnt_.resize(experts); + std::fill_n(moe_cnt_.begin(), moe_cnt_.size(), 0); + std::vector> f2i(experts_); + for (int i = 0; i < (int)expert_ids_.size(); ++i) { + ++moe_cnt_[expert_ids_[i]]; + f2i[expert_ids_[i]].push_back(i); // i ~ [n, e] + } + + moe_m_offsets_.resize(experts_ + 1); + moe_m_offsets_[0] = 0; + for (int i = 0; i < experts_; ++i) { + moe_m_offsets_[i + 1] = moe_m_offsets_[i] + moe_cnt_[i]; + } + + moe_n_offsets_.resize(experts_ + 1); + moe_n_offsets_[0] = 0; + for (int i = 0; i < experts_; ++i) { + moe_n_offsets_[i + 1] = moe_n_offsets_[i] + output_dims_; + } + + if (1) { + moe_n_ptrs_.resize(experts_); + const size_t numel = (size_t)input_dims_ * output_dims_; + get_pointer_type p{(Tb*)b_pack_.data().get()}; + for (int i = 0; i < experts_; ++i) { + moe_n_ptrs_[i] = StridedPtr{static_cast(p + i * numel), b_pack_desc_.ld}; + } + } + if (1) { + moe_v_ptrs_.resize(experts_); + const size_t numel = (size_t)v_desc_.rows * v_desc_.cols; + const auto p = (uint32_t*)v_pack_.data().get(); + for (int i = 0; i < experts_; ++i) { + moe_v_ptrs_[i] = StridedPtr{p + i * numel, v_pack_desc_.ld}; + } + } + + std::cout << expert_ids_.size() << "\n"; + + // for (auto x : expert_ids_) { + // std::cout << x << " "; + // } + // std::cout << "\n"; + + for (auto x : moe_cnt_) { + std::cout << x << " "; + } + std::cout << "\n"; + + for (auto x : moe_m_offsets_) { + std::cout << x << " "; + } + std::cout << "\n"; + + for (auto x : moe_n_offsets_) { + std::cout << x << " "; + } + std::cout << "\n"; + + moe_f2n_.resize(expert_ids_.size()); + moe_f2en_.resize(expert_ids_.size()); + moe_en2f_.resize(expert_ids_.size()); + 
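+        // Build the index maps for the expert-major (gathered) layout:
+        //   moe_f2n_ : fused row (sorted by expert)      -> source token row
+        //   moe_en2f_: flattened [expert-slot, token] id -> fused row (consumed by invokeMoeReduce)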
for (int e = 0, i = 0; e < experts_; ++e) { + for (const auto& x : f2i[e]) { + moe_f2n_[i] = x / top_e; + // [n, e] -> [e, n] + const int en = x % top_e * batch_size_ + x / top_e; + moe_f2en_[i] = en; + moe_en2f_[en] = i; + ++i; + } + } + + ((MoeGemmContext*)ctx_.get())->set_offsets(moe_m_offsets_.data().get()); + + CHECK(batch_dim == 0); + CHECK(a_desc_.order == kRowMajor); + + a_e_.resize(a_f_.size() * top_e); + c_e_.resize(c_f_.size() * top_e); + c_e_ref_.resize(c_e_.size()); + + for (int i = 0; i < 10; ++i) { + dispatchMoeGather( + a_e_.data().get(), a_f_.data().get(), moe_f2n_.data().get(), batch_size_, top_e, input_dims_, stream_); + } + + a_pack_desc_.num = b_pack_desc_.num = c_desc_.num = experts_; + + a_pack_desc_.rows = a_desc_.rows = c_desc_.rows = expert_ids_.size(); + a_pack_desc_.offsets = c_desc_.offsets = moe_m_offsets_.data().get(); + + a_pack_desc_.idxs = moe_f2n_.data().get(); + + if (!moe_n_ptrs_.empty()) { + b_pack_desc_.ld = 0; + } + // b_pack_desc_.offsets = moe_n_offsets_.data().get(); + + v_pack_desc_.num = b_pack_desc_.num; + if (!moe_v_ptrs_.empty()) { + v_pack_desc_.ld = 0; + } + + cudaMemPrefetchAsync(moe_m_offsets_.data().get(), sizeof(int) * moe_m_offsets_.size(), 0, stream_); + cudaMemPrefetchAsync(moe_n_offsets_.data().get(), sizeof(int) * moe_n_offsets_.size(), 0, stream_); + cudaMemPrefetchAsync(moe_f2n_.data().get(), sizeof(int) * moe_f2n_.size(), 0, stream_); } void Run(void* ctx = {}) @@ -218,28 +401,45 @@ class Testbed { quant_a_, quant_b_, kBatchDim, + ctx_.get(), ctx, }; const Workspace workspace{barriers_.data().get(), barriers_.size(), partials_.data().get(), partials_.size()}; - auto status = gemm_.Run(operation, + void* A = a_pack_.data().get(); + void* B = b_pack_.data().get(); + void* V = v_pack_.data().get(); + void* C = c_.data().get(); + + if (experts_) { + C = c_e_.data().get(); + if (!moe_n_ptrs_.empty()) { + B = moe_n_ptrs_.data().get(); + } + if (!moe_v_ptrs_.empty()) { + V = moe_v_ptrs_.data().get(); + } + } + + auto status = gemm_.Run(operation, // 1.f, - a_pack_.data().get(), + A, a_pack_desc_, u_pack_.data().get(), u_pack_desc_, - b_pack_.data().get(), + B, b_pack_desc_, - v_pack_.data().get(), + V, v_pack_desc_, 0.f, - c_.data().get(), + C, c_desc_, - c_.data().get(), + C, c_desc_, workspace, stream_); + // auto status = 0; if (!ctx && status) { std::cerr << "Run failed, code =" << status << "\n"; @@ -249,23 +449,14 @@ class Testbed { void RunCublas() { - reference_.gemm(a_f_.data().get(), // - a_desc_, - b_f_.data().get(), - b_desc_, - c_f_.data().get(), - c_desc_); - } + if (experts_ == 0) { + // reference_.gemm(a_f_.data().get(), // + // a_desc_, + // b_f_.data().get(), + // b_desc_, + // c_f_.data().get(), + // c_desc_); - void CompareB() - { - cudaDeviceSynchronize(); - Compare(b_f_.data().get(), b_.data().get(), k_, k_, n_); - } - - void CompareC() - { - for (int i = 0; i < 10; ++i) { reference_.gemm(a_f_.data().get(), // a_desc_, b_f_.data().get(), @@ -273,22 +464,75 @@ class Testbed { c_ref_.data().get(), c_desc_); } + else { // [e_i, k] -> [k, n / E] -> [e_i, n / E] + auto a_desc = a_desc_; + auto b_desc = b_desc_; + auto c_desc = c_desc_; - // c_f_.resize(m_ * n_); - // computeRefCublas(c_f_.data().get(), a_.data().get(), b_f_.data().get(), m_, n_, k_, stream_); - // RunCublas(); + CHECK(a_desc.order == kRowMajor); // k-major, T + // CHECK(b_desc.order == kColMajor); // k-major, N + CHECK(c_desc.order == kRowMajor); // n-major, T + auto a = a_e_.data().get(); + auto b = b_f_.data().get(); + auto c = c_e_ref_.data().get(); 
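+            // One cuBLAS reference GEMM per expert: A and C advance by the number of
+            // tokens routed to that expert, B advances by one expert's weight block.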
+ + for (int e = 0; e < experts_; ++e) { + // Set input size for current expert + c_desc.rows = a_desc.rows = moe_cnt_[e]; + + reference_.gemm(a, a_desc, b, b_desc, c, c_desc); + + // Move to next expert + a += moe_cnt_[e] * input_dims_; + b += output_dims_ * input_dims_; + c += moe_cnt_[e] * output_dims_; + } + } + } + + void CompareB() + { cudaDeviceSynchronize(); + Compare(b_f_.data().get(), b_.data().get(), k_, k_, n_); + } - // Compare(c_f_.data().get(), c_ref_.data().get(), n_, n_, m_, 0); + void CompareC() + { + if (experts_ == 0) { - // Compare(c_.data().get(), c_f_.data().get(), n_, n_, m_, 0); + cudaDeviceSynchronize(); - int dims = m_, bsz = n_; - if (order_c == kRowMajor) { - std::swap(dims, bsz); + int dims = m_, bsz = n_; + if (order_c == kRowMajor) { + std::swap(dims, bsz); + } + Compare(c_.data().get(), c_ref_.data().get(), dims, dims, bsz, 0); + } + else { + invokeMoeReduce(c_.data().get(), + c_e_.data().get(), + moe_scales_.data().get(), + moe_en2f_.data().get(), + batch_size_, + expert_ids_.size() / batch_size_, + output_dims_, + stream_); + + invokeMoeReduce(c_ref_.data().get(), + c_e_ref_.data().get(), + moe_scales_.data().get(), + moe_en2f_.data().get(), + batch_size_, + expert_ids_.size() / batch_size_, + output_dims_, + stream_); + + cudaDeviceSynchronize(); + + Compare(c_e_.data().get(), c_e_ref_.data().get(), output_dims_, output_dims_, expert_ids_.size(), 0); + Compare(c_.data().get(), c_ref_.data().get(), output_dims_, output_dims_, batch_size_, 0); } - Compare(c_.data().get(), c_ref_.data().get(), dims, dims, bsz, 0); } void Check() @@ -335,22 +579,77 @@ class Testbed { } } - int64_t global_memory_reads() + int64_t get_global_memory_reads() + { + if (experts_ == 0) { + return get_size(a_pack_desc_) + get_size(b_pack_desc_) + get_size(u_pack_desc_) + get_size(v_pack_desc_); + } + else { + size_t size = get_size(a_pack_desc_) + get_size(u_pack_desc_); + const int nnz = + std::accumulate(moe_cnt_.begin(), moe_cnt_.end(), 0, [](auto a, auto x) { return a + (x > 0); }); + size += nnz * (get_size(b_pack_desc_) + get_size(v_pack_desc_)); + return size; + } + } + + int64_t get_ref_global_memory_reads() { - return get_size(a_pack_desc_) + get_size(b_pack_desc_) + get_size(u_pack_desc_) + get_size(v_pack_desc_); + if (experts_ == 0) { + return get_size(a_desc_) + get_size(b_desc_); + } + else { + size_t size = get_size(a_desc_); + const int nnz = + std::accumulate(moe_cnt_.begin(), moe_cnt_.end(), 0, [](auto a, auto x) { return a + (x > 0); }); + size += nnz * get_size(b_desc_); + return size; + } } - int64_t ref_global_memory_reads() + int64_t get_element_count() { - return get_size(a_desc_) + get_size(b_desc_); + if (experts_ == 0) { + return (int64_t)m_ * n_ * k_ * 2; + } + else { + int64_t count = 0; + for (const auto& m : moe_cnt_) { + count += (int64_t)m * output_dims_ * input_dims_; + } + return count * 2; + } } -private: + // private: int m_{}; int n_{}; int k_{}; int g_{}; + int batch_size_{}; + int input_dims_{}; + int output_dims_{}; + int experts_{}; + int exp_per_tok_{}; + + /// MoE buffers + universal_vector a_e_; + universal_vector c_e_; + universal_vector c_e_ref_; + + /// MoE utils + std::vector expert_ids_; // f(batch_idx * top_e) -> expert_id + std::vector moe_cnt_; + universal_vector moe_f2n_; + universal_vector moe_f2en_; + universal_vector moe_en2f_; + universal_vector moe_m_offsets_; + universal_vector moe_n_offsets_; + universal_vector moe_n_ptrs_; + universal_vector moe_v_ptrs_; + universal_vector moe_scales_; + universal_vector a_; // A in fp 
universal_vector b_; // B in fp universal_vector c_ref_; // reference C @@ -395,6 +694,9 @@ class Testbed { universal_vector barriers_; universal_vector partials_; + cudaDeviceProp prop_; + std::unique_ptr ctx_; + cudaStream_t stream_; RNG rng_; @@ -448,24 +750,47 @@ inline decltype(auto) get_test() constexpr Pack kPackU = HMMA_16816 | OPERAND_U | 1; return gTestbed>(); } - else if constexpr (1) { + else if constexpr (0) { // sm80 / sm75 constexpr Pack kPackB = HMMA_16816 | OPERAND_B | 2; constexpr Pack kPackV = HMMA_16816 | OPERAND_V | 1; return gTestbed>(); } else if constexpr (0) { - // sm70 + // sm70 int4 constexpr Pack kPackB = HMMA_884 | OPERAND_B | 1; constexpr Pack kPackV = HMMA_884 | OPERAND_V | 1; return gTestbed>(); } + else if constexpr (0) { + // sm70 half + constexpr Pack kPackB = HMMA_884 | OPERAND_B | 1; + return gTestbed>(); + } else if constexpr (0) { // simt constexpr Pack kPackB = HMMA_SIMT | OPERAND_B | 1; constexpr Pack kPackV = HMMA_SIMT | OPERAND_V | 1; return gTestbed>(); } + else if constexpr (0) { + constexpr Pack kPackB = HMMA_16816 | OPERAND_B | 1; + // constexpr Pack kPackB = 0; + constexpr Pack kPackV = 0; + return gTestbed>(); + } + else if constexpr (1) { + constexpr Pack kPackB = HMMA_16816 | OPERAND_B | 2; + // constexpr Pack kPackB = 0; + constexpr Pack kPackV = HMMA_16816 | OPERAND_V | 1; + // constexpr Pack kPackV = 0; + return gTestbed>(); + } + else if constexpr (0) { + // constexpr Pack kPackA = HMMA_16816 | OPERAND_A | 1; + constexpr Pack kPackA = 0; + return gTestbed>(); + } } } // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/measurer.cu b/src/turbomind/kernels/gemm/tuner/measurer.cu index a33d78f6f7..64a64b9d35 100644 --- a/src/turbomind/kernels/gemm/tuner/measurer.cu +++ b/src/turbomind/kernels/gemm/tuner/measurer.cu @@ -66,6 +66,8 @@ std::pair Measurer::ColdRun(LaunchSpec spec, const Launcher& cudaEventRecord(ev_beg_, stream); + // std::cout << spec.kernel->name() << " " << spec.splits << " " << spec.swizzle << std::endl; + launcher(spec, stream); cudaEventRecord(ev_end_, stream); @@ -77,6 +79,9 @@ std::pair Measurer::ColdRun(LaunchSpec spec, const Launcher& if (status == cudaSuccess) { cudaEventElapsedTime(&ms, ev_beg_, ev_end_); } + else { + std::cerr << cudaGetErrorString(status) << std::endl; + } return {ms, status}; } diff --git a/src/turbomind/kernels/gemm/types.h b/src/turbomind/kernels/gemm/types.h index 133fdafe34..94a31e9452 100644 --- a/src/turbomind/kernels/gemm/types.h +++ b/src/turbomind/kernels/gemm/types.h @@ -4,6 +4,8 @@ #include "src/turbomind/kernels/core/data_type.h" #include +#include + #if ENABLE_BF16 #include #endif @@ -40,6 +42,8 @@ typedef enum Op_Tag OPERAND_B = 0x020, OPERAND_U = 0x030, OPERAND_V = 0x040, + OPERAND_C = 0x050, + OPERAND_D = 0x060, } Op_Tag; constexpr MMA_Tag get_mma_tag(Pack pack) @@ -57,6 +61,30 @@ constexpr int get_pack_num(Pack pack) return pack & 0x00f; } +enum class Striding : int +{ + kFlat, // [1111,2222,3333] + kRagged, // [11,2222222,333] [0 , 2 , 9 ] + kIndexed, // [xx xxxxxxx xxx], [01, 2345678, 9ab] + kBlocked, // [11][22222][333] +}; + +inline const char* to_string(Striding striding) +{ + switch (striding) { + case Striding::kFlat: + return "f"; + case Striding::kRagged: + return "r"; + case Striding::kIndexed: + return "i"; + case Striding::kBlocked: + return "b"; + default: + return "unknown"; + } +} + enum class QuantType : int { kNone, @@ -208,12 +236,28 @@ constexpr bool operator&(const DispatchPolicy& a, const DispatchPolicy& b) return ((int)a & (int)b); } 
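+// Forward declarations for the grouped/MoE GEMM path. `Tape` appears to act as a
+// precomputed schedule: per-CTA tile offsets, k-iteration ranges and (tiled) GEMM shapes.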
+class Kernel; +class Context; + +struct Tape { + int ctas; + int max_num; + int max_ctas; + char* buffer; + int4* gemm_shapes; + int4* tiled_shapes; + int4* tile_offsets; + int2* iter_k_ranges; + int* tile_ids; +}; + struct Operation { DispatchPolicy dispatch; Epilogue epilogue; QuantDesc quant_a; QuantDesc quant_b; int batch_dim; + Context* context; void* reserved; }; @@ -224,6 +268,9 @@ struct MatrixLayout { int cols; int ld; Pack pack; + int num; + int* offsets; + int* idxs; }; inline int64_t get_size(const MatrixLayout& m) @@ -231,6 +278,17 @@ inline int64_t get_size(const MatrixLayout& m) return get_size(m.type, (int64_t)m.rows * m.cols); } +inline Striding get_mode(const MatrixLayout& m) +{ + if (m.idxs) { + return Striding::kIndexed; + } + else if (m.ld == 0 || m.offsets) { + return Striding::kBlocked; + } + return Striding::kFlat; +} + struct Workspace { void* barriers; size_t barriers_size; diff --git a/src/turbomind/kernels/gemm/utils.h b/src/turbomind/kernels/gemm/utils.h index 965ea8d224..8c061eb52d 100644 --- a/src/turbomind/kernels/gemm/utils.h +++ b/src/turbomind/kernels/gemm/utils.h @@ -66,6 +66,22 @@ __host__ __device__ constexpr Index cs2idx(int2 cs, Index ld) return ld * cs.y + cs.x; } +template +__host__ __device__ constexpr Index cs2idx(int2 cs, Index ld, int s0) +{ + return ld * (cs.y + s0) + cs.x; +} + +__host__ __device__ constexpr auto dot(int2 a, int2 b) +{ + return a.x * b.x + a.y * b.y; +} + +__host__ __device__ constexpr auto dot(int2 a, long2 b) +{ + return a.x * b.x + a.y * b.y; +} + template struct PackingImpl { __host__ __device__ static constexpr int2 apply(int2 mk) diff --git a/src/turbomind/kernels/gpt_kernels.cu b/src/turbomind/kernels/gpt_kernels.cu index 3bfc1ab7d1..4f47631fa5 100644 --- a/src/turbomind/kernels/gpt_kernels.cu +++ b/src/turbomind/kernels/gpt_kernels.cu @@ -236,6 +236,9 @@ invokeTransposeAxis01(half* out, half* in, const int dim0, const int dim1, const template void invokeTransposeAxis01(int* out, int* in, const int dim0, const int dim1, const int dim2, cudaStream_t stream); +template void +invokeTransposeAxis01(uint16_t* out, uint16_t* in, const int dim0, const int dim1, const int dim2, cudaStream_t stream); + template __global__ void transposeAxis01(T* out, T* in, const int* in_skipping_dim1, const int dim0, const int dim1) { diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 77c26e9e51..285fcea31f 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -15,6 +15,7 @@ add_library(Llama STATIC LlamaWeight.cc LlamaDecoderLayerWeight.cc LlamaFfnLayer.cc + moe_ffn_layer.cc unified_decoder.cc unified_attention_layer.cc llama_kernels.cu diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 25f4bce212..4138174e5d 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -3,6 +3,7 @@ #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/decoding_kernels.h" +#include "src/turbomind/kernels/gemm/tuner/params.h" #include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/BlockManager.h" @@ -29,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -960,11 +962,6 @@ LlamaBatch::~LlamaBatch() model_.reset(); sequence_manager_.reset(); context_.reset(); // This destroy all objects in 
context except for `stream` - - check_cuda_error(cudaStreamSynchronize(stream_)); - - // Destroy the stream in context - check_cuda_error(cudaStreamDestroy(stream_)); } template @@ -1894,6 +1891,126 @@ void LlamaBatch::Submit(std::unordered_map* output } } +namespace { + +template +std::string Join(First first, Last last, const std::string& delim) +{ + if (first == last) { + return {}; + } + std::ostringstream oss; + oss << *first++; + while (first != last) { + oss << delim << *first++; + } + return oss.str(); +} + +template +struct TuningContext { + LlamaLinear& linear_; + cudaStream_t stream_; + TuningContext(LlamaLinear& linear, cudaStream_t stream): linear_{linear}, stream_{stream} + { + isTuning() = true; + linear_.set_measure(true); + } + ~TuningContext() + { + linear_.set_measure(false); + isTuning() = false; + // This will catch async errors during tuning + check_cuda_error(cudaStreamSynchronize(stream_)); + } +}; + +} // namespace + +template +void LlamaBatch::tune() +{ + auto& linear = *context_->linear; + if (auto str = std::getenv("TM_GEMM_IMPORT")) { + std::ifstream ifs(str); + const int n_imported = linear.Import(ifs); + if (rank_ == 0) { + TM_LOG_INFO("[Gemm2] %d records imported", n_imported); + } + return; + } + + std::vector bss = linear.GetTuningSeq(); + if (bss.empty()) { + bss = gemm::GenerateTuningSequence(gemm::GetDefaultTuningGenerators()); + } + + // remove bs that is too large + bss.erase(std::remove_if(bss.begin(), bss.end(), [&](auto x) { return x > max_forward_token_num_; }), bss.end()); + + if (rank_ == 0) { + auto str = Join(bss.begin(), bss.end(), ", "); + TM_LOG_INFO("[Gemm2] Tuning sequence: %s", str.c_str()); + } + + if (!bss.empty()) { + const auto max_bs = *std::max_element(bss.begin(), bss.end()); + std::vector input_ids(max_bs); + std::mt19937 g{}; + std::uniform_int_distribution d{0, (int)model_->vocab_size_ - 1}; + for (auto& x : input_ids) { + x = d(g); + } + Copy(input_ids.data(), max_bs, context_decoder_ids_buf_); + check_cuda_error(cudaStreamSynchronize(stream_)); + + TuningContext context{linear, stream_}; + + auto tick = std::chrono::steady_clock::now(); + + /// NOTE: No explicit barrier can be used here as internal threads are waiting on it now + for (auto bs : bss) { + if (rank_ == 0) { + TM_LOG_INFO("[Gemm2] %d", bs); + } + const int input_length = bs; + model_->forwardUnified(decoder_output_buf_, + context_decoder_output_buf_, + context_decoder_input_buf_, + (void**)block_ptrs_, // invalid data + cu_block_counts_, // invalid data + context_decoder_ids_buf_, + &input_length, + &input_length, + rope_theta_, // invalid data + finished_buf_, // invalid data + bs, + 0, + 1, + nullptr, + nullptr); + // implicit barrier for TP + check_cuda_error(cudaStreamSynchronize(stream_)); + } + + auto tock = std::chrono::steady_clock::now(); + + if (rank_ == 0) { + TM_LOG_INFO("[Gemm2] Tuning finished in %.2f seconds.", + std::chrono::duration>(tock - tick).count()); + } + } + + // Only rank-0 exports the dispatch cache + if (rank_ == 0) { + if (auto path = std::getenv("TM_GEMM_EXPORT")) { + std::ofstream ofs(path); + const auto n_records = context_->linear->Export(ofs); + TM_LOG_INFO("[Gemm2] %d records exported.", n_records); + } + } +} + template class LlamaBatch; #ifdef ENABLE_FP32 template class LlamaBatch; diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index ee16f911b8..9c66948999 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -143,6 +143,8 @@ class 
LlamaBatch { return session_len_; } + void tune(); + private: void InternalThreadEntry(); diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 3beba40c6b..7ed657a9b8 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -22,6 +22,7 @@ #include "src/turbomind/kernels/gemm/cast.h" #include "src/turbomind/kernels/gemm/gemm.h" #include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gpt_kernels.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" @@ -29,8 +30,6 @@ #include #include #include -#include - namespace turbomind { static bool is_fuse_silu_act() @@ -63,6 +62,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, int group_size, LoraParam lora_param, bool attn_bias, + MoeParam moe_param, size_t tensor_para_size, size_t tensor_para_rank): head_num_(head_num), @@ -124,26 +124,23 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, self_attn_weights.output.type = weight_type; self_attn_weights.output.group_size = group_size; - ffn_weights.gating.input_dims = hidden_units_; - ffn_weights.gating.output_dims = inter_size_ / tensor_para_size_; - ffn_weights.gating.type = weight_type; - ffn_weights.gating.group_size = group_size; - - ffn_weights.intermediate.input_dims = hidden_units_; - ffn_weights.intermediate.output_dims = inter_size_ / tensor_para_size_; - ffn_weights.intermediate.type = weight_type; - ffn_weights.intermediate.group_size = group_size; - - ffn_weights.fused_gating_intermediate.input_dims = hidden_units_; - ffn_weights.fused_gating_intermediate.output_dims = inter_size_ / tensor_para_size_ * 2; - ffn_weights.fused_gating_intermediate.type = weight_type; - ffn_weights.fused_gating_intermediate.group_size = group_size; - ffn_weights.is_fused_silu = weight_type == WeightType::kINT4 && is_fuse_silu_act(); + ffn_weights = LlamaFfnWeight{ + hidden_units_, + inter_size_, + tensor_para_size_, + weight_type_, + group_size, + weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), + }; - ffn_weights.output.input_dims = inter_size_ / tensor_para_size_; - ffn_weights.output.output_dims = hidden_units_; - ffn_weights.output.type = weight_type; - ffn_weights.output.group_size = group_size; + moe_weights = MoeFfnWeight{hidden_units_, + moe_param.inter_size, + moe_param.expert_num, + moe_param.method, + tensor_para_size_, + weight_type, + group_size, + is_fuse_silu_act()}; mallocWeights(); } @@ -151,19 +148,20 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, template size_t LlamaDecoderLayerWeight::workspace_size() const noexcept { - if (weight_type_ != WeightType::kINT4) { - return 0; - } + // Space to hold the largest weight in full precision auto get_size = [](const auto& w) { return (size_t)w.input_dims * w.output_dims; }; size_t size = 0; size = std::max(size, get_size(self_attn_weights.qkv)); + size = std::max(size, get_size(self_attn_weights.output)); size = std::max(size, get_size(ffn_weights.gating)); + size = std::max(size, get_size(ffn_weights.fused_gating_intermediate)); - if (fused_up_and_gate_) { - size = std::max(size, get_size(ffn_weights.fused_gating_intermediate)); + for (const auto& e : moe_weights.experts) { + size = std::max(size, get_size(e.gating)); + size = std::max(size, get_size(e.fused_gating_intermediate)); } return size * sizeof(uint16_t); @@ -191,7 +189,7 
@@ void freeWeights(LlamaDenseWeight& weights) } template -void mallocWeights(LlamaDenseWeight& weights, bool bias) +void LlamaDecoderLayerWeight::mallocWeights(LlamaDenseWeight& weights, bool bias) { if (bias) { deviceMalloc((T**)&weights.bias, weights.output_dims); @@ -210,7 +208,6 @@ void mallocWeights(LlamaDenseWeight& weights, bool bias) } if (weights.lora.r > 0) { - // FT_CHECK(bit_size >= 16); deviceMalloc((T**)&weights.lora.a, weights.input_dims * weights.lora.r); deviceMalloc((T**)&weights.lora.b, weights.lora.r * weights.output_dims); } @@ -231,47 +228,33 @@ void getWeightTensor(LlamaDenseWeight& weights, bool bias, const std::string& auto get_name = [=](const std::string& name) { return concat(prefix, name); }; if (bias) { - output.insert(get_name("bias"), - Tensor{MEMORY_GPU, getTensorType(), {weights.output_dims * sizeof(T)}, weights.bias}); + output.insert(get_name("bias"), Tensor{MEMORY_GPU, getTensorType(), {weights.bias_size()}, weights.bias}); } + const size_t bit_size = getBitSize(weights.type); if (bit_size >= 16) { output.insert(get_name("weight"), - Tensor{MEMORY_GPU, - getTensorType(), - {weights.input_dims * weights.output_dims * sizeof(T)}, - weights.kernel}); + Tensor{MEMORY_GPU, getTensorType(), {weights.kernel_size()}, weights.kernel}); } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - output.insert(get_name("qweight"), - Tensor{MEMORY_GPU, - TYPE_INT32, - {weights.input_dims * weights.output_dims * sizeof(int) / factor}, - weights.kernel}); + else { + output.insert(get_name("qweight"), Tensor{MEMORY_GPU, TYPE_INT32, {weights.kernel_size()}, weights.kernel}); output.insert(get_name("scales"), - Tensor{MEMORY_GPU, - getTensorType(), - {weights.input_dims / weights.group_size * weights.output_dims * sizeof(T)}, - weights.scales}); + Tensor{MEMORY_GPU, getTensorType(), {weights.scales_size()}, weights.scales}); output.insert(get_name("zeros"), - Tensor{MEMORY_GPU, - getTensorType(), - {weights.input_dims / weights.group_size * weights.output_dims * sizeof(T)}, - weights.zeros}); + Tensor{MEMORY_GPU, getTensorType(), {weights.scales_size()}, weights.zeros}); } if (weights.lora.r) { - // FT_CHECK(bit_size >= 16); - auto n = prefix.rfind("."); + auto n = prefix.rfind("."); + std::string _prefix = prefix.substr(0, n); std::string _num = prefix.substr(n + 1); - output.insert( - concat(_prefix, "lora_a", _num, "weight"), - Tensor{MEMORY_GPU, getTensorType(), {weights.input_dims * weights.lora.r * sizeof(T)}, weights.lora.a}); - output.insert( - concat(_prefix, "lora_b", _num, "weight"), - Tensor{MEMORY_GPU, getTensorType(), {weights.lora.r * weights.output_dims * sizeof(T)}, weights.lora.b}); + + output.insert(concat(_prefix, "lora_a", _num, "weight"), + Tensor{MEMORY_GPU, getTensorType(), {weights.lora_size().first}, weights.lora.a}); + output.insert(concat(_prefix, "lora_b", _num, "weight"), + Tensor{MEMORY_GPU, getTensorType(), {weights.lora_size().second}, weights.lora.b}); + TM_LOG_DEBUG("allocate lora weight, layer_name=%s input_dims=%d, output_dims=%d, lora_r=%d", get_name("weight").c_str(), weights.input_dims, @@ -286,6 +269,7 @@ void loadWeights( { auto weight_file = prefix + "." + std::to_string(tensor_para_size - 1) + ".weight"; auto qweight_file = prefix + "." 
+ std::to_string(tensor_para_size - 1) + ".qweight"; + if (!std::filesystem::exists(weight_file) && !std::filesystem::exists(qweight_file)) { TM_LOG_ERROR("%s and %s does not exist", weight_file.c_str(), qweight_file.c_str()); FT_CHECK(false); @@ -325,12 +309,22 @@ void LlamaDecoderLayerWeight::mallocWeights() deviceMalloc((T**)&self_attn_norm_weights, hidden_units_); deviceMalloc((T**)&ffn_norm_weights, hidden_units_); - turbomind::mallocWeights(self_attn_weights.qkv, attn_bias_); - turbomind::mallocWeights(self_attn_weights.output, attn_bias_); + mallocWeights(self_attn_weights.qkv, attn_bias_); + mallocWeights(self_attn_weights.output, attn_bias_); - turbomind::mallocWeights(ffn_weights.gating, false); - turbomind::mallocWeights(ffn_weights.intermediate, false); - turbomind::mallocWeights(ffn_weights.output, false); + if (moe_weights.experts.empty()) { + mallocWeights(ffn_weights.gating, false); + mallocWeights(ffn_weights.intermediate, false); + mallocWeights(ffn_weights.output, false); + } + else { + mallocWeights(moe_weights.gate, false); + for (auto& e : moe_weights.experts) { + mallocWeights(e.gating, false); + mallocWeights(e.intermediate, false); + mallocWeights(e.output, false); + } + } } template @@ -361,6 +355,7 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType loadWeightFromBin((T*)ffn_norm_weights, {hidden_units_}, dir_path + ".ffn_norm.weight", model_file_type); loadWeights(self_attn_weights.qkv, dir_path + ".attention.w_qkv", tensor_para_rank_, type, tensor_para_size_); + loadWeights(self_attn_weights.output, dir_path + ".attention.wo", tensor_para_rank_, type, tensor_para_size_); loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_); @@ -384,23 +379,37 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); - getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); - getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output); - getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); + if (moe_weights.experts.empty()) { + getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); + getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output); + getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); + } + else { + output.insert( + concat(prefix, "moe_ffn.gate.weight"), + Tensor{MEMORY_GPU, getTensorType(), {moe_weights.gate.kernel_size()}, moe_weights.gate.kernel}); + auto& experts = moe_weights.experts; + for (size_t i = 0; i < experts.size(); ++i) { + const std::string name = "moe_ffn.experts." 
+ std::to_string(i); + // std::cerr << "FUCK " << get_prefix(concat(name, "w1")) << "\n"; + getWeightTensor(experts[i].gating, false, get_prefix(concat(name, "w1")), output); + getWeightTensor(experts[i].intermediate, false, get_prefix(concat(name, "w3")), output); + getWeightTensor(experts[i].output, false, get_prefix(concat(name, "w2")), output); + } + } return output; } -template -static void convert(LlamaDenseWeight& weight, void* workspace, size_t size, bool use_simt) +// template +static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) { - if (weight.type != WeightType::kINT4) { - return; - } + FT_CHECK(weight.type == WeightType::kINT4); using namespace gemm; - auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); + auto [order_b, pack_b, order_v, pack_v] = + get_weight_and_scales_layout(gemm::DataType::U4, is_fused_moe, getSMVersion(), use_simt); if (order_b == kColMajor) { transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims); @@ -410,21 +419,6 @@ static void convert(LlamaDenseWeight& weight, void* workspace, size_t size, b extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims); sync_check_cuda_error(); - if constexpr (0) { - std::vector tmp(weight.input_dims * weight.output_dims); - cudaMemcpy(tmp.data(), workspace, sizeof(uint16_t) * tmp.size(), cudaMemcpyDefault); - cudaDeviceSynchronize(); - int i = 0; - for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { - std::cout << *it << " "; - } - i = 0; - std::cout << "\n"; - for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { - std::cout << *it << " "; - } - } - MatrixLayout w_desc{ gemm::DataType::F16, order_b, @@ -442,29 +436,12 @@ static void convert(LlamaDenseWeight& weight, void* workspace, size_t size, b FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, 0) == 0); sync_check_cuda_error(); - if constexpr (0) { - std::vector tmp(weight.input_dims * weight.output_dims / 8); - cudaMemcpy(tmp.data(), weight.kernel, sizeof(uint32_t) * tmp.size(), cudaMemcpyDefault); - cudaDeviceSynchronize(); - int i = 0; - for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { - std::cout << std::hex << *it << " "; - } - i = 0; - std::cout << "\n"; - for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { - std::cout << std::hex << *it << " "; - } - } - const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims; - if constexpr (std::is_same_v) { - // std::cout << "fuse_scales_and_zeros\n"; - fuse_scales_and_zeros((T*)workspace, weight.scales, weight.zeros, scale_count); - // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2); - sync_check_cuda_error(); - } + // std::cout << "fuse_scales_and_zeros\n"; + fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count); + // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2); + sync_check_cuda_error(); cudaDeviceSynchronize(); @@ -472,7 +449,7 @@ static void convert(LlamaDenseWeight& weight, void* workspace, size_t size, b cudaFree(weight.zeros); weight.scales = weight.zeros = nullptr; - deviceMalloc((T**)&weight.scales_zeros, scale_count * 2); + deviceMalloc((half**)&weight.scales_zeros, scale_count * 2); MatrixLayout s_desc{ gemm::DataType::U32, @@ -488,30 +465,75 @@ static void convert(LlamaDenseWeight& weight, void* workspace, size_t size, b FT_CHECK(Convert(workspace, 
s_desc, weight.scales_zeros, q_desc, 0) == 0); sync_check_cuda_error(); - if constexpr (0) { - std::vector tmp(scale_count * 2); - cudaMemcpy(tmp.data(), weight.scales_zeros, sizeof(T) * tmp.size(), cudaMemcpyDefault); - cudaDeviceSynchronize(); - // for (const auto& x: tmp) { - // std::cout << (float)x << " "; - // } - int i = 0; - for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { - std::cout << std::hex << *it << " "; - } - i = 0; - std::cout << "\n"; - for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { - std::cout << std::hex << *it << " "; - } - } - weight.k_desc = k_desc; weight.q_desc = q_desc; // FT_CHECK(0); } +template +static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +{ + using namespace gemm; + + if (!is_fused_moe) { + return; + } + + const auto [order_b, pack_b, order_v, pack_v] = + get_weight_and_scales_layout(get_data_type_v, is_fused_moe, getSMVersion(), use_simt); + + const int input_dim = weight.input_dims; + const int output_dim = weight.output_dims; + + if (order_b == kColMajor) { + invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, nullptr); + sync_check_cuda_error(); + // FT_CHECK(0); + } + else { + check_cuda_error(cudaMemcpy(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + } + + MatrixLayout src{ + get_data_type_v, + order_b, + input_dim, // k + output_dim, // n + order_b == kRowMajor ? output_dim : input_dim, + }; + + MatrixLayout dst = src; + dst.pack = pack_b; + + if (pack_b) { + FT_CHECK(Convert(workspace, src, weight.kernel, dst, nullptr) == 0); + sync_check_cuda_error(); + // FT_CHECK(0); + } + else { + check_cuda_error(cudaMemcpy(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + } + + weight.k_desc = dst; +} + +template +static void convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +{ + if (weight.type == WeightType::kINT4) { + if constexpr (std::is_same_v) { + convert_u4(weight, is_fused_moe, workspace, size, use_simt); + } + else { + FT_CHECK(0); + } + } + else { + convert_fp(weight, is_fused_moe, workspace, size, use_simt); + } +} + template void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void* workspace, size_t size) { @@ -541,7 +563,6 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, 0); } else { - FT_CHECK_WITH_INFO(0, "not implemented"); interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, 0); } @@ -582,33 +603,81 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud { const bool is_16xx = is_16xx_series(prop.name); - convert(self_attn_weights.qkv, workspace, size, is_16xx); - convert(self_attn_weights.output, workspace, size, is_16xx); + convert(self_attn_weights.qkv, false, workspace, size, is_16xx); + convert(self_attn_weights.output, false, workspace, size, is_16xx); + + auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) { + if (fused_up_and_gate_) { + auto& fused_up_and_gate = ffn.fused_gating_intermediate; - if (fused_up_and_gate_) { + mallocWeights(fused_up_and_gate, false); - auto& fused_up_and_gate = ffn_weights.fused_gating_intermediate; + if (ffn.is_fused_silu) { + interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, 
size); + } + else { + chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + } - turbomind::mallocWeights(fused_up_and_gate, false); + convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx); - if (ffn_weights.is_fused_silu) { - interleave(fused_up_and_gate, ffn_weights.gating, ffn_weights.intermediate, workspace, size); + freeWeights(ffn.gating); + freeWeights(ffn.intermediate); } else { - chunk(fused_up_and_gate, ffn_weights.gating, ffn_weights.intermediate, workspace, size); + convert(ffn.gating, is_fused_moe, workspace, size, is_16xx); + convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx); } - convert(ffn_weights.fused_gating_intermediate, workspace, size, is_16xx); + convert(ffn.output, is_fused_moe, workspace, size, is_16xx); + }; - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); + if (moe_weights.experts.empty()) { + process_ffn(ffn_weights, false); } else { - convert(ffn_weights.gating, workspace, size, is_16xx); - convert(ffn_weights.intermediate, workspace, size, is_16xx); - } + std::vector> fused_ptrs; + std::vector> output_ptrs; + std::vector> fused_param_ptrs; + std::vector> output_param_ptrs; + + for (auto& e : moe_weights.experts) { + + process_ffn(e, moe_weights.method); - convert(ffn_weights.output, workspace, size, is_16xx); + const auto& fused = e.fused_gating_intermediate; + const auto& output = e.output; + + fused_ptrs.push_back({fused.kernel, fused.k_desc.ld}); + output_ptrs.push_back({output.kernel, output.k_desc.ld}); + + if (e.fused_gating_intermediate.scales_zeros) { + fused_param_ptrs.emplace_back(fused.scales_zeros, fused.q_desc.ld); + output_param_ptrs.emplace_back(output.scales_zeros, output.q_desc.ld); + } + } + + // Note: This assumes all experts has the same shape + moe_weights.block = moe_weights.experts.at(0); + + auto& fused = moe_weights.block.fused_gating_intermediate; + auto& output = moe_weights.block.output; + + // TODO: free these ptrs + fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, nullptr); + output.kernel = gemm::make_blocked_ptrs(output_ptrs, nullptr); + + if (!fused_param_ptrs.empty()) { + fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, nullptr); + output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, nullptr); + } + + fused.k_desc.ld = output.k_desc.ld = 0; + fused.k_desc.num = output.k_desc.num = moe_weights.experts.size(); + + fused.q_desc.ld = output.q_desc.ld = 0; + fused.q_desc.num = output.q_desc.num = moe_weights.experts.size(); + } } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index ea6a45b862..f68a103dd5 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -40,6 +40,7 @@ struct LlamaDecoderLayerWeight { int group_size, LoraParam lora_param, bool attn_bias, + MoeParam moe_param, size_t tensor_para_size, size_t tensor_para_rank); ~LlamaDecoderLayerWeight(); @@ -54,10 +55,13 @@ struct LlamaDecoderLayerWeight { size_t workspace_size() const noexcept; + void mallocWeights(LlamaDenseWeight& weights, bool bias); + T* self_attn_norm_weights{}; T* ffn_norm_weights{}; LlamaAttentionWeight self_attn_weights{}; LlamaFfnWeight ffn_weights{}; + MoeFfnWeight moe_weights{}; private: size_t head_num_; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index a975da1a0d..9a895243bc 100644 --- 
a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -21,6 +21,7 @@ #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/utils/cuda_utils.h" +#include namespace turbomind { @@ -34,6 +35,24 @@ enum class WeightType : int kINT4 }; +template +constexpr WeightType get_default_weight_type() +{ + if constexpr (std::is_same_v) { + return WeightType::kFP16; + } + else if constexpr (std::is_same_v) { + return WeightType::kBF16; + } + else if constexpr (std::is_same_v) { + return WeightType::kFP32; + } + else { + static_assert(sizeof(T) != sizeof(T), "not implemented"); + return {}; + } +} + inline size_t getBitSize(WeightType type) { switch (type) { @@ -90,6 +109,26 @@ struct LlamaDenseWeight { gemm::MatrixLayout k_desc; gemm::MatrixLayout q_desc; + + size_t kernel_size() const noexcept + { + return getBitSize(type) * input_dims * output_dims / 8; + } + + size_t bias_size() const noexcept + { + return sizeof(T) * output_dims; + } + + size_t scales_size() const noexcept + { + return sizeof(T) * input_dims / group_size * output_dims; + } + + std::pair lora_size() const noexcept + { + return {sizeof(T) * input_dims * lora.r, sizeof(T) * lora.r * output_dims}; + } }; template @@ -100,11 +139,83 @@ struct LlamaAttentionWeight { template struct LlamaFfnWeight { + + LlamaFfnWeight() = default; + + LlamaFfnWeight( + size_t hidden_dim, size_t inter_size, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) + { + gating.input_dims = hidden_dim; + gating.output_dims = inter_size / tp; + gating.type = weight_type; + gating.group_size = group_size; + + intermediate.input_dims = hidden_dim; + intermediate.output_dims = inter_size / tp; + intermediate.type = weight_type; + intermediate.group_size = group_size; + + fused_gating_intermediate.input_dims = hidden_dim; + fused_gating_intermediate.output_dims = inter_size / tp * 2; + fused_gating_intermediate.type = weight_type; + fused_gating_intermediate.group_size = group_size; + + is_fused_silu = fuse_silu_act; + + output.input_dims = inter_size / tp; + output.output_dims = hidden_dim; + output.type = weight_type; + output.group_size = group_size; + } + LlamaDenseWeight gating; LlamaDenseWeight intermediate; LlamaDenseWeight output; LlamaDenseWeight fused_gating_intermediate; - bool is_fused_silu; + + bool is_fused_silu{}; +}; + +template +struct MoeFfnWeight { + + MoeFfnWeight() = default; + + MoeFfnWeight(size_t hidden_dim, + int inter_size, + int expert_num, + int method, + size_t tp, + WeightType weight_type, + int group_size, + bool fuse_silu_act) + { + if (expert_num == 0) { + return; + } + + gate.input_dims = hidden_dim; + gate.output_dims = expert_num; + gate.type = get_default_weight_type(); + gate.group_size = group_size; + + experts.resize(expert_num); + + this->method = method; + fuse_silu_act = fuse_silu_act && method; + + for (auto& e : experts) { + // inter size is divided by tp in `FfnWeight` + e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; + } + } + + LlamaDenseWeight gate; + std::vector> experts; + + LlamaFfnWeight block; + + int method{}; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index b837c8b7b1..f9ee0c4ad4 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -144,7 +144,7 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, count_and_fix(ffn_output_data, num_token 
* weights->output.output_dims, Concat("w2", layer_id), 3); - if (tensor_para_.world_size_ > 1) { + if (all_reduce_ && tensor_para_.world_size_ > 1) { NcclGuard nccl_guard(tensor_para_, stream_); ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 3ea7df0b20..75ced5f9ac 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -31,14 +31,14 @@ namespace turbomind { template class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx): + LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): inter_size_(model.inter_size / tp.world_size_), hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()) - + allocator_(ctx.allocator.get()), + all_reduce_(all_reduce) { } @@ -62,6 +62,7 @@ class LlamaFfnLayer { cudaStream_t const stream_; LlamaLinear* const linear_; IAllocator* const allocator_; + const bool all_reduce_; bool is_free_buffer_after_forward_{}; T* gating_buf_{}; diff --git a/src/turbomind/models/llama/LlamaInstanceComm.h b/src/turbomind/models/llama/LlamaInstanceComm.h deleted file mode 100644 index 540a009020..0000000000 --- a/src/turbomind/models/llama/LlamaInstanceComm.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#pragma once - -#include "src/turbomind/models/llama/Barrier.h" -#include "src/turbomind/utils/instance_comm.h" - -namespace turbomind { - -class LlamaInstanceComm: public AbstractInstanceComm { -public: - LlamaInstanceComm(int count): barrier_(count) {} - - void barrier() override - { - barrier_.wait(); - } - - void setSharedObject(void* p) override - { - ptr = p; - } - - void* getSharedObject() override - { - return ptr; - } - -private: - Barrier barrier_; - void* ptr{}; -}; - -} // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaLinear.cu b/src/turbomind/models/llama/LlamaLinear.cu index 9f80bdab0c..58e67dbb84 100644 --- a/src/turbomind/models/llama/LlamaLinear.cu +++ b/src/turbomind/models/llama/LlamaLinear.cu @@ -114,6 +114,7 @@ struct LlamaLinear::Impl { {QuantType::kNone}, {QuantType::kDefault, weight.group_size}, 0, + {}, nullptr}; const MatrixLayout a_desc{ @@ -156,6 +157,81 @@ struct LlamaLinear::Impl { } } + void forward_moe(T* output_data, + Pitched input_data, + const int* indexes, + const int* offsets, + int batch_size, + const LlamaDenseWeight& weight, + Type type, + gemm::Context* context) + { + using namespace gemm; + + QuantDesc quant_b{}; + if (weight.k_desc.type == gemm::DataType::U4) { + quant_b.type = QuantType::kDefault; + quant_b.group_size = weight.group_size; + } + + const Operation operation{dispatch_policy_, + type == kFusedSiluFfn ? Epilogue::kGatedSilu : Epilogue::kNone, + {QuantType::kNone}, + quant_b, + 0, + context, + nullptr}; + + MatrixLayout a_desc{ + get_data_type_v, + kRowMajor, + batch_size, // m + (int)weight.input_dims, // k + input_data.pitch, + }; + + // std::cout << "m" << batch_size << "n" << weight.output_dims << "k" << weight.input_dims << " " + // << input_data.pitch << "\n"; + + a_desc.offsets = (int*)offsets; + a_desc.idxs = (int*)indexes; + + MatrixLayout c_desc{ + get_data_type_v, + kRowMajor, + batch_size, + (int)weight.output_dims, + type == kFusedSiluFfn ? 
(int)weight.output_dims / 2 : (int)weight.output_dims, + }; + + c_desc.offsets = (int*)offsets; + + a_desc.num = c_desc.num = weight.k_desc.num; + + auto ec = gemm_.Run(operation, + 1.f, + input_data.ptr, + a_desc, + nullptr, + {}, + weight.kernel, + weight.k_desc, + weight.scales_zeros, + weight.q_desc, + type == kFusedAdd ? 1.0f : 0.0f, + output_data, + c_desc, + output_data, + c_desc, + workspace_, + stream_); + + if (ec) { + TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); + // std::abort(); + } + } + cublasMMWrapper* cublas_wrapper_; gemm::Gemm gemm_; gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; @@ -177,6 +253,19 @@ void LlamaLinear::forward( impl_->forward(output_data, input_data, batch_size, weight, type, lora_mask); } +template +void LlamaLinear::forward_moe(T* output_data, + Pitched input_data, + const int* indexes, + const int* offsets, + int batch_size, + const LlamaDenseWeight& weight, + Type type, + gemm::Context* context) +{ + impl_->forward_moe(output_data, input_data, indexes, offsets, batch_size, weight, type, context); +} + template void LlamaLinear::set_measure(bool measure) { diff --git a/src/turbomind/models/llama/LlamaLinear.h b/src/turbomind/models/llama/LlamaLinear.h index 938188f4bc..6d23f295bb 100644 --- a/src/turbomind/models/llama/LlamaLinear.h +++ b/src/turbomind/models/llama/LlamaLinear.h @@ -34,6 +34,15 @@ class LlamaLinear { Type type = kGemm, int* lora_mask = nullptr); + void forward_moe(T* output_data, + Pitched input_data, + const int* indexes, + const int* offsets, + int batch_size, + const LlamaDenseWeight& weight, + Type type, + gemm::Context* context); + void set_measure(bool measure); [[maybe_unused]] int Export(std::ostream& os); diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 9e03e16ede..3d50910ad4 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -41,6 +41,7 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/memory_utils.h" +#include "src/turbomind/utils/monotonic.h" #include #include #include @@ -60,6 +61,7 @@ inline int pad_vocab_size(int vocab_size, int tp) template LlamaV2::LlamaV2(const ModelParam& model, const AttentionParam& attn, + const MoeParam& moe, const LoraParam& lora, const NcclParam& tp, const Context& ctx, @@ -92,7 +94,7 @@ LlamaV2::LlamaV2(const ModelParam& model, { TM_LOG_DEBUG(__PRETTY_FUNCTION__); - unified_decoder_ = std::make_unique>(model, attn, lora, tp, ctx); + unified_decoder_ = std::make_unique>(model, attn, moe, lora, tp, ctx); dynamic_decode_layer_ = std::make_unique>(vocab_size_, vocab_size_padded_, @@ -145,6 +147,9 @@ void LlamaV2::updateEmbedding(T* decoder_input, int* lora_mask, bool* have_embeddings) { + if (isTuning()) + return; + TM_LOG_DEBUG(__PRETTY_FUNCTION__); *have_embeddings = false; @@ -421,107 +426,6 @@ void LlamaV2::dynamicDecode(int* token_ids, dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); } -template -static std::string Join(First first, Last last, const std::string& delim) -{ - if (first == last) { - return {}; - } - std::ostringstream oss; - oss << *first++; - while (first != last) { - oss << delim << *first++; - } - return oss.str(); -} - -// Only called when `weight_type == INT4` for now -template -void LlamaV2::tune() -{ - - if (auto str = std::getenv("TM_GEMM_IMPORT")) { - std::ifstream ifs(str); - const int n_imported = linear_->Import(ifs); - 
TM_LOG_INFO("[Gemm2] %d records imported", n_imported); - return; - } - - std::vector bss = linear_->GetTuningSeq(); - if (bss.empty()) { - bss = gemm::GenerateTuningSequence(gemm::GetDefaultTuningGenerators()); - } - - { - auto str = Join(bss.begin(), bss.end(), ", "); - TM_LOG_INFO("[Gemm2] Tuning sequence: %s", str.c_str()); - } - - LlamaAttentionWeight& attn = weights_->decoder_layer_weights[0]->self_attn_weights; - LlamaFfnWeight& ffn = weights_->decoder_layer_weights[0]->ffn_weights; - - std::vector*> weights{&attn.qkv, &attn.output, &ffn.output}; - - for (auto& layer : weights_->decoder_layer_weights) { - if (layer->ffn_weights.gating.kernel) { - weights.push_back(&layer->ffn_weights.gating); - break; - } - } - for (auto& layer : weights_->decoder_layer_weights) { - if (layer->ffn_weights.fused_gating_intermediate.kernel) { - weights.push_back(&layer->ffn_weights.fused_gating_intermediate); - break; - } - } - - const int max_bs = *std::max_element(bss.begin(), bss.end()); - int max_in = 0; - int max_out = 0; - for (auto& w : weights) { - max_in = std::max(max_in, w->input_dims); - max_out = std::max(max_out, w->output_dims); - } - - T* in_data = (T*)allocator_->malloc(sizeof(T) * (size_t)max_bs * max_in); - T* out_data = (T*)allocator_->malloc(sizeof(T) * (size_t)max_bs * max_out); - - cudaRandomUniform(in_data, (size_t)max_bs * max_in); - check_cuda_error(cudaDeviceSynchronize()); - - linear_->set_measure(true); - - auto tick = std::chrono::steady_clock::now(); - - for (auto bs : bss) { - TM_LOG_INFO("[Gemm2] %d", bs); - for (auto& w : weights) { - linear_->forward(out_data, in_data, bs, *w); - } - } - - auto tock = std::chrono::steady_clock::now(); - - TM_LOG_INFO("[Gemm2] Tuning finished in %.2f seconds.", - std::chrono::duration>(tock - tick).count()); - - linear_->set_measure(false); - - check_cuda_error(cudaDeviceSynchronize()); - - allocator_->free((void**)&in_data); - allocator_->free((void**)&out_data); - - // Only rank-0 exports the dispatch cache - if (tensor_para_.rank_ == 0) { - if (auto path = std::getenv("TM_GEMM_EXPORT")) { - std::ofstream ofs(path); - const auto n_records = linear_->Export(ofs); - TM_LOG_INFO("[Gemm2] %d records exported.", n_records); - } - } -} - template class LlamaV2; #ifdef ENABLE_FP32 template class LlamaV2; diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index f746c3f625..6321d09d7c 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -45,14 +45,13 @@ class LlamaV2 { LlamaV2(const ModelParam& model, const AttentionParam& attn, + const MoeParam& moe, const LoraParam& lora, const NcclParam& tp, const Context& ctx, int max_batch_size, std::shared_ptr> weights); - void tune(); - size_t vocab_size() const noexcept { return vocab_size_; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index e83d736b23..1ac2d82dd9 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -19,6 +19,7 @@ // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc #include "src/turbomind/models/llama/LlamaWeight.h" +#include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/memory_utils.h" #include @@ -36,6 +37,7 @@ LlamaWeight::LlamaWeight(size_t head_num, WeightType weight_type, int group_size, LoraParam lora_param, + MoeParam moe_param, size_t tensor_para_size, size_t tensor_para_rank): 
hidden_units_(hidden_units), @@ -66,6 +68,7 @@ LlamaWeight::LlamaWeight(size_t head_num, group_size, lora_param, attn_bias, + moe_param, tensor_para_size_, tensor_para_rank_)); } @@ -159,8 +162,15 @@ TensorMap LlamaWeight::getParams() template void LlamaWeight::prepare(const cudaDeviceProp& prop) { - const auto workspace_size = decoder_layer_weights[0]->workspace_size(); - char* workspace{}; + const auto workspace_size = [&] { + size_t size{}; + for (const auto& layer : decoder_layer_weights) { + size = std::max(size, layer->workspace_size()); + } + return size; + }(); + + char* workspace{}; TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d\n", workspace_size); diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index 8c94925ce7..c04bf6c5a6 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -40,6 +40,7 @@ struct LlamaWeight { WeightType weight_type, int group_size, LoraParam lora_param, + MoeParam moe_param, size_t tensor_para_size, size_t tensor_para_rank); diff --git a/src/turbomind/models/llama/context.h b/src/turbomind/models/llama/context.h index bbdab8c6bd..c4d36ccc5c 100644 --- a/src/turbomind/models/llama/context.h +++ b/src/turbomind/models/llama/context.h @@ -5,7 +5,10 @@ #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" +#include +#include #include +#include #include namespace turbomind { @@ -15,11 +18,74 @@ struct Context { cudaStream_t stream; std::unique_ptr> allocator; std::unique_ptr> peer_allocator; + cublasHandle_t cublas_handle; + cublasLtHandle_t cublasLt_handle; std::unique_ptr cublas_algo_map; std::unique_ptr cublas_wrapper_mutex; std::unique_ptr cublas_wrapper; std::unique_ptr> linear; cudaDeviceProp cuda_device_prop; + + Context(int device_id) + { + check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + allocator = std::make_unique>(device_id, false); + allocator->setStream(stream); + + peer_allocator = std::make_unique>(device_id, true); + peer_allocator->setStream(stream); + + cublasCreate(&cublas_handle); + cublasLtCreate(&cublasLt_handle); + cublasSetStream(cublas_handle, stream); + + if (0) { + cublasSetWorkspace(cublas_handle, nullptr, 0); + cublasSetMathMode(cublas_handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); + } + + cublas_algo_map = std::make_unique("gemm_config.in"); + cublas_wrapper_mutex = std::make_unique(); + cublas_wrapper = std::make_unique( + cublas_handle, cublasLt_handle, stream, cublas_algo_map.get(), cublas_wrapper_mutex.get(), allocator.get()); + linear = std::make_unique>(cublas_wrapper.get(), stream); + + check_cuda_error(cudaGetDeviceProperties(&cuda_device_prop, device_id)); + + if (std::is_same::value) { + cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + } +#ifdef ENABLE_FP32 + else if (std::is_same::value) { + cublas_wrapper->setFP32GemmConfig(); + } +#endif +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + cublas_wrapper->setBF16GemmConfig(); + } +#endif + } + + ~Context() + { + linear.reset(); + cublas_wrapper.reset(); + cublas_algo_map.reset(); + + cublasDestroy(cublas_handle); + cublas_handle = {}; + + cublasLtDestroy(cublasLt_handle); + cublasLt_handle = {}; + + peer_allocator.reset(); + allocator.reset(); + + cudaStreamDestroy(stream); + stream = {}; + } }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_params.h 
b/src/turbomind/models/llama/llama_params.h index aa6076de6f..4cb9e27e13 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -25,6 +25,17 @@ struct ModelParam { int end_id; }; +struct MoeParam { + enum Method + { + kNaive, + kFused + } method; + int expert_num; + int experts_per_token; + int inter_size; +}; + struct AttentionParam { int rotary_embedding_dim; float rotary_embedding_base; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 9e2706270e..925c6b8831 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -172,4 +172,10 @@ int64_t& gSequenceIds(int batch_idx) return ids.at(batch_idx); } +bool& isTuning() +{ + thread_local bool value{}; + return value; +} + } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_utils.h b/src/turbomind/models/llama/llama_utils.h index a97b94c371..e50364bbd1 100644 --- a/src/turbomind/models/llama/llama_utils.h +++ b/src/turbomind/models/llama/llama_utils.h @@ -82,4 +82,6 @@ struct NvtxScope { int64_t& gSequenceIds(int batch_idx); +bool& isTuning(); + } // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc new file mode 100644 index 0000000000..def6b04abb --- /dev/null +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -0,0 +1,293 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/llama/moe_ffn_layer.h" +#include "src/turbomind/kernels/activation_kernels.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/llama/LlamaLinear.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/monotonic.h" +#include "src/turbomind/utils/nvtx_utils.h" +#include "src/turbomind/utils/string_utils.h" +#include +#include +#include + +namespace turbomind { + +template +void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) +{ + char* base = 0; + + auto allocate = [&](void* base) { + Monotonic alloc{base}; + alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); + alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * 2); + alloc(&logits_, tokens * param_.expert_num); + alloc(&masks_, param_.expert_num * padded); + alloc(&f2n_, param_.experts_per_token * tokens); + alloc(&en2f_, param_.experts_per_token * tokens); + alloc(&scales_, param_.experts_per_token * tokens); + return (char*)alloc.ptr() - (char*)base; + }; + + const auto workspace_size = allocate(0); + + workspace_ = (char*)allocator_->reMalloc(workspace_, workspace_size); + + allocate(workspace_); +} + +template +void MoeFfnLayer::FreeBuffer() +{ + allocator_->free((void**)&workspace_); + + allocator_->free((void**)&accum_); + allocator_->free((void**)&offsets_); + + allocator_->free((void**)&h_offsets_, true); +} + +template +void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight) +{ + const float alpha = 1.f; + const float beta = 0.f; + cublas_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + weight.output_dims, + tokens, + weight.input_dims, + &alpha, + weight.kernel, + getCudaDataType(), + weight.output_dims, + input, + getCudaDataType(), + hidden_dim_, + &beta, + logits_, + CUDA_R_32F, + weight.output_dims, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +} + +template +void MoeFfnLayer::forward(T* 
inout, int tokens, int layer_id, const MoeFfnWeight& moe) +{ + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + + AllocateBuffer(tokens, padded); + + gate(logits_, inout, tokens, moe.gate); + sync_check_cuda_error(); + + check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); + sync_check_cuda_error(); + + // dump_logits(tokens, layer_id); + + /// TODO: fix illegal memory access even if NaN are present in logits + invokeMoeGate_V2(f2n_, + en2f_, + offsets_, + scales_, + masks_, + accum_, + logits_, + tokens, + padded, + param_.expert_num, + param_.experts_per_token, + stream_); + sync_check_cuda_error(); + + if (isTuning()) { + std::mt19937 g; + const auto expert_ids = SampleUniform(tokens, param_.expert_num, param_.experts_per_token, g); + std::vector cnt(param_.expert_num); + for (const auto& x : expert_ids) { + ++cnt[x]; + } + h_offsets_[0] = 0; + for (int i = 0; i < param_.expert_num; ++i) { + h_offsets_[i + 1] = h_offsets_[i] + cnt[i]; + } + check_cuda_error( + cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + } + + if (param_.method == MoeParam::kNaive) { + + dispatchMoeGather(inout_buf_, inout, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); + sync_check_cuda_error(); + + check_cuda_error( + cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + + check_cuda_error(cudaStreamSynchronize(stream_)); + + if (h_offsets_[param_.expert_num] != tokens * param_.experts_per_token) { + FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[param_.expert_num], tokens * param_.experts_per_token)); + } + + for (int i = 0; i < param_.expert_num; ++i) { + + FT_CHECK(moe.experts[i].is_fused_silu == false); + + if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { + auto io = inout_buf_ + h_offsets_[i] * hidden_dim_; + + TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {count, hidden_dim_}, io}}, + {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; + TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, dtype_, {count, hidden_dim_}, io}}}; + + expert_ffn_->forward(&ffn_outputs, &ffn_inputs, &moe.experts[i]); + } + } + } + else { + context_->set_offsets(offsets_); + + auto& block = moe.block; + +#if 0 + FT_CHECK(!block.is_fused_silu); + for (int i = 0; i < param_.expert_num; ++i) { + if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { + cublas_->Gemm(CUBLAS_OP_T, // (m, k) W + CUBLAS_OP_N, // (k, n) X + inter_size_ * 2, + count, + hidden_dim_, + moe.experts[i].fused_gating_intermediate.kernel, + hidden_dim_, + inout_buf_ + h_offsets_[i] * hidden_dim_, + hidden_dim_, + inter_buf_ + h_offsets_[i] * inter_size_ * 2, + inter_size_ * 2); + sync_check_cuda_error(); + } + } + auto mode = kCmpWrite; +#else + linear_->forward_moe(inter_buf_, + {inout, (int)hidden_dim_}, + f2n_, + offsets_, + tokens * param_.experts_per_token, + block.fused_gating_intermediate, + block.is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm, + context_.get()); + sync_check_cuda_error(); + auto mode = kCmpRead; +#endif + + // if (tensor_para_.rank_ == 0) { + // Compare(inter_buf_, // + // tokens * param_.experts_per_token * inter_size_ * 2, + // "inter_buf", + // mode, + // stream_); + // } + + if (!block.is_fused_silu) { + invokeGenericActivation_v2(inter_buf_, + inter_buf_ + inter_size_, + inter_size_ * 2, + tokens * param_.experts_per_token, + inter_size_, + stream_); + sync_check_cuda_error(); + } + +#if 0 + for (int i = 0; i < param_.expert_num; ++i) { + if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { + cublas_->Gemm(CUBLAS_OP_T, // (m, k) W + CUBLAS_OP_N, // (k, n) X + hidden_dim_, + count, + inter_size_, + moe.experts[i].output.kernel, + inter_size_, + inter_buf_ + h_offsets_[i] * inter_size_ * 2, + inter_size_ * 2, + inout_buf_ + h_offsets_[i] * hidden_dim_, + hidden_dim_); + sync_check_cuda_error(); + } + } + auto mode1 = kCmpWrite; +#else + linear_->forward_moe(inout_buf_, + {inter_buf_, block.is_fused_silu ? (int)inter_size_ : (int)inter_size_ * 2}, + nullptr, + offsets_, + tokens * param_.experts_per_token, + block.output, + LlamaLinear::kGemm, + context_.get()); + sync_check_cuda_error(); + auto mode1 = kCmpRead; +#endif + + // if (tensor_para_.rank_ == 0) { + // Compare(inter_buf_2_, // + // tokens * param_.experts_per_token * inter_size_, + // "inter_buf_2_", + // mode1, + // stream_); + // Compare(inout_buf_, // + // tokens * param_.experts_per_token * hidden_dim_, + // "inout_buf", + // mode1, + // stream_); + // } + } + + invokeMoeReduce(inout, inout_buf_, scales_, en2f_, tokens, param_.experts_per_token, hidden_dim_, stream_); + sync_check_cuda_error(); + + if (tensor_para_.world_size_ > 1) { + ftNcclAllReduceSum(inout, inout, tokens * hidden_dim_, tensor_para_, stream_); + sync_check_cuda_error(); + } + + // if (tensor_para_.rank_ == 0) { + // check_cuda_error(cudaStreamSynchronize(stream_)); + // std::abort(); + // } +} + +template +void MoeFfnLayer::dump_logits(int token_num, int layer_id) +{ + std::vector logits(token_num * param_.expert_num); + check_cuda_error( + cudaMemcpyAsync(logits.data(), logits_, sizeof(float) * logits.size(), cudaMemcpyDefault, stream_)); + check_cuda_error(cudaStreamSynchronize(stream_)); + + auto ptr = logits.data(); + std::cout << "layer_id: " << layer_id << std::endl; + for (int i = 0; i < token_num; ++i) { + for (int e = 0; e < param_.expert_num; ++e) { + std::cout << *ptr++ << " "; + } + std::cout << std::endl; + } +} + +#ifdef ENABLE_FP32 +template class MoeFfnLayer; +#endif +template class MoeFfnLayer; +#ifdef ENABLE_BF16 +template class MoeFfnLayer<__nv_bfloat16>; +#endif + +} // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h new file mode 100644 index 0000000000..ef65aaa464 --- /dev/null +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -0,0 +1,92 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
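The fused path above runs gate(), invokeMoeGate_V2, the two grouped forward_moe GEMMs and finally invokeMoeReduce, which merges the gathered expert outputs back per token using scales_ as mixing weights. As a reference for the per-token routing math behind those kernels, here is a minimal CPU sketch (not TurboMind code, assuming Mixtral-style softmax-then-top-k renormalization; names are illustrative):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

// logits: router scores of one token over all experts.
// Returns (expert_id, weight) for the experts_per_token selected experts;
// the weights play the role of scales_ passed to invokeMoeReduce.
std::vector<std::pair<int, float>> route_token(const std::vector<float>& logits,
                                               int experts_per_token)
{
    // Assumption: softmax over all experts, then keep the top-k and renormalize.
    const float m = *std::max_element(logits.begin(), logits.end());
    std::vector<float> p(logits.size());
    float sum = 0.f;
    for (std::size_t i = 0; i < logits.size(); ++i) {
        p[i] = std::exp(logits[i] - m);
        sum += p[i];
    }
    for (auto& x : p) {
        x /= sum;
    }

    std::vector<int> order(p.size());
    std::iota(order.begin(), order.end(), 0);
    std::partial_sort(order.begin(), order.begin() + experts_per_token, order.end(),
                      [&](int a, int b) { return p[a] > p[b]; });

    float topk_sum = 0.f;
    for (int i = 0; i < experts_per_token; ++i) {
        topk_sum += p[order[i]];
    }
    std::vector<std::pair<int, float>> out;
    for (int i = 0; i < experts_per_token; ++i) {
        out.emplace_back(order[i], p[order[i]] / topk_sum);
    }
    return out;
}

The CUDA kernels additionally emit f2n_, en2f_ and the per-expert offsets_ so the grouped GEMMs can consume tokens in expert order; the header declared below only owns the buffers, while the routing kernels come from the included gemm/moe_utils_v2.h.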
+ +#pragma once + +#include "src/turbomind/kernels/gemm/context.h" +#include "src/turbomind/kernels/gemm/moe_utils_v2.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/models/llama/LlamaFfnLayer.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/cublasMMWrapper.h" +#include "src/turbomind/utils/nccl_utils.h" + +namespace turbomind { + +template +class MoeFfnLayer { +public: + MoeFfnLayer(ModelParam model, const MoeParam& param, const NcclParam& tp, const Context& ctx): + inter_size_(param.inter_size / tp.world_size_), + hidden_dim_(model.hidden_units), + param_(param), + dtype_(getTensorType()), + tensor_para_(tp), + stream_(ctx.stream), + cublas_(ctx.cublas_wrapper.get()), + linear_(ctx.linear.get()), + allocator_(ctx.allocator.get()) + { + model.inter_size = param.inter_size; + + if (param_.method == MoeParam::kFused) { + context_ = std::make_unique( + param.expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); + } + else { + expert_ffn_ = std::make_unique>(model, tp, ctx, false); + } + + h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1), false, true); + + offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1)); + accum_ = (int*)allocator_->malloc(sizeof(int) * param_.expert_num * kMoeGateMaxTiles); + } + + void AllocateBuffer(size_t tokens, size_t padded); + + void FreeBuffer(); + + ~MoeFfnLayer() + { + FreeBuffer(); + } + + void forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe); + + void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); + + void dump_logits(int token_num, int layer_id); + +private: + const size_t inter_size_; + const size_t hidden_dim_; + const MoeParam param_; + const DataType dtype_; + const NcclParam tensor_para_; + cudaStream_t const stream_; + cublasMMWrapper* const cublas_; + LlamaLinear* const linear_; + IAllocator* const allocator_; + + std::unique_ptr> expert_ffn_; + std::unique_ptr context_; + + int* h_offsets_{}; + + char* workspace_{}; + + T* inout_buf_{}; // [n * e, hidden_dim] + T* inter_buf_{}; // [n * e, inter_size] + + float* logits_{}; + int* masks_{}; + + int* f2n_{}; + int* en2f_{}; + float* scales_{}; + + int* accum_{}; + int* offsets_{}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index cdd27a5c60..f38151a1a5 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -335,7 +335,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa check_cuda_error(cudaStreamWaitEvent(aux_stream_, qkv_event_)); } - if (pf_batch_size) { + if (pf_batch_size && !isTuning()) { const int offset = dc_batch_size; const int sum_k_len = h_cu_k_len[offset + pf_batch_size] - h_cu_k_len[offset]; // We are executing prefill & decoding kernels concurrently, but only have 1 workspace @@ -354,7 +354,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa } } - if (dc_batch_size) { + if (dc_batch_size && !isTuning()) { auto params = CreateParams(0, dc_batch_size, kMaxKVSplits, dc_stream); if constexpr (sizeof(T) == 2) { dispatchDecoding(params); @@ -373,6 +373,11 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); // } + if (isTuning()) { + 
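        // The attention kernels above are skipped while isTuning() is set, so
        // qkv_buf_3_ would otherwise be left uninitialized; it is filled with small
        // uniform noise instead, presumably so the output projection GEMM still sees
        // finite values while its configurations are being measured.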
rng_.set_stream(stream_); + rng_.GenerateUniform(qkv_buf_3_, token_num * weights->output.input_dims, .02f, -.01f); + } + count_and_fix(qkv_buf_3_, token_num * weights->output.input_dims, Concat("attn", layer_id), 3); ////////////////////////////////////////////// diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index 19aa08c29f..da0c0e6fc8 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -21,6 +21,7 @@ #pragma once +#include "src/turbomind/kernels/gemm/test/test_utils.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" @@ -133,6 +134,8 @@ class UnifiedAttentionLayer { std::array streams_; + RNG rng_; + T* qkv_buf_{}; T* q_buf_2_{}; T* k_buf_2_{}; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index e1fa9efbde..68392215f6 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -3,6 +3,7 @@ #include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" @@ -13,6 +14,7 @@ namespace turbomind { template UnifiedDecoder::UnifiedDecoder(const ModelParam& model, const AttentionParam& attn, + const MoeParam& moe, const LoraParam& lora, const NcclParam& tp, const Context& ctx): @@ -24,8 +26,9 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, dtype_(getTensorType()) { - attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - ffn_layer_ = std::make_unique>(model, tp, ctx); + attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); + ffn_layer_ = std::make_unique>(model, tp, ctx, true); + moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); } @@ -151,6 +154,11 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con for (size_t layer = 0; layer < layer_num_; ++layer) { + /// TODO: do not skip the layers when they are heterogeneous + if (isTuning() && layer != 0) { + continue; + } + // Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); ///////////////////////////////////////////// @@ -180,15 +188,20 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con //////////////////////////////////////////// /// feed-forward network - int layer_id = layer; // int is needed - TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; - TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}}; - if (inputs->isExist("lora_mask")) { - ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); - } - ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); + if (!weights->at(layer)->moe_weights.experts.empty()) { + moe_ffn_layer_->forward(decoder_output, token_num, layer, weights->at(layer)->moe_weights); + } + else { + int layer_id = layer; // int is needed + TensorMap 
ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, + {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; + TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}}; + if (inputs->isExist("lora_mask")) { + ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); + } + ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); + } count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index 95a36d2a8b..f13b4ba842 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -4,6 +4,7 @@ #include "src/turbomind/models/llama/LlamaFfnLayer.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" @@ -32,6 +33,7 @@ class UnifiedDecoder { std::unique_ptr> attn_layer_; std::unique_ptr> ffn_layer_; + std::unique_ptr> moe_ffn_layer_; cudaEvent_t ev_h_cu_x_{}; @@ -48,6 +50,7 @@ class UnifiedDecoder { public: UnifiedDecoder(const ModelParam& model, const AttentionParam& attn, + const MoeParam& moe, const LoraParam& lora, const NcclParam& tp, const Context& ctx); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 7829a4924b..44f73370da 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -18,21 +18,47 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.cc -#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" +#include +#include + +#include +#include + #include "src/turbomind/models/llama/LlamaDenseWeight.h" -#include "src/turbomind/models/llama/LlamaInstanceComm.h" -#include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include namespace ft = turbomind; +static std::optional get_moe_method() +{ + static const auto value = []() -> std::optional { + const auto p = std::getenv("TM_MOE_METHOD"); + if (p) { + std::string str(p); + for (auto& x : str) { + x = std::tolower(x); + } + if (str == "naive") { + return ft::MoeParam::kNaive; + } + else if (str == "fused") { + return ft::MoeParam::kFused; + } + else { + std::cerr << "[WARNING] unrecognised MoE method: " << str << "\n"; + } + } + return {}; + }(); + return value; +} + std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string config_file) { YAML::Node reader; @@ -261,14 +287,18 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - 
lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); - lora_param_.r = lora_reader["lora_r"].as(0); - lora_param_.scale = lora_reader["lora_scale"].as(0); - lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); - lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), + lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.r = lora_reader["lora_r"].as(0); + lora_param_.scale = lora_reader["lora_scale"].as(0); + lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); + lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), [](const std::string& s) { return std::stoi(s); }); - lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), + lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), [](const std::string& s) { return std::stof(s); }); + moe_param_.expert_num = model_reader["expert_num"].as(0); + moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); + moe_param_.inter_size = model_reader["expert_inter_size"].as(0); + handleMissingParams(); shared_state_ = std::make_shared(); @@ -298,6 +328,19 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, ft::FT_CHECK(0); } + if (auto method = get_moe_method()) { + moe_param_.method = *method; + } + else { + moe_param_.method = ft::MoeParam::kFused; + // Note: This will fail when GPUs of different SMs are mixed + if (weight_type_ != ft::WeightType::kINT4 && ft::getSMVersion() >= 90) { + // On sm90 the cuBLAS method may be faster as our grouped GEMM is not + // optimized for GMMA yet + moe_param_.method = ft::MoeParam::kNaive; + } + } + TM_LOG_INFO("%s", toString().c_str()); } @@ -311,48 +354,7 @@ std::unique_ptr> LlamaTritonModel::createSharedModelInstance( ft::check_cuda_error(cudaSetDevice(device_id)); const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_); - auto ctx = std::make_unique>(); - - ft::check_cuda_error(cudaStreamCreateWithFlags(&ctx->stream, cudaStreamNonBlocking)); - - ctx->allocator = std::make_unique>(device_id, false); - ctx->allocator->setStream(ctx->stream); - - ctx->peer_allocator = std::make_unique>(device_id, true); - ctx->peer_allocator->setStream(ctx->stream); - - cublasHandle_t cublas_handle; - cublasLtHandle_t cublaslt_handle; - - cublasCreate(&cublas_handle); - cublasLtCreate(&cublaslt_handle); - cublasSetStream(cublas_handle, ctx->stream); - - ctx->cublas_algo_map = std::make_unique("gemm_config.in"); - ctx->cublas_wrapper_mutex = std::make_unique(); - ctx->cublas_wrapper = std::make_unique(cublas_handle, - cublaslt_handle, - ctx->stream, - ctx->cublas_algo_map.get(), - ctx->cublas_wrapper_mutex.get(), - ctx->allocator.get()); - ctx->linear = std::make_unique>(ctx->cublas_wrapper.get(), ctx->stream); - - ft::check_cuda_error(cudaGetDeviceProperties(&ctx->cuda_device_prop, device_id)); - - if (std::is_same::value) { - ctx->cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); - } -#ifdef ENABLE_FP32 - else if (std::is_same::value) { - ctx.cublas_wrapper->setFP32GemmConfig(); - } -#endif -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - ctx->cublas_wrapper->setBF16GemmConfig(); - } -#endif + auto ctx = std::make_unique>(device_id); ft::NcclParam tensor_para = nccl_params.first[comms_rank]; ft::NcclParam pipeline_para = nccl_params.second[comms_rank]; @@ -362,6 +364,7 @@ std::unique_ptr> LlamaTritonModel::createSharedModelInstance( auto model = 
std::make_unique>(model_param_, // attn_param_, + moe_param_, lora_param_, tensor_para, *ctx, @@ -416,6 +419,7 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) weight_type_, group_size_, lora_param_, + moe_param_, tensor_para_size_, tensor_para_rank); // model inited with model_dir @@ -462,9 +466,7 @@ void LlamaTritonModel::createEngine(int auto engine = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); engine->set_ffi_lock(ffi_lock_); - if (weight_type_ == ft::WeightType::kINT4) { - engine->model().tune(); - } + engine->tune(); engines_[device_id] = std::move(engine); } @@ -489,7 +491,8 @@ std::string LlamaTritonModel::toString() << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy - << "\ngroup_size: " << group_size_ << std::endl; + << "\ngroup_size: " << group_size_ << "\nexpert_num: " << moe_param_.expert_num + << "\nexpert_per_token: " << moe_param_.experts_per_token << "\nmoe_method: " << moe_param_.method << std::endl; return ss.str(); } @@ -505,7 +508,7 @@ void LlamaTritonModel::createCustomComms( template std::unique_ptr LlamaTritonModel::createInstanceComm(int size) { - return std::make_unique(size); + return nullptr; } template diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 1a069fcac5..19a143e721 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -86,6 +86,7 @@ struct LlamaTritonModel: public AbstractTransformerModel { ft::ModelParam model_param_; ft::AttentionParam attn_param_; + ft::MoeParam moe_param_; ft::LoraParam lora_param_; ft::EngineParam engine_param_; size_t tensor_para_size_; diff --git a/src/turbomind/utils/cublasMMWrapper.cc b/src/turbomind/utils/cublasMMWrapper.cc index eac6dc1e26..cd70298b64 100644 --- a/src/turbomind/utils/cublasMMWrapper.cc +++ b/src/turbomind/utils/cublasMMWrapper.cc @@ -323,7 +323,6 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa, ldc, computeType_, static_cast(cublasAlgo))); - sync_check_cuda_error(); } mu_->unlock(); } diff --git a/src/turbomind/utils/monotonic.h b/src/turbomind/utils/monotonic.h new file mode 100644 index 0000000000..ef948c88f0 --- /dev/null +++ b/src/turbomind/utils/monotonic.h @@ -0,0 +1,43 @@ +#pragma once + +#include +#include +#include + +namespace turbomind { + +class Monotonic { +public: + Monotonic(void* base, size_t alignment = 256): ptr_{base}, alignment_{alignment} + { + ptr_ = align(ptr_); + } + + template + void operator()(T** ptr, size_t numel) noexcept + { + *ptr = (T*)std::exchange(ptr_, align((T*)ptr_ + numel)); + } + + void* ptr() const noexcept + { + return ptr_; + } + +private: + template + void* align(T* p) + { + static_assert(sizeof(T*) == sizeof(uintptr_t)); + auto x = reinterpret_cast(p); + if (auto remainder = x % alignment_) { + x += alignment_ - remainder; + } + return reinterpret_cast(x); + } + + void* ptr_; + size_t alignment_; +}; + +} // namespace turbomind diff --git a/tests/test_lmdeploy/test_auto_backend.py b/tests/test_lmdeploy/test_auto_backend.py index d8c74f6f81..3dfcac292a 100644 --- a/tests/test_lmdeploy/test_auto_backend.py +++ b/tests/test_lmdeploy/test_auto_backend.py @@ -34,7 +34,7 @@ def models(self): ('01-ai/Yi-34B-Chat', True, True), 
('codellama/CodeLlama-7b-Instruct-hf', True, True), ('mistralai/Mistral-7B-Instruct-v0.1', True, True), - ('mistralai/Mixtral-8x7B-Instruct-v0.1', True, False), + ('mistralai/Mixtral-8x7B-Instruct-v0.1', True, True), ('Qwen/Qwen-7B-Chat', True, True), ('Qwen/Qwen-VL-Chat', False, True), ('Qwen/Qwen1.5-4B-Chat', True, True), @@ -64,22 +64,3 @@ def test_autoget_backend(self, turbomind_workspace, models): target = 'turbomind' if is_support_turbomind else 'pytorch' backend = autoget_backend(model) assert backend == target - - def test_autoget_backend_config(self, turbomind_workspace): - from lmdeploy.archs import autoget_backend_config - from lmdeploy.messages import (PytorchEngineConfig, - TurbomindEngineConfig) - assert type(autoget_backend_config( - turbomind_workspace)) is TurbomindEngineConfig - assert type(autoget_backend_config( - 'internlm/internlm-chat-7b')) is TurbomindEngineConfig - assert type( - autoget_backend_config( - 'mistralai/Mixtral-8x7B-Instruct-v0.1')) is PytorchEngineConfig - backend_config = TurbomindEngineConfig(max_batch_size=64, - cache_block_seq_len=128) - config = autoget_backend_config('mistralai/Mixtral-8x7B-Instruct-v0.1', - backend_config) - assert type(config) is PytorchEngineConfig - assert config.max_batch_size == 64 - assert config.block_size == 128 diff --git a/tests/test_lmdeploy/test_turbomind/test_converter.py b/tests/test_lmdeploy/test_turbomind/test_converter.py index 3548eac7d0..66f7498b73 100644 --- a/tests/test_lmdeploy/test_turbomind/test_converter.py +++ b/tests/test_lmdeploy/test_turbomind/test_converter.py @@ -42,7 +42,7 @@ def test_registered_models(): model_format=model_format) assert input_name in list(INPUT_MODELS.module_dict.keys()) - output_name, config, _ = get_output_model_registered_name_and_config( + output_name, config = get_output_model_registered_name_and_config( model, model_format=model_format, dtype='auto', group_size=0) assert output_name == register_name assert config.model_config.group_size == group_size @@ -53,7 +53,7 @@ def test_registered_models(): def test_update_from_engine_config(): import copy - _, _config, _ = get_output_model_registered_name_and_config( + _, _config = get_output_model_registered_name_and_config( 'internlm/internlm2-chat-7b', model_format='hf', dtype='auto', @@ -95,14 +95,14 @@ def test_dtype(): testsets = [('auto', 'bfloat16'), ('float16', 'float16'), ('bfloat16', 'bfloat16')] for specified_dtype, expected_dtype in testsets: - _, _config, _ = get_output_model_registered_name_and_config( + _, _config = get_output_model_registered_name_and_config( 'internlm/internlm2-chat-7b', model_format='hf', dtype=specified_dtype, group_size=0) assert _config.weight_type == expected_dtype for specified_dtype in ['auto', 'float16', 'bfloat16']: - _, _config, _ = get_output_model_registered_name_and_config( + _, _config = get_output_model_registered_name_and_config( 'internlm/internlm2_5-20b-chat-4bit-awq', model_format='awq', dtype=specified_dtype, From 67a027648fe91b46d2be1f69f0d3076c085f2cb3 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Fri, 25 Oct 2024 19:03:49 +0800 Subject: [PATCH 039/122] miss device_type when checking is_bf16_supported on ascend platform (#2663) --- lmdeploy/pytorch/check_env/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index 6250997253..7d72438224 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -201,7 +201,7 @@ def 
__check_model_transformers_version(config, trans_version): f'but transformers {trans_version} is installed.') _handle_exception(e, 'transformers', logger, message=message) - def __check_model_dtype_support(config): + def __check_model_dtype_support(config, device_type): """Checking model dtype support.""" logger.debug('Checking dtype support.') @@ -215,7 +215,7 @@ def __check_model_dtype_support(config): model_path=model_path, dtype=dtype) if model_config.dtype == torch.bfloat16: - assert is_bf16_supported(), ( + assert is_bf16_supported(device_type), ( 'bf16 is not supported on your device') except AssertionError as e: message = ( @@ -234,7 +234,7 @@ def __check_model_dtype_support(config): _, trans_version = __check_transformers_version() config = __check_config(trans_version) __check_model_transformers_version(config, trans_version) - __check_model_dtype_support(config) + __check_model_dtype_support(config, device_type) check_awq(config, device_type) From 538f618743321e83a0d5531f847f0b43cd7238e7 Mon Sep 17 00:00:00 2001 From: CyCle1024 Date: Fri, 25 Oct 2024 19:12:47 +0800 Subject: [PATCH 040/122] fix syntax in Dockerfile_aarch64_ascend (#2664) --- docker/Dockerfile_aarch64_ascend | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile_aarch64_ascend b/docker/Dockerfile_aarch64_ascend index f9c07cb69c..f4ecd97bcb 100644 --- a/docker/Dockerfile_aarch64_ascend +++ b/docker/Dockerfile_aarch64_ascend @@ -87,7 +87,7 @@ RUN --mount=type=cache,target=/tmp,from=build_temp,source=/tmp \ chmod +x $TOOLKIT_PKG $KERNELS_PKG $NNAL_PKG && \ ./$TOOLKIT_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all $CHIPOPTION && \ ./$KERNELS_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all && \ - ./$NNAL_PKG --quiet --install --install-path=$ASCEND_BASE && + ./$NNAL_PKG --quiet --install --install-path=$ASCEND_BASE && \ rm -f $TOOLKIT_PKG $KERNELS_PKG $NNAL_PKG ENV GLOG_v=2 \ From a8b2765c3897330bb054051595b1984e1569288d Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Mon, 28 Oct 2024 12:32:12 +0800 Subject: [PATCH 041/122] Set history_cross_kv_seqlens to 0 by default (#2666) --- lmdeploy/pytorch/messages.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py index 6331577548..b16a78f1f4 100644 --- a/lmdeploy/pytorch/messages.py +++ b/lmdeploy/pytorch/messages.py @@ -385,7 +385,7 @@ class SchedulerSequence: mrope_position_ids: Optional[Tensor] = None mrope_position_delta: Optional[int] = None cross_attention_states: Optional[Tensor] = None - history_cross_kv_seqlens: Optional[int] = None + history_cross_kv_seqlens: int = 0 def __post_init__(self): """post init.""" @@ -489,12 +489,11 @@ def num_all_tokens(self): def num_all_cross_tokens(self): """num of all cross tokens.""" - if self.history_cross_kv_seqlens is None: - if self.cross_attention_states is None: - self.history_cross_kv_seqlens = 0 - else: - self.history_cross_kv_seqlens = self.cross_attention_states.shape[ # noqa - -2] + if self.cross_attention_states is None: + self.history_cross_kv_seqlens = 0 + else: + self.history_cross_kv_seqlens = self.cross_attention_states.shape[ + -2] return self.history_cross_kv_seqlens def update_token_ids(self, From 1c2273190431ade1b434ea80e3f78e9110996eec Mon Sep 17 00:00:00 2001 From: CyCle1024 Date: Mon, 28 Oct 2024 14:13:07 +0800 Subject: [PATCH 042/122] fix build error in ascend dockerfile (#2667) --- 
docker/Dockerfile_aarch64_ascend | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile_aarch64_ascend b/docker/Dockerfile_aarch64_ascend index f4ecd97bcb..e125ef92cb 100644 --- a/docker/Dockerfile_aarch64_ascend +++ b/docker/Dockerfile_aarch64_ascend @@ -73,7 +73,7 @@ $LD_LIBRARY_PATH ARG CHIP=all ARG TOOLKIT_PKG=Ascend-cann-toolkit_*.run ARG KERNELS_PKG=Ascend-cann-kernels-*.run -ARG NNAL_PKG=Ascend-nnal_*.run +ARG NNAL_PKG=Ascend-cann-nnal_*.run RUN --mount=type=cache,target=/tmp,from=build_temp,source=/tmp \ umask 0022 && \ @@ -87,6 +87,7 @@ RUN --mount=type=cache,target=/tmp,from=build_temp,source=/tmp \ chmod +x $TOOLKIT_PKG $KERNELS_PKG $NNAL_PKG && \ ./$TOOLKIT_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all $CHIPOPTION && \ ./$KERNELS_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all && \ + . /usr/local/Ascend/ascend-toolkit/set_env.sh && \ ./$NNAL_PKG --quiet --install --install-path=$ASCEND_BASE && \ rm -f $TOOLKIT_PKG $KERNELS_PKG $NNAL_PKG From 39de575065c3b19f1e071782a3dfa723f7d2182d Mon Sep 17 00:00:00 2001 From: Willow <523814299@qq.com> Date: Mon, 28 Oct 2024 14:20:26 +0800 Subject: [PATCH 043/122] bugfix: llava-hf/llava-interleave-qwen-7b-hf (#2657) - fix init raise exception because tie_word_embeddings config --- lmdeploy/vl/model/llava_hf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/vl/model/llava_hf.py index 66faf4f467..c2a0e4afa0 100644 --- a/lmdeploy/vl/model/llava_hf.py +++ b/lmdeploy/vl/model/llava_hf.py @@ -31,13 +31,15 @@ def build_model(self): else: self.vl_model = model + # fix for llava-hf/llava-interleave-qwen-7b-hf + setattr(model.config, "tie_word_embeddings", False) with disable_logging(): load_checkpoint_and_dispatch( model=model, max_memory=self.max_memory, checkpoint=self.model_path, device_map='auto' if not self.with_llm else {'': 'cpu'}, - no_split_module_classes=['CLIPEncoderLayer'], + no_split_module_classes=['CLIPEncoderLayer', 'SiglipEncoderLayer'], dtype=torch.half) model.eval() self.model = model From a41a2a29713121b4f850a3a8d4b75f36656cb1d2 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Mon, 28 Oct 2024 15:18:11 +0800 Subject: [PATCH 044/122] fix inference mode error for qwen2-vl (#2668) --- lmdeploy/pytorch/models/qwen2_vl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/models/qwen2_vl.py b/lmdeploy/pytorch/models/qwen2_vl.py index ced57f79f4..1a1dc1e1da 100644 --- a/lmdeploy/pytorch/models/qwen2_vl.py +++ b/lmdeploy/pytorch/models/qwen2_vl.py @@ -388,10 +388,11 @@ def forward( inputs_embeds=inputs_embeds, mrope_position_ids=mrope_position_ids, ) + return hidden_states - logits = self.lm_head(hidden_states) - logits = logits.float() - return logits + def get_logits(self, hidden_states: torch.Tensor): + """compute logits of the model output.""" + return self.lm_head(hidden_states) def update_weights(self): """update weights.""" From f5189ce0861bd890c312d938b968a0c8dff57b8a Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Mon, 28 Oct 2024 17:10:01 +0800 Subject: [PATCH 045/122] fix supported model list of ascend graph mode (#2669) --- docs/en/get_started/ascend/get_started.md | 2 +- docs/zh_cn/get_started/ascend/get_started.md | 2 +- .../backends/dlinfer/ascend/graph_runner.py | 30 +++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index 
c737185420..402ac50fbf 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -49,7 +49,7 @@ For more information about running the Docker client on Ascend devices, please r ## Offline batch inference > \[!TIP\] -> Graph mode has been supported on Atlas 800T A2. Currently, InternLM2-7B/LLaMa2-7B/Qwen2-7B are tested on graph mode. +> Graph mode has been supported on Atlas 800T A2. Currently, LLaMa3-8B/LLaMa2-7B/Qwen2-7B are tested on graph mode. > Users can set `eager_mode=False` to enable graph mode, or, set `eager_mode=True` to disable graph mode. > (Please source `/usr/local/Ascend/nnal/atb/set_env.sh` before enabling graph mode) diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md index cde7409a36..78bd8383d4 100644 --- a/docs/zh_cn/get_started/ascend/get_started.md +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -49,7 +49,7 @@ docker run -e ASCEND_VISIBLE_DEVICES=0 --rm --name lmdeploy -t lmdeploy-aarch64- ## 离线批处理 > \[!TIP\] -> 图模式已经支持了Atlas 800T A2。目前,单卡下的InternLM2-7B/LLaMa2-7B/Qwen2-7B已经通过测试。用户可以设定`eager_mode=False`来开启图模式,或者设定`eager_mode=True`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) +> 图模式已经支持了Atlas 800T A2。目前,单卡下的LLaMa3-8B/LLaMa2-7B/Qwen2-7B已经通过测试。用户可以设定`eager_mode=False`来开启图模式,或者设定`eager_mode=True`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) ### LLM 推理 diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py index 3ecc4223bd..7dbb86d4b6 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py @@ -22,6 +22,7 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, super().__init__(model, model_config, cache_config, backend_config, device) + self.supported_model = ['Llama3-8B', 'Llama2-7B', 'Qwen2-7B'] self.enable_graph = self.check_enable_graph() if self.enable_graph: import dlinfer.graph @@ -44,21 +45,20 @@ def check_enable_graph(self): "Graph mode of device_type 'ascend' only supports tp=1 " 'for now, fallback to eager mode', RuntimeWarning) return False - # model support - self.supported_model = { - 'Llama2': 'LlamaConfig', - 'InternLM2': 'InternLM2Config', - 'Qwen2': 'Qwen2Config', - } - is_model_support = True - model_config_name = str(type(self.model_config.hf_config).__name__) - if model_config_name not in self.supported_model.values(): - is_model_support = False - if not is_model_support: - warnings.warn( - "Graph mode of device_type 'ascend' only supports models: " - f"{', '.join(self.supported_model.keys())} when tp=1 for now", - RuntimeWarning) + + warnings.warn( + '\n\n' + '**********************************************************\n' + ' The following models were tested in graph mode of\n' + " device_type 'ascend' when tp=1:\n" + f" {', '.join(self.supported_model)}\n" + ' Other LLaMa-like models may work in graph mode, please\n' + ' check the result yourself!\n' + ' If graph mode does not work correctly with your model,\n' + ' please use eager mode instead.\n' + '**********************************************************\n\n', + RuntimeWarning) + return True def patch_kernels_custom_op(self): From a07e65e9329bec2f93c5a121a8da49d8a169f34b Mon Sep 17 00:00:00 2001 From: CyCle1024 Date: Mon, 28 Oct 2024 19:06:53 +0800 Subject: [PATCH 046/122] remove dlinfer version (#2672) --- docker/Dockerfile_aarch64_ascend | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/docker/Dockerfile_aarch64_ascend b/docker/Dockerfile_aarch64_ascend index e125ef92cb..1c9591197b 100644 --- a/docker/Dockerfile_aarch64_ascend +++ b/docker/Dockerfile_aarch64_ascend @@ -110,7 +110,7 @@ RUN echo "source /usr/local/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc && \ RUN --mount=type=cache,target=/root/.cache/pip \ pip3 install torch==2.3.1 torchvision==0.18.1 torch-npu==2.3.1 && \ pip3 install transformers timm && \ - pip3 install dlinfer-ascend==0.1.1 + pip3 install dlinfer-ascend # lmdeploy FROM build_temp as copy_temp From 522108c64cd854f0e5d555f4eb7c761aa9eae377 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Tue, 29 Oct 2024 14:40:39 +0800 Subject: [PATCH 047/122] Bump version to v0.6.2 (#2659) * bump version to v0.6.2 * update supported-models list * update news * update * update * update * upate * merge main and fix lint --- README.md | 2 + README_ja.md | 1 + README_zh-CN.md | 2 + docs/en/get_started/installation.md | 2 +- docs/en/supported_models/supported_models.md | 63 ++++++++++--------- docs/zh_cn/get_started/installation.md | 2 +- .../supported_models/supported_models.md | 63 ++++++++++--------- lmdeploy/version.py | 2 +- lmdeploy/vl/model/llava_hf.py | 6 +- 9 files changed, 76 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 61c0eba45b..6ca5fadedd 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ ______________________________________________________________________
    2024
+- \[2024/10\] PyTorchEngine supports graph mode on ascend platform, doubling the inference speed
 - \[2024/09\] LMDeploy PyTorchEngine adds support for [Huawei Ascend](./docs/en/get_started/ascend/get_started.md). See supported models [here](docs/en/supported_models/supported_models.md)
 - \[2024/09\] LMDeploy PyTorchEngine achieves 1.3x faster on Llama3-8B inference by introducing CUDA graph
 - \[2024/08\] LMDeploy is integrated into [modelscope/swift](https://github.com/modelscope/swift) as the default accelerator for VLMs inference
@@ -162,6 +163,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
  • Phi-3-vision (4.2B)
  • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
+ • Llama3.2-vision (11B, 90B)
diff --git a/README_ja.md b/README_ja.md
index 999ebc9f0b..df4647d868 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -160,6 +160,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • Phi-3-vision (4.2B)
  • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
+ • Llama3.2-vision (11B, 90B)
diff --git a/README_zh-CN.md b/README_zh-CN.md
index f002899c60..663b7b24ab 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -26,6 +26,7 @@ ______________________________________________________________________
    2024
+- \[2024/10\] PyTorchEngine 在 ascend 平台上支持了图模式,推理性能提高了 1 倍
 - \[2024/09\] LMDeploy PyTorchEngine 增加了对 [华为 Ascend](docs/zh_cn/get_started/ascend/get_started.md) 的支持。支持的模型请见[这里](docs/zh_cn/supported_models/supported_models.md)
 - \[2024/09\] 通过引入 CUDA Graph,LMDeploy PyTorchEngine 在 Llama3-8B 推理上实现了 1.3 倍的加速
 - \[2024/08\] LMDeploy现已集成至 [modelscope/swift](https://github.com/modelscope/swift),成为 VLMs 推理的默认加速引擎
@@ -163,6 +164,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • Phi-3-vision (4.2B)
  • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
+ • Llama3.2-vision (11B, 90B)
  • diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md index ab7ee0b30e..b7d03b28a6 100644 --- a/docs/en/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by: ```shell -export LMDEPLOY_VERSION=0.6.1 +export LMDEPLOY_VERSION=0.6.2 export PYTHON_VERSION=38 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 260120efe0..cd38a60025 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -4,36 +4,37 @@ The following tables detail the models supported by LMDeploy's TurboMind engine ## TurboMind on CUDA Platform -| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | -| :-------------------: | :---------: | :--: | :-------: | :-----: | :-----: | :---: | -| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | -| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | - | -| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | -| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | -| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | -| InternVL | v1.1- v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 2B-76B | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | -| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | +| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | +| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | 
Yes | Yes | +| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | +| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | +| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | +| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | +| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | +| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | +| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | "-" means not verified yet. @@ -60,7 +61,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | | YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | | Mistral | 7B | LLM | Yes | Yes | Yes | No | No | -| Mixtral | 8x7B | LLM | Yes | Yes | Yes | No | No | +| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No | | QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | | QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md index f7eedecfaa..3108d64815 100644 --- a/docs/zh_cn/get_started/installation.md +++ b/docs/zh_cn/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy 默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy: ```shell -export LMDEPLOY_VERSION=0.6.1 +export LMDEPLOY_VERSION=0.6.2 export PYTHON_VERSION=38 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 26930cf3ce..9bdbf0d45d 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -4,36 +4,37 @@ ## TurboMind CUDA 平台 -| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | -| :-------------------: | :---------: | :--: | :-------: | :-----: | :-----: | :---: | -| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | -| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | - | -| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | -| Baichuan2 | 
7B | LLM | Yes | Yes | Yes | Yes | -| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | -| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | -| InternVL | v1.1- v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 2B-76B | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | -| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | +| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | +| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | +| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | +| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | +| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | +| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | +| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | +| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | +| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | “-” 表示还没有验证。 @@ -60,7 +61,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | | YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | | Mistral | 7B | LLM | Yes | Yes | Yes | No | No | -| Mixtral | 8x7B | LLM | Yes | Yes | Yes | No | No | +| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No | | QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | | QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | diff --git a/lmdeploy/version.py b/lmdeploy/version.py index 5237d5f859..b9f76b5761 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from typing import Tuple -__version__ = '0.6.1' +__version__ = '0.6.2' short_version = __version__ diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/vl/model/llava_hf.py index c2a0e4afa0..31be101ae8 100644 --- a/lmdeploy/vl/model/llava_hf.py +++ b/lmdeploy/vl/model/llava_hf.py @@ -32,14 +32,16 @@ def build_model(self): self.vl_model = model # fix for llava-hf/llava-interleave-qwen-7b-hf - setattr(model.config, "tie_word_embeddings", False) + setattr(model.config, 'tie_word_embeddings', False) with disable_logging(): load_checkpoint_and_dispatch( model=model, max_memory=self.max_memory, checkpoint=self.model_path, device_map='auto' if not self.with_llm else {'': 'cpu'}, - no_split_module_classes=['CLIPEncoderLayer', 'SiglipEncoderLayer'], + no_split_module_classes=[ + 'CLIPEncoderLayer', 'SiglipEncoderLayer' + ], dtype=torch.half) model.eval() self.model = model From 587c1846daa5a2293fbb3483cef52fa494650f8d Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:15:32 +0800 Subject: [PATCH 048/122] [ci] support v100 dailytest (#2665) * update * update * update * update * update --- .github/scripts/eval_chat_config.py | 1 - .github/workflows/daily_ete_test.yml | 2 +- .github/workflows/daily_ete_test_v100.yml | 667 ++++++++++++++++++ autotest/config-v100.yaml | 131 ++++ autotest/config.yaml | 2 +- .../interface/pipeline/test_pipeline_func.py | 62 +- autotest/utils/benchmark_utils.py | 6 + autotest/utils/config_utils.py | 14 +- autotest/utils/pipeline_chat.py | 7 + autotest/utils/quantization_utils.py | 13 +- autotest/utils/run_client_chat.py | 5 + autotest/utils/run_restful_chat.py | 15 +- benchmark/profile_generation.py | 4 + benchmark/profile_throughput.py | 5 + 14 files changed, 894 insertions(+), 40 deletions(-) create mode 100644 .github/workflows/daily_ete_test_v100.yml create mode 100644 autotest/config-v100.yaml diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py index 81872c0dd1..89ad20a533 100644 --- a/.github/scripts/eval_chat_config.py +++ b/.github/scripts/eval_chat_config.py @@ -174,7 +174,6 @@ max_out_len=MAX_NEW_TOKENS, max_seq_len=MAX_SESSION_LEN, batch_size=128, - concurrency=128, meta_template=llama2_meta_template, run_cfg=dict(num_gpus=1), end_str='[INST]') diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 84fcaf5034..f03bbf4a50 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -199,9 +199,9 @@ jobs: chmod -R 777 $workdir test_tools: - needs: test_quantization if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: [self-hosted, linux-a100] + needs: test_quantization timeout-minutes: 150 strategy: fail-fast: false diff --git a/.github/workflows/daily_ete_test_v100.yml b/.github/workflows/daily_ete_test_v100.yml new file mode 100644 index 0000000000..8b32bab1f7 --- /dev/null +++ b/.github/workflows/daily_ete_test_v100.yml @@ -0,0 +1,667 @@ +name: daily_ete_test_v100 + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. 
Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch', 'turbomind_vl']" + model: + required: true + description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models' + type: string + default: "['pipeline','restful','chat']" + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" + schedule: + - cron: '00 16 * * 0-4' + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + +jobs: + linux-build: + if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.1 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: linux-v100 + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} + - name: Change testconfig on v100 + run: | + mv ${{env.TEST_CODE_PATH}}/autotest/config-v100.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: linux-v100 + timeout-minutes: 180 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - convert + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + needs: test_quantization + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: linux-v100 + timeout-minutes: 240 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend) || fromJSON('["turbomind", "pytorch", "turbomind_vl"]')}} + model: ${{ fromJSON(inputs.model) || fromJSON('["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind_vl + model: chat + include: + - backend: turbomind + model: local_case + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/github-actions/resources/lora:/root/lora + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.model == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - local testcase + if: matrix.backend == 'turbomind' && matrix.model == 'local_case' + run: | + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: linux-v100 + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ['turbomind', 'pytorch'] + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api turbomind + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" 
>> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api + timeout-minutes: 75 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Start restful api turbomind - base + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch - base + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api - base + timeout-minutes: 40 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_pipeline: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 240 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - interface pipeline case + run: | + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + test_benchmark: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test benchmark script + run: | + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + python3 -m pip install -e . 
+ echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . + - name: Evaluate models + if: matrix.evaluate_type == 'chat' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + get_benchmark_result: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + needs: [test_benchmark] + timeout-minutes: 5 + runs-on: linux-v100 + env: + BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + steps: + - name: Clone repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Get overview + run: | + pip install pandas fire mmengine + python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR + + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: linux-v100 + needs: [test_tools, test_restful, test_pipeline, test_benchmark] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + notify_to_feishu: + if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main') + needs: [get_benchmark_result, get_coverage_report, test_evaluation] + timeout-minutes: 5 + runs-on: linux-v100 + steps: + - name: notify + if: contains(needs.*.result, 'failure') + run: | + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test finished!!!","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml new file mode 100644 index 0000000000..172667ec0c --- /dev/null +++ b/autotest/config-v100.yaml @@ -0,0 +1,131 @@ +model_path: /nvme/qa_test_models +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json + +tp_config: + internlm-chat-20b: 2 + internlm2-chat-20b: 2 + Baichuan2-13B-Chat: 2 + Mixtral-8x7B-Instruct-v0.1: 2 + Qwen-VL-Chat: 2 + llava-v1.5-13b: 2 + internlm2_5-20b-chat: 2 + internlm2_5-20b: 2 + Meta-Llama-3-1-70B-Instruct: 4 + internlm2_5-7b-chat-1m: 4 + Qwen2-7B-Instruct-GPTQ-Int4: 2 + InternVL2-26B: 2 + InternVL2-40B: 2 + MiniCPM-V-2_6: 2 + +turbomind_chat_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct-inner-4bits + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - Qwen/Qwen2-7B-Instruct-AWQ + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + + +pytorch_chat_model: + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen1.5-MoE-A2.7B-Chat + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - google/gemma-2-9b-it + - mistralai/Mistral-7B-Instruct-v0.2 + - THUDM/glm-4v-9b + - THUDM/glm-4-9b-chat + - microsoft/Phi-3-mini-4k-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat + +turbomind_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +pytorch_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +vl_model: + - OpenGVLab/InternVL2-2B + - 
OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - internlm/internlm-xcomposer2d5-7b + - THUDM/glm-4v-9b + - microsoft/Phi-3-mini-4k-instruct + +turbomind_quatization: + no_awq: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + gptq: + - internlm/internlm2_5-7b-chat + no_kvint4: + - openbmb/MiniCPM-V-2_6 + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + +pytorch_quatization: + awq: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - Qwen/Qwen2-1.5B-Instruct + w8a8: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-7b + no_kvint4: + - OpenGVLab/InternVL2-4B + - deepseek-ai/DeepSeek-V2-Lite-Chat + - microsoft/Phi-3-mini-4k-instruct + - microsoft/Phi-3-vision-128k-instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + + +longtext_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat-1m + - internlm/internlm2-chat-20b + +benchmark_model: + - meta-llama/Llama-2-7b-chat-hf + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - THUDM/glm-4-9b-chat + - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config.yaml b/autotest/config.yaml index 4e4b20f206..46b9bd9ce1 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -48,6 +48,7 @@ turbomind_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat - 01-ai/Yi-6B-Chat @@ -90,7 +91,6 @@ pytorch_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mixtral-8x7B-Instruct-v0.1 - - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-7b-it - google/gemma-2-9b-it - deepseek-ai/deepseek-moe-16b-chat diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 096918b6b1..bd33ed33a0 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -15,6 +15,14 @@ from lmdeploy import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline) +from lmdeploy.utils import is_bf16_supported + + +def init_pipeline(model_path, backend_config): + if not is_bf16_supported() and isinstance(backend_config, + PytorchEngineConfig): + backend_config.dtype = 'float16' + return pipeline(model_path, backend_config=backend_config) @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -26,7 +34,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe('Hi, pls intro yourself') result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) @@ -56,7 +64,7 @@ def run_pipeline_testcase(config, 
model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer('Hi, pls intro yourself'): response.append(item) @@ -88,7 +96,7 @@ def run_pipeline_testcase_with_prompt(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result, msg = assert_pipeline_batch_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) @@ -118,7 +126,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer(['Pls intro yourself', 'Shanghai is']): response.append(item) @@ -149,7 +157,7 @@ def test_return_with_message(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = pipe(prompts) print(response) @@ -180,7 +188,7 @@ def test_return_with_message_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = [] for item in pipe.stream_infer(prompts): @@ -212,7 +220,7 @@ def test_return_with_message_batch(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -249,7 +257,7 @@ def test_return_with_message_batch_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -287,7 +295,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -320,7 +328,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = 
'/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -358,7 +366,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(session_len=10, tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result = True @@ -392,7 +400,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test min_new_tokens gen_config = GenerationConfig(min_new_tokens=200, ignore_eos=True) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -428,7 +436,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test stop_words gen_config = GenerationConfig(stop_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -467,7 +475,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test bad_words gen_config = GenerationConfig(bad_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -502,7 +510,7 @@ def test_gen_config_special_words_false(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -543,7 +551,7 @@ def test_gen_config_special_words_true(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -587,7 +595,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=0.01, random_seed=1, @@ -626,7 +634,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, 
model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=1.2, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -658,7 +666,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_p=0.1, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -690,7 +698,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_k=1, max_new_tokens=20, @@ -727,7 +735,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response_list = [] for i in range(3): gen_config = GenerationConfig(random_seed=i, @@ -764,7 +772,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(random_seed=1, top_k=40, do_sample=True) response_list = [] for i in range(3): @@ -798,7 +806,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(temperature=1.0, top_k=40, do_sample=True) @@ -833,7 +841,7 @@ def run_pipeline_testcase_max_new_tokens(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens gen_config = GenerationConfig(max_new_tokens=5) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -870,7 +878,7 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens with ignore_eos gen_config = GenerationConfig(ignore_eos=True, max_new_tokens=256) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -907,7 +915,7 @@ def test_backend_config_input_validation(config, model, backend, worker_id): tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, 
backend_config=backend_config) with pytest.raises(AssertionError): gen_config = GenerationConfig(top_p=0) pipe('Shanghai is', gen_config=gen_config) @@ -1018,7 +1026,7 @@ def test_backend_config_tp(config, model, backend, worker_id): worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=100) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) del pipe torch.cuda.empty_cache() if 'gw' in worker_id: diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 0ea643524f..c80dbe0dfc 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -7,6 +7,8 @@ from utils.config_utils import get_workerid from utils.run_restful_chat import health_check +from lmdeploy.utils import is_bf16_supported + DEFAULT_PORT = 23333 GENERATION_CONFIG = ' -c 8 256 -ct 128 128 2048 128 -pt 1 128 128 2048' GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000' @@ -40,6 +42,8 @@ def generation_test(config, run_config = '' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' @@ -105,6 +109,8 @@ def throughput_test(config, run_config = '--num-prompts 3000' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index ca041dc9a1..8aa5f933fb 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -4,6 +4,8 @@ import yaml from utils.get_run_config import get_tp_num +from lmdeploy.utils import is_bf16_supported + def get_turbomind_model_list(tp_num: int = None, model_type: str = 'chat_model', @@ -85,14 +87,16 @@ def get_torch_model_list(tp_num: int = None, def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: str = 'chat_model'): + case_list = get_turbomind_model_list(tp_num=tp_num, model_type=model_type, quant_policy=quant_policy) - for case in get_torch_model_list(tp_num=tp_num, - quant_policy=quant_policy, - model_type=model_type): - if case not in case_list: - case_list.append(case) + if is_bf16_supported(): + for case in get_torch_model_list(tp_num=tp_num, + quant_policy=quant_policy, + model_type=model_type): + if case not in case_list: + case_list.append(case) return [x for x in case_list if 'w8a8' not in x] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 33d65448ab..1ab34b23d5 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -10,6 +10,7 @@ from lmdeploy import pipeline from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig +from lmdeploy.utils import is_bf16_supported from lmdeploy.vl import load_image from lmdeploy.vl.constants import IMAGE_TOKEN @@ -32,6 +33,8 @@ def run_pipeline_chat_test(config, if 'pytorch' in type: backend_config = PytorchEngineConfig(tp=tp) + if not is_bf16_supported(): + backend_config.dtype = 'float16' else: backend_config = TurbomindEngineConfig(tp=tp) @@ -292,6 +295,10 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): backend_config.model_format = 'awq' if quant_policy is not None: backend_config.quant_policy = quant_policy + + if not is_bf16_supported(): + 
backend_config.cache_max_entry_count = 0.5 + backend_config.dtype = 'float16' pipe = pipeline(hf_path, backend_config=backend_config) pipeline_chat_log = os.path.join( diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 75b7319aeb..752168958a 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -2,6 +2,8 @@ import subprocess from subprocess import PIPE +from lmdeploy.utils import is_bf16_supported + def quantization(config, quantization_model_name, @@ -21,17 +23,17 @@ def quantization(config, if quantization_type == 'awq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'gptq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_gptq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) else: return False, 'quantization type should in [awq, gptq, w8a8], \ @@ -40,6 +42,11 @@ def quantization(config, if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale True' + if not is_bf16_supported(): + quantization_cmd += ' --batch-size 8' + else: + quantization_cmd += ' --batch-size 32' + with open(quantization_log, 'w') as f: # remove existing folder subprocess.run([' '.join(['rm -rf', quantization_model_path])], diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index edc2268e30..529bf4a6a0 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -4,6 +4,8 @@ from utils.get_run_config import get_command_with_extra, get_model_name from utils.rule_condition_assert import assert_result +from lmdeploy.utils import is_bf16_supported + TEMPLATE = 'autotest/template.json' @@ -63,6 +65,9 @@ def hf_command_line_test(config, need_tp=True, cuda_prefix=cuda_prefix) + if type == 'pytorch': + if not is_bf16_supported(): + cmd += ' --dtype float16' if type == 'turbomind': if ('w4' in model_case or ('4bits' in model_case or 'awq' in model_case.lower())): diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 1eb84f1d93..c567db4d00 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -13,6 +13,7 @@ from utils.run_client_chat import command_line_test from lmdeploy.serve.openai.api_client import APIClient +from lmdeploy.utils import is_bf16_supported BASE_HTTP_URL = 'http://localhost' DEFAULT_PORT = 23333 @@ -60,12 +61,17 @@ def start_restful_api(config, param, model, model_path, backend_type, cmd += ' --model-format gptq' if backend_type == 'pytorch': cmd += ' --backend pytorch' + if not is_bf16_supported(): + cmd += ' --dtype float16' if 'llava' in model: cmd += ' --model-name vicuna' if 'quant_policy' in param.keys() and param['quant_policy'] is not None: quant_policy = param['quant_policy'] cmd += f' --quant-policy {quant_policy}' + if not is_bf16_supported(): + cmd += ' --cache-max-entry-count 0.5' + start_log = os.path.join( log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') @@ -87,13 +93,18 @@ def start_restful_api(config, param, model, model_path, backend_type, content = 
file.read() print(content) start_time = int(time()) + + start_timeout = 300 + if not is_bf16_supported(): + start_timeout = 600 + sleep(5) - for i in range(300): + for i in range(start_timeout): sleep(1) end_time = int(time()) total_time = end_time - start_time result = health_check(http_url) - if result or total_time >= 300: + if result or total_time >= start_timeout: break allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) return pid, startRes diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index b28937dd4c..952de5d9f7 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -349,6 +349,7 @@ def parse_args(): session_len_act = ArgumentHelper.session_len(pt_group, default=2048) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) rope_scaling_factor_act = ArgumentHelper.rope_scaling_factor(pt_group) + dtype_act = ArgumentHelper.dtype(pt_group) # turbomind engine args tb_group = parser.add_argument_group('TurboMind engine argument') @@ -358,6 +359,7 @@ def parse_args(): tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) tb_group._group_actions.append(rope_scaling_factor_act) + tb_group._group_actions.append(dtype_act) ArgumentHelper.model_format(tb_group, default='hf') args = parser.parse_args() return args @@ -416,6 +418,7 @@ def main(): rope_scaling_factor=args.rope_scaling_factor, tp=args.tp, enable_prefix_caching=args.enable_prefix_caching, + dtype=args.dtype, ) elif args.backend == 'pytorch': engine_config = PytorchEngineConfig( @@ -426,6 +429,7 @@ def main(): thread_safe=True, eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, + dtype=args.dtype, ) gen_config = GenerationConfig(top_k=args.top_k, top_p=args.top_p, diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 9d573d51b1..58786d9c80 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -289,6 +289,7 @@ def parse_args(): cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) quant_policy_act = ArgumentHelper.quant_policy(pt_group, default=0) + dtype_act = ArgumentHelper.dtype(pt_group) # turbomind engine args tb_group = parser.add_argument_group('TurboMind engine argument') @@ -298,6 +299,8 @@ def parse_args(): tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) tb_group._group_actions.append(quant_policy_act) + tb_group._group_actions.append(dtype_act) + ArgumentHelper.model_format(tb_group, default='hf') ArgumentHelper.num_tokens_per_iter(tb_group) ArgumentHelper.max_prefill_iters(tb_group) @@ -321,6 +324,7 @@ def main(): num_tokens_per_iter=args.num_tokens_per_iter, max_prefill_iters=args.max_prefill_iters, enable_prefix_caching=args.enable_prefix_caching, + dtype=args.dtype, ) elif args.backend == 'pytorch': engine_config = PytorchEngineConfig( @@ -333,6 +337,7 @@ def main(): eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, quant_policy=args.quant_policy, + dtype=args.dtype, ) engine = Engine(args.model_path, engine_config, csv=args.csv) From dde5d2343c3debbacdd2fa93e21f33b60b349c21 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Thu, 31 Oct 2024 14:39:07 +0800 Subject: [PATCH 049/122] Call cuda empty_cache to prevent OOM when quantizing model (#2671) * Call cuda empty_cache 
to prevent OOM when quantizing model * empty cache during export and after forward --- lmdeploy/lite/quantization/calibration.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py index 4ae21e3f45..77ff74e234 100644 --- a/lmdeploy/lite/quantization/calibration.py +++ b/lmdeploy/lite/quantization/calibration.py @@ -253,9 +253,11 @@ def export(self, out_dir): inp_stats = self.collect_inputs_stats() torch.save(inp_stats, out_dir / 'inputs_stats.pth') + torch.cuda.empty_cache() out_stats = self.collect_outputs_stats() torch.save(out_stats, out_dir / 'outputs_stats.pth') + torch.cuda.empty_cache() def calibrate(self, data): """Forward pass through the model in inference mode with given data.""" @@ -267,6 +269,7 @@ def calibrate(self, data): model = self.model.model with torch.inference_mode(): _ = model(data.to(self.device)) + torch.cuda.empty_cache() def __enter__(self): """Prepares the Calibration object for a 'with' statement by @@ -440,6 +443,7 @@ def export(self, out_dir): inputs_stats['absmean'][name] = obs.absmean_val inputs_stats['ratios'][name] = obs.ratio torch.save(inputs_stats, out_dir / 'inputs_stats.pth') + torch.cuda.empty_cache() def _wrap_decoder_layers_for_search(self): """Method to wrap the decoder layers' forward functions for observing From e034610c89a797de35dd645b1a5e7648cfb29fd6 Mon Sep 17 00:00:00 2001 From: CyCle1024 Date: Fri, 1 Nov 2024 13:48:13 +0800 Subject: [PATCH 050/122] fix ascend get_started.md link (#2696) * fix ascend get_started.md link * fix en ascend get_started.md --- docs/en/get_started/ascend/get_started.md | 4 ++-- docs/zh_cn/get_started/ascend/get_started.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index 402ac50fbf..a5400ed64d 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -23,8 +23,8 @@ The Docker version is supposed to be no less than `18.03`. And `Ascend Docker Ru #### Ascend Drivers, Firmware and CANN -The target machine needs to install the Huawei driver and firmware version 23.0.3, refer to -[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) +The target machine needs to install the Huawei driver and firmware version not lower than 23.0.3, refer to +[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha001/softwareinst/instg/instg_0005.html) and [download resources](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha). And the CANN (version 8.0.RC2.beta1) software packages should also be downloaded from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26) themselves. 
Make sure to place the `Ascend-cann-kernels-910b*.run`, `Ascend-cann-nnal_*.run` and `Ascend-cann-toolkit*-aarch64.run` under the root directory of lmdeploy source code diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md index 78bd8383d4..e00c1e173a 100644 --- a/docs/zh_cn/get_started/ascend/get_started.md +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -22,8 +22,8 @@ Docker 版本应不低于 18.03。并且需按照[官方指南](https://www.hias #### Drivers,Firmware 和 CANN -目标机器需安装华为驱动程序和固件版本 23.0.3,请参考 -[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) +目标机器需安装华为驱动程序和固件版本至少为 23.0.3,请参考 +[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha001/softwareinst/instg/instg_0005.html) 和[下载资源](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC2.beta1&driver=1.0.25.alpha)。 另外,`docker/Dockerfile_aarch64_ascend`没有提供CANN 安装包,用户需要自己从[昇腾资源下载中心](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC2.beta1&product=4&model=26)下载CANN(version 8.0.RC2.beta1)软件包。 From 654c457332c6a731578e70382a8563abd5f681a3 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Fri, 1 Nov 2024 20:05:37 +0800 Subject: [PATCH 051/122] Support min_tokens, min_p parameters for api_server (#2681) * Support min_tokens for api_server * fix * use min_new_tokens * add min_p --- lmdeploy/serve/openai/api_server.py | 16 ++++++++++++++++ lmdeploy/serve/openai/protocol.py | 4 ++++ 2 files changed, 20 insertions(+) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 019a617acd..a12cadaa7d 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -323,6 +323,12 @@ async def chat_completions_v1(request: ChatCompletionRequest, - ignore_eos (bool): indicator for ignoring eos - skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. + - min_new_tokens (int): To generate at least numbers of tokens. + - min_p (float): Minimum token probability, which will be scaled by the + probability of the most likely token. It must be a value between + 0 and 1. Typical values are in the 0.01-0.2 range, comparably + selective as setting `top_p` in the 0.99-0.8 range (use the + opposite of normal `top_p` values) Currently we do not support the following features: - presence_penalty (replaced with repetition_penalty) @@ -386,6 +392,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, skip_special_tokens=request.skip_special_tokens, response_format=response_format, logits_processors=logits_processors, + min_new_tokens=request.min_new_tokens, + min_p=request.min_p, random_seed=random_seed) tools = None @@ -826,6 +834,12 @@ async def chat_interactive_v1(request: GenerateRequest, in the decoding. Default to be True. - adapter_name (str): For slora inference. Choose which lora to do the inference. + - min_new_tokens (int): To generate at least numbers of tokens. + - min_p (float): Minimum token probability, which will be scaled by the + probability of the most likely token. It must be a value between + 0 and 1. 
Typical values are in the 0.01-0.2 range, comparably + selective as setting `top_p` in the 0.99-0.8 range (use the + opposite of normal `top_p` values) """ if request.cancel: if request.session_id != -1: @@ -867,6 +881,8 @@ async def chat_interactive_v1(request: GenerateRequest, ignore_eos=request.ignore_eos, stop_words=request.stop, skip_special_tokens=request.skip_special_tokens, + min_new_tokens=request.min_new_tokens, + min_p=request.min_p, random_seed=random_seed) if request.image_url: from lmdeploy.vl import load_image diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index bd54028c39..d4bf8ed315 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -137,6 +137,8 @@ class ChatCompletionRequest(BaseModel): skip_special_tokens: Optional[bool] = True top_k: Optional[int] = 40 seed: Optional[int] = None + min_new_tokens: Optional[int] = Field(default=None, examples=[None]) + min_p: float = 0.0 class FunctionResponse(BaseModel): @@ -339,6 +341,8 @@ class GenerateRequest(BaseModel): cancel: Optional[bool] = False # cancel a responding request adapter_name: Optional[str] = Field(default=None, examples=[None]) seed: Optional[int] = None + min_new_tokens: Optional[int] = Field(default=None, examples=[None]) + min_p: float = 0.0 class GenerateResponse(BaseModel): From 993aa14ca343c4746bbdfd785032c68c803c6dc9 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Fri, 1 Nov 2024 20:06:03 +0800 Subject: [PATCH 052/122] fix index error when computing ppl on long-text prompt (#2697) * fix index error when computing ppl on long-text prompt * update user guide --- docs/en/llm/pipeline.md | 4 ++++ docs/zh_cn/llm/pipeline.md | 4 ++++ lmdeploy/serve/utils.py | 15 ++++++++------- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/docs/en/llm/pipeline.md b/docs/en/llm/pipeline.md index ab4035a8cc..887e2e0a3e 100644 --- a/docs/en/llm/pipeline.md +++ b/docs/en/llm/pipeline.md @@ -136,6 +136,10 @@ logits = pipe.get_logits(input_ids) ppl = pipe.get_ppl(input_ids) ``` +```{note} +get_ppl returns the cross entropy loss without applying the exponential operation afterwards +``` + - **Below is an example for pytorch backend. 
Please install triton first.** ```shell diff --git a/docs/zh_cn/llm/pipeline.md b/docs/zh_cn/llm/pipeline.md index a9c74a5f14..40406c85a4 100644 --- a/docs/zh_cn/llm/pipeline.md +++ b/docs/zh_cn/llm/pipeline.md @@ -136,6 +136,10 @@ logits = pipe.get_logits(input_ids) ppl = pipe.get_ppl(input_ids) ``` +```{note} +get_ppl 返回的是 cross entropy loss,没有在之后加 exp 操作 +``` + - **使用 pytorch 后端** 需要先安装 triton diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 4791d3c724..3a16f0a65b 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -212,8 +212,8 @@ def get_ppl(self, input_ids: Union[List[int], logger.info(f'sorted indices: {indices}') for (start, end) in self._batch_iterator(sizes, max_input_len): logger.info(f'start: {start}, end: {end}') - _input_ids = [input_ids[indices[i]] for i in range(start, end)] if start == end: + _input_ids = input_ids[indices[start]] loss, target_count = self._get_long_text_ppl( generator=generator, input_ids=_input_ids, @@ -221,6 +221,7 @@ def get_ppl(self, input_ids: Union[List[int], losses.append(loss) target_counts.append(target_count) else: + _input_ids = [input_ids[indices[i]] for i in range(start, end)] loss, target_count = self._get_ppl( generator=generator, input_ids=_input_ids, @@ -261,24 +262,24 @@ def _batch_iterator(self, sizes, max_value): i += 1 def _get_long_text_ppl(self, generator, input_ids, max_input_len): - assert isinstance(input_ids, List) and len(input_ids) == 1 - seq_len = len(input_ids[0]) + assert all(isinstance(_, int) for _ in input_ids) + seq_len = len(input_ids) assert seq_len > max_input_len logger.info(f'get long text ppl: seq_len {seq_len}') losses = [] target_counts = [] for i in range(0, seq_len, max_input_len): - token_ids = input_ids[:, i:i + max_input_len] + token_ids = input_ids[i:i + max_input_len] step = [i] # shift token_ids by 1 to the left - target_ids = input_ids[:, i + 1:i + 1 + max_input_len] + target_ids = input_ids[i + 1:i + 1 + max_input_len] loss, target_count = self._get_ppl( generator=generator, - input_ids=token_ids, + input_ids=[token_ids], max_input_len=max_input_len, - target_ids=target_ids, + target_ids=[target_ids], steps=step, sequence_start=(i == 0), sequence_end=(i + max_input_len >= seq_len)) From 20de9593ed6690d7801c2ff05e08def3ee5166f2 Mon Sep 17 00:00:00 2001 From: q yao Date: Mon, 4 Nov 2024 16:29:53 +0800 Subject: [PATCH 053/122] better tp exit log (#2677) --- lmdeploy/pytorch/engine/model_agent.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 84e3fba8fb..c713e3ec85 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -500,15 +500,22 @@ def _start_tp_process(proc_id: int, def _check_context_alive(mp_context: mp.ProcessContext): """check context alive.""" procs: List[mp.Process] = mp_context.processes - failed_ranks = list(idx for idx, p in enumerate(procs) if not p.is_alive()) - if len(failed_ranks) == 0: + failed_procs = list(idx for idx, p in enumerate(procs) if not p.is_alive()) + if len(failed_procs) == 0: return - for p in procs: + + log_procs = [] + for idx, p in enumerate(procs): if p.is_alive(): p.terminate() else: + exitcode = p.exitcode + if exitcode > 0: + # terminated exitcode < 0 + log_procs.append((idx, exitcode)) p.close() - logger.error(f'TP process {failed_ranks} failed.') + for idx, exitcode in log_procs: + logger.error(f'TP process {idx} failed with exitcode {exitcode}.') # TODO: not 
safe exit. os._exit(1) From e557f054f5fd1d0024f14e985bd604c49ef78022 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Mon, 4 Nov 2024 16:30:38 +0800 Subject: [PATCH 054/122] support yarn in turbomind backend (#2519) * support yarn in turbomind backend * update qwen2 model to support yarn rope in pytorch backend * use mul * refactor export rope params * support partial_rotary_factor * fix lint * fix rope type * Revert "support partial_rotary_factor" This reverts commit cc4cce7d0dede6eff3cf59e13c5703a829fe9b43. --- .../backends/default/rotary_embedding.py | 17 ++-- lmdeploy/pytorch/backends/rotary_embedding.py | 2 + lmdeploy/pytorch/models/qwen2.py | 10 ++- lmdeploy/pytorch/nn/__init__.py | 1 + lmdeploy/pytorch/nn/rotary_embedding.py | 79 +++++++++++++++++++ lmdeploy/turbomind/deploy/config.py | 3 + .../turbomind/deploy/source_model/llama.py | 36 ++++++--- .../kernels/attention/attention_params.h | 6 ++ .../kernels/attention/attention_universal.h | 5 ++ .../kernels/attention/kv_cache_utils_v2.cu | 50 ++++++++++++ .../kernels/attention/kv_cache_utils_v2.h | 20 +++++ .../kernels/attention/rotary_embedding.h | 20 ++++- .../kernels/attention/test_attention.cu | 15 ++++ src/turbomind/models/llama/llama_params.h | 3 + .../models/llama/unified_attention_layer.cc | 30 +++++++ .../triton_backend/llama/LlamaTritonModel.cc | 3 + 16 files changed, 277 insertions(+), 23 deletions(-) diff --git a/lmdeploy/pytorch/backends/default/rotary_embedding.py b/lmdeploy/pytorch/backends/default/rotary_embedding.py index bc209be5e1..3cecbefbc2 100644 --- a/lmdeploy/pytorch/backends/default/rotary_embedding.py +++ b/lmdeploy/pytorch/backends/default/rotary_embedding.py @@ -232,9 +232,12 @@ def __init__(self, self.register_buffer('inv_freq', inv_freq, persistent=False) # get mscale - self.mscale = float( - yarn_get_mscale(self.scaling_factor, self.mscale) / - yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)) + if yarn_params.attention_factor is not None: + self.mscale = yarn_params.attention_factor + else: + self.mscale = float( + yarn_get_mscale(self.scaling_factor, self.mscale) / + yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)) if self.mscale == 1.0: self.mscale = None @@ -334,10 +337,10 @@ def build( return LlamaDynamicNTKScalingRotaryEmbedding( dim, base, scaling_factor, max_position_embeddings) elif emb_type == RopeType.Llama3: - return Llama3RotaryEmbeddingImpl(dim, base, scaling_factor, - llama3_params.low_freq_factor, - llama3_params.high_freq_factor, - max_position_embeddings) + return Llama3RotaryEmbeddingImpl( + dim, base, scaling_factor, llama3_params.low_freq_factor, + llama3_params.high_freq_factor, + llama3_params.original_max_position_embeddings) elif emb_type == RopeType.Yarn: return YarnRotaryEmbeddingImpl(dim, base, diff --git a/lmdeploy/pytorch/backends/rotary_embedding.py b/lmdeploy/pytorch/backends/rotary_embedding.py index 6fa6abbdf9..5718d822f0 100644 --- a/lmdeploy/pytorch/backends/rotary_embedding.py +++ b/lmdeploy/pytorch/backends/rotary_embedding.py @@ -22,6 +22,7 @@ class YarnParameters: beta_slow: float = 1 mscale: int = 1 mscale_all_dim: int = 0 + attention_factor: int = None @dataclass @@ -39,6 +40,7 @@ class Llama3Parameters: """llama3 rope parameters.""" low_freq_factor: float = 1.0 high_freq_factor: float = 4.0 + original_max_position_embeddings: int = 8192 class RotaryEmbeddingImpl(ABC): diff --git a/lmdeploy/pytorch/models/qwen2.py b/lmdeploy/pytorch/models/qwen2.py index 5ffa06665b..de6a7a58e1 100644 --- a/lmdeploy/pytorch/models/qwen2.py +++ 
b/lmdeploy/pytorch/models/qwen2.py @@ -6,8 +6,9 @@ from transformers.configuration_utils import PretrainedConfig from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, - SiluAndMul, build_rotary_embedding) +from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, + SiluAndMul, build_rotary_embedding, + build_rotary_params) from lmdeploy.pytorch.nn.linear import (build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -245,7 +246,8 @@ def __init__(self, device=device) # build rotary embedding - emb_type = RopeType.LinearScaling + # emb_type = RopeType.LinearScaling + rope_params = build_rotary_params(config) rope_dim = config.hidden_size // config.num_attention_heads rope_max_pos_emb = config.max_position_embeddings rope_base = config.rope_theta @@ -253,7 +255,7 @@ def __init__(self, rope_dim, rope_max_pos_emb, rope_base, - emb_type=emb_type, + **rope_params, ) def forward( diff --git a/lmdeploy/pytorch/nn/__init__.py b/lmdeploy/pytorch/nn/__init__.py index 2b90f40298..63df9a5ae9 100644 --- a/lmdeploy/pytorch/nn/__init__.py +++ b/lmdeploy/pytorch/nn/__init__.py @@ -8,3 +8,4 @@ from .rotary_embedding import RopeType # noqa: F401 from .rotary_embedding import YarnParameters # noqa: F401 from .rotary_embedding import build_rotary_embedding # noqa: F401 +from .rotary_embedding import build_rotary_params # noqa: F401 diff --git a/lmdeploy/pytorch/nn/rotary_embedding.py b/lmdeploy/pytorch/nn/rotary_embedding.py index 35a7de7144..43eb1f806d 100644 --- a/lmdeploy/pytorch/nn/rotary_embedding.py +++ b/lmdeploy/pytorch/nn/rotary_embedding.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from torch import Tensor, nn +from transformers import PretrainedConfig from ..backends import OpType, get_backend from ..backends.rotary_embedding import (Llama3Parameters, @@ -7,6 +8,84 @@ YarnParameters) +def _get_default_rope_parameters(config: PretrainedConfig): + """get default rope parameters.""" + return dict(emb_type=RopeType.Default, scaling_factor=1.0) + + +def _get_linear_scaling_rope_parameters(config: PretrainedConfig): + """get linear rope parameters.""" + rope_scaling = config.rope_scaling + scaling_factor = rope_scaling['factor'] + return dict(emb_type=RopeType.LinearScaling, scaling_factor=scaling_factor) + + +def _get_dynamic_ntk_parameters(config: PretrainedConfig): + """get dynamic ntk parameters.""" + rope_scaling = config.rope_scaling + scaling_factor = rope_scaling['factor'] + return dict(emb_type=RopeType.DynamicNTKScaling, + scaling_factor=scaling_factor) + + +def _get_yarn_parameters(config: PretrainedConfig): + """get yarn parameters.""" + rope_scaling = config.rope_scaling + scaling_factor = rope_scaling['factor'] + params = YarnParameters() + params.attention_factor = rope_scaling.get('attention_factor', + params.attention_factor) + params.beta_fast = rope_scaling.get('beta_fast', params.beta_fast) + params.beta_slow = rope_scaling.get('beta_slow', params.beta_slow) + return dict(emb_type=RopeType.Yarn, + scaling_factor=scaling_factor, + yarn_params=params) + + +def _get_longrope_parameters(config: PretrainedConfig): + """get longrope parameters.""" + rope_scaling = config.rope_scaling + params = LongRoPEScalingParameters() + scaling_factor = rope_scaling['factor'] + params.long_factor = rope_scaling.long_factor + params.short_factor = rope_scaling.long_factor + params.original_max_position_embeddings = rope_scaling.get( + 'original_max_position_embeddings', config.max_position_embeddings) + return dict(emb_type=RopeType.LongRoPEScaling, + scaling_factor=scaling_factor, + longrope_params=params) + + +def _get_llama3_parameters(config: PretrainedConfig): + """get llama rope parameters.""" + rope_scaling = config.rope_scaling + params = Llama3Parameters() + scaling_factor = rope_scaling['factor'] + params.low_freq_factor = rope_scaling['low_freq_factor'] + params.high_freq_factor = rope_scaling['high_freq_factor'] + params.original_max_position_embeddings = rope_scaling.get( + 'original_max_position_embeddings', + params.original_max_position_embeddings) + return dict(emb_type=RopeType.Llama3, + scaling_factor=scaling_factor, + llama3_params=params) + + +def build_rotary_params(config: PretrainedConfig): + """get scaling_factor rotary params, and emb_type.""" + params = dict(emb_type=RopeType.Default) + if config.rope_scaling is not None: + rope_type_str = config.rope_scaling.get('rope_type', 'default') + build_funcs = dict(default=_get_default_rope_parameters, + linear=_get_linear_scaling_rope_parameters, + dynamic=_get_dynamic_ntk_parameters, + yarn=_get_yarn_parameters, + longrope=_get_longrope_parameters, + llama3=_get_llama3_parameters) + params.update(build_funcs[rope_type_str](config)) + return params + + def build_rotary_embedding(dim: int, max_position_embeddings: int = 2048, base: int = 10000, diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 6652650949..7e8ebf7b47 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -63,6 +63,7 @@ def verify(self): class AttentionConfig: rotary_embedding: int = 128 rope_theta: float = 10000.0 + attention_factor: float = None 
max_position_embeddings: int = 0 original_max_position_embeddings: int = 0 rope_scaling_type: str = '' @@ -70,6 +71,8 @@ class AttentionConfig: use_dynamic_ntk: int = 0 low_freq_factor: float = 1.0 high_freq_factor: float = 1.0 + beta_fast: float = 32.0 + beta_slow: float = 1.0 use_logn_attn: int = 0 cache_block_seq_len: int = 64 diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py index d61d1906e1..8e19fa8d87 100644 --- a/lmdeploy/turbomind/deploy/source_model/llama.py +++ b/lmdeploy/turbomind/deploy/source_model/llama.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import json +import math import os.path as osp import re @@ -157,25 +158,35 @@ def model_info(self): scaling_type = '' low_freq_factor = 1.0 high_freq_factor = 1.0 + attention_factor = -1.0 + beta_fast = 32.0 + beta_slow = 1.0 original_max_position_embeddings = 0 if isinstance(rope_scaling, dict): - llama2_scaling_type = model_arg['rope_scaling'].get('type', '') - llama3_scaling_type = model_arg['rope_scaling'].get( - 'rope_type', '') - scaling_factor = model_arg['rope_scaling'].get('factor', '') - low_freq_factor = model_arg['rope_scaling'].get( - 'low_freq_factor', 1.0) - high_freq_factor = model_arg['rope_scaling'].get( - 'high_freq_factor', 1.0) - original_max_position_embeddings = model_arg[ - 'rope_scaling'].get('original_max_position_embeddings', 0) + llama2_scaling_type = rope_scaling.get('type', '') + llama3_scaling_type = rope_scaling.get('rope_type', '') if llama2_scaling_type and llama3_scaling_type: raise ValueError( f'Ambiguous rope_scaling in config: {model_arg}') scaling_type = llama2_scaling_type if llama2_scaling_type \ else llama3_scaling_type + scaling_factor = rope_scaling.get('factor', 0.0) if scaling_type == 'dynamic': use_dynamic_ntk = 1 + elif scaling_type == 'llama3': + low_freq_factor = rope_scaling.get('low_freq_factor', 1.0) + high_freq_factor = rope_scaling.get( + 'high_freq_factor', 1.0) + original_max_position_embeddings = model_arg[ + 'rope_scaling'].get('original_max_position_embeddings', + 0) + elif scaling_type == 'yarn': + attention_factor = rope_scaling.get( + 'attention_factor', None) + if attention_factor is None: + attention_factor = 0.1 * math.log(scaling_factor) + 1.0 + beta_fast = rope_scaling.get('beta_fast', 32.0) + beta_slow = rope_scaling.get('beta_slow', 1.0) return dict( num_layer=num_layer, @@ -192,4 +203,7 @@ def model_info(self): rope_scaling_type=scaling_type, rope_scaling_factor=scaling_factor, low_freq_factor=low_freq_factor, - high_freq_factor=high_freq_factor) + high_freq_factor=high_freq_factor, + attention_factor=attention_factor, + beta_fast=beta_fast, + beta_slow=beta_slow) diff --git a/src/turbomind/kernels/attention/attention_params.h b/src/turbomind/kernels/attention/attention_params.h index 8e0e52195d..b6dfaa596c 100644 --- a/src/turbomind/kernels/attention/attention_params.h +++ b/src/turbomind/kernels/attention/attention_params.h @@ -59,12 +59,18 @@ struct AttentionParams { // rotary embedding int rotary_embedding_dim; float rotary_embedding_base; + float rope_scaling_factor; + float attention_scaling; int max_position_embeddings; float rope_ti_scale; // used for linear RoPE scaling // the following 3 parameters are used by llama3 float llama3_inv_scaling_factor; float llama3_alpha; float llama3_beta; + // the following are use by yarn + float yarn_ramp_inv_factor_div_2; + float yarn_ramp_inv_factor_mul_min; + float yarn_inv_scaling_factor; // log(n) attention bool use_logn_attn; diff 
--git a/src/turbomind/kernels/attention/attention_universal.h b/src/turbomind/kernels/attention/attention_universal.h index 352cc14725..5fb583bd1f 100644 --- a/src/turbomind/kernels/attention/attention_universal.h +++ b/src/turbomind/kernels/attention/attention_universal.h @@ -231,9 +231,14 @@ struct AttentionUniversal { params.rotary_embedding_dim, rope_base, params.rope_ti_scale, + params.rope_scaling_factor, params.llama3_inv_scaling_factor, params.llama3_alpha, params.llama3_beta, + params.yarn_ramp_inv_factor_div_2, + params.yarn_ramp_inv_factor_mul_min, + params.yarn_inv_scaling_factor, + params.attention_scaling, std::integral_constant{}); PRAGMA_UNROLL for (int s = 0; s < ITER_S; ++s) { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 1edb92f374..9f28a17b83 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -23,9 +23,14 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks, const float* rope_base, int rope_dim, float rope_ti_scale, + float rope_scaling_factor, float llama3_inv_scaling_factor, float llama3_alpha, float llama3_beta, + float yarn_ramp_inv_factor_div_2, + float yarn_ramp_inv_factor_mul_min, + float yarn_inv_scaling_factor, + float attention_scaling, int64_t stride_b, int64_t stride_c, int64_t stride_h, @@ -128,9 +133,14 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks, rope_dim, base, rope_ti_scale, + rope_scaling_factor, llama3_inv_scaling_factor, llama3_alpha, llama3_beta, + yarn_ramp_inv_factor_div_2, + yarn_ramp_inv_factor_mul_min, + yarn_inv_scaling_factor, + attention_scaling, std::integral_constant{}); PRAGMA_UNROLL for (int s = 0; s < ITER_S; ++s) { @@ -204,9 +214,14 @@ void invokeProcessKV_v2(char** blocks, const float* rope_base, int rope_dim, float rope_ti_scale, + float rope_scaling_factor, float llama3_inv_scaling_factor, float llama3_1_alpha, float llama3_1_beta, + float yarn_ramp_inv_factor_div_2, + float yarn_ramp_inv_factor_mul_min, + float yarn_inv_scaling_factor, + float attention_scaling, int64_t stride_b, int64_t stride_c, int64_t stride_h, @@ -245,9 +260,14 @@ void invokeProcessKV_v2(char** blocks, rope_base, rope_dim, rope_ti_scale, + rope_scaling_factor, llama3_inv_scaling_factor, llama3_1_alpha, llama3_1_beta, + yarn_ramp_inv_factor_div_2, + yarn_ramp_inv_factor_mul_min, + yarn_inv_scaling_factor, + attention_scaling, stride_b, stride_c, stride_h, @@ -279,9 +299,14 @@ void invokeProcessKV_v2(char** blocks, const float* rope_base, \ int rope_dim, \ float rope_ti_scale, \ + float rope_scaling_factor, \ float llama3_inv_scaling_factor, \ float llama3_1_alpha, \ float llama3_1_beta, \ + float yarn_ramp_inv_factor_div_2, \ + float yarn_ramp_inv_factor_mul_min, \ + float yarn_inv_scaling_factor, \ + float attention_scaling, \ int64_t stride_b, \ int64_t stride_c, \ int64_t stride_h, \ @@ -309,9 +334,14 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k, const float* rope_base, int rope_dim, float rope_ti_scale, + float rope_scaling_factor, float llama3_inv_scaling_factor, float llama3_alpha, float llama3_beta, + float yarn_ramp_inv_factor_div_2, + float yarn_ramp_inv_factor_mul_min, + float yarn_inv_scaling_factor, + float attention_scaling, int64_t stride_b, int64_t stride_c, int64_t stride_h, @@ -397,9 +427,14 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k, rope_dim, base, rope_ti_scale, + rope_scaling_factor, llama3_inv_scaling_factor, 
llama3_alpha, llama3_beta, + yarn_ramp_inv_factor_div_2, + yarn_ramp_inv_factor_mul_min, + yarn_inv_scaling_factor, + attention_scaling, std::integral_constant{}); PRAGMA_UNROLL for (int s = 0; s < ITER_S; ++s) { @@ -434,9 +469,14 @@ void invokeFlattenKV_v2(T* k, const float* rope_base, int rope_dim, float rope_ti_scale, + float rope_scaling_factor, float llama3_inv_scaling_factor, float llama3_alpha, float llama3_beta, + float yarn_ramp_inv_factor_div_2, + float yarn_ramp_inv_factor_mul_min, + float yarn_inv_scaling_factor, + float attention_scaling, int64_t stride_b, int64_t stride_c, int64_t stride_h, @@ -472,9 +512,14 @@ void invokeFlattenKV_v2(T* k, rope_base, rope_dim, rope_ti_scale, + rope_scaling_factor, llama3_inv_scaling_factor, llama3_alpha, llama3_beta, + yarn_ramp_inv_factor_div_2, + yarn_ramp_inv_factor_mul_min, + yarn_inv_scaling_factor, + attention_scaling, stride_b, stride_c, stride_h, @@ -503,9 +548,14 @@ void invokeFlattenKV_v2(T* k, const float* rope_base, \ int rope_dim, \ float rope_ti_scale, \ + float rope_scaling_factor, \ float llama3_inv_scaling_factor, \ float llama3_alpha, \ float llama3_beta, \ + float yarn_ramp_inv_factor_div_2, \ + float yarn_ramp_inv_factor_mul_min, \ + float yarn_inv_scaling_factor, \ + float attention_scaling, \ int64_t stride_b, \ int64_t stride_c, \ int64_t stride_h, \ diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.h b/src/turbomind/kernels/attention/kv_cache_utils_v2.h index 74ba7fafb0..fe45ad7be7 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.h +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.h @@ -19,9 +19,14 @@ void invokeProcessKV_v2(char** blocks, const float* rope_base, int rope_dim, float rope_ti_scale, + float rope_scaling_factor, float llama3_inv_scaling_factor, float llama3_alpha, float llama3_beta, + float yarn_ramp_inv_factor_div_2, + float yarn_ramp_inv_factor_mul_min, + float yarn_inv_scaling_factor, + float attention_scaling, int64_t stride_b, int64_t stride_c, int64_t stride_h, @@ -49,9 +54,14 @@ void invokeProcessKV_v2_(const AttentionParams& params) params.rope_theta, params.rotary_embedding_dim, params.rope_ti_scale, + params.rope_scaling_factor, params.llama3_inv_scaling_factor, params.llama3_alpha, params.llama3_beta, + params.yarn_ramp_inv_factor_div_2, + params.yarn_ramp_inv_factor_mul_min, + params.yarn_inv_scaling_factor, + params.attention_scaling, 0, // stride b params.stride / params.size_per_head, // stride c 1, // stride h @@ -75,9 +85,14 @@ void invokeFlattenKV_v2(T* k, const float* rope_base, int rope_dim, float rope_ti_scale, + float rope_scaling_factor, float llama3_inv_scaling_factor, float llama3_alpha, float llama3_beta, + float yarn_ramp_inv_factor_div_2, + float yarn_ramp_inv_factor_mul_min, + float yarn_inv_scaling_factor, + float attention_scaling, int64_t stride_b, int64_t stride_c, int64_t stride_h, @@ -104,9 +119,14 @@ void invokeFlattenKV_v2_(const AttentionParams& params, int sum_k_len) nullptr, // params.rope_theta, params.rotary_embedding_dim, params.rope_ti_scale, + params.rope_scaling_factor, params.llama3_inv_scaling_factor, params.llama3_alpha, params.llama3_beta, + params.yarn_ramp_inv_factor_div_2, + params.yarn_ramp_inv_factor_mul_min, + params.yarn_inv_scaling_factor, + params.attention_scaling, 0, 1, 2 * sum_k_len, diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h index 8bc54ad268..8e09da22cd 100644 --- a/src/turbomind/kernels/attention/rotary_embedding.h +++ 
b/src/turbomind/kernels/attention/rotary_embedding.h @@ -74,17 +74,24 @@ struct FastRoPE { Array inv_freq_; bool is_valid_; + float attention_scaling_; __device__ FastRoPE(int idx, D dims, float base, float ti_scale, + float factor, float llama3_inv_scaling_factor, float llama3_alpha, float llama3_beta, + float yarn_ramp_inv_factor_div_2, + float yarn_ramp_inv_factor_mul_min, + float yarn_inv_scaling_factor, + float attention_scaling, std::integral_constant) { - is_valid_ = idx < dims; + is_valid_ = idx < dims; + attention_scaling_ = attention_scaling; /// TODO: Take this away from device code const float scale_factor = -log2f(base) / dims; PRAGMA_UNROLL @@ -110,6 +117,15 @@ struct FastRoPE { inv_freq_[i / 2] = (1 - smooth) * freq * llama3_inv_scaling_factor + smooth * freq; } } + if (yarn_ramp_inv_factor_div_2) { + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + auto freq = inv_freq_[i / 2]; + float alpha = (idx + i) * yarn_ramp_inv_factor_div_2 - yarn_ramp_inv_factor_mul_min; + alpha = fmaxf(0.f, fminf(1.f, alpha)); + inv_freq_[i / 2] = freq - freq * alpha * yarn_inv_scaling_factor; + } + } } template @@ -119,6 +135,8 @@ struct FastRoPE { for (int i = 0; i < N; i += 2) { float c, s; sincosf(timestep * inv_freq_[i / 2], &s, &c); + s *= attention_scaling_; + c *= attention_scaling_; float tmp0 = c * (float)x[i] - s * (float)x[i + 1]; float tmp1 = c * (float)x[i + 1] + s * (float)x[i]; if (is_valid_) { diff --git a/src/turbomind/kernels/attention/test_attention.cu b/src/turbomind/kernels/attention/test_attention.cu index 4496b8b4a1..c6d7b40637 100644 --- a/src/turbomind/kernels/attention/test_attention.cu +++ b/src/turbomind/kernels/attention/test_attention.cu @@ -150,7 +150,12 @@ void TestBlocks(const thrust::universal_vector& k_cache, // [B, H, S, rope_dim, 1., 0., + 0., + 1.0, 1.0, + 0.0, + 0.0, + 0.0, 1.0, 2 * head_num * seq_len, 0, @@ -179,8 +184,13 @@ void TestBlocks(const thrust::universal_vector& k_cache, // [B, H, S, rope_dim, 1., 0., + 0., 1.0, 1.0, + 0.0, + 0.0, + 0.0, + 1.0, 2 * head_num * seq_len, 0, seq_len, @@ -538,7 +548,12 @@ int test_attention() kRoPEDim, 1., 0., + 0., + 1.0, 1.0, + 0.0, + 0.0, + 0.0, 1.0, KvHeadNum * kContextLen, 0, diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 4cb9e27e13..1c039ca66a 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -45,6 +45,9 @@ struct AttentionParam { float rope_scaling_factor; float low_freq_factor; float high_freq_factor; + float attention_factor; + float beta_fast; + float beta_slow; bool use_dynamic_ntk; bool use_logn_attn; int cache_block_seq_len; diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index f38151a1a5..2f99b0c2ce 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -296,6 +296,8 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.rotary_embedding_dim = param_.rotary_embedding_dim; params.rotary_embedding_base = param_.rotary_embedding_base; params.max_position_embeddings = param_.max_position_embeddings; + params.rope_scaling_factor = param_.rope_scaling_factor; + params.attention_scaling = 1.0; params.rope_ti_scale = 1.f; if (param_.rope_scaling_type == "linear") { params.rope_ti_scale /= param_.rope_scaling_factor; @@ -307,6 +309,34 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa 
params.llama3_alpha = param_.original_max_position_embeddings / (2 * PI) * inv_diff_freq_factor; params.llama3_beta = param_.low_freq_factor * inv_diff_freq_factor; } + if (param_.rope_scaling_type == "yarn") { + const double PI = 3.14159265358979323846; + auto find_correction_dim = [&](float num_rotations) { + return (param_.rotary_embedding_dim + * std::log(param_.max_position_embeddings / (num_rotations * 2 * PI))) + / (2 * std::log(param_.rotary_embedding_base)); + }; + auto find_correction_range = [&](float low_rot, float high_rot, float& low, float& high) { + low = std::floor(find_correction_dim(low_rot)); + high = std::ceil(find_correction_dim(high_rot)); + low = std::max(low, 0.f); + high = std::min(high, param_.rotary_embedding_dim - 1.f); + }; + float low, high; + find_correction_range(param_.beta_fast, param_.beta_slow, low, high); + if (low == high) { + high += 0.01f; + } + params.yarn_ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0; + params.yarn_ramp_inv_factor_mul_min = 1.0 / (high - low) * low; + params.yarn_inv_scaling_factor = (1 - 1.0 / param_.rope_scaling_factor); + if (param_.attention_factor < 0) { + params.attention_scaling = 0.1 * std::log(param_.rope_scaling_factor) + 1.0; + } + else { + params.attention_scaling = param_.attention_factor; + } + } params.use_logn_attn = param_.use_logn_attn; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 44f73370da..5fbd4287a8 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -265,6 +265,9 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, // rotary embedding parameters attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); attn_param_.rotary_embedding_base = attention_reader["rope_theta"].as(10000.0f); + attn_param_.attention_factor = attention_reader["attention_factor"].as(-1.f); + attn_param_.beta_fast = attention_reader["beta_fast"].as(32.f); + attn_param_.beta_slow = attention_reader["beta_slow"].as(1.f); attn_param_.rope_scaling_type = attention_reader["rope_scaling_type"].as(""); attn_param_.rope_scaling_factor = attention_reader["rope_scaling_factor"].as(0.f); attn_param_.low_freq_factor = attention_reader["low_freq_factor"].as(1.0); From 5f577c244e100433c1c71b062de8917656629c1c Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 4 Nov 2024 16:55:15 +0800 Subject: [PATCH 055/122] miss to read moe_ffn weights from converted tm model (#2698) * miss to read moe_ffn weights * fix linting * fix linting * fix linting --- .../models/llama/LlamaDecoderLayerWeight.cc | 57 +++++++++++++++++-- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 7ed657a9b8..2d68ef3535 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -303,6 +303,43 @@ void loadWeights( } } +template +void loadWeights(LlamaDenseWeight& w, std::string prefix, FtCudaDataType model_file_type) +{ + auto weight_file = prefix + ".weight"; + auto qweight_file = prefix + ".qweight"; + + if (!std::filesystem::exists(weight_file) && !std::filesystem::exists(qweight_file)) { + TM_LOG_ERROR("%s and %s does not exist", weight_file.c_str(), qweight_file.c_str()); + FT_CHECK(false); + } + + size_t dim0 = w.input_dims; + size_t dim1 = w.output_dims; + const auto type = model_file_type; + 
+ if (w.bias) { + loadWeightFromBin((T*)w.bias, {1, dim1}, prefix + ".bias", type); + } + const size_t bit_size = getBitSize(w.type); + if (bit_size >= 16) { // fp16, fp32 + loadWeightFromBin((T*)w.kernel, {dim0, dim1}, prefix + ".weight", type); + } + else { // int8, int4 + const int factor = sizeof(float) * 8 / bit_size; + + FT_CHECK(dim1 % factor == 0); + + std::vector w_shape{dim0, dim1 / factor * sizeof(uint32_t)}; + loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8); + + const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1; + + loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type); + loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type); + } +} + template void LlamaDecoderLayerWeight::mallocWeights() { @@ -357,10 +394,22 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType loadWeights(self_attn_weights.qkv, dir_path + ".attention.w_qkv", tensor_para_rank_, type, tensor_para_size_); loadWeights(self_attn_weights.output, dir_path + ".attention.wo", tensor_para_rank_, type, tensor_para_size_); - - loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_); - loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_); - loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_); + if (moe_weights.experts.empty()) { + loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_); + loadWeights( + ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_); + loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_); + } + else { + loadWeights(moe_weights.gate, dir_path + ".moe_ffn.gate", type); + for (size_t i = 0; i < moe_weights.experts.size(); ++i) { + std::string weight_name = dir_path + ".moe_ffn.experts." + std::to_string(i); + loadWeights(moe_weights.experts[i].gating, weight_name + ".w1", tensor_para_rank_, type, tensor_para_size_); + loadWeights( + moe_weights.experts[i].intermediate, weight_name + ".w3", tensor_para_rank_, type, tensor_para_size_); + loadWeights(moe_weights.experts[i].output, weight_name + ".w2", tensor_para_rank_, type, tensor_para_size_); + } + } } template From a417a877e788a0e5bcdcabd20fad2d10c9deb23d Mon Sep 17 00:00:00 2001 From: Blank_Answer <97771966+blankanswer@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:55:34 +0800 Subject: [PATCH 056/122] Fix llama3.2 VL vision in "Supported Modals" documents (#2703) * fix zh_cn supported_models.md llama3.2 version * fix zh_cn supported_models.md llama3.2 version --- docs/en/supported_models/supported_models.md | 2 +- docs/zh_cn/supported_models/supported_models.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index cd38a60025..1f344e78bb 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -51,7 +51,7 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | | Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - | +| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 9bdbf0d45d..ac061cf1ae 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -51,7 +51,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | | Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2-VL | 8B, 90B | MLLM | Yes | Yes | Yes | No | - | +| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | From 71f1d0f42a56f93e0c663432d8c50e2c2c919965 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 5 Nov 2024 14:32:17 +0800 Subject: [PATCH 057/122] Fix turbomind TP (#2706) * fix tp * use fused moe for all arches by default --- src/turbomind/triton_backend/llama/LlamaTritonModel.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 5fbd4287a8..8db13652f5 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -336,12 +336,6 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, } else { moe_param_.method = ft::MoeParam::kFused; - // Note: This will fail when GPUs of different SMs are mixed - if (weight_type_ != ft::WeightType::kINT4 && ft::getSMVersion() >= 90) { - // On sm90 the cuBLAS method may be faster as our grouped GEMM is not - // optimized for GMMA yet - moe_param_.method = ft::MoeParam::kNaive; - } } TM_LOG_INFO("%s", toString().c_str()); @@ -380,6 +374,10 @@ std::unique_ptr> LlamaTritonModel::createSharedModelInstance( shared_state_, device_id); + // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang + // due to concurrent kernel launch & cudaMallocHost + shared_state_->barrier->wait(); + engine->Start(); return engine; From ed9aa15b14e97e7987d4f90b8058e346bd175616 Mon Sep 17 00:00:00 2001 From: tangzhiyi11 Date: Tue, 5 Nov 2024 19:39:55 +0800 Subject: [PATCH 058/122] feat: support dynamic/llama3 rotary embedding in ascend graph mode (#2670) * feat: support dynamic ntk scaling rotary embedding in ascend graph mode * add llama3 rotary embedding * remove useless codes --- .../backends/dlinfer/rotary_embedding.py | 153 ++++++++++++++---- 1 file changed, 124 insertions(+), 29 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py index e97c9d1338..fab6e510f5 100644 --- a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py +++ b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py @@ -1,14 +1,44 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import math + import torch from torch import nn -from ..default.rotary_embedding import (Llama3RotaryEmbeddingImpl, - LlamaDynamicNTKScalingRotaryEmbedding) +from ..default.rotary_embedding import LlamaDynamicNTKScalingRotaryEmbedding from ..rotary_embedding import (Llama3Parameters, LongRoPEScalingParameters, RopeType, RotaryEmbeddingBuilder, RotaryEmbeddingImpl, YarnParameters) +def _rotary_embedding_fwd(position_ids: torch.Tensor, + inv_freq: torch.Tensor, + scaling_factor: float, + mscale: float = None, + dtype: torch.dtype = None): + """rotary embedding forward.""" + if dtype is None: + dtype = torch.float16 + + if scaling_factor != 1.0: + position_ids = position_ids.float() / scaling_factor + else: + position_ids = position_ids.float() + + inv_freq_expanded = inv_freq.view(1, -1, 1) + position_ids_expanded = position_ids.unsqueeze(1) + + tmp = torch.bmm(inv_freq_expanded, position_ids_expanded) + freqs = tmp.transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + if mscale is not None: + cos = cos * mscale + sin = sin * mscale + return cos.to(dtype=dtype), sin.to(dtype=dtype) + + class DlinferRotaryEmbeddingImpl(RotaryEmbeddingImpl, nn.Module): """base rotary embedding.""" @@ -28,34 +58,100 @@ def __init__(self, def forward(self, x, position_ids): """forward.""" # x: [bs, num_attention_heads, seq_len, head_size] + dtype = x.dtype if self.inv_freq.device != x.device: self.inv_freq = self.inv_freq.to(x.device) + return _rotary_embedding_fwd(position_ids, + self.inv_freq, + scaling_factor=self.scaling_factor, + dtype=dtype) - if self.scaling_factor != 1.0: - position_ids = position_ids.float() / self.scaling_factor - else: - position_ids = position_ids.float() - - inv_freq_expanded = self.inv_freq.view(1, -1, 1) - position_ids_expanded = position_ids.unsqueeze(1) - - # # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if isinstance( - device_type, str) and device_type != 'mps' else 'cpu' - inv_freq_expanded = inv_freq_expanded - position_ids_expanded = position_ids_expanded - tmp = torch.bmm(inv_freq_expanded, position_ids_expanded) - freqs = tmp.transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + +class DlinferLlamaDynamicNTKScalingRotaryEmbedding( + LlamaDynamicNTKScalingRotaryEmbedding): + """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
+ + Credits to the Reddit users /u/bloc97 and /u/emozilla + """ + + def __init__(self, + dim: int, + base: int = 10000, + scaling_factor: float = 1.0, + max_position_embeddings: int = 2048): + super().__init__(dim, base, scaling_factor, max_position_embeddings) + self.dim_scale_ratio = self.dim / (self.dim - 2) + self.pos_freq_scaling = torch.arange( + 0, self.dim, 2, dtype=torch.int64).float().cuda() / self.dim + self.scale_offset = self.scaling_factor - 1 + self.pos_scale_factor = self.scaling_factor / \ + self.max_position_embeddings + + def _ntk_inv_freq(self, seq_len: torch.Tensor): + """Calculate inverse frequency with NTK scaling.""" + base = self.base * ((self.pos_scale_factor * seq_len) - + self.scale_offset)**self.dim_scale_ratio + inv_freq = 1.0 / (base**self.pos_freq_scaling) + return inv_freq + + def forward(self, x: torch.Tensor, position_ids: torch.Tensor): + """forward.""" + dtype = x.dtype + seq_len = torch.max(position_ids) + 1 + ntk_inv_freq = self._ntk_inv_freq(seq_len) + if self.inv_freq.device != x.device: + self.inv_freq = self.inv_freq.to(x.device) + inv_freq = torch.where(seq_len > self.max_position_embeddings, + ntk_inv_freq, self.inv_freq) + + cos, sin = _rotary_embedding_fwd(position_ids, + inv_freq, + scaling_factor=1.0, + dtype=dtype) + return cos, sin + + +class DlinferLlama3RotaryEmbeddingImpl(DlinferRotaryEmbeddingImpl): + """llama3 rotary embedding implementation.""" + + def __init__( + self, + dim: int, + base: int = 10000, + scaling_factor: float = 1.0, + low_freq_factor: float = 1.0, + high_freq_factor: float = 4.0, + original_max_position_embeddings: int = 8194, + ): + super().__init__(dim, base, scaling_factor) + old_context_len = original_max_position_embeddings + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + inv_freq = self.inv_freq + factor = self.scaling_factor + + wavelen = 2 * math.pi / inv_freq + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, + inv_freq / factor, inv_freq) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor) + smoothed_inv_freq = ( + 1 - smooth_factor + ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > + low_freq_wavelen) + inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, + inv_freq_llama) + self.scaling_factor = 1.0 + self.register_buffer('inv_freq', inv_freq_llama) class DlinferRotaryEmbeddingBuilder(RotaryEmbeddingBuilder): - """rotary embedding builder.""" + """rotary embedding dlinfer builder.""" @staticmethod def build( @@ -72,13 +168,12 @@ def build( if emb_type in (RopeType.Default, RopeType.LinearScaling): return DlinferRotaryEmbeddingImpl(dim, base, scaling_factor) elif emb_type == RopeType.DynamicNTKScaling: - return LlamaDynamicNTKScalingRotaryEmbedding( + return DlinferLlamaDynamicNTKScalingRotaryEmbedding( dim, base, scaling_factor, max_position_embeddings) elif emb_type == RopeType.Llama3: - return Llama3RotaryEmbeddingImpl(dim, base, scaling_factor, - llama3_params.low_freq_factor, - llama3_params.high_freq_factor, - max_position_embeddings) + return DlinferLlama3RotaryEmbeddingImpl( + dim, base, scaling_factor, llama3_params.low_freq_factor, + llama3_params.high_freq_factor, max_position_embeddings) else: raise NotImplementedError( 
f'Unsupported embedding type: {emb_type}') From 364a142916dd2c7264408af0d282ad9879b0069d Mon Sep 17 00:00:00 2001 From: yaofengchen <67218893+yao-fengchen@users.noreply.github.com> Date: Tue, 5 Nov 2024 19:41:20 +0800 Subject: [PATCH 059/122] add linear op on dlinfer platform (#2627) * add linear op on ascend platform * update code --- lmdeploy/pytorch/backends/dlinfer/linear.py | 32 +++++++++++++++++++ .../pytorch/backends/dlinfer/op_backend.py | 3 ++ lmdeploy/pytorch/kernels/dlinfer/__init__.py | 2 ++ lmdeploy/pytorch/kernels/dlinfer/linear.py | 12 +++++++ 4 files changed, 49 insertions(+) create mode 100644 lmdeploy/pytorch/backends/dlinfer/linear.py create mode 100644 lmdeploy/pytorch/kernels/dlinfer/linear.py diff --git a/lmdeploy/pytorch/backends/dlinfer/linear.py b/lmdeploy/pytorch/backends/dlinfer/linear.py new file mode 100644 index 0000000000..567a01dddf --- /dev/null +++ b/lmdeploy/pytorch/backends/dlinfer/linear.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch + +from lmdeploy.pytorch.kernels.dlinfer import linear + +from ..linear import LinearBuilder, LinearImpl + + +class DlinferLinearImpl(LinearImpl): + """Dlinfer linear implementation api.""" + + def forward(self, + x, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + all_reduce: bool = False): + """forward.""" + return linear(x, weight, bias, all_reduce) + + +class DlinferLinearBuilder(LinearBuilder): + """Dlinfer linear implementation builder.""" + + @staticmethod + def build(in_features: int, + out_features: int, + bias: bool = True, + dtype: torch.dtype = None): + """build.""" + return DlinferLinearImpl() diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py index 031f51fdca..52a8830595 100644 --- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -40,6 +40,9 @@ def get_layer_impl_builder(cls, layer_type: OpType): elif layer_type == OpType.FusedMoE: from .moe import DlinferFusedMoEBuilder return DlinferFusedMoEBuilder + elif layer_type == OpType.Linear: + from .linear import DlinferLinearBuilder + return DlinferLinearBuilder elif layer_type == OpType.LinearW4A16: from .awq_modules import AwqLinearW4A16Builder return AwqLinearW4A16Builder diff --git a/lmdeploy/pytorch/kernels/dlinfer/__init__.py b/lmdeploy/pytorch/kernels/dlinfer/__init__.py index 4d678bfe68..8f86f0019a 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/__init__.py +++ b/lmdeploy/pytorch/kernels/dlinfer/__init__.py @@ -4,6 +4,7 @@ from .awq_kernels import awq_linear from .fill_kv_cache import fill_kv_cache from .fused_moe import fused_moe +from .linear import linear from .moe_gating_topk_softmax import moe_gating_topk_softmax from .pagedattention import paged_attention_fwd from .rms_norm import rms_norm @@ -15,6 +16,7 @@ 'fill_kv_cache', 'fused_moe', 'paged_attention_fwd', + 'linear', 'moe_gating_topk_softmax', 'multinomial_sampling', ] diff --git a/lmdeploy/pytorch/kernels/dlinfer/linear.py b/lmdeploy/pytorch/kernels/dlinfer/linear.py new file mode 100644 index 0000000000..695e089fd8 --- /dev/null +++ b/lmdeploy/pytorch/kernels/dlinfer/linear.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import dlinfer.ops as ext_ops +from torch import Tensor + + +def linear(x: Tensor, + weight: Tensor, + bias: Optional[Tensor] = None, + all_reduce: bool = False): + return ext_ops.linear(x, weight, bias=bias, all_reduce=all_reduce) From cc142150780351aebd9dac00539cd74e76d7a369 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:07:55 +0800 Subject: [PATCH 060/122] Add ensure_ascii = False for json.dumps (#2707) --- lmdeploy/model.py | 2 +- lmdeploy/pytorch/engine/logits_process.py | 3 ++- lmdeploy/serve/async_engine.py | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 26ab856bc2..98f8e373ba 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -515,7 +515,7 @@ def messages2prompt(self, for tool_call in message['tool_calls']: function = tool_call.get('function', {}) function['arguments'] = function.pop('parameters', {}) - content += f'<|action_start|><|plugin|>\n{json.dumps(function)}<|action_end|>' + content += f'<|action_start|><|plugin|>\n{json.dumps(function, ensure_ascii=False)}<|action_end|>' if 'name' in message and message['name'] in name_map: begin = box_map[role].strip( ) + f" name={name_map[message['name']]}\n" diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py index 44eb25a8c5..54740a4fb3 100644 --- a/lmdeploy/pytorch/engine/logits_process.py +++ b/lmdeploy/pytorch/engine/logits_process.py @@ -101,7 +101,8 @@ def _guided_sampling(response_formats: Tuple[Dict], scores: torch.Tensor, if isinstance(schema, Dict): for key in ['json_schema', 'schema']: if key in schema: - schema = json.dumps(schema[key]) + schema = json.dumps(schema[key], + ensure_ascii=False) elif schema is None: from .guided_process import JSON_GRAMMAR schema = JSON_GRAMMAR diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 6b7da04a99..3c8f193cd5 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -645,8 +645,9 @@ def parse_tool_response(self, text, tools, **kwargs): action = action.split('<|action_end|>'.strip())[0] action = action[action.find('{'):] action = json.loads(action) - name, parameters = action['name'], json.dumps( - action.get('parameters', action.get('arguments', {}))) + name, parameters = action['name'], json.dumps(action.get( + 'parameters', action.get('arguments', {})), + ensure_ascii=False) elif '') parameters = action[action.find('{'):] From 354028be3e8827a1276ec810973213ee2087a542 Mon Sep 17 00:00:00 2001 From: q yao Date: Wed, 6 Nov 2024 14:53:04 +0800 Subject: [PATCH 061/122] fix decoding kernel for deepseekv2 (#2688) --- lmdeploy/pytorch/kernels/cuda/pagedattention.py | 4 ++++ tests/pytorch/kernel/test_paged_attention.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index e15ab911fc..d125eabc85 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -121,6 +121,8 @@ def _fwd_grouped_split_kernel( cur_head = cur_kv_head * HEAD_PER_CTA + tl.arange(0, BLOCK_H) mask_h = cur_head < cur_kv_head * HEAD_PER_CTA + HEAD_PER_CTA mask_h = mask_h & (cur_head < num_heads_q) + if BLOCK_H < kv_group_num: + cur_kv_head = (cur_kv_head * HEAD_PER_CTA) // kv_group_num q_seqlen = 1 kv_seqlen = tl.load(KV_seqlens + cur_batch) @@ -366,6 +368,8 @@ def 
_fwd_grouped_split_quant_kernel( cur_head = cur_kv_head * HEAD_PER_CTA + tl.arange(0, BLOCK_H) mask_h = cur_head < cur_kv_head * HEAD_PER_CTA + HEAD_PER_CTA mask_h = mask_h & (cur_head < num_heads_q) + if BLOCK_H < kv_group_num: + cur_kv_head = (cur_kv_head * HEAD_PER_CTA) // kv_group_num q_seqlen = 1 kv_seqlen = tl.load(KV_seqlens + cur_batch) diff --git a/tests/pytorch/kernel/test_paged_attention.py b/tests/pytorch/kernel/test_paged_attention.py index 7f63b281c5..0ef0db7330 100644 --- a/tests/pytorch/kernel/test_paged_attention.py +++ b/tests/pytorch/kernel/test_paged_attention.py @@ -244,7 +244,8 @@ def conti_gt(self, gt, seq_lens): @pytest.mark.parametrize('feat_dim', [48, 32], indirect=True) @pytest.mark.parametrize('feat_dim_v', [32], indirect=True) - @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(8, 2), (2, 2)], + @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(128, 2), (8, 2), + (2, 2)], indirect=True) @pytest.mark.parametrize(['seq_lens', 'history_lens'], [([30, 50, 70, 90], [50, 40, 30, 20]), From e7886b43e6097514700c124e262246ec39205004 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Wed, 6 Nov 2024 15:27:46 +0800 Subject: [PATCH 062/122] support turbomind head_dim 64 (#2715) * support head_dim 64 * fix unit-test * fix wrong dispatch * fix comments * fix comments --- .../turbomind/deploy/source_model/internvl.py | 5 +++ .../turbomind/deploy/source_model/llama.py | 2 + lmdeploy/turbomind/supported_models.py | 12 ++---- .../kernels/attention/CMakeLists.txt | 16 ++++++++ src/turbomind/kernels/attention/attention.cu | 19 ++++++--- .../codegen/attention_sm70_64_f16.cu | 16 ++++++++ .../codegen/attention_sm75_64_f16.cu | 17 ++++++++ .../codegen/attention_sm80_64_bf16.cu | 16 ++++++++ .../codegen/attention_sm80_64_f16.cu | 16 ++++++++ .../codegen/decoding_sm70_64_f16_f16.cu | 16 ++++++++ .../codegen/decoding_sm70_64_f16_u4.cu | 17 ++++++++ .../codegen/decoding_sm70_64_f16_u8.cu | 17 ++++++++ .../codegen/decoding_sm75_64_f16_f16.cu | 14 +++++++ .../codegen/decoding_sm75_64_f16_u4.cu | 14 +++++++ .../codegen/decoding_sm75_64_f16_u8.cu | 14 +++++++ .../codegen/decoding_sm80_64_bf16_bf16.cu | 22 ++++++++++ .../codegen/decoding_sm80_64_bf16_u4.cu | 14 +++++++ .../codegen/decoding_sm80_64_bf16_u8.cu | 14 +++++++ .../codegen/decoding_sm80_64_f16_f16.cu | 18 +++++++++ .../codegen/decoding_sm80_64_f16_u4.cu | 14 +++++++ .../codegen/decoding_sm80_64_f16_u8.cu | 14 +++++++ src/turbomind/kernels/attention/decoding.cu | 33 +++++++++------ src/turbomind/kernels/attention/impl_16816.h | 17 ++++---- src/turbomind/kernels/attention/impl_1688.h | 12 ++++-- src/turbomind/kernels/attention/impl_81616.h | 4 +- .../kernels/attention/kv_cache_utils_v2.cu | 40 ++++++++++++++----- src/turbomind/kernels/attention/reduce.cu | 39 ++++++++---------- tests/test_lmdeploy/test_auto_backend.py | 2 +- 28 files changed, 383 insertions(+), 71 deletions(-) create mode 100644 src/turbomind/kernels/attention/codegen/attention_sm70_64_f16.cu create mode 100644 src/turbomind/kernels/attention/codegen/attention_sm75_64_f16.cu create mode 100644 src/turbomind/kernels/attention/codegen/attention_sm80_64_bf16.cu create mode 100644 src/turbomind/kernels/attention/codegen/attention_sm80_64_f16.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_f16.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u4.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u8.cu create mode 100644 
src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_f16.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u4.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u8.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_bf16.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u4.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u8.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_f16.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u4.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u8.cu diff --git a/lmdeploy/turbomind/deploy/source_model/internvl.py b/lmdeploy/turbomind/deploy/source_model/internvl.py index 51082fb3a1..bb660a59b2 100644 --- a/lmdeploy/turbomind/deploy/source_model/internvl.py +++ b/lmdeploy/turbomind/deploy/source_model/internvl.py @@ -80,8 +80,13 @@ def model_info(self): scaling_factor = model_arg['rope_scaling'].get('factor', '') if scaling_type == 'dynamic': use_dynamic_ntk = 1 + attn_bias = 1 if model_arg['architectures'][ + 0] == 'Qwen2ForCausalLM' else 0 return dict(num_layer=num_layer, + size_per_head=hidden_units // attn_head_num, + rotary_embedding=hidden_units // attn_head_num, + attn_bias=attn_bias, norm_eps=norm_eps, hidden_units=hidden_units, inter_size=inter_size, diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py index 8e19fa8d87..a8aa51b144 100644 --- a/lmdeploy/turbomind/deploy/source_model/llama.py +++ b/lmdeploy/turbomind/deploy/source_model/llama.py @@ -189,6 +189,8 @@ def model_info(self): beta_slow = rope_scaling.get('beta_slow', 1.0) return dict( + size_per_head=hidden_units // attn_head_num, + rotary_embedding=hidden_units // attn_head_num, num_layer=num_layer, norm_eps=norm_eps, head_num=attn_head_num, diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 8a1f5e7315..979ed0c547 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -65,11 +65,10 @@ def is_supported(model_path: str): """ # noqa: E501 import os - def _is_head_dim_128(cfg): + def _is_head_dim_supported(cfg): num_attn_head = cfg.num_attention_heads hidden_size = cfg.hidden_size - # turbomind support head_dim=128 - return (hidden_size // num_attn_head) == 128 + return (hidden_size // num_attn_head) in [128, 64] support_by_turbomind = False triton_model_path = os.path.join(model_path, 'triton_models') @@ -87,9 +86,7 @@ def _is_head_dim_128(cfg): # baichuan-13B, baichuan2-13B not supported by turbomind support_by_turbomind = False elif arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: - # the head_dim of qwen2 0.5b and llama3.2-1b is 64, which - # hasn't been supported by turbomind yet - support_by_turbomind = _is_head_dim_128(cfg) + support_by_turbomind = _is_head_dim_supported(cfg) elif arch in ('ChatGLMModel', 'ChatGLMForConditionalGeneration'): # chatglm1/2/3 is not working yet support_by_turbomind = cfg.num_layers == 40 @@ -97,7 +94,6 @@ def _is_head_dim_128(cfg): # glm-4v-9b not supported support_by_turbomind = False elif arch == 'InternVLChatModel': - # internvl2-4b,internlm2-1b are not working yet - support_by_turbomind = _is_head_dim_128(cfg.llm_config) + support_by_turbomind = _is_head_dim_supported(cfg.llm_config) return support_by_turbomind diff --git 
a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index 4ca63f5db6..af9d47e0e6 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -22,6 +22,22 @@ add_library(attention STATIC codegen/decoding_sm80_128_f16_f16.cu codegen/decoding_sm80_128_f16_u4.cu codegen/decoding_sm80_128_f16_u8.cu + codegen/attention_sm70_64_f16.cu + codegen/attention_sm75_64_f16.cu + codegen/attention_sm80_64_bf16.cu + codegen/attention_sm80_64_f16.cu + codegen/decoding_sm70_64_f16_f16.cu + codegen/decoding_sm70_64_f16_u4.cu + codegen/decoding_sm70_64_f16_u8.cu + codegen/decoding_sm75_64_f16_f16.cu + codegen/decoding_sm75_64_f16_u4.cu + codegen/decoding_sm75_64_f16_u8.cu + codegen/decoding_sm80_64_bf16_bf16.cu + codegen/decoding_sm80_64_bf16_u4.cu + codegen/decoding_sm80_64_bf16_u8.cu + codegen/decoding_sm80_64_f16_f16.cu + codegen/decoding_sm80_64_f16_u4.cu + codegen/decoding_sm80_64_f16_u8.cu ) set_property(TARGET attention PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index ffbad56b46..3f557234bc 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ b/src/turbomind/kernels/attention/attention.cu @@ -14,20 +14,19 @@ template void dispatchAttention(const AttentionParams& params) { using namespace attention; - if (params.size_per_head == 128) { - + auto dispatch = [&](const auto dim) { + constexpr int kHeadDim = dim; if (params.arch >= 80) { - using Config = AttentionConfig; + using Config = AttentionConfig; return invokeAttention(params); } - if constexpr (!std::is_same_v) { if (params.arch == 75) { - return invokeAttention::Kernel>( + return invokeAttention::Kernel>( params); } else if (params.arch >= 70) { - return invokeAttention::Kernel>( + return invokeAttention::Kernel>( params); } } @@ -38,6 +37,14 @@ void dispatchAttention(const AttentionParams& params) params.arch); } } + FT_CHECK(0); + }; + + if (params.size_per_head == 64) { + return dispatch(std::integral_constant{}); + } + else if (params.size_per_head == 128) { + return dispatch(std::integral_constant{}); } FT_CHECK(0); } diff --git a/src/turbomind/kernels/attention/codegen/attention_sm70_64_f16.cu b/src/turbomind/kernels/attention/codegen/attention_sm70_64_f16.cu new file mode 100644 index 0000000000..72b219432c --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm70_64_f16.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/attention_sm75_64_f16.cu b/src/turbomind/kernels/attention/codegen/attention_sm75_64_f16.cu new file mode 100644 index 0000000000..cef945015a --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm75_64_f16.cu @@ -0,0 +1,17 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +// ! 
register spill +// template void invokeAttention::Kernel>( +// const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/attention_sm80_64_bf16.cu b/src/turbomind/kernels/attention/codegen/attention_sm80_64_bf16.cu new file mode 100644 index 0000000000..cc6e54c14b --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm80_64_bf16.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/attention_sm80_64_f16.cu b/src/turbomind/kernels/attention/codegen/attention_sm80_64_f16.cu new file mode 100644 index 0000000000..26e3f54b29 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm80_64_f16.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_f16.cu b/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_f16.cu new file mode 100644 index 0000000000..12558aeae6 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_f16.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u4.cu b/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u4.cu new file mode 100644 index 0000000000..25b49f9590 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u4.cu @@ -0,0 +1,17 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_params.h" +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u8.cu b/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u8.cu new file mode 100644 index 0000000000..824cd5b02e --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u8.cu @@ -0,0 +1,17 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
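+//
+// Note (a hedged reading of the file layout, not stated explicitly in the
+// patch): each codegen/*.cu unit added here explicitly instantiates the
+// decoding kernels for one combination encoded in its file name -- SM
+// architecture (sm70/sm75/sm80), head_dim 64, query dtype (f16/bf16) and
+// KV-cache dtype (f16/bf16/u8/u4) -- so that the runtime dispatch added in
+// decoding.cu (dispatch_head_dim -> dispatch_kv -> dispatch_h) can pick a
+// prebuilt kernel when size_per_head == 64.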
+ +#include "../attention_params.h" +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_f16.cu b/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_f16.cu new file mode 100644 index 0000000000..456e6e18d7 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_f16.cu @@ -0,0 +1,14 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u4.cu b/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u4.cu new file mode 100644 index 0000000000..171e59f5f1 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u4.cu @@ -0,0 +1,14 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u8.cu b/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u8.cu new file mode 100644 index 0000000000..1d6d40ed3a --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u8.cu @@ -0,0 +1,14 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_bf16.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_bf16.cu new file mode 100644 index 0000000000..b657034c4c --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_bf16.cu @@ -0,0 +1,22 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool +invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u4.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u4.cu new file mode 100644 index 0000000000..a5c0b34b7f --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u4.cu @@ -0,0 +1,14 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams&); + +template bool invokeDecoding>(const AttentionParams&); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u8.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u8.cu new file mode 100644 index 0000000000..a7dd3050b1 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u8.cu @@ -0,0 +1,14 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams&); + +template bool invokeDecoding>(const AttentionParams&); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_f16.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_f16.cu new file mode 100644 index 0000000000..e73be11e62 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_f16.cu @@ -0,0 +1,18 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u4.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u4.cu new file mode 100644 index 0000000000..c7c560e98d --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u4.cu @@ -0,0 +1,14 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams&); + +template bool invokeDecoding>(const AttentionParams&); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u8.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u8.cu new file mode 100644 index 0000000000..06f6ce5600 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u8.cu @@ -0,0 +1,14 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool invokeDecoding>(const AttentionParams&); + +template bool invokeDecoding>(const AttentionParams&); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 2b9328a681..1b04b7d4eb 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -29,8 +29,6 @@ constexpr auto get_kv_type(std::integral_constant) template void dispatchDecoding(const AttentionParams& params) { - static constexpr std::integral_constant kHeadDim{}; - const bool is_kv_int8 = params.quant_policy & QuantPolicy::kCacheKVInt8; const bool is_kv_int4 = params.quant_policy & QuantPolicy::kCacheKVInt4; const int query_group_sz = params.num_heads / params.num_kv_heads; @@ -39,9 +37,10 @@ void dispatchDecoding(const AttentionParams& params) /// TODO: we need better Qh dispatching, when #waves < 1, smaller Qh may outperform larger Qh due to better // concurrency - auto dispatch_h = [&](auto arch, auto kv) -> bool { - using Arch = decltype(arch); - using Tkv = decltype(kv); + auto dispatch_h = [&](auto arch, auto kv, const auto dim) -> bool { + using Arch = decltype(arch); + using Tkv = decltype(kv); + constexpr int kHeadDim = dim; if (0) {} else if (query_group_sz > 8) { return invokeDecoding>(params); @@ -73,31 +72,41 @@ void dispatchDecoding(const AttentionParams& params) return false; }; - auto dispatch_kv = [&](auto arch) -> bool { + auto dispatch_kv = [&](auto arch, const auto dim) -> bool { FT_CHECK(!(is_kv_int4 && is_kv_int8)); if (is_kv_int4) { - return dispatch_h(arch, uint4_t{}); + return dispatch_h(arch, uint4_t{}, dim); } else if (is_kv_int8) { - return dispatch_h(arch, uint8_t{}); + return dispatch_h(arch, uint8_t{}, dim); } else { - return dispatch_h(arch, T{}); + return dispatch_h(arch, T{}, dim); + } + return false; + }; + + auto dispatch_head_dim = [&](auto arch) { + if (params.size_per_head == 128) { + return dispatch_kv(arch, std::integral_constant{}); + } + else if (params.size_per_head == 64) { + return dispatch_kv(arch, std::integral_constant{}); } return false; }; auto dispatch = [&]() { if (params.arch >= 80) { - return dispatch_kv(arch::Sm80{}); + return dispatch_head_dim(arch::Sm80{}); } if constexpr (!std::is_same_v) { if (params.arch == 75) { - return dispatch_kv(arch::Sm75{}); + return dispatch_head_dim(arch::Sm75{}); } else if (params.arch >= 70) { - return dispatch_kv(arch::Sm70{}); + return dispatch_head_dim(arch::Sm70{}); } } diff --git a/src/turbomind/kernels/attention/impl_16816.h b/src/turbomind/kernels/attention/impl_16816.h index 69e0a6a48c..6e8f37f4d4 100644 --- a/src/turbomind/kernels/attention/impl_16816.h +++ b/src/turbomind/kernels/attention/impl_16816.h @@ -63,14 +63,15 @@ struct Impl>; -#if 0 - using SmemLayoutK = SmemLayoutV2>; - using SmemLayoutV = SmemLayoutV2>; -#else - using SmemLayoutK = SmemLayoutV2>; - using SmemLayoutV = SmemLayoutV2>; -#endif + using SmemLayoutQ = std::conditional_t>, + SmemLayoutV2>>; + using SmemLayoutK = std::conditional_t>, + SmemLayoutV2>>; + using SmemLayoutV = std::conditional_t>, + SmemLayoutV2>>; using SmemLayoutKVp = void; diff --git a/src/turbomind/kernels/attention/impl_1688.h b/src/turbomind/kernels/attention/impl_1688.h index 856ddcd587..a822c58039 100644 --- a/src/turbomind/kernels/attention/impl_1688.h +++ b/src/turbomind/kernels/attention/impl_1688.h @@ -61,9 +61,15 @@ struct 
Impl[V_K][V_N]; // ((d8, s4), (Sk, Dn), (s2)) // 1 2 8 8 1 - using SmemLayoutQ = SmemLayoutV2>; - using SmemLayoutK = SmemLayoutV2>; // load by (s32,d8) tile - using SmemLayoutV = SmemLayoutV2>; // load by (s8,d32) tile + using SmemLayoutQ = std::conditional_t>, + SmemLayoutV2>>; + using SmemLayoutK = std::conditional_t>, + SmemLayoutV2>>; + using SmemLayoutV = std::conditional_t>, + SmemLayoutV2>>; using SmemLayoutKVp = void; diff --git a/src/turbomind/kernels/attention/impl_81616.h b/src/turbomind/kernels/attention/impl_81616.h index 0c0baa531a..3b90bcdf57 100644 --- a/src/turbomind/kernels/attention/impl_81616.h +++ b/src/turbomind/kernels/attention/impl_81616.h @@ -104,7 +104,9 @@ struct Impl) { - return SmemLayoutV2>{}; + return std::conditional_t>, + SmemLayoutV2>>{}; } using SmemLayoutQ = SmemLayoutV2>; diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 9f28a17b83..20bb00fde8 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -241,10 +241,10 @@ void invokeProcessKV_v2(char** blocks, int block = WARPS * WARP_SIZE; dim3 grid((max_q_len + CTA_S - 1) / CTA_S, head_num, batch_size); - auto invoke = [&](auto tkv) { + auto invoke = [&](auto tkv, const auto dim) { using Tkv = decltype(tkv); - constexpr int kHeadDim = 128; + constexpr int kHeadDim = dim; FT_CHECK(head_dim == kHeadDim); block::Layout block_layout{block::Config{head_num, block_seq_len}}; @@ -276,14 +276,24 @@ void invokeProcessKV_v2(char** blocks, block_layout); }; + auto dispatch = [&](auto tkv) { + if (head_dim == 128) { + return invoke(tkv, std::integral_constant{}); + } + else if (head_dim == 64) { + return invoke(tkv, std::integral_constant{}); + } + FT_CHECK(0); + }; + if (quant_policy & QuantPolicy::kCacheKVInt8) { - invoke(uint8_t{}); + dispatch(uint8_t{}); } else if (quant_policy & QuantPolicy::kCacheKVInt4) { - invoke(uint4_t{}); + dispatch(uint4_t{}); } else { - invoke(T{}); + dispatch(T{}); } } @@ -496,10 +506,10 @@ void invokeFlattenKV_v2(T* k, constexpr int block = kWarpCnt * WARP_SIZE; const dim3 grid((max_seq_len + CTA_S - 1) / CTA_S, head_num, batch_size); - auto invoke = [&](auto tkv) { + auto invoke = [&](auto tkv, const auto dim) { using Tkv = decltype(tkv); - constexpr int kHeadDim = 128; + constexpr int kHeadDim = dim; FT_CHECK(head_dim == kHeadDim); block::Layout block_layout{block::Config{head_num, block_seq_len}}; @@ -528,14 +538,24 @@ void invokeFlattenKV_v2(T* k, block_layout); }; + auto dispatch = [&](auto tkv) { + if (head_dim == 64) { + return invoke(tkv, std::integral_constant{}); + } + else if (head_dim == 128) { + return invoke(tkv, std::integral_constant{}); + } + FT_CHECK(0); + }; + if (quant_policy & QuantPolicy::kCacheKVInt8) { - invoke(uint8_t{}); + dispatch(uint8_t{}); } else if (quant_policy & QuantPolicy::kCacheKVInt4) { - invoke(uint4_t{}); + dispatch(uint4_t{}); } else { - invoke(T{}); + dispatch(T{}); } } diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index 44b3dbfdaa..12f6aff38b 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -53,30 +53,25 @@ void invokeReduce(T* out, invoke(std::true_type{}, stride_k); } -template void invokeReduce<128>(half* out, - float* partial_M, - float* partial_L, - float* partial_O, - const int* split_cnt, - int partial_len, - int max_split_cnt, - int query_num, - int head_num, - float exp_scale, - 
cudaStream_t stream); +#define INSTANTIATE_invokeReduce(dim, type) \ + template void invokeReduce(type * out, \ + float* partial_M, \ + float* partial_L, \ + float* partial_O, \ + const int* split_cnt, \ + int partial_len, \ + int max_split_cnt, \ + int query_num, \ + int head_num, \ + float exp_scale, \ + cudaStream_t stream); + +INSTANTIATE_invokeReduce(128, half); +INSTANTIATE_invokeReduce(64, half); #if ENABLE_BF16 -template void invokeReduce<128>(nv_bfloat16* out, - float* partial_M, - float* partial_L, - float* partial_O, - const int* split_cnt, - int partial_len, - int max_split_cnt, - int query_num, - int head_num, - float exp_scale, - cudaStream_t stream); +INSTANTIATE_invokeReduce(128, nv_bfloat16); +INSTANTIATE_invokeReduce(64, nv_bfloat16) #endif } // namespace turbomind::attention diff --git a/tests/test_lmdeploy/test_auto_backend.py b/tests/test_lmdeploy/test_auto_backend.py index 3dfcac292a..5db727f17f 100644 --- a/tests/test_lmdeploy/test_auto_backend.py +++ b/tests/test_lmdeploy/test_auto_backend.py @@ -38,7 +38,7 @@ def models(self): ('Qwen/Qwen-7B-Chat', True, True), ('Qwen/Qwen-VL-Chat', False, True), ('Qwen/Qwen1.5-4B-Chat', True, True), - ('Qwen/Qwen1.5-0.5B-Chat', True, False), + ('Qwen/Qwen1.5-0.5B-Chat', True, True), ] return models From a4012efa0f21caa737d15bb558e216199a104581 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Thu, 7 Nov 2024 11:21:45 +0800 Subject: [PATCH 063/122] fix tp exit code for pytorch engine (#2718) --- lmdeploy/pytorch/engine/model_agent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index c713e3ec85..74938de812 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -517,7 +517,8 @@ def _check_context_alive(mp_context: mp.ProcessContext): for idx, exitcode in log_procs: logger.error(f'TP process {idx} failed with exitcode {exitcode}.') # TODO: not safe exit. 
- os._exit(1) + exit_code = 1 if len(log_procs) > 0 else 0 + os._exit(exit_code) def _find_available_port() -> bool: From 2bed0182a8bac22e815ddfb6e00be1f680dea927 Mon Sep 17 00:00:00 2001 From: q yao Date: Fri, 8 Nov 2024 11:51:35 +0800 Subject: [PATCH 064/122] Flatten cache and add flashattention (#2676) * add flash attention * add flash attention * fix * remove paged attention prefill * remove auto tuning * fix triton2 * fix ut * fix sliding window * fill last block --- lmdeploy/pytorch/backends/cuda/attention.py | 98 ++- lmdeploy/pytorch/backends/cuda/op_backend.py | 20 +- lmdeploy/pytorch/kernels/cuda/__init__.py | 4 + .../pytorch/kernels/cuda/flashattention.py | 427 +++++++++ .../pytorch/kernels/cuda/flatten_kv_cache.py | 323 +++++++ .../pytorch/kernels/cuda/pagedattention.py | 827 +++--------------- lmdeploy/pytorch/models/cogvlm.py | 14 - lmdeploy/pytorch/models/internlm2.py | 11 - lmdeploy/pytorch/models/llama.py | 16 - lmdeploy/pytorch/models/mixtral.py | 11 - lmdeploy/pytorch/models/utils/cudagraph.py | 3 +- tests/pytorch/kernel/test_flash_attention.py | 250 ++++++ tests/pytorch/kernel/test_flatten_kv_cache.py | 182 ++++ tests/pytorch/kernel/test_paged_attention.py | 114 +-- 14 files changed, 1460 insertions(+), 840 deletions(-) create mode 100644 lmdeploy/pytorch/kernels/cuda/flashattention.py create mode 100644 lmdeploy/pytorch/kernels/cuda/flatten_kv_cache.py create mode 100644 tests/pytorch/kernel/test_flash_attention.py create mode 100644 tests/pytorch/kernel/test_flatten_kv_cache.py diff --git a/lmdeploy/pytorch/backends/cuda/attention.py b/lmdeploy/pytorch/backends/cuda/attention.py index a0148c8782..d01d6fe9b4 100644 --- a/lmdeploy/pytorch/backends/cuda/attention.py +++ b/lmdeploy/pytorch/backends/cuda/attention.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
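+# The prefill path below flattens the paged KV blocks into contiguous
+# per-head tensors and runs a varlen flash-attention kernel, while decoding
+# keeps the paged-attention kernel. The flattened buffer is padded by one
+# extra block so the kernel never reads past the last token; with
+# illustrative numbers, kv_flatten_size = 130 and BLOCK_BS = 64 give
+#   out_size = _cdiv(130, 64) * 64 + 64 = 3 * 64 + 64 = 256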
+from dataclasses import dataclass +from typing import Literal + import torch from lmdeploy.pytorch.distributed import get_world_rank @@ -6,9 +9,23 @@ from ..attention import AttentionBuilder, AttentionImpl, AttentionMetadata +@dataclass class TritonAttentionMetadata(AttentionMetadata): """triton attention metadata.""" - pass + is_decoding: bool + block_offsets: torch.Tensor + q_start_loc: torch.Tensor = None + q_seqlens: torch.Tensor = None + kv_start_loc: torch.Tensor = None + kv_seqlens: torch.Tensor = None + fill_seqlens: torch.Tensor = None + quant_policy: Literal[0, 4, 8] = 0 + kv_flatten_size: int = None + + +def _cdiv(a, b): + """perform div up.""" + return (a + b - 1) // b class TritonAttentionImpl(AttentionImpl[TritonAttentionMetadata]): @@ -40,10 +57,14 @@ def __init__( from lmdeploy.pytorch.kernels.cuda import (alibi_paged_attention_fwd, fill_kv_cache, + flash_attention_fwd, + flatten_kv_cache, paged_attention_fwd) self.fill_kv_cache = fill_kv_cache self.paged_attention_fwd = paged_attention_fwd self.alibi_paged_attention_fwd = alibi_paged_attention_fwd + self.flatten_kv_cache = flatten_kv_cache + self.flash_attention_fwd = flash_attention_fwd # for alibi attention world_size, rank = get_world_rank() @@ -69,7 +90,9 @@ def forward( fill_q_start_loc = q_start_loc q_seqlens = attn_metadata.q_seqlens fill_seqlens = q_seqlens + kv_start_loc = attn_metadata.kv_start_loc kv_seqlens = attn_metadata.kv_seqlens + kv_flatten_size = attn_metadata.kv_flatten_size quant_policy = attn_metadata.quant_policy max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2)) fill_max_q_seqlen = max_q_seqlen @@ -95,31 +118,58 @@ def forward( quant_policy=quant_policy, ) - if inplace: - attn_output = query[..., :self.v_head_size] - else: - q_shape = query.shape - o_shape = q_shape[:-1] + (self.v_head_size, ) - attn_output = query.new_empty(o_shape) + q_shape = query.shape + o_shape = q_shape[:-1] + (self.v_head_size, ) + attn_output = query.new_empty(o_shape) + is_decoding = attn_metadata.is_decoding if not self.alibi: - self.paged_attention_fwd( - query, - k_cache, - v_cache, - attn_output, - block_offsets, - q_start_loc=q_start_loc, - q_seqlens=q_seqlens, - kv_seqlens=kv_seqlens, - max_seqlen=max_q_seqlen, - k_scales_zeros=k_scales_zeros, - v_scales_zeros=v_scales_zeros, - quant_policy=quant_policy, - window_size=self.sliding_window, - sm_scale=self.scale, - logit_softcapping=self.logit_softcapping, - ) + if is_decoding: + self.paged_attention_fwd( + query, + k_cache, + v_cache, + attn_output, + block_offsets, + kv_seqlens=kv_seqlens, + k_scales_zeros=k_scales_zeros, + v_scales_zeros=v_scales_zeros, + quant_policy=quant_policy, + window_size=self.sliding_window, + sm_scale=self.scale, + logit_softcapping=self.logit_softcapping, + ) + else: + BLOCK_BS = k_cache.size(1) + # pad one more block to avoid invalid kv visit + out_size = (_cdiv(kv_flatten_size, BLOCK_BS) * BLOCK_BS + + BLOCK_BS) + flatten_k, flatten_v = self.flatten_kv_cache( + k_cache, + v_cache, + kv_seqlens, + block_offsets, + start_loc=kv_start_loc, + out_size=out_size, + out_dtype=query.dtype, + k_scales_zeros=k_scales_zeros, + v_scales_zeros=v_scales_zeros, + quant_policy=quant_policy, + ) + self.flash_attention_fwd( + query, + flatten_k, + flatten_v, + attn_output, + q_start_loc=q_start_loc, + q_seqlens=q_seqlens, + kv_start_loc=kv_start_loc, + kv_seqlens=kv_seqlens, + max_seqlen=max_q_seqlen, + window_size=self.sliding_window, + sm_scale=self.scale, + logit_softcapping=self.logit_softcapping, + ) else: 
self.alibi_paged_attention_fwd( query, diff --git a/lmdeploy/pytorch/backends/cuda/op_backend.py b/lmdeploy/pytorch/backends/cuda/op_backend.py index c01b5f093a..3e7fc23728 100644 --- a/lmdeploy/pytorch/backends/cuda/op_backend.py +++ b/lmdeploy/pytorch/backends/cuda/op_backend.py @@ -104,12 +104,20 @@ def update_step_context(cls, step_context): attn_meta_cls = cls.get_attention_metadata_cls() q_seqlens = step_context.q_seqlens q_start_loc = q_seqlens.cumsum(0) - q_seqlens + kv_seqlens = step_context.kv_seqlens + kv_start_loc = None + kv_flatten_size = None + if not step_context.is_decoding: + kv_start_loc = kv_seqlens.cumsum(0) - kv_seqlens + kv_flatten_size = kv_seqlens.sum().item() attn_metadata = attn_meta_cls( step_context.is_decoding, step_context.block_offsets, q_start_loc=q_start_loc, q_seqlens=q_seqlens, - kv_seqlens=step_context.kv_seqlens, + kv_start_loc=kv_start_loc, + kv_seqlens=kv_seqlens, + kv_flatten_size=kv_flatten_size, quant_policy=step_context.kv_quant_policy, ) @@ -120,12 +128,20 @@ def update_step_context(cls, step_context): for idx, state in enumerate(step_context.cross_attention_states): if state is not None: fill_seqlens[idx] = state.shape[-2] + cross_kv_seqlens = step_context.cross_kv_seqlens + cross_kv_start_loc = None + cross_kv_flatten_size = None + if not step_context.is_decoding and cross_kv_seqlens is not None: + cross_kv_start_loc = cross_kv_seqlens.cumsum(0) - cross_kv_seqlens + cross_kv_flatten_size = cross_kv_seqlens.sum().item() cross_attn_metadata = attn_meta_cls( step_context.is_decoding, step_context.block_offsets, q_start_loc=q_start_loc, q_seqlens=q_seqlens, - kv_seqlens=step_context.cross_kv_seqlens, + kv_start_loc=cross_kv_start_loc, + kv_seqlens=cross_kv_seqlens, + kv_flatten_size=cross_kv_flatten_size, fill_seqlens=fill_seqlens, quant_policy=step_context.kv_quant_policy, ) diff --git a/lmdeploy/pytorch/kernels/cuda/__init__.py b/lmdeploy/pytorch/kernels/cuda/__init__.py index 23cf5d33ea..3790cf0f66 100644 --- a/lmdeploy/pytorch/kernels/cuda/__init__.py +++ b/lmdeploy/pytorch/kernels/cuda/__init__.py @@ -2,6 +2,8 @@ from .alibi_pagedattention import alibi_paged_attention_fwd from .apply_rotary_pos_emb import apply_rotary_pos_emb from .fill_kv_cache import fill_kv_cache +from .flashattention import flash_attention_fwd +from .flatten_kv_cache import flatten_kv_cache from .fused_moe import fused_moe from .fused_rotary_emb import fused_rotary_emb from .multinomial_sampling import multinomial_sampling @@ -24,4 +26,6 @@ 'per_channel_quant', 'per_token_quant_int8', 'rms_norm_dynamic_quant', + 'flash_attention_fwd', + 'flatten_kv_cache', ] diff --git a/lmdeploy/pytorch/kernels/cuda/flashattention.py b/lmdeploy/pytorch/kernels/cuda/flashattention.py new file mode 100644 index 0000000000..7521a3e2bb --- /dev/null +++ b/lmdeploy/pytorch/kernels/cuda/flashattention.py @@ -0,0 +1,427 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
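+# Usage sketch (tensor shapes are illustrative assumptions): q and the output
+# are packed as [total_q_tokens, num_heads, head_dim]; k/v follow `kv_layout`
+# ('hsd' means [num_kv_heads, total_kv_tokens, head_dim]), which matches the
+# buffers produced by flatten_kv_cache. The result is written into o in place:
+#
+#   o = q.new_empty(q.shape[:-1] + (v.size(-1),))
+#   flash_attention_fwd(q, k, v, o,
+#                       q_start_loc, q_seqlens,
+#                       kv_start_loc, kv_seqlens,
+#                       max_seqlen=int(q_seqlens.max()))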
+import math + +import torch +import triton +import triton.language as tl +from packaging import version +from torch import Tensor + +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') + +TRITON_VERSION = version.parse(triton.__version__) +VERSION_300 = version.parse('3.0.0') +assert TRITON_VERSION >= version.parse('2.2.0') + +# TODO: fast op might not work on non-nv device +if TRITON_VERSION >= VERSION_300: + tanh = tl.extra.cuda.libdevice.tanh + tl_log2 = tl.log2 + tl_exp2 = tl.exp2 +else: + tanh = tl.math.tanh + tl_log2 = tl.math.log2 + tl_exp2 = tl.math.exp2 + + +def _get_block_d(head_dim_k, head_dim_v): + """get block d.""" + BLOCK_DK = triton.next_power_of_2(head_dim_k) + BLOCK_DK1 = 0 + if BLOCK_DK != head_dim_k: + BLOCK_DK = BLOCK_DK // 2 + BLOCK_DK1 = max(16, triton.next_power_of_2(head_dim_k - BLOCK_DK)) + BLOCK_DV = triton.next_power_of_2(head_dim_v) + return BLOCK_DK, BLOCK_DK1, BLOCK_DV + + +@triton.jit +def softcapping(qk, logit_softcapping: tl.constexpr): + """soft capping.""" + if logit_softcapping > 0.0: + qk = qk / logit_softcapping + qk = tanh(qk) + qk = qk * logit_softcapping + return qk + + +@triton.jit +def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, + loop_start, loop_end, qk_scale, history_mask, + kv_min_loc, causal_mask: tl.constexpr, + window_size: tl.constexpr, + logit_softcapping: tl.constexpr, BLOCK_N: tl.constexpr, + BLOCK_DK1: tl.constexpr): + k_ptrs = tl.advance(k_ptrs, (0, loop_start)) + v_ptrs = tl.advance(v_ptrs, (loop_start, 0)) + if BLOCK_DK1: + k1_ptrs = tl.advance(k1_ptrs, (0, loop_start)) + + offs_n = tl.arange(0, BLOCK_N) + for start_n in range(loop_start, loop_end, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + + k = tl.load(k_ptrs) + qk = tl.dot(q, k) + + if BLOCK_DK1 != 0: + k1 = tl.load(k1_ptrs) + qk += tl.dot(q1, k1) + + if causal_mask: + qk *= qk_scale + qk = softcapping(qk, logit_softcapping) + qk_mask = (history_mask[:, None]) >= (start_n + offs_n[None, :]) + if window_size > 0: + qk_mask = qk_mask and ( + (start_n + offs_n[None, :]) >= kv_min_loc[:, None]) + qk = tl.where( + qk_mask, + qk, + float(-1e30), + ) + m_i_new = tl.maximum(m_i, tl.max(qk, 1)) + qk -= m_i_new[:, None] + elif window_size > 0: + qk *= qk_scale + qk = softcapping(qk, logit_softcapping) + qk_mask = ((start_n + offs_n[None, :]) >= kv_min_loc[:, None]) + qk = tl.where( + qk_mask, + qk, + float(-1e30), + ) + m_i_new = tl.maximum(m_i, tl.max(qk, 1)) + qk -= m_i_new[:, None] + elif logit_softcapping > 0: + qk *= qk_scale + qk = softcapping(qk, logit_softcapping) + m_i_new = tl.maximum(m_i, tl.max(qk, 1)) + qk -= m_i_new[:, None] + else: + m_i_new = tl.maximum(m_i, tl.max(qk, 1) * qk_scale) + qk = qk * qk_scale - m_i_new[:, None] + + # -- compute p, m_i and l_i + p = tl_exp2(qk) + alpha = tl_exp2(m_i - m_i_new) + l_i = alpha * l_i + tl.sum(p, 1) + # -- update output accumulator -- + # scale acc + acc = acc * alpha[:, None] + + # update acc + v = tl.load(v_ptrs) + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + m_i = m_i_new + + k_ptrs = tl.advance(k_ptrs, (0, BLOCK_N)) + v_ptrs = tl.advance(v_ptrs, (BLOCK_N, 0)) + if BLOCK_DK1: + k1_ptrs = tl.advance(k1_ptrs, (0, BLOCK_N)) + + return acc, l_i, m_i + + +# # FOR DEBUG, DON'T REMOVE +# import itertools +# configs = [ +# triton.Config({ +# 'BLOCK_M': BM, +# 'BLOCK_N': BN +# }, num_stages=s, num_warps=w) +# for BM, BN, s, w in itertools.product([64, 128], [32, 64], [3, 4], [4]) +# ] + + +# @triton.autotune(list(configs), +# key=['head_dim_k', 'head_dim_v'], +# 
warmup=10, +# rep=25) +@triton.jit +def _flash_prefill_fwd_kernel( + q_ptr, + k_ptr, + v_ptr, + o_ptr, + q_start_loc_ptr, + q_seqlens_ptr, + kv_start_loc_ptr, + kv_seqlens_ptr, + sm_scale, + stride_qs: tl.constexpr, + stride_qh: tl.constexpr, + stride_qd: tl.constexpr, + stride_ks: tl.constexpr, + stride_kh, + stride_kd: tl.constexpr, + stride_vs: tl.constexpr, + stride_vh, + stride_vd: tl.constexpr, + stride_os: tl.constexpr, + stride_oh: tl.constexpr, + stride_od: tl.constexpr, + kv_group_num, + head_dim_k, + head_dim_v, + window_size: tl.constexpr, + logit_softcapping: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_DK: tl.constexpr, + BLOCK_DK1: tl.constexpr, + BLOCK_DV: tl.constexpr, +): + """flash attention kernel.""" + start_m = tl.program_id(0) + head_id = tl.program_id(1) + batch_id = tl.program_id(2) + + q_seqlen = tl.load(q_seqlens_ptr + batch_id) + + if BLOCK_M * start_m >= q_seqlen: + return + + kv_head_id = head_id // kv_group_num + q_seqlen = q_seqlen.to(tl.int32) + kv_seqlen = tl.load(kv_seqlens_ptr + batch_id).to(tl.int32) + q_start_loc = tl.load(q_start_loc_ptr + batch_id).to(tl.int32) + kv_start_loc = tl.load(kv_start_loc_ptr + batch_id).to(tl.int32) + + history_len = kv_seqlen - q_seqlen + + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + + loop_start = 0 + kv_min_loc = tl.zeros([BLOCK_M], dtype=tl.int32) + if window_size > 0: + start_block_id = tl.maximum( + history_len + start_m * BLOCK_M - window_size, 0) // BLOCK_N + kv_min_loc = tl.maximum(history_len + offs_m - window_size, 0) + loop_start = start_block_id * BLOCK_N + + offs_dk = tl.arange(0, BLOCK_DK) + mask_dk = offs_dk < head_dim_k + offs_dk = tl.multiple_of(tl.max_contiguous(offs_dk % head_dim_k, BLOCK_DK), + BLOCK_DK) + off_q = ((q_start_loc + offs_m[:, None]) * stride_qs + + head_id * stride_qh + offs_dk[None, :] * stride_qd) + q_ptrs = q_ptr + off_q + q = tl.load(q_ptrs, mask=(offs_m[:, None] < q_seqlen and mask_dk[None, :])) + + k_ptrs = tl.make_block_ptr( + base=k_ptr + kv_start_loc * stride_ks + kv_head_id * stride_kh, + shape=(head_dim_k, kv_seqlen), + strides=(stride_kd, stride_ks), + offsets=(0, 0), + block_shape=(BLOCK_DK, BLOCK_N), + order=(0, 1), + ) + v_ptrs = tl.make_block_ptr( + base=v_ptr + kv_start_loc * stride_vs + kv_head_id * stride_vh, + shape=(kv_seqlen, head_dim_v), + strides=(stride_vs, stride_vd), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DV), + order=(1, 0), + ) + + if BLOCK_DK1 != 0: + offs_dk1 = BLOCK_DK + tl.arange(0, BLOCK_DK1) + mask_dk1 = offs_dk1 < head_dim_k + offs_dk1 = tl.multiple_of( + tl.max_contiguous(offs_dk1 % head_dim_k, BLOCK_DK1), BLOCK_DK1) + offs_q1 = ((q_start_loc + offs_m[:, None]) * stride_qs + + head_id * stride_qh + offs_dk1[None, :] * stride_qd) + q1_ptrs = q_ptr + offs_q1 + q1 = tl.load(q1_ptrs, + mask=(offs_m[:, None] < q_seqlen and mask_dk1[None, :])) + k1_ptrs = tl.make_block_ptr( + base=k_ptr + kv_start_loc * stride_ks + kv_head_id * stride_kh, + shape=(head_dim_k, kv_seqlen), + strides=(stride_kd, stride_ks), + offsets=(BLOCK_DK, 0), + block_shape=(BLOCK_DK1, BLOCK_N), + order=(0, 1), + ) + else: + q1 = q + k1_ptrs = k_ptrs + + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0 + acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32) + + qk_scale = sm_scale * tl_log2(math.e) + history_mask = history_len + start_m * BLOCK_M + tl.arange(0, BLOCK_M) + + loop_end = (history_len + start_m * BLOCK_M) // BLOCK_N * BLOCK_N + acc, l_i, m_i = _prefill_fwd_inner(acc, + 
l_i, + m_i, + q, + k_ptrs, + v_ptrs, + q1, + k1_ptrs, + loop_start, + loop_end, + qk_scale, + history_mask, + kv_min_loc, + causal_mask=False, + window_size=window_size, + logit_softcapping=logit_softcapping, + BLOCK_N=BLOCK_N, + BLOCK_DK1=BLOCK_DK1) + + loop_start = loop_end + loop_end = tl.minimum(kv_seqlen, loop_start + BLOCK_M + BLOCK_N) + acc, l_i, m_i = _prefill_fwd_inner(acc, + l_i, + m_i, + q, + k_ptrs, + v_ptrs, + q1, + k1_ptrs, + loop_start, + loop_end, + qk_scale, + history_mask, + kv_min_loc, + causal_mask=True, + window_size=window_size, + logit_softcapping=logit_softcapping, + BLOCK_N=BLOCK_N, + BLOCK_DK1=BLOCK_DK1) + # epilogue + m_i += tl.math.log2(l_i) + acc = acc / l_i[:, None] + + # initialize pointers to output + offs_dv = tl.arange(0, BLOCK_DV) + mask_dv = offs_dv < head_dim_v + off_o = ((q_start_loc + offs_m[:, None]) * stride_os + + head_id * stride_oh + offs_dv[None, :] * stride_od) + out_ptrs = o_ptr + off_o + tl.store(out_ptrs, + acc, + mask=(offs_m[:, None] < q_seqlen) & mask_dv[None, :]) + + +_nv_cap = None + + +def flash_attention_fwd( + q_states: Tensor, + k_states: Tensor, + v_states: Tensor, + o_states: Tensor, + q_start_loc: Tensor, + q_seqlens: Tensor, + kv_start_loc: Tensor, + kv_seqlens: Tensor, + max_seqlen: int = None, + window_size: int = None, + sm_scale: float = None, + logit_softcapping: float = None, + kv_layout: str = 'hsd', +): + """varlen flash Attention forward. + + Support sliding window, softcapping. Note that this kernel will not perform + bound check for k,v. + """ + + global _nv_cap + if _nv_cap is None: + _nv_cap = torch.cuda.get_device_capability() + + def grid(args): + return (triton.cdiv(max_seqlen, args['BLOCK_M']), num_heads, batch) + + if kv_layout == 'shd': + s_dim, h_dim, d_dim = (0, 1, 2) + elif kv_layout == 'hsd': + s_dim, h_dim, d_dim = (1, 0, 2) + else: + raise RuntimeError('Unsupported layout.') + + if max_seqlen is None: + max_seqlen = q_states.size(0) + + if window_size is None: + window_size = -1 + + if logit_softcapping is None: + logit_softcapping = -1.0 + + head_dim_q = q_states.size(-1) + head_dim_k = k_states.size(d_dim) + head_dim_v = v_states.size(d_dim) + assert head_dim_q == head_dim_k and head_dim_v == o_states.size(-1) + + if sm_scale is None: + sm_scale = 1.0 / (head_dim_q**0.5) + + batch, num_heads = q_seqlens.size(0), q_states.size(-2) + num_kv_heads = k_states.size(h_dim) + kv_group_num = num_heads // num_kv_heads + + BLOCK_DK, BLOCK_DK1, BLOCK_DV = _get_block_d(head_dim_k, head_dim_v) + + BLOCK_N = 32 + if _nv_cap[0] < 8: + BLOCK_M = max(16, 8192 // BLOCK_DK) + else: + BLOCK_M = max(16, 16384 // BLOCK_DK) + num_warps = 4 + num_stages = min(4, max(2, 1024 // BLOCK_DK)) + if BLOCK_DK >= 512: + num_stages = 2 + elif BLOCK_DK >= 256: + num_stages = 3 + else: + num_stages = 4 + _flash_prefill_fwd_kernel[grid]( + q_states, + k_states, + v_states, + o_states, + q_start_loc, + q_seqlens, + kv_start_loc, + kv_seqlens, + sm_scale=sm_scale, + stride_qs=q_states.stride(0), + stride_qh=q_states.stride(1), + stride_qd=q_states.stride(2), + stride_ks=k_states.stride(s_dim), + stride_kh=k_states.stride(h_dim), + stride_kd=k_states.stride(d_dim), + stride_vs=v_states.stride(s_dim), + stride_vh=v_states.stride(h_dim), + stride_vd=v_states.stride(d_dim), + stride_os=o_states.stride(0), + stride_oh=o_states.stride(1), + stride_od=o_states.stride(2), + kv_group_num=kv_group_num, + head_dim_k=head_dim_k, + head_dim_v=head_dim_v, + window_size=window_size, + logit_softcapping=logit_softcapping, + BLOCK_DK=BLOCK_DK, + 
BLOCK_DK1=BLOCK_DK1, + BLOCK_DV=BLOCK_DV, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_warps=num_warps, + num_stages=num_stages, + ) + + return o_states diff --git a/lmdeploy/pytorch/kernels/cuda/flatten_kv_cache.py b/lmdeploy/pytorch/kernels/cuda/flatten_kv_cache.py new file mode 100644 index 0000000000..90b135743e --- /dev/null +++ b/lmdeploy/pytorch/kernels/cuda/flatten_kv_cache.py @@ -0,0 +1,323 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Literal + +import torch +import triton +import triton.language as tl +from torch import Tensor + + +@triton.jit +def _flatten_kv_cache( + kc_ptr, + vc_ptr, + ko_ptr, + vo_ptr, + start_loc_ptr, + seqlens_ptr, + block_offsets_ptr, + stride_kcb: tl.constexpr, + stride_kcs: tl.constexpr, + stride_kch: tl.constexpr, + stride_kcd: tl.constexpr, + stride_vcb: tl.constexpr, + stride_vcs: tl.constexpr, + stride_vch: tl.constexpr, + stride_vcd: tl.constexpr, + stride_koh, + stride_kos: tl.constexpr, + stride_kod: tl.constexpr, + stride_voh, + stride_vos: tl.constexpr, + stride_vod: tl.constexpr, + stride_boff, + OUT_SIZE: tl.constexpr, + HEAD_DIM_K: tl.constexpr, + HEAD_DIM_V: tl.constexpr, + BLOCK_BS: tl.constexpr, + BLOCK_DK: tl.constexpr, + BLOCK_DV: tl.constexpr, +): + """flatten kv cache.""" + page_id = tl.program_id(0) + batch_id = tl.program_id(1) + head_id = tl.program_id(2) + + num_batches = tl.num_programs(1) + + seqlen = tl.load(seqlens_ptr + batch_id) + start_loc = tl.load(start_loc_ptr + batch_id) + # fill last block to prevent attention nan + if batch_id == num_batches - 1: + seqlen = OUT_SIZE - start_loc + if page_id * BLOCK_BS >= seqlen: + return + + start_loc = tl.load(start_loc_ptr + batch_id) + b_off = tl.load(block_offsets_ptr + batch_id * stride_boff + page_id) + + offs_bs = tl.arange(0, BLOCK_BS) + offs_dk = tl.arange(0, BLOCK_DK) % HEAD_DIM_K + offs_dv = tl.arange(0, BLOCK_DV) % HEAD_DIM_V + offs_obs = page_id * BLOCK_BS + tl.arange(0, BLOCK_BS) + mask_bs = offs_obs < seqlen + mask_dk = tl.arange(0, BLOCK_DK) < HEAD_DIM_K + mask_dv = tl.arange(0, BLOCK_DV) < HEAD_DIM_V + + kc_ptrs = (kc_ptr + b_off * stride_kcb + offs_bs[:, None] * stride_kcs + + head_id * stride_kch + offs_dk[None, :] * stride_kcd) + vc_ptrs = (vc_ptr + b_off * stride_vcb + offs_bs[:, None] * stride_vcs + + head_id * stride_vch + offs_dv[None, :] * stride_vcd) + ko_ptrs = (ko_ptr + head_id * stride_koh + + (start_loc + offs_obs[:, None]) * stride_kos + + offs_dk[None, :] * stride_kod) + vo_ptrs = (vo_ptr + head_id * stride_voh + + (start_loc + offs_obs[:, None]) * stride_vos + + offs_dv[None, :] * stride_vod) + + kc = tl.load(kc_ptrs) + tl.store(ko_ptrs, kc, mask=mask_bs[:, None] and mask_dk[None, :]) + vc = tl.load(vc_ptrs) + tl.store(vo_ptrs, vc, mask=mask_bs[:, None] and mask_dv[None, :]) + + +@triton.jit +def _dequant_int4(val, HEAD_DIM: tl.constexpr, BLOCK: tl.constexpr): + """dequant int4.""" + offs = tl.arange(0, BLOCK) // (HEAD_DIM // 2) + shift = (offs % 2) * 4 + return (val >> shift) & 0xf + + +@triton.jit +def _flatten_kv_cache_quant( + kc_ptr, + vc_ptr, + ko_ptr, + vo_ptr, + ksz_ptr, + vsz_ptr, + start_loc_ptr, + seqlens_ptr, + block_offsets_ptr, + stride_kcb: tl.constexpr, + stride_kcs: tl.constexpr, + stride_kch: tl.constexpr, + stride_kcd: tl.constexpr, + stride_vcb: tl.constexpr, + stride_vcs: tl.constexpr, + stride_vch: tl.constexpr, + stride_vcd: tl.constexpr, + stride_kszb: tl.constexpr, + stride_kszs: tl.constexpr, + stride_kszh: tl.constexpr, + stride_kszd: tl.constexpr, + stride_vszb: tl.constexpr, + stride_vszs: 
tl.constexpr, + stride_vszh: tl.constexpr, + stride_vszd: tl.constexpr, + stride_koh, + stride_kos: tl.constexpr, + stride_kod: tl.constexpr, + stride_voh, + stride_vos: tl.constexpr, + stride_vod: tl.constexpr, + stride_boff, + quant_policy: tl.constexpr, + OUT_SIZE: tl.constexpr, + HEAD_DIM_K: tl.constexpr, + HEAD_DIM_V: tl.constexpr, + BLOCK_BS: tl.constexpr, + BLOCK_DK: tl.constexpr, + BLOCK_DV: tl.constexpr, +): + """flatten kv cache.""" + page_id = tl.program_id(0) + batch_id = tl.program_id(1) + head_id = tl.program_id(2) + + num_batches = tl.num_programs(1) + + seqlen = tl.load(seqlens_ptr + batch_id) + start_loc = tl.load(start_loc_ptr + batch_id) + if batch_id == num_batches - 1: + seqlen = OUT_SIZE - start_loc + if page_id * BLOCK_BS >= seqlen: + return + + b_off = tl.load(block_offsets_ptr + batch_id * stride_boff + page_id) + + offs_bs = tl.arange(0, BLOCK_BS) + if quant_policy == 4: + HALF_HDK: tl.constexpr = HEAD_DIM_K // 2 + HALF_HDV: tl.constexpr = HEAD_DIM_V // 2 + offs_dk = tl.arange(0, BLOCK_DK) % HALF_HDK + offs_dv = tl.arange(0, BLOCK_DV) % HALF_HDV + else: + offs_dk = tl.arange(0, BLOCK_DK) % HEAD_DIM_K + offs_dv = tl.arange(0, BLOCK_DV) % HEAD_DIM_V + offs_obs = page_id * BLOCK_BS + tl.arange(0, BLOCK_BS) + mask_bs = offs_obs < seqlen + + offs_dok = tl.arange(0, BLOCK_DK) + offs_dov = tl.arange(0, BLOCK_DV) + mask_dok = offs_dok < HEAD_DIM_K + mask_dov = offs_dov < HEAD_DIM_V + + kc_ptrs = (kc_ptr + b_off * stride_kcb + offs_bs[:, None] * stride_kcs + + head_id * stride_kch + offs_dk[None, :] * stride_kcd) + vc_ptrs = (vc_ptr + b_off * stride_vcb + offs_bs[:, None] * stride_vcs + + head_id * stride_vch + offs_dv[None, :] * stride_vcd) + ksz_ptrs = (ksz_ptr + b_off * stride_kszb + offs_bs * stride_kszs + + head_id * stride_kszh) + vsz_ptrs = (vsz_ptr + b_off * stride_vszb + offs_bs * stride_vszs + + head_id * stride_vszh) + ko_ptrs = (ko_ptr + head_id * stride_koh + + (start_loc + offs_obs[:, None]) * stride_kos + + offs_dok[None, :] * stride_kod) + vo_ptrs = (vo_ptr + head_id * stride_voh + + (start_loc + offs_obs[:, None]) * stride_vos + + offs_dov[None, :] * stride_vod) + + kc = tl.load(kc_ptrs) + if quant_policy == 4: + kc = _dequant_int4(kc, HEAD_DIM_K, BLOCK_DK) + ks = tl.load(ksz_ptrs) + kz = tl.load(ksz_ptrs + stride_kszd) + ksz = ks * kz + kq = (kc * ks[:, None] - ksz[:, None]).to(ko_ptr.dtype.element_ty) + tl.store(ko_ptrs, kq, mask=mask_bs[:, None] and mask_dok[None, :]) + vc = tl.load(vc_ptrs) + if quant_policy == 4: + vc = _dequant_int4(vc, HEAD_DIM_V, BLOCK_DV) + vs = tl.load(vsz_ptrs) + vz = tl.load(vsz_ptrs + stride_vszd) + vsz = vs * vz + vq = (vc * vs[:, None] - vsz[:, None]).to(vo_ptr.dtype.element_ty) + tl.store(vo_ptrs, vq, mask=mask_bs[:, None] and mask_dov[None, :]) + + +def flatten_kv_cache(k_caches: Tensor, + v_caches: Tensor, + seqlens: Tensor, + block_offsets: Tensor, + start_loc: Tensor = None, + out_size: int = None, + out_dtype: torch.dtype = None, + k_scales_zeros: Tensor = None, + v_scales_zeros: Tensor = None, + quant_policy: Literal[0, 4, 8] = 0, + kv_layout: str = 'bshd'): + """recovery paged kv cache to normal kv cache.""" + if kv_layout == 'bshd': + b_dim, s_dim, h_dim, d_dim = (0, 1, 2, 3) + elif kv_layout == 'bhsd': + b_dim, s_dim, h_dim, d_dim = (0, 2, 1, 3) + else: + raise RuntimeError('Unsupported layout.') + + if out_dtype is None: + out_dtype = k_caches.dtype + + if out_size is None or out_size <= 0: + out_size = k_caches.size(b_dim) * k_caches.size(s_dim) + + if start_loc is None: + start_loc = seqlens.cumsum(0) - seqlens 
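+        # e.g. (illustrative) seqlens = [3, 5, 2] -> cumsum = [3, 8, 10] and
+        # start_loc = [0, 3, 8]: the offset of each sequence's first token
+        # in the flattened output buffers.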
+ + batch_size, num_blocks = block_offsets.size() + num_heads = k_caches.size(h_dim) + k_head_dim = k_caches.size(d_dim) + v_head_dim = v_caches.size(d_dim) + if quant_policy == 4: + k_head_dim *= 2 + v_head_dim *= 2 + BLOCK_DK = triton.next_power_of_2(k_head_dim) + BLOCK_DV = triton.next_power_of_2(v_head_dim) + BLOCK_BS = k_caches.size(s_dim) + + k_states = k_caches.new_empty(num_heads, + out_size, + k_head_dim, + dtype=out_dtype) + v_states = v_caches.new_empty(num_heads, + out_size, + v_head_dim, + dtype=out_dtype) + + grid = (num_blocks, batch_size, num_heads) + if quant_policy == 0: + _flatten_kv_cache[grid]( + k_caches, + v_caches, + k_states, + v_states, + start_loc, + seqlens, + block_offsets, + stride_kcb=k_caches.stride(b_dim), + stride_kcs=k_caches.stride(s_dim), + stride_kch=k_caches.stride(h_dim), + stride_kcd=k_caches.stride(d_dim), + stride_vcb=v_caches.stride(b_dim), + stride_vcs=v_caches.stride(s_dim), + stride_vch=v_caches.stride(h_dim), + stride_vcd=v_caches.stride(d_dim), + stride_koh=k_states.stride(0), + stride_kos=k_states.stride(1), + stride_kod=k_states.stride(2), + stride_voh=v_states.stride(0), + stride_vos=v_states.stride(1), + stride_vod=v_states.stride(2), + stride_boff=block_offsets.stride(0), + OUT_SIZE=out_size, + HEAD_DIM_K=k_head_dim, + HEAD_DIM_V=v_head_dim, + BLOCK_BS=BLOCK_BS, + BLOCK_DK=BLOCK_DK, + BLOCK_DV=BLOCK_DV, + ) + else: + _flatten_kv_cache_quant[grid]( + k_caches, + v_caches, + k_states, + v_states, + k_scales_zeros, + v_scales_zeros, + start_loc, + seqlens, + block_offsets, + stride_kcb=k_caches.stride(b_dim), + stride_kcs=k_caches.stride(s_dim), + stride_kch=k_caches.stride(h_dim), + stride_kcd=k_caches.stride(d_dim), + stride_vcb=v_caches.stride(b_dim), + stride_vcs=v_caches.stride(s_dim), + stride_vch=v_caches.stride(h_dim), + stride_vcd=v_caches.stride(d_dim), + stride_kszb=k_scales_zeros.stride(b_dim), + stride_kszs=k_scales_zeros.stride(s_dim), + stride_kszh=k_scales_zeros.stride(h_dim), + stride_kszd=k_scales_zeros.stride(d_dim), + stride_vszb=v_scales_zeros.stride(b_dim), + stride_vszs=v_scales_zeros.stride(s_dim), + stride_vszh=v_scales_zeros.stride(h_dim), + stride_vszd=v_scales_zeros.stride(d_dim), + stride_koh=k_states.stride(0), + stride_kos=k_states.stride(1), + stride_kod=k_states.stride(2), + stride_voh=v_states.stride(0), + stride_vos=v_states.stride(1), + stride_vod=v_states.stride(2), + stride_boff=block_offsets.stride(0), + quant_policy=quant_policy, + OUT_SIZE=out_size, + HEAD_DIM_K=k_head_dim, + HEAD_DIM_V=v_head_dim, + BLOCK_BS=BLOCK_BS, + BLOCK_DK=BLOCK_DK, + BLOCK_DV=BLOCK_DV, + ) + + return k_states, v_states diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index d125eabc85..bbd6d3cf78 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
# modify from: https://github.com/ModelTC/lightllm +import math from typing import Literal import torch @@ -22,12 +23,14 @@ # TODO: fast op might not work on non-nv device if TRITON_VERSION >= VERSION_300: tanh = tl.extra.cuda.libdevice.tanh - fast_expf = tl.extra.cuda.libdevice.fast_expf fast_dividef = tl.extra.cuda.libdevice.fast_dividef + tl_log2 = tl.log2 + tl_exp2 = tl.exp2 else: tanh = tl.math.tanh - fast_expf = tl.math.fast_expf fast_dividef = tl.math.fast_dividef + tl_log2 = tl.math.log2 + tl_exp2 = tl.math.exp2 @triton.autotune(configs=[ @@ -202,7 +205,7 @@ def _fwd_grouped_split_kernel( qk += tl.dot(q, k) if BLOCK_DMODEL1 != 0: qk += tl.dot(q1, k1) - qk *= sm_scale + qk *= sm_scale * tl_log2(math.e) if logit_softcapping > 0.0: qk = qk / logit_softcapping qk = tanh(qk) @@ -220,8 +223,8 @@ def _fwd_grouped_split_kernel( # -- compute p, m_i and l_i m_i_new = tl.maximum(m_i, tl.max(qk, 1)) - p = fast_expf(qk - m_i_new[:, None]) - alpha = fast_expf(m_i - m_i_new) + p = tl_exp2(qk - m_i_new[:, None]) + alpha = tl_exp2(m_i - m_i_new) l_i_new = alpha * l_i + tl.sum(p, 1) # -- update output accumulator -- @@ -488,7 +491,7 @@ def _fwd_grouped_split_quant_kernel( qk += tl.dot(q, k) if BLOCK_DMODEL1 != 0: qk += tl.dot(q1, k1) - qk *= sm_scale + qk *= sm_scale * tl_log2(math.e) if logit_softcapping > 0.0: qk = qk / logit_softcapping qk = tanh(qk) @@ -506,8 +509,8 @@ def _fwd_grouped_split_quant_kernel( # -- compute p, m_i and l_i m_i_new = tl.maximum(m_i, tl.max(qk, 1)) - p = fast_expf(qk - m_i_new[:, None]) - alpha = fast_expf(m_i - m_i_new) + p = tl_exp2(qk - m_i_new[:, None]) + alpha = tl_exp2(m_i - m_i_new) l_i_new = alpha * l_i + tl.sum(p, 1) # -- update output accumulator -- @@ -590,7 +593,7 @@ def _reduce_split_kernel( other=0.0) m_max = tl.max(m_k, 0) - alpha = fast_expf(m_k - m_max) + alpha = tl_exp2(m_k - m_max) acc_k = acc_k * alpha[:, None] l_k = l_k * alpha @@ -628,460 +631,13 @@ def convert_pv(p, v): _nv_cap = None -# TODO: how to support inplace autotune? 
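# NOTE on the softmax hunks above: folding log2(e) into sm_scale and switching
# from fast_expf to exp2 relies on the identity exp(x) == 2 ** (x * log2(e)),
# so the attention weights are mathematically unchanged, while exp2/log2
# typically lower to cheaper hardware instructions than expf on NVIDIA GPUs.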
-# @triton.autotune(configs=[ -# triton.Config({}, num_stages=1, num_warps=16), -# triton.Config({}, num_stages=1, num_warps=8), -# triton.Config({}, num_stages=1, num_warps=4), -# ], -# key=['BLOCK_M', 'BLOCK_N', 'BLOCK_DMODEL', 'BLOCK_DV']) -@triton.jit -def _fwd_kernel( - Q, - K, - V, - sm_scale, - Q_start_loc, - Q_seqlens, - KV_seqlens, - Block_offsets, - Out, - stride_qbs: tl.constexpr, - stride_qh: tl.constexpr, - stride_qd: tl.constexpr, - stride_kp: tl.constexpr, - stride_kbs: tl.constexpr, - stride_kh: tl.constexpr, - stride_kd: tl.constexpr, - stride_vp: tl.constexpr, - stride_vbs: tl.constexpr, - stride_vh: tl.constexpr, - stride_vd: tl.constexpr, - stride_obs: tl.constexpr, - stride_oh: tl.constexpr, - stride_od: tl.constexpr, - stride_boffb, - kv_group_num, - window_size: tl.constexpr, - head_size: tl.constexpr, - head_size_v: tl.constexpr, - logit_softcapping: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_DV: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_DMODEL1: tl.constexpr, -): - """paged attention kernel.""" - cur_batch = tl.program_id(2) - cur_kv_head = tl.program_id(1) - start_m = tl.program_id(0) - - q_seqlen = tl.load(Q_seqlens + cur_batch) - kv_seqlen = tl.load(KV_seqlens + cur_batch) - q_start_loc = tl.load(Q_start_loc + cur_batch) - history_len = kv_seqlen - q_seqlen - - block_start_loc = BLOCK_M * start_m - if block_start_loc >= q_seqlen * kv_group_num: - return - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - offs_dv = tl.arange(0, BLOCK_DV) - mask_d = offs_d < head_size - offs_d = offs_d % head_size - mask_dv = offs_dv < head_size_v - offs_dv = offs_dv % head_size_v - offs_mh = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_m = offs_mh // kv_group_num - cur_head = offs_mh % kv_group_num + cur_kv_head * kv_group_num - off_q = ((q_start_loc + offs_m[:, None]) * stride_qbs + - cur_head[:, None] * stride_qh + offs_d[None, :] * stride_qd) - off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + - offs_n[None, :] * stride_kbs) - off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + - offs_n[:, None] * stride_vbs) - - q = tl.load( - Q + off_q, - mask=(offs_m[:, None] < q_seqlen) & mask_d[None, :], - other=0.0, - eviction_policy='evict_first', - ) - - k_ptrs = K + off_k - v_ptrs = V + off_v - - if BLOCK_DMODEL1 != 0: - offs_d1 = BLOCK_DMODEL + tl.arange(0, BLOCK_DMODEL1) - mask_d1 = offs_d1 < head_size - offs_d1 = offs_d1 % head_size - off_q1 = ((q_start_loc + offs_m[:, None]) * stride_qbs + - cur_head[:, None] * stride_qh + offs_d1[None, :] * stride_qd) - q1 = tl.load(Q + off_q1, mask=(offs_m[:, None] < q_seqlen) & mask_d1) - off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + - offs_n[None, :] * stride_kbs) - k1_ptrs = K + off_k1 - - block_offset_ptrs = Block_offsets + cur_batch * stride_boffb - - # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32) - - kv_start_loc = 0 - if window_size > 0: - start_block_id = tl.maximum(history_len - window_size, 0) // BLOCK_N - kv_min_loc = tl.maximum(history_len + offs_m - window_size, 0) - kv_start_loc = start_block_id * BLOCK_N - block_offset_ptrs += start_block_id - - loop_start = kv_start_loc - loop_end = history_len // BLOCK_N * BLOCK_N - for start_n in range(loop_start, loop_end, BLOCK_N): - b_offset = tl.load(block_offset_ptrs) - block_offset_ptrs += 1 - - # -- 
compute qk ---- - k = tl.load(k_ptrs + b_offset * stride_kp) - if BLOCK_DMODEL1 != 0: - k1 = tl.load(k1_ptrs + b_offset * stride_kp) - - v = tl.load(v_ptrs + b_offset * stride_vp) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - if BLOCK_DMODEL1 != 0: - qk += tl.dot(q1, k1) - qk *= sm_scale - if logit_softcapping > 0.0: - qk = qk / logit_softcapping - qk = tanh(qk) - qk = qk * logit_softcapping - # NOTE: inf - inf = nan, and nan will leads to error - if window_size > 0: - qk_mask = ((start_n + offs_n[None, :]) >= kv_min_loc[:, None]) - qk = tl.where( - qk_mask, - qk, - float(-1e30), - ) - - # -- compute p, m_i and l_i - m_i_new = tl.maximum(m_i, tl.max(qk, 1)) - p = fast_expf(qk - m_i_new[:, None]) - alpha = fast_expf(m_i - m_i_new) - l_i_new = alpha * l_i + tl.sum(p, 1) - # -- update output accumulator -- - # scale acc - acc = acc * alpha[:, None] - - # update acc - p, v = _convert_pv(p, v) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - loop_start = loop_end - loop_end = kv_seqlen - for start_n in range(loop_start, loop_end, BLOCK_N): - b_offset = tl.load(block_offset_ptrs) - block_offset_ptrs += 1 - - # -- compute qk ---- - k = tl.load(k_ptrs + b_offset * stride_kp) - if BLOCK_DMODEL1 != 0: - k1 = tl.load(k1_ptrs + b_offset * stride_kp) - - v = tl.load(v_ptrs + b_offset * stride_vp) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - if BLOCK_DMODEL1 != 0: - qk += tl.dot(q1, k1) - qk *= sm_scale - if logit_softcapping > 0.0: - qk = qk / logit_softcapping - qk = tanh(qk) - qk = qk * logit_softcapping - # NOTE: inf - inf = nan, and nan will leads to error - qk_mask = (history_len + offs_m[:, None]) >= (start_n + - offs_n[None, :]) - if window_size > 0: - qk_mask = qk_mask and ( - (start_n + offs_n[None, :]) >= kv_min_loc[:, None]) - qk = tl.where( - qk_mask, - qk, - float(-1e30), - ) - - # -- compute p, m_i and l_i - m_i_new = tl.maximum(m_i, tl.max(qk, 1)) - p = fast_expf(qk - m_i_new[:, None]) - alpha = fast_expf(m_i - m_i_new) - l_i_new = alpha * l_i + tl.sum(p, 1) - # -- update output accumulator -- - # scale acc - acc = acc * alpha[:, None] - - # update acc - p, v = _convert_pv(p, v) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - acc = fast_dividef(acc, l_i[:, None]) - # initialize pointers to output - off_o = ((q_start_loc + offs_m[:, None]) * stride_obs + - cur_head[:, None] * stride_oh + offs_dv[None, :] * stride_od) - out_ptrs = Out + off_o - tl.store(out_ptrs, - acc, - mask=(offs_m[:, None] < q_seqlen) & mask_dv[None, :]) - - -# TODO: how to support inplace autotune? 
-# @triton.autotune(configs=[ -# triton.Config({}, num_stages=1, num_warps=16), -# triton.Config({}, num_stages=1, num_warps=8), -# triton.Config({}, num_stages=1, num_warps=4), -# ], -# key=['BLOCK_M', 'BLOCK_N', 'BLOCK_DMODEL', 'BLOCK_DV']) -@wrap_jit_func -@triton.jit -def _fwd_kernel_quant( - Q, - K, - V, - KScalesZeros, - VScalesZeros, - sm_scale, - Q_start_loc, - Q_seqlens, - KV_seqlens, - Block_offsets, - Out, - stride_qbs: tl.constexpr, - stride_qh: tl.constexpr, - stride_qd: tl.constexpr, - stride_kp: tl.constexpr, - stride_kbs: tl.constexpr, - stride_kh: tl.constexpr, - stride_kd: tl.constexpr, - stride_vp: tl.constexpr, - stride_vbs: tl.constexpr, - stride_vh: tl.constexpr, - stride_vd: tl.constexpr, - stride_kszp: tl.constexpr, - stride_kszbs: tl.constexpr, - stride_kszh: tl.constexpr, - stride_kszd: tl.constexpr, - stride_vszp: tl.constexpr, - stride_vszbs: tl.constexpr, - stride_vszh: tl.constexpr, - stride_vszd: tl.constexpr, - quant_policy: tl.constexpr, - stride_obs: tl.constexpr, - stride_oh: tl.constexpr, - stride_od: tl.constexpr, - stride_boffb, - kv_group_num, - window_size: tl.constexpr, - head_size: tl.constexpr, - head_size_v: tl.constexpr, - logit_softcapping: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_DV: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_DMODEL1: tl.constexpr, -): - """paged attention kernel with dequant fused. - - Args: - stride_xp: stride of page num dim - stride_xbs: stride of block size dim - stride_h: stride of head num dim - stride_d: stride of head size dim - """ - cur_batch = tl.program_id(2) - cur_kv_head = tl.program_id(1) - start_m = tl.program_id(0) - - q_seqlen = tl.load(Q_seqlens + cur_batch) - kv_seqlen = tl.load(KV_seqlens + cur_batch) - q_start_loc = tl.load(Q_start_loc + cur_batch) - history_len = kv_seqlen - q_seqlen - - block_start_loc = BLOCK_M * start_m - if block_start_loc >= q_seqlen * kv_group_num: - return - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - offs_dv = tl.arange(0, BLOCK_DV) - offs_dsz = tl.arange(0, 1) - mask_d = offs_d < head_size - offs_d = offs_d % head_size - mask_dv = offs_dv < head_size_v - offs_dv = offs_dv % head_size_v - offs_mh = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_m = offs_mh // kv_group_num - cur_head = offs_mh % kv_group_num + cur_kv_head * kv_group_num - off_q = ((q_start_loc + offs_m[:, None]) * stride_qbs + - cur_head[:, None] * stride_qh + offs_d[None, :] * stride_qd) - off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + - offs_n[None, :] * stride_kbs) - off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + - offs_n[:, None] * stride_vbs) - off_ksz = (cur_kv_head * stride_kszh + offs_dsz[:, None] * stride_kszd + - offs_n[None, :] * stride_kszbs) - off_vsz = (cur_kv_head * stride_vszh + offs_dsz[None, :] * stride_vszd + - offs_n[:, None] * stride_vszbs) - - q = tl.load(Q + off_q, - mask=(offs_m[:, None] < q_seqlen) & mask_d[None, :], - other=0.0) - - ksz_ptrs = KScalesZeros + off_ksz - vsz_ptrs = VScalesZeros + off_vsz - - if BLOCK_DMODEL1 != 0: - offs_d1 = BLOCK_DMODEL + tl.arange(0, BLOCK_DMODEL1) - mask_d1 = offs_d1 < head_size - offs_d1 = offs_d1 % head_size - off_q1 = ((q_start_loc + offs_m[:, None]) * stride_qbs + - cur_head[:, None] * stride_qh + offs_d1[None, :] * stride_qd) - q1 = tl.load(Q + off_q1, mask=(offs_m[:, None] < q_seqlen) & mask_d1) - off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + - offs_n[None, :] * stride_kbs) - - block_offset_ptrs = 
Block_offsets + cur_batch * stride_boffb - - # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - if quant_policy == 4: - offs_d = tl.arange(0, BLOCK_DMODEL) % (head_size // 2) - offs_dv = tl.arange(0, BLOCK_DV * 2) % (head_size_v) - shift_kd = (tl.arange(0, BLOCK_DMODEL) // (head_size // 2) * 4)[:, - None] - off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + - offs_n[None, :] * stride_kbs) - shift_vd = (tl.arange(0, BLOCK_DV * 2) // head_size_v * 4) - off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + - offs_n[:, None] * stride_vbs) - if BLOCK_DMODEL1 != 0: - offs_d1 = BLOCK_DMODEL // 2 + tl.arange(0, BLOCK_DMODEL1) - shift_k1d = (offs_d1 // (head_size // 2) * 4)[:, None] - offs_d1 = offs_d1 % (head_size // 2) - off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + - offs_n[None, :] * stride_kbs) - acc = tl.zeros([BLOCK_M, BLOCK_DV * 2], - dtype=tl.float32) # v head_dim packed - mask_dv = tl.arange(0, BLOCK_DV * 2) < (head_size_v * 2) - offs_dv = tl.arange(0, BLOCK_DV * 2) % (head_size_v * 2) - else: - acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32) - - kv_start_loc = 0 - if window_size > 0: - start_block_id = tl.maximum(history_len - window_size, 0) // BLOCK_N - kv_min_loc = tl.maximum(history_len + offs_m - window_size, 0) - kv_start_loc = start_block_id * BLOCK_N - block_offset_ptrs += start_block_id - for start_n in range(kv_start_loc, kv_seqlen, BLOCK_N): - b_offset = tl.load(block_offset_ptrs) - block_offset_ptrs += 1 - - # -- compute qk ---- - k = tl.load(K + off_k + b_offset * stride_kp) - if quant_policy == 4: - k = (k >> shift_kd) & 0x0F - ks = tl.load(ksz_ptrs + b_offset * stride_kszp) - kz = tl.load(ksz_ptrs + b_offset * stride_kszp + 1) - if BLOCK_DMODEL1 != 0: - k1 = tl.load(K + off_k1 + b_offset * stride_kp) - if quant_policy == 4: - k1 = (k1 >> shift_k1d) & 0x0F - k1 = ((k1 - kz) * ks).to(q.dtype) - - if quant_policy == 4: - v = tl.load(V + off_v + b_offset * stride_vp) - v = (v >> shift_vd) & 0x0F - else: - v = tl.load(V + off_v + b_offset * stride_vp) - vs = tl.load(vsz_ptrs + b_offset * stride_vszp) - vz = tl.load(vsz_ptrs + b_offset * stride_vszp + 1) - - # k = tl.view(k, (ks.shape[0], ks.shape[1])) - v = ((v - vz) * vs).to(q.dtype) - k = ((k - kz) * ks).to(q.dtype) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - if BLOCK_DMODEL1 != 0: - qk += tl.dot(q1, k1) - qk *= sm_scale - if logit_softcapping > 0.0: - qk = qk / logit_softcapping - qk = tanh(qk) - qk = qk * logit_softcapping - # NOTE: inf - inf = nan, and nan will leads to error - if start_n + BLOCK_N > history_len or window_size > 0: - qk_mask = (history_len + offs_m[:, None]) >= (start_n + - offs_n[None, :]) - if window_size > 0: - qk_mask = qk_mask and ( - (start_n + offs_n[None, :]) >= kv_min_loc[:, None]) - qk = tl.where( - qk_mask, - qk, - float(-1e30), - ) - - # -- compute p, m_i and l_i - m_i_new = tl.maximum(m_i, tl.max(qk, 1)) - p = fast_expf(qk - m_i_new[:, None]) - alpha = fast_expf(m_i - m_i_new) - l_i_new = alpha * l_i + tl.sum(p, 1) - # -- update output accumulator -- - # scale acc - acc = acc * alpha[:, None] - - # update acc - p, v = _convert_pv(p, v) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - acc = fast_dividef(acc, l_i[:, None]) - # initialize pointers to output - off_o = ((q_start_loc + offs_m[:, None]) * stride_obs + - cur_head[:, None] * stride_oh + offs_dv[None, :] * stride_od) - out_ptrs = 
Out + off_o - tl.store(out_ptrs, - acc, - mask=(offs_m[:, None] < q_seqlen) & mask_dv[None, :]) - - def paged_attention_fwd( q: Tensor, k: Tensor, v: Tensor, o: Tensor, block_offsets: Tensor, - q_start_loc: Tensor, - q_seqlens: Tensor, kv_seqlens: Tensor, - max_seqlen: int, k_scales_zeros: Tensor = None, v_scales_zeros: Tensor = None, quant_policy: Literal[0, 4, 8] = 0, @@ -1099,7 +655,6 @@ def paged_attention_fwd( o (Tensor): Output state. block_offsets (Tensor): The block offset of key and value. q_start_loc (Tensor): Start token location of each data in batch. - q_seqlens (Tensor): Query length for each data in batch. kv_seqlens (Tensor): Key/Value length for each data in batch. max_seqlen (int): The max input length. BLOCK (int): The kernel block size. @@ -1142,7 +697,7 @@ def _get_block_d(Lk): if sm_scale is None: sm_scale = 1.0 / (Lq**0.5) - batch, head = q_seqlens.shape[0], q.shape[-2] + batch, head = kv_seqlens.shape[0], q.shape[-2] kv_group_num = q.shape[-2] // k.shape[h_dim] BLOCK = k.size(s_dim) @@ -1153,234 +708,132 @@ def _get_block_d(Lk): 'Please reduce `block_size`.') kernel_meta = get_kernel_meta(q) - is_decoding = q.shape[-3] == q_seqlens.size(0) - if not is_decoding: - BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lq) - if _nv_cap[0] < 8: - BLOCK_M = max(16, 8192 // BLOCK_DMODEL) - else: - BLOCK_M = max(16, 16384 // BLOCK_DMODEL) - num_warps = 4 - num_stages = 2 - kv_head = k.shape[h_dim] - grid = (triton.cdiv(max_seqlen * kv_group_num, - BLOCK_M), kv_head, batch) - if quant_policy > 0: - _fwd_kernel_quant[grid](q, - k, - v, - k_scales_zeros, - v_scales_zeros, - sm_scale, - q_start_loc, - q_seqlens, - kv_seqlens, - block_offsets, - o, - stride_qbs=q.stride(-3), - stride_qh=q.stride(-2), - stride_qd=q.stride(-1), - stride_kp=k.stride(b_dim), - stride_kbs=k.stride(s_dim), - stride_kh=k.stride(h_dim), - stride_kd=k.stride(d_dim), - stride_vp=v.stride(b_dim), - stride_vbs=v.stride(s_dim), - stride_vh=v.stride(h_dim), - stride_vd=v.stride(d_dim), - stride_kszp=k_scales_zeros.stride(b_dim), - stride_kszbs=k_scales_zeros.stride(s_dim), - stride_kszh=k_scales_zeros.stride(h_dim), - stride_kszd=k_scales_zeros.stride(d_dim), - stride_vszp=v_scales_zeros.stride(b_dim), - stride_vszbs=v_scales_zeros.stride(s_dim), - stride_vszh=v_scales_zeros.stride(h_dim), - stride_vszd=v_scales_zeros.stride(d_dim), - quant_policy=quant_policy, - stride_obs=o.stride(-3), - stride_oh=o.stride(-2), - stride_od=o.stride(-1), - stride_boffb=block_offsets.stride(0), - kv_group_num=kv_group_num, - window_size=window_size, - head_size=Lq, - head_size_v=Lv, - logit_softcapping=logit_softcapping, - BLOCK_M=BLOCK_M, - BLOCK_DMODEL=BLOCK_DMODEL, - BLOCK_DV=BLOCK_DV, - BLOCK_N=BLOCK, - BLOCK_DMODEL1=BLOCK_DMODEL1, - num_warps=num_warps, - num_stages=num_stages, - **kernel_meta) - else: - _fwd_kernel[grid](q, - k, - v, - sm_scale, - q_start_loc, - q_seqlens, - kv_seqlens, - block_offsets, - o, - stride_qbs=q.stride(-3), - stride_qh=q.stride(-2), - stride_qd=q.stride(-1), - stride_kp=k.stride(b_dim), - stride_kbs=k.stride(s_dim), - stride_kh=k.stride(h_dim), - stride_kd=k.stride(d_dim), - stride_vp=v.stride(b_dim), - stride_vbs=v.stride(s_dim), - stride_vh=v.stride(h_dim), - stride_vd=v.stride(d_dim), - stride_obs=o.stride(-3), - stride_oh=o.stride(-2), - stride_od=o.stride(-1), - stride_boffb=block_offsets.stride(0), - kv_group_num=kv_group_num, - window_size=window_size, - head_size=Lk, - head_size_v=Lv, - logit_softcapping=logit_softcapping, - BLOCK_M=BLOCK_M, - BLOCK_DMODEL=BLOCK_DMODEL, - 
BLOCK_DV=BLOCK_DV, - BLOCK_N=BLOCK, - BLOCK_DMODEL1=BLOCK_DMODEL1, - num_warps=num_warps, - num_stages=num_stages, - **kernel_meta) + is_decoding = q.shape[-3] == kv_seqlens.size(0) + assert is_decoding, 'we only support decoding paged attention.' + + SPLIT_K = 4 + if quant_policy != 4: + acc = q.new_empty(batch, head, SPLIT_K, Lv + 2, dtype=torch.float32) else: - SPLIT_K = 4 - if quant_policy != 4: - acc = q.new_empty(batch, - head, - SPLIT_K, - Lv + 2, - dtype=torch.float32) - else: - acc = q.new_empty(batch, - head, - SPLIT_K, - o.shape[-1] + 2, - dtype=torch.float32) - BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lq) - p2_kv_group_num = triton.next_power_of_2(kv_group_num) - BLOCK_H = max(16, min(BLOCK, p2_kv_group_num)) - grid_1 = triton.cdiv(head, min(BLOCK_H, kv_group_num)) - grid = ( - grid_1, - SPLIT_K, - batch, - ) - if quant_policy > 0: - _fwd_grouped_split_quant_kernel[grid]( - q, - k, - v, - k_scales_zeros, - v_scales_zeros, - sm_scale, - kv_seqlens, - block_offsets, - acc, - stride_qbs=q.stride(-3), - stride_qh=q.stride(-2), - stride_qd=q.stride(-1), - stride_kp=k.stride(b_dim), - stride_kbs=k.stride(s_dim), - stride_kh=k.stride(h_dim), - stride_kd=k.stride(d_dim), - stride_vp=v.stride(b_dim), - stride_vbs=v.stride(s_dim), - stride_vh=v.stride(h_dim), - stride_vd=v.stride(d_dim), - stride_kszp=k_scales_zeros.stride(b_dim), - stride_kszbs=k_scales_zeros.stride(s_dim), - stride_kszh=k_scales_zeros.stride(h_dim), - stride_kszd=k_scales_zeros.stride(d_dim), - stride_vszp=v_scales_zeros.stride(b_dim), - stride_vszbs=v_scales_zeros.stride(s_dim), - stride_vszh=v_scales_zeros.stride(h_dim), - stride_vszd=v_scales_zeros.stride(d_dim), - quant_policy=quant_policy, - stride_ok=acc.stride(-2), - stride_obs=acc.stride(-4), - stride_oh=acc.stride(-3), - stride_od=acc.stride(-1), - stride_boffb=block_offsets.stride(0), - kv_group_num=kv_group_num, - window_size=window_size, - head_size=Lq, - head_size_v=Lv, - num_heads_q=head, - logit_softcapping=logit_softcapping, - SPLIT_K=SPLIT_K, - BLOCK_DMODEL=BLOCK_DMODEL, - BLOCK_DV=BLOCK_DV, - BLOCK_N=BLOCK, - BLOCK_H=BLOCK_H, - BLOCK_DMODEL1=BLOCK_DMODEL1, - **kernel_meta) + acc = q.new_empty(batch, + head, + SPLIT_K, + o.shape[-1] + 2, + dtype=torch.float32) + BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lq) + p2_kv_group_num = triton.next_power_of_2(kv_group_num) + BLOCK_H = max(16, min(BLOCK, p2_kv_group_num)) + grid_1 = triton.cdiv(head, min(BLOCK_H, kv_group_num)) + grid = ( + grid_1, + SPLIT_K, + batch, + ) + if quant_policy > 0: + _fwd_grouped_split_quant_kernel[grid]( + q, + k, + v, + k_scales_zeros, + v_scales_zeros, + sm_scale, + kv_seqlens, + block_offsets, + acc, + stride_qbs=q.stride(-3), + stride_qh=q.stride(-2), + stride_qd=q.stride(-1), + stride_kp=k.stride(b_dim), + stride_kbs=k.stride(s_dim), + stride_kh=k.stride(h_dim), + stride_kd=k.stride(d_dim), + stride_vp=v.stride(b_dim), + stride_vbs=v.stride(s_dim), + stride_vh=v.stride(h_dim), + stride_vd=v.stride(d_dim), + stride_kszp=k_scales_zeros.stride(b_dim), + stride_kszbs=k_scales_zeros.stride(s_dim), + stride_kszh=k_scales_zeros.stride(h_dim), + stride_kszd=k_scales_zeros.stride(d_dim), + stride_vszp=v_scales_zeros.stride(b_dim), + stride_vszbs=v_scales_zeros.stride(s_dim), + stride_vszh=v_scales_zeros.stride(h_dim), + stride_vszd=v_scales_zeros.stride(d_dim), + quant_policy=quant_policy, + stride_ok=acc.stride(-2), + stride_obs=acc.stride(-4), + stride_oh=acc.stride(-3), + stride_od=acc.stride(-1), + stride_boffb=block_offsets.stride(0), + 
kv_group_num=kv_group_num, + window_size=window_size, + head_size=Lq, + head_size_v=Lv, + num_heads_q=head, + logit_softcapping=logit_softcapping, + SPLIT_K=SPLIT_K, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + BLOCK_DMODEL1=BLOCK_DMODEL1, + **kernel_meta) - else: - _fwd_grouped_split_kernel[grid]( - q, - k, - v, - sm_scale, - kv_seqlens, - block_offsets, - acc, - stride_qbs=q.stride(-3), - stride_qh=q.stride(-2), - stride_qd=q.stride(-1), - stride_kp=k.stride(b_dim), - stride_kbs=k.stride(s_dim), - stride_kh=k.stride(h_dim), - stride_kd=k.stride(d_dim), - stride_vp=v.stride(b_dim), - stride_vbs=v.stride(s_dim), - stride_vh=v.stride(h_dim), - stride_vd=v.stride(d_dim), - stride_ok=acc.stride(-2), - stride_obs=acc.stride(-4), - stride_oh=acc.stride(-3), - stride_od=acc.stride(-1), - stride_boffb=block_offsets.stride(0), - kv_group_num=kv_group_num, - window_size=window_size, - head_size=Lk, - head_size_v=Lv, - num_heads_q=head, - logit_softcapping=logit_softcapping, - SPLIT_K=SPLIT_K, - BLOCK_DMODEL=BLOCK_DMODEL, - BLOCK_DV=BLOCK_DV, - BLOCK_N=BLOCK, - BLOCK_H=BLOCK_H, - BLOCK_DMODEL1=BLOCK_DMODEL1, - **kernel_meta) - - num_warps = 4 - grid = (batch, head) - if quant_policy == 4: - Lv *= 2 - BLOCK_DV *= 2 - _reduce_split_kernel[grid](acc, - o, - stride_ak=acc.stride(2), - stride_abs=acc.stride(0), - stride_ah=acc.stride(1), - stride_ad=acc.stride(3), - stride_obs=o.stride(0), - stride_oh=o.stride(1), - stride_od=o.stride(2), - SPLIT_K=SPLIT_K, - head_size_v=Lv, - BLOCK_DV=BLOCK_DV, - num_warps=num_warps, - num_stages=1, - **kernel_meta) + else: + _fwd_grouped_split_kernel[grid](q, + k, + v, + sm_scale, + kv_seqlens, + block_offsets, + acc, + stride_qbs=q.stride(-3), + stride_qh=q.stride(-2), + stride_qd=q.stride(-1), + stride_kp=k.stride(b_dim), + stride_kbs=k.stride(s_dim), + stride_kh=k.stride(h_dim), + stride_kd=k.stride(d_dim), + stride_vp=v.stride(b_dim), + stride_vbs=v.stride(s_dim), + stride_vh=v.stride(h_dim), + stride_vd=v.stride(d_dim), + stride_ok=acc.stride(-2), + stride_obs=acc.stride(-4), + stride_oh=acc.stride(-3), + stride_od=acc.stride(-1), + stride_boffb=block_offsets.stride(0), + kv_group_num=kv_group_num, + window_size=window_size, + head_size=Lk, + head_size_v=Lv, + num_heads_q=head, + logit_softcapping=logit_softcapping, + SPLIT_K=SPLIT_K, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + BLOCK_DMODEL1=BLOCK_DMODEL1, + **kernel_meta) + + num_warps = 4 + grid = (batch, head) + if quant_policy == 4: + Lv *= 2 + BLOCK_DV *= 2 + _reduce_split_kernel[grid](acc, + o, + stride_ak=acc.stride(2), + stride_abs=acc.stride(0), + stride_ah=acc.stride(1), + stride_ad=acc.stride(3), + stride_obs=o.stride(0), + stride_oh=o.stride(1), + stride_od=o.stride(2), + SPLIT_K=SPLIT_K, + head_size_v=Lv, + BLOCK_DV=BLOCK_DV, + num_warps=num_warps, + num_stages=1, + **kernel_meta) diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py index 093b367ce2..b53c74d95a 100644 --- a/lmdeploy/pytorch/models/cogvlm.py +++ b/lmdeploy/pytorch/models/cogvlm.py @@ -548,20 +548,6 @@ def get_logits(self, hidden_states: torch.Tensor): """compute logits of the model output.""" return self.lm_head(hidden_states) - def support_cuda_graph( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], - attn_metadata: Any = None, - inputs_embeds: torch.Tensor = None, - lang_ids: torch.LongTensor = None, - vision_ids: torch.LongTensor = None, - **kwargs, - ): - 
"""support cudagraph.""" - return inputs_embeds is None - def get_input_embeddings(self): """get input embeddings.""" return self.model.get_input_embeddings() diff --git a/lmdeploy/pytorch/models/internlm2.py b/lmdeploy/pytorch/models/internlm2.py index 497090afc5..a87c848e65 100644 --- a/lmdeploy/pytorch/models/internlm2.py +++ b/lmdeploy/pytorch/models/internlm2.py @@ -360,17 +360,6 @@ def get_logits(self, hidden_states: torch.Tensor): """compute logits of the model output.""" return self.output(hidden_states) - def support_cuda_graph( - self, - input_ids: torch.Tensor, - **kwargs, - ): - """support cudagraph.""" - seq_lens = input_ids.size(1) - if seq_lens <= 512: - return True - return False - def get_input_embeddings(self): """get input embeddings.""" return self.model.get_input_embeddings() diff --git a/lmdeploy/pytorch/models/llama.py b/lmdeploy/pytorch/models/llama.py index 91f9ec4cfc..525c8e3d34 100644 --- a/lmdeploy/pytorch/models/llama.py +++ b/lmdeploy/pytorch/models/llama.py @@ -384,22 +384,6 @@ def get_logits(self, hidden_states: torch.Tensor): """compute logits of the model output.""" return self.lm_head(hidden_states) - def support_cuda_graph( - self, - input_ids: torch.Tensor, - **kwargs, - ): - """support cudagraph.""" - seq_lens = input_ids.size(1) - if seq_lens <= 512: - return True - - # prevent oom on llama-3 70b - if self.config.num_hidden_layers >= 40: - return False - - return False - def get_input_embeddings(self): """get input embeddings.""" return self.model.get_input_embeddings() diff --git a/lmdeploy/pytorch/models/mixtral.py b/lmdeploy/pytorch/models/mixtral.py index 677d82905b..d98efee712 100644 --- a/lmdeploy/pytorch/models/mixtral.py +++ b/lmdeploy/pytorch/models/mixtral.py @@ -340,17 +340,6 @@ def get_logits(self, hidden_states: torch.Tensor): """compute logits of the model output.""" return self.lm_head(hidden_states) - def support_cuda_graph( - self, - input_ids: torch.Tensor, - **kwargs, - ): - """support cudagraph.""" - seq_lens = input_ids.size(1) - if seq_lens <= 512: - return True - return False - def get_input_embeddings(self): """get input embeddings.""" return self.model.get_input_embeddings() diff --git a/lmdeploy/pytorch/models/utils/cudagraph.py b/lmdeploy/pytorch/models/utils/cudagraph.py index f56899be89..149376e4be 100644 --- a/lmdeploy/pytorch/models/utils/cudagraph.py +++ b/lmdeploy/pytorch/models/utils/cudagraph.py @@ -48,8 +48,7 @@ def support_cuda_graph( **kwargs, ): """return True is model support cudagraph.""" - seq_lens = input_ids.size(1) - return seq_lens <= 256 + return attn_metadata.is_decoding def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, **kwargs) -> BuffType: diff --git a/tests/pytorch/kernel/test_flash_attention.py b/tests/pytorch/kernel/test_flash_attention.py new file mode 100644 index 0000000000..7d4b7a7f3a --- /dev/null +++ b/tests/pytorch/kernel/test_flash_attention.py @@ -0,0 +1,250 @@ +import math + +import pytest +import torch + + +def _conti_input(data, q_seqlens): + data = [x[:l] for x, l in zip(data, q_seqlens)] + data = torch.cat(data, dim=0) + return data + + +def _make_bias(q_seqlens, history_lens, neg_val): + full_seq_lens = q_seqlens + history_lens + max_seq_len = q_seqlens.max().item() + max_full_len = full_seq_lens.max().item() + seq_ranges = [torch.arange(max_seq_len) for _ in q_seqlens] + for r, l in zip(seq_ranges, q_seqlens): + r[l:] = -max_full_len + seq_ranges = torch.stack(seq_ranges, dim=0).cuda() + kv_ranges = [torch.arange(max_full_len) for _ in full_seq_lens] + kv_ranges = 
torch.stack(kv_ranges, 0).cuda() + mask = kv_ranges[:, None, :] - seq_ranges[:, :, None] > history_lens[:, + None, + None] + return mask.float() * neg_val + + +def _naive_attention(batched_q, batched_kv, bias): + batched_k, batched_v = batched_kv + + num_heads_q = batched_q.shape[2] + num_heads_k = batched_k.shape[2] + head_dim = batched_q.shape[-1] + group = num_heads_q // num_heads_k + + q = batched_q.transpose(1, 2) + k = batched_k.permute(0, 2, 3, 1) + v = batched_v.transpose(1, 2) + + # expand group + k = k.unsqueeze(2).expand(-1, -1, group, -1, -1).flatten(1, 2) + v = v.unsqueeze(2).expand(-1, -1, group, -1, -1).flatten(1, 2) + + qk = torch.matmul(q, k) / math.sqrt(head_dim) + attn_weight = qk + bias[:, None] + attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32) + attn_weight = attn_weight.to(q.dtype) + attn_output = torch.matmul(attn_weight, v) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output + + +def _naive_window_attention(q, k, v, seqlens_q, seqlens_k, window_size): + from flash_attn import flash_attn_varlen_func + + def _make_cu_seqlens(seqlens): + cu_seqlens = seqlens.cumsum(0) + cu_zero = cu_seqlens.new_zeros(1) + cu_seqlens = torch.cat([cu_zero, cu_seqlens]) + return cu_seqlens + + max_seqlen_q = seqlens_q.max().item() + max_seqlen_k = seqlens_k.max().item() + cu_seqlens_q = _make_cu_seqlens(seqlens_q).int() + cu_seqlens_k = _make_cu_seqlens(seqlens_k).int() + + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, + causal=True, + window_size=window_size) + return output + + +class TestFlashAttention: + + @pytest.fixture + def dtype(self): + yield torch.float16 + + @pytest.fixture + def head_dim_k(self, request): + yield request.param + + @pytest.fixture + def head_dim_v(self, request): + yield request.param + + @pytest.fixture + def num_heads_q(self, request): + yield request.param + + @pytest.fixture + def num_heads_k(self, request): + yield request.param + + @pytest.fixture + def q_seqlens(self, request): + yield torch.tensor(request.param, device='cuda') + + @pytest.fixture + def q_start_loc(self, q_seqlens): + yield q_seqlens.cumsum(0) - q_seqlens + + @pytest.fixture + def history_lens(self, request): + yield torch.tensor(request.param, device='cuda') + + @pytest.fixture + def kv_seqlens(self, q_seqlens, history_lens): + yield q_seqlens + history_lens + + @pytest.fixture + def kv_start_loc(self, kv_seqlens): + yield kv_seqlens.cumsum(0) - kv_seqlens + + @pytest.fixture + def batched_q(self, q_seqlens, num_heads_q, head_dim_k, dtype): + torch.manual_seed(123) + batch_size = len(q_seqlens) + max_seq_len = q_seqlens.max().item() + inputs = torch.rand(batch_size, + max_seq_len, + num_heads_q, + head_dim_k, + dtype=dtype, + device='cuda') + yield inputs + + @pytest.fixture + def batched_kv(self, q_seqlens, history_lens, num_heads_k, head_dim_k, + head_dim_v, dtype): + torch.manual_seed(123) + batch_size = len(q_seqlens) + full_seq_lens = q_seqlens + history_lens + max_seq_len = full_seq_lens.max().item() + k = torch.rand(batch_size, + max_seq_len, + num_heads_k, + head_dim_k, + dtype=dtype, + device='cuda') + v = torch.rand(batch_size, + max_seq_len, + num_heads_k, + head_dim_v, + dtype=dtype, + device='cuda') + yield k, v + + @pytest.fixture + def conti_q(self, q_seqlens, batched_q): + yield _conti_input(batched_q, q_seqlens) + + @pytest.fixture + def conti_kv(self, kv_seqlens, batched_kv): + conti_k = _conti_input(batched_kv[0], kv_seqlens) + 
conti_k = conti_k.transpose(0, 1).contiguous() + conti_v = _conti_input(batched_kv[1], kv_seqlens) + conti_v = conti_v.transpose(0, 1).contiguous() + yield (conti_k, conti_v) + + @pytest.fixture + def mask(self, q_seqlens, history_lens): + neg_val = -1e30 + yield _make_bias(q_seqlens, history_lens, neg_val) + + @pytest.fixture + def gt(self, batched_q, batched_kv, mask): + yield _naive_attention(batched_q, batched_kv, mask) + + @pytest.fixture + def conti_gt(self, gt, q_seqlens): + yield _conti_input(gt, q_seqlens) + + @pytest.mark.parametrize('head_dim_k', [32, 48], indirect=True) + @pytest.mark.parametrize('head_dim_v', [32], indirect=True) + @pytest.mark.parametrize('num_heads_q', [8, 2], indirect=True) + @pytest.mark.parametrize('num_heads_k', [2], indirect=True) + @pytest.mark.parametrize(['q_seqlens', 'history_lens'], + [([30, 50, 70, 90], [50, 40, 30, 20])], + indirect=True) + def test_flash_attention(self, conti_q, conti_kv, q_start_loc, q_seqlens, + kv_start_loc, kv_seqlens, head_dim_v, conti_gt): + from lmdeploy.pytorch.kernels.cuda.flashattention import \ + flash_attention_fwd + max_seq_len = q_seqlens.max().item() + + conti_k, conti_v = conti_kv + out = conti_q.new_empty(*conti_q.shape[:-1], head_dim_v) + flash_attention_fwd(conti_q, + conti_k, + conti_v, + out, + q_start_loc=q_start_loc, + q_seqlens=q_seqlens, + kv_start_loc=kv_start_loc, + kv_seqlens=kv_seqlens, + max_seqlen=max_seq_len) + torch.testing.assert_close(out, conti_gt, atol=1e-3, rtol=1e-5) + + @pytest.fixture + def win_size(self, request): + yield request.param + + @pytest.fixture + def window_gt(self, conti_q, conti_kv, q_seqlens, kv_seqlens, win_size): + conti_k, conti_v = conti_kv + yield _naive_window_attention(conti_q, + conti_k.transpose(0, 1), + conti_v.transpose(0, 1), + q_seqlens, + kv_seqlens, + window_size=(win_size, win_size)) + + @pytest.mark.parametrize('head_dim_k', [16], indirect=True) + @pytest.mark.parametrize('head_dim_v', [16], indirect=True) + @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)], + indirect=True) + @pytest.mark.parametrize(['q_seqlens', 'history_lens'], [ + ([30, 50, 70, 90], [50, 40, 30, 90]), + ], + indirect=True) + @pytest.mark.parametrize('win_size', (32, ), indirect=True) + def test_window_attention(self, conti_q, conti_kv, q_start_loc, q_seqlens, + kv_start_loc, kv_seqlens, head_dim_v, win_size, + window_gt): + from lmdeploy.pytorch.kernels.cuda.flashattention import \ + flash_attention_fwd + max_seq_len = q_seqlens.max().item() + + conti_k, conti_v = conti_kv + out = conti_q.new_empty(*conti_q.shape[:-1], head_dim_v) + flash_attention_fwd(conti_q, + conti_k, + conti_v, + out, + q_start_loc=q_start_loc, + q_seqlens=q_seqlens, + kv_start_loc=kv_start_loc, + kv_seqlens=kv_seqlens, + max_seqlen=max_seq_len, + window_size=win_size) + torch.testing.assert_close(out, window_gt, atol=1e-3, rtol=1e-5) diff --git a/tests/pytorch/kernel/test_flatten_kv_cache.py b/tests/pytorch/kernel/test_flatten_kv_cache.py new file mode 100644 index 0000000000..9b870bc6b6 --- /dev/null +++ b/tests/pytorch/kernel/test_flatten_kv_cache.py @@ -0,0 +1,182 @@ +import pytest +import torch + + +def _div_up(a, b): + return (a + b - 1) // b + + +class TestFlattenKVCache: + + @pytest.fixture + def out_dtype(self): + yield torch.float16 + + @pytest.fixture + def num_heads(self): + yield 4 + + @pytest.fixture + def head_dim(self): + yield 32 + + @pytest.fixture + def block_size(self): + yield 16 + + @pytest.fixture + def kv_lens(self): + yield [2, 24, 47, 48] + + @pytest.fixture + def 
batch_size(self, kv_lens): + yield len(kv_lens) + + @pytest.fixture + def num_blocks_per_input(self, kv_lens, block_size): + yield [_div_up(kv_len, block_size) for kv_len in kv_lens] + + @pytest.fixture + def max_num_blocks(self, num_blocks_per_input): + yield max(num_blocks_per_input) + + @pytest.fixture + def out_size(self, kv_lens): + yield sum(kv_lens) + + @pytest.fixture + def kv_seqlens(self, kv_lens): + yield torch.tensor(kv_lens).cuda() + + @pytest.fixture + def k_caches(self, batch_size, max_num_blocks, block_size, num_heads, + head_dim, out_dtype): + shape = (batch_size * max_num_blocks, block_size, num_heads, head_dim) + yield torch.rand(shape, dtype=out_dtype, device='cuda') + + @pytest.fixture + def v_caches(self, k_caches): + yield torch.rand_like(k_caches) + + @pytest.fixture + def block_offsets(self, num_blocks_per_input): + batch_size = len(num_blocks_per_input) + max_num_blocks = max(num_blocks_per_input) + batch_ids = torch.arange(batch_size) + ret = torch.arange(max_num_blocks) + ret = batch_ids[:, None] + ret[None, :] * batch_size + yield ret.cuda() + + @pytest.fixture + def gt(self, k_caches, v_caches, kv_lens, block_offsets, block_size, + num_heads, out_size, head_dim): + k_states = k_caches.new_empty(num_heads, out_size, head_dim) + v_states = v_caches.new_empty(num_heads, out_size, head_dim) + start_loc = 0 + for kv_len, block_offs in zip(kv_lens, block_offsets): + remain_len = kv_len + for idx, _ in enumerate(range(0, kv_len, block_size)): + b_off = block_offs[idx] + block_len = min(block_size, remain_len) + end_loc = start_loc + block_len + k_block = k_caches[b_off, :block_len] + v_block = v_caches[b_off, :block_len] + k_states[:, start_loc:end_loc] = k_block.transpose(0, 1) + v_states[:, start_loc:end_loc] = v_block.transpose(0, 1) + start_loc = end_loc + remain_len -= block_len + + yield k_states, v_states + + def test_flatten_kv_cache(self, k_caches, v_caches, kv_seqlens, + block_offsets, out_size, gt): + from lmdeploy.pytorch.kernels.cuda.flatten_kv_cache import \ + flatten_kv_cache + + k_states, v_states = flatten_kv_cache(k_caches, + v_caches, + kv_seqlens, + block_offsets, + out_size=out_size) + torch.testing.assert_close(k_states, gt[0]) + torch.testing.assert_close(v_states, gt[1]) + + +def precise_round(x: torch.Tensor): + return x.sign() * (x.abs() + 0.5).floor() + + +def quant(kv: torch.Tensor, nbits: int = 8): + """Quant kv on the head_dim.""" + amax = kv.amax(dim=-1, keepdim=True) + amin = kv.amin(dim=-1, keepdim=True) + scales = (amax - amin) / (2**nbits - 1) + zeros = -amin / scales + q_kv = (kv / scales + zeros + 0.5).to(torch.uint8) + if nbits == 4: + q_kv1, q_kv2 = q_kv.split(q_kv.shape[-1] // 2, -1) + q_kv = q_kv1 + q_kv2 * 16 + return q_kv, torch.cat([scales, zeros], dim=-1) + + +class TestFlattenKVCacheQuant8(TestFlattenKVCache): + + @pytest.fixture + def nbits(self): + yield 8 + + @pytest.fixture + def atol(self): + yield 4e-3 + + @pytest.fixture + def rtol(self): + yield 1e-5 + + @pytest.fixture + def k_quant(self, k_caches, nbits): + yield quant(k_caches, nbits) + + @pytest.fixture + def v_quant(self, v_caches, nbits): + yield quant(v_caches, nbits) + + def test_flatten_kv_cache(self, k_quant, v_quant, kv_seqlens, + block_offsets, out_size, out_dtype, nbits, gt, + atol, rtol): + from lmdeploy.pytorch.kernels.cuda.flatten_kv_cache import \ + flatten_kv_cache + + k_caches, k_sz = k_quant + v_caches, v_sz = v_quant + + k_sz = k_sz.to(out_dtype) + v_sz = v_sz.to(out_dtype) + + k_states, v_states = flatten_kv_cache(k_caches, + v_caches, + 
kv_seqlens, + block_offsets, + out_size=out_size, + out_dtype=out_dtype, + k_scales_zeros=k_sz, + v_scales_zeros=v_sz, + quant_policy=nbits) + + torch.testing.assert_close(k_states, gt[0], atol=atol, rtol=rtol) + torch.testing.assert_close(v_states, gt[1], atol=atol, rtol=rtol) + + +class TestFlattenKVCacheQuant4(TestFlattenKVCacheQuant8): + + @pytest.fixture + def nbits(self): + yield 4 + + @pytest.fixture + def atol(self): + yield 0.05 + + @pytest.fixture + def rtol(self): + yield 1e-3 diff --git a/tests/pytorch/kernel/test_paged_attention.py b/tests/pytorch/kernel/test_paged_attention.py index 0ef0db7330..bd77027a00 100644 --- a/tests/pytorch/kernel/test_paged_attention.py +++ b/tests/pytorch/kernel/test_paged_attention.py @@ -145,26 +145,23 @@ def layout(self, request): yield request.param @pytest.fixture - def seq_lens(self, request): + def history_lens(self, request): yield torch.tensor(request.param, device='cuda') @pytest.fixture - def start_loc(self, seq_lens): - seq_sum = seq_lens.cumsum(0) - start_loc = torch.cat([seq_sum.new_zeros(1), seq_sum[:-1]], dim=0) - yield start_loc + def seq_lens(self, history_lens): + yield torch.ones_like(history_lens) @pytest.fixture - def history_lens(self, request): - yield torch.tensor(request.param, device='cuda') + def kv_seqlens(self, history_lens): + yield 1 + history_lens @pytest.fixture - def batched_q(self, seq_lens, num_heads_q, feat_dim, dtype): + def batched_q(self, kv_seqlens, num_heads_q, feat_dim, dtype): torch.manual_seed(123) - batch_size = len(seq_lens) - max_seq_len = seq_lens.max().item() + batch_size = len(kv_seqlens) inputs = torch.rand(batch_size, - max_seq_len, + 1, num_heads_q, feat_dim, dtype=dtype, @@ -172,12 +169,10 @@ def batched_q(self, seq_lens, num_heads_q, feat_dim, dtype): yield inputs @pytest.fixture - def batched_kv(self, seq_lens, history_lens, num_heads_k, feat_dim, - feat_dim_v, dtype): + def batched_kv(self, kv_seqlens, num_heads_k, feat_dim, feat_dim_v, dtype): torch.manual_seed(123) - batch_size = len(seq_lens) - full_seq_lens = seq_lens + history_lens - max_seq_len = full_seq_lens.max().item() + batch_size = len(kv_seqlens) + max_seq_len = kv_seqlens.max().item() k = torch.rand(batch_size, max_seq_len, num_heads_k, @@ -193,14 +188,14 @@ def batched_kv(self, seq_lens, history_lens, num_heads_k, feat_dim, yield k, v @pytest.fixture - def conti_q(self, seq_lens, batched_q): + def conti_q(self, kv_seqlens, batched_q): + seq_lens = torch.ones_like(kv_seqlens) yield _conti_input(batched_q, seq_lens) @pytest.fixture - def block_offsets(self, seq_lens, history_lens, block_size): - full_seq_lens = seq_lens + history_lens - batch_size = full_seq_lens.size(0) - num_blocks = (full_seq_lens + block_size - 1) // block_size + def block_offsets(self, kv_seqlens, block_size): + batch_size = kv_seqlens.size(0) + num_blocks = (kv_seqlens + block_size - 1) // block_size offset = [ torch.arange(size) * batch_size + idx @@ -215,23 +210,25 @@ def block_offsets(self, seq_lens, history_lens, block_size): yield new_offset.cuda() @pytest.fixture - def conti_kv(self, batched_kv, seq_lens, history_lens): - full_seq_lens = seq_lens + history_lens + def conti_kv(self, batched_kv, history_lens): + full_seq_lens = 1 + history_lens conti_k = _conti_input(batched_kv[0], full_seq_lens) conti_v = _conti_input(batched_kv[1], full_seq_lens) yield (conti_k, conti_v) @pytest.fixture - def blocked_kv(self, batched_kv, seq_lens, history_lens, block_offsets, + def blocked_kv(self, batched_kv, kv_seqlens, history_lens, block_offsets, block_size, 
num_heads_k, feat_dim, feat_dim_v, layout): batched_k, batched_v = batched_kv + seq_lens = torch.ones_like(kv_seqlens) yield _make_blocked_cache(batched_k, batched_v, seq_lens, history_lens, block_offsets, block_size, num_heads_k, feat_dim, feat_dim_v, layout) @pytest.fixture - def mask(self, seq_lens, history_lens): + def mask(self, history_lens): neg_val = -1e30 + seq_lens = torch.ones_like(history_lens) yield _make_bias(seq_lens, history_lens, neg_val) @pytest.fixture @@ -247,18 +244,13 @@ def conti_gt(self, gt, seq_lens): @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(128, 2), (8, 2), (2, 2)], indirect=True) - @pytest.mark.parametrize(['seq_lens', 'history_lens'], - [([30, 50, 70, 90], [50, 40, 30, 20]), - ([1, 1, 1, 1], [50, 40, 30, 20])], - indirect=True) + @pytest.mark.parametrize('history_lens', [(50, 40, 30, 20)], indirect=True) @pytest.mark.parametrize('block_size', [16], indirect=True) @pytest.mark.parametrize('layout', ['bshd', 'bhsd'], indirect=True) def test_paged_attention(self, conti_q, blocked_kv, block_offsets, - start_loc, seq_lens, history_lens, feat_dim_v, - layout, conti_gt): + history_lens, feat_dim_v, layout, conti_gt): from lmdeploy.pytorch.kernels import paged_attention_fwd - kv_seq_lens = seq_lens + history_lens - max_seq_len = seq_lens.max().item() + kv_seq_lens = 1 + history_lens blocked_k, blocked_v = blocked_kv out = conti_q.new_empty(*conti_q.shape[:-1], feat_dim_v) @@ -268,10 +260,7 @@ def test_paged_attention(self, conti_q, blocked_kv, block_offsets, blocked_v, out, block_offsets=block_offsets, - q_start_loc=start_loc, - q_seqlens=seq_lens, kv_seqlens=kv_seq_lens, - max_seqlen=max_seq_len, kv_layout=layout) torch.testing.assert_close(out, conti_gt, atol=1e-3, rtol=1e-5) @@ -293,20 +282,18 @@ def window_gt(self, conti_q, conti_kv, seq_lens, history_lens, win_size): @pytest.mark.parametrize('feat_dim_v', [16], indirect=True) @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)], indirect=True) - @pytest.mark.parametrize(['seq_lens', 'history_lens'], [ - ([30, 50, 70, 90], [50, 40, 30, 20]), - ([1, 1, 1, 1], [50, 40, 30, 20]), + @pytest.mark.parametrize('history_lens', [ + (50, 40, 30, 20), ], indirect=True) @pytest.mark.parametrize('win_size', (32, ), indirect=True) @pytest.mark.parametrize('block_size', [16], indirect=True) @pytest.mark.parametrize('layout', ['bshd'], indirect=True) def test_window_attention(self, conti_q, blocked_kv, block_offsets, - start_loc, seq_lens, history_lens, feat_dim_v, - win_size, layout, window_gt): + history_lens, feat_dim_v, win_size, layout, + window_gt): from lmdeploy.pytorch.kernels import paged_attention_fwd - kv_seq_lens = seq_lens + history_lens - max_seq_len = seq_lens.max().item() + kv_seq_lens = 1 + history_lens blocked_k, blocked_v = blocked_kv out = conti_q.new_empty(*conti_q.shape[:-1], feat_dim_v) @@ -315,26 +302,19 @@ def test_window_attention(self, conti_q, blocked_kv, block_offsets, blocked_v, out, block_offsets=block_offsets, - q_start_loc=start_loc, - q_seqlens=seq_lens, kv_seqlens=kv_seq_lens, - max_seqlen=max_seq_len, window_size=win_size, kv_layout=layout) torch.testing.assert_close(out, window_gt, atol=1e-3, rtol=1e-5) -def precise_round(x: torch.Tensor): - return x.sign() * (x.abs() + 0.5).floor() - - def quant(kv: torch.Tensor, nbits: int = 8): """Quant kv on the head_dim.""" amax = kv.amax(dim=-1, keepdim=True) amin = kv.amin(dim=-1, keepdim=True) scales = (amax - amin) / (2**nbits - 1) zeros = -amin / scales - q_kv = precise_round((kv - amin) / scales).to(torch.uint8) + q_kv = 
(kv / scales + zeros + 0.5).to(torch.uint8) if nbits == 4: q_kv1, q_kv2 = q_kv.split(q_kv.shape[-1] // 2, -1) q_kv = q_kv1 + q_kv2 * 16 @@ -400,17 +380,13 @@ def blocked_kv(self, batched_kv, seq_lens, history_lens, block_offsets, @pytest.mark.parametrize('feat_dim_v', [32], indirect=True) @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(8, 2), (2, 2)], indirect=True) - @pytest.mark.parametrize(['seq_lens', 'history_lens'], - [([30, 50, 70, 90], [50, 40, 30, 20]), - ([1, 1, 1, 1], [50, 40, 30, 20])], - indirect=True) + @pytest.mark.parametrize('history_lens', [(50, 40, 30, 20)], indirect=True) @pytest.mark.parametrize('block_size', [16], indirect=True) def test_paged_attention(self, conti_q, blocked_kv, block_offsets, - start_loc, seq_lens, history_lens, feat_dim_v, - conti_gt, nbits): + seq_lens, history_lens, feat_dim_v, conti_gt, + nbits): from lmdeploy.pytorch.kernels import paged_attention_fwd - kv_seq_lens = seq_lens + history_lens - max_seq_len = seq_lens.max().item() + kv_seq_lens = 1 + history_lens blocked_k, blocked_v, blocked_ksz, blocked_vsz = blocked_kv out = conti_q.new_empty(*conti_q.shape[:-1], feat_dim_v) @@ -423,10 +399,7 @@ def test_paged_attention(self, conti_q, blocked_kv, block_offsets, v_scales_zeros=blocked_vsz, quant_policy=nbits, block_offsets=block_offsets, - q_start_loc=start_loc, - q_seqlens=seq_lens, - kv_seqlens=kv_seq_lens, - max_seqlen=max_seq_len) + kv_seqlens=kv_seq_lens) if nbits == 4: torch.testing.assert_close(out, conti_gt, atol=0.05, rtol=0.01) else: @@ -436,19 +409,17 @@ def test_paged_attention(self, conti_q, blocked_kv, block_offsets, @pytest.mark.parametrize('feat_dim_v', [16], indirect=True) @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)], indirect=True) - @pytest.mark.parametrize(['seq_lens', 'history_lens'], [ - ([30, 50, 70, 90], [50, 40, 30, 20]), - ([1, 1, 1, 1], [50, 40, 30, 20]), + @pytest.mark.parametrize('history_lens', [ + (50, 40, 30, 20), ], indirect=True) @pytest.mark.parametrize('win_size', (32, ), indirect=True) @pytest.mark.parametrize('block_size', [16], indirect=True) def test_window_attention(self, conti_q, blocked_kv, block_offsets, - start_loc, seq_lens, history_lens, feat_dim_v, - win_size, window_gt, nbits): + history_lens, feat_dim_v, win_size, window_gt, + nbits): from lmdeploy.pytorch.kernels import paged_attention_fwd - kv_seq_lens = seq_lens + history_lens - max_seq_len = seq_lens.max().item() + kv_seq_lens = 1 + history_lens blocked_k, blocked_v, blocked_ksz, blocked_vsz = blocked_kv out = conti_q.new_empty(*conti_q.shape[:-1], feat_dim_v) @@ -460,10 +431,7 @@ def test_window_attention(self, conti_q, blocked_kv, block_offsets, v_scales_zeros=blocked_vsz, quant_policy=nbits, block_offsets=block_offsets, - q_start_loc=start_loc, - q_seqlens=seq_lens, kv_seqlens=kv_seq_lens, - max_seqlen=max_seq_len, window_size=win_size) if nbits == 4: torch.testing.assert_close(out, window_gt, atol=0.05, rtol=0.01) From 78ab485e4a563e82f8d560e836ebeeec4a2c22b1 Mon Sep 17 00:00:00 2001 From: Willow Date: Fri, 8 Nov 2024 19:31:35 +0800 Subject: [PATCH 065/122] [Feature]: support LlavaForConditionalGeneration with turbomind inference (#2710) * feat: support llava_qwen2 for fp16 and awq * update generate gemm config script for VLM * lint: fix lint warning * doc: presenting the usage in the user guide * resolve conflict issue and refactor for better design * fix and doc: - fix tune attribute error - add chinese llava doc * keep LlavaLlamaForCausalLM/LlavaMistralForCausalLM to llama * fix attn_bias default value 
--- docs/en/multi_modal/llava.md | 138 +++++++++++++++++- docs/zh_cn/multi_modal/llava.md | 134 ++++++++++++++++- .../turbomind/deploy/source_model/__init__.py | 1 + .../turbomind/deploy/source_model/llava.py | 89 +++++++++++ lmdeploy/turbomind/generate_gemm_config.py | 4 + lmdeploy/turbomind/supported_models.py | 6 + 6 files changed, 370 insertions(+), 2 deletions(-) create mode 100644 lmdeploy/turbomind/deploy/source_model/llava.py diff --git a/docs/en/multi_modal/llava.md b/docs/en/multi_modal/llava.md index cf95e15d5c..8f052227d5 100644 --- a/docs/en/multi_modal/llava.md +++ b/docs/en/multi_modal/llava.md @@ -1,3 +1,139 @@ # LLaVA -TODO
+LMDeploy supports the following LLaVA series of models, which are detailed in the table below:
+
+| Model                                | Size | Supported Inference Engine |
+| :----------------------------------: | :--: | :------------------------: |
+| llava-hf/Llava-interleave-qwen-7b-hf | 7B   | TurboMind, PyTorch         |
+| llava-hf/llava-1.5-7b-hf             | 7B   | TurboMind, PyTorch         |
+| liuhaotian/llava-v1.6-vicuna-7b      | 7B   | TurboMind, PyTorch         |
+| liuhaotian/llava-v1.6-mistral-7b     | 7B   | TurboMind, PyTorch         |
+
+The following sections demonstrate how to deploy a LLaVA model using LMDeploy, with [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) as an example.
+
+## Installation
+
+Please install LMDeploy by following the [installation guide](../get_started/installation.md).
+
+Or, you can use the official docker image:
+
+```shell
+docker pull openmmlab/lmdeploy:latest
+```
+
+## Offline inference
+
+The following sample code shows the basic usage of the VLM pipeline. For detailed information, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md).
+
+```python
+from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
+from lmdeploy.vl import load_image
+
+
+pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf", backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5),
+                gen_config=GenerationConfig(max_new_tokens=512))
+
+image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg')
+prompt = 'Describe the image.'
+print(f'prompt:{prompt}')
+response = pipe((prompt, image))
+print(response)
+
+```
+
+More examples are listed below:
+
    + + multi-image multi-round conversation, combined images + + +```python +from lmdeploy import pipeline, GenerationConfig + +pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +``` + +
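Batched inference reuses the same pipeline API: passing a list of `(prompt, image)` tuples returns one response per pair. The snippet below is a minimal sketch following the batch-prompts usage described in the [VLM Offline Inference Pipeline](./vl_pipeline.md) guide; the prompts and image URLs here are placeholders for illustration.

```python
from lmdeploy import GenerationConfig, pipeline
from lmdeploy.vl import load_image

pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf')

image_urls = [
    'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg',
    'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg',
]
# one (prompt, image) pair per request; the engine batches the requests internally
prompts = [('Describe the image.', load_image(url)) for url in image_urls]
responses = pipe(prompts, gen_config=GenerationConfig(max_new_tokens=512))
print([r.text for r in responses])
```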
    + +## Online serving + +You can launch the server by the `lmdeploy serve api_server` CLI: + +```shell +lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +You can also start the service using the aforementioned built docker image: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:latest \ + lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +The docker compose is another option. Create a `docker-compose.yml` configuration file in the root directory of the lmdeploy project as follows: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:latest + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +Then, you can execute the startup command as below: + +```shell +docker-compose up -d +``` + +If you find the following logs after running `docker logs -f lmdeploy`, it means the service launches successfully. + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +The arguments of `lmdeploy serve api_server` can be reviewed in detail by `lmdeploy serve api_server -h`. + +More information about `api_server` as well as how to access the service can be found from [here](api_server_vl.md) diff --git a/docs/zh_cn/multi_modal/llava.md b/docs/zh_cn/multi_modal/llava.md index cf95e15d5c..c40f37308a 100644 --- a/docs/zh_cn/multi_modal/llava.md +++ b/docs/zh_cn/multi_modal/llava.md @@ -1,3 +1,135 @@ # LLaVA -TODO +LMDeploy 支持以下 LLaVA 系列模型,具体如下表所示: + +| 模型 | 大小 | 支持的推理引擎 | +| :----------------------------------: | :--: | :----------------: | +| llava-hf/Llava-interleave-qwen-7b-hf | 7B | TurboMind, PyTorch | +| llava-hf/llava-1.5-7b-hf | 7B | TurboMind, PyTorch | +| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind, PyTorch | +| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind, PyTorch | + +接下来的章节将演示如何使用 LMDeploy 部署 LLaVA 模型,并以 [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) 为例。 + +## 安装 + +请按照[安装指南](../get_started/installation.md)安装 LMDeploy。 + +或者,您也可以使用官方的 Docker 镜像: + +```shell +docker pull openmmlab/lmdeploy:latest +``` + +## 离线推理 + +以下示例代码展示了 VLM pipeline 的基本用法。有关详细信息,请参考 [VLM 离线推理流程](./vl_pipeline.md)。 + +```python +from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline +from lmdeploy.vl import load_image + +pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf", backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5), + gen_config=GenerationConfig(max_new_tokens=512)) + +image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg') +prompt = 'Describe the image.' +print(f'prompt:{prompt}') +response = pipe((prompt, image)) +print(response) +``` + +更多示例: + +
    + 多图片多轮对话,组合图片 + +```python +from lmdeploy import pipeline, GenerationConfig + +pipe = pipeline('llava-hf/llava-interleave-qwen-7b-hf', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +``` + +
    + +## 在线服务 + +可以使用 `lmdeploy serve api_server` CLI 启动服务器: + +```shell +lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +或者,使用前面提到的 Docker 镜像启动服务: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:latest \ + lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf +``` + +采用 Docker Compose 部署也是一种常见选择。在 lmdeploy 项目的根目录创建 `docker-compose.yml` 文件,如下: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:latest + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server llava-hf/llava-interleave-qwen-7b-hf + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +然后,可以执行以下命令启动服务: + +```shell +docker-compose up -d +``` + +当运行 `docker logs -f lmdeploy` 后看到如下日志,说明服务启动成功: + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +可以通过 `lmdeploy serve api_server -h` 查看 `lmdeploy serve api_server` 的参数详情。 + +关于 `api_server` 以及如何访问服务的更多信息可以在[这里](api_server_vl.md)找到。 diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index a36102e1c6..b1da698e2e 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -5,6 +5,7 @@ from .internlm2 import InternLM2Model # noqa: F401 from .internvl import InternVLModel # noqa: F401 from .llama import LlamaModel # noqa: F401 +from .llava import LlavaModel # noqa: F401 from .meta_llama import MetaLlamaModel # noqa: F401 from .minicpmv import MiniCPMVModel # noqa: F401 from .mixtral import MixtralModel # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/llava.py b/lmdeploy/turbomind/deploy/source_model/llava.py new file mode 100644 index 0000000000..3b4d82c37b --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/llava.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class LlavaReader(LlamaReader): + """LlavaReader for llama model.""" + + attn_layer_prefix = 'language_model.model.layers' + attn_layer_patten = r'language_model.model.layers.([0-9]+).' 
+ tok_embeddings_key = 'language_model.model.embed_tokens.weight' + norm_weight_key = 'language_model.model.norm.weight' + output_weight_key = 'language_model.lm_head.weight' + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, + model_cfg: dict, policy): + model_cfg = model_cfg.get('text_config') + super().__init__(new_params, unused_params, last_bin, model_cfg, + policy) + + +@INPUT_MODELS.register_module(name='llava') +class LlavaModel(LlamaModel): + """LlavaModel model in hf format.""" + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs): + super().__init__(model_path, tokenizer_path, **kwargs) + from transformers import AutoConfig + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + config = getattr(config, 'text_config', config) + arch = config.architectures[0] + _readers = dict(Qwen2ForCausalLM=LlavaReader, + LlamaForCausalLM=LlavaReader) + self.Reader = _readers[arch] + self.arch = arch + + def model_info(self): + """Read model info for LlavaForConditionalGeneration. + + https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf + """ + params_path = osp.join(self.model_path, 'config.json') + with open(params_path) as f: + model_arg = json.load(f)['text_config'] + num_layer = model_arg.get('num_hidden_layers', 32) + norm_eps = model_arg.get('rms_norm_eps', 1e-6) + attn_head_num = model_arg.get('num_attention_heads', 32) + if 'num_key_value_heads' in model_arg: + kv_head_num = model_arg.get('num_key_value_heads', 32) + else: + kv_head_num = model_arg.get('num_attention_heads', 32) + rope_theta = float(model_arg.get('rope_theta', 10000.0)) + max_position_embeddings = int( + model_arg.get('max_position_embeddings', 0)) + rope_scaling = model_arg.get('rope_scaling', None) + scaling_factor = 0.0 + use_dynamic_ntk = 0 + + # special for the model: llava-hf/llava-interleave-qwen-7b-hf + hidden_units = model_arg.get('hidden_size', 4096) + vocab_size = model_arg.get('vocab_size', 152000) + intermediate_size = model_arg.get('intermediate_size', 11008) + attn_bias = 1 if model_arg['architectures'][0] \ + == 'Qwen2ForCausalLM' else 0 + attn_bias = int(model_arg.get('attn_bias', attn_bias)) + use_logn_attn = int(model_arg.get('use_logn_attn', 0)) + + if isinstance(rope_scaling, dict): + scaling_type = model_arg['rope_scaling'].get('type', '') + scaling_factor = model_arg['rope_scaling'].get('factor', '') + if scaling_type == 'dynamic': + use_dynamic_ntk = 1 + + return dict(num_layer=num_layer, + norm_eps=norm_eps, + head_num=attn_head_num, + hidden_units=hidden_units, + kv_head_num=kv_head_num, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + use_dynamic_ntk=use_dynamic_ntk, + rope_scaling_factor=scaling_factor, + inter_size=intermediate_size, + use_logn_attn=use_logn_attn, + attn_bias=attn_bias, + vocab_size=vocab_size) diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index 91b057d723..34e769776f 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -54,10 +54,14 @@ def main(head_num: int = 32, from transformers import AutoConfig config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + + for key in ['language_config', 'llm_config', 'text_config']: + config = getattr(config, key, config) head_num = config.num_attention_heads size_per_head = config.hidden_size // head_num inter_size = config.intermediate_size vocab_size = config.vocab_size + for bsz in range(1, max_batch_size + 1): 
subprocess.call( f'{get_llama_gemm()} {bsz} 1 1 {head_num} {size_per_head}' diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 979ed0c547..fe0819d70f 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -25,6 +25,7 @@ # llava LlavaLlamaForCausalLM='llama', LlavaMistralForCausalLM='llama', + LlavaForConditionalGeneration='llava', # xcomposer2 InternLMXComposer2ForCausalLM='xcomposer2', # internvl @@ -95,5 +96,10 @@ def _is_head_dim_supported(cfg): support_by_turbomind = False elif arch == 'InternVLChatModel': support_by_turbomind = _is_head_dim_supported(cfg.llm_config) + elif arch == 'LlavaForConditionalGeneration': + sub_arch = cfg.text_config.architectures[0] + if sub_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: + support_by_turbomind = _is_head_dim_supported( + cfg.text_config) return support_by_turbomind From 06aea5da8c122e9a0c5d29ea2898b44338cdf31d Mon Sep 17 00:00:00 2001 From: Zhaokai Wang <53330871+wzk1015@users.noreply.github.com> Date: Mon, 11 Nov 2024 11:09:44 +0800 Subject: [PATCH 066/122] Support Mono-InternVL with PyTorch backend (#2727) * support Mono-InternVL; fix typos * update readme * add assertion for FP16 * add assertion for FP16 * update _SUPPORTED_ARCHS --- .github/CONTRIBUTING.md | 14 +- README.md | 2 + README_zh-CN.md | 2 + docs/en/multi_modal/internvl.md | 13 +- docs/en/multi_modal/vl_pipeline.md | 1 + docs/en/supported_models/supported_models.md | 5 + docs/zh_cn/multi_modal/internvl.md | 17 +- docs/zh_cn/multi_modal/vl_pipeline.md | 1 + .../supported_models/supported_models.md | 5 + lmdeploy/model.py | 3 +- lmdeploy/pytorch/models/baichuan.py | 2 +- lmdeploy/pytorch/models/chatglm2.py | 2 +- lmdeploy/pytorch/models/cogvlm.py | 2 +- lmdeploy/pytorch/models/dbrx.py | 2 +- lmdeploy/pytorch/models/deepseek.py | 2 +- lmdeploy/pytorch/models/falcon.py | 2 +- lmdeploy/pytorch/models/gemma.py | 2 +- lmdeploy/pytorch/models/internlm.py | 2 +- lmdeploy/pytorch/models/internlm2.py | 2 +- lmdeploy/pytorch/models/internlm2_ve.py | 338 ++++++++++++++++++ lmdeploy/pytorch/models/internvl.py | 63 +++- lmdeploy/pytorch/models/llama.py | 2 +- lmdeploy/pytorch/models/minicpm3.py | 2 +- lmdeploy/pytorch/models/mistral.py | 2 +- lmdeploy/pytorch/models/mllama.py | 4 +- lmdeploy/pytorch/models/module_map.py | 6 + lmdeploy/pytorch/models/phi3.py | 2 +- lmdeploy/pytorch/models/qwen.py | 2 +- lmdeploy/pytorch/models/qwen2.py | 2 +- lmdeploy/pytorch/models/qwen2_moe.py | 2 +- lmdeploy/pytorch/models/qwen2_vl.py | 2 +- lmdeploy/pytorch/models/starcoder2.py | 2 +- lmdeploy/pytorch/supported_models.py | 2 + 33 files changed, 458 insertions(+), 54 deletions(-) create mode 100644 lmdeploy/pytorch/models/internlm2_ve.py diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 19668fe9e4..20bd3a5f48 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,6 +1,6 @@ -## Contributing to InternLM +## Contributing to LMDeploy -Welcome to the InternLM community, all kinds of contributions are welcomed, including but not limited to +Welcome to the LMDeploy community, all kinds of contributions are welcomed, including but not limited to **Fix bug** @@ -56,7 +56,7 @@ upstream git@github.com:InternLM/lmdeploy.git (push) #### 2. Configure pre-commit -You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of InternLM. **Note**: The following code should be executed under the lmdeploy directory. 
+You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of LMDeploy. **Note**: The following code should be executed under the lmdeploy directory. ```shell pip install -U pre-commit @@ -96,7 +96,7 @@ git checkout -b yhc/refactor_contributing_doc In subsequent development, if the master branch of the local repository is behind the master branch of "upstream", we need to pull the upstream for synchronization, and then execute the above command: ```shell -git pull upstream master +git pull upstream main ``` #### 4. Commit the code and pass the unit test @@ -151,7 +151,7 @@ Find more details about Pull Request description in [pull request guidelines](#p -IternLM will run unit test for the posted Pull Request on different platforms (Linux, Window, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code. +LMDeploy will run unit test for the posted Pull Request on different platforms (Linux, Window, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code. (3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP. @@ -163,14 +163,14 @@ If your local branch conflicts with the latest master branch of "upstream", you' ```shell git fetch --all --prune -git rebase upstream/master +git rebase upstream/main ``` or ```shell git fetch --all --prune -git merge upstream/master +git merge upstream/main ``` If you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts. diff --git a/README.md b/README.md index 6ca5fadedd..efbb87a22e 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ ______________________________________________________________________
    2024 +- \[2024/11\] Support Mono-InternVL with PyTorch engine - \[2024/10\] PyTorchEngine supports graph mode on ascend platform, doubling the inference speed - \[2024/09\] LMDeploy PyTorchEngine adds support for [Huawei Ascend](./docs/en/get_started/ascend/get_started.md). See supported models [here](docs/en/supported_models/supported_models.md) - \[2024/09\] LMDeploy PyTorchEngine achieves 1.3x faster on Llama3-8B inference by introducing CUDA graph @@ -155,6 +156,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
  • DeepSeek-VL (7B)
  • InternVL-Chat (v1.1-v1.5)
  • InternVL2 (1B-76B)
  • +Mono-InternVL (2B)
  • MiniGeminiLlama (7B)
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
diff --git a/README_zh-CN.md b/README_zh-CN.md index 663b7b24ab..477fed6f79 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -26,6 +26,7 @@ ______________________________________________________________________
    2024 +- \[2024/11\] PyTorch engine 支持 Mono-InternVL 模型 - \[2024/10\] PyTorchEngine 在 ascend 平台上支持了图模式,推理性能提高了 1 倍 - \[2024/09\] LMDeploy PyTorchEngine 增加了对 [华为 Ascend](docs/zh_cn/get_started/ascend/get_started.md) 的支持。支持的模型请见[这里](docs/zh_cn/supported_models/supported_models.md) - \[2024/09\] 通过引入 CUDA Graph,LMDeploy PyTorchEngine 在 Llama3-8B 推理上实现了 1.3 倍的加速 @@ -156,6 +157,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • DeepSeek-VL (7B)
  • InternVL-Chat (v1.1-v1.5)
  • InternVL2 (1B-76B)
  • +Mono-InternVL (2B)
  • MiniGeminiLlama (7B)
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
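The Mono-InternVL entry added to both README lists above is served by the PyTorch engine only, and a note later in this patch rejects FP16 for this architecture. A rough sketch of driving it with BF16, assuming the installed `PytorchEngineConfig` exposes a `dtype` option:

```python
from lmdeploy import pipeline, PytorchEngineConfig
from lmdeploy.vl import load_image

# Mono-InternVL runs on the PyTorch backend; BF16 is requested explicitly
# because the patch asserts against FP16 due to numerical instability.
pipe = pipeline('OpenGVLab/Mono-InternVL-2B',
                backend_config=PytorchEngineConfig(dtype='bfloat16'))

image = load_image('https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg')
print(pipe(('Describe the image.', image)))
```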
  • diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md index 24c79357c0..bd33649139 100644 --- a/docs/en/multi_modal/internvl.md +++ b/docs/en/multi_modal/internvl.md @@ -2,12 +2,13 @@ LMDeploy supports the following InternVL series of models, which are detailed in the table below: -| Model | Size | Supported Inference Engine | -| :---------: | :--------: | :------------------------: | -| InternVL | 13B-19B | TurboMind | -| InternVL1.5 | 2B-26B | TurboMind, PyTorch | -| InternVL2 | 1B, 4B | PyTorch | -| InternVL2 | 2B, 8B-76B | TurboMind, PyTorch | +| Model | Size | Supported Inference Engine | +| :-----------: | :--------: | :------------------------: | +| InternVL | 13B-19B | TurboMind | +| InternVL1.5 | 2B-26B | TurboMind, PyTorch | +| InternVL2 | 1B, 4B | PyTorch | +| InternVL2 | 2B, 8B-76B | TurboMind, PyTorch | +| Mono-InternVL | 2B | PyTorch | The next chapter demonstrates how to deploy an InternVL model using LMDeploy, with [InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B) as an example. diff --git a/docs/en/multi_modal/vl_pipeline.md b/docs/en/multi_modal/vl_pipeline.md index 72eb0b4595..4881b99071 100644 --- a/docs/en/multi_modal/vl_pipeline.md +++ b/docs/en/multi_modal/vl_pipeline.md @@ -9,6 +9,7 @@ Currently, it supports the following models. - [Yi-VL](https://huggingface.co/01-ai/Yi-VL-6B) - [DeepSeek-VL](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat) - [InternVL](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) +- [Mono-InternVL](https://huggingface.co/OpenGVLab/Mono-InternVL-2B) - [MGM](https://huggingface.co/YanweiLi/MGM-7B) - [XComposer](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) - [CogVLM](https://github.com/InternLM/lmdeploy/tree/main/docs/en/multi_modal/cogvlm.md) diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 1f344e78bb..371e4968e0 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -80,6 +80,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | No | - | | InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | +| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | | Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | @@ -88,6 +89,10 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | +```{note} +* Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. 
+``` + ## PyTorchEngine on Huawei Ascend Platform | Model | Size | Type | FP16/BF16 | W4A16 | diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md index 3d948353a5..e5dae1a89c 100644 --- a/docs/zh_cn/multi_modal/internvl.md +++ b/docs/zh_cn/multi_modal/internvl.md @@ -2,14 +2,15 @@ LMDeploy 支持 InternVL 系列模型,具体如下: -| Model | Size | Supported Inference Engine | -| :---------: | :--------: | :------------------------: | -| InternVL | 13B-19B | TurboMind | -| InternVL1.5 | 2B-26B | TurboMind, PyTorch | -| InternVL2 | 1B, 4B | PyTorch | -| InternVL2 | 2B, 8B-76B | TurboMind, PyTorch | - -本文将以[InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)为例,演示使用 LMDeploy 部署 InternVL 系列模型的方法 +| Model | Size | Supported Inference Engine | +| :-----------: | :--------: | :------------------------: | +| InternVL | 13B-19B | TurboMind | +| InternVL1.5 | 2B-26B | TurboMind, PyTorch | +| InternVL2 | 1B, 4B | PyTorch | +| InternVL2 | 2B, 8B-76B | TurboMind, PyTorch | +| Mono-InternVL | 2B | PyTorch | + +本文将以[InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)为例,演示使用 LMDeploy 部署 InternVL 系列模型的方法。 ## 安装 diff --git a/docs/zh_cn/multi_modal/vl_pipeline.md b/docs/zh_cn/multi_modal/vl_pipeline.md index 31533b38f7..570598311a 100644 --- a/docs/zh_cn/multi_modal/vl_pipeline.md +++ b/docs/zh_cn/multi_modal/vl_pipeline.md @@ -9,6 +9,7 @@ LMDeploy 把视觉-语言模型(VLM)复杂的推理过程,抽象为简单 - [Yi-VL](https://huggingface.co/01-ai/Yi-VL-6B) - [DeepSeek-VL](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat) - [InternVL](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) +- [Mono-InternVL](https://huggingface.co/OpenGVLab/Mono-InternVL-2B) - [MGM](https://huggingface.co/YanweiLi/MGM-7B) - [XComposer](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) - [CogVLM](https://github.com/InternLM/lmdeploy/tree/main/docs/zh_cn/multi_modal/cogvlm.md) diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index ac061cf1ae..7d59a59899 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -80,6 +80,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | No | - | | InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | +| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | | Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | @@ -88,6 +89,10 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | +```{note} +* Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. +``` + ## PyTorchEngine 华为昇腾平台 | Model | Size | Type | FP16/BF16 | W4A16 | diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 98f8e373ba..2b3a0a4e1d 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -578,7 +578,8 @@ def match(cls, model_path: str) -> Optional[str]: model_path (str): the model path used for matching. 
""" path = model_path.lower() - if 'internvl2' in path and 'internvl2-4b' not in path: + if ('internvl2' in path + and 'internvl2-4b' not in path) or 'mono-internvl' in path: return 'internvl2-internlm2' diff --git a/lmdeploy/pytorch/models/baichuan.py b/lmdeploy/pytorch/models/baichuan.py index 6bd18d9e58..583cd19fe9 100644 --- a/lmdeploy/pytorch/models/baichuan.py +++ b/lmdeploy/pytorch/models/baichuan.py @@ -167,7 +167,7 @@ def __init__(self, # build attention layer self.self_attn = BaichuanAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = MLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py index efb44b2431..8d7a21a0a6 100644 --- a/lmdeploy/pytorch/models/chatglm2.py +++ b/lmdeploy/pytorch/models/chatglm2.py @@ -279,7 +279,7 @@ def __init__(self, # build attention layer self.self_attention = SelfAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = MLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py index b53c74d95a..6caf10df00 100644 --- a/lmdeploy/pytorch/models/cogvlm.py +++ b/lmdeploy/pytorch/models/cogvlm.py @@ -263,7 +263,7 @@ def __init__(self, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = VisionExpertMLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/dbrx.py b/lmdeploy/pytorch/models/dbrx.py index dd1191625b..e71ff17fe9 100644 --- a/lmdeploy/pytorch/models/dbrx.py +++ b/lmdeploy/pytorch/models/dbrx.py @@ -301,7 +301,7 @@ def __init__(self, dtype=dtype, device=device) - # builf MLP + # build MLP self.ffn = DbrxFFN(config, dtype=dtype, device=device) def forward( diff --git a/lmdeploy/pytorch/models/deepseek.py b/lmdeploy/pytorch/models/deepseek.py index f4e80fb048..5742baeee5 100644 --- a/lmdeploy/pytorch/models/deepseek.py +++ b/lmdeploy/pytorch/models/deepseek.py @@ -250,7 +250,7 @@ def __init__(self, # build attention layer self.self_attn = DeepseekAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = (DeepseekMoE(config, dtype=dtype, device=device) if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace diff --git a/lmdeploy/pytorch/models/falcon.py b/lmdeploy/pytorch/models/falcon.py index e767d29849..8f8659dc5e 100644 --- a/lmdeploy/pytorch/models/falcon.py +++ b/lmdeploy/pytorch/models/falcon.py @@ -179,7 +179,7 @@ def __init__(self, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = FalconMLP(config, dtype=dtype, device=device) if not hasattr(config, 'num_ln_in_parallel_attn'): diff --git a/lmdeploy/pytorch/models/gemma.py b/lmdeploy/pytorch/models/gemma.py index 2d9f85f2ca..450767bda3 100644 --- a/lmdeploy/pytorch/models/gemma.py +++ b/lmdeploy/pytorch/models/gemma.py @@ -177,7 +177,7 @@ def __init__(self, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = GemmaMLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/internlm.py b/lmdeploy/pytorch/models/internlm.py index f8869543be..99c622e4ac 100644 --- a/lmdeploy/pytorch/models/internlm.py +++ b/lmdeploy/pytorch/models/internlm.py @@ -161,7 +161,7 @@ def __init__(self, # build attention layer self.self_attn = InternLMAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = InternLMMLP(config, dtype=dtype, device=device) # build 
input layer norm diff --git a/lmdeploy/pytorch/models/internlm2.py b/lmdeploy/pytorch/models/internlm2.py index a87c848e65..6cbc2ccff3 100644 --- a/lmdeploy/pytorch/models/internlm2.py +++ b/lmdeploy/pytorch/models/internlm2.py @@ -160,7 +160,7 @@ def __init__(self, # build attention layer self.attention = InternLM2Attention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.feed_forward = InternLM2MLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/internlm2_ve.py b/lmdeploy/pytorch/models/internlm2_ve.py new file mode 100644 index 0000000000..b1a2329597 --- /dev/null +++ b/lmdeploy/pytorch/models/internlm2_ve.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers.configuration_utils import PretrainedConfig + +from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.models.internlm2 import InternLM2Attention, InternLM2MLP +from lmdeploy.pytorch.nn import RMSNorm, RopeType, build_rotary_embedding +from lmdeploy.pytorch.nn.linear import build_rowwise_linear +from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight + +from .utils.cudagraph import CudaGraphMixin + + +class InternLM2VEDecoderLayer(nn.Module): + """decoder layer with visual expert.""" + + def __init__(self, + config: PretrainedConfig, + layer_idx: int, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + quantization_config = getattr(config, 'quantization_config', None) + + # build attention layer + self.attention = InternLM2Attention(config, dtype=dtype, device=device) + + # build MLP + self.feed_forward = InternLM2MLP(config, dtype=dtype, device=device) + + # build visual expert + self.feed_forward_ve = InternLM2MLP(config, dtype=dtype, device=device) + + # build input layer norm + self.attention_norm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + # build attention layer norm + self.ffn_norm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + def forward( + self, + hidden_states: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: Optional[List[torch.FloatTensor]], + residual: Optional[torch.Tensor] = None, + attn_metadata: Any = None, + vision_embedding_indexing: Optional[torch.Tensor] = None, + text_embedding_indexing: Optional[torch.Tensor] = None, + ): + + if residual is None: + residual = hidden_states + hidden_states = self.attention_norm(hidden_states) + else: + hidden_states, residual = self.attention_norm( + hidden_states, residual) + + # Self Attention + hidden_states = self.attention( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + past_key_value=past_key_value, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ffn_norm(hidden_states, residual) + if vision_embedding_indexing is not None: + hidden_states[:, + vision_embedding_indexing, :] = self.feed_forward_ve( + hidden_states[:, vision_embedding_indexing, :]. + reshape(-1, self.hidden_size)).unsqueeze(0) + if text_embedding_indexing is not None: + hidden_states[:, + text_embedding_indexing, :] = self.feed_forward( + hidden_states[:, text_embedding_indexing, :]. 
+ reshape(-1, self.hidden_size)).unsqueeze(0) + else: + hidden_states = self.feed_forward(hidden_states) + + outputs = (hidden_states, residual) + return outputs + + +class InternLM2VEModel(nn.Module): + """internlm2 model with visual expert.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + quantization_config = getattr(config, 'quantization_config', None) + + self.tok_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size, + self.padding_idx, + dtype=dtype, + device=device) + + # build all decode layers + self.layers = nn.ModuleList([ + InternLM2VEDecoderLayer(config, + layer_idx, + dtype=dtype, + device=device) + for layer_idx in range(config.num_hidden_layers) + ]) + + # build norm + self.norm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + # build rotary embedding in Model + rope_scaling = config.rope_scaling + scaling_factor = 1.0 + emb_type = RopeType.LinearScaling + if rope_scaling is not None: + scaling_factor = rope_scaling.get('factor', scaling_factor) + rope_type = rope_scaling['type'] + if rope_type == 'linear': + emb_type = RopeType.LinearScaling + if rope_type == 'dynamic': + emb_type = RopeType.DynamicNTKScaling + else: + raise RuntimeError(f'Unsupported rope type: {rope_type}') + rope_dim = config.hidden_size // config.num_attention_heads + rope_max_pos_emb = config.max_position_embeddings + rope_base = config.rope_theta + self.rotary_emb = build_rotary_embedding( + rope_dim, + rope_max_pos_emb, + rope_base, + scaling_factor, + emb_type=emb_type, + ) + + def forward( + self, + input_ids: torch.LongTensor = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + attn_metadata: Any = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_embedding_indexing: Optional[torch.Tensor] = None, + text_embedding_indexing: Optional[torch.Tensor] = None, + ): + """Rewrite of forward.""" + + # token embedding + if inputs_embeds is None: + inputs_embeds = self.tok_embeddings(input_ids) + + hidden_states = inputs_embeds + + # rotary embedding + cos, sin = self.rotary_emb(hidden_states, position_ids) + cos, sin = cos[0], sin[0] + rotary_pos_emb = (cos, sin) + + # decoding + residual = None + for idx, decoder_layer in enumerate(self.layers): + past_key_value = past_key_values[idx] + hidden_states, residual = decoder_layer( + hidden_states, + rotary_pos_emb=rotary_pos_emb, + past_key_value=past_key_value, + residual=residual, + attn_metadata=attn_metadata, + vision_embedding_indexing=vision_embedding_indexing, + text_embedding_indexing=text_embedding_indexing, + ) + + # norm + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + def get_input_embeddings(self): + """get input embeddings.""" + return self.tok_embeddings + + +class InternLM2VEForCausalLM(nn.Module, CudaGraphMixin): + """rewrote model of InternLM2ForCausalLM with visual expert.""" + + packed_modules_mapping = { + 'gate_up_proj': [ + 'w1', + 'w3', + ], + } + + def __init__(self, + config: PretrainedConfig, + ctx_mgr: StepContextManager, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.ctx_mgr = ctx_mgr + # build Model + self.model = InternLM2VEModel(config, dtype=dtype, device=device) + # build lm_head + 
self.output = build_rowwise_linear(config.hidden_size, + config.vocab_size, + bias=False, + dtype=dtype, + device=device) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + past_key_values: List[List[torch.Tensor]], + attn_metadata: Any = None, + inputs_embeds: torch.Tensor = None, + vision_embedding_indexing: Optional[torch.Tensor] = None, + text_embedding_indexing: Optional[torch.Tensor] = None, + **kwargs, + ): + """model forward, return logits.""" + hidden_states = self.model( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + vision_embedding_indexing=vision_embedding_indexing, + text_embedding_indexing=text_embedding_indexing, + ) + return hidden_states + + def get_logits(self, hidden_states: torch.Tensor): + """compute logits of the model output.""" + return self.output(hidden_states) + + def support_cuda_graph( + self, + input_ids: torch.Tensor, + attn_metadata: Any = None, + **kwargs, + ): + """support cudagraph.""" + if not attn_metadata.is_decoding: + return False + seq_lens = input_ids.size(1) + if seq_lens <= 512: + return True + return False + + def get_input_embeddings(self): + """get input embeddings.""" + return self.model.get_input_embeddings() + + def prepare_inputs_for_generation( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: StepContext = None, + ): + """prepare input.""" + # get input_ids, position_ids and attention metadatas + input_ids = context.input_ids + position_ids = context.position_ids + attn_metadata = context.attn_metadata + + # process vision embeddings + vision_embeddings = context.input_embeddings + vision_embedding_indexing = context.input_embedding_indexing + if vision_embeddings is not None and len(vision_embeddings) > 0: + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds[:, + vision_embedding_indexing, :] = vision_embeddings.to( + inputs_embeds) + + # inputs of forward + return dict( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + ) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + """load weights.""" + # modify from vllm + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ('.gate_up_proj', '.w1', 0), + ('.gate_up_proj', '.w3', 1), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if 'rotary_emb.inv_freq' in name: + continue + if ('rotary_emb.cos_cached' in name + or 'rotary_emb.sin_cached' in name): + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + load_weight(param, loaded_weight, shard_id=shard_id) + break + else: + if '.wqkv' in name: + param = params_dict[name] + q, k, v = param.weight_spliter(loaded_weight, layout='hgd') + load_weight(param, q, shard_id='q') + load_weight(param, k, shard_id='k') + load_weight(param, v, shard_id='v') + else: + param = params_dict[name] + load_weight(param, loaded_weight) diff --git a/lmdeploy/pytorch/models/internvl.py b/lmdeploy/pytorch/models/internvl.py index 8981436113..70dd8f2159 100644 --- a/lmdeploy/pytorch/models/internvl.py +++ b/lmdeploy/pytorch/models/internvl.py @@ -26,6 +26,15 @@ def __init__(self, dtype=dtype, device=device) 
+ self.llm_arch_name = llm_config.architectures[0] + + # for Mono-InternVL + self.is_mono = self.llm_arch_name == 'InternLM2VEForCausalLM' + if self.is_mono: + assert dtype != torch.float16, ( + 'Currently Mono-InternVL does not support FP16 due to' + 'numerical instability. Please use BF16 instead.') + def forward( self, input_ids: torch.Tensor, @@ -33,13 +42,25 @@ def forward( past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, + vision_embedding_indexing: torch.Tensor = None, + text_embedding_indexing: torch.Tensor = None, **kwargs, ): - return self.language_model.forward(input_ids=input_ids, - inputs_embeds=inputs_embeds, - past_key_values=past_key_values, - position_ids=position_ids, - attn_metadata=attn_metadata) + if self.is_mono: + return self.language_model.forward( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + position_ids=position_ids, + attn_metadata=attn_metadata, + vision_embedding_indexing=vision_embedding_indexing, + text_embedding_indexing=text_embedding_indexing) + else: + return self.language_model.forward(input_ids=input_ids, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + position_ids=position_ids, + attn_metadata=attn_metadata) def get_logits(self, hidden_states: torch.Tensor): """compute logits of the model output.""" @@ -70,13 +91,31 @@ def prepare_inputs_for_generation( vision_embedding_indexing, :] = vision_embeddings.to( inputs_embeds) - return dict( - input_ids=input_ids, - position_ids=position_ids, - past_key_values=past_key_values, - attn_metadata=attn_metadata, - inputs_embeds=inputs_embeds, - ) + if self.is_mono and vision_embedding_indexing is not None: + all_indices = torch.arange(input_ids.shape[1]).to(input_ids) + text_embedding_indexing = all_indices[ + ~torch.isin(all_indices, vision_embedding_indexing)] + if vision_embedding_indexing.numel() == 0: + vision_embedding_indexing = None + if text_embedding_indexing.numel() == 0: + text_embedding_indexing = None + return dict( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + vision_embedding_indexing=vision_embedding_indexing, + text_embedding_indexing=text_embedding_indexing, + ) + else: + return dict( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + ) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): """load weights.""" diff --git a/lmdeploy/pytorch/models/llama.py b/lmdeploy/pytorch/models/llama.py index 525c8e3d34..f38c5ef02b 100644 --- a/lmdeploy/pytorch/models/llama.py +++ b/lmdeploy/pytorch/models/llama.py @@ -163,7 +163,7 @@ def __init__(self, # build attention layer self.self_attn = LlamaAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = LlamaMLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/minicpm3.py b/lmdeploy/pytorch/models/minicpm3.py index 56a1c4edf1..72a2b8a045 100644 --- a/lmdeploy/pytorch/models/minicpm3.py +++ b/lmdeploy/pytorch/models/minicpm3.py @@ -237,7 +237,7 @@ def __init__(self, # build attention layer self.self_attn = MiniCPMAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = MiniCPMMLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/mistral.py 
b/lmdeploy/pytorch/models/mistral.py index 4c369b716b..04af4c8526 100644 --- a/lmdeploy/pytorch/models/mistral.py +++ b/lmdeploy/pytorch/models/mistral.py @@ -162,7 +162,7 @@ def __init__(self, # build attention layer self.self_attn = MistralAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = MistralMLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/mllama.py b/lmdeploy/pytorch/models/mllama.py index a16abd8b91..2596fe5299 100644 --- a/lmdeploy/pytorch/models/mllama.py +++ b/lmdeploy/pytorch/models/mllama.py @@ -267,7 +267,7 @@ def __init__(self, # build attention layer self.self_attn = LlamaAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = LlamaMLP(config, dtype=dtype, device=device) # build input layer norm @@ -336,7 +336,7 @@ def __init__(self, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = LlamaMLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py index bc6385d8b2..e6b5f6e29e 100644 --- a/lmdeploy/pytorch/models/module_map.py +++ b/lmdeploy/pytorch/models/module_map.py @@ -149,6 +149,12 @@ f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internvl.InternVLChatModel' }) +# mono-internvl +MODULE_MAP.update({ + 'InternLM2VEForCausalLM': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm2_ve.InternLM2VEForCausalLM', +}) + # phi3 vision MODULE_MAP.update({ 'Phi3VForCausalLM': diff --git a/lmdeploy/pytorch/models/phi3.py b/lmdeploy/pytorch/models/phi3.py index a2859e3e3e..f9477fdab8 100644 --- a/lmdeploy/pytorch/models/phi3.py +++ b/lmdeploy/pytorch/models/phi3.py @@ -165,7 +165,7 @@ def __init__(self, # build attention layer self.self_attn = Phi3Attention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = Phi3MLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/qwen.py b/lmdeploy/pytorch/models/qwen.py index 50b9fd4ee8..bf856461a3 100644 --- a/lmdeploy/pytorch/models/qwen.py +++ b/lmdeploy/pytorch/models/qwen.py @@ -174,7 +174,7 @@ def __init__(self, # build attention layer self.attn = QWenAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = QWenMLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/qwen2.py b/lmdeploy/pytorch/models/qwen2.py index de6a7a58e1..82be75e167 100644 --- a/lmdeploy/pytorch/models/qwen2.py +++ b/lmdeploy/pytorch/models/qwen2.py @@ -163,7 +163,7 @@ def __init__(self, # build attention layer self.self_attn = Qwen2Attention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = Qwen2MLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/qwen2_moe.py b/lmdeploy/pytorch/models/qwen2_moe.py index fdaff8e0cc..1aff14483a 100644 --- a/lmdeploy/pytorch/models/qwen2_moe.py +++ b/lmdeploy/pytorch/models/qwen2_moe.py @@ -258,7 +258,7 @@ def __init__(self, # build attention layer self.self_attn = Qwen2MoeAttention(config, dtype=dtype, device=device) - # builf MLP + # build MLP if (layer_idx not in config.mlp_only_layers) and ( config.num_experts > 0) and ((layer_idx + 1) % config.decoder_sparse_step == 0): diff --git a/lmdeploy/pytorch/models/qwen2_vl.py b/lmdeploy/pytorch/models/qwen2_vl.py index 1a1dc1e1da..b10baaa4d5 100644 --- a/lmdeploy/pytorch/models/qwen2_vl.py +++ b/lmdeploy/pytorch/models/qwen2_vl.py @@ -192,7 +192,7 @@ def __init__(self, # 
build attention layer self.self_attn = Qwen2Attention(config, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = Qwen2MLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/models/starcoder2.py b/lmdeploy/pytorch/models/starcoder2.py index 7498df606f..4a6b175ca3 100644 --- a/lmdeploy/pytorch/models/starcoder2.py +++ b/lmdeploy/pytorch/models/starcoder2.py @@ -168,7 +168,7 @@ def __init__(self, dtype=dtype, device=device) - # builf MLP + # build MLP self.mlp = Starcoder2MLP(config, dtype=dtype, device=device) # build input layer norm diff --git a/lmdeploy/pytorch/supported_models.py b/lmdeploy/pytorch/supported_models.py index 3a5baf8fc6..21418188dd 100644 --- a/lmdeploy/pytorch/supported_models.py +++ b/lmdeploy/pytorch/supported_models.py @@ -62,6 +62,8 @@ DeepseekV2ForCausalLM=True, # internvl InternVLChatModel=True, + # mono-internvl + InternLM2VEForCausalLM=True, # gemma2 Gemma2ForCausalLM=True, # phi3.5-moe From 47b0d1abe674c3444e2cbfeb7e2490ca976983af Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:19:59 +0800 Subject: [PATCH 067/122] [ci] add more testcase into evaluation and daily test (#2721) * updaste * update * update * update * updaste * updaste * update * update numprompts * update * update * update --- .github/scripts/eval_base_config.py | 133 ++++++++++++++++-- .github/scripts/eval_chat_config.py | 107 +++++++++++++- .github/workflows/daily_ete_test.yml | 12 +- .github/workflows/daily_ete_test_v100.yml | 16 +-- .github/workflows/evaluate.yml | 8 +- autotest/config-v100.yaml | 8 +- autotest/config.yaml | 16 +-- .../interface/pipeline/test_pipeline_func.py | 14 +- .../pipeline/test_pipeline_longtext_func.py | 37 +++++ .../restful/test_restful_chat_func.py | 35 +++++ autotest/toolchain/test_lagent.py | 36 +++++ .../pipeline/test_pipeline_chat_pytorch.py | 109 ++++++++------ .../pipeline/test_pipeline_chat_turbomind.py | 79 ++++++----- .../test_restful_chat_hf_turbomind_vl.py | 2 +- autotest/utils/benchmark_utils.py | 2 +- autotest/utils/mp_log_utils.py | 2 +- autotest/utils/pipeline_chat.py | 3 +- autotest/utils/restful_return_check.py | 13 +- autotest/utils/run_restful_chat.py | 2 +- 19 files changed, 491 insertions(+), 143 deletions(-) create mode 100644 autotest/toolchain/test_lagent.py diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index dc31293603..8915decc7c 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -4,6 +4,8 @@ with read_base(): # choose a list of datasets + from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \ + ARC_c_datasets # noqa: F401, E501 from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \ bbh_datasets # noqa: F401, E501 from opencompass.configs.datasets.ceval.ceval_ppl import \ @@ -14,17 +16,43 @@ crowspairs_datasets # noqa: F401, E501 from opencompass.configs.datasets.drop.drop_gen_a2697c import \ drop_datasets # noqa: F401, E501 - from opencompass.configs.datasets.gpqa.gpqa_ppl_6bf57a import \ + # Corebench v1.7 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \ + GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \ gpqa_datasets # noqa: F401, E501 - from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \ + from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \ gsm8k_datasets # noqa: F401, E501 
- from opencompass.configs.datasets.hellaswag.hellaswag_ppl import \ + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \ hellaswag_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \ + humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \ + humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \ + mathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \ + sanitized_mbpp_datasets # noqa: F401, E501 from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \ mmlu_datasets # noqa: F401, E501 - from opencompass.configs.datasets.race.race_ppl_a138cd import \ + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \ + mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \ + nq_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_few_shot_ppl import \ race_datasets # noqa: F401, E501 - # read models + from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \ + BoolQ_datasets # noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \ + triviaqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \ + wikibench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ + winogrande_datasets # noqa: F401, E501 from opencompass.configs.models.baichuan.hf_baichuan_7b import \ models as hf_baichuan_7b # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma_7b import \ @@ -49,6 +77,8 @@ models as hf_mistral_7b_v0_1 # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mixtral_8x7b_v0_1 import \ models as hf_mixtral_8x7b_v0_1 # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \ + models as lmdeploy_qwen2_5_7b # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen1_5_7b import \ models as hf_qwen1_5_7b # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen2_7b import \ @@ -59,16 +89,95 @@ models as lmdeploy_qwen1_5_7b # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \ models as lmdeploy_qwen2_7b # noqa: F401, E501 - # and output the results in a chosen format - from opencompass.configs.summarizers.medium import \ - summarizer # noqa: F401, E501 + + # read models +race_datasets = [race_datasets[1]] +summarizer = dict( + dataset_abbrs=[ + ['race-high', 'accuracy'], + ['ARC-c', 'accuracy'], + ['BoolQ', 'accuracy'], + ['mmlu_pro', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['cmmlu', 'naive_average'], + ['mmlu', 'naive_average'], + ['drop', 'accuracy'], + ['bbh', 'naive_average'], + ['math', 'accuracy'], + ['openai_humaneval', 'humaneval_pass@1'], + ['openai_humaneval_v2', 'humaneval_pass@1'], + ['sanitized_mbpp', 'score'], + ['wikibench-wiki-single_choice_cncircular', 'perf_4'], + ['gsm8k', 'accuracy'], + ['GaokaoBench', 'weighted_average'], + ['triviaqa_wiki_1shot', 
'score'], + ['nq_open_1shot', 'score'], + ['winogrande', 'accuracy'], + ['hellaswag', 'accuracy'], + ['TheoremQA', 'score'], + '###### MathBench-A: Application Part ######', + 'college', + 'high', + 'middle', + 'primary', + 'arithmetic', + 'mathbench-a (average)', + '###### MathBench-T: Theory Part ######', + 'college_knowledge', + 'high_knowledge', + 'middle_knowledge', + 'primary_knowledge', + 'mathbench-t (average)', + '###### Overall: Average between MathBench-A and MathBench-T ######', + 'Overall', + '', + 'mmlu', + 'mmlu-stem', + 'mmlu-social-science', + 'mmlu-humanities', + 'mmlu-other', + 'cmmlu', + 'cmmlu-stem', + 'cmmlu-social-science', + 'cmmlu-humanities', + 'cmmlu-other', + 'cmmlu-china-specific', + 'mmlu_pro', + 'mmlu_pro_biology', + 'mmlu_pro_business', + 'mmlu_pro_chemistry', + 'mmlu_pro_computer_science', + 'mmlu_pro_economics', + 'mmlu_pro_engineering', + 'mmlu_pro_health', + 'mmlu_pro_history', + 'mmlu_pro_law', + 'mmlu_pro_math', + 'mmlu_pro_philosophy', + 'mmlu_pro_physics', + 'mmlu_pro_psychology', + 'mmlu_pro_other', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) turbomind_qwen1_5_7b = deepcopy(*lmdeploy_qwen1_5_7b) turbomind_qwen2_7b = deepcopy(*lmdeploy_qwen2_7b) +turbomind_qwen2_5_7b = deepcopy(*lmdeploy_qwen2_5_7b) +turbomind_qwen2_5_14b = deepcopy(*lmdeploy_qwen2_5_7b) +turbomind_qwen2_5_14b['path'] = 'Qwen/Qwen2.5-14B' turbomind_internlm2_5_7b = deepcopy(*lmdeploy_internlm2_5_7b) +turbomind_internlm2_5_7b_4bits = deepcopy(*lmdeploy_internlm2_5_7b) turbomind_internlm2_5_7b_batch1 = deepcopy(*lmdeploy_internlm2_5_7b) +turbomind_internlm2_5_7b_batch1_4bits = deepcopy(*lmdeploy_internlm2_5_7b) + +for model in [v for k, v in locals().items() if k.endswith('_4bits')]: + model['engine_config']['model_format'] = 'awq' + model['abbr'] = model['abbr'] + '_4bits' + model['path'] = model['path'] + '-inner-4bits' -turbomind_internlm2_5_7b_batch1[ - 'abbr'] = turbomind_internlm2_5_7b_batch1['abbr'] + '_batch1' -turbomind_internlm2_5_7b_batch1['engine_config']['max_batch_size'] = 1 -turbomind_internlm2_5_7b_batch1['batch_size'] = 1 +for model in [v for k, v in locals().items() if '_batch1' in k]: + model['abbr'] = model['abbr'] + '_batch1' + model['engine_config']['max_batch_size'] = 1 + model['batch_size'] = 1 diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py index 89ad20a533..a54b66bdc8 100644 --- a/.github/scripts/eval_chat_config.py +++ b/.github/scripts/eval_chat_config.py @@ -98,9 +98,6 @@ models as lmdeploy_qwen2_7b_instruct # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen_7b_chat import \ models as lmdeploy_qwen_7b_chat # noqa: F401, E501 - # and output the results in a chosen format - from opencompass.configs.summarizers.medium import \ - summarizer # noqa: F401, E501 llama2_meta_template = dict(round=[ dict(role='HUMAN', begin='[INST] ', end=' [/INST]'), @@ -129,6 +126,8 @@ turbomind_internlm2_5_7b_chat_kvint4 = deepcopy(*lmdeploy_internlm2_5_7b_chat) turbomind_internlm2_5_7b_chat_kvint8 = deepcopy(*lmdeploy_internlm2_5_7b_chat) turbomind_internlm2_5_7b_chat_batch1 = deepcopy(*lmdeploy_internlm2_5_7b_chat) +turbomind_internlm2_5_7b_chat_batch1_4bits = deepcopy( + *lmdeploy_internlm2_5_7b_chat) pytorch_internlm2_5_7b_chat = deepcopy(*lmdeploy_internlm2_5_7b_chat) # ===== Configs for internlm/internlm2_5_20b_chat ===== @@ -231,10 +230,10 @@ model['gen_config']['do_sample'] = False model['batch_size'] = 64 -turbomind_internlm2_5_7b_chat_batch1[ - 'abbr'] = 
turbomind_internlm2_5_7b_chat_batch1['abbr'] + '_batch1' -turbomind_internlm2_5_7b_chat_batch1['engine_config']['max_batch_size'] = 1 -turbomind_internlm2_5_7b_chat_batch1['batch_size'] = 1 +for model in [v for k, v in locals().items() if '_batch1' in k]: + model['abbr'] = model['abbr'] + '_batch1' + model['engine_config']['max_batch_size'] = 1 + model['batch_size'] = 1 basic_pytorch_chat_tp1 = dict(type=TurboMindModelwithChatTemplate, engine_config=dict(session_len=MAX_SESSION_LEN, @@ -256,3 +255,97 @@ pytorch_gemma_2_9b_it = deepcopy(basic_pytorch_chat_tp1) pytorch_gemma_2_9b_it['abbr'] = 'pytorch_gemma_2_9b_it' pytorch_gemma_2_9b_it['path'] = 'google/gemma-2-9b-it' + +race_datasets = [race_datasets[1]] + +# Summarizer +summarizer = dict( + dataset_abbrs=[ + ['race-high', 'accuracy'], + ['ARC-c', 'accuracy'], + ['BoolQ', 'accuracy'], + ['mmlu_pro', 'naive_average'], + ['drop', 'accuracy'], + ['bbh', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + ['math', 'accuracy'], + ['wikibench-wiki-single_choice_cncircular', 'perf_4'], + ['openai_humaneval', 'humaneval_pass@1'], + ['sanitized_mbpp', 'score'], + ['cmmlu', 'naive_average'], + ['mmlu', 'naive_average'], + ['teval', 'naive_average'], + ['SciCode', 'accuracy'], + ['SciCode', 'sub_accuracy'], + ['humanevalx', 'naive_average'], + ['ds1000', 'naive_average'], + ['IFEval', 'Prompt-level-strict-accuracy'], + ['gsm8k', 'accuracy'], + ['GaokaoBench', 'weighted_average'], + ['triviaqa_wiki_1shot', 'score'], + ['nq_open_1shot', 'score'], + ['hellaswag', 'accuracy'], + ['TheoremQA', 'score'], + '###### MathBench-A: Application Part ######', + 'college', + 'high', + 'middle', + 'primary', + 'arithmetic', + 'mathbench-a (average)', + '###### MathBench-T: Theory Part ######', + 'college_knowledge', + 'high_knowledge', + 'middle_knowledge', + 'primary_knowledge', + 'mathbench-t (average)', + '###### Overall: Average between MathBench-A and MathBench-T ######', + 'Overall', + '', + '' + 'mmlu', + 'mmlu-stem', + 'mmlu-social-science', + 'mmlu-humanities', + 'mmlu-other', + '', + 'cmmlu', + 'cmmlu-stem', + 'cmmlu-social-science', + 'cmmlu-humanities', + 'cmmlu-other', + 'cmmlu-china-specific', + '', + 'mmlu_pro', + 'mmlu_pro_biology', + 'mmlu_pro_business', + 'mmlu_pro_chemistry', + 'mmlu_pro_computer_science', + 'mmlu_pro_economics', + 'mmlu_pro_engineering', + 'mmlu_pro_health', + 'mmlu_pro_history', + 'mmlu_pro_law', + 'mmlu_pro_math', + 'mmlu_pro_philosophy', + 'mmlu_pro_physics', + 'mmlu_pro_psychology', + 'mmlu_pro_other', + '', + 'humanevalx-python', + 'humanevalx-cpp', + 'humanevalx-go', + 'humanevalx-java', + 'humanevalx-js', + '', + 'ds1000_Pandas', + 'ds1000_Numpy', + 'ds1000_Tensorflow', + 'ds1000_Scipy', + 'ds1000_Sklearn', + 'ds1000_Pytorch', + 'ds1000_Matplotlib', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index f03bbf4a50..ab01d692c0 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -17,12 +17,12 @@ on: required: true description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' type: string - default: "['turbomind', 'pytorch', 'turbomind_vl']" + default: '["turbomind", "pytorch", "turbomind_vl"]' model: required: true description: 'Set testcase module filter: chat, restful, pipeline, quantization. 
Default contains all models' type: string - default: "['pipeline','restful','chat']" + default: '["pipeline", "restful", "chat"]' offline_mode: required: true description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' @@ -206,8 +206,8 @@ jobs: strategy: fail-fast: false matrix: - backend: ${{ fromJSON(inputs.backend) || fromJSON('["turbomind", "pytorch", "turbomind_vl"]')}} - model: ${{ fromJSON(inputs.model) || fromJSON('["pipeline","restful","chat"]')}} + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch", "turbomind_vl"]')}} + model: ${{ fromJSON(inputs.model || '["pipeline", "restful", "chat"]')}} exclude: - backend: turbomind_vl model: chat @@ -589,13 +589,13 @@ jobs: run: | export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama2_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - name: Evaluate base models if: matrix.evaluate_type == 'base' run: | export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true - name: Clear workspace if: always() run: | diff --git a/.github/workflows/daily_ete_test_v100.yml b/.github/workflows/daily_ete_test_v100.yml index 8b32bab1f7..0112e9aaab 100644 --- a/.github/workflows/daily_ete_test_v100.yml +++ b/.github/workflows/daily_ete_test_v100.yml @@ -17,12 +17,12 @@ on: required: true description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' type: string - default: "['turbomind', 'pytorch', 'turbomind_vl']" + default: '["turbomind", "pytorch", "turbomind_vl"]' model: required: true description: 'Set testcase module filter: chat, restful, pipeline, quantization. 
Default contains all models' type: string - default: "['pipeline','restful','chat']" + default: '["pipeline", "restful", "chat"]' offline_mode: required: true description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' @@ -34,7 +34,7 @@ on: type: string default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" schedule: - - cron: '00 16 * * 0-4' + - cron: '00 17 * * 0-4' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -194,15 +194,15 @@ jobs: chmod -R 777 $workdir test_tools: - needs: test_quantization if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} runs-on: linux-v100 + needs: test_quantization timeout-minutes: 240 strategy: fail-fast: false matrix: - backend: ${{ fromJSON(inputs.backend) || fromJSON('["turbomind", "pytorch", "turbomind_vl"]')}} - model: ${{ fromJSON(inputs.model) || fromJSON('["pipeline","restful","chat"]')}} + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch", "turbomind_vl"]')}} + model: ${{ fromJSON(inputs.model || '["pipeline", "restful", "chat"]')}} exclude: - backend: turbomind_vl model: chat @@ -582,13 +582,13 @@ jobs: run: | export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama2_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets, *ifeval_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true - name: Evaluate base models if: matrix.evaluate_type == 'base' run: | export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_5_14b, turbomind_internlm2_5_7b_batch1]" "[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]" /root/evaluation-reports/${{ github.run_id }} base true - name: Clear workspace if: always() run: | diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 6b91cd2746..b6ab89f595 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -17,22 +17,22 @@ on: required: true description: 'Tested TurboMind models list. 
eg. [internlm_chat_7b,internlm_chat_7b_w8a16]' type: string - default: '[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama2_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, turbomind_internlm2_chat_7b_4bits, turbomind_internlm2_chat_7b_kvint4, turbomind_internlm2_chat_7b_kvint8, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, turbomind_internlm2_5_7b_chat_kvint8, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, turbomind_internlm2_5_20b_chat_kvint8, turbomind_qwen1_5_7b_chat_4bits, turbomind_qwen1_5_7b_chat_kvint4, turbomind_qwen1_5_7b_chat_kvint8, turbomind_llama2_7b_chat_4bits, turbomind_llama2_7b_chat_kvint4, turbomind_llama2_7b_chat_kvint8, turbomind_llama3_8b_instruct_4bits, turbomind_llama3_8b_instruct_kvint4, turbomind_llama3_8b_instruct_kvint8, turbomind_llama3_1_8b_instruct_4bits, turbomind_llama3_1_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_kvint8, turbomind_qwen2_7b_instruct_4bits, turbomind_qwen2_7b_instruct_kvint8]' + default: '[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama2_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, turbomind_internlm2_chat_7b_4bits, turbomind_internlm2_chat_7b_kvint4, turbomind_internlm2_chat_7b_kvint8, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, turbomind_internlm2_5_7b_chat_kvint8, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, turbomind_internlm2_5_20b_chat_kvint8, turbomind_qwen1_5_7b_chat_4bits, turbomind_qwen1_5_7b_chat_kvint4, turbomind_qwen1_5_7b_chat_kvint8, turbomind_llama2_7b_chat_4bits, turbomind_llama2_7b_chat_kvint4, turbomind_llama2_7b_chat_kvint8, turbomind_llama3_8b_instruct_4bits, turbomind_llama3_8b_instruct_kvint4, turbomind_llama3_8b_instruct_kvint8, turbomind_llama3_1_8b_instruct_4bits, turbomind_llama3_1_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_kvint8, turbomind_qwen2_7b_instruct_4bits, turbomind_qwen2_7b_instruct_kvint8]' chat_datasets: required: true description: 'Tested datasets list. eg. [*bbh_datasets,*ceval_datasets,*cmmlu_datasets,*GaokaoBench_datasets,*gpqa_datasets,*gsm8k_datasets,*hellaswag_datasets,*humaneval_datasets,*ifeval_datasets,*math_datasets,*sanitized_mbpp_datasets,*mmlu_datasets,*nq_datasets,*race_datasets,*TheoremQA_datasets,*triviaqa_datasets,*winogrande_datasets,*crowspairs_datasets]' type: string - default: '[*mmlu_datasets, *gsm8k_datasets]' + default: '[*mmlu_datasets, *gsm8k_datasets, *ifeval_datasets]' base_models: required: true description: 'Tested TurboMind models list. eg. 
[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]' type: string - default: '[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]' + default: '[turbomind_internlm2_5_7b, turbomind_internlm2_5_7b_4bits, turbomind_internlm2_5_7b_batch1, turbomind_internlm2_5_7b_batch1_4bits, turbomind_qwen2_7b, turbomind_qwen2_5_7b, turbomind_qwen2_5_14b]' baes_datasets: required: true description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]' type: string - default: '[*mmlu_datasets, *gsm8k_datasets]' + default: '[*mmlu_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]' oc_repo_org: required: false description: 'Tested repository organization name. Default is open-compass/opencompass' diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml index 172667ec0c..de51e7e5e7 100644 --- a/autotest/config-v100.yaml +++ b/autotest/config-v100.yaml @@ -21,6 +21,7 @@ tp_config: MiniCPM-V-2_6: 2 turbomind_chat_model: + - meta-llama/Llama-3.2-1B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ - meta-llama/Meta-Llama-3-8B-Instruct @@ -28,10 +29,12 @@ turbomind_chat_model: - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - Qwen/Qwen2.5-0.5B-Instruct - Qwen/Qwen2-7B-Instruct-AWQ - Qwen/Qwen2-1.5B-Instruct - Qwen/Qwen2.5-7B-Instruct @@ -54,7 +57,7 @@ pytorch_chat_model: - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - google/gemma-2-9b-it - - mistralai/Mistral-7B-Instruct-v0.2 + - mistralai/Mistral-7B-Instruct-v0.3 - THUDM/glm-4v-9b - THUDM/glm-4-9b-chat - microsoft/Phi-3-mini-4k-instruct @@ -69,6 +72,7 @@ pytorch_base_model: - internlm/internlm2_5-20b vl_model: + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B @@ -77,7 +81,7 @@ vl_model: - Qwen/Qwen2-VL-7B-Instruct - internlm/internlm-xcomposer2d5-7b - THUDM/glm-4v-9b - - microsoft/Phi-3-mini-4k-instruct + - microsoft/Phi-3.5-vision-instruct turbomind_quatization: no_awq: diff --git a/autotest/config.yaml b/autotest/config.yaml index 46b9bd9ce1..9357e473bb 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -20,6 +20,7 @@ tp_config: MiniCPM-V-2_6: 2 turbomind_chat_model: + - meta-llama/Llama-3.2-1B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ - meta-llama/Meta-Llama-3-8B-Instruct @@ -31,6 +32,7 @@ turbomind_chat_model: - internlm/internlm-chat-20b - internlm/internlm-xcomposer2-4khd-7b - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B @@ -43,10 +45,9 @@ turbomind_chat_model: - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-4B-Chat-AWQ - Qwen/Qwen-VL-Chat + - Qwen/Qwen2.5-0.5B-Instruct - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 - - mistralai/Mistral-7B-Instruct-v0.1 - - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 @@ -77,6 +78,7 @@ pytorch_chat_model: - OpenGVLab/InternVL2-26B - OpenGVLab/InternVL2-40B - OpenGVLab/InternVL-Chat-V1-5 + - OpenGVLab/Mono-InternVL-2B - baichuan-inc/Baichuan2-7B-Chat - baichuan-inc/Baichuan2-13B-Chat - 01-ai/Yi-6B-Chat @@ -88,8 +90,7 @@ pytorch_chat_model: - Qwen/Qwen1.5-MoE-A2.7B-Chat - 
Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - mistralai/Mistral-7B-Instruct-v0.1 - - mistralai/Mistral-7B-Instruct-v0.2 + - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-7b-it - google/gemma-2-9b-it @@ -100,7 +101,6 @@ pytorch_chat_model: - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - THUDM/glm-4-9b-chat - - THUDM/cogvlm-chat-hf - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct @@ -125,11 +125,13 @@ vl_model: - deepseek-ai/deepseek-vl-1.3b-chat - OpenGVLab/InternVL-Chat-V1-5 - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B - OpenGVLab/InternVL2-40B + - OpenGVLab/Mono-InternVL-2B - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - internlm/internlm-xcomposer2d5-7b @@ -137,7 +139,7 @@ vl_model: - THUDM/cogvlm-chat-hf - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - - microsoft/Phi-3-mini-4k-instruct + - microsoft/Phi-3.5-vision-instruct - microsoft/Phi-3-vision-128k-instruct - openbmb/MiniCPM-Llama3-V-2_5 - openbmb/MiniCPM-V-2_6 @@ -146,8 +148,6 @@ turbomind_quatization: no_awq: - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - mistralai/Mistral-7B-Instruct-v0.1 - - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 - deepseek-ai/deepseek-coder-1.3b-instruct - codellama/CodeLlama-7b-Instruct-hf diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index bd33ed33a0..87a0719bcb 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -958,7 +958,9 @@ def test_backend_config_validate_turbomind(config, model, backend, worker_id): backend_config = backend(tp=0) pipeline(model_path, backend_config=backend_config) - with pytest.raises(pydantic.ValidationError): + with pytest.raises( + AssertionError, + match='max_batch_size should be greater than 0, but got 0'): backend_config = backend(max_batch_size=0) pipeline(model_path, backend_config=backend_config) @@ -995,23 +997,23 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): model_path = '/'.join([config.get('model_path'), model]) with pytest.raises(AssertionError): backend_config = backend(tp=0) - pipeline(model_path, backend_config=backend_config) + init_pipeline(model_path, backend_config=backend_config) with pytest.raises(AssertionError): backend_config = backend(max_batch_size=0) - pipeline(model_path, backend_config=backend_config) + init_pipeline(model_path, backend_config=backend_config) with pytest.raises(AssertionError): backend_config = backend(cache_max_entry_count=0) - pipeline(model_path, backend_config=backend_config) + init_pipeline(model_path, backend_config=backend_config) with pytest.raises(AssertionError): backend_config = backend(num_cpu_blocks=-1) - pipeline(model_path, backend_config=backend_config) + init_pipeline(model_path, backend_config=backend_config) with pytest.raises(AssertionError): backend_config = backend(num_gpu_blocks=-1) - pipeline(model_path, backend_config=backend_config) + init_pipeline(model_path, backend_config=backend_config) if 'gw' in worker_id: del os.environ['CUDA_VISIBLE_DEVICES'] diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py index 76625f5de6..ada5b890c2 100644 --- 
a/autotest/interface/pipeline/test_pipeline_longtext_func.py +++ b/autotest/interface/pipeline/test_pipeline_longtext_func.py @@ -1,3 +1,4 @@ +import json import os from multiprocessing import Process @@ -156,6 +157,9 @@ def passkey_retrival(config, else: backend_config = PytorchEngineConfig(session_len=session_len, tp=tp_num) + # add config according to https://huggingface.co/Qwen/Qwen2.5-7B-Instruct + if 'qwen' in model.lower(): + add_config_Qwen(model_path) pipe = pipeline(model_path, backend_config=backend_config) @@ -163,6 +167,11 @@ def passkey_retrival(config, # inference pass_key, prompt = get_passkey_prompt(pipe, session_len) response = pipe(prompt, gen_config=gen_config) + + # remove config, https://huggingface.co/Qwen/Qwen2.5-7B-Instruct + if 'qwen' in model.lower(): + remove_config_Qwen(model_path) + save_pipeline_common_log(config, log_name, str(pass_key) in response.text, str(response)) @@ -202,3 +211,31 @@ def get_passkey_prompt(pipe, session_len): # inference prompt = ' '.join(lines) return pass_key, prompt + + +def add_config_Qwen(model_path): + data = { + 'rope_scaling': { + 'factor': 4.0, + 'original_max_position_embeddings': 32768, + 'type': 'yarn' + } + } + + with open('/'.join([model_path, 'config.json']), 'r') as f: + config = json.load(f) + if 'rope_scaling' not in config: + config.update(data) + with open('/'.join([model_path, 'config.json']), 'w') as f: + json.dump(config, f, indent=4) + + +def remove_config_Qwen(model_path): + with open('/'.join([model_path, 'config.json']), 'r') as f: + config = json.load(f) + + if 'rope_scaling' in config: + del config['rope_scaling'] + + with open('/'.join([model_path, 'config.json']), 'w') as f: + json.dump(config, f, indent=4) diff --git a/autotest/interface/restful/test_restful_chat_func.py b/autotest/interface/restful/test_restful_chat_func.py index c2a41906f8..f77fa22433 100644 --- a/autotest/interface/restful/test_restful_chat_func.py +++ b/autotest/interface/restful/test_restful_chat_func.py @@ -1442,3 +1442,38 @@ def mul(a: int, b: int): assert func2_args == '{"a": 8, "b": 2}' assert func2_out == 16 assert response.choices[0].message.tool_calls[0].type == 'function' + + def test_search_prompt(self): + tools = [{ + 'type': 'function', + 'function': { + 'name': 'search', + 'description': 'BING search API', + 'parameters': { + 'type': 'object', + 'properties': { + 'query': { + 'type': 'string', + 'description': 'list of search query strings' + } + }, + 'required': ['location'] + } + } + }] + messages = [{'role': 'user', 'content': '搜索最近的人工智能发展趋势'}] + + client = OpenAI(api_key='YOUR_API_KEY', base_url=BASE_URL + '/v1') + model_name = client.models.list().data[0].id + response = client.chat.completions.create(model=model_name, + messages=messages, + temperature=0.01, + stream=False, + tools=tools) + print(response) + assert response.choices[0].finish_reason == 'tool_calls' + assert response.choices[0].message.tool_calls[ + 0].function.name == 'search' + assert '人工智能' in response.choices[0].message.tool_calls[ + 0].function.arguments + assert response.choices[0].message.tool_calls[0].type == 'function' diff --git a/autotest/toolchain/test_lagent.py b/autotest/toolchain/test_lagent.py new file mode 100644 index 0000000000..9004bc3f44 --- /dev/null +++ b/autotest/toolchain/test_lagent.py @@ -0,0 +1,36 @@ +import pytest + + +@pytest.mark.order(10) +@pytest.mark.lagent +@pytest.mark.flaky(reruns=2) +class TestLagent: + + @pytest.mark.parametrize('model', ['internlm/internlm2_5-7b-chat']) + def test_repeat(config, model): + from 
lagent.llms import INTERNLM2_META, LMDeployPipeline + + model = LMDeployPipeline( + path='/'.join([config.get('model_path'), model]), + meta_template=INTERNLM2_META, + tp=1, + top_k=40, + top_p=0.8, + temperature=1.2, + stop_words=['<|im_end|>'], + max_new_tokens=4096, + ) + response_list = [] + for i in range(3): + print(f'run_{i}:') + response = model.chat([{ + 'role': + 'user', + 'content': + '已知$$z_{1}=1$$,$$z_{2}=\\text{i}$$,$$z_{3}=-1$$,$$z_{4}=-\\text{i}$$,顺次连结它们所表示的点,则所得图形围成的面积为( )\nA. $$\\dfrac{1}{4}$$\n B. $$\\dfrac{1}{2}$$\n C. $$1$$\n D. $$2$$\n\n' # noqa: F401, E501 + }]) + print(response) + response_list.append(response) + assert len(response) > 10 and '$\\boxed' in response + assert response_list[0] != response_list[1] and response_list[ + 1] != response_list[2] diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py index 270d4b6831..a828e17a09 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py @@ -1,5 +1,5 @@ import os -from multiprocessing import Process +from multiprocessing import get_context import pytest from utils.config_utils import get_cuda_id_by_workerid, get_torch_model_list @@ -18,8 +18,10 @@ def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch', worker_id)) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch', worker_id)) p.start() p.join() @@ -42,8 +44,10 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, tp_num=2) os.environ['MASTER_PORT'] = str( int(worker_id.replace('gw', '')) + 29500) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch', worker_id)) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch', worker_id)) p.start() p.join() @@ -67,11 +71,12 @@ def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch-kvint', - worker_id, { - 'quant_policy': 4 - })) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch-kvint', worker_id, { + 'quant_policy': 4 + })) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, @@ -92,11 +97,14 @@ def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch-kvint', - worker_id, { - 'quant_policy': 4 - })) + os.environ['MASTER_PORT'] = str( + int(worker_id.replace('gw', '')) + 29500) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch-kvint', worker_id, { + 'quant_policy': 4 + })) p.start() p.join() assert_pipeline_chat_log(config, 
common_case_config, model, @@ -116,11 +124,12 @@ def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch-kvint', - worker_id, { - 'quant_policy': 8 - })) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch-kvint', worker_id, { + 'quant_policy': 8 + })) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, @@ -141,11 +150,14 @@ def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch-kvint', - worker_id, { - 'quant_policy': 8 - })) + os.environ['MASTER_PORT'] = str( + int(worker_id.replace('gw', '')) + 29500) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch-kvint', worker_id, { + 'quant_policy': 8 + })) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, @@ -160,8 +172,10 @@ def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, @pytest.mark.pr_test @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) def test_pipeline_chat_pytorch_pr(config, common_case_config, model): - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch')) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch')) p.start() p.join() @@ -180,9 +194,10 @@ def test_modelscope_pipeline_chat_pytorch_tp1(config, common_case_config, if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch', worker_id, - None, False)) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch', worker_id, None, False)) p.start() p.join() del os.environ['LMDEPLOY_USE_MODELSCOPE'] @@ -202,13 +217,15 @@ def test_pipeline_chat_pytorch_with_lora_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch_lora', - worker_id, { - 'adapters': { - 'adapter0': 'lora/Llama2-Chinese-7b-Chat-LoRA' - } - })) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch_lora', worker_id, { + 'adapters': { + 'adapter0': + 'lora/Llama2-Chinese-7b-Chat-LoRA' + } + })) p.start() p.join() @@ -230,14 +247,16 @@ def test_pipeline_chat_pytorch_with_lora_tp2(config, common_case_config, model, tp_num=2) os.environ['MASTER_PORT'] = str( int(worker_id.replace('gw', '')) + 29500) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'pytorch_lora', - worker_id, { - 'adapters': { - 'adapter0': 'lora/2024-01-25_self_dup', - 'adapter1': 'lora/2024-01-25_self' - } - })) + 
spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'pytorch_lora', worker_id, { + 'adapters': { + 'adapter0': + 'lora/2024-01-25_self_dup', + 'adapter1': 'lora/2024-01-25_self' + } + })) p.start() p.join() diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py index d67b5d27b3..17560e754d 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py @@ -1,5 +1,5 @@ import os -from multiprocessing import Process +from multiprocessing import get_context import pytest from utils.config_utils import get_all_model_list, get_cuda_id_by_workerid @@ -16,9 +16,10 @@ def test_pipeline_chat_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'turbomind', - worker_id)) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'turbomind', worker_id)) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, 'turbomind', @@ -35,9 +36,12 @@ def test_pipeline_chat_tp2(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'turbomind', - worker_id)) + os.environ['MASTER_PORT'] = str( + int(worker_id.replace('gw', '')) + 29500) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'turbomind', worker_id)) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, 'turbomind', @@ -56,11 +60,12 @@ def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'turbomind-kvint', - worker_id, { - 'quant_policy': 4 - })) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'turbomind-kvint', worker_id, { + 'quant_policy': 4 + })) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, @@ -78,11 +83,14 @@ def test_pipeline_chat_kvint4_tp2(config, common_case_config, model, if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'turbomind-kvint', - worker_id, { - 'quant_policy': 4 - })) + os.environ['MASTER_PORT'] = str( + int(worker_id.replace('gw', '')) + 29500) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'turbomind-kvint', worker_id, { + 'quant_policy': 4 + })) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, @@ -99,11 +107,12 @@ def test_pipeline_chat_kvint8_tp1(config, common_case_config, model, worker_id): if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = 
Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'turbomind-kvint', - worker_id, { - 'quant_policy': 8 - })) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'turbomind-kvint', worker_id, { + 'quant_policy': 8 + })) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, @@ -121,11 +130,14 @@ def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'turbomind-kvint', - worker_id, { - 'quant_policy': 8 - })) + os.environ['MASTER_PORT'] = str( + int(worker_id.replace('gw', '')) + 29500) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'turbomind-kvint', worker_id, { + 'quant_policy': 8 + })) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, @@ -143,8 +155,10 @@ def test_pipeline_chat_kvint8_tp2(config, common_case_config, model, 'internlm/internlm2_5-20b-chat-inner-4bits' ]) def test_pipeline_chat_pr(config, common_case_config, model): - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'turbomind')) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'turbomind')) p.start() p.join() assert_pipeline_chat_log(config, common_case_config, model, 'turbomind') @@ -161,9 +175,10 @@ def test_modelscope_pipeline_chat_tp1(config, common_case_config, model, if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True' - p = Process(target=run_pipeline_chat_test, - args=(config, common_case_config, model, 'turbomind', - worker_id, None, False)) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, + 'turbomind', worker_id, None, False)) p.start() p.join() del os.environ['LMDEPLOY_USE_MODELSCOPE'] diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py index 1248ec3d50..6e636d7ad4 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py @@ -28,7 +28,7 @@ def getModelList(tp_num): return [{ 'model': item, 'cuda_prefix': None, - 'tp_num': tp_num, + 'tp_num': tp_num } for item in get_vl_model_list(tp_num)] diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index c80dbe0dfc..4c2e0a2c90 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -178,7 +178,7 @@ def restful_test(config, if is_smoke: command += ' --num-prompts 200' else: - command += ' --num-prompts 2000' + command += ' --num-prompts 5000' for batch in [128, 256]: csv_path = f'{benchmark_path}/restful_batch_{batch}_1th.csv' diff --git a/autotest/utils/mp_log_utils.py b/autotest/utils/mp_log_utils.py index c3391c2568..2322e721bf 100644 --- a/autotest/utils/mp_log_utils.py +++ b/autotest/utils/mp_log_utils.py @@ -1,7 +1,7 @@ import os import allure -from pytest import assume +from pytest_assume.plugin import assume def write_log(config, diff --git 
a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 1ab34b23d5..e9988f0e39 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -4,7 +4,7 @@ import allure import torch -from pytest import assume +from pytest_assume.plugin import assume from utils.get_run_config import get_model_name, get_tp_num from utils.rule_condition_assert import assert_result @@ -297,7 +297,6 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): backend_config.quant_policy = quant_policy if not is_bf16_supported(): - backend_config.cache_max_entry_count = 0.5 backend_config.dtype = 'float16' pipe = pipeline(hf_path, backend_config=backend_config) diff --git a/autotest/utils/restful_return_check.py b/autotest/utils/restful_return_check.py index 9de308bf6e..a94cb2be4a 100644 --- a/autotest/utils/restful_return_check.py +++ b/autotest/utils/restful_return_check.py @@ -82,10 +82,9 @@ def assert_chat_completions_stream_return(output, if not is_last: assert message.get('finish_reason') is None if check_logprobs: - assert (len(message.get('logprobs').get('content')) == 1) - assert_logprobs( - message.get('logprobs').get('content')[0], logprobs_num) - + assert (len(message.get('logprobs').get('content')) >= 1) + for content in message.get('logprobs').get('content'): + assert_logprobs(content, logprobs_num) if is_last is True: assert len(message.get('delta').get('content')) == 0 assert message.get('finish_reason') in ['stop', 'length'] @@ -110,9 +109,9 @@ def assert_completions_stream_return(output, if is_last is False: assert message.get('finish_reason') is None if check_logprobs: - assert (len(message.get('logprobs').get('content')) == 1) - assert_logprobs( - message.get('logprobs').get('content')[0], logprobs_num) + assert (len(message.get('logprobs').get('content')) >= 1) + for content in message.get('logprobs').get('content'): + assert_logprobs(content, logprobs_num) if is_last is True: assert len(message.get('text')) == 0 diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index c567db4d00..dfc363b086 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -6,7 +6,7 @@ import allure import psutil -from pytest import assume +from pytest_assume.plugin import assume from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid from utils.get_run_config import get_command_with_extra from utils.rule_condition_assert import assert_result From 4a8d745604246ae6fdd1978688c28cf7ffda2a03 Mon Sep 17 00:00:00 2001 From: q yao Date: Mon, 11 Nov 2024 21:11:30 +0800 Subject: [PATCH 068/122] Support ep, column major moe kernel. (#2690) * support EP, optimize moe kernel * support ep and col major moe kernel * remove create_weight_ep --- lmdeploy/pytorch/backends/cuda/moe.py | 43 ++++++++++-- lmdeploy/pytorch/backends/dlinfer/moe.py | 14 ++-- lmdeploy/pytorch/backends/moe.py | 21 ++++-- lmdeploy/pytorch/kernels/cuda/fused_moe.py | 79 ++++++++-------------- lmdeploy/pytorch/nn/moe.py | 79 +++++++++++++++++----- 5 files changed, 153 insertions(+), 83 deletions(-) diff --git a/lmdeploy/pytorch/backends/cuda/moe.py b/lmdeploy/pytorch/backends/cuda/moe.py index e5ae92d8bd..eb38401211 100644 --- a/lmdeploy/pytorch/backends/cuda/moe.py +++ b/lmdeploy/pytorch/backends/cuda/moe.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List + import torch from lmdeploy.pytorch.kernels.cuda import fused_moe @@ -10,7 +12,11 @@ class TritonFusedMoEImpl(FusedMoEImpl): """triton fused moe implementation.""" - def __init__(self, top_k: int, renormalize: bool = False): + def __init__(self, + top_k: int, + num_experts: int, + renormalize: bool = False): + self.num_experts = num_experts self.top_k = top_k self.renormalize = renormalize @@ -23,16 +29,39 @@ def update_weights(self, gate_up_weights: torch.Tensor, 2).contiguous().transpose(1, 2) return gate_up_weights, down_weights - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, - down_weights: torch.Tensor): + def support_ep(self): + """support expert parallelism.""" + return True + + def ep_expert_list(self, world_size: int, rank: int): + """experts list of current rank.""" + num_experts = self.num_experts + expert_per_rank = (num_experts + world_size - 1) // world_size + first_expert = rank * expert_per_rank + last_expert = min(first_expert + expert_per_rank, num_experts) + return list(range(first_expert, last_expert)) + + def forward(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.LongTensor, + gate_up_weights: torch.Tensor, + down_weights: torch.Tensor, + expert_list: List[int] = None): """forward.""" + expert_offset = 0 + num_experts = None + if expert_list is not None and len(expert_list) != self.num_experts: + expert_offset = expert_list[0] + num_experts = self.num_experts return fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights=topk_weights, topk_ids=topk_ids, topk=self.top_k, + expert_offset=expert_offset, + num_experts=num_experts, renormalize=self.renormalize) @@ -40,6 +69,8 @@ class TritonFusedMoEBuilder(FusedMoEBuilder): """triton fused moe builder.""" @staticmethod - def build(top_k: int, renormalize: bool = False): + def build(top_k: int, num_experts: int, renormalize: bool = False): """build from mlp.""" - return TritonFusedMoEImpl(top_k=top_k, renormalize=renormalize) + return TritonFusedMoEImpl(top_k=top_k, + num_experts=num_experts, + renormalize=renormalize) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index eb8b1e591e..90f6335ecb 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List + import torch from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax @@ -38,9 +40,13 @@ def __init__(self, top_k: int, renormalize: bool = False): self.top_k = top_k self.renormalize = renormalize - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, - down_weights: torch.Tensor): + def forward(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.LongTensor, + gate_up_weights: torch.Tensor, + down_weights: torch.Tensor, + expert_list: List[int] = None): """forward.""" return fused_moe(hidden_states, self.top_k, topk_ids, topk_weights, gate_up_weights, down_weights) @@ -50,6 +56,6 @@ class DlinferFusedMoEBuilder(FusedMoEBuilder): """dlinfer fused moe builder.""" @staticmethod - def build(top_k: int, renormalize: bool = False): + def build(top_k: int, num_experts: int, renormalize: bool = False): """build from mlp.""" return DlinferFusedMoEImpl(top_k=top_k, renormalize=renormalize) diff --git a/lmdeploy/pytorch/backends/moe.py b/lmdeploy/pytorch/backends/moe.py index 4a1d5b73da..8e7977625e 100644 --- a/lmdeploy/pytorch/backends/moe.py +++ b/lmdeploy/pytorch/backends/moe.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod +from typing import List import torch @@ -31,10 +32,22 @@ def update_weights(self, gate_up_weights: torch.Tensor, """update weights.""" return gate_up_weights, down_weights + def support_ep(self): + """support expert parallelism.""" + return False + + def ep_expert_list(self, world_size: int, rank: int): + """experts list of current rank.""" + raise NotImplementedError('Not Implemented.') + @abstractmethod - def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.LongTensor, gate_up_weights: torch.Tensor, - down_weights: torch.Tensor): + def forward(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.LongTensor, + gate_up_weights: torch.Tensor, + down_weights: torch.Tensor, + expert_list: List[int] = None): """forward.""" raise NotImplementedError @@ -44,6 +57,6 @@ class FusedMoEBuilder(ABC): @staticmethod @abstractmethod - def build(top_k: int, renormalize: bool = False): + def build(top_k: int, num_experts: int, renormalize: bool = False): """build from mlp.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/kernels/cuda/fused_moe.py b/lmdeploy/pytorch/kernels/cuda/fused_moe.py index e9ac7087cd..9f9771368e 100644 --- a/lmdeploy/pytorch/kernels/cuda/fused_moe.py +++ b/lmdeploy/pytorch/kernels/cuda/fused_moe.py @@ -5,7 +5,7 @@ import triton.language as tl from .activation import silu_and_mul -from .triton_utils import get_kernel_meta, wrap_jit_func +from .triton_utils import get_kernel_meta def get_cuda_autotune_config(): @@ -13,16 +13,16 @@ def get_cuda_autotune_config(): triton.Config( { 'BLOCK_SIZE_M': 128, - 'BLOCK_SIZE_N': 256, - 'BLOCK_SIZE_K': 64, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 1, }, - num_stages=3, - num_warps=8), + num_stages=4, + num_warps=4), triton.Config( { - 'BLOCK_SIZE_M': 128, - 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 1, }, @@ -43,34 +43,9 @@ def get_cuda_autotune_config(): @triton.autotune( configs=get_cuda_autotune_config(), key=['N', 'K', 'M_NP2'], + warmup=10, + rep=25, ) -@wrap_jit_func(type_hint=dict( - A=torch.Tensor, - B=torch.Tensor, - C=torch.Tensor, - 
SortedIdx=torch.Tensor, - ExpStart=torch.Tensor, - ExpEnd=torch.Tensor, - Weights=torch.Tensor, - N=int, - K=int, - stride_am=int, - stride_ak=int, - stride_be=int, - stride_bn=int, - stride_bk=int, - stride_cm=int, - stride_cn=int, - BLOCK_SIZE_M=torch.int32, - BLOCK_SIZE_N=torch.int32, - BLOCK_SIZE_K=torch.int32, - GROUP_SIZE_M=torch.int32, - ENABLE_WEIGHTS=bool, - top_k=torch.int32, - expert_offset=torch.int32, - reindex_a=bool, - reindex_c=bool, -)) @triton.jit def fused_moe_kernel( A, @@ -110,16 +85,23 @@ def fused_moe_kernel( if M <= 0: return - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_m = tl.cdiv(M_NP2, BLOCK_SIZE_M) num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - if pid_m * BLOCK_SIZE_M >= M: + + if GROUP_SIZE_M == 1: + pid_m = pid % num_pid_m + pid_n = pid // num_pid_m + # pid_m = pid // num_pid_n + # pid_n = pid % num_pid_n + else: + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + if pid_m * BLOCK_SIZE_M >= M or pid_n * BLOCK_SIZE_N >= N: return offs_sid = exp_start + pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) @@ -189,11 +171,11 @@ def fused_moe_kernel_launcher( if num_tokens is None: num_tokens = A.size(0) M_NP2 = triton.next_power_of_2(num_tokens) - M_NP2 = max(32, M_NP2) + M_NP2 = max(64, M_NP2) E, N, K = B.shape def _grid_fn(META): - grid = (triton.cdiv(num_tokens, META['BLOCK_SIZE_M']) * + grid = (triton.cdiv(M_NP2, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), E) return grid @@ -229,13 +211,6 @@ def _grid_fn(META): ) -@wrap_jit_func(type_hint=dict(TopkIdx=torch.Tensor, - SortedIdx=torch.Tensor, - ExpStart=torch.Tensor, - ExpEnd=torch.Tensor, - len_sorted_idx=int, - num_experts=torch.int32, - BLOCK=torch.int32)) @triton.jit def _start_end_kernel(TopkIdx, SortedIdx, ExpStart, ExpEnd, len_sorted_idx: int, num_experts: tl.constexpr, diff --git a/lmdeploy/pytorch/nn/moe.py b/lmdeploy/pytorch/nn/moe.py index 6467a6de08..47176335c4 100644 --- a/lmdeploy/pytorch/nn/moe.py +++ b/lmdeploy/pytorch/nn/moe.py @@ -35,32 +35,54 @@ def __init__(self, renormalize: bool = False, dtype: Optional[torch.dtype] = None, device: Optional[torch.device] = None, - all_reduce: bool = True): + all_reduce: bool = True, + enable_ep: bool = False): super().__init__() if device is None: device = torch.device('cpu') if dtype is None: dtype = torch.float16 - hidden_dim, ffn_dim = self._update_args(hidden_dim, ffn_dim) impl_builder = get_backend().get_layer_impl_builder(OpType.FusedMoE) - self.impl = impl_builder.build(top_k, renormalize) - - gate_up_weights, down_weights = self.create_weights(hidden_dim, - ffn_dim, - num_experts, - dtype=dtype, - device=device) + self.impl = impl_builder.build(top_k, num_experts, renormalize) + + self.expert_list = None + self.expert_map = None + enable_ep = enable_ep and self.impl.support_ep() + if enable_ep: + world_size, rank = get_world_rank() + expert_list = self.impl.ep_expert_list(world_size, rank) + self.expert_list = expert_list + self.expert_map = dict( + (eid, idx) for idx, eid in enumerate(expert_list)) + num_experts = len(expert_list) + 
gate_up_weights, down_weights = self.create_weights(hidden_dim, + ffn_dim, + num_experts, + dtype=dtype, + device=device) + else: + hidden_dim, ffn_dim = self._update_args(hidden_dim, ffn_dim) + gate_up_weights, down_weights = self.create_weights(hidden_dim, + ffn_dim, + num_experts, + dtype=dtype, + device=device) gate_up_weights = torch.nn.Parameter(gate_up_weights, requires_grad=False) down_weights = torch.nn.Parameter(down_weights, requires_grad=False) - gate_up_weights.weight_loader = self.weight_loader - down_weights.weight_loader = self.weight_loader gate_up_weights._weight_type = 'gate_up_weights' down_weights._weight_type = 'down_weights' self.register_parameter('gate_up_weights', gate_up_weights) self.register_parameter('down_weights', down_weights) + if enable_ep: + gate_up_weights.weight_loader = self.weight_loader_ep + down_weights.weight_loader = self.weight_loader_ep + else: + gate_up_weights.weight_loader = self.weight_loader_tp + down_weights.weight_loader = self.weight_loader_tp + self.hidden_dim = hidden_dim self.ffn_dim = ffn_dim self.num_experts = num_experts @@ -91,21 +113,23 @@ def create_weights(self, hidden_dim: int, ffn_dim: int, num_experts: int, def update_weights(self): """update weights.""" + gateup_loader = self.gate_up_weights.weight_loader + down_loader = self.down_weights.weight_loader gate_up_weights, down_weights = self.impl.update_weights( self.gate_up_weights, self.down_weights) gate_up_weights = torch.nn.Parameter(gate_up_weights, requires_grad=False) down_weights = torch.nn.Parameter(down_weights, requires_grad=False) - gate_up_weights.weight_loader = self.weight_loader - down_weights.weight_loader = self.weight_loader + gate_up_weights.weight_loader = gateup_loader + down_weights.weight_loader = down_loader gate_up_weights._weight_type = 'gate_up_weights' down_weights._weight_type = 'down_weights' self.register_parameter('gate_up_weights', gate_up_weights) self.register_parameter('down_weights', down_weights) - def weight_loader(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor, expert_id: int, - shard_id: str): + def weight_loader_tp(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, expert_id: int, + shard_id: str): """weight loader.""" world_size, rank = get_world_rank() if shard_id == 'gate': @@ -121,10 +145,31 @@ def weight_loader(self, param: torch.nn.Parameter, raise RuntimeError(f'Unknown shard_id: {shard_id}') param_data.copy_(weight) + def weight_loader_ep(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, expert_id: int, + shard_id: str): + """weight loader.""" + expert_list = self.expert_list + if expert_id not in expert_list: + return + + expert_map = self.expert_map + param_id = expert_map[expert_id] + if shard_id == 'gate': + param_data = param.data[param_id, :self.ffn_dim] + elif shard_id == 'up': + param_data = param.data[param_id, self.ffn_dim:] + elif shard_id == 'down': + param_data = param.data[param_id] + else: + raise RuntimeError(f'Unknown shard_id: {shard_id}') + param_data.copy_(loaded_weight) + def forward(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.LongTensor): ret = self.impl.forward(hidden_states, topk_weights, topk_ids, - self.gate_up_weights, self.down_weights) + self.gate_up_weights, self.down_weights, + self.expert_list) if self.all_reduce: dist.all_reduce(ret) return ret From 67a85384de625618ba93d92c04d5b2e3d10d6f8f Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Tue, 12 Nov 2024 16:40:17 +0800 
Subject: [PATCH 069/122] Remove one of the duplicate bos tokens (#2708)

* Remove one of the duplicate bos tokens

* Update tokenizer.py
---
 lmdeploy/tokenizer.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py
index e977005588..fb4364602a 100644
--- a/lmdeploy/tokenizer.py
+++ b/lmdeploy/tokenizer.py
@@ -624,7 +624,14 @@ def encode(self,
         Returns:
             list[int]: token ids
         """
-        return self.model.encode(s, add_bos, add_special_tokens, **kwargs)
+        encoded = self.model.encode(s, add_bos, add_special_tokens, **kwargs)
+        if encoded[:2] == [self.bos_token_id] * 2:
+            get_logger('lmdeploy').warn(
+                f'Detected duplicate bos token {self.bos_token_id} in prompt, '
+                'this will likely reduce response quality, one of them will be'
+                ' removed')
+            encoded = encoded[1:]
+        return encoded

     def decode(
         self,

From e7517080903b9a5b2086818f800a1b1b212b4e3a Mon Sep 17 00:00:00 2001
From: vinkle
Date: Tue, 12 Nov 2024 21:15:09 +0800
Subject: [PATCH 070/122] fix assert pad >= 0 failed when inter_size is not a multiple of group_size (#2740)

---
 lmdeploy/turbomind/deploy/target_model/base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py
index 4750cde850..abd570cd00 100644
--- a/lmdeploy/turbomind/deploy/target_model/base.py
+++ b/lmdeploy/turbomind/deploy/target_model/base.py
@@ -38,7 +38,8 @@ def _weight_dtype_map(weight_type: str, default=None):

 def _pad_inter_size(inter_size: int, group_size: int, tp: int):
     group_size = max(1, group_size)
-    groups_per_rank = (inter_size // group_size + tp - 1) // tp
+    group_num = (inter_size + group_size - 1) // group_size
+    groups_per_rank = (group_num + tp - 1) // tp
     inter_size_padded = groups_per_rank * group_size * tp
     return inter_size_padded

From d2d4209d148c09356492a04000a878270896178c Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Wed, 13 Nov 2024 11:27:26 +0800
Subject: [PATCH 071/122] Support Qwen2-MoE models (#2723)

* add qwen2-moe

* eliminate `inter_size_` from ffn layer

* clean up

* fix lint

* clean up

* Update config.yaml

---------

Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
---
 autotest/config.yaml                          |   2 +
 lmdeploy/turbomind/deploy/config.py           |   2 +
 lmdeploy/turbomind/deploy/module.py           |  21 +-
 .../turbomind/deploy/source_model/mixtral.py  |   2 +
 .../turbomind/deploy/source_model/qwen.py     |  61 +++
 lmdeploy/turbomind/supported_models.py        |   1 +
 src/turbomind/kernels/gemm/moe_utils_v2.cu    | 449 ++++++++++++++++--
 src/turbomind/kernels/gemm/moe_utils_v2.h     |   4 +-
 .../kernels/gemm/test/test_moe_utils.cu       | 103 ++--
 src/turbomind/kernels/gemm/test/testbed.h     |   2 +
 .../models/llama/LlamaDecoderLayerWeight.cc   |  52 +-
 src/turbomind/models/llama/LlamaDenseWeight.h |  29 +-
 src/turbomind/models/llama/LlamaFfnLayer.cc   |  47 +-
 src/turbomind/models/llama/LlamaFfnLayer.h    |  11 +-
 src/turbomind/models/llama/llama_params.h     |   8 +-
 src/turbomind/models/llama/moe_ffn_layer.cc   |  78 +--
 src/turbomind/models/llama/moe_ffn_layer.h    |   6 +-
 src/turbomind/models/llama/unified_decoder.cc |  21 +-
 .../triton_backend/llama/LlamaTritonModel.cc  |   2 +
 19 files changed, 711 insertions(+), 190 deletions(-)

diff --git a/autotest/config.yaml b/autotest/config.yaml
index 9357e473bb..587ee6331b 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -44,10 +44,12 @@ turbomind_chat_model:
   - Qwen/Qwen2-1.5B-Instruct
   - Qwen/Qwen1.5-7B-Chat
   - Qwen/Qwen1.5-4B-Chat-AWQ
+  - Qwen/Qwen1.5-MoE-A2.7B-Chat
   - 
Qwen/Qwen-VL-Chat - Qwen/Qwen2.5-0.5B-Instruct - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 7e8ebf7b47..a535b0d4c1 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -50,6 +50,8 @@ class ModelConfig: expert_num: int = 0 expert_inter_size: int = 0 experts_per_token: int = 0 + moe_shared_gate: int = False + moe_norm_topk: int = False def verify(self): invalid = {} diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index a9f7385376..8d998abe2b 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -140,14 +140,18 @@ class MoeFfn(Ffn): requires: r.moe_ffn_expert(e, i, kind) r.moe_ffn_gate(i) + r.moe_ffn_shared_gate(i) """ _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}' - _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}' + _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight' + _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight' def __init__(self, model: BaseOutputModel): super().__init__(model) self.expert_num = model.model_config.expert_num + self.inter_size = model.model_config.expert_inter_size + self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): for p in get_params(r.moe_ffn_expert()): @@ -157,7 +161,13 @@ def apply(self, i: int, r: BaseReader): i) gate = transpose(r.moe_ffn_gate(i)) - self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight')) + self.model.save_split(gate, self._moe_ffn_gate.format(i)) + + if self.shared_gate: + shared_gate = transpose(r.moe_ffn_shared_gate(i)) + # print(shared_gate) + self.model.save_split(shared_gate, + self._moe_ffn_shared_gate.format(i)) class Attn(Module): @@ -248,8 +258,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - ffn = MoeFfn if model.model_config.expert_num else Ffn - modules = [Attn, LayerNorm, ffn] + modules = [Attn, LayerNorm] + if model.model_config.inter_size: + modules.append(Ffn) + if model.model_config.expert_num: + modules.append(MoeFfn) self.modules = [c(model) for c in modules] self.misc = Misc(model) diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index 102ede29f2..ff9df2d409 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,4 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] + info['moe_norm_topk'] = True + info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 0ec0586a37..772bd03037 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -120,3 +120,64 @@ def model_info(self): cfg = super().model_info() cfg['attn_bias'] = 1 return cfg + + +class Qwen2MoeReader(LlamaReader): + + ffn_pattern = r'shared_expert\.' 
+ + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(self.ffn_pattern) + result = [] + for key in ['gate', 'down', 'up']: + tensor = self.params[ + f'model.layers.{i}.mlp.shared_expert.{key}_proj.{kind}'] + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_shared_gate(self, i): + return self.params.get( + f'model.layers.{i}.mlp.shared_expert_gate.weight') + + +@INPUT_MODELS.register_module(name='qwen2-moe') +class Qwen2MoeModel(LlamaModel): + + Reader = Qwen2MoeReader + + def tokenizer_info(self): + """https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_con + fig.json.""" # noqa: E501 + n_words = 152064 + bos_id = 151643 + eos_id = 151645 + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + info['expert_num'] = cfg['num_experts'] + info['expert_inter_size'] = cfg['moe_intermediate_size'] + info['experts_per_token'] = cfg['num_experts_per_tok'] + info['inter_size'] = cfg['shared_expert_intermediate_size'] + info['moe_shared_gate'] = True + info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['attn_bias'] = 1 + return info diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index fe0819d70f..f6772fddd5 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -20,6 +20,7 @@ QWenLMHeadModel='qwen', # Qwen2 Qwen2ForCausalLM='qwen2', + Qwen2MoeForCausalLM='qwen2-moe', # mistral MistralForCausalLM='llama', # llava diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index acf6355856..5912c60a8a 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -8,7 +8,7 @@ #include #include -#include +#include #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" @@ -19,7 +19,7 @@ namespace turbomind { template __global__ void MoeGateKernel_V2(float* scales, // [e,n] - int* masks, // [E,n], padded + int8_t* masks, // [E,n], padded int* accum, // [E,tiles] const float* logits, // [E,n] int log_tile, @@ -88,6 +88,8 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] const int lowbit = (mask & -mask); const int e = 31 - __clz(lowbit); + // printf("e = %d, ti = %d, idx = %d\n", e, ti, i); + masks[e * tokens_padded + ti] = i; atomicAdd(&shared_accum[e][ti >> log_tile], 1); top_val[i] = logits[ti * experts + e]; @@ -120,11 +122,11 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] } } -template -__global__ void MoeScanKernel_V2(int* f2n, // [e*n] +template +__global__ void MoeScanKernel_v2(int* f2n, // [e*n] int* en2f, // [e,n] int* offsets, // [E+1] - int* masks, // [E,n], padded + Mask* masks, // [E,n], padded const int* accum, // [E,tiles] int log_tile, int tiles, @@ -142,13 +144,15 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] constexpr int vec_size = kMoeGateVecSize; - using Vec = Array; + using Vec = Array; const int tile_id = blockIdx.x; const int ei = 
blockIdx.y; - const int global_tile_id = ei * tiles + tile_id; + const int global_tile_id = ei * tiles + tile_id; + const bool is_valid = global_tile_id <= experts * tiles; +#if 0 int vacc[4]{}; { int idx = threadIdx.x; @@ -162,6 +166,18 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); +#else + + int vacc = 0; + for (int i = threadIdx.x; i < global_tile_id; i += block_dim) { + if (is_valid && i < global_tile_id) { + vacc += accum[i]; + } + } + + int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); + +#endif __shared__ int shared_offset; @@ -200,7 +216,7 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] const bool pred = vi < tile_vec_end; Vec data; - fill(data, -1); + fill(data, Mask{-1}); if (pred) { Ldg(data, mask_ptr[vi].data()); } @@ -231,17 +247,328 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } } +template +__global__ void MoeGateKernel_v8(float* scales, // [e,n] + Mask* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k, + bool norm_topk) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + constexpr int tokens_per_cta = block_dim / threads_per_token; + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int bti = threadIdx.x / threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + + const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + + float data[items_per_thread]; + int idxs[items_per_thread]; + +#if 0 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = threads_per_token * i + ei * access_size; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#elif 1 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + // idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + idxs[i] = ei * items_per_thread + i; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + // const int e = threads_per_token * i + ei * access_size; + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int 
shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#else + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + constexpr int vecs_per_thread = items_per_thread / access_size; + + using Vec = Array; + constexpr int banks = 128 / sizeof(Vec); + constexpr int chunks = 4; // block_dim / WARP_SIZE; + + __shared__ union { + Vec shared_data[chunks][vecs_per_thread * WARP_SIZE / banks][banks + 1]; + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; + + __align__(16) Vec vecs[vecs_per_thread]; + + { + const int warp_end = min(warp_offset + WARP_SIZE / threads_per_token, token_num) * expert_num; + int p = warp_offset * expert_num + access_size * lane_id; + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + fill(vecs[i], -std::numeric_limits::infinity()); + // const int p = warp_offset * expert_num + access_size * (lane_id + i * WARP_SIZE); + if (p < warp_end) { + Ldg(vecs[i], &logits[p]); + } + p += access_size * WARP_SIZE; + } + } + + PRAGMA_UNROLL + for (int c = 0; c < block_dim / WARP_SIZE; c += chunks) { + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = i * WARP_SIZE + lane_id; + if (c <= warp_id && warp_id < c + chunks) { + Store(smem.shared_data[warp_id - c][p / banks][p % banks].data(), vecs[i]); + } + } + + __syncwarp(); + + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = lane_id * vecs_per_thread + i; + if (c <= warp_id && warp_id < c + chunks) { + Load(vecs[i], smem.shared_data[warp_id - c][p / banks][p % banks].data()); + } + } + + __syncthreads(); + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + idxs[i] = ei * items_per_thread + i; + } + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + (Array&)data[i * access_size] = vecs[i]; + } + +#endif + + constexpr float kLog2e = 1.4426950408889634074; + + unsigned mask = (unsigned)-1; + float max_logit; + + int count{}; + float sum_prob{}; + + const int warp_ti_offset = warp_ti * threads_per_token; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + if (k == 0) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] *= kLog2e; + } + } + + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + ++count; + } + }; + + run(0); + + for (int k = 1; k < top_k; ++k) { + run(k); + } + + mask = ~mask; + + int used[items_per_thread]; + { + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + used[i] = (mask & bit) > 0; + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : 
"r"(bit)); + } + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (!norm_topk || used[i]) { + data[i] = exp2f(data[i] - max_logit); + sum_prob += data[i]; + } + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[tokens_per_cta]; + + int idx{}; + WarpScan{temp_storage[bti]}.ExclusiveSum(count, idx); + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (used[i]) { + smem.shared_exp_id[idx][bti] = idxs[i]; + smem.shared_scales[idx][bti] = data[i] * sum_prob; + ++idx; + } + } + + PRAGMA_UNROLL + for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + smem.shared_accum[t][e] = 0; + } + + __syncthreads(); + + constexpr int k_per_thread = cdiv(max_top_k, threads_per_token); + + const int bti2 = threadIdx.x % tokens_per_cta; + const int ei2 = threadIdx.x / tokens_per_cta; + const int ti2 = blockIdx.x * tokens_per_cta + bti2; + + PRAGMA_UNROLL + for (int i = 0; i < k_per_thread; ++i) { + const int idx = ei2 * k_per_thread + i; + const int expert_id = smem.shared_exp_id[idx][bti2]; + const float scale = smem.shared_scales[idx][bti2]; + + if (ti2 < token_num && idx < top_k) { + masks[expert_id * token_num_padded + ti2] = idx; + scales[idx * token_num + ti2] = scale; + atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); + + // printf("%d %d %f\n", idx, expert_id, scale); + } + } + + __syncthreads(); + + for (int i = 0; i < max_expert_num * max_tiles; i += block_dim) { + int t = (threadIdx.x + i) % max_tiles; + int e = (threadIdx.x + i) / max_tiles; + if (e < expert_num && t < tiles) { + atomicAdd(accum + e * tiles + t, smem.shared_accum[t][e]); + } + } +} + +template +inline constexpr std::integral_constant _Int{}; + void invokeMoeGate_V2(int* f2n, // [e*n] -> n int* en2f, // [e,n] -> n*e int* offsets, // [E+1] float* scales, // [e,n] - int* masks, // [E,n] + void* masks, // [E,n] int* accum, // [E] const float* logits, // [e,n] int tokens, // n int tokens_padded, // round_up(n, 4) int experts, // E int experts_per_token, + bool norm_topk, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -254,48 +581,64 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - { - constexpr int threads = 128; - const int blocks = ceil_div(tokens, threads); + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(tokens, threads / thrs_per_tok); + + cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - auto invoke = [&](auto e) { - static constexpr int top_k = decltype(e)::value; - MoeGateKernel_V2<<>>( // + MoeGateKernel_v8 + <<>>( // scales, - masks, + (int8_t*)masks, accum, logits, log_tile, tiles, tokens, tokens_padded, - experts); - }; + experts, + experts_per_token, + norm_topk); + }; - switch (experts_per_token) { - case 2: - invoke(std::integral_constant{}); - break; - // case 4: - // invoke(std::integral_constant{}); - // break; - default: - std::cerr << __FILE__ << ":" << __LINE__ << " Not implemented. 
" << std::endl; - std::abort(); + auto fail = [&] { + std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::abort(); + }; + + if (experts <= 8) { + if (experts_per_token <= 2) { + invoke(_Int<8>, _Int<2>, _Int<8>); + } + else { + invoke(_Int<8>, _Int<8>, _Int<8>); } } - - // return; + else if (experts <= 64) { + if (experts_per_token <= 4) { + invoke(_Int<64>, _Int<4>, _Int<16>); + } + else if (experts_per_token <= 8) { + invoke(_Int<64>, _Int<8>, _Int<16>); + } + else { + fail(); + } + } + else { + fail(); + } { - // Check: tiles * experts <= threads - constexpr int threads = (1 << base_log_tile) / kMoeGateVecSize; const dim3 blocks(tiles, experts + 1); - MoeScanKernel_V2<<>>(f2n, // + + MoeScanKernel_v2<<>>(f2n, // en2f, offsets, - masks, + (int8_t*)masks, accum, log_tile, tiles, @@ -338,10 +681,11 @@ void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int exper template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); template -__global__ void MoeReduceKernel(T* dst, // [ n, d] - const T* src, // [e*n, d] - const float* scales, // [ e, n] - const int* en2f, // [ e, n] :: (e,n) -> e*n +__global__ void MoeReduceKernel(T* dst, // [ n, d] + const T* src, // [e*n, d] + const float* scales, // [ e, n] + const int* en2f, // [ e, n] :: (e,n) -> e*n + const float* dst_scales, // [n] int dims, int tokens) { @@ -351,6 +695,12 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; + float dst_scale = 0; + if (dst_scales) { + dst_scale = dst_scales[ti]; + dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); + } + // Should be warp uniforms const Vec* src_ptr[exp_k]; float scale[exp_k]; @@ -362,6 +712,12 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] for (int i = threadIdx.x; i < dims; i += block_dim) { Array accum{}; + if (dst_scales) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = cast(v) * dst_scale; + } PRAGMA_UNROLL for (int e = 0; e < exp_k; ++e) { Vec v; @@ -379,6 +735,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, @@ -395,6 +752,7 @@ void invokeMoeReduce(T* dst, src, scales, en2f, + dst_scales, dims / vec_size, tokens); }; @@ -404,19 +762,22 @@ void invokeMoeReduce(T* dst, return invoke(std::integral_constant{}); case 2: return invoke(std::integral_constant{}); - // case 4: - // return invoke(std::integral_constant{}); - // case 6: - // return invoke(std::integral_constant{}); + case 4: + return invoke(std::integral_constant{}); + case 6: + return invoke(std::integral_constant{}); + case 8: + return invoke(std::integral_constant{}); default: fprintf(stderr, "Unsupported experts_per_token %d\n", experts_per_token); std::abort(); } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, int, int, int, cudaStream_t); +template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); #ifdef ENABLE_BF16 -template void invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, int, int, int, cudaStream_t); +template void +invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h 
b/src/turbomind/kernels/gemm/moe_utils_v2.h index 334e2de272..0e4c36af09 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -14,13 +14,14 @@ void invokeMoeGate_V2(int* f2n, int* en2f, int* offsets, float* scales, - int* masks, + void* masks, int* accum, const float* logits, int tokens, int tokens_padded, int experts, int exp_per_tok, + bool norm_topk, cudaStream_t st); template @@ -49,6 +50,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index a311162193..47e3bfdb16 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -26,6 +26,25 @@ void print_vecs(const T* data, int m, int k, std::string msg, int width = 4) } } +template +void diff_vecs(const T* data, const T* refs, int m, int k, std::string msg) +{ + if (!msg.empty()) { + std::cout << msg << ": [" << m << ", " << k << "]\n"; + } + for (int mm = 0; mm < m; ++mm) { + std::cout << "m=" << mm << ": "; + for (int kk = 0; kk < k; ++kk) { + const auto& x = data[mm * k + kk]; + const auto& y = refs[mm * k + kk]; + if (x != y) { + std::cout << kk << "(" << x << ", " << y << ") "; + } + } + std::cout << "\n"; + } +} + #if 0 void func() { @@ -190,7 +209,7 @@ void moe_gate_ref(int tokens, } } -void mask2eids(const universal_vector& masks, universal_vector& eids, int tokens, int expert_num) +void mask2eids(universal_vector& masks, universal_vector& eids, int tokens, int expert_num) { const int tokens_padded = masks.size() / expert_num; // std::cout << eids.size() << std::endl; @@ -228,13 +247,13 @@ bool test_moe_gate(int tokens, // const int tokens_padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; // const int max_coords = get_max_coords(tokens, expert_num, experts_per_token, tiling); - universal_vector offsets(expert_num + 1); - universal_vector accum(expert_num * kMoeGateMaxTiles); - universal_vector masks(expert_num * tokens_padded); - universal_vector eids(experts_per_token * tokens); - universal_vector f2n(experts_per_token * tokens); - universal_vector en2f(experts_per_token * tokens); - universal_vector scales(experts_per_token * tokens); + universal_vector offsets(expert_num + 1); + universal_vector accum(expert_num * kMoeGateMaxTiles); + universal_vector masks(expert_num * tokens_padded); + universal_vector eids(experts_per_token * tokens); + universal_vector f2n(experts_per_token * tokens); + universal_vector en2f(experts_per_token * tokens); + universal_vector scales(experts_per_token * tokens); // universal_vector coords(max_coords); // thrust::fill(coords.begin(), coords.end(), int2{-1, 0}); @@ -246,8 +265,16 @@ bool test_moe_gate(int tokens, // moe_gate_ref(tokens, expert_num, experts_per_token, logits, offsets_ref, eids_ref, f2n_ref, en2f_ref, scales_ref); - for (int i = 0; i < 10; ++i) { + cudaMemPrefetchAsync(f2n.data().get(), sizeof(int) * f2n.size(), 0); + cudaMemPrefetchAsync(en2f.data().get(), sizeof(int) * en2f.size(), 0); + cudaMemPrefetchAsync(offsets.data().get(), sizeof(int) * offsets.size(), 0); + cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); + cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + + for (int i = 0; i < 1; ++i) { + gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, 
sizeof(int) * accum.size()); + cudaMemset(masks.data().get(), -1, sizeof(int8_t) * masks.size()); invokeMoeGate_V2(f2n.data().get(), en2f.data().get(), offsets.data().get(), @@ -259,6 +286,7 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, + true, 0); } @@ -306,7 +334,10 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success || false) { + if (!success && 1) { + + diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); + print_vecs(offsets_ref.data().get(), 1, expert_num + 1, "offsets_ref"); print_vecs(offsets.data().get(), 1, expert_num + 1, "offsets"); @@ -322,32 +353,32 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); - print_vecs(accum.data().get(), expert_num, 1, "accum"); + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); - thrust::host_vector tile_offsets(tape.max_ctas); - std::cout << tape.max_ctas << std::endl; - cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); - cudaDeviceSynchronize(); - - std::cout << "coords:\n"; - int last = -1; - for (int i = 0; i < tape.max_ctas; ++i) { - auto& c = tile_offsets[i]; - if (last >= 0 && c.w != last) { - std::cout << "\n"; - } - if (c.w == -1) { - std::cout << i << "\n"; - break; - } - last = c.w; - std::stringstream ss; - ss << c.x << "," << c.y; - std::cout << std::setw(6) << ss.str(); - } - std::cout << "\n"; + // thrust::host_vector tile_offsets(tape.max_ctas); + // std::cout << tape.max_ctas << std::endl; + // cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); + // cudaDeviceSynchronize(); + + // std::cout << "coords:\n"; + // int last = -1; + // for (int i = 0; i < tape.max_ctas; ++i) { + // auto& c = tile_offsets[i]; + // if (last >= 0 && c.w != last) { + // std::cout << "\n"; + // } + // if (c.w == -1) { + // std::cout << i << "\n"; + // break; + // } + // last = c.w; + // std::stringstream ss; + // ss << c.x << "," << c.y; + // std::cout << std::setw(6) << ss.str(); + // } + // std::cout << "\n"; } return success; @@ -358,7 +389,11 @@ int main() gemm::Tape tape{}; constexpr Tiling tiling{14336, 128, {128, 128, 32}}; - test_moe_gate(8192, 8, 2, tape, tiling); + // test_moe_gate(32768 * 4, 60, 4, tape, tiling); + // test_moe_gate(32768, 64, 8, tape, tiling); + // test_moe_gate(8, 60, 4, tape, tiling); + + test_moe_gate(65536, 8, 2, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 6b1ec88f58..7a089fbdf2 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -514,6 +514,7 @@ class Testbed { c_e_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, @@ -523,6 +524,7 @@ class Testbed { c_e_ref_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 2d68ef3535..f6f9ab0efa 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ 
b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -137,6 +137,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, moe_param.inter_size, moe_param.expert_num, moe_param.method, + moe_param.shared_gate, tensor_para_size_, weight_type, group_size, @@ -349,18 +350,22 @@ void LlamaDecoderLayerWeight::mallocWeights() mallocWeights(self_attn_weights.qkv, attn_bias_); mallocWeights(self_attn_weights.output, attn_bias_); - if (moe_weights.experts.empty()) { + if (inter_size_) { mallocWeights(ffn_weights.gating, false); mallocWeights(ffn_weights.intermediate, false); mallocWeights(ffn_weights.output, false); } - else { + + if (!moe_weights.experts.empty()) { mallocWeights(moe_weights.gate, false); for (auto& e : moe_weights.experts) { mallocWeights(e.gating, false); mallocWeights(e.intermediate, false); mallocWeights(e.output, false); } + if (moe_weights.shared_gate.output_dims) { + mallocWeights(moe_weights.shared_gate, false); + } } } @@ -375,10 +380,25 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() freeWeights(self_attn_weights.qkv); freeWeights(self_attn_weights.output); - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); + if (inter_size_) { + freeWeights(ffn_weights.fused_gating_intermediate); + freeWeights(ffn_weights.gating); + freeWeights(ffn_weights.intermediate); + freeWeights(ffn_weights.output); + } + + if (!moe_weights.experts.empty()) { + freeWeights(moe_weights.gate); + for (auto& e : moe_weights.experts) { + freeWeights(e.fused_gating_intermediate); + freeWeights(e.gating); + freeWeights(e.intermediate); + freeWeights(e.output); + } + if (moe_weights.shared_gate.kernel) { + freeWeights(moe_weights.shared_gate); + } + } } template @@ -428,23 +448,30 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); - if (moe_weights.experts.empty()) { + if (inter_size_) { getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output); getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); } - else { + + if (!moe_weights.experts.empty()) { output.insert( concat(prefix, "moe_ffn.gate.weight"), Tensor{MEMORY_GPU, getTensorType(), {moe_weights.gate.kernel_size()}, moe_weights.gate.kernel}); auto& experts = moe_weights.experts; for (size_t i = 0; i < experts.size(); ++i) { const std::string name = "moe_ffn.experts." 
+ std::to_string(i); - // std::cerr << "FUCK " << get_prefix(concat(name, "w1")) << "\n"; getWeightTensor(experts[i].gating, false, get_prefix(concat(name, "w1")), output); getWeightTensor(experts[i].intermediate, false, get_prefix(concat(name, "w3")), output); getWeightTensor(experts[i].output, false, get_prefix(concat(name, "w2")), output); } + if (moe_weights.shared_gate.kernel) { + output.insert(concat(prefix, "moe_ffn.shared_gate.weight"), + Tensor{MEMORY_GPU, + getTensorType(), + {moe_weights.shared_gate.kernel_size()}, + moe_weights.shared_gate.kernel}); + } } return output; @@ -681,10 +708,13 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud convert(ffn.output, is_fused_moe, workspace, size, is_16xx); }; - if (moe_weights.experts.empty()) { + if (inter_size_) { + // std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } - else { + + if (!moe_weights.experts.empty()) { + // std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 9a895243bc..169fb53bcf 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -145,24 +145,28 @@ struct LlamaFfnWeight { LlamaFfnWeight( size_t hidden_dim, size_t inter_size, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + inter_size /= tp; + + this->inter_size = inter_size; + gating.input_dims = hidden_dim; - gating.output_dims = inter_size / tp; + gating.output_dims = inter_size; gating.type = weight_type; gating.group_size = group_size; intermediate.input_dims = hidden_dim; - intermediate.output_dims = inter_size / tp; + intermediate.output_dims = inter_size; intermediate.type = weight_type; intermediate.group_size = group_size; fused_gating_intermediate.input_dims = hidden_dim; - fused_gating_intermediate.output_dims = inter_size / tp * 2; + fused_gating_intermediate.output_dims = inter_size * 2; fused_gating_intermediate.type = weight_type; fused_gating_intermediate.group_size = group_size; is_fused_silu = fuse_silu_act; - output.input_dims = inter_size / tp; + output.input_dims = inter_size; output.output_dims = hidden_dim; output.type = weight_type; output.group_size = group_size; @@ -173,6 +177,7 @@ struct LlamaFfnWeight { LlamaDenseWeight output; LlamaDenseWeight fused_gating_intermediate; + int inter_size{}; bool is_fused_silu{}; }; @@ -185,11 +190,15 @@ struct MoeFfnWeight { int inter_size, int expert_num, int method, + bool has_shared_gate, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + + // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (expert_num == 0) { return; } @@ -208,11 +217,23 @@ struct MoeFfnWeight { // inter size is divided by tp in `FfnWeight` e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; } + + if (has_shared_gate) { + shared_gate.input_dims = hidden_dim; + shared_gate.output_dims = 1; + shared_gate.type = get_default_weight_type(); + gate.group_size = group_size; + } + else { + shared_gate = {}; + } } LlamaDenseWeight gate; std::vector> experts; + LlamaDenseWeight shared_gate; + LlamaFfnWeight block; int method{}; diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index f9ee0c4ad4..8cce207203 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ 
b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -28,10 +28,11 @@ namespace turbomind { template void LlamaFfnLayer::allocateBuffer(size_t token_num, + int inter_size, const LlamaDenseWeight* gating, const LlamaDenseWeight* inter) { - const size_t sz = token_num * inter_size_; + const size_t sz = token_num * inter_size; const size_t sz_gate = token_num * gating->lora.r; const size_t sz_inter = token_num * inter->lora.r; @@ -51,24 +52,24 @@ template void LlamaFfnLayer::freeBuffer() { if (is_allocate_buffer_) { - // allocator_->free((void**)&inter_buf_); allocator_->free((void**)&gating_buf_); is_allocate_buffer_ = false; } } template -void LlamaFfnLayer::activation(int token_num, bool is_chunked) +void LlamaFfnLayer::activation(int token_num, int inter_size, bool is_chunked) { NvtxScope scope("activation"); if (is_chunked) { + // gate & up are in the SAME buffer invokeGenericActivation_v2( - gating_buf_, gating_buf_ + inter_size_, inter_size_ * 2, token_num, inter_size_, stream_); + gating_buf_, gating_buf_ + inter_size, inter_size * 2, token_num, inter_size, stream_); sync_check_cuda_error(); } else { - invokeGenericActivation_v2( - gating_buf_, inter_buf_, inter_size_, token_num, inter_size_, stream_); + // gate & up are in separate buffers + invokeGenericActivation_v2(gating_buf_, inter_buf_, inter_size, token_num, inter_size, stream_); sync_check_cuda_error(); } } @@ -88,11 +89,11 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, NvtxScope scope("ffn"); - const size_t num_token = input_tensors->at("ffn_input").shape[0]; - const int layer_id = input_tensors->getVal("layer_id"); - // LOG(WARNING); + const size_t token_num = input_tensors->at("ffn_input").shape[0]; + const int layer_id = input_tensors->getVal("layer_id"); + const int inter_size = weights->inter_size; - allocateBuffer(num_token, &weights->gating, &weights->intermediate); + allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); @@ -103,50 +104,50 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const auto type = weights->is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; - linear_->forward(gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, type); + linear_->forward(gating_buf_, ffn_input_data, token_num, weights->fused_gating_intermediate, type); sync_check_cuda_error(); if (!weights->is_fused_silu) { - activation(num_token, true); + activation(token_num, inter_size, true); } - count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); } else { { // w1(x) NvtxScope scope("w1"); - linear_->forward(gating_buf_, ffn_input_data, num_token, weights->gating, LlamaLinear::kGemm, lora_mask); + linear_->forward(gating_buf_, ffn_input_data, token_num, weights->gating, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(gating_buf_, num_token * weights->gating.output_dims, Concat("w1", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->gating.output_dims, Concat("w1", layer_id), 3); { // w3(x) NvtxScope scope("w3"); linear_->forward( - inter_buf_, ffn_input_data, num_token, weights->intermediate, LlamaLinear::kGemm, lora_mask); + inter_buf_, ffn_input_data, token_num, weights->intermediate, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(inter_buf_, num_token * weights->intermediate.output_dims, Concat("w3", layer_id), 3); + count_and_fix(inter_buf_, token_num * weights->intermediate.output_dims, Concat("w3", layer_id), 3); // silu(w1(x)) * w3(x) - activation(num_token, false); + activation(token_num, inter_size, false); - count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("act", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("act", layer_id), 3); } { // w2(x) NvtxScope scope("w2"); - const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? inter_size_ * 2 : 0; + const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? 
inter_size * 2 : 0; linear_->forward( - ffn_output_data, {gating_buf_, pitch}, num_token, weights->output, LlamaLinear::kGemm, lora_mask); + ffn_output_data, {gating_buf_, pitch}, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(ffn_output_data, num_token * weights->output.output_dims, Concat("w2", layer_id), 3); + count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); if (all_reduce_ && tensor_para_.world_size_ > 1) { NcclGuard nccl_guard(tensor_para_, stream_); - ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_); + ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 75ced5f9ac..2daca2cc95 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -19,12 +19,11 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" -#include "src/turbomind/utils/custom_ar_comm.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/nccl_utils.h" -#include namespace turbomind { @@ -32,7 +31,6 @@ template class LlamaFfnLayer { public: LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): - inter_size_(model.inter_size / tp.world_size_), hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), @@ -50,13 +48,12 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); void freeBuffer(); - void activation(int token_num, bool is_chunked); + void activation(int token_num, int inter_size, bool is_chunked); - const size_t inter_size_; const size_t hidden_units_; const NcclParam tensor_para_; cudaStream_t const stream_; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 1c039ca66a..2ea63f0410 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -31,9 +31,11 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; + int expert_num; + int experts_per_token; + int inter_size; + bool norm_topk; + bool shared_gate; }; struct AttentionParam { diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index def6b04abb..1ad76839d1 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -30,6 +30,7 @@ void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); + alloc(&shared_scales_, tokens); return (char*)alloc.ptr() - (char*)base; }; @@ -69,7 +70,7 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama getCudaDataType(), hidden_dim_, &beta, - logits_, + logits, CUDA_R_32F, 
weight.output_dims, CUDA_R_32F, @@ -77,13 +78,13 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama } template -void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; AllocateBuffer(tokens, padded); - gate(logits_, inout, tokens, moe.gate); + gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); @@ -103,6 +104,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei padded, param_.expert_num, param_.experts_per_token, + param_.norm_topk, stream_); sync_check_cuda_error(); @@ -123,7 +125,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei if (param_.method == MoeParam::kNaive) { - dispatchMoeGather(inout_buf_, inout, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); + dispatchMoeGather(inout_buf_, input, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); sync_check_cuda_error(); check_cuda_error( @@ -155,28 +157,8 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei auto& block = moe.block; -#if 0 - FT_CHECK(!block.is_fused_silu); - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - inter_size_ * 2, - count, - hidden_dim_, - moe.experts[i].fused_gating_intermediate.kernel, - hidden_dim_, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2); - sync_check_cuda_error(); - } - } - auto mode = kCmpWrite; -#else linear_->forward_moe(inter_buf_, - {inout, (int)hidden_dim_}, + {input, (int)hidden_dim_}, f2n_, offsets_, tokens * param_.experts_per_token, @@ -185,7 +167,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_, // @@ -205,25 +186,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei sync_check_cuda_error(); } -#if 0 - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - hidden_dim_, - count, - inter_size_, - moe.experts[i].output.kernel, - inter_size_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_); - sync_check_cuda_error(); - } - } - auto mode1 = kCmpWrite; -#else linear_->forward_moe(inout_buf_, {inter_buf_, block.is_fused_silu ? 
(int)inter_size_ : (int)inter_size_ * 2}, nullptr, @@ -234,7 +196,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode1 = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_2_, // @@ -250,18 +211,29 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei // } } - invokeMoeReduce(inout, inout_buf_, scales_, en2f_, tokens, param_.experts_per_token, hidden_dim_, stream_); + if (moe.shared_gate.kernel) { + gate(shared_scales_, input, tokens, moe.shared_gate); + } +} + +template +void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +{ + invokeMoeReduce(output, + inout_buf_, + scales_, + en2f_, + moe.shared_gate.kernel ? shared_scales_ : nullptr, + tokens, + param_.experts_per_token, + hidden_dim_, + stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum(inout, inout, tokens * hidden_dim_, tensor_para_, stream_); + ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } - - // if (tensor_para_.rank_ == 0) { - // check_cuda_error(cudaStreamSynchronize(stream_)); - // std::abort(); - // } } template diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index ef65aaa464..0f1713f7b5 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -51,7 +51,9 @@ class MoeFfnLayer { FreeBuffer(); } - void forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe); + void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); + + void reduce(T* output, int tokens, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); @@ -85,6 +87,8 @@ class MoeFfnLayer { int* en2f_{}; float* scales_{}; + float* shared_scales_{}; + int* accum_{}; int* offsets_{}; }; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 68392215f6..28e8b5f649 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -26,9 +26,15 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, dtype_(getTensorType()) { - attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - ffn_layer_ = std::make_unique>(model, tp, ctx, true); - moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); + + if (moe.expert_num) { + moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + } + + if (model.inter_size) { + ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); } @@ -190,9 +196,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con /// feed-forward network if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->forward(decoder_output, token_num, layer, weights->at(layer)->moe_weights); + moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - else { + + if (ffn_layer_) { int layer_id = layer; // int is needed TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; @@ -203,6 +210,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, 
con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } + if (!weights->at(layer)->moe_weights.experts.empty()) { + moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + } + count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); const bool is_last_layer = layer == layer_num_ - 1; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 8db13652f5..38552be0cf 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -301,6 +301,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.expert_num = model_reader["expert_num"].as(0); moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); + moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); handleMissingParams(); From adf7c361531b45a032e58bc6293c4df33c182fbe Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:21:28 +0800 Subject: [PATCH 072/122] Support mixtral moe AWQ quantization. (#2725) * moe-awq * skip gate * add skipped_modules * fix search-scale * update autotest --- autotest/utils/quantization_utils.py | 2 +- docs/en/quantization/w4a16.md | 1 - docs/zh_cn/quantization/w4a16.md | 1 - lmdeploy/cli/utils.py | 4 +- lmdeploy/lite/apis/auto_awq.py | 10 +--- lmdeploy/lite/apis/calibrate.py | 48 +++++++++++++++++-- .../lite/quantization/activation/observer.py | 5 +- lmdeploy/lite/quantization/awq.py | 30 ++++++++---- lmdeploy/lite/quantization/calibration.py | 35 +------------- 9 files changed, 75 insertions(+), 61 deletions(-) diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 752168958a..bc09ed9a4c 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -40,7 +40,7 @@ def quantization(config, now the type is ' + quantization_type if 'llama-3' in origin_model_name.lower(): - quantization_cmd += ' --search-scale True' + quantization_cmd += ' --search-scale' if not is_bf16_supported(): quantization_cmd += ' --batch-size 8' diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index 32dfe18d80..0aa1e17a5b 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -39,7 +39,6 @@ lmdeploy lite auto_awq \ --w-bits 4 \ --w-group-size 128 \ --batch-size 1 \ - --search-scale False \ --work-dir $WORK_DIR ``` diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md index d50e464af3..d69a8a23d2 100644 --- a/docs/zh_cn/quantization/w4a16.md +++ b/docs/zh_cn/quantization/w4a16.md @@ -39,7 +39,6 @@ lmdeploy lite auto_awq \ --w-bits 4 \ --w-group-size 128 \ --batch-size 1 \ - --search-scale False \ --work-dir $WORK_DIR ``` diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index ad7a058c8f..85784a58f5 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -358,10 +358,10 @@ def calib_search_scale(parser): return parser.add_argument( '--search-scale', - type=bool, + action='store_true', default=False, help=\ - 'Whether search scale ratio. Default to False, which means only smooth quant with 0.5 ratio will be applied' # noqa + 'Whether search scale ratio. 
Default to be disabled, which means only smooth quant with 0.5 ratio will be applied' # noqa ) @staticmethod diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index d7d6a5560e..c41b28fd6e 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -101,7 +101,7 @@ def auto_awq(model: str, layer_type = LAYER_TYPE_MAP[type(model).__name__] fc2fcs = FC_FCS_MAP[layer_type] norm2fcs = NORM_FCS_MAP[layer_type] - input_stats = torch.load(work_dir / 'inputs_stats.pth') + input_stats = torch.load(osp.join(work_dir, 'inputs_stats.pth')) layers = collect_target_modules(model, layer_type) fcs = {} for l_name, layer in layers.items(): @@ -117,13 +117,7 @@ def auto_awq(model: str, act_scales = input_stats['absmax'] smooth_layers(layers, fc2fcs, norm2fcs, act_scales, w_group_size, device) - quant_weights(model, - fcs, - w_bits, - w_sym, - w_group_size, - device, - skip_if_contains='lora') # TODO quant lora weight + quant_weights(model, fcs, w_bits, w_sym, w_group_size, device) quantization_config = dict(quant_method='awq', version='gemm', bits=w_bits, diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index b2fd8e3883..65ecd765c7 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -24,7 +24,8 @@ 'MGMLlamaForCausalLM': 'LlamaDecoderLayer', # mini gemini 'InternLMXComposer2ForCausalLM': 'InternLM2DecoderLayer', 'Phi3ForCausalLM': 'Phi3DecoderLayer', - 'ChatGLMForConditionalGeneration': 'GLMBlock' + 'ChatGLMForConditionalGeneration': 'GLMBlock', + 'MixtralForCausalLM': 'MixtralDecoderLayer', } NORM_TYPE_MAP = { @@ -39,7 +40,8 @@ 'MGMLlamaForCausalLM': 'LlamaRMSNorm', # mini gemini 'InternLMXComposer2ForCausalLM': 'InternLM2RMSNorm', 'Phi3ForCausalLM': 'Phi3RMSNorm', - 'ChatGLMForConditionalGeneration': 'RMSNorm' + 'ChatGLMForConditionalGeneration': 'RMSNorm', + 'MixtralForCausalLM': 'MixtralRMSNorm', } HEAD_NAME_MAP = { @@ -54,7 +56,8 @@ 'MGMLlamaForCausalLM': 'lm_head', # mini gemini 'InternLMXComposer2ForCausalLM': 'output', 'Phi3ForCausalLM': 'lm_head', - 'ChatGLMForConditionalGeneration': 'output_layer' + 'ChatGLMForConditionalGeneration': 'output_layer', + 'MixtralForCausalLM': 'lm_head', } @@ -150,6 +153,42 @@ def _get_non_default_generation_parameters(self): PretrainedConfig._get_non_default_generation_parameters = _get_non_default_generation_parameters # noqa +def update_moe_mapping(model, model_type): + """Update moe mapping.""" + from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP + + # get experts num + num_experts = 0 + for n, m in model.named_modules(): + if type(m).__name__ == LAYER_TYPE_MAP[model_type]: + fc2fcs = FC_FCS_MAP[LAYER_TYPE_MAP[model_type]] + for k, v in fc2fcs.items(): + if '{i}' in k: + break + num_experts = len(m.get_submodule(k.split('.{i}')[0])) + break + + # update FC_FCS_MAP + updated_fc2fcs = dict() + for prev_fc, post_fc in fc2fcs.items(): + if '{i}' in prev_fc: + for i in range(num_experts): + updated_fc2fcs.update( + {prev_fc.format(i=i): [v.format(i=i) for v in post_fc]}) + else: + updated_fc2fcs.update({prev_fc: post_fc}) + FC_FCS_MAP[LAYER_TYPE_MAP[model_type]] = updated_fc2fcs + # update NORM_FCS_MAP + norm2fcs = NORM_FCS_MAP[LAYER_TYPE_MAP[model_type]] + updated_norm2fcs = dict() + for norm, fc in norm2fcs.items(): + updated_norm2fcs.update({ + norm: + list(set([v.format(i=i) for v in fc for i in range(num_experts)])) + }) + NORM_FCS_MAP[LAYER_TYPE_MAP[model_type]] = updated_norm2fcs + + def calibrate(model: str, calib_dataset: str = 
'ptb', calib_samples: int = 128, @@ -216,6 +255,9 @@ def calibrate(model: str, f'not supported. The supported model types are ' f"{', '.join(LAYER_TYPE_MAP.keys())}.") + if model_type in ['MixtralForCausalLM']: + update_moe_mapping(model, model_type) + if model_type == 'QWenLMHeadModel': try: import flash_attn # noqa: F401 diff --git a/lmdeploy/lite/quantization/activation/observer.py b/lmdeploy/lite/quantization/activation/observer.py index c66bdda6f4..9138c6ccce 100644 --- a/lmdeploy/lite/quantization/activation/observer.py +++ b/lmdeploy/lite/quantization/activation/observer.py @@ -99,11 +99,10 @@ def observe(self, x: torch.Tensor, save_input: bool = False) -> None: Args: x : Input tensor """ + assert torch.isnan(x).sum() == 0 if self.observed: return - if len(x.shape) != 3: - return - assert x.size(2) == self.dim + assert x.size(-1) == self.dim cur_val = x.flatten(0, 1) cur_max = cur_val.max(0)[0].cpu() cur_min = cur_val.min(0)[0].cpu() diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index fad7c2ef30..068ad9357e 100644 --- a/lmdeploy/lite/quantization/awq.py +++ b/lmdeploy/lite/quantization/awq.py @@ -39,6 +39,12 @@ 'GLMBlock': { 'input_layernorm': ['self_attention.query_key_value'], 'post_attention_layernorm': ['mlp.dense_h_to_4h'] + }, + 'MixtralDecoderLayer': { + 'input_layernorm': + ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], + 'post_attention_layernorm': + ['block_sparse_moe.experts.{i}.w1', 'block_sparse_moe.experts.{i}.w3'] } } @@ -73,9 +79,23 @@ 'GLMBlock': { # 'self_attention.query_key_value': ['self_attention.dense'] # 'mlp.dense_h_to_4h': ['mlp.dense_4h_to_h'] + }, + 'MixtralDecoderLayer': { + 'self_attn.v_proj': ['self_attn.o_proj'], + 'block_sparse_moe.experts.{i}.w3': ['block_sparse_moe.experts.{i}.w2'] } } +SKIPPED_MODULE = ['lora', 'block_sparse_moe.gate'] + + +def skipped_module(name: str): + """Whether the module should be skipped from quantization.""" + for m in SKIPPED_MODULE: + if m in name: + return True + return False + @torch.no_grad() def get_weight_scale(weight, q_group_size=-1): @@ -225,13 +245,7 @@ def check_awq_supported(layer_type): raise NotImplementedError -def quant_weights(model, - fcs, - bits, - symmetry, - group_size=-1, - device='cuda', - skip_if_contains: str = None): +def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'): """Quantize the weights of the target model's linear layers.""" from lmdeploy.lite.quantization import WeightQuantizer from lmdeploy.lite.quantization.modules import WeightOnlyQLinear @@ -241,7 +255,7 @@ def quant_weights(model, parent_name, _, child_name = name.rpartition('.') parent = model.get_submodule(parent_name) pack_or_skip = 'packed' - if skip_if_contains and skip_if_contains in child_name: + if skipped_module(name): q_linear = fc pack_or_skip = 'skipped' else: diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py index 77ff74e234..e590f1a4eb 100644 --- a/lmdeploy/lite/quantization/calibration.py +++ b/lmdeploy/lite/quantization/calibration.py @@ -6,8 +6,7 @@ from torch import nn from transformers import PreTrainedTokenizer -from lmdeploy.lite.quantization.activation import (ActivationObserver, - KVCacheObserver) +from lmdeploy.lite.quantization.activation import ActivationObserver from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP from lmdeploy.lite.utils import (bimap_name_mod, collect_target_modules, concat_decoder_layer_outputs, @@ -27,8 +26,6 @@ class CalibrationContext(): 
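For reference, a minimal sketch of exercising the new Mixtral MoE AWQ path end to end; the keyword names below are assumed to mirror the documented CLI flags (--w-bits, --w-group-size, --batch-size, --work-dir), so treat this as an illustrative sketch rather than the exact API surface.

# Illustrative sketch: AWQ-quantize a Mixtral checkpoint. Router gates
# (block_sparse_moe.gate) and LoRA layers are skipped by skipped_module(),
# so only the attention and expert projections are packed to 4 bit.
from lmdeploy.lite.apis.auto_awq import auto_awq

auto_awq('mistralai/Mixtral-8x7B-Instruct-v0.1',
         calib_dataset='ptb',   # assumed keyword, mirroring --calib-dataset
         calib_samples=128,     # assumed keyword, mirroring --calib-samples
         w_bits=4,
         w_group_size=128,
         batch_size=1,
         work_dir='./mixtral-8x7b-instruct-awq')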
inp_obs_group = 'inputs' out_obs_group = 'outputs' - key_obs_group = 'keys' - value_obs_group = 'values' def __init__(self, model: nn.Module, @@ -75,7 +72,6 @@ def __init__(self, self._init_input_observers(self.name2fc) self._init_output_observers(self.name2norm) self._init_output_observers(self.name2fc) - self._init_kv_observers(self.name2layer) self.device = device @@ -102,14 +98,6 @@ def _init_output_observers(self, name2mod): obs = ActivationObserver(mod.weight.size(0)) obs.global_available(name, group=self.out_obs_group) - def _init_kv_observers(self, name2mod): - """Initialize KV observers for given modules.""" - for name in name2mod.keys(): - k_obs = KVCacheObserver(self.num_kv_heads, self.head_dim) - v_obs = KVCacheObserver(self.num_kv_heads, self.head_dim) - k_obs.global_available(name, group=self.key_obs_group) - v_obs.global_available(name, group=self.value_obs_group) - def _insert_input_observers(self): """Insert input observers into the target modules. @@ -221,27 +209,6 @@ def collect_outputs_stats(self): outputs_stats['absmean'][name] = obs.absmean_val return outputs_stats - def collect_kv_stats(self): - """Collect statistics (min, max, absmax values) of the observed keys - and values. - - Returns a tuple of two dictionaries with these collected stats. - """ - key_stats = {'max': {}, 'min': {}, 'absmax': {}} - obs_group = KVCacheObserver.find_group(self.key_obs_group) - for name, obs in obs_group.items(): - key_stats['max'][name] = obs.max_val - key_stats['min'][name] = obs.min_val - key_stats['absmax'][name] = obs.absmax_val - - value_stats = {'max': {}, 'min': {}, 'absmax': {}} - obs_group = KVCacheObserver.find_group(self.value_obs_group) - for name, obs in obs_group.items(): - value_stats['max'][name] = obs.max_val - value_stats['min'][name] = obs.min_val - value_stats['absmax'][name] = obs.absmax_val - return key_stats, value_stats - def export(self, out_dir): """Export the calibration statistics (inputs, outputs, keys and values) to specified directory. From 9f6ff9b2b63d4883338ee6f7ed2b03e7a932729e Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Wed, 13 Nov 2024 15:37:54 +0800 Subject: [PATCH 073/122] Check server input (#2719) * validate server input * remove unused * addl ToolMessage * remove check * remove check * remove check * update * update --- lmdeploy/model.py | 24 +++++++++++++++++++----- lmdeploy/serve/openai/protocol.py | 3 +-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 2b3a0a4e1d..1872502334 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -3,7 +3,7 @@ import json import uuid from abc import abstractmethod -from typing import List, Literal, Optional +from typing import List, Literal, Optional, Union from mmengine import Registry @@ -18,6 +18,20 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) +def get_text(content: Union[str, List[dict]]): + """Within the OpenAI API, the content field may be specified as either a + string or a list of ChatCompletionContentPartTextParam (defined in openai). + + When a list is provided, lmdeploy selects the first element to incorporate + into the chat template, as the manner in which OpenAI processes lists is + not explicitly defined. + """ + + if isinstance(content, str): + return content + return content[0]['text'] + + @dataclasses.dataclass class ChatTemplateConfig: """Parameters for chat template. 
@@ -219,7 +233,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): ret += f'{self.system}{self.meta_instruction}{self.eosys}' for message in messages: role = message['role'] - content = message['content'] + content = get_text(message['content']) ret += f'{box_map[role]}{content}{eox_map[role]}' if len(messages) and messages[-1]['role'] == 'assistant': return ret[:-len(eox_map['assistant'])] # prefix of response @@ -509,7 +523,7 @@ def messages2prompt(self, messages.insert(insert_index, tools_prompt) for message in messages: role = message['role'] - content = message['content'] + content = get_text(message['content']) if role == 'assistant' and message.get('tool_calls', None) is not None: for tool_call in message['tool_calls']: @@ -862,7 +876,7 @@ def messages2prompt(self, ret += f'{self.system}{self.knowledge}{self.tools}{tool_prompt}{self.eotools}{self.meta_instruction}{self.eosys}' for message in messages: role = message['role'] - content = message['content'] + content = get_text(message['content']) if role == 'assistant' and ('<|python_tag|>' in content or '' in content): ret += f'{box_map[role]}{content}<|eom_id|>' @@ -1038,7 +1052,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): count = 0 for message in messages: role = message['role'] - content = message['content'] + content = get_text(message['content']) if role == 'user': count += 1 ret += f'[Round {count}]\n\n' diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index d4bf8ed315..2b9d39c7b7 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -114,7 +114,7 @@ class ChatCompletionRequest(BaseModel): temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 tools: Optional[List[Tool]] = Field(default=None, examples=[None]) - tool_choice: Union[ToolChoice, Literal['auto', 'required','none']] = Field(default='auto', examples=['none']) # noqa + tool_choice: Union[ToolChoice, Literal['auto', 'required', 'none']] = Field(default='auto', examples=['none']) # noqa logprobs: Optional[bool] = False top_logprobs: Optional[int] = None n: Optional[int] = 1 @@ -242,7 +242,6 @@ class CompletionRequest(BaseModel): stream_options: Optional[StreamOptions] = Field(default=None, examples=[None]) top_p: Optional[float] = 1.0 - logprobs: Optional[int] = None echo: Optional[bool] = False presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 From 20544d3c9020d2000bf56fb33ee6e32bd017c8fc Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Wed, 13 Nov 2024 17:38:46 +0800 Subject: [PATCH 074/122] fix issue that mono-internvl failed to fallback pytorch engine (#2744) --- lmdeploy/turbomind/supported_models.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index f6772fddd5..bb3533254b 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -96,10 +96,12 @@ def _is_head_dim_supported(cfg): # glm-4v-9b not supported support_by_turbomind = False elif arch == 'InternVLChatModel': - support_by_turbomind = _is_head_dim_supported(cfg.llm_config) + llm_arch = cfg.llm_config.architectures[0] + support_by_turbomind = (llm_arch in SUPPORTED_ARCHS and + _is_head_dim_supported(cfg.llm_config)) elif arch == 'LlavaForConditionalGeneration': - sub_arch = cfg.text_config.architectures[0] - if sub_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: + llm_arch = cfg.text_config.architectures[0] + 
if llm_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: support_by_turbomind = _is_head_dim_supported( cfg.text_config) From 72503185fa121a2ce2305f7dca56f9136152db33 Mon Sep 17 00:00:00 2001 From: tangzhiyi11 Date: Wed, 13 Nov 2024 18:39:59 +0800 Subject: [PATCH 075/122] optimize dlinfer moe (#2741) --- lmdeploy/pytorch/backends/dlinfer/moe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index 90f6335ecb..6ada730fbe 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -19,9 +19,8 @@ def __init__(self, top_k: int, dim: int = -1): def forward(self, x: torch.Tensor): routing_weights, selected_experts = moe_gating_topk_softmax( - x, self.top_k) - return routing_weights.to(torch.float32), selected_experts.to( - torch.int64) + x.to(torch.float32), self.top_k) + return routing_weights, selected_experts class DlinferSoftmaxTopKBuilder(SoftmaxTopKBuilder): From a21def9e7722058689e8a4eeeb1c957c2a948b88 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Thu, 14 Nov 2024 11:34:12 +0800 Subject: [PATCH 076/122] Support chemvlm (#2738) * update to support chemvlm * update docs * add ut --- README.md | 1 + README_ja.md | 2 ++ README_zh-CN.md | 1 + docs/en/supported_models/supported_models.md | 2 ++ .../supported_models/supported_models.md | 2 ++ lmdeploy/model.py | 3 +++ lmdeploy/vl/model/internvl.py | 12 +++++++++--- tests/test_lmdeploy/test_model.py | 19 +++++++++++++++++++ 8 files changed, 39 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index efbb87a22e..5b6ad47bdf 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
  • InternVL-Chat (v1.1-v1.5)
  • InternVL2 (1B-76B)
  • Mono-InternVL (2B)
+  • ChemVLM (8B-26B)
  • MiniGeminiLlama (7B)
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
  • diff --git a/README_ja.md b/README_ja.md index df4647d868..bdd9ddb02d 100644 --- a/README_ja.md +++ b/README_ja.md @@ -152,6 +152,8 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • DeepSeek-VL (7B)
  • InternVL-Chat (v1.1-v1.5)
  • InternVL2 (1B-76B)
+  • Mono-InternVL (2B)
+  • ChemVLM (8B-26B)
  • MiniGeminiLlama (7B)
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
  • diff --git a/README_zh-CN.md b/README_zh-CN.md index 477fed6f79..550922d081 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -158,6 +158,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • InternVL-Chat (v1.1-v1.5)
  • InternVL2 (1B-76B)
  • Mono-InternVL (2B)
+  • ChemVLM (8B-26B)
  • MiniGeminiLlama (7B)
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
  • diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 371e4968e0..90ca90388b 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -30,6 +30,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | | InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | | InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | @@ -81,6 +82,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | | Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | | Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 7d59a59899..fecfdee200 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -30,6 +30,7 @@ | LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | | InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | | InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | @@ -81,6 +82,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | | Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | | Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 1872502334..db864a8344 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -565,6 +565,9 @@ def match(cls, model_path: str) -> Optional[str]: return None return 'internvl-internlm2' + if 'chemvlm' in path: + return 'internvl-internlm2' + @MODELS.register_module(name='internvl2-internlm2') class InternVL2InternLM2(InternLM2Chat7B): diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py index d85fe30939..fa67192f11 100644 --- a/lmdeploy/vl/model/internvl.py +++ b/lmdeploy/vl/model/internvl.py @@ -108,8 +108,15 @@ def build_model(self): # avoid randomness in inference. 
self.model = model.eval() self.config = config + dynamic_image_size = getattr(self.config, 'dynamic_image_size', False) + image_processor = None + try: + image_processor = CLIPImageProcessor.from_pretrained( + self.model_path) + except OSError: + pass - if getattr(self.config, 'dynamic_image_size', False): + if dynamic_image_size or image_processor is None: logger.info('using InternVL-Chat-V1-5 vision preprocess') MEAN = (0.485, 0.456, 0.406) STD = (0.229, 0.224, 0.225) @@ -126,8 +133,7 @@ def build_model(self): ]) self._forward_func = self._forward_v1_5 else: - self.image_processor = CLIPImageProcessor.from_pretrained( - self.model_path) + self.image_processor = image_processor self._forward_func = self._forward def _preprocess_v1_5(self, images: List[Image], params: List[Dict] = None): diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py index a38971e4d0..7e3e71793d 100644 --- a/tests/test_lmdeploy/test_model.py +++ b/tests/test_lmdeploy/test_model.py @@ -475,6 +475,25 @@ def test_internvl2(): assert res == expected +def test_chemvlm(): + deduced_name = best_match_model('AI4Chem/ChemVLM-8B') + + assert deduced_name == 'internvl-internlm2' + model = MODELS.get(deduced_name)() + messages = [{ + 'role': 'user', + 'content': 'who are you' + }, { + 'role': 'assistant', + 'content': 'I am an AI' + }] + expected = '<|im_start|>system\nYou are an AI assistant whose name is '\ + 'InternLM (书生·浦语).<|im_end|>\n<|im_start|>user\nwho are you'\ + '<|im_end|>\n<|im_start|>assistant\nI am an AI' + res = model.messages2prompt(messages) + assert res == expected + + def test_codegeex4(): model_path_and_name = 'THUDM/codegeex4-all-9b' deduced_name = best_match_model(model_path_and_name) From fd8906c1c4bc37a359b9677d0cbef694a23ab00e Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Thu, 14 Nov 2024 13:07:01 +0800 Subject: [PATCH 077/122] Support molmo in turbomind (#2716) * initial moe support * dynamic grouped gemm * benchmark * moe benchmark * moe sampling * split-k * refactor tuning * simplify * n-major weight * add `num` for `MatrixLayout` * packed rows * packed cols * dispatch for packed rows * w4a16 moe * refactor model loading * fix pytorch loader * refactor * dispatch w4a16 moe * fix loader * add comment * fix msvc build * fix msvc build * fix msvc build * fix ut * fix ut * fix p-lora * add all support arches * minor * fix lint * fix lint * fix lint * fix ut * bf16 support * minor * checkin molmo conversion * add chat template * refactor * fix lint * fix ut * Just for test: hardcode vocab_size * minor * minor * minor * fix inter_size config * load with non-standard filenames * fix loader * fix missing default param * defer the loading of misc weights for safetensors * add embedding_size * update * update * tmp * tmp * update molmo template * vision embedding * fix * update * fix * fix messages2prompt in templates * fix order of out_messages * fix * add user guide * update is_supported --------- Co-authored-by: Li Zhang --- docs/en/multi_modal/index.rst | 2 + docs/en/multi_modal/molmo.md | 92 +++++++++ docs/zh_cn/multi_modal/index.rst | 2 + docs/zh_cn/multi_modal/molmo.md | 92 +++++++++ lmdeploy/archs.py | 3 +- lmdeploy/model.py | 31 +++ lmdeploy/serve/vl_async_engine.py | 5 + lmdeploy/turbomind/deploy/config.py | 7 + .../turbomind/deploy/source_model/__init__.py | 1 + .../turbomind/deploy/source_model/molmo.py | 122 ++++++++++++ .../turbomind/deploy/target_model/base.py | 3 + lmdeploy/turbomind/supported_models.py | 8 +- lmdeploy/vl/model/builder.py | 10 +- 
lmdeploy/vl/model/molmo.py | 177 ++++++++++++++++++ lmdeploy/vl/templates.py | 80 ++++++++ src/turbomind/models/llama/LlamaWeight.cc | 15 +- src/turbomind/models/llama/LlamaWeight.h | 2 + src/turbomind/models/llama/llama_params.h | 1 + .../triton_backend/llama/LlamaTritonModel.cc | 8 + 19 files changed, 653 insertions(+), 8 deletions(-) create mode 100644 docs/en/multi_modal/molmo.md create mode 100644 docs/zh_cn/multi_modal/molmo.md create mode 100644 lmdeploy/turbomind/deploy/source_model/molmo.py create mode 100644 lmdeploy/vl/model/molmo.py diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst index 62f724070f..a68fe3da4f 100644 --- a/docs/en/multi_modal/index.rst +++ b/docs/en/multi_modal/index.rst @@ -12,3 +12,5 @@ Vision-Language Models minicpmv.md phi3.md mllama.md + qwen2_vl.md + molmo.md diff --git a/docs/en/multi_modal/molmo.md b/docs/en/multi_modal/molmo.md new file mode 100644 index 0000000000..dfff43dc64 --- /dev/null +++ b/docs/en/multi_modal/molmo.md @@ -0,0 +1,92 @@ +# Molmo + +LMDeploy supports the following molmo series of models, which are detailed in the table below: + +| Model | Size | Supported Inference Engine | +| :-------------: | :--: | :------------------------: | +| Molmo-7B-D-0924 | 7B | TurboMind | +| Molmo-72-0924 | 72B | TurboMind | + +The next chapter demonstrates how to deploy a molmo model using LMDeploy, with [Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924) as an example. + +## Installation + +Please install LMDeploy by following the [installation guide](../get_started/installation.md) + +## Offline inference + +The following sample code shows the basic usage of VLM pipeline. For detailed information, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('allenai/Molmo-7B-D-0924') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe((f'describe this image', image)) +print(response) +``` + +More examples are listed below: + +
    + + multi-image multi-round conversation, combined images + + +```python +from lmdeploy import pipeline, GenerationConfig + +pipe = pipeline('allenai/Molmo-7B-D-0924', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')), + dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(do_sample=False)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(do_sample=False)) +``` + +
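The pipeline also accepts a batch of prompts. A minimal sketch, assuming the list-of-tuples batch input used by LMDeploy's VLM pipeline applies to Molmo as well (the image URLs are only illustrative):

```python
from lmdeploy import pipeline
from lmdeploy.vl import load_image

pipe = pipeline('allenai/Molmo-7B-D-0924')

image_urls = [
    'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg',
    'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg'
]
# one (prompt, image) pair per request; responses are returned in the same order
prompts = [('describe this image', load_image(url)) for url in image_urls]
responses = pipe(prompts)
print([r.text for r in responses])
```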
    + +## Online serving + +You can launch the server by the `lmdeploy serve api_server` CLI: + +```shell +lmdeploy serve api_server allenai/Molmo-7B-D-0924 +``` + +You can also start the service using the docker image: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:latest \ + lmdeploy serve api_server allenai/Molmo-7B-D-0924 +``` + +If you find the following logs, it means the service launches successfully. + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +The arguments of `lmdeploy serve api_server` can be reviewed in detail by `lmdeploy serve api_server -h`. + +More information about `api_server` as well as how to access the service can be found from [here](api_server_vl.md) diff --git a/docs/zh_cn/multi_modal/index.rst b/docs/zh_cn/multi_modal/index.rst index 0942d8d31c..bd141ea90f 100644 --- a/docs/zh_cn/multi_modal/index.rst +++ b/docs/zh_cn/multi_modal/index.rst @@ -12,3 +12,5 @@ minicpmv.md phi3.md mllama.md + qwen2_vl.md + molmo.md diff --git a/docs/zh_cn/multi_modal/molmo.md b/docs/zh_cn/multi_modal/molmo.md new file mode 100644 index 0000000000..1dc8f8f79b --- /dev/null +++ b/docs/zh_cn/multi_modal/molmo.md @@ -0,0 +1,92 @@ +# Qwen2-VL + +LMDeploy 支持 Molmo 系列模型,具体如下: + +| Model | Size | Supported Inference Engine | +| :-------------: | :--: | :------------------------: | +| Molmo-7B-D-0924 | 7B | TurboMind | +| Molmo-72-0924 | 72B | TurboMind | + +本文将以[Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924) 为例,演示使用 LMDeploy 部署 Molmo 系列模型的方法 + +## 安装 + +请参考[安装文档](../get_started/installation.md)安装 LMDeploy。 + +## 离线推理 + +以下是使用 pipeline 进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('allenai/Molmo-7B-D-0924') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe((f'describe this image', image)) +print(response) +``` + +更多例子如下: + +
+
+  多图多轮对话
+
+
+```python
+from lmdeploy import pipeline, GenerationConfig
+
+pipe = pipeline('allenai/Molmo-7B-D-0924', log_level='INFO')
+messages = [
+    dict(role='user', content=[
+        dict(type='text', text='Describe the two images in detail.'),
+        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg')),
+        dict(type='image_url', image_url=dict(url='https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg'))
+    ])
+]
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+
+messages.append(dict(role='assistant', content=out.text))
+messages.append(dict(role='user', content='What are the similarities and differences between these two images.'))
+out = pipe(messages, gen_config=GenerationConfig(top_k=1))
+```
+
    + +## 在线服务 + +你可以通过 `lmdeploy serve api_server` CLI 工具启动服务: + +```shell +lmdeploy serve api_server Qwen/Qwen2-VL-2B-Instruct +``` + +也可以基于 docker image 启动服务: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:qwen2vl \ + lmdeploy serve api_server Qwen/Qwen2-VL-2B-Instruct +``` + +如果日志中有如下信息,就表明服务启动成功了。 + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +有关 `lmdeploy serve api_server` 的详细参数可以通过`lmdeploy serve api_server -h`查阅。 + +关于 `api_server` 更多的介绍,以及访问 `api_server` 的方法,请阅读[此处](api_server_vl.md) diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py index 8284c99741..ce5cbd98ff 100644 --- a/lmdeploy/archs.py +++ b/lmdeploy/archs.py @@ -121,7 +121,8 @@ def check_vl_llm(config: dict) -> bool: 'InternVLChatModel', 'MiniGeminiLlamaForCausalLM', 'MGMLlamaForCausalLM', 'MiniCPMV', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration', 'Phi3VForCausalLM', - 'Qwen2VLForConditionalGeneration', 'MllamaForConditionalGeneration' + 'Qwen2VLForConditionalGeneration', 'MllamaForConditionalGeneration', + 'MolmoForCausalLM' ]) if arch == 'QWenLMHeadModel' and 'visual' in config: return True diff --git a/lmdeploy/model.py b/lmdeploy/model.py index db864a8344..c9eb71c2c3 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -1747,6 +1747,37 @@ def match(cls, model_path: str) -> Optional[str]: return 'internvl-phi3' +@MODELS.register_module(name='molmo') +class Molmo(BaseChatTemplate): + + def __init__(self, + user=' User: ', + eoh='', + assistant=' Assistant:', + eoa='', + separator=' ', + stop_words=['<|endoftext|>'], + **kwargs): + super().__init__(user=user, + eoh=eoh, + assistant=assistant, + eoa=eoa, + separator=separator, + stop_words=stop_words, + **kwargs) + + @classmethod + def match(cls, model_path: str) -> Optional[str]: + """Return the model_name that was registered to MODELS. + + Args: + model_path (str): the model path used for matching. + """ + path = model_path.lower() + if 'molmo' in path: + return 'molmo' + + def best_match_model(query: str) -> Optional[str]: """Get the model that matches the query. 
diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py index fd0b0bb5e4..c293cd71c8 100644 --- a/lmdeploy/serve/vl_async_engine.py +++ b/lmdeploy/serve/vl_async_engine.py @@ -64,6 +64,7 @@ async def _get_prompt_input(self, results = {} input_ids = [] from lmdeploy.vl.templates import (MllamaTempateWrapper, + MolmoChatTemplateWrapper, Qwen2VLChatTemplateWrapper) ranges = None grid_thws = None @@ -99,6 +100,10 @@ async def _get_prompt_input(self, results['cross_attention_states'] = features[0] return results + if isinstance(self.vl_prompt_template, + MolmoChatTemplateWrapper): + return features[0] + features = [x.cpu().numpy() for x in features] input_ids = [] begins = [] diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index a535b0d4c1..c724b085a0 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -35,6 +35,13 @@ class ModelConfig: kv_head_num: int = None hidden_units: int = None vocab_size: int = None + # Turbomind used to assume token_embedding and lm_head has the same size + # at vocab dim, i.e. `vocab_size` + # But in molmo, embedding.shape is [vocab_size + 128, hidden_units] + # while lm_head shape is [hidden_units, vocab_size]. + # Therefore, we add a new attr "embedding_size" to represent the vocab dim + # of token_embedding + embedding_size: int = 0 num_layer: int = None inter_size: int = None norm_eps: float = None diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index b1da698e2e..de16bdc0a0 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -9,5 +9,6 @@ from .meta_llama import MetaLlamaModel # noqa: F401 from .minicpmv import MiniCPMVModel # noqa: F401 from .mixtral import MixtralModel # noqa: F401 +from .molmo import MolmoModel # noqa: F401 from .qwen import QwenModel # noqa: F401 from .xcomposer2 import Xcomposer2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/molmo.py b/lmdeploy/turbomind/deploy/source_model/molmo.py new file mode 100644 index 0000000000..541e201046 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/molmo.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp + +import torch + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class MolmoReader(LlamaReader): + attn_layer_prefix = 'model.transformer.blocks' + attn_layer_patten = r'model.transformer.blocks.([0-9]+).' + norm_weight_key = 'model.transformer.ln_f.weight' + output_weight_key = 'model.transformer.ff_out.weight' + + # In molmo, names of attention parameters are "att_proj.bias", + # "att_proj.weight", "attn_norm.weight", "attn_out.weight", and names + # of ffn parameters are "ff_norm", "ff_out", "ff_proj", so we + # make the patterns are r'att' and r'ffn_', respectively. 
+ attn_pattern = r'att' + ffn_pattern = r'ff_' + + def tok_embeddings(self): + embed1 = self.params.get('model.transformer.wte.embedding', None) + embed2 = self.params.get('model.transformer.wte.new_embedding', None) + if embed1 is not None and embed2 is not None: + return torch.cat((embed1, embed2), dim=0) + else: + assert embed1 is None and embed2 is None + return None + + def attn_norm(self, i: int): + """Get attn norm for layer i.""" + return self.params[f'{self.attn_layer_prefix}.{i}.attn_norm.weight'] + + def _attn(self, i: int, kind: str): + """Get q, k, v, o kind(weight, bias, qweight) for layer i. + + Args: + i (int): layer id + kind (str): can be one of ["weight", "bias", "qweight"] + """ + q, k, v = (None, ) * 3 + hidden_size = self.model_cfg['hidden_size'] + head_num = self.model_cfg['num_attention_heads'] + kv_head_num = self.model_cfg['num_key_value_heads'] + head_dim = hidden_size // head_num + assert head_dim == 128 + fused_dims = (hidden_size, kv_head_num * head_dim, + kv_head_num * head_dim) + qkv = self.params.get(f'{self.attn_layer_prefix}.{i}.att_proj.{kind}') + qkv = self.transform(qkv, kind) + if qkv is not None: + q, k, v = qkv.split(fused_dims, dim=0) + o = self.params.get(f'{self.attn_layer_prefix}.{i}.attn_out.{kind}') + o = self.transform(o, kind) + if o is None: # handle the case when qkv has bias but o doesn't + o = torch.zeros_like(q) + return (q, k, v, o) + + def _ffn(self, i: int, kind: str): + """Get ffn kind(weight, qweight) for layer i.""" + up_and_gate = self.params[ + f'{self.attn_layer_prefix}.{i}.ff_proj.{kind}'] + up_and_gate = self.transform(up_and_gate, kind) + gate, up = up_and_gate.chunk(2, dim=0) + down = self.params[f'{self.attn_layer_prefix}.{i}.ff_out.{kind}'] + down = self.transform(down, kind) + return (up, down, gate) + + def ffn_norm(self, i: int): + """Get ffn norm for layer i.""" + return self.params[f'{self.attn_layer_prefix}.{i}.ff_norm.weight'] + + +@INPUT_MODELS.register_module(name='molmo') +class MolmoModel(LlamaModel): + + Reader = MolmoReader + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs): + super().__init__(model_path, tokenizer_path, **kwargs) + config_path = osp.join(self.model_path, 'config.json') + with open(config_path) as f: + self.config = json.load(f) + + def tokenizer_info(self): + + n_words = 152064 + bos_id = 151643 + eos_id = 151643 + return n_words, bos_id, eos_id + + def model_info(self): + config = self.config + num_layer = config['num_hidden_layers'] + norm_eps = config['layer_norm_eps'] + attn_head_num = config['num_attention_heads'] + kv_head_num = config['num_key_value_heads'] + hidden_units = config['hidden_size'] + rope_theta = config['rope_theta'] + max_position_embeddings = config['max_position_embeddings'] + vocab_size = config['vocab_size'] + # https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L2041 + additional_vocab_size = 128 + inter_size = config['intermediate_size'] // 2 + attn_bias = config['qkv_bias'] + return dict( + num_layer=num_layer, + norm_eps=norm_eps, + head_num=attn_head_num, + kv_head_num=kv_head_num, + hidden_units=hidden_units, + attn_bias=int(attn_bias), + inter_size=inter_size, + vocab_size=vocab_size, + # https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/modeling_molmo.py#L564 + embedding_size=vocab_size + additional_vocab_size, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + ) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 
abd570cd00..09699ade09 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -92,6 +92,9 @@ def update_model_config(self): final_cfg = config_to_dict(self.model_config) final_cfg.update(dict(start_id=bos_id, end_id=eos_id)) final_cfg.update(self.input_model_info) + if 'embedding_size' not in self.input_model_info.keys(): + final_cfg.update( + embedding_size=self.input_model_info['vocab_size']) self.model_config = config_from_dict(ModelConfig, final_cfg) diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index bb3533254b..e66da22df0 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -42,7 +42,9 @@ ChatGLMModel='glm4', ChatGLMForConditionalGeneration='glm4', # mixtral - MixtralForCausalLM='mixtral') + MixtralForCausalLM='mixtral', + MolmoForCausalLM='molmo', +) def is_supported(model_path: str): @@ -104,5 +106,9 @@ def _is_head_dim_supported(cfg): if llm_arch in ['Qwen2ForCausalLM', 'LlamaForCausalLM']: support_by_turbomind = _is_head_dim_supported( cfg.text_config) + elif arch == 'MolmoForCausalLM': + kv_heads = cfg.num_key_value_heads + # TM hasn't supported allenai/Molmo-7B-O-0924 yet + support_by_turbomind = kv_heads is not None return support_by_turbomind diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/vl/model/builder.py index 9e71f7d1c0..2401b42259 100644 --- a/lmdeploy/vl/model/builder.py +++ b/lmdeploy/vl/model/builder.py @@ -18,6 +18,7 @@ from .mini_gemeni import MiniGeminiVisionModel # noqa F401 from .minicpmv import MiniCPMVModel # noqa F401 from .mllama import MllamaVLModel # noqa F401 +from .molmo import MolmoVisionModel # noqa F401 from .phi3_vision import Phi3VisionModel # noqa F401 from .qwen import QwenVisionModel # noqa F401 from .qwen2 import Qwen2VLModel # noqa F401 @@ -31,7 +32,14 @@ def load_vl_model(model_path: str, with_llm: bool = False, backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None): - """load visual model.""" + """load visual model. + + Args: + model_path(str): the path or repo_id from model hub of the model + with_llm(bool): whether to remove the LLM part from the model. + When it is False, it means removing LLM part + backend_config: the config of the inference engine + """ if not os.path.exists(model_path): revision = getattr(backend_config, 'revision', None) download_dir = getattr(backend_config, 'download_dir', None) diff --git a/lmdeploy/vl/model/molmo.py b/lmdeploy/vl/model/molmo.py new file mode 100644 index 0000000000..9abae7a309 --- /dev/null +++ b/lmdeploy/vl/model/molmo.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from typing import Dict, List + +import torch +from PIL.Image import Image +from transformers import AutoModelForCausalLM, AutoProcessor + +from lmdeploy.utils import get_logger +from lmdeploy.vl.constants import IMAGE_TOKEN +from lmdeploy.vl.model.base import VISION_MODELS, VisonModel +from lmdeploy.vl.model.utils import disable_logging + +logger = get_logger('lmdeploy') + + +@VISION_MODELS.register_module() +class MolmoVisionModel(VisonModel): + """molmo's vision model.""" + + _arch = 'MolmoForCausalLM' + + def build_model(self): + """Load model.""" + from accelerate import init_empty_weights, load_checkpoint_and_dispatch + with init_empty_weights(): + config = self.hf_config + model = AutoModelForCausalLM.from_config(config, + trust_remote_code=True) + if not self.with_llm: + # Remove nn modules other than embedding from the LLM model + for key in ['emb_drop', 'ln_f', 'blocks', 'ff_out']: + del model.model.transformer[key] + self.token_embedding = model.model.transformer.wte + else: + self.vl_model = model + + with disable_logging(): + load_checkpoint_and_dispatch( + model=model, + checkpoint=self.model_path, + device_map='auto' if not self.with_llm else {'': 'cpu'}, + max_memory=self.max_memory, + no_split_module_classes=[ + 'ResidualAttentionBlock', 'Embedding' + ]) + + # We need eval mode to freeze the weights in model, thus, + # avoid randomness in inference. + self.model = model.eval() + self.config = config + + self.processor = AutoProcessor.from_pretrained(self.model_path, + trust_remote_code=True, + torch_dtype='auto', + device_map='auto') + + @torch.no_grad() + def forward(self, + images: List[Image], + params: List[Dict] = None) -> List[Dict]: + """forward the model with given input. + + Args: + images (List): [None] it is not used + params (List): the inputs after precessing GPT4V messages in + `MolmoChatTemplateWrapper`. Its format is like the following: + [[ + {'role': 'user', 'content': 'user prompt'}, + {'role': 'asssistant', 'content': 'assistant prompt'}, + {'role': 'user', 'content': 'user prompt', 'images': [PIL image list]}, + ... + ]] + """ # noqa + + messages = params[0] + assert isinstance(messages, List) + # append an assistant message to `messages` + messages.append(dict(role='assistant', content='')) + # results is a list of tuple(input_ids, embeddings) + results = [] + # the concat prompt. It is not used during inference but to adhere the + # interface definition of `_get_prompt_input` in `class VLAsyncEngine` + prompts = '' + # Prepend BOS + # qwen2 and olmo do not have a BOS, and instead use EOS as a generic + # separator token. + bos = (self.processor.tokenizer.bos_token_id + or self.processor.tokenizer.eos_token_id) + results.append(([bos], None)) + for i, message in enumerate(messages): + if 'images' in message.keys(): + prompts += ' User: ' + (IMAGE_TOKEN + '\n') * len( + message['images']) + message['content'] + prompt = f' User: {message["content"]}' + tokens = self.processor.tokenizer.encode( + prompt, add_special_tokens=False) + # preprocess images. 
The output is a dict + inputs = self.processor.process(images=message['images'], + tokens=tokens) + inputs = { + k: v.to(self.model.device).unsqueeze(0) + for k, v in inputs.items() + } + input_ids = inputs['input_ids'] + # remove the bos from input_ids which is prepended by molmo's + # processor + input_ids = input_ids[:, 1:] + images = inputs[ + 'images'] # (batch_size, num_image, num_patch, d_model) + image_input_idx = inputs[ + 'image_input_idx'] # (batch_size, num_image, num_patch) + image_masks = inputs['image_masks'] + batch_size, seq_len = input_ids.size() + assert batch_size == 1 + + # Get embeddings of input. + if input_ids is not None: + input_ids = input_ids * (input_ids != -1).to( + input_ids.dtype) + embeddings = self.model.model.transformer.wte(input_ids) + image_features, _ = self.model.model.vision_backbone( + images, image_masks) + num_image, num_patch = image_features.shape[1:3] + assert image_input_idx.shape == (batch_size, num_image, + num_patch) + + # insert the image feature into the embedding. + image_features = image_features.view(batch_size, + num_image * num_patch, -1) + image_input_idx = image_input_idx.view(batch_size, + num_image * num_patch) + + valid = image_input_idx >= 0 + batch_idx = torch.arange(batch_size, device=embeddings.device) + batch_idx = torch.tile(batch_idx[:, None], + [1, image_features.shape[1]]) + image_features = image_features.to(embeddings.device) + embeddings[batch_idx[valid], + image_input_idx[valid]] += image_features[valid] + assert embeddings.shape[:2] == (batch_size, seq_len) + results.append((input_ids.flatten().tolist(), embeddings)) + else: + role = message['role'] + content = message['content'] + assert isinstance(content, str) + prompt = '' + if role == 'user': + prompt = f' User: {content}' + elif role == 'assistant': + prompt = f' Assistant:{content}' + else: + assert 0, f'molmo does not support role {role}, message is {message}' # noqa + input_ids = self.processor.tokenizer.encode( + prompt, add_special_tokens=False) + results.append((input_ids, None)) + prompts += prompt + + # concat input_ids from results, calculate the range in the input_ids + # where embeddings will be copied to + input_ids = [] + input_embeddings = [] + input_embedding_ranges = [] + start = 0 + for _input_ids, _embeddings in results: + if _embeddings is not None: + input_embeddings.append(_embeddings.cpu()) + end = start + len(_input_ids) + input_embedding_ranges.append((start, end)) + input_ids += _input_ids + start += len(_input_ids) + return [ + dict(prompt=prompts, + input_ids=input_ids, + input_embeddings=input_embeddings, + input_embedding_ranges=input_embedding_ranges) + ] diff --git a/lmdeploy/vl/templates.py b/lmdeploy/vl/templates.py index 45e457ad2c..cdf398868a 100644 --- a/lmdeploy/vl/templates.py +++ b/lmdeploy/vl/templates.py @@ -428,6 +428,84 @@ class GLM4VChatTemplateWrapper(VLChatTemplateWrapper): pass +class MolmoChatTemplateWrapper(VLChatTemplateWrapper): + + async def async_collect_pil_images( + self, messages: List[Dict]) -> List[Tuple[PIL.Image.Image, Dict]]: + """collect images from messages. 
+ + Args: + messages (List[Dict]): a user request of GPT4V message format + """ + if isinstance(messages, Dict): + messages = [messages] + assert isinstance(messages, List) + + out_messages = [None] * len(messages) + + def _inner_call(i, in_messages, out_messages): + role = in_messages[i]['role'] + content = in_messages[i]['content'] + if role != 'user' or isinstance(content, str): + # means message is user's prompt input or assistant's prompt, + # returning it directory + out_messages[i] = in_messages[i] + return + # the role is a user and the content is a list + assert isinstance(content, List) + message = dict(role=role, content='', images=[]) + for item in content: + # 'image_url': means url or local path to image. + # 'image_data': means PIL.Image.Image object. + if item['type'] == 'image_url': + try: + image = load_image(item['image_url']['url']) + message['images'].append(image) + except KeyError: + logger.error(f'invalid format {message}') + elif item['type'] == 'image_data': + try: + image = load_image(item['image_data']['data']) + message['images'].append(image) + except KeyError: + logger.error(f'invalid format {message}') + elif item['type'] == 'text': + message['content'] = item['text'] + else: + logger.error(f'unexpected content type {message}') + out_messages[i] = message + + await asyncio.gather(*[ + asyncio.get_event_loop().run_in_executor(None, _inner_call, i, + messages, out_messages) + for i in range(len(messages)) + ]) + return [(None, out_messages)] + + def messages2prompt(self, messages, sequence_start=True, **kwargs) -> str: + """Return a placeholder "IMAGE_TOKEN" so that + `vl_asyn_engine._get_prompt_input` can know that it.""" + if isinstance(messages, str): + return self.chat_template.messages2prompt(messages, sequence_start) + else: + _messages = [] + for message in messages: + role, content = message['role'], message['content'] + if role != 'user' or isinstance(content, str): + _messages.append(message) + continue + for item in content: + item_type = item['type'] + if item_type in ['image_url', 'image_data']: + # Return the image placeholder so that + # `vl_asyn_engine._get_prompt_input` can know that the + # request contains images + return IMAGE_TOKEN + _messages.append(dict(role=role, content=item[item_type])) + return self.chat_template.messages2prompt(_messages, + sequence_start) + + def get_vl_prompt_template(model_path: str, chat_template: BaseModel, model_name: str) -> VLChatTemplateWrapper: """get vision language prompt template.""" @@ -467,4 +545,6 @@ def get_vl_prompt_template(model_path: str, chat_template: BaseModel, return GLM4VChatTemplateWrapper(chat_template) elif arch == 'Qwen2VLForConditionalGeneration': return Qwen2VLChatTemplateWrapper(chat_template) + elif arch == 'MolmoForCausalLM': + return MolmoChatTemplateWrapper(chat_template) raise ValueError(f'unsupported vl_prompt_template with arch {arch}') diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 1ac2d82dd9..9d62042d62 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -32,6 +32,7 @@ LlamaWeight::LlamaWeight(size_t head_num, size_t hidden_units, size_t inter_size, size_t vocab_size, + size_t embedding_size, size_t num_layer, bool attn_bias, WeightType weight_type, @@ -44,16 +45,20 @@ LlamaWeight::LlamaWeight(size_t head_num, inter_size_(inter_size), vocab_size_(vocab_size), vocab_size_padded_(vocab_size), + embedding_size_(embedding_size), num_layer_(num_layer), 
weight_type_(weight_type), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank) { if (vocab_size_padded_ % tensor_para_size_ != 0) { - vocab_size_padded_ = (vocab_size_padded_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; + vocab_size_padded_ = (vocab_size_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; TM_LOG_WARNING("pad vocab size from %d to %d", vocab_size_, vocab_size_padded_); } - + if (embedding_size_ % tensor_para_size_ != 0) { + embedding_size_ = (embedding_size_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; + TM_LOG_WARNING("pad embed size from %d to %d", embedding_size_, embedding_size_); + } FT_CHECK(hidden_units_ % tensor_para_size_ == 0); decoder_layer_weights.reserve(num_layer_); @@ -96,7 +101,7 @@ template void LlamaWeight::mallocWeights() { FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_padded_ * hidden_units_ / tensor_para_size_); + deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_); deviceMalloc((T**)&output_norm_weight, hidden_units_); deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_); } @@ -111,7 +116,7 @@ void LlamaWeight::loadModel(std::string dir_path) dir_path += '/'; loadWeightFromBin((T*)pre_decoder_embedding_table, - {vocab_size_padded_ * hidden_units_ / tensor_para_size_}, + {embedding_size_ * hidden_units_ / tensor_para_size_}, dir_path + "tok_embeddings." + std::to_string(tensor_para_rank_) + ".weight", model_file_type); @@ -135,7 +140,7 @@ TensorMap LlamaWeight::getParams() output.insert("tok_embeddings." + std::to_string(tensor_para_rank_) + ".weight", Tensor{MEMORY_GPU, getTensorType(), - {vocab_size_padded_ * hidden_units_ / tensor_para_size_ * sizeof(T)}, + {embedding_size_ * hidden_units_ / tensor_para_size_ * sizeof(T)}, pre_decoder_embedding_table}); output.insert("norm.weight", diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index c04bf6c5a6..c30e753565 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -35,6 +35,7 @@ struct LlamaWeight { size_t hidden_units, size_t inter_size, size_t vocab_size, + size_t embedding_size, size_t num_layer, bool attn_bias, WeightType weight_type, @@ -67,6 +68,7 @@ struct LlamaWeight { size_t inter_size_; size_t vocab_size_; size_t vocab_size_padded_; + size_t embedding_size_; size_t num_layer_; WeightType weight_type_; size_t tensor_para_size_; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 2ea63f0410..e6b9d690ae 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -18,6 +18,7 @@ struct ModelParam { size_t layer_num; size_t inter_size; size_t vocab_size; + size_t embedding_size; float norm_eps; int quant_policy; // diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 38552be0cf..2deca46380 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -133,6 +133,12 @@ void LlamaTritonModel::handleMissingParams() (int)model_param_.kv_head_num); } + if (model_param_.embedding_size == 0) { + model_param_.embedding_size = model_param_.vocab_size; + TM_LOG_WARNING("[LlamaTritonModel] `embedding_size` is not set, default 
to `vocab_size` (%d).", + (int)model_param_.vocab_size); + } + if (!attn_param_.max_position_embeddings) { attn_param_.max_position_embeddings = 2048; TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to %d.", @@ -252,6 +258,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.layer_num = model_reader["num_layer"].as(); model_param_.inter_size = model_reader["inter_size"].as(); model_param_.vocab_size = model_reader["vocab_size"].as(); + model_param_.embedding_size = model_reader["embedding_size"].as(); model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); @@ -417,6 +424,7 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) model_param_.hidden_units, model_param_.inter_size, model_param_.vocab_size, + model_param_.embedding_size, model_param_.layer_num, attn_bias_, weight_type_, From 59c1c63b992eb332f6408554f1d6de146a6f7733 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Thu, 14 Nov 2024 13:12:39 +0800 Subject: [PATCH 078/122] Update ascend readme (#2756) * Update get_started.md of ascend * Update get_started.md of ascend --- docs/en/get_started/ascend/get_started.md | 4 ++-- docs/zh_cn/get_started/ascend/get_started.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index a5400ed64d..9e963b3795 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -49,7 +49,7 @@ For more information about running the Docker client on Ascend devices, please r ## Offline batch inference > \[!TIP\] -> Graph mode has been supported on Atlas 800T A2. Currently, LLaMa3-8B/LLaMa2-7B/Qwen2-7B are tested on graph mode. +> Graph mode has been supported on Atlas 800T A2. > Users can set `eager_mode=False` to enable graph mode, or, set `eager_mode=True` to disable graph mode. > (Please source `/usr/local/Ascend/nnal/atb/set_env.sh` before enabling graph mode) @@ -86,7 +86,7 @@ if __name__ == "__main__": ## Online serving > \[!TIP\] -> Graph mode has been supported on Atlas 800T A2. Currently, InternLM2-7B/LLaMa2-7B/Qwen2-7B are tested on graph mode. +> Graph mode has been supported on Atlas 800T A2. > Graph mode is default enabled in online serving. Users can add `--eager-mode` to disable graph mode. 
> (Please source `/usr/local/Ascend/nnal/atb/set_env.sh` before enabling graph mode) diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md index e00c1e173a..046aea756b 100644 --- a/docs/zh_cn/get_started/ascend/get_started.md +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -49,7 +49,7 @@ docker run -e ASCEND_VISIBLE_DEVICES=0 --rm --name lmdeploy -t lmdeploy-aarch64- ## 离线批处理 > \[!TIP\] -> 图模式已经支持了Atlas 800T A2。目前,单卡下的LLaMa3-8B/LLaMa2-7B/Qwen2-7B已经通过测试。用户可以设定`eager_mode=False`来开启图模式,或者设定`eager_mode=True`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) +> 图模式已经支持了Atlas 800T A2。用户可以设定`eager_mode=False`来开启图模式,或者设定`eager_mode=True`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) ### LLM 推理 @@ -84,7 +84,7 @@ if __name__ == "__main__": ## 在线服务 > \[!TIP\] -> 图模式已经支持Atlas 800T A2。目前,单卡下的InternLM2-7B/LLaMa2-7B/Qwen2-7B已经通过测试。 +> 图模式已经支持Atlas 800T A2。 > 在线服务时,图模式默认开启,用户可以添加`--eager-mode`来关闭图模式。(启动图模式需要事先source `/usr/local/Ascend/nnal/atb/set_env.sh`) ### LLM 模型服务 From 8e0076a059cd27d1ba1deb2801af2c3668c563d4 Mon Sep 17 00:00:00 2001 From: tangzhiyi11 Date: Thu, 14 Nov 2024 14:29:59 +0800 Subject: [PATCH 079/122] feat: support multi cards in ascend graph mode (#2755) * support multi cards in ascend graph mode * update warning info * update warning info --- .../backends/dlinfer/ascend/graph_runner.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py index 7dbb86d4b6..b69cb1dca5 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py @@ -22,7 +22,6 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, super().__init__(model, model_config, cache_config, backend_config, device) - self.supported_model = ['Llama3-8B', 'Llama2-7B', 'Qwen2-7B'] self.enable_graph = self.check_enable_graph() if self.enable_graph: import dlinfer.graph @@ -39,26 +38,23 @@ def check_enable_graph(self): # eager_mode if self.backend_config.eager_mode: return False - # tp - if torch.distributed.is_initialized(): - warnings.warn( - "Graph mode of device_type 'ascend' only supports tp=1 " - 'for now, fallback to eager mode', RuntimeWarning) - return False warnings.warn( '\n\n' - '**********************************************************\n' - ' The following models were tested in graph mode of\n' - " device_type 'ascend' when tp=1:\n" - f" {', '.join(self.supported_model)}\n" - ' Other LLaMa-like models may work in graph mode, please\n' - ' check the result yourself!\n' - ' If graph mode does not work correctly with your model,\n' - ' please use eager mode instead.\n' - '**********************************************************\n\n', + '************************************************************\n' + ' Graph mode is an experimental feature. 
We currently\n' + ' support both dense and Mixture of Experts (MoE) models\n' + ' with bf16 and fp16 data types.\n' + ' If graph mode does not function correctly with your model,\n' + ' please consider using eager mode as an alternative.\n' + '************************************************************\n\n', RuntimeWarning) + # tp + if torch.distributed.is_initialized(): + torch._inductor.config.compile_threads = 1 + return True + return True def patch_kernels_custom_op(self): From 21f2866e54a0acd3b7b241f4e44ba6337aac2025 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Fri, 15 Nov 2024 00:42:28 +0800 Subject: [PATCH 080/122] Remove use_fast=True when loading tokenizer for lite auto_awq (#2758) --- lmdeploy/lite/apis/calibrate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 65ecd765c7..cd5178793d 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -236,7 +236,6 @@ def calibrate(model: str, if model_type == 'llm': # Load tokenizer and configuration tokenizer = AutoTokenizer.from_pretrained(model, - use_fast=False, trust_remote_code=True) model = load_hf_from_pretrained(model, From 9ecc44abeb99e672c17959b4543e041c14f2221c Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Fri, 15 Nov 2024 19:19:23 +0800 Subject: [PATCH 081/122] set wrong head_dim for mistral-nemo (#2761) --- lmdeploy/turbomind/deploy/source_model/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py index a8aa51b144..0c702d6588 100644 --- a/lmdeploy/turbomind/deploy/source_model/llama.py +++ b/lmdeploy/turbomind/deploy/source_model/llama.py @@ -153,6 +153,7 @@ def model_info(self): max_position_embeddings = int( model_arg.get('max_position_embeddings', 0)) rope_scaling = model_arg.get('rope_scaling', None) + head_dim = model_arg.get('head_dim', hidden_units // attn_head_num) scaling_factor = 0.0 use_dynamic_ntk = 0 scaling_type = '' @@ -189,7 +190,7 @@ def model_info(self): beta_slow = rope_scaling.get('beta_slow', 1.0) return dict( - size_per_head=hidden_units // attn_head_num, + size_per_head=head_dim, rotary_embedding=hidden_units // attn_head_num, num_layer=num_layer, norm_eps=norm_eps, From 0c80baa001e79d0b7d182b8a670190801d2d8d5b Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Sat, 16 Nov 2024 12:29:24 +0800 Subject: [PATCH 082/122] bump version to v0.6.3 (#2754) * bump version to v0.6.3 * update supported models --- README.md | 1 + README_ja.md | 1 + README_zh-CN.md | 1 + docs/en/get_started/installation.md | 2 +- docs/en/multi_modal/vl_pipeline.md | 16 +++------------- docs/en/supported_models/supported_models.md | 1 + docs/zh_cn/get_started/installation.md | 2 +- docs/zh_cn/multi_modal/vl_pipeline.md | 16 +++------------- docs/zh_cn/supported_models/supported_models.md | 1 + lmdeploy/version.py | 2 +- 10 files changed, 14 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 5b6ad47bdf..d160338aa6 100644 --- a/README.md +++ b/README.md @@ -167,6 +167,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
  • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
  • Llama3.2-vision (11B, 90B)
+  • Molmo (7B-D,72B)
  • diff --git a/README_ja.md b/README_ja.md index bdd9ddb02d..fda176229e 100644 --- a/README_ja.md +++ b/README_ja.md @@ -163,6 +163,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
  • Llama3.2-vision (11B, 90B)
+  • Molmo (7B-D,72B)
  • diff --git a/README_zh-CN.md b/README_zh-CN.md index 550922d081..6c24b2e500 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -168,6 +168,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
  • Llama3.2-vision (11B, 90B)
+  • Molmo (7B-D,72B)
  • diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md index b7d03b28a6..b3e8bb8abd 100644 --- a/docs/en/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by: ```shell -export LMDEPLOY_VERSION=0.6.2 +export LMDEPLOY_VERSION=0.6.3 export PYTHON_VERSION=38 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/docs/en/multi_modal/vl_pipeline.md b/docs/en/multi_modal/vl_pipeline.md index 4881b99071..9632c9e6df 100644 --- a/docs/en/multi_modal/vl_pipeline.md +++ b/docs/en/multi_modal/vl_pipeline.md @@ -2,24 +2,14 @@ LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference [pipeline](../llm/pipeline.md). -Currently, it supports the following models. - -- [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat) -- LLaVA series: [v1.5](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) -- [Yi-VL](https://huggingface.co/01-ai/Yi-VL-6B) -- [DeepSeek-VL](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat) -- [InternVL](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) -- [Mono-InternVL](https://huggingface.co/OpenGVLab/Mono-InternVL-2B) -- [MGM](https://huggingface.co/YanweiLi/MGM-7B) -- [XComposer](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) -- [CogVLM](https://github.com/InternLM/lmdeploy/tree/main/docs/en/multi_modal/cogvlm.md) - -We genuinely invite the community to contribute new VLM support to LMDeploy. Your involvement is truly appreciated. +The supported models are listed [here](../supported_models/supported_models.md). We genuinely invite the community to contribute new VLM support to LMDeploy. Your involvement is truly appreciated. This article showcases the VLM pipeline using the [liuhaotian/llava-v1.6-vicuna-7b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b) model as a case study. You'll learn about the simplest ways to leverage the pipeline and how to gradually unlock more advanced features by adjusting engine parameters and generation arguments, such as tensor parallelism, context window sizing, random sampling, and chat template customization. Moreover, we will provide practical inference examples tailored to scenarios with multiple images, batch prompts etc. +Using the pipeline interface to infer other VLM models is similar, with the main difference being the configuration and installation dependencies of the models. You can read [here](https://lmdeploy.readthedocs.io/en/latest/multi_modal/index.html) for environment installation and configuration methods for different models. 
+ ## A 'Hello, world' example ```python diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 90ca90388b..a122f10ec8 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -36,6 +36,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | | GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | | CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | "-" means not verified yet. diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md index 3108d64815..12562c51d5 100644 --- a/docs/zh_cn/get_started/installation.md +++ b/docs/zh_cn/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy 默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy: ```shell -export LMDEPLOY_VERSION=0.6.2 +export LMDEPLOY_VERSION=0.6.3 export PYTHON_VERSION=38 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/docs/zh_cn/multi_modal/vl_pipeline.md b/docs/zh_cn/multi_modal/vl_pipeline.md index 570598311a..35f647e36c 100644 --- a/docs/zh_cn/multi_modal/vl_pipeline.md +++ b/docs/zh_cn/multi_modal/vl_pipeline.md @@ -2,24 +2,14 @@ LMDeploy 把视觉-语言模型(VLM)复杂的推理过程,抽象为简单好用的 pipeline。它的用法与大语言模型(LLM)推理 [pipeline](../llm/pipeline.md) 类似。 -目前,VLM pipeline 支持以下模型: - -- [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat) -- LLaVA series: [v1.5](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) -- [Yi-VL](https://huggingface.co/01-ai/Yi-VL-6B) -- [DeepSeek-VL](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat) -- [InternVL](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) -- [Mono-InternVL](https://huggingface.co/OpenGVLab/Mono-InternVL-2B) -- [MGM](https://huggingface.co/YanweiLi/MGM-7B) -- [XComposer](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) -- [CogVLM](https://github.com/InternLM/lmdeploy/tree/main/docs/zh_cn/multi_modal/cogvlm.md) - -我们诚挚邀请社区在 LMDeploy 中添加更多 VLM 模型的支持。 +在[这个列表中](../supported_models/supported_models.md),你可以查阅每个推理引擎支持的 VLM 模型。我们诚挚邀请社区在 LMDeploy 中添加更多 VLM 模型。 本文将以 [liuhaotian/llava-v1.6-vicuna-7b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b) 模型为例,展示 VLM pipeline 的用法。你将了解它的最基础用法,以及如何通过调整引擎参数和生成条件来逐步解锁更多高级特性,如张量并行,上下文窗口大小调整,随机采样,以及对话模板的定制。 此外,我们还提供针对多图、批量提示词等场景的实际推理示例。 +使用 pipeline 接口推理其他 VLM 模型,大同小异,主要区别在于模型依赖的配置和安装。你可以阅读[此处](https://lmdeploy.readthedocs.io/zh-cn/latest/multi_modal/),查看不同模型的环境安装和配置方式 + ## "Hello, world" 示例 ```python diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index fecfdee200..f3ffd4311d 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -36,6 +36,7 @@ | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | | GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | | CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | “-” 表示还没有验证。 diff --git a/lmdeploy/version.py b/lmdeploy/version.py index b9f76b5761..d9f4307a78 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # 
Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple -__version__ = '0.6.2' +__version__ = '0.6.3' short_version = __version__ From 96fa66846e7af4d5e321c7f2694612d269c2c10c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=80=9D=E5=A4=9C=E9=95=BF=E6=AD=8C?= <928926035@qq.com> Date: Tue, 19 Nov 2024 11:21:58 +0800 Subject: [PATCH 083/122] feature: support qwen2.5 fuction_call (#2737) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: support qwen2.5 tools_call * fix: npe bug * fix: 模版不一致 * fix: adopting review suggestions * fix: adopting review suggestions * fix: adopting review suggestions * fix: adopting review suggestions * feat: Support multi tools calling * feat: Support multi tools calling * fix: Add '\n' between each tool * fix: Add ensure_ascii=False * bugfix: rfind * bugfix: tools_call -> tool_calls * bugfix: add toolName in tool_response * fix: some '\n' error * fix: remove toolname * fix: replace '\n' to self.separator * feat: add doc with multiple tool calling * fix:update doc * feat: add qwen2.5 prompt template test * feat: add qwen2.5 no tool call prompt test --------- Co-authored-by: gaozixiang --- docs/en/llm/api_server_tools.md | 155 ++++++++++++++- docs/zh_cn/llm/api_server_tools.md | 155 ++++++++++++++- lmdeploy/model.py | 110 ++++++++++- lmdeploy/serve/async_engine.py | 28 ++- lmdeploy/serve/openai/api_server.py | 13 +- tests/test_lmdeploy/test_model.py | 286 ++++++++++++++++++++++++++++ 6 files changed, 736 insertions(+), 11 deletions(-) diff --git a/docs/en/llm/api_server_tools.md b/docs/en/llm/api_server_tools.md index 56fb1b598a..39e91dbf07 100644 --- a/docs/en/llm/api_server_tools.md +++ b/docs/en/llm/api_server_tools.md @@ -1,6 +1,6 @@ # Tools Calling -LMDeploy supports tools for InternLM2, InternLM2.5 and llama3.1 models. +LMDeploy supports tools for InternLM2, InternLM2.5, llama3.1 and Qwen2.5 models. ## Single Round Invocation @@ -241,3 +241,156 @@ messages += [ assistant_response = request_llama3_1_service(messages) print(assistant_response) ``` + +### Qwen2.5 + +Qwen2.5 supports multi tool calling, which means that multiple tool requests can be initiated in one request + +```python +from openai import OpenAI +import json + +def get_current_temperature(location: str, unit: str = "celsius"): + """Get current temperature at a location. + + Args: + location: The location to get the temperature for, in the format "City, State, Country". + unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) + + Returns: + the temperature, the location, and the unit in a dict + """ + return { + "temperature": 26.1, + "location": location, + "unit": unit, + } + + +def get_temperature_date(location: str, date: str, unit: str = "celsius"): + """Get temperature at a location and date. + + Args: + location: The location to get the temperature for, in the format "City, State, Country". + date: The date to get the temperature for, in the format "Year-Month-Day". + unit: The unit to return the temperature in. Defaults to "celsius". 
(choices: ["celsius", "fahrenheit"]) + + Returns: + the temperature, the location, the date and the unit in a dict + """ + return { + "temperature": 25.9, + "location": location, + "date": date, + "unit": unit, + } + +def get_function_by_name(name): + if name == "get_current_temperature": + return get_current_temperature + if name == "get_temperature_date": + return get_temperature_date + +tools = [{ + 'type': 'function', + 'function': { + 'name': 'get_current_temperature', + 'description': 'Get current temperature at a location.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': 'string', + 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.' + }, + 'unit': { + 'type': 'string', + 'enum': [ + 'celsius', + 'fahrenheit' + ], + 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.' + } + }, + 'required': [ + 'location' + ] + } + } +}, { + 'type': 'function', + 'function': { + 'name': 'get_temperature_date', + 'description': 'Get temperature at a location and date.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': 'string', + 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.' + }, + 'date': { + 'type': 'string', + 'description': 'The date to get the temperature for, in the format \'Year-Month-Day\'.' + }, + 'unit': { + 'type': 'string', + 'enum': [ + 'celsius', + 'fahrenheit' + ], + 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.' + } + }, + 'required': [ + 'location', + 'date' + ] + } + } +}] +messages = [{'role': 'user', 'content': 'Today is 2024-11-14, What\'s the temperature in San Francisco now? How about tomorrow?'}] + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + top_p=0.8, + stream=False, + tools=tools) +print(response.choices[0].message.tool_calls) +messages.append(response.choices[0].message) + +for tool_call in response.choices[0].message.tool_calls: + tool_call_args = json.loads(tool_call.function.arguments) + tool_call_result = get_function_by_name(tool_call.function.name)(**tool_call_args) + messages.append({ + 'role': 'tool', + 'name': tool_call.function.name, + 'content': tool_call_result, + 'tool_call_id': tool_call.id + }) + +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + top_p=0.8, + stream=False, + tools=tools) +print(response.choices[0].message.content) + +``` + +Using the Qwen2.5-14B-Instruct, similar results can be obtained as follows + +``` +[ChatCompletionMessageToolCall(id='0', function=Function(arguments='{"location": "San Francisco, California, USA"}', name='get_current_temperature'), type='function'), + ChatCompletionMessageToolCall(id='1', function=Function(arguments='{"location": "San Francisco, California, USA", "date": "2024-11-15"}', name='get_temperature_date'), type='function')] + +The current temperature in San Francisco, California, USA is 26.1°C. For tomorrow, 2024-11-15, the temperature is expected to be 25.9°C. +``` + +It is important to note that in scenarios involving multiple tool calls, the order of the tool call results can affect the response quality. The tool_call_id has not been correctly provided to the LLM. 
diff --git a/docs/zh_cn/llm/api_server_tools.md b/docs/zh_cn/llm/api_server_tools.md index 643a39d5d2..8688ea35cd 100644 --- a/docs/zh_cn/llm/api_server_tools.md +++ b/docs/zh_cn/llm/api_server_tools.md @@ -1,6 +1,6 @@ # Tools -LMDeploy 支持 InternLM2, InternLM2.5 和 Llama3.1 模型的工具调用。 +LMDeploy 支持 InternLM2, InternLM2.5, Llama3.1 和 Qwen2.5模型的工具调用。 ## 单轮调用 @@ -241,3 +241,156 @@ messages += [ assistant_response = request_llama3_1_service(messages) print(assistant_response) ``` + +### Qwen2.5 + +Qwen2.5 支持了多工具调用,这意味着可以在一次请求中可能发起多个工具请求 + +```python +from openai import OpenAI +import json + +def get_current_temperature(location: str, unit: str = "celsius"): + """Get current temperature at a location. + + Args: + location: The location to get the temperature for, in the format "City, State, Country". + unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) + + Returns: + the temperature, the location, and the unit in a dict + """ + return { + "temperature": 26.1, + "location": location, + "unit": unit, + } + + +def get_temperature_date(location: str, date: str, unit: str = "celsius"): + """Get temperature at a location and date. + + Args: + location: The location to get the temperature for, in the format "City, State, Country". + date: The date to get the temperature for, in the format "Year-Month-Day". + unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"]) + + Returns: + the temperature, the location, the date and the unit in a dict + """ + return { + "temperature": 25.9, + "location": location, + "date": date, + "unit": unit, + } + +def get_function_by_name(name): + if name == "get_current_temperature": + return get_current_temperature + if name == "get_temperature_date": + return get_temperature_date + +tools = [{ + 'type': 'function', + 'function': { + 'name': 'get_current_temperature', + 'description': 'Get current temperature at a location.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': 'string', + 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.' + }, + 'unit': { + 'type': 'string', + 'enum': [ + 'celsius', + 'fahrenheit' + ], + 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.' + } + }, + 'required': [ + 'location' + ] + } + } +}, { + 'type': 'function', + 'function': { + 'name': 'get_temperature_date', + 'description': 'Get temperature at a location and date.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': 'string', + 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.' + }, + 'date': { + 'type': 'string', + 'description': 'The date to get the temperature for, in the format \'Year-Month-Day\'.' + }, + 'unit': { + 'type': 'string', + 'enum': [ + 'celsius', + 'fahrenheit' + ], + 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.' + } + }, + 'required': [ + 'location', + 'date' + ] + } + } +}] +messages = [{'role': 'user', 'content': 'Today is 2024-11-14, What\'s the temperature in San Francisco now? 
How about tomorrow?'}] + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + top_p=0.8, + stream=False, + tools=tools) +print(response.choices[0].message.tool_calls) +messages.append(response.choices[0].message) + +for tool_call in response.choices[0].message.tool_calls: + tool_call_args = json.loads(tool_call.function.arguments) + tool_call_result = get_function_by_name(tool_call.function.name)(**tool_call_args) + messages.append({ + 'role': 'tool', + 'name': tool_call.function.name, + 'content': tool_call_result, + 'tool_call_id': tool_call.id + }) + +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + top_p=0.8, + stream=False, + tools=tools) +print(response.choices[0].message.content) + +``` + +使用Qwen2.5-14B-Instruct,可以得到以下类似结果 + +``` +[ChatCompletionMessageToolCall(id='0', function=Function(arguments='{"location": "San Francisco, California, USA"}', name='get_current_temperature'), type='function'), + ChatCompletionMessageToolCall(id='1', function=Function(arguments='{"location": "San Francisco, California, USA", "date": "2024-11-15"}', name='get_temperature_date'), type='function')] + +The current temperature in San Francisco, California, USA is 26.1°C. For tomorrow, 2024-11-15, the temperature is expected to be 25.9°C. +``` + +需要注意的是,多工具调用的情况下,工具调用的结果顺序会影响回答的效果,tool_call_id并没有正确给到LLM. diff --git a/lmdeploy/model.py b/lmdeploy/model.py index c9eb71c2c3..47aaaa4e88 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -944,7 +944,8 @@ def match(cls, model_path: str) -> Optional[str]: Args: model_path (str): the model path used for matching. """ - if 'qwen' in model_path.lower(): + if 'qwen' in model_path.lower() and 'qwen2.5' not in model_path.lower( + ): return 'qwen' if 'minicpm-v-2_6' in model_path.lower(): return 'minicpmv-2d6' @@ -952,6 +953,113 @@ def match(cls, model_path: str) -> Optional[str]: return 'minicpm3' +@MODELS.register_module(name='qwen2d5') +class Qwen2d5Chat(Qwen7BChat): + """Chat template for Qwen2.5-Instruct series.""" + + def __init__( + self, + system='<|im_start|>system\n', + meta_instruction='You are Qwen, created by Alibaba Cloud. You are a helpful assistant.', + eosys='<|im_end|>\n', + user='<|im_start|>user\n', + eoh='<|im_end|>\n', + assistant='<|im_start|>assistant\n', + eoa='<|im_end|>', + separator='\n', + tools="""\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n""", + eotools="""\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{"name": , "arguments": }\n""", + stop_words=['<|im_end|>'], + **kwargs): + + self.tools = tools + self.eotools = eotools + super().__init__(system=system, + meta_instruction=meta_instruction, + eosys=eosys, + user=user, + eoh=eoh, + assistant=assistant, + eoa=eoa, + separator=separator, + stop_words=stop_words, + **kwargs) + + def messages2prompt(self, + messages, + sequence_start=True, + tools=None, + **kwargs): + """Return the prompt that is concatenated with other elements in the + chat template. 
+ + Args: + messages (str | List): user's input prompt + Returns: + str: the concatenated prompt + """ + if isinstance(messages, str): + return self.get_prompt(messages, sequence_start) + box_map = dict(user=self.user, + assistant=self.assistant, + system=self.system) + ret = '' + tool_prompt = '' + if tools is not None and len(tools) > 0: + for tool in tools: + tool_prompt += self.separator + tool_prompt += f'{{"type": "function", "function": {json.dumps(tool, ensure_ascii=False)}}}' + if len(messages) and messages[0]['role'] == 'system': + ret += f"{self.system}{messages[0]['content']}{self.tools}{tool_prompt}{self.eotools}{self.eosys}" + else: + ret += f'{self.system}{self.meta_instruction}{self.tools}{tool_prompt}{self.eotools}{self.eosys}' + else: + if self.meta_instruction is not None and sequence_start: + if len(messages) and messages[0]['role'] == 'system': + ret += f"{self.system}{messages[0]['content']}{self.eosys}" + else: + ret += f'{self.system}{self.meta_instruction}{self.eosys}' + + for index, message in enumerate(messages): + if (message['role'] == 'user' + or (message['role'] == 'system' and index != 0) + or (message['role'] == 'assistant' + and message.get('tool_calls') is None)): + ret += f"{box_map[message['role']]}{message['content']}{self.eosys}" + elif message['role'] == 'assistant': + ret += f'<|im_start|>assistant' + if message.get('content') is not None: + ret += f"{self.separator}{message['content']}" + + if message.get('tool_calls') is not None: + tool_calls = message['tool_calls'] + for tool_call in tool_calls: + if tool_call.get('function') is not None: + tool_call = tool_call['function'] + ret += f'{self.separator}{self.separator}{{"name": "{tool_call["name"]}", "arguments": {json.dumps(tool_call["arguments"], ensure_ascii=False)}}}{self.separator}' + ret += self.eosys + if message['role'] == 'tool': + if index == 0 or messages[index - 1]['role'] != 'tool': + ret += f'<|im_start|>user' + ret += f"{self.separator}{self.separator}{message['content']}{self.separator}" + if index == len(messages) - 1 or messages[index + + 1]['role'] != 'tool': + ret += f'{self.eoh}' + ret += f'{self.assistant}' + return ret + + @classmethod + def match(cls, model_path: str) -> Optional[str]: + """Return the model_name that was registered to MODELS. + + Args: + model_path (str): the model path used for matching. 
+ """ + lower_path = model_path.lower() + if 'qwen2.5' in lower_path or 'qwen2_5' in lower_path: + return 'qwen2d5' + + @MODELS.register_module(name='codellama') class CodeLlama(Llama2): diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 3c8f193cd5..f3c3432328 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -4,6 +4,7 @@ import json import os import random +import re from contextlib import asynccontextmanager from copy import deepcopy from itertools import count @@ -648,14 +649,37 @@ def parse_tool_response(self, text, tools, **kwargs): name, parameters = action['name'], json.dumps(action.get( 'parameters', action.get('arguments', {})), ensure_ascii=False) + call_info_list = [(name, parameters)] elif '') parameters = action[action.find('{'):] name = action.split('{')[0] + call_info_list = [(name, parameters)] + elif '' in text and '' in text: # qwen2.5 + # get tool_call in text + pattern = r'(.*?)' + match_result_list = re.findall(pattern, text, re.DOTALL) + call_info_list = [] + for match_result in match_result_list: + action = json.loads(match_result) + call_info_list.append((action['name'], + json.dumps(action['arguments'], + ensure_ascii=False))) + # get text outside of tags + if not text.startswith(''): + text = text[:text.find('')] + elif not text.endswith(''): + text = text[text.rfind('') + len(''):] + else: + text = '' + else: raise RuntimeError(f'Unexpected model response: {text}') - action_id = [tool.function.name for tool in tools].index(name) - return text, action_id, name, parameters + + call_info_list = [([tool.function.name for tool in tools + ].index(call_info[0]), call_info[0], call_info[1]) + for call_info in call_info_list] + return text, call_info_list def chat(self, prompt: str, diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index a12cadaa7d..2d0560720d 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -495,17 +495,18 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: final_logprobs.extend(res.logprobs) tool_calls = None - if request.tool_choice != 'none' and ('<|plugin|>' in text - or '' in text or '' in text): if final_res.finish_reason == 'stop': final_res.finish_reason = 'tool_calls' try: # TODO add json_schema guidance to turbomind - text, action_id, name, parameters = VariableInterface.async_engine.parse_tool_response( # noqa + text, call_info_list = VariableInterface.async_engine.parse_tool_response( # noqa text, request.tools) tool_calls = [ - ToolCall(id=str(action_id), - function=FunctionResponse(name=name, - arguments=parameters)) + ToolCall(id=str(call_info[0]), + function=FunctionResponse(name=call_info[1], + arguments=call_info[2])) + for call_info in call_info_list ] except Exception as e: logger.error(f'Exception: {e}') diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py index 7e3e71793d..3b78053a74 100644 --- a/tests/test_lmdeploy/test_model.py +++ b/tests/test_lmdeploy/test_model.py @@ -9,6 +9,7 @@ ('internlm/internlm2-1_8b', ['base']), ('models--internlm--internlm-chat-7b/snapshots/1234567', ['internlm']), ('Qwen/Qwen-7B-Chat', ['qwen']), + ('Qwen/Qwen2.5-7B-Instruct', ['qwen2d5']), ('codellama/CodeLlama-7b-hf', ['codellama']), ('upstage/SOLAR-0-70b', ['solar', 'solar-70b']), ('meta-llama/Llama-2-7b-chat-hf', ['llama2']), @@ -283,6 +284,291 @@ def test_qwen(): assert _prompt is None +def test_qwen2d5(): + prompt = 'hello, can u introduce yourself' + 
model = MODELS.get('qwen2d5')(capability='completion') + assert model.get_prompt(prompt, sequence_start=True) == prompt + assert model.get_prompt(prompt, sequence_start=False) == prompt + + model = MODELS.get('qwen2d5')(capability='chat') + + # No tool call + messages = [ + dict(role='user', + content='What\'s the temperature in San Francisco now?') + ] + no_tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful ' + "assistant.<|im_end|>\n<|im_start|>user\nWhat's the " + 'temperature in San Francisco ' + 'now?<|im_end|>\n<|im_start|>assistant\n') + assert model.messages2prompt(messages) == no_tool_prompt + assert model.messages2prompt(messages, tools=[]) == no_tool_prompt + + messages.append({'role': 'assistant', 'content': 'I don\'t know.'}) + no_tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful ' + "assistant.<|im_end|>\n<|im_start|>user\nWhat's the " + 'temperature in San Francisco ' + "now?<|im_end|>\n<|im_start|>assistant\nI don't " + 'know.<|im_end|>\n<|im_start|>assistant\n') + assert model.messages2prompt(messages) == no_tool_prompt + # Single tool call + tools = [{ + 'name': 'get_current_temperature', + 'description': 'Get current temperature at a location.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': + 'string', + 'description': + 'The location to get the temperature for,' + ' in the format \'City, State, Country\'.' + }, + 'unit': { + 'type': + 'string', + 'enum': ['celsius', 'fahrenheit'], + 'description': + 'The unit to return the temperature in. Defaults to ' + '\'celsius\'.' + } + }, + 'required': ['location'] + } + }] + + messages = [ + dict(role='user', + content='What\'s the temperature in San Francisco now?') + ] + tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful assistant.\n\n# Tools\n\nYou ' + 'may call one or more functions to assist with the user ' + 'query.\n\nYou are provided with function signatures ' + "within XML tags:\n\n{\"type\": " + "\"function\", \"function\": {\"name\": " + "\"get_current_temperature\", \"description\": \"Get " + "current temperature at a location.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"unit\": {\"type\": \"string\", \"enum\": " + "[\"celsius\", \"fahrenheit\"], \"description\": \"The " + 'unit to return the temperature in. Defaults to ' + "'celsius'.\"}}, \"required\": [" + "\"location\"]}}}\n\n\nFor each function call, " + 'return a json object with function name and arguments ' + 'within XML tags:\n\n{' + "\"name\": , \"arguments\": " + '}\n<|im_end|>\n<|im_start' + "|>user\nWhat's the temperature in San Francisco " + 'now?<|im_end|>\n<|im_start|>assistant\n') + assert model.messages2prompt(messages, tools=tools) == tool_prompt + + messages.append( + dict(role='tool', + name='get_current_temperature', + content={ + 'temperature': 26.1, + 'location': 'San Francisco, California, USA', + 'unit': 'celsius' + }, + tool_call_id='0')) + tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. 
You are a helpful assistant.\n\n# Tools\n\nYou ' + 'may call one or more functions to assist with the user ' + 'query.\n\nYou are provided with function signatures ' + "within XML tags:\n\n{\"type\": " + "\"function\", \"function\": {\"name\": " + "\"get_current_temperature\", \"description\": \"Get " + "current temperature at a location.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"unit\": {\"type\": \"string\", \"enum\": " + "[\"celsius\", \"fahrenheit\"], \"description\": \"The " + 'unit to return the temperature in. Defaults to ' + "'celsius'.\"}}, \"required\": [" + "\"location\"]}}}\n\n\nFor each function call, " + 'return a json object with function name and arguments ' + 'within XML tags:\n\n{' + "\"name\": , \"arguments\": " + '}\n<|im_end|>\n<|im_start' + "|>user\nWhat's the temperature in San Francisco " + 'now?<|im_end|>\n<|im_start|>user\n\n{' + "'temperature': 26.1, 'location': 'San Francisco, " + "California, USA', 'unit': " + "'celsius'}\n<|im_end|>\n<|im_start" + '|>assistant\n') + assert model.messages2prompt(messages, tools=tools) == tool_prompt + # Multi tool calling + tools = [{ + 'name': 'get_current_temperature', + 'description': 'Get current temperature at a location.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': + 'string', + 'description': + 'The location to get the temperature for, in the format ' + '\'City, State, Country\'.' + }, + 'unit': { + 'type': + 'string', + 'enum': ['celsius', 'fahrenheit'], + 'description': + 'The unit to return the temperature in.' + ' Defaults to \'celsius\'.' + } + }, + 'required': ['location'] + } + }, { + 'name': 'get_temperature_date', + 'description': 'Get temperature at a location and date.', + 'parameters': { + 'type': 'object', + 'properties': { + 'location': { + 'type': + 'string', + 'description': + 'The location to get the temperature for,' + ' in the format \'City, State, Country\'.' + }, + 'date': { + 'type': + 'string', + 'description': + 'The date to get the temperature for,' + ' in the format \'Year-Month-Day\'.' + }, + 'unit': { + 'type': + 'string', + 'enum': ['celsius', 'fahrenheit'], + 'description': + 'The unit to return the temperature in.' + ' Defaults to \'celsius\'.' + } + }, + 'required': ['location', 'date'] + } + }] + messages = [ + dict(role='user', + content='Today is 2024-11-14, What\'s the temperature in' + ' San Francisco now? How about tomorrow?') + ] + tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful assistant.\n\n# Tools\n\nYou ' + 'may call one or more functions to assist with the user ' + 'query.\n\nYou are provided with function signatures ' + "within XML tags:\n\n{\"type\": " + "\"function\", \"function\": {\"name\": " + "\"get_current_temperature\", \"description\": \"Get " + "current temperature at a location.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"unit\": {\"type\": \"string\", \"enum\": " + "[\"celsius\", \"fahrenheit\"], \"description\": \"The " + 'unit to return the temperature in. 
Defaults to ' + "'celsius'.\"}}, \"required\": [\"location\"]}}}\n{" + "\"type\": \"function\", \"function\": {\"name\": " + "\"get_temperature_date\", \"description\": \"Get " + "temperature at a location and date.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"date\": {\"type\": \"string\", " + "\"description\": \"The date to get the temperature for, " + "in the format 'Year-Month-Day'.\"}, \"unit\": {\"type\": " + "\"string\", \"enum\": [\"celsius\", \"fahrenheit\"], " + "\"description\": \"The unit to return the temperature " + "in. Defaults to 'celsius'.\"}}, \"required\": [" + "\"location\", \"date\"]}}}\n\n\nFor each " + 'function call, return a json object with function name ' + 'and arguments within XML ' + "tags:\n\n{\"name\": , " + "\"arguments\": " + '}\n<|im_end|>\n<|im_start' + "|>user\nToday is 2024-11-14, What's the temperature in " + 'San Francisco now? How about ' + 'tomorrow?<|im_end|>\n<|im_start|>assistant\n') + assert model.messages2prompt(messages, tools=tools) == tool_prompt + + messages.append( + dict(role='tool', + name='get_current_temperature', + content={ + 'temperature': 26.1, + 'location': 'San Francisco, California, USA', + 'unit': 'celsius' + }, + tool_call_id='0')) + messages.append( + dict(role='tool', + name='get_temperature_date', + content={ + 'temperature': 25.9, + 'location': 'San Francisco, California, USA', + 'date': '2024-11-15', + 'unit': 'celsius' + }, + tool_call_id='1')) + tool_prompt = ('<|im_start|>system\nYou are Qwen, created by Alibaba ' + 'Cloud. You are a helpful assistant.\n\n# Tools\n\nYou ' + 'may call one or more functions to assist with the user ' + 'query.\n\nYou are provided with function signatures ' + "within XML tags:\n\n{\"type\": " + "\"function\", \"function\": {\"name\": " + "\"get_current_temperature\", \"description\": \"Get " + "current temperature at a location.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"unit\": {\"type\": \"string\", \"enum\": " + "[\"celsius\", \"fahrenheit\"], \"description\": \"The " + 'unit to return the temperature in. Defaults to ' + "'celsius'.\"}}, \"required\": [\"location\"]}}}\n{" + "\"type\": \"function\", \"function\": {\"name\": " + "\"get_temperature_date\", \"description\": \"Get " + "temperature at a location and date.\", \"parameters\": {" + "\"type\": \"object\", \"properties\": {\"location\": {" + "\"type\": \"string\", \"description\": \"The location to " + "get the temperature for, in the format 'City, State, " + "Country'.\"}, \"date\": {\"type\": \"string\", " + "\"description\": \"The date to get the temperature for, " + "in the format 'Year-Month-Day'.\"}, \"unit\": {\"type\": " + "\"string\", \"enum\": [\"celsius\", \"fahrenheit\"], " + "\"description\": \"The unit to return the temperature " + "in. Defaults to 'celsius'.\"}}, \"required\": [" + "\"location\", \"date\"]}}}\n\n\nFor each " + 'function call, return a json object with function name ' + 'and arguments within XML ' + "tags:\n\n{\"name\": , " + "\"arguments\": " + '}\n<|im_end|>\n<|im_start' + "|>user\nToday is 2024-11-14, What's the temperature in " + 'San Francisco now? 
How about ' + 'tomorrow?<|im_end|>\n<|im_start|>user\n\n{'temperature': 26.1, 'location': 'San Francisco, " + "California, USA', 'unit': " + "'celsius'}\n\n\n{" + "'temperature': 25.9, 'location': 'San Francisco, " + "California, USA', 'date': '2024-11-15', 'unit': " + "'celsius'}\n<|im_end|>\n<|im_start" + '|>assistant\n') + assert model.messages2prompt(messages, tools=tools) == tool_prompt + + def test_codellama_completion(): model = MODELS.get('codellama')(capability='completion') prompt = """\ From 0608b01f87a32a31eea4e7c579edf659a65df6b8 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Tue, 19 Nov 2024 17:58:25 +0800 Subject: [PATCH 084/122] Update supported models & Ascend doc (#2765) * update ascend supported model list * fix markdown * fix markdown * fix lint * Update get_started.md * Update get_started.md --- docs/en/get_started/ascend/get_started.md | 4 +++ docs/en/supported_models/supported_models.md | 28 ++++++++++--------- docs/zh_cn/get_started/ascend/get_started.md | 4 +++ .../supported_models/supported_models.md | 28 ++++++++++--------- 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index 9e963b3795..23b86afa61 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -3,6 +3,8 @@ The usage of lmdeploy on a Huawei Ascend device is almost the same as its usage on CUDA with PytorchEngine in lmdeploy. Please read the original [Get Started](../get_started.md) guide before reading this tutorial. +Here is the [supported model list](../../supported_models/supported_models.md#PyTorchEngine-on-Huawei-Ascend-Platform). + ## Installation We highly recommend that users build a Docker image for streamlined environment setup. @@ -38,6 +40,8 @@ DOCKER_BUILDKIT=1 docker build -t lmdeploy-aarch64-ascend:latest \     -f docker/Dockerfile_aarch64_ascend . ``` +The `Dockerfile_aarch64_ascend` is tested on Kunpeng CPU. For intel CPU, please try [this dockerfile](https://github.com/InternLM/lmdeploy/issues/2745#issuecomment-2473285703) (which is not fully tested) + If the following command executes without any errors, it indicates that the environment setup is successful. ```bash diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index a122f10ec8..684a4f5109 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -98,16 +98,18 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha ## PyTorchEngine on Huawei Ascend Platform -| Model | Size | Type | FP16/BF16 | W4A16 | -| :------------: | :------: | :--: | :-------: | :---: | -| Llama2 | 7B - 70B | LLM | Yes | Yes | -| Llama3 | 8B | LLM | Yes | Yes | -| Llama3.1 | 8B | LLM | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | -| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | -| Mixtral | 8x7B | LLM | Yes | No | -| QWen1.5-MoE | A2.7B | LLM | Yes | No | -| QWen2 | 7B | LLM | Yes | No | -| QWen2-MoE | A14.57B | LLM | Yes | No | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | +| Model | Size | Type | FP16/BF16(eager) | FP16/BF16(graph) | W4A16(eager) | +| :------------: | :------: | :--: | :--------------: | :--------------: | :----------: | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | +| Llama3 | 8B | LLM | Yes | Yes | Yes | +| Llama3.1 | 8B | LLM | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | +| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | Yes | +| Mixtral | 8x7B | LLM | Yes | Yes | No | +| QWen1.5-MoE | A2.7B | LLM | Yes | - | No | +| QWen2(.5) | 7B | LLM | Yes | Yes | No | +| QWen2-MoE | A14.57B | LLM | Yes | - | No | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | - | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | +| CogVLM2-chat | 19B | MLLM | Yes | No | - | +| GLM4V | 9B | MLLM | Yes | No | - | diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md index 046aea756b..b137c458be 100644 --- a/docs/zh_cn/get_started/ascend/get_started.md +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -2,6 +2,8 @@ 我们基于 LMDeploy 的 PytorchEngine,增加了华为昇腾设备的支持。所以,在华为昇腾上使用 LDMeploy 的方法与在英伟达 GPU 上使用 PytorchEngine 后端的方法几乎相同。在阅读本教程之前,请先阅读原版的[快速开始](../get_started.md)。 +支持的模型列表在[这里](../../supported_models/supported_models.md#PyTorchEngine-华为昇腾平台). + ## 安装 我们强烈建议用户构建一个 Docker 镜像以简化环境设置。 @@ -38,6 +40,8 @@ DOCKER_BUILDKIT=1 docker build -t lmdeploy-aarch64-ascend:latest \     -f docker/Dockerfile_aarch64_ascend . ``` +上述`Dockerfile_aarch64_ascend`适用于鲲鹏CPU. 
如果是Intel CPU的机器,请尝试[这个dockerfile](https://github.com/InternLM/lmdeploy/issues/2745#issuecomment-2473285703) (未经过测试) + 如果以下命令执行没有任何错误,这表明环境设置成功。 ```bash diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index f3ffd4311d..d8bf9a1ad8 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -98,16 +98,18 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att ## PyTorchEngine 华为昇腾平台 -| Model | Size | Type | FP16/BF16 | W4A16 | -| :------------: | :------: | :--: | :-------: | :---: | -| Llama2 | 7B - 70B | LLM | Yes | Yes | -| Llama3 | 8B | LLM | Yes | Yes | -| Llama3.1 | 8B | LLM | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | -| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | -| Mixtral | 8x7B | LLM | Yes | No | -| QWen1.5-MoE | A2.7B | LLM | Yes | No | -| QWen2 | 7B | LLM | Yes | No | -| QWen2-MoE | A14.57B | LLM | Yes | No | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | +| Model | Size | Type | FP16/BF16(eager) | FP16/BF16(graph) | W4A16(eager) | +| :------------: | :------: | :--: | :--------------: | :--------------: | :----------: | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | +| Llama3 | 8B | LLM | Yes | Yes | Yes | +| Llama3.1 | 8B | LLM | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | +| InternLM2.5 | 7B - 20B | LLM | Yes | Yes | Yes | +| Mixtral | 8x7B | LLM | Yes | Yes | No | +| QWen1.5-MoE | A2.7B | LLM | Yes | - | No | +| QWen2(.5) | 7B | LLM | Yes | Yes | No | +| QWen2-MoE | A14.57B | LLM | Yes | - | No | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | - | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | +| CogVLM2-chat | 19B | MLLM | Yes | No | - | +| GLM4V | 9B | MLLM | Yes | No | - | From 178ec7bddcba23d32d9cdc2488e045624390086c Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Tue, 19 Nov 2024 18:04:39 +0800 Subject: [PATCH 085/122] [CI] Split vl testcases into turbomind and pytorch backend (#2751) * updaet * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update --- .github/scripts/eval_base_config.py | 11 ++ .github/scripts/eval_chat_config.py | 21 ++ .github/workflows/daily_ete_test.yml | 47 +++-- .github/workflows/daily_ete_test_v100.yml | 47 +++-- autotest/config-v100.yaml | 18 +- autotest/config.yaml | 55 ++++-- ...h.py => test_pipeline_chat_pytorch_llm.py} | 0 .../test_pipeline_chat_pytorch_mllm.py | 120 ++++++++++++ ...py => test_pipeline_chat_turbomind_llm.py} | 0 .../test_pipeline_chat_turbomind_mllm.py | 139 +++++++++++++ .../test_pipeline_chat_turbomind_vl.py | 109 ----------- ...py => test_restful_chat_hf_pytorch_llm.py} | 0 .../test_restful_chat_hf_pytorch_mllm.py | 116 +++++++++++ ... 
=> test_restful_chat_hf_turbomind_llm.py} | 0 .../test_restful_chat_hf_turbomind_mllm.py | 116 +++++++++++ .../test_restful_chat_hf_turbomind_vl.py | 182 ------------------ autotest/utils/benchmark_utils.py | 9 +- autotest/utils/config_utils.py | 2 +- autotest/utils/pipeline_chat.py | 44 +++-- autotest/utils/run_restful_chat.py | 50 +++++ docs/en/supported_models/supported_models.md | 66 +++---- .../supported_models/supported_models.md | 66 +++---- 22 files changed, 786 insertions(+), 432 deletions(-) rename autotest/tools/pipeline/{test_pipeline_chat_pytorch.py => test_pipeline_chat_pytorch_llm.py} (100%) create mode 100644 autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py rename autotest/tools/pipeline/{test_pipeline_chat_turbomind.py => test_pipeline_chat_turbomind_llm.py} (100%) create mode 100644 autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py delete mode 100644 autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py rename autotest/tools/restful/{test_restful_chat_hf_pytorch.py => test_restful_chat_hf_pytorch_llm.py} (100%) create mode 100644 autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py rename autotest/tools/restful/{test_restful_chat_hf_turbomind.py => test_restful_chat_hf_turbomind_llm.py} (100%) create mode 100644 autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py delete mode 100644 autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 8915decc7c..9a2b5fc39e 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -89,6 +89,17 @@ models as lmdeploy_qwen1_5_7b # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \ models as lmdeploy_qwen2_7b # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 # read models race_datasets = [race_datasets[1]] diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py index a54b66bdc8..e2463c0f39 100644 --- a/.github/scripts/eval_chat_config.py +++ b/.github/scripts/eval_chat_config.py @@ -98,6 +98,27 @@ models as lmdeploy_qwen2_7b_instruct # noqa: F401, E501 from opencompass.configs.models.qwen.lmdeploy_qwen_7b_chat import \ models as lmdeploy_qwen_7b_chat # noqa: F401, E501 + # Summary Groups + from opencompass.configs.summarizers.groups.bbh import \ + bbh_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.ds1000 import \ + ds1000_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.GaokaoBench import \ + GaokaoBench_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.humanevalx import \ + humanevalx_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ + mathbench_2024_summary_groups # noqa: F401, E501 + from 
opencompass.configs.summarizers.groups.mmlu import \ + mmlu_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.scicode import \ + scicode_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.teval import \ + teval_summary_groups # noqa: F401, E501 llama2_meta_template = dict(round=[ dict(role='HUMAN', begin='[INST] ', end=' [/INST]'), diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index ab01d692c0..dbacfc32f5 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -17,10 +17,15 @@ on: required: true description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' type: string - default: '["turbomind", "pytorch", "turbomind_vl"]' + default: "['turbomind', 'pytorch']" model: required: true - description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models' + description: 'Set testcase module filter: llm, vllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' offline_mode: @@ -206,14 +211,20 @@ jobs: strategy: fail-fast: false matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch", "turbomind_vl"]')}} - model: ${{ fromJSON(inputs.model || '["pipeline", "restful", "chat"]')}} + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} exclude: - - backend: turbomind_vl - model: chat + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat include: - backend: turbomind - model: local_case + model: llm + function: local_case env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /root/modelscope_hub @@ -261,7 +272,7 @@ jobs: ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat workspace continue-on-error: true - if: matrix.backend == 'turbomind' && matrix.model == 'chat' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'chat' run: | pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -269,7 +280,7 @@ jobs: mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat continue-on-error: true - if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' run: | pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -277,30 +288,30 @@ jobs: mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: matrix.model == 'pipeline' + if: 
matrix.function == 'pipeline' run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: matrix.model == 'restful' + if: matrix.function == 'restful' run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful workspace continue-on-error: true - if: matrix.backend == 'turbomind' && matrix.model == 'restful' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'restful' run: | pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - local testcase - if: matrix.backend == 'turbomind' && matrix.model == 'local_case' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'local_case' run: | pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') @@ -321,7 +332,7 @@ jobs: strategy: fail-fast: false matrix: - backend: ['turbomind', 'pytorch'] + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} timeout-minutes: 60 container: image: openmmlab/lmdeploy:latest-cu11 diff --git a/.github/workflows/daily_ete_test_v100.yml b/.github/workflows/daily_ete_test_v100.yml index 0112e9aaab..8a662b85f5 100644 --- a/.github/workflows/daily_ete_test_v100.yml +++ b/.github/workflows/daily_ete_test_v100.yml @@ -17,10 +17,15 @@ on: required: true description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. 
Default is "["turbomind", "pytorch"]"' type: string - default: '["turbomind", "pytorch", "turbomind_vl"]' + default: "['turbomind', 'pytorch']" model: required: true - description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models' + description: 'Set testcase module filter: llm, vllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' type: string default: '["pipeline", "restful", "chat"]' offline_mode: @@ -201,14 +206,20 @@ jobs: strategy: fail-fast: false matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch", "turbomind_vl"]')}} - model: ${{ fromJSON(inputs.model || '["pipeline", "restful", "chat"]')}} + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} exclude: - - backend: turbomind_vl - model: chat + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat include: - backend: turbomind - model: local_case + model: llm + function: local_case env: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /root/modelscope_hub @@ -255,7 +266,7 @@ jobs: ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test lmdeploy - chat workspace continue-on-error: true - if: matrix.backend == 'turbomind' && matrix.model == 'chat' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'chat' run: | pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -263,7 +274,7 @@ jobs: mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat continue-on-error: true - if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' run: | pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true @@ -271,30 +282,30 @@ jobs: mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline continue-on-error: true - if: matrix.model == 'pipeline' + if: matrix.function == 'pipeline' run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful continue-on-error: true - if: matrix.model == 'restful' + if: matrix.function == 'restful' run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful workspace continue-on-error: true - if: matrix.backend == 'turbomind' && matrix.model == 'restful' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'restful' run: | pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - local testcase - if: matrix.backend == 'turbomind' && matrix.model == 'local_case' + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'local_case' run: | pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') @@ -315,7 +326,7 @@ jobs: strategy: fail-fast: false matrix: - backend: ['turbomind', 'pytorch'] + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} timeout-minutes: 120 container: image: openmmlab/lmdeploy:latest-cu12 diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml index de51e7e5e7..41216cb730 100644 --- a/autotest/config-v100.yaml +++ b/autotest/config-v100.yaml @@ -22,6 +22,7 @@ tp_config: turbomind_chat_model: - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.2-3B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ - meta-llama/Meta-Llama-3-8B-Instruct @@ -48,6 +49,7 @@ pytorch_chat_model: - meta-llama/Meta-Llama-3-1-8B-Instruct - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B @@ -71,16 +73,25 @@ pytorch_base_model: - internlm/internlm2_5-7b - internlm/internlm2_5-20b -vl_model: +turbomind_vl_model: - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - internlm/internlm-xcomposer2d5-7b - THUDM/glm-4v-9b + +pytorch_vl_model: + - OpenGVLab/InternVL2-1B + - OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - OpenGVLab/Mono-InternVL-2B + - Qwen/Qwen2-VL-2B-Instruct + - 
Qwen/Qwen2-VL-7B-Instruct + - THUDM/glm-4v-9b - microsoft/Phi-3.5-vision-instruct turbomind_quatization: @@ -107,10 +118,13 @@ pytorch_quatization: - internlm/internlm2_5-7b-chat - internlm/internlm2_5-7b no_kvint4: + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-4B - deepseek-ai/DeepSeek-V2-Lite-Chat - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct + - microsoft/Phi-3.5-vision-instruct + - openbmb/MiniCPM-V-2_6 no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config.yaml b/autotest/config.yaml index 587ee6331b..6c92d2cf0b 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -21,6 +21,7 @@ tp_config: turbomind_chat_model: - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.2-3B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ - meta-llama/Meta-Llama-3-8B-Instruct @@ -51,6 +52,7 @@ turbomind_chat_model: - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mistral-Nemo-Instruct-2407 - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat @@ -69,11 +71,14 @@ pytorch_chat_model: - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Llama-3.2-1B-Instruct + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-11B-Vision-Instruct - meta-llama/Llama-2-7b-chat-hf - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - internlm/internlm2-chat-20b - internlm/internlm-chat-20b + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B @@ -106,20 +111,7 @@ pytorch_chat_model: - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct -turbomind_base_model: - - internlm/internlm2_5-7b - - internlm/internlm2_5-1_8b - - internlm/internlm2_5-20b - - codellama/CodeLlama-7b-hf - -pytorch_base_model: - - tiiuae/falcon-7b - - internlm/internlm2_5-7b - - internlm/internlm2_5-1_8b - - internlm/internlm2_5-20b - - bigcode/starcoder2-7b - -vl_model: +turbomind_vl_model: - Qwen/Qwen-VL-Chat - liuhaotian/llava-v1.5-13b - liuhaotian/llava-v1.6-vicuna-7b @@ -129,6 +121,20 @@ vl_model: - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - OpenGVLab/InternVL2-40B + - internlm/internlm-xcomposer2d5-7b + - internlm/internlm-xcomposer2-4khd-7b + - openbmb/MiniCPM-Llama3-V-2_5 + - openbmb/MiniCPM-V-2_6 + +pytorch_vl_model: + - meta-llama/Llama-3.2-11B-Vision-Instruct + - OpenGVLab/InternVL-Chat-V1-5 + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - OpenGVLab/InternVL2-1B + - OpenGVLab/InternVL2-2B - OpenGVLab/InternVL2-4B - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B @@ -136,15 +142,24 @@ vl_model: - OpenGVLab/Mono-InternVL-2B - Qwen/Qwen2-VL-2B-Instruct - Qwen/Qwen2-VL-7B-Instruct - - internlm/internlm-xcomposer2d5-7b - - internlm/internlm-xcomposer2-4khd-7b - THUDM/cogvlm-chat-hf - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - - microsoft/Phi-3.5-vision-instruct - microsoft/Phi-3-vision-128k-instruct - - openbmb/MiniCPM-Llama3-V-2_5 - - openbmb/MiniCPM-V-2_6 + - microsoft/Phi-3.5-vision-instruct + +turbomind_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-1_8b + - internlm/internlm2_5-20b + - codellama/CodeLlama-7b-hf + +pytorch_base_model: + - tiiuae/falcon-7b + - internlm/internlm2_5-7b + - internlm/internlm2_5-1_8b + - internlm/internlm2_5-20b + - 
bigcode/starcoder2-7b turbomind_quatization: no_awq: @@ -184,10 +199,12 @@ pytorch_quatization: - internlm/internlm2_5-20b - internlm/internlm2_5-7b no_kvint4: + - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-4B - deepseek-ai/DeepSeek-V2-Lite-Chat - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct + - microsoft/Phi-3.5-vision-instruct - openbmb/MiniCPM-V-2_6 no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py similarity index 100% rename from autotest/tools/pipeline/test_pipeline_chat_pytorch.py rename to autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py new file mode 100644 index 0000000000..276ced5bcb --- /dev/null +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -0,0 +1,120 @@ +import os +from multiprocessing import get_context + +import pytest +from utils.config_utils import get_cuda_id_by_workerid, get_torch_model_list +from utils.pipeline_chat import (assert_pipeline_vl_chat_log, + run_pipeline_vl_chat_test) + +BACKEND = 'pytorch' + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, model_type='vl_model')) +def test_pipeline_chat_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=2, model_type='vl_model')) +def test_pipeline_chat_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, + quant_policy=4, + model_type='vl_model')) +def test_pipeline_chat_kvint4_tp1(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=2, + quant_policy=4, + model_type='vl_model')) +def test_pipeline_chat_kvint4_tp2(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, 
model, BACKEND, worker_id, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=1, + quant_policy=8, + model_type='vl_model')) +def test_pipeline_chat_kvint8_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_torch_model_list(tp_num=2, + quant_policy=8, + model_type='vl_model')) +def test_pipeline_chat_kvint8_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py similarity index 100% rename from autotest/tools/pipeline/test_pipeline_chat_turbomind.py rename to autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py new file mode 100644 index 0000000000..8f1bc7d8b1 --- /dev/null +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -0,0 +1,139 @@ +import os +from multiprocessing import get_context + +import pytest +from utils.config_utils import get_all_model_list, get_cuda_id_by_workerid +from utils.pipeline_chat import (assert_pipeline_vl_chat_log, + run_pipeline_vl_chat_test) + +BACKEND = 'turbomind' + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=1, model_type='vl_model')) +def test_pipeline_chat_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=2, model_type='vl_model')) +def test_pipeline_chat_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=1, + quant_policy=4, + model_type='vl_model')) +def test_pipeline_chat_kvint4_tp1(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support + if 'gw' in 
worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=2, + quant_policy=4, + model_type='vl_model')) +def test_pipeline_chat_kvint4_tp2(config, model, worker_id): + if 'Qwen2' in model: + return # kvint4 for qwen2 is not support + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 4)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=1, + quant_policy=8, + model_type='vl_model')) +def test_pipeline_chat_kvint8_tp1(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.order(6) +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('model', + get_all_model_list(tp_num=2, + quant_policy=8, + model_type='vl_model')) +def test_pipeline_chat_kvint8_tp2(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id, 8)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) + + +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.pr_test +@pytest.mark.parametrize('model', [ + 'liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', + 'OpenGVLab/InternVL2-8B', 'internlm/internlm-xcomposer2d5-7b' +]) +def test_pipeline_pr_test(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = str( + int(get_cuda_id_by_workerid(worker_id)) + 5) + spawn_context = get_context('spawn') + p = spawn_context.Process(target=run_pipeline_vl_chat_test, + args=(config, model, BACKEND, worker_id)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model, worker_id) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py deleted file mode 100644 index 3279495493..0000000000 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -from multiprocessing import Process - -import pytest -from utils.config_utils import get_cuda_id_by_workerid, get_vl_model_list -from utils.pipeline_chat import (assert_pipeline_vl_chat_log, - run_pipeline_vl_chat_test) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=1)) -def test_pipeline_chat_tp1(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = 
get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=2)) -def test_pipeline_chat_tp2(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, - tp_num=2) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=1, quant_policy=4)) -def test_pipeline_chat_kvint4_tp1(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 4)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=2, quant_policy=4)) -def test_pipeline_chat_kvint4_tp2(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, - tp_num=2) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 4)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=1, quant_policy=8)) -def test_pipeline_chat_kvint8_tp1(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 8)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.order(6) -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', get_vl_model_list(tp_num=2, quant_policy=8)) -def test_pipeline_chat_kvint8_tp2(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, - tp_num=2) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model, 8)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) - - -@pytest.mark.pipeline_chat -@pytest.mark.gpu_num_1 -@pytest.mark.pr_test -@pytest.mark.parametrize('model', [ - 'liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', - 'OpenGVLab/InternVL2-8B', 'internlm/internlm-xcomposer2d5-7b' -]) -def test_pipeline_pr_test(config, model, worker_id): - if 'gw' in worker_id: - os.environ['CUDA_VISIBLE_DEVICES'] = str( - int(get_cuda_id_by_workerid(worker_id)) + 5) - p = Process(target=run_pipeline_vl_chat_test, args=(config, model)) - p.start() - p.join() - assert_pipeline_vl_chat_log(config, model) diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py similarity index 100% rename from autotest/tools/restful/test_restful_chat_hf_pytorch.py rename to autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py 
b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py new file mode 100644 index 0000000000..b210733db4 --- /dev/null +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py @@ -0,0 +1,116 @@ +import pytest +from utils.config_utils import get_torch_model_list, get_workerid +from utils.run_restful_chat import (run_vl_testcase, start_restful_api, + stop_restful_api) + +BASE_HTTP_URL = 'http://localhost' +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + model_path = config.get('model_path') + '/' + model + + pid, startRes = start_restful_api(config, param, model, model_path, + 'pytorch', worker_id) + yield + stop_restful_api(pid, startRes, param) + + +def getModelList(tp_num): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + } for item in get_torch_model_list(tp_num, model_type='vl_model')] + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=1), + indirect=True) +def test_restful_chat_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=2), + indirect=True) +def test_restful_chat_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +def getKvintModelList(tp_num, quant_policy: int = None): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + 'extra': f'--quant-policy {quant_policy}' + } for item in get_torch_model_list( + tp_num, quant_policy=quant_policy, model_type='vl_model') + if 'qwen2' not in item.lower() or quant_policy == 8] + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, 
port=DEFAULT_PORT + get_workerid(worker_id)) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py similarity index 100% rename from autotest/tools/restful/test_restful_chat_hf_turbomind.py rename to autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py new file mode 100644 index 0000000000..091e18e6e3 --- /dev/null +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -0,0 +1,116 @@ +import pytest +from utils.config_utils import get_all_model_list, get_workerid +from utils.run_restful_chat import (run_vl_testcase, start_restful_api, + stop_restful_api) + +BASE_HTTP_URL = 'http://localhost' +DEFAULT_PORT = 23333 + + +@pytest.fixture(scope='function', autouse=True) +def prepare_environment(request, config, worker_id): + param = request.param + model = param['model'] + model_path = config.get('model_path') + '/' + model + + pid, startRes = start_restful_api(config, param, model, model_path, + 'turbomind', worker_id) + yield + stop_restful_api(pid, startRes, param) + + +def getModelList(tp_num): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + } for item in get_all_model_list(tp_num, model_type='vl_model')] + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=1), + indirect=True) +def test_restful_chat_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=2), + indirect=True) +def test_restful_chat_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +def getKvintModelList(tp_num, quant_policy: int = None): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num, + 'extra': f'--quant-policy {quant_policy}' + } for item in get_all_model_list( + tp_num, quant_policy=quant_policy, model_type='vl_model') + if 'qwen2' not in item.lower() or quant_policy == 8] + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=4), + indirect=True) +def test_restful_chat_kvint4_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_1 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=1, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp1(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + 
run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.restful_api_vl +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('prepare_environment', + getKvintModelList(tp_num=2, quant_policy=8), + indirect=True) +def test_restful_chat_kvint8_tp2(config, worker_id): + if get_workerid(worker_id) is None: + run_vl_testcase(config) + else: + run_vl_testcase(config, port=DEFAULT_PORT + get_workerid(worker_id)) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py deleted file mode 100644 index 6e636d7ad4..0000000000 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py +++ /dev/null @@ -1,182 +0,0 @@ -import os - -import allure -import pytest -from openai import OpenAI -from utils.config_utils import get_vl_model_list, get_workerid -from utils.run_restful_chat import start_restful_api, stop_restful_api - -from lmdeploy.serve.openai.api_client import APIClient - -BASE_HTTP_URL = 'http://localhost' -DEFAULT_PORT = 23333 - - -@pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config, worker_id): - param = request.param - model = param['model'] - model_path = config.get('model_path') + '/' + model - - pid, startRes = start_restful_api(config, param, model, model_path, - 'turbomind', worker_id) - yield - stop_restful_api(pid, startRes, param) - - -def getModelList(tp_num): - return [{ - 'model': item, - 'cuda_prefix': None, - 'tp_num': tp_num - } for item in get_vl_model_list(tp_num)] - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('prepare_environment', - getModelList(tp_num=1), - indirect=True) -def test_restful_chat_tp1(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('prepare_environment', - getModelList(tp_num=2), - indirect=True) -def test_restful_chat_tp2(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -def getKvintModelList(tp_num, quant_policy: int = None): - return [{ - 'model': item, - 'cuda_prefix': None, - 'tp_num': tp_num, - 'extra': f'--quant-policy {quant_policy}' - } for item in get_vl_model_list(tp_num, quant_policy) - if 'qwen2' not in item.lower() or quant_policy == 8] - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=1, quant_policy=4), - indirect=True) -def test_restful_chat_kvint4_tp1(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=2, quant_policy=4), - indirect=True) -def test_restful_chat_kvint4_tp2(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=1, quant_policy=8), - indirect=True) -def 
test_restful_chat_kvint8_tp1(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -@pytest.mark.order(7) -@pytest.mark.restful_api_vl -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize('prepare_environment', - getKvintModelList(tp_num=2, quant_policy=8), - indirect=True) -def test_restful_chat_kvint8_tp2(config, worker_id): - if get_workerid(worker_id) is None: - run_all_step(config) - else: - run_all_step(config, port=DEFAULT_PORT + get_workerid(worker_id)) - - -PIC = 'https://raw.githubusercontent.com/' + \ - 'open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' - - -def run_all_step(config, port: int = DEFAULT_PORT): - http_url = BASE_HTTP_URL + ':' + str(port) - log_path = config.get('log_path') - - client = OpenAI(api_key='YOUR_API_KEY', base_url=http_url + '/v1') - model_name = client.models.list().data[0].id - - restful_log = os.path.join( - log_path, - 'restful_vl_' + model_name.split('/')[-1] + str(port) + '.log') - file = open(restful_log, 'w') - - response = client.chat.completions.create( - model=model_name, - messages=[{ - 'role': - 'user', - 'content': [{ - 'type': 'text', - 'text': 'Describe the image please', - }, { - 'type': 'image_url', - 'image_url': { - 'url': PIC, - }, - }], - }], - temperature=0.8, - top_p=0.8) - file.writelines(str(response).lower() + '\n') - assert 'tiger' in str(response).lower() or '虎' in str( - response).lower(), response - - api_client = APIClient(http_url) - model_name = api_client.available_models[0] - messages = [{ - 'role': - 'user', - 'content': [{ - 'type': 'text', - 'text': 'Describe the image please', - }, { - 'type': 'image_url', - 'image_url': { - 'url': PIC, - }, - }] - }] - for item in api_client.chat_completions_v1(model=model_name, - messages=messages): - continue - file.writelines(str(item) + '\n') - assert 'tiger' in str(item).lower() or '虎' in str(item).lower(), item - - allure.attach.file(restful_log, - attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 4c2e0a2c90..c38568e6f1 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -27,7 +27,8 @@ def generation_test(config, model_path = '/'.join([config.get('model_path'), model]) log_path = config.get('log_path') benchmark_log = os.path.join( - log_path, 'benchmark_' + model.split('/')[1] + worker_id + '.log') + log_path, + 'benchmark_generation_' + model.split('/')[1] + worker_id + '.log') benchmark_path = '/'.join([ config.get('benchmark_path'), run_id, model, f'benchmark-generation-{backend}' @@ -86,7 +87,8 @@ def throughput_test(config, log_path = config.get('log_path') dataset_path = config.get('dataset_path') benchmark_log = os.path.join( - log_path, 'benchmark_' + model.split('/')[1] + worker_id + '.log') + log_path, + 'benchmark_throughput_' + model.split('/')[1] + worker_id + '.log') if backend == 'turbomind' and quant_policy != 0: benchmark_path = '/'.join([ config.get('benchmark_path'), run_id, model, @@ -150,7 +152,8 @@ def restful_test(config, log_path = config.get('log_path') dataset_path = config.get('dataset_path') benchmark_log = os.path.join( - log_path, 'benchmark_' + model.split('/')[1] + worker_id + '.log') + log_path, + 'benchmark_restful_' + model.split('/')[1] + worker_id + '.log') if backend == 'turbomind' and quant_policy != 0: benchmark_path = '/'.join([ config.get('benchmark_path'), run_id, model, diff --git 
a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 8aa5f933fb..24b4a3f8cd 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -23,7 +23,7 @@ def get_turbomind_model_list(tp_num: int = None, quatization_case_config = config.get('turbomind_quatization') for key in config.get('turbomind_' + model_type): - if key not in quatization_case_config.get( + if key in case_list and key not in quatization_case_config.get( 'no_awq') and not is_quantization_model(key): case_list.append(key + '-inner-4bits') for key in quatization_case_config.get('gptq'): diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index e9988f0e39..562a707efe 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -61,7 +61,7 @@ def run_pipeline_chat_test(config, ])) file = open(config_log, 'w') log_string = '\n'.join([ - 'reproduce config info:', + 'reproduce config info:', 'from lmdeploy import pipeline', 'from lmdeploy.messages import PytorchEngineConfig', 'from lmdeploy.messages import TurbomindEngineConfig', 'engine_config = ' + str(backend_config), @@ -273,24 +273,29 @@ def assert_pipeline_single_element(output, return result -PIC1 = 'https://raw.githubusercontent.com/' + \ - 'open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' -PIC2 = 'https://raw.githubusercontent.com/' + \ - 'open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg' +PIC1 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 +PIC2 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg' # noqa E501 -def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): +def run_pipeline_vl_chat_test(config, + model_case, + backend, + worker_id: str = '', + quant_policy: int = None): log_path = config.get('log_path') tp = get_tp_num(config, model_case) model_path = config.get('model_path') hf_path = model_path + '/' + model_case - if 'llava' in model_case: - backend_config = TurbomindEngineConfig(tp=tp, - session_len=8192, - model_name='vicuna') + if 'pytorch' in backend: + backend_config = PytorchEngineConfig(tp=tp, session_len=8192) + if not is_bf16_supported(): + backend_config.dtype = 'float16' else: backend_config = TurbomindEngineConfig(tp=tp, session_len=8192) + + if 'llava' in model_case: + backend_config.model_name = 'vicuna' if '4bit' in model_case.lower() or 'awq' in model_case.lower(): backend_config.model_format = 'awq' if quant_policy is not None: @@ -301,7 +306,8 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): pipe = pipeline(hf_path, backend_config=backend_config) pipeline_chat_log = os.path.join( - log_path, 'pipeline_vl_chat_' + model_case.split('/')[1] + '.log') + log_path, + 'pipeline_vl_chat_' + model_case.split('/')[1] + worker_id + '.log') file = open(pipeline_chat_log, 'w') image = load_image(PIC1) @@ -311,7 +317,16 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): else: prompt = 'describe this image' - file.writelines('engineconfig:' + str(backend_config)) + log_string = '\n'.join([ + 'reproduce config info:', 'from lmdeploy import pipeline', + 'from lmdeploy.messages import PytorchEngineConfig', + 'from lmdeploy.messages import TurbomindEngineConfig', + 'engine_config = ' + str(backend_config), + 'pipe = pipeline("' + hf_path + '", backend_config=engine_config)', + f'res = pipe(({prompt}, {image}))' + ]) + file.writelines(log_string) + print(log_string) response = pipe((prompt, 
image)) result = 'tiger' in response.text.lower() or '虎' in response.text.lower() file.writelines('result:' + str(result) + @@ -377,11 +392,12 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): torch.cuda.empty_cache() -def assert_pipeline_vl_chat_log(config, model_case): +def assert_pipeline_vl_chat_log(config, model_case, worker_id): log_path = config.get('log_path') pipeline_chat_log = os.path.join( - log_path, 'pipeline_vl_chat_' + model_case.split('/')[1] + '.log') + log_path, + 'pipeline_vl_chat_' + model_case.split('/')[1] + worker_id + '.log') allure.attach.file(pipeline_chat_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index dfc363b086..77af1975be 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -6,6 +6,7 @@ import allure import psutil +from openai import OpenAI from pytest_assume.plugin import assume from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid from utils.get_run_config import get_command_with_extra @@ -278,3 +279,52 @@ def get_model(url): return model_name.split('/')[-1] except Exception: return None + + +PIC = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 + + +def run_vl_testcase(config, port: int = DEFAULT_PORT): + http_url = BASE_HTTP_URL + ':' + str(port) + log_path = config.get('log_path') + + client = OpenAI(api_key='YOUR_API_KEY', base_url=http_url + '/v1') + model_name = client.models.list().data[0].id + + restful_log = os.path.join( + log_path, + 'restful_vl_' + model_name.split('/')[-1] + str(port) + '.log') + file = open(restful_log, 'w') + + prompt_messages = [{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': PIC, + }, + }], + }] + + response = client.chat.completions.create(model=model_name, + messages=prompt_messages, + temperature=0.8, + top_p=0.8) + file.writelines(str(response).lower() + '\n') + assert 'tiger' in str(response).lower() or '虎' in str( + response).lower(), response + + api_client = APIClient(http_url) + model_name = api_client.available_models[0] + for item in api_client.chat_completions_v1(model=model_name, + messages=prompt_messages): + continue + file.writelines(str(item) + '\n') + assert 'tiger' in str(item).lower() or '虎' in str(item).lower(), item + + allure.attach.file(restful_log, + attachment_type=allure.attachment_type.TEXT) diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 684a4f5109..283ce596f6 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -4,39 +4,39 @@ The following tables detail the models supported by LMDeploy's TurboMind engine ## TurboMind on CUDA Platform -| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | -| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: | -| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | 
Yes | Yes | Yes | -| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | -| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | -| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | -| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | -| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | -| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | -| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | -| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | -| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | -| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | +| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | +| :-------------------: | :------------: | :--: | :-------: | :-----: | :-----: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | +| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | +| Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | +| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | +| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | +| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | +| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | +| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | +| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | "-" means not verified yet. 
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index d8bf9a1ad8..908f9a17f5 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -4,39 +4,39 @@ ## TurboMind CUDA 平台 -| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | -| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: | -| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.2 | 3B | LLM | Yes | Yes | Yes | Yes | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | -| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | -| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | -| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | -| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | -| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | -| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | -| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | -| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | -| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | -| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | +| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | +| :-------------------: | :------------: | :--: | :-------: | :-----: | :-----: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | +| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | +| Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | +| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | +| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | +| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | +| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | +| InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 
| 1-2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | +| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | “-” 表示还没有验证。 From 324237b2c9e223c2392088cecb57b3703d1f7d54 Mon Sep 17 00:00:00 2001 From: zhoushenglong <87467364+Reinerzhou@users.noreply.github.com> Date: Thu, 21 Nov 2024 19:01:12 +0800 Subject: [PATCH 086/122] [Feature] support minicpm-v_2_6 for pytorch engine. (#2767) * support minicpmv_2_6. * update supported_models. * update supported_models. --- docs/en/supported_models/supported_models.md | 1 + .../supported_models/supported_models.md | 1 + lmdeploy/pytorch/models/minicpmv26.py | 430 ++++++++++++++++++ lmdeploy/pytorch/models/module_map.py | 6 + lmdeploy/pytorch/supported_models.py | 2 + 5 files changed, 440 insertions(+) create mode 100644 lmdeploy/pytorch/models/minicpmv26.py diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 283ce596f6..da52241253 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -72,6 +72,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 908f9a17f5..502e91b6d3 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -72,6 +72,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | diff --git a/lmdeploy/pytorch/models/minicpmv26.py b/lmdeploy/pytorch/models/minicpmv26.py new file mode 100644 index 0000000000..725e97d9d7 --- /dev/null +++ b/lmdeploy/pytorch/models/minicpmv26.py @@ -0,0 +1,430 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Any, Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers.configuration_utils import PretrainedConfig + +from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, + SiluAndMul, build_rotary_embedding, + build_rotary_params) +from lmdeploy.pytorch.nn.linear import (build_merged_colwise_linear, + build_qkv_proj, build_rowwise_linear) +from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight + +from .utils.cudagraph import CudaGraphMixin + + +class MiniCPMV26Attention(nn.Module): + """Rewrite module of MiniCPMV26Attention.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', None) + num_heads = config.num_attention_heads + num_key_value_heads = config.num_key_value_heads + hidden_size = config.hidden_size + head_dim = getattr(config, 'head_dim', hidden_size // num_heads) + + # packed qkv + self.qkv_proj = build_qkv_proj( + hidden_size, + num_q_heads=num_heads, + num_kv_heads=num_key_value_heads, + head_size=head_dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + + # rotary embedding + self.apply_rotary_pos_emb = ApplyRotaryEmb() + + # attention + self.attn_fwd = Attention( + num_heads, + head_dim, + num_kv_heads=num_key_value_heads, + v_head_size=head_dim, + sliding_window=config.sliding_window, + ) + + # o_proj + self.o_proj = build_rowwise_linear(num_heads * head_dim, + hidden_size, + bias=False, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward( + self, + hidden_states: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attn_metadata: Any = None, + ): + """Rewrite of LlamaAttention.forward.""" + # qkv proj + qkv_states = self.qkv_proj(hidden_states) + # (-1, heads, head_dim) + qkv_states = qkv_states.flatten(0, -2) + query_states, key_states, value_states = self.qkv_proj.split_qkv( + qkv_states) + + # apply rotary embedding + cos, sin = rotary_pos_emb + query_states, key_states = self.apply_rotary_pos_emb( + query_states, + key_states, + cos, + sin, + inplace=True, + ) + + # attention + attn_output = self.attn_fwd( + query_states, + key_states, + value_states, + past_key_value[0], + past_key_value[1], + attn_metadata, + k_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[2], + v_scales_zeros=None + if len(past_key_value) == 2 else past_key_value[3], + inplace=True, + ) + attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1) + + # o proj + attn_output = self.o_proj(attn_output) + return attn_output + + +class MiniCPMV26MLP(nn.Module): + """mlp.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', None) + # gate up + self.gate_up_proj = build_merged_colwise_linear( + config.hidden_size, + [config.intermediate_size, config.intermediate_size], + bias=False, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + + # silu and mul + self.act_fn = SiluAndMul(inplace=True) + + # down + self.down_proj = build_rowwise_linear(config.intermediate_size, + config.hidden_size, + bias=False, + quant_config=quantization_config, + 
dtype=dtype, + device=device, + is_tp=True) + + def forward(self, x): + """forward.""" + gate_up = self.gate_up_proj(x) + act = self.act_fn(gate_up) + return self.down_proj(act) + + +class MiniCPMV26DecoderLayer(nn.Module): + """decoder layer.""" + + def __init__(self, + config: PretrainedConfig, + layer_idx: int, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.layer_idx = layer_idx + quantization_config = getattr(config, 'quantization_config', None) + + # build attention layer + self.self_attn = MiniCPMV26Attention(config, + dtype=dtype, + device=device) + + # build MLP + self.mlp = MiniCPMV26MLP(config, dtype=dtype, device=device) + + # build input layer norm + self.input_layernorm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + # build attention layer norm + self.post_attention_layernorm = RMSNorm( + config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + def forward( + self, + hidden_states: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: Optional[List[torch.FloatTensor]], + residual: Optional[torch.Tensor] = None, + attn_metadata: Any = None, + ): + + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + # Self Attention + hidden_states = self.self_attn( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + past_key_value=past_key_value, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + + outputs = (hidden_states, residual) + return outputs + + +class MiniCPMV26Model(nn.Module): + """model.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + quantization_config = getattr(config, 'quantization_config', None) + + self.embed_tokens = nn.Embedding(config.vocab_size, + config.hidden_size, + self.padding_idx, + dtype=dtype, + device=device) + + # build all decode layers + self.layers = nn.ModuleList([ + MiniCPMV26DecoderLayer(config, + layer_idx, + dtype=dtype, + device=device) + for layer_idx in range(config.num_hidden_layers) + ]) + + # build norm + self.norm = RMSNorm(config.hidden_size, + config.rms_norm_eps, + quant_config=quantization_config, + dtype=dtype, + device=device) + + # build rotary embedding + rope_params = build_rotary_params(config) + rope_dim = config.hidden_size // config.num_attention_heads + rope_max_pos_emb = config.max_position_embeddings + rope_base = config.rope_theta + self.rotary_emb = build_rotary_embedding( + rope_dim, + rope_max_pos_emb, + rope_base, + **rope_params, + ) + + def forward( + self, + input_ids: torch.LongTensor = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + attn_metadata: Any = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ): + """Rewrite of LlamaModel.forward.""" + + # token embedding + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + # rotary embedding + cos, sin = self.rotary_emb(hidden_states, position_ids) + cos, sin = cos[0], 
sin[0] + rotary_pos_emb = (cos, sin) + + # decoding + residual = None + for idx, decoder_layer in enumerate(self.layers): + past_key_value = past_key_values[idx] + hidden_states, residual = decoder_layer( + hidden_states, + rotary_pos_emb=rotary_pos_emb, + past_key_value=past_key_value, + residual=residual, + attn_metadata=attn_metadata, + ) + + # norm + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + def get_input_embeddings(self): + """get input embeddings.""" + return self.embed_tokens + + +class MiniCPMVForCausalLM(nn.Module, CudaGraphMixin): + """rewrote model of MiniCPMVForCausalLM.""" + + packed_modules_mapping = { + 'gate_up_proj': [ + 'gate_proj', + 'up_proj', + ], + } + + def __init__(self, + config: PretrainedConfig, + ctx_mgr: StepContextManager, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.ctx_mgr = ctx_mgr + # build model + self.model = MiniCPMV26Model(config, dtype=dtype, device=device) + # build lm_head + self.lm_head = build_rowwise_linear(config.hidden_size, + config.vocab_size, + bias=False, + dtype=dtype, + device=device) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + past_key_values: List[List[torch.Tensor]], + attn_metadata: Any = None, + inputs_embeds: torch.Tensor = None, + **kwargs, + ): + """model forward, return logits.""" + hidden_states = self.model( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + ) + + return hidden_states + + def get_logits(self, hidden_states: torch.Tensor): + """compute logits of the model output.""" + return self.lm_head(hidden_states) + + def update_weights(self): + """update weights.""" + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + def get_input_embeddings(self): + """get input embeddings.""" + return self.model.get_input_embeddings() + + def prepare_inputs_for_generation( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: StepContext = None, + ): + """prepare input.""" + # get input_ids, position_ids and attention metadatas + input_ids = context.input_ids + position_ids = context.position_ids + attn_metadata = context.attn_metadata + + # process vision embeddings + vision_embeddings = context.input_embeddings + vision_embedding_indexing = context.input_embedding_indexing + if vision_embeddings is not None and len(vision_embeddings) > 0: + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds[:, + vision_embedding_indexing, :] = vision_embeddings.to( + inputs_embeds) + + # inputs of forward + return dict( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + ) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + """load weights.""" + # modify from vllm + stacked_params_mapping = [ + ('.qkv_proj', '.q_proj', 'q'), + ('.qkv_proj', '.k_proj', 'k'), + ('.qkv_proj', '.v_proj', 'v'), + ('.gate_up_proj', '.gate_proj', 0), + ('.gate_up_proj', '.up_proj', 1), + ] + + params_dict = dict(self.named_parameters(prefix='llm')) + for name, loaded_weight in weights: + if 'vpm' in name or 'resampler' in name: + continue + if 'rotary_emb.inv_freq' in name: + continue + if ('rotary_emb.cos_cached' in name + or 'rotary_emb.sin_cached' in name): + 
continue + if self.config.tie_word_embeddings and 'lm_head.weight' in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + load_weight(param, loaded_weight, shard_id=shard_id) + break + else: + param = params_dict[name] + load_weight(param, loaded_weight) diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py index e6b5f6e29e..1059bfee4e 100644 --- a/lmdeploy/pytorch/models/module_map.py +++ b/lmdeploy/pytorch/models/module_map.py @@ -173,6 +173,12 @@ f'{LMDEPLOY_PYTORCH_MODEL_PATH}.minicpm3.MiniCPM3ForCausalLM', }) +# minicpmv2_6 +MODULE_MAP.update({ + 'MiniCPMV': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.minicpmv26.MiniCPMVForCausalLM', +}) + # mllama MODULE_MAP.update({ 'MllamaForConditionalGeneration': diff --git a/lmdeploy/pytorch/supported_models.py b/lmdeploy/pytorch/supported_models.py index 21418188dd..7fa568651b 100644 --- a/lmdeploy/pytorch/supported_models.py +++ b/lmdeploy/pytorch/supported_models.py @@ -70,6 +70,8 @@ PhiMoEForCausalLM=True, # mllama MllamaForConditionalGeneration=True, + # MiniCPM-V-2_6 + MiniCPMVForCausalLM=True, ) From b4834ea4c6d9add7253092ff3271f0add86bab44 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:21:12 +0800 Subject: [PATCH 087/122] Support qwen2-vl AWQ quantization (#2787) * Support qwen2-vl AWQ quantization * Update config.yaml --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> --- autotest/config.yaml | 4 +-- lmdeploy/lite/apis/calibrate.py | 3 ++ lmdeploy/lite/quantization/awq.py | 9 ++++++ lmdeploy/lite/utils/batch_split.py | 8 +++++ lmdeploy/vl/model/qwen2.py | 52 ++++++++++++++++-------------- 5 files changed, 49 insertions(+), 27 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 6c92d2cf0b..e31a40f0d4 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -163,8 +163,6 @@ pytorch_base_model: turbomind_quatization: no_awq: - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - deepseek-ai/deepseek-coder-1.3b-instruct - codellama/CodeLlama-7b-Instruct-hf @@ -189,6 +187,8 @@ pytorch_quatization: - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2-1.5B-Instruct - microsoft/Phi-3-mini-4k-instruct + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct w8a8: - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Llama-2-7b-chat-hf diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index cd5178793d..0780e93594 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -26,6 +26,7 @@ 'Phi3ForCausalLM': 'Phi3DecoderLayer', 'ChatGLMForConditionalGeneration': 'GLMBlock', 'MixtralForCausalLM': 'MixtralDecoderLayer', + 'Qwen2VLForConditionalGeneration': 'Qwen2VLDecoderLayer', } NORM_TYPE_MAP = { @@ -42,6 +43,7 @@ 'Phi3ForCausalLM': 'Phi3RMSNorm', 'ChatGLMForConditionalGeneration': 'RMSNorm', 'MixtralForCausalLM': 'MixtralRMSNorm', + 'Qwen2VLForConditionalGeneration': 'Qwen2RMSNorm', } HEAD_NAME_MAP = { @@ -58,6 +60,7 @@ 'Phi3ForCausalLM': 'lm_head', 'ChatGLMForConditionalGeneration': 'output_layer', 'MixtralForCausalLM': 'lm_head', + 'Qwen2VLForConditionalGeneration': 'lm_head', } diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index 068ad9357e..2efe41b6da 100644 --- a/lmdeploy/lite/quantization/awq.py +++ 
b/lmdeploy/lite/quantization/awq.py @@ -45,6 +45,11 @@ ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], 'post_attention_layernorm': ['block_sparse_moe.experts.{i}.w1', 'block_sparse_moe.experts.{i}.w3'] + }, + 'Qwen2VLDecoderLayer': { + 'input_layernorm': + ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], + 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj'] } } @@ -83,6 +88,10 @@ 'MixtralDecoderLayer': { 'self_attn.v_proj': ['self_attn.o_proj'], 'block_sparse_moe.experts.{i}.w3': ['block_sparse_moe.experts.{i}.w2'] + }, + 'Qwen2VLDecoderLayer': { + 'self_attn.v_proj': ['self_attn.o_proj'], + 'mlp.up_proj': ['mlp.down_proj'] } } diff --git a/lmdeploy/lite/utils/batch_split.py b/lmdeploy/lite/utils/batch_split.py index 3bd208f609..4e30f61d34 100644 --- a/lmdeploy/lite/utils/batch_split.py +++ b/lmdeploy/lite/utils/batch_split.py @@ -46,6 +46,14 @@ def split_decoder_layer_inputs( for name, val in kwargs.items(): if isinstance(val, torch.Tensor) and val.size(0) == bs: new_kwargs[name] = val[i:i + batch_size] + elif isinstance(val, torch.Tensor) and len( + val.shape) > 1 and val.size(1) == bs: # qwen2-vl + new_kwargs[name] = val[:, i:i + batch_size] + elif name == 'position_embeddings' and isinstance( + val, Tuple) and len( + val[0].shape) > 1 and val[0].size(1) == bs: # qwen2-vl + new_kwargs[name] = (val[0][:, i:i + batch_size], + val[1][:, i:i + batch_size]) else: new_kwargs[name] = val diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py index 2e53d8e0f0..3eb3c1541c 100644 --- a/lmdeploy/vl/model/qwen2.py +++ b/lmdeploy/vl/model/qwen2.py @@ -33,33 +33,35 @@ class Qwen2VLModel(VisonModel): def build_model(self): check_qwen_vl_deps_install() - - from accelerate import init_empty_weights - with init_empty_weights(): - config = self.hf_config - config.quantization_config = {} # disable vision part quantization - # disable accelerate check_tied_parameters_in_config - # for Qwen2-VL-2B-Instruct - config.tie_word_embeddings = False - - from transformers import Qwen2VLForConditionalGeneration - model = Qwen2VLForConditionalGeneration._from_config(config) - if not self.with_llm: + from transformers import Qwen2VLForConditionalGeneration + if self.with_llm: + model = Qwen2VLForConditionalGeneration.from_pretrained( + self.hf_config._name_or_path, trust_remote_code=True) + model.half() + self.vl_model = model + else: + from accelerate import init_empty_weights + with init_empty_weights(): + config = self.hf_config + config.quantization_config = { + } # disable vision part quantization + # disable accelerate check_tied_parameters_in_config + # for Qwen2-VL-2B-Instruct + config.tie_word_embeddings = False + + model = Qwen2VLForConditionalGeneration._from_config(config) del model.model del model.lm_head - else: - self.vl_model = model - model.half() - - from accelerate import load_checkpoint_and_dispatch - with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map='auto' if not self.with_llm else {'': 'cpu'}, - max_memory=self.max_memory, - no_split_module_classes=['Qwen2VLVisionBlock'], - dtype=torch.half) + model.half() + from accelerate import load_checkpoint_and_dispatch + with disable_logging(): + load_checkpoint_and_dispatch( + model=model, + checkpoint=self.model_path, + device_map='auto' if not self.with_llm else {'': 'cpu'}, + max_memory=self.max_memory, + no_split_module_classes=['Qwen2VLVisionBlock'], + dtype=torch.half) self.model = model.eval() From 
f13c0f93e82873b18dbc3daabb2f89ed16b4ea21 Mon Sep 17 00:00:00 2001 From: Wei Tao <1136862851@qq.com> Date: Mon, 25 Nov 2024 18:33:24 +0800 Subject: [PATCH 088/122] [dlinfer] Fix qwenvl rope error for dlinfer backend (#2795) --- lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py index fab6e510f5..ed807d66b0 100644 --- a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py +++ b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py @@ -24,7 +24,8 @@ def _rotary_embedding_fwd(position_ids: torch.Tensor, else: position_ids = position_ids.float() - inv_freq_expanded = inv_freq.view(1, -1, 1) + inv_freq_expanded = inv_freq.view(1, -1, 1).expand(position_ids.size(0), + -1, 1) position_ids_expanded = position_ids.unsqueeze(1) tmp = torch.bmm(inv_freq_expanded, position_ids_expanded) From b5b31791a76cf37ac3856d0394fa3eb5502217a6 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Mon, 25 Nov 2024 20:30:58 +0800 Subject: [PATCH 089/122] Optimize update_step_ctx on Ascend (#2804) * opt update_ctx for ascend * fix lint --- .../backends/dlinfer/ascend/op_backend.py | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 79e5288364..b6f544510b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -71,31 +71,42 @@ def get_total_slots(): max_q_seq_len = max(q_seqlens_list) max_kv_seq_len = max(kv_seqlens_list) - for i in range(step_context.q_start_loc.size(0)): - q_seq_len = q_seqlens_list[i] - kv_seq_len = kv_seqlens_list[i] - - # collect kv start indices. - history_length = kv_seq_len - q_seq_len - total_slots = get_total_slots() - slot_tables = total_slots[step_context.block_offsets[i]].view(-1) - slots = slot_tables[history_length:kv_seq_len] - kv_start_indices.append(slots) - - # collect attention mask of paged_prefill attention stage. - if not (step_context.is_decoding or is_unpaged_prefill): - single_attention_mask = torch.logical_not( - torch.tril( - torch.ones(q_seq_len, - step_context.block_offsets.shape[1] * - block_size, - dtype=torch.bool, - device=step_context.block_offsets.device), - diagonal=kv_seq_len - q_seq_len, - )) - attention_mask.append(single_attention_mask) - - kv_start_indices = torch.cat(kv_start_indices) + if step_context.is_decoding: + # collect kv_start_indices without using a for-loop, + # (fill kv-cache for just ONE token during the decoding phase) + idx = (step_context.kv_seqlens - 1) % block_size + block_num = (step_context.kv_seqlens - 1) // block_size + last_block = step_context.block_offsets.gather( + 1, block_num.view(-1, 1)).view(-1) + kv_start_indices = last_block * block_size + idx + else: + for i in range(step_context.q_start_loc.size(0)): + q_seq_len = q_seqlens_list[i] + kv_seq_len = kv_seqlens_list[i] + + # collect kv start indices during the prefill phase. + history_length = kv_seq_len - q_seq_len + total_slots = get_total_slots() + slot_tables = total_slots[step_context.block_offsets[i]].view( + -1) + slots = slot_tables[history_length:kv_seq_len] + kv_start_indices.append(slots) + + # collect attention mask of paged_prefill attention stage. 
+ if not is_unpaged_prefill: + single_attention_mask = torch.logical_not( + torch.tril( + torch.ones( + q_seq_len, + step_context.block_offsets.shape[1] * + block_size, + dtype=torch.bool, + device=step_context.block_offsets.device), + diagonal=kv_seq_len - q_seq_len, + )) + attention_mask.append(single_attention_mask) + + kv_start_indices = torch.cat(kv_start_indices) if step_context.is_decoding: # prepare some params of paged_decode attention stage. From 3913eadfbb60c2d73832f25415d5ebaef62280a4 Mon Sep 17 00:00:00 2001 From: q yao Date: Wed, 27 Nov 2024 19:17:16 +0800 Subject: [PATCH 090/122] disable prefix-caching for vl model (#2825) --- lmdeploy/api.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lmdeploy/api.py b/lmdeploy/api.py index e66d73754a..2b4204a53b 100644 --- a/lmdeploy/api.py +++ b/lmdeploy/api.py @@ -69,7 +69,11 @@ def pipeline(model_path: str, if backend_config is not None else None model_path = get_model(model_path, download_dir, revision) - _, pipeline_class = get_task(model_path) + task, pipeline_class = get_task(model_path) + if task == 'vlm': + if backend_config.enable_prefix_caching: + backend_config.enable_prefix_caching = False + logger.warning('VLM does not support prefix caching.') if type(backend_config) is not PytorchEngineConfig: # set auto backend mode From f88fbc3c31961b1cb159e041dcb657592fb2da21 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 29 Nov 2024 10:37:42 +0800 Subject: [PATCH 091/122] Add DeepSeek-V2 support (#2763) * add qwen2-moe * eliminate `inter_size_` from ffn layer * clean up * fix lint * clean up * layer-wise `inter_size` & `expert_num` * add head dim 192 * refactor weight processing * deepseek-v2-lite * deepseek-v2 * fix lint * fix lint * fix ut * Update config.yaml * Update config.yaml * fix mixtral * fix moe gating & config parsing * fix yarn for deepseek-v2 * fix `copy_from` * fix rms norm, rotary embedding & deepseek v2 attention * remove debug code --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> --- autotest/config.yaml | 2 + examples/cpp/llama/llama_triton_example.cc | 4 +- lmdeploy/turbomind/deploy/config.py | 23 +- lmdeploy/turbomind/deploy/converter.py | 7 +- lmdeploy/turbomind/deploy/loader.py | 21 ++ lmdeploy/turbomind/deploy/module.py | 82 ++++- .../turbomind/deploy/source_model/__init__.py | 1 + .../deploy/source_model/deepseek2.py | 134 ++++++++ .../turbomind/deploy/source_model/mixtral.py | 2 +- .../turbomind/deploy/source_model/qwen.py | 2 +- .../turbomind/deploy/target_model/base.py | 20 +- lmdeploy/turbomind/supported_models.py | 1 + src/turbomind/kernels/CMakeLists.txt | 1 + .../kernels/attention/CMakeLists.txt | 2 + src/turbomind/kernels/attention/attention.cu | 6 + .../attention/codegen/attention_sm80_192.cu | 16 + .../attention/codegen/decoding_sm80_192.cu | 20 ++ src/turbomind/kernels/attention/decoding.cu | 17 +- .../kernels/attention/decoding_config.h | 12 +- src/turbomind/kernels/attention/impl_16816.h | 61 ++-- src/turbomind/kernels/attention/impl_81616.h | 2 +- src/turbomind/kernels/attention/impl_simt.h | 14 +- .../kernels/attention/kv_cache_utils_v2.cu | 12 +- .../kernels/attention/mainloop_sm80.h | 17 +- src/turbomind/kernels/attention/reduce.cu | 6 +- .../kernels/attention/reduce_kernel.h | 7 +- .../kernels/attention/rotary_embedding.h | 17 + .../kernels/attention/test_attention.cu | 12 +- src/turbomind/kernels/core/array_ops.h | 2 +- src/turbomind/kernels/core/math.h | 8 + src/turbomind/kernels/core/thread_map.h | 3 +- 
.../flash_attention2/CMakeLists.txt | 4 +- .../flash_fwd_launch_template.h | 2 +- .../flash_attention2/static_switch.h | 12 + src/turbomind/kernels/gemm/context.h | 13 +- src/turbomind/kernels/gemm/convert_v2.cu | 41 ++- src/turbomind/kernels/gemm/moe_utils_v2.cu | 195 +++++++++-- src/turbomind/kernels/gemm/moe_utils_v2.h | 4 + .../kernels/gemm/test/test_moe_utils.cu | 86 +---- src/turbomind/kernels/gemm/test/testbed.h | 4 +- src/turbomind/kernels/gemm/unpack.cu | 34 +- src/turbomind/kernels/norm/CMakeLists.txt | 5 + src/turbomind/kernels/norm/rms_norm.cu | 235 +++++++++++++ src/turbomind/kernels/norm/rms_norm.h | 21 ++ src/turbomind/models/llama/CMakeLists.txt | 4 +- src/turbomind/models/llama/LlamaBatch.cc | 6 +- .../models/llama/LlamaDecoderLayerWeight.cc | 325 ++++++++---------- .../models/llama/LlamaDecoderLayerWeight.h | 39 +-- src/turbomind/models/llama/LlamaDenseWeight.h | 265 +++++++++----- src/turbomind/models/llama/LlamaFfnLayer.cc | 26 +- src/turbomind/models/llama/LlamaFfnLayer.h | 9 +- src/turbomind/models/llama/LlamaV2.cc | 1 - src/turbomind/models/llama/LlamaV2.h | 1 - src/turbomind/models/llama/LlamaWeight.cc | 99 +++--- src/turbomind/models/llama/LlamaWeight.h | 36 +- src/turbomind/models/llama/llama_gemm.cc | 2 +- src/turbomind/models/llama/llama_kernels.h | 2 +- src/turbomind/models/llama/llama_params.h | 65 +++- src/turbomind/models/llama/llama_utils.cu | 73 ++-- src/turbomind/models/llama/mla_utils.cu | 93 +++++ src/turbomind/models/llama/mla_utils.h | 57 +++ src/turbomind/models/llama/moe_ffn_layer.cc | 74 ++-- src/turbomind/models/llama/moe_ffn_layer.h | 20 +- .../models/llama/unified_attention_layer.cc | 150 ++++++-- .../models/llama/unified_attention_layer.h | 7 +- src/turbomind/models/llama/unified_decoder.cc | 89 ++--- src/turbomind/models/llama/unified_decoder.h | 16 +- src/turbomind/models/llama/weight_type.h | 56 +++ src/turbomind/python/bind.cpp | 48 ++- .../triton_backend/llama/LlamaTritonModel.cc | 80 +++-- .../triton_backend/llama/LlamaTritonModel.h | 3 - src/turbomind/utils/allocator.h | 3 +- src/turbomind/utils/cuda_utils.h | 19 + src/turbomind/utils/memory_utils.cu | 108 +++--- src/turbomind/utils/memory_utils.h | 13 +- 75 files changed, 2118 insertions(+), 861 deletions(-) create mode 100644 lmdeploy/turbomind/deploy/source_model/deepseek2.py create mode 100644 src/turbomind/kernels/attention/codegen/attention_sm80_192.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu create mode 100644 src/turbomind/kernels/norm/CMakeLists.txt create mode 100644 src/turbomind/kernels/norm/rms_norm.cu create mode 100644 src/turbomind/kernels/norm/rms_norm.h create mode 100644 src/turbomind/models/llama/mla_utils.cu create mode 100644 src/turbomind/models/llama/mla_utils.h create mode 100644 src/turbomind/models/llama/weight_type.h diff --git a/autotest/config.yaml b/autotest/config.yaml index e31a40f0d4..88ca7c3127 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -62,6 +62,7 @@ turbomind_chat_model: - liuhaotian/llava-v1.6-vicuna-7b - deepseek-ai/deepseek-vl-1.3b-chat - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf - THUDM/glm-4-9b-chat - openbmb/MiniCPM-Llama3-V-2_5 @@ -165,6 +166,7 @@ turbomind_quatization: no_awq: - mistralai/Mistral-7B-Instruct-v0.3 - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf gptq: - internlm/internlm2_5-7b-chat diff --git 
a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index b0e513410e..1fb5fa0964 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -114,14 +114,14 @@ broadCastRequest(const std::vector& v_start_ids, } else { // conditional case. - ft::deviceMalloc(&d_input_ids, size_1, false); + ft::deviceMalloc(&d_input_ids, size_1, nullptr, false); // ft::deviceMalloc(&d_input_lengths, size_2, false); ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1); // ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2); } if (!v_input_bad_words.empty()) { - ft::deviceMalloc(&d_input_bad_words, size_bad_words, false); + ft::deviceMalloc(&d_input_bad_words, size_bad_words, nullptr, false); ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words); } else { diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index c724b085a0..e483500e96 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -2,6 +2,7 @@ import inspect import json from dataclasses import asdict, fields +from typing import List # use pydantic.dataclasses.dataclass to check data type from pydantic.dataclasses import dataclass @@ -43,22 +44,33 @@ class ModelConfig: # of token_embedding embedding_size: int = 0 num_layer: int = None - inter_size: int = None + inter_size: List[int] = None norm_eps: float = None attn_bias: int = 0 start_id: int = None end_id: int = None size_per_head: int = 128 - group_size: int = 0 + group_size: int = 64 weight_type: str = None session_len: int = None tp: int = 1 model_format: str = 'hf' - expert_num: int = 0 + expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 - moe_shared_gate: int = False - moe_norm_topk: int = False + moe_shared_gate: bool = False + norm_topk_prob: bool = False + routed_scale: float = 1.0 + topk_group: int = 1 + topk_method: str = 'greedy' + moe_group_num: int = 1 + # MLA + q_lora_rank: int = 0 + kv_lora_rank: int = 0 + qk_rope_dim: int = 0 + v_head_dim: int = 0 + # tuning + tune_layer_num: int = 1 def verify(self): invalid = {} @@ -72,6 +84,7 @@ def verify(self): class AttentionConfig: rotary_embedding: int = 128 rope_theta: float = 10000.0 + softmax_scale: float = 0 attention_factor: float = None max_position_embeddings: int = 0 original_max_position_embeddings: int = 0 diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 1c847ede01..77f0bc8dc8 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -241,11 +241,10 @@ def get_tm_model(model_path, engine_config.model_format = quant_method group_size = _group_size - # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) - if not group_size: - group_size = 128 - if engine_config.model_format in ['awq', 'gptq']: + # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) + if not group_size: + group_size = 128 assert group_size == 128, \ f'model format is "{engine_config.model_format}" ' \ f'but group_size is {group_size}. 
Currently, only 128 ' \ diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py index e3d79b164a..94e779b6b7 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/deploy/loader.py @@ -88,6 +88,27 @@ def items(self): yield (-1, {k: f.get_tensor(k) for k in misc}) assert not params + # def items(self): + # params = defaultdict(dict) + # for shard in self.shards: + # # with safe_open(shard, 'pt') as f: + # with open(shard, 'rb') as f: + # w = safetensors.torch.load(f.read()) + # misc = [] + # for k in w.keys(): + # match = re.findall(self.pattern, k) + # if not match: + # misc.append(k) + # else: + # idx = int(match[0]) + # param = params[idx] + # param[k] = w[k] + # if len(param) == self.item_count[idx]: + # yield (idx, params.pop(idx)) + # if misc: + # yield (-1, {k: w[k] for k in misc}) + # assert not params + class PytorchLoader(BaseLoader): diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 8d998abe2b..52497175ef 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -96,10 +96,13 @@ class Ffn(Module): def __init__(self, model: BaseOutputModel): self.model = model self.tp = model.tensor_para_size + # inter_sizes in config are padded and may be different from what's + # in the weights self.inter_size = model.model_config.inter_size self.group_size = max(1, model.model_config.group_size) def _export(self, + inter_size: int, fmt: str, idx: int, w123, @@ -110,11 +113,11 @@ def _export(self, w1, w2, w3 = map(transpose, w123) if not is_lora_a: - w1 = pad_out_dims(w1, self.inter_size) - w3 = pad_out_dims(w3, self.inter_size) + w1 = pad_out_dims(w1, inter_size) + w3 = pad_out_dims(w3, inter_size) if not is_lora_b: group_size = self.group_size if apply_gs else 1 - w2 = pad_in_dims(w2, self.inter_size // group_size) + w2 = pad_in_dims(w2, inter_size // group_size) w1, w2, w3 = map(pack_fn, (w1, w2, w3)) self.model.save_split(w1, @@ -132,7 +135,8 @@ def _export(self, def apply(self, i: int, r: BaseReader): for e in get_params(r.ffn(i, None)): - e(partial(self._export, self._ffn), partial(r.ffn, i), i) + e(partial(self._export, self.inter_size[i], self._ffn), + partial(r.ffn, i), i) class MoeFfn(Ffn): @@ -154,11 +158,13 @@ def __init__(self, model: BaseOutputModel): self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): + if self.expert_num[i] == 0: + return for p in get_params(r.moe_ffn_expert()): - for e in range(self.expert_num): + for e in range(self.expert_num[i]): fmt = self._moe_ffn_expert.replace('E', str(e)) - p(partial(self._export, fmt), partial(r.moe_ffn_expert, e, i), - i) + p(partial(self._export, self.inter_size, fmt), + partial(r.moe_ffn_expert, e, i), i) gate = transpose(r.moe_ffn_gate(i)) self.model.save_split(gate, self._moe_ffn_gate.format(i)) @@ -218,6 +224,62 @@ def apply(self, i: int, r: BaseReader): e(self._export, partial(r.attn, i), i) +class MLA(Module): + """ + requires: + r.mla(i, kind) + r.mla_norm(i) + """ + + _mla = 'layers.{0}.attention.{1}.{2}' + + def __init__(self, model: BaseOutputModel): + self.model = model + + def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): + if all(x is None for x in xs): + return + q_a, q_b, q, kv_a, kv_b, o = map(transpose, xs) + + if q is not None: + q_b = q + + cfg = self.model.model_config + + o = o.reshape(cfg.head_num, cfg.v_head_dim, -1) + o = torch.nn.functional.pad( + o, (0, 0, 0, cfg.size_per_head - cfg.v_head_dim, 0, 0)) + o = o.view(cfg.head_num * 
cfg.size_per_head, cfg.hidden_units) + + if q_a is not None: + self.model.save_split(pack_fn(q_a), + self._mla.format(idx, 'q_a_proj', kind)) + q_b_name = 'q_proj' if q_a is None else 'q_b_proj' + self.model.save_split(pack_fn(q_b), + self._mla.format(idx, q_b_name, kind), + split_dim=-1) + self.model.save_split(pack_fn(kv_a), + self._mla.format(idx, 'kv_a_proj', kind)) + self.model.save_split(pack_fn(kv_b), + self._mla.format(idx, 'kv_b_proj', kind), + split_dim=-1) + self.model.save_split(pack_fn(o), + self._mla.format(idx, 'wo', kind), + split_dim=0) + + _layernorm = 'layers.{0}.attention.{1}_a_layernorm' + + def apply(self, i: int, r: BaseReader): + + for f in get_params(r.attn(i, None), bias=False): + f(self._export, partial(r.mla, i), i) + + q, k = r.mla_norm(i) + if q is not None: + self.model.save_split(q, self._layernorm.format(i, 'q')) + self.model.save_split(k, self._layernorm.format(i, 'kv')) + + class Misc(Module): """ requires: @@ -258,7 +320,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - modules = [Attn, LayerNorm] + modules = [LayerNorm] + if model.model_config.kv_lora_rank: + modules.append(MLA) + else: + modules.append(Attn) if model.model_config.inter_size: modules.append(Ffn) if model.model_config.expert_num: diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index de16bdc0a0..b9394b1244 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 +from .deepseek2 import DeepSeek2Model # noqa: F401 from .deepseek_vl import DeepSeekVLModel # noqa: F401 from .glm4 import Glm4Model # noqa: F401 from .internlm2 import InternLM2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py new file mode 100644 index 0000000000..0023f650ff --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import math + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class DeepSeek2Reader(LlamaReader): + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(r'mlp' if i == 0 else r'shared_expert\.') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.shared_experts.{key}_proj.{kind}' + if i == 0: + name = name.replace('shared_experts.', '') + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla(self, i: int, kind: str): + if not kind: + return self.filter(r'self_attn.*proj') + result = [] + for key in [ + 'q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', + 'kv_b_proj', 'o_proj' + ]: + tensor = self.params.get( + f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}') + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla_norm(self, i: int): + result = [] + for k in ['q', 'kv']: + name = f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight' # noqa: E501 + result.append(self.params.get(name)) + return (*result, ) + + +def get_yarn_params(rope_scaling: dict): + + scaling_factor = float(rope_scaling['factor']) + mscale = rope_scaling['mscale'] + mscale_all_dim = rope_scaling['mscale_all_dim'] + + def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + _mscale = float( + yarn_get_mscale(scaling_factor, mscale) / + yarn_get_mscale(scaling_factor, mscale_all_dim)) + + softmax_scale = 0 + if mscale_all_dim: + scale = yarn_get_mscale(scaling_factor, mscale_all_dim) + softmax_scale = scale * scale + + return _mscale, softmax_scale + + +@INPUT_MODELS.register_module(name='deepseek2') +class DeepSeek2Model(LlamaModel): + + Reader = DeepSeek2Reader + + def tokenizer_info(self): + n_words = self.model_config['vocab_size'] + bos_id = self.model_config['bos_token_id'] + eos_id = self.model_config['eos_token_id'] + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + qk_nope_dim = cfg['qk_nope_head_dim'] + qk_rope_dim = cfg['qk_rope_head_dim'] + num_layer = cfg['num_hidden_layers'] + expert_num = cfg['n_routed_experts'] + expert_num = [expert_num] * num_layer + expert_num[0] = 0 + n_shared_experts = cfg['n_shared_experts'] + expert_inter_size = cfg['moe_intermediate_size'] + experts_per_token = cfg['num_experts_per_tok'] + inter_size = [n_shared_experts * expert_inter_size] * num_layer + inter_size[0] = cfg['intermediate_size'] + norm_topk_prob = cfg['norm_topk_prob'] + size_per_head = qk_rope_dim + qk_nope_dim + info.update(kv_lora_rank=cfg['kv_lora_rank'], + q_lora_rank=cfg['q_lora_rank'] or 0, + qk_rope_dim=qk_rope_dim, + v_head_dim=cfg['v_head_dim'], + size_per_head=size_per_head, + rotary_embedding=qk_rope_dim, + expert_num=expert_num, + expert_inter_size=expert_inter_size, + experts_per_token=experts_per_token, + inter_size=inter_size, + norm_topk_prob=norm_topk_prob, + 
routed_scale=cfg['routed_scaling_factor'], + topk_method=cfg['topk_method'], + topk_group=cfg['topk_group'], + moe_group_num=cfg['n_group'], + tune_layer_num=2) + rope_scaling = cfg.get('rope_scaling') + if rope_scaling and rope_scaling['type'] == 'yarn': + attention_factor, softmax_scale = get_yarn_params(rope_scaling) + softmax_scale *= size_per_head**(-0.5) + info.update(max_position_embeddings=rope_scaling[ + 'original_max_position_embeddings'], + attention_factor=attention_factor, + softmax_scale=softmax_scale) + return info diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index ff9df2d409..6ac22a658e 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,6 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] - info['moe_norm_topk'] = True + info['norm_topk_prob'] = True info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 772bd03037..637983e8ce 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -178,6 +178,6 @@ def model_info(self): info['experts_per_token'] = cfg['num_experts_per_tok'] info['inter_size'] = cfg['shared_expert_intermediate_size'] info['moe_shared_gate'] = True - info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['norm_topk_prob'] = cfg['norm_topk_prob'] info['attn_bias'] = 1 return info diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 09699ade09..f2c981bb24 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp from abc import ABC +from collections.abc import Sequence import torch import tqdm @@ -65,13 +66,14 @@ def __init__(self, # get `model_info` and `tokenizer_info` at first, which # will be updated to `self.model_config` and `self.attention_config` self.input_model_info = self.input_model.model_info() + self.input_model_info = self.single_to_list( + self.input_model_info, keys=['inter_size', 'expert_num']) self.input_model_tokenizer_info = self.input_model.tokenizer_info() self.permute_qk = self.input_model_info.get('permute_qk', True) - self.update_model_config() - self.model_config.inter_size = _pad_inter_size( - self.model_config.inter_size, self.model_config.group_size, - self.tensor_para_size) + for i, v in enumerate(self.model_config.inter_size): + self.model_config.inter_size[i] = _pad_inter_size( + v, self.model_config.group_size, self.tensor_para_size) if self.model_config.expert_num: self.model_config.expert_inter_size = _pad_inter_size( self.model_config.expert_inter_size, @@ -79,11 +81,21 @@ def __init__(self, self.model_config.verify() assert self.model_config.kv_head_num % self.tensor_para_size == 0 + # print(self.model_config) + self.update_attention_config() self.update_lora_config() # ! 
Dependency on `self` self.model = model_cls(self) + def single_to_list(self, config: dict, keys): + num_layer = int(config['num_layer']) + for k in keys: + v = config.get(k, None) + if v is not None and not isinstance(v, Sequence): + config[k] = [v] * num_layer + return config + def update_model_config(self): """Update `self.model_config` according to the input_model's `tokenizer_info` and `model_info`""" diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index e66da22df0..11e99edfa0 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -33,6 +33,7 @@ InternVLChatModel='internvl', # deepseek-vl MultiModalityCausalLM='deepseekvl', + DeepseekV2ForCausalLM='deepseek2', # MiniCPMV MiniCPMV='minicpmv', # mini gemini diff --git a/src/turbomind/kernels/CMakeLists.txt b/src/turbomind/kernels/CMakeLists.txt index febb8692dd..40a48402af 100644 --- a/src/turbomind/kernels/CMakeLists.txt +++ b/src/turbomind/kernels/CMakeLists.txt @@ -68,3 +68,4 @@ endif () add_subdirectory(attention) add_subdirectory(gemm) +add_subdirectory(norm) diff --git a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index af9d47e0e6..32de38981a 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -38,6 +38,8 @@ add_library(attention STATIC codegen/decoding_sm80_64_f16_f16.cu codegen/decoding_sm80_64_f16_u4.cu codegen/decoding_sm80_64_f16_u8.cu + codegen/attention_sm80_192.cu + codegen/decoding_sm80_192.cu ) set_property(TARGET attention PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index 3f557234bc..e7642584c2 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ b/src/turbomind/kernels/attention/attention.cu @@ -46,6 +46,12 @@ void dispatchAttention(const AttentionParams& params) else if (params.size_per_head == 128) { return dispatch(std::integral_constant{}); } + + if (params.size_per_head == 192) { + using Config = AttentionConfig; + return invokeAttention(params); + } + FT_CHECK(0); } diff --git a/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu new file mode 100644 index 0000000000..ceeafa7a6d --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu new file mode 100644 index 0000000000..214e6748d9 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu @@ -0,0 +1,20 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 1b04b7d4eb..67bd81e45b 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -2,8 +2,8 @@ #include "decoding.h" #include "decoding_config.h" +#include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" -// #include "src/turbomind/utils/dispatch.h" #include #include @@ -113,6 +113,21 @@ void dispatchDecoding(const AttentionParams& params) return false; }; + if (params.size_per_head == 192) { + + if (is_kv_int8) { + invokeDecoding>(params); + } + else if (is_kv_int4) { + FT_CHECK_WITH_INFO(!is_kv_int4, "not implemented"); + // invokeDecoding>(params); + } + else { + invokeDecoding>(params); + } + return; + } + auto success = dispatch(); FT_CHECK(success); diff --git a/src/turbomind/kernels/attention/decoding_config.h b/src/turbomind/kernels/attention/decoding_config.h index 7dcb119cfd..dfd5e07835 100644 --- a/src/turbomind/kernels/attention/decoding_config.h +++ b/src/turbomind/kernels/attention/decoding_config.h @@ -40,7 +40,7 @@ struct DecodingConfig 2) }; template -struct DecodingConfig { +struct DecodingConfig> { static constexpr int Qh = (Qh_ + 7) / 8 * 8; using Attention = Impl; using CacheIter = GetBlockIterFactory; @@ -76,4 +76,14 @@ struct DecodingConfig { using Kernel = AttentionUniversal, CacheIter, DecodingCtaMap>; }; +template +struct DecodingConfig { + static constexpr int Qh = 1; + static constexpr int HeadDim = 192; + + using Attention = Impl; + using CacheIter = GetBlockIterFactory; + using Kernel = AttentionUniversal, Attention>, CacheIter, DecodingCtaMap>; +}; + } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/impl_16816.h b/src/turbomind/kernels/attention/impl_16816.h index 6e8f37f4d4..07c7dcb12b 100644 --- a/src/turbomind/kernels/attention/impl_16816.h +++ b/src/turbomind/kernels/attention/impl_16816.h @@ -63,26 +63,28 @@ struct Impl>, SmemLayoutV2>>; - using SmemLayoutK = std::conditional_t>, SmemLayoutV2>>; - using SmemLayoutV = std::conditional_t>, SmemLayoutV2>>; using SmemLayoutKVp = void; + static constexpr bool kUseSmemQ = false; + static constexpr bool kUseSmemP = false; + + static_assert(!kUseSmemQ, "current smemQ impl yields inconsistent outputs"); + union SharedStorage { __align__(16) T KV[Stages * (SmemLayoutK::kSize + SmemLayoutV::kSize) / 2]; __align__(16) T Q[SmemLayoutQ::kSize]; }; - static constexpr bool kUseSmemQ = false; - static constexpr bool kUseSmemP = false; - using ThreadMapQ = RakedThreadMap; using ThreadMapKV = RakedThreadMap; @@ -109,22 +111,24 @@ struct Impl sQ{smem_Q}; + SmemAccessor sQ{smem_Q}; - // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout - PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { + // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { - const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; - const int di = lane_id / 16 * 8 + k * 16; - ldsm_x4((Array&)frag_Q[k][m], 
cast_smem_ptr_to_uint(&sQ(qi, di))); + for (int m = 0; m < K_M; ++m) { + PRAGMA_UNROLL + for (int k = 0; k < K_K; ++k) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } } } - if constexpr (kUseSmemQ) { + if constexpr (0) { __syncthreads(); // Rearrange Q in smem so that swizzling is not needed for later LDSMs @@ -142,20 +146,25 @@ struct Impl smem_K; + T* smem_Q; FragQ frag_Q; FragK frag_K; __device__ StateQK(SharedStorage& storage, FragQ frag_Q_): smem_K{storage.KV} { - static_assert(!kUseSmemQ, "not implemented"); - PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { + if constexpr (!kUseSmemQ) { PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { - frag_Q[k][m] = frag_Q_[k][m]; + for (int k = 0; k < K_K; ++k) { + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + frag_Q[k][m] = frag_Q_[k][m]; + } } } + else { + smem_Q = storage.Q; + } } __device__ void Load(int k, int pipe_iter) @@ -166,6 +175,16 @@ struct Impl sQ{smem_Q}; + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } + } PRAGMA_UNROLL for (int n = 0; n < K_N; n += 2) { // Load (s16,d16) tiles const int s = n * 8 + offset_s; diff --git a/src/turbomind/kernels/attention/impl_81616.h b/src/turbomind/kernels/attention/impl_81616.h index 3b90bcdf57..f865f1bc3a 100644 --- a/src/turbomind/kernels/attention/impl_81616.h +++ b/src/turbomind/kernels/attention/impl_81616.h @@ -104,7 +104,7 @@ struct Impl) { - return std::conditional_t>, SmemLayoutV2>>{}; } diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index a886185a44..444b67e2c8 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -2,12 +2,16 @@ #pragma once -#include "src/turbomind/kernels/attention/impl.h" +#include +#include +#include + #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/layout.h" #include "src/turbomind/kernels/core/thread_map.h" -#include -#include + +#include "src/turbomind/kernels/attention/impl.h" +#include "src/turbomind/kernels/attention/quantization.h" namespace turbomind::attention { @@ -51,7 +55,7 @@ struct Impl), K_K); }; struct LinearD { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 20bb00fde8..f2e2faef91 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -277,11 +277,14 @@ void invokeProcessKV_v2(char** blocks, }; auto dispatch = [&](auto tkv) { - if (head_dim == 128) { + if (head_dim == 64) { + return invoke(tkv, std::integral_constant{}); + } + else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } - else if (head_dim == 64) { - return invoke(tkv, std::integral_constant{}); + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); } FT_CHECK(0); }; @@ -545,6 +548,9 @@ void invokeFlattenKV_v2(T* k, else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); + } FT_CHECK(0); }; diff --git a/src/turbomind/kernels/attention/mainloop_sm80.h b/src/turbomind/kernels/attention/mainloop_sm80.h index bf0fc1d32a..4435400b70 100644 --- 
a/src/turbomind/kernels/attention/mainloop_sm80.h +++ b/src/turbomind/kernels/attention/mainloop_sm80.h @@ -52,7 +52,7 @@ struct Mainloop, Impl_> { template __device__ void operator()(Args&&... args) { - Run(Sm80_CpAsync{}, ((Args &&) args)...); + Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args &&) args)...); } template @@ -81,8 +81,9 @@ struct Mainloop, Impl_> { } } - template + template __device__ void Run(Sm80_CpAsync, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -199,9 +200,10 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#if 0 + // #if 1 template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -234,7 +236,7 @@ struct Mainloop, Impl_> { Wait(); state_QK.Load(0, 0); - constexpr auto _ = [](int){}; + constexpr auto _ = [](int) {}; auto loop = [&](auto is_residue, auto is_mask) { const int offset_K = tile_iter * CTA_S; @@ -292,14 +294,15 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#elif 1 + // #elif 1 // Load : K0,K1 | V0,K2,V1,K3 ... // Compute : K0 | K1,V0,K2,V1 ... // - more register consumption // - more interleaved HMMA and FMA // - slight performance gain - template + template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter_, FragO& frag_O, @@ -407,7 +410,7 @@ struct Mainloop, Impl_> { __pipeline_commit(); __pipeline_wait_prior(0); } -#endif + // #endif __device__ void Wait() { diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index 12f6aff38b..c654f40d05 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -66,12 +66,14 @@ void invokeReduce(T* out, float exp_scale, \ cudaStream_t stream); -INSTANTIATE_invokeReduce(128, half); INSTANTIATE_invokeReduce(64, half); +INSTANTIATE_invokeReduce(128, half); +INSTANTIATE_invokeReduce(192, half); #if ENABLE_BF16 +INSTANTIATE_invokeReduce(64, nv_bfloat16); INSTANTIATE_invokeReduce(128, nv_bfloat16); -INSTANTIATE_invokeReduce(64, nv_bfloat16) +INSTANTIATE_invokeReduce(192, nv_bfloat16); #endif } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/reduce_kernel.h b/src/turbomind/kernels/attention/reduce_kernel.h index 88a3ab3af8..b4c9064cfe 100644 --- a/src/turbomind/kernels/attention/reduce_kernel.h +++ b/src/turbomind/kernels/attention/reduce_kernel.h @@ -128,9 +128,12 @@ struct Reduce { __syncthreads(); - constexpr int kVecSize = HeadDim / WARP_SIZE; + // HeadDim / WARP_SIZE + // 128 -> 4 + // 64, 192 -> 2 + constexpr int kVecSize = HeadDim % 128 == 0 ? 
4 : 2; - using Map = RakedThreadMap; + using Map = RakedThreadMap; static_assert(Map::kIterS == CTA_H); diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h index 8e09da22cd..db836ed184 100644 --- a/src/turbomind/kernels/attention/rotary_embedding.h +++ b/src/turbomind/kernels/attention/rotary_embedding.h @@ -131,6 +131,7 @@ struct FastRoPE { template __device__ void apply(Array& x, float timestep) { +#if 0 PRAGMA_UNROLL for (int i = 0; i < N; i += 2) { float c, s; @@ -144,6 +145,22 @@ struct FastRoPE { x[i + 1] = (T)tmp1; } } +#else + // Most models apply rotary embedding in half precision + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + float c, s; + sincosf(timestep * inv_freq_[i / 2], &s, &c); + s *= attention_scaling_; + c *= attention_scaling_; + T tmp0 = (T)c * x[i] - (T)s * x[i + 1]; + T tmp1 = (T)c * x[i + 1] + (T)s * x[i]; + if (is_valid_) { + x[i] = tmp0; + x[i + 1] = tmp1; + } + } +#endif } }; diff --git a/src/turbomind/kernels/attention/test_attention.cu b/src/turbomind/kernels/attention/test_attention.cu index c6d7b40637..804d4815dc 100644 --- a/src/turbomind/kernels/attention/test_attention.cu +++ b/src/turbomind/kernels/attention/test_attention.cu @@ -218,14 +218,14 @@ void TestBlocks(const thrust::universal_vector& k_cache, // [B, H, S, #define KV_INT4 0 -#define DECODING 1 +#define DECODING 0 template int test_attention() { AttentionParams params{}; - constexpr size_t kHeadDim = 128; + constexpr size_t kHeadDim = 192; #if DECODING // constexpr size_t kHeadNum = 32; @@ -239,11 +239,11 @@ int test_attention() // constexpr size_t kSequenceLen = 511; // constexpr size_t kSequenceLen = 2047; // constexpr size_t kSequenceLen = 4095; - // constexpr size_t kSequenceLen = 8191; + constexpr size_t kSequenceLen = 8191; // constexpr size_t kSequenceLen = 32767; // constexpr size_t kSequenceLen = 65535; // constexpr size_t kSequenceLen = 131071; - constexpr size_t kSequenceLen = 200000; + // constexpr size_t kSequenceLen = 200000; // constexpr size_t kSequenceLen = 262143; // constexpr size_t kSequenceLen = (1 << 20) - 1; // 1M // constexpr size_t kSequenceLen = (1 << 22) - 1; // 4M @@ -451,6 +451,10 @@ int test_attention() params.qk = qk_buf.data().get(); params.pr = pr_buf.data().get(); + params.attention_scaling = 1.f; + params.llama3_inv_scaling_factor = 0; + params.yarn_ramp_inv_factor_div_2 = 0; + Reference reference(kDump ? 
Reference::kUNFUSED : Reference::kFLASH_ATTENTION, {}); // Reference reference(Reference::kUNFUSED, {}); reference.Reshape(kInputLen, kContextLen, kHeadNum, kHeadDim, KvHeadNum, kBatchSize); diff --git a/src/turbomind/kernels/core/array_ops.h b/src/turbomind/kernels/core/array_ops.h index 6b639abc83..ec6e7fb4ed 100644 --- a/src/turbomind/kernels/core/array_ops.h +++ b/src/turbomind/kernels/core/array_ops.h @@ -172,7 +172,7 @@ inline __device__ void copy(const Array (&src)[M], Array (&dst)[M]) } template -inline __device__ void Store(T* __restrict__ dst, const Array& src) +inline __device__ void Store(T* dst, const Array& src) { if constexpr (sizeof(Array) == sizeof(uint4)) { *(uint4*)dst = (const uint4&)src; diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h index a708a34985..054269c27f 100644 --- a/src/turbomind/kernels/core/math.h +++ b/src/turbomind/kernels/core/math.h @@ -5,6 +5,7 @@ #include "src/turbomind/kernels/core/common.h" #include #include +#include namespace turbomind { @@ -41,6 +42,13 @@ TM_HOST_DEVICE constexpr T log2(T x) // static_assert(log2(32) == 5); // static_assert(log2(1) == 0); +template +TM_HOST_DEVICE constexpr T lowbit(T x) +{ + const std::make_signed_t s = x; + return static_cast(s & -s); +} + // https://arxiv.org/abs/1902.01961 template struct FastDivMod { diff --git a/src/turbomind/kernels/core/thread_map.h b/src/turbomind/kernels/core/thread_map.h index 66b691832f..1271aefcc0 100644 --- a/src/turbomind/kernels/core/thread_map.h +++ b/src/turbomind/kernels/core/thread_map.h @@ -3,6 +3,7 @@ #pragma once #include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" #include @@ -51,7 +52,7 @@ struct ThreadMapQ { } }; -template +template struct RakedThreadMap { static constexpr int kDimC = DimC; static constexpr int kDimS = DimS; diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt index d41c391e9d..81c9750584 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt +++ b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt @@ -8,9 +8,11 @@ add_library(${PROJECT_NAME} STATIC # flash_fwd_hdim64_fp16_sm80.cu flash_fwd_hdim128_fp16_sm80.cu flash_fwd_hdim128_bf16_sm80.cu - # flash_fwd_hdim256_fp16_sm80.cu + flash_fwd_hdim256_bf16_sm80.cu + flash_fwd_hdim256_fp16_sm80.cu ) target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR} / include) target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass) + set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h index e108a55f28..2456496367 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h @@ -147,7 +147,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream) }); } -#if 0 +#if 1 template void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream) { diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h index fd19a0ea61..b1df29cb7b 100644 --- 
a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h @@ -58,6 +58,18 @@ return __VA_ARGS__(); \ } \ }() +#elif 1 +#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ + [&] { \ + if (HEADDIM <= 128) { \ + constexpr static int kHeadDim = 128; \ + return __VA_ARGS__(); \ + } \ + else if (HEADDIM <= 256) { \ + constexpr static int kHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() #else #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ [&] { \ diff --git a/src/turbomind/kernels/gemm/context.h b/src/turbomind/kernels/gemm/context.h index 4fec5b732f..bd03917b89 100644 --- a/src/turbomind/kernels/gemm/context.h +++ b/src/turbomind/kernels/gemm/context.h @@ -113,12 +113,7 @@ class DynamicGemmContext: public StaticGemmContext { class MoeGemmContext: public Context { public: - MoeGemmContext(int experts, - int experts_per_token, - // int output_dims, - // int input_dims, - const cudaDeviceProp& prop, - cudaStream_t stream); + MoeGemmContext(int experts, int experts_per_token, const cudaDeviceProp& prop, cudaStream_t stream); ~MoeGemmContext() override; @@ -156,9 +151,11 @@ class MoeGemmContext: public Context { Tape Schedule(const LaunchSpec&) override; - void set_offsets(const int* offsets) + void update(int expert_num, int experts_per_token, const int* offsets) { - offsets_ = offsets; + expert_num_ = expert_num; + experts_per_token_ = experts_per_token; + offsets_ = offsets; } protected: diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index ed8b2ee2ff..e58bfc9b95 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -279,17 +279,44 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool for return {}; } -void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +namespace { + +template +struct Param { + StridedPtr data[N]; + StridedPtr* ptr; + int n; +}; + +template +__global__ void fill_strided_ptrs(Param param) { - std::vector tmp; - for (const auto& [p, s] : ptrs) { - tmp.push_back({p, s}); + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < param.n) { + param.ptr[idx] = param.data[idx]; } +} + +} // namespace + +void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +{ + constexpr int N = 64; + Param param{}; + static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 StridedPtr* ptr{}; cudaMallocAsync(&ptr, sizeof(StridedPtr) * ptrs.size(), stream); - cudaMemcpyAsync(ptr, tmp.data(), sizeof(StridedPtr) * ptrs.size(), cudaMemcpyDefault, stream); - // Sync before tmp can be destructed - cudaStreamSynchronize(stream); + param.ptr = ptr; + for (int i = 0; i < (int)ptrs.size(); i += N) { + const int n = std::min(ptrs.size() - i, N); + for (int j = 0; j < n; ++j) { + auto& [p, s] = ptrs[i + j]; + param.data[j] = StridedPtr{p, s}; + } + param.n = n; + fill_strided_ptrs<<<1, N, 0, stream>>>(param); + param.ptr += N; + } return ptr; } diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 5912c60a8a..a9e4f7da51 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -264,7 +264,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] int token_num_padded, int expert_num, int top_k, - bool norm_topk) + bool norm_topk, + float routed_scale) { constexpr int max_tiles = kMoeGateMaxTiles; constexpr int threads_per_token = max_expert_num 
/ items_per_thread; // 8 @@ -286,8 +287,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; - const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + // const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + // const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; float data[items_per_thread]; int idxs[items_per_thread]; @@ -413,7 +414,13 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] #endif - constexpr float kLog2e = 1.4426950408889634074; + // constexpr float kLog2e = 1.4426950408889634074; + // if (k == 0) { + // PRAGMA_UNROLL + // for (int i = 0; i < items_per_thread; ++i) { + // data[i] *= kLog2e; + // } + // } unsigned mask = (unsigned)-1; float max_logit; @@ -437,13 +444,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } - if (k == 0) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] *= kLog2e; - } - } - int g_max_ei = ei; float g_max_val = max_val; if constexpr (threads_per_token > 1) { @@ -486,7 +486,7 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < items_per_thread; ++i) { if (!norm_topk || used[i]) { - data[i] = exp2f(data[i] - max_logit); + data[i] = expf(data[i] - max_logit); sum_prob += data[i]; } } @@ -515,9 +515,11 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { - int e = (i + threadIdx.x) % max_expert_num; - int t = (i + threadIdx.x) / max_expert_num; - smem.shared_accum[t][e] = 0; + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + if (t < max_tiles) { + smem.shared_accum[t][e] = 0; + } } __syncthreads(); @@ -536,10 +538,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] if (ti2 < token_num && idx < top_k) { masks[expert_id * token_num_padded + ti2] = idx; - scales[idx * token_num + ti2] = scale; + scales[idx * token_num + ti2] = scale * routed_scale; atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); - - // printf("%d %d %f\n", idx, expert_id, scale); } } @@ -569,6 +569,7 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n int experts, // E int experts_per_token, bool norm_topk, + float routed_scale, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -581,14 +582,14 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread, auto vec_size) { constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; constexpr int threads = 256; const int blocks = ceil_div(tokens, threads / thrs_per_tok); cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - MoeGateKernel_v8 + MoeGateKernel_v8 <<>>( // scales, (int8_t*)masks, @@ -600,28 +601,49 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n tokens_padded, experts, experts_per_token, - norm_topk); + norm_topk, + routed_scale); }; auto fail = [&] { - std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << experts + << ", top_k=" << 
experts_per_token << "\n"; std::abort(); }; if (experts <= 8) { if (experts_per_token <= 2) { - invoke(_Int<8>, _Int<2>, _Int<8>); + // MoeGateKernel_V2<2, 128><<>>(scales, + // (int8_t*)masks, + // accum, + // logits, + // log_tile, + // tiles, + // tokens, + // tokens_padded, + // experts); + + // std::cout << tokens << " " << experts << " " << experts_per_token << " " << tokens_padded << "\n"; + invoke(_Int<8>, _Int<2>, _Int<8>, _Int<4>); } else { - invoke(_Int<8>, _Int<8>, _Int<8>); + invoke(_Int<8>, _Int<8>, _Int<8>, _Int<4>); } } else if (experts <= 64) { if (experts_per_token <= 4) { - invoke(_Int<64>, _Int<4>, _Int<16>); + invoke(_Int<64>, _Int<4>, _Int<16>, _Int<4>); } else if (experts_per_token <= 8) { - invoke(_Int<64>, _Int<8>, _Int<16>); + invoke(_Int<64>, _Int<8>, _Int<16>, _Int<4>); + } + else { + fail(); + } + } + else if (experts <= 160) { + if (experts_per_token <= 8) { + invoke(_Int<160>, _Int<8>, _Int<10>, _Int<2>); } else { fail(); @@ -687,7 +709,8 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] const int* en2f, // [ e, n] :: (e,n) -> e*n const float* dst_scales, // [n] int dims, - int tokens) + int tokens, + float dst_scale) { using Vec = Array; @@ -695,7 +718,6 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; - float dst_scale = 0; if (dst_scales) { dst_scale = dst_scales[ti]; dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); @@ -711,8 +733,9 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] } for (int i = threadIdx.x; i < dims; i += block_dim) { +#if 1 Array accum{}; - if (dst_scales) { + if (dst_scale) { Vec v; Ldg(v, dst_ptr[i].data()); using namespace ops; @@ -727,6 +750,24 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] accum = accum + x; } Store(dst_ptr[i].data(), cast(accum)); +#else + Array accum{}; + if (dst_scale) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = v * (T)dst_scale; + } + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + Vec v; + Ldg(v, src_ptr[e][i].data()); + using namespace ops; + const auto x = v * (T)scale[e]; + accum = accum + x; + } + Store(dst_ptr[i].data(), accum); +#endif } } @@ -739,6 +780,7 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st) { // std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -754,7 +796,8 @@ void invokeMoeReduce(T* dst, en2f, dst_scales, dims / vec_size, - tokens); + tokens, + dst_scale); }; switch (experts_per_token) { @@ -774,10 +817,11 @@ void invokeMoeReduce(T* dst, } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); -#ifdef ENABLE_BF16 template void -invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); +invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); +#ifdef ENABLE_BF16 +template void invokeMoeReduce( + nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) @@ -833,4 +877,89 @@ std::vector SampleBalanced(int token_num, int expert_num, int exp_per_tok, return ret; } +template +__global__ void MoeMaskTopKGroups(float* logits, int token_num, int expert_num, int top_k) +{ + constexpr int threads_per_token = max_expert_num / items_per_thread; + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + 
static_assert(items_per_thread % access_size == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + float max_val = -std::numeric_limits::infinity(); + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + PRAGMA_UNROLL + for (int c = 0; c < access_size; ++c) { + max_val = fmaxf(max_val, data[i + c]); + } + } + } + } + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + bool alive = false; + + for (int k = 0; k < top_k; ++k) { + int g_max_ei = ei; + float g_max_val = max_val; + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + if (ei == g_max_ei) { + alive = true; + max_val = -std::numeric_limits::infinity(); + } + } + + if (!alive && ti < token_num) { + Array vec; + fill(vec, -std::numeric_limits::infinity()); + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Store(&logits[ti * expert_num + e], vec); + } + } + } +} + +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st) +{ + auto invoke = [&](auto max_expert_num, auto items_per_thread, auto vec_size) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(token_num, threads / thrs_per_tok); + MoeMaskTopKGroups + <<>>(logits, token_num, expert_num, top_k); + }; + if (expert_num == 160 && group_size == 20) { + return invoke(_Int<160>, _Int<20>, _Int<4>); + } + + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << expert_num + << ", group_size=" << group_size << "\n"; + std::abort(); +} + } // namespace turbomind diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 0e4c36af09..d53de1354e 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -22,6 +22,7 @@ void invokeMoeGate_V2(int* f2n, int experts, int exp_per_tok, bool norm_topk, + float routed_scale, cudaStream_t st); template @@ -54,8 +55,11 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st); +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st); + // Sample `e` from `E` experts uniformly for every token std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g); diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index 47e3bfdb16..4b2ea6a83a 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -45,72 +45,6 @@ void diff_vecs(const T* data, const T* refs, int m, int k, 
std::string msg) } } -#if 0 -void func() -{ - using thrust::universal_vector; - - // clang-format off - std::vector h_logits{ - 8, 5, 1, 4, 3, 6, 2, 7, - 50, 60, 90, 20, 70, 71, 72, 73, - 0, 1, 0, 0, 0, 1, 0, 1, - 0, 0, 0, 1, 0, 0, 0, 2}; - // clang-format on - - h_logits.resize(8); - - // auto tmp = h_logits; - // for (int i = 0; i < 127; ++i) { - // h_logits.insert(h_logits.end(), tmp.begin(), tmp.end()); - // } - - universal_vector logits(h_logits.begin(), h_logits.end()); - - const int E = 8; - const int n = h_logits.size() / E; - const int e = 2; - - const int n_padded = (n + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; - - universal_vector f2n(e * n); - universal_vector en2f(e * n); - universal_vector offsets(E + 1); - universal_vector accum(E * kMoeGateMaxTiles); - universal_vector scales(n * e); - universal_vector masks(E * n_padded); - - for (int i = 0; i < 10; ++i) { - gemm::CacheFlushing::flush(0); - cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); - invokeMoeGate_V2(f2n.data().get(), - en2f.data().get(), - offsets.data().get(), - scales.data().get(), - masks.data().get(), - accum.data().get(), - logits.data().get(), - n, - n_padded, - E, - e, - 0); - } - - auto err = cudaDeviceSynchronize(); - if (err) { - std::cerr << cudaGetErrorString(err) << "\n"; - } - - print_vecs(scales.data().get(), e, n, "scales", 12); - print_vecs(masks.data().get(), E, n_padded, "tmp"); - print_vecs(accum.data().get(), E, 1, "accum"); - print_vecs(offsets.data().get(), 1, E + 1, "offsets"); - print_vecs(f2n.data().get(), n * e, 1, "f2n"); - print_vecs(en2f.data().get(), e, n, "en2f"); -} -#endif - RNG& gRNG() { static RNG inst{}; @@ -271,6 +205,8 @@ bool test_moe_gate(int tokens, // cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + // invokeMaskMoeTopKGroups(logits.data().get(), tokens, expert_num, expert_num / 8, 3, nullptr); + for (int i = 0; i < 1; ++i) { gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); @@ -286,8 +222,9 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, - true, - 0); + false, + 1.f, + nullptr); } // invokeMoeTiling(coords.data().get(), offsets.data().get(), expert_num, coords.size(), &tiling, 1, 0); @@ -334,6 +271,8 @@ bool test_moe_gate(int tokens, // success = false; } + // print_vecs(logits.data().get(), tokens, expert_num, "logits", 12); + if (!success && 1) { diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); @@ -353,6 +292,15 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); + for (int i = 0; i < tokens; ++i) { + float sum = 0; + for (int j = 0; j < experts_per_token; ++j) { + sum += scales[j * tokens + i]; + } + std::cout << sum << " "; + } + std::cout << "\n"; + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); @@ -393,7 +341,7 @@ int main() // test_moe_gate(32768, 64, 8, tape, tiling); // test_moe_gate(8, 60, 4, tape, tiling); - test_moe_gate(65536, 8, 2, tape, tiling); + test_moe_gate(16, 160, 6, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 7a089fbdf2..4747644f9a 100644 --- 
a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -357,7 +357,7 @@ class Testbed { } } - ((MoeGemmContext*)ctx_.get())->set_offsets(moe_m_offsets_.data().get()); + ((MoeGemmContext*)ctx_.get())->update(experts_, exp_per_tok_, moe_m_offsets_.data().get()); CHECK(batch_dim == 0); CHECK(a_desc_.order == kRowMajor); @@ -518,6 +518,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); invokeMoeReduce(c_ref_.data().get(), @@ -528,6 +529,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); cudaDeviceSynchronize(); diff --git a/src/turbomind/kernels/gemm/unpack.cu b/src/turbomind/kernels/gemm/unpack.cu index 92f468d82b..39e6a2e1aa 100644 --- a/src/turbomind/kernels/gemm/unpack.cu +++ b/src/turbomind/kernels/gemm/unpack.cu @@ -71,14 +71,44 @@ void unpack_awq_gemm(uint4_t* dst, const uint4_t* src, int rows, int cols, cudaS permute_u4<0, 1, 3, 2><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); } +__global__ void transpose_u4_kernel(uint4_t* dst, const uint4_t* src, int s, int c) +{ + const int idx_c = 8 * (threadIdx.x + blockIdx.x * blockDim.x); + const int idx_s = 8 * (threadIdx.y + blockIdx.y * blockDim.y); + if (idx_c >= c || idx_s >= s) { + return; + } + uint32_t ivec[8]; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ivec[i] = ((const uint32_t*)src)[((idx_s + i) * c + idx_c) / 8]; + } + uint32_t ovec[8]{}; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + PRAGMA_UNROLL + for (int j = 0; j < 8; ++j) { + ovec[i] |= (((ivec[j] >> (i * 4)) & 0xfu) << (j * 4)); + } + } + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ((uint32_t*)dst)[((idx_c + i) * s + idx_s) / 8] = ovec[i]; + } +} + void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st) { if (s % 8 || c % 8) { std::cerr << "transpose_u4: invalid shape (" << s << "," << c << "), must be multiple of 8" << std::endl; return; } - Array shape{s, c}; - permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + // Array shape{s, c}; + // permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + + const dim3 block(16, 16); + const dim3 grid((c + 15) / 16, (s + 15) / 16); + transpose_u4_kernel<<>>(dst, src, s, c); } // load -> unpack -> extend_to_u8 -> manipulation -> compat_to_u4 -> store diff --git a/src/turbomind/kernels/norm/CMakeLists.txt b/src/turbomind/kernels/norm/CMakeLists.txt new file mode 100644 index 0000000000..bc1569c405 --- /dev/null +++ b/src/turbomind/kernels/norm/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +add_library(rms_norm rms_norm.cu) +set_property(TARGET rms_norm PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET rms_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu new file mode 100644 index 0000000000..22fd69f52a --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -0,0 +1,235 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
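Note (not part of the original patch): the new rms_norm.cu introduced below fuses the row-wise RMS normalization y = x * rsqrt(mean(x^2) + eps) * w, plus a bias/residual variant that first computes r' = r + h (+ b) and then writes h' = rmsnorm(r') * w. A minimal host-side reference, offered here only as a sketch for sanity-checking the kernels (the helper names are illustrative and not part of TurboMind), could look like:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // y[i] = x[i] * rsqrt(mean(x^2) + eps) * w[i], mirroring RMSNormKernel's math
    // (eps is added after the division by dims, as in the device code).
    inline void rms_norm_ref(std::vector<float>&       y,
                             const std::vector<float>& x,
                             const std::vector<float>& w,
                             float                     eps)
    {
        double sum_sq = 0.0;
        for (float v : x) {
            sum_sq += double(v) * double(v);
        }
        const float scale = 1.f / std::sqrt(float(sum_sq / x.size()) + eps);
        for (std::size_t i = 0; i < x.size(); ++i) {
            y[i] = x[i] * scale * w[i];
        }
    }

    // r' = r + h (+ bias), h' = rms_norm(r') * w, mirroring BiasResidualRMSNormKernel:
    // the updated residual is written back and the normalized value replaces hidden.
    inline void bias_residual_rms_norm_ref(std::vector<float>&       residual,
                                           std::vector<float>&       hidden,
                                           const std::vector<float>& w,
                                           const std::vector<float>* bias,
                                           float                     eps)
    {
        for (std::size_t i = 0; i < residual.size(); ++i) {
            residual[i] += hidden[i] + (bias ? (*bias)[i] : 0.f);
        }
        rms_norm_ref(hidden, residual, w, eps);
    }

The CUDA kernels below do the same per-row reduction with a cub::BlockReduce and vectorized 16-byte loads; the sketch is only a scalar reference for comparing outputs.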
+ +#include "cub/block/block_reduce.cuh" + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" + +namespace turbomind { + +template +__global__ void RMSNormKernel(T* dst, + int dst_ld, + const T* src, + int src_ld, + const T* __restrict__ weights, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + src += src_ld * ti; + + Array accum{}; + Array vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Array tmp = cast(vec); + using namespace ops; + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + dst += dst_ld * ti; + + Array sv; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Ldg(sv, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + vec[c] = (T)((float)vec[c] * sum) * sv[c]; + // vec[c] = (T)((float)vec[c] * sum * (float)sv[c]); + } + Store(&dst[i], vec); + } +} + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + + constexpr int threads = 512; + const int blocks = num; + + RMSNormKernel<<>>(dst, // + dst_ld, + src, + src_ld, + weights, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeRMSNorm(half* dst, + int dst_ld, + const half* src, + int src_ld, + const half* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#if ENABLE_BF16 +template void invokeRMSNorm(nv_bfloat16* dst, + int dst_ld, + const nv_bfloat16* src, + int src_ld, + const nv_bfloat16* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +// r' <- r + (h + b) +// h' <- norm(r') * w +template +__global__ void BiasResidualRMSNormKernel(T* __restrict__ residual, + T* __restrict__ hidden_states, + const T* __restrict__ weights, + const T* __restrict__ bias, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + residual += dims * ti; + hidden_states += dims * ti; + + Array accum{}; + + Array r_vec; + Array h_vec; + Array b_vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Load(h_vec, &hidden_states[i]); + + using namespace ops; + r_vec = r_vec + h_vec; + + if (bias) { + Ldg(b_vec, &bias[i]); + r_vec = r_vec + b_vec; + } + + Store(&residual[i], r_vec); + + Array tmp = cast(r_vec); + + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + Array w_vec; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Ldg(w_vec, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < 
vec_size; ++c) { + r_vec[c] = (T)((float)r_vec[c] * sum) * w_vec[c]; + } + Store(&hidden_states[i], r_vec); + } +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + constexpr int threads = 512; + const int blocks = num; + + BiasResidualRMSNormKernel<<>>(residual, // + hidden_states, + weights, + bias, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeBiasResidualRMSNorm(half* residual, + half* hidden_states, + const half* weights, + const half* bias, + int dims, + int num, + float eps, + cudaStream_t st); + +#if ENABLE_BF16 +template void invokeBiasResidualRMSNorm(nv_bfloat16* residual, + nv_bfloat16* hidden_states, + const nv_bfloat16* weights, + const nv_bfloat16* bias, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +} // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h new file mode 100644 index 0000000000..83fa0f8263 --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -0,0 +1,21 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include + +namespace turbomind { + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); + +template +void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st); +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 285fcea31f..3c714bd234 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -20,11 +20,13 @@ add_library(Llama STATIC unified_attention_layer.cc llama_kernels.cu llama_decoder_kernels.cu - llama_utils.cu) + llama_utils.cu + mla_utils.cu) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart gemm2 + rms_norm cublasMMWrapper DynamicDecodeLayer activation_kernels diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 4138174e5d..ea321d06a0 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -20,6 +20,7 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" +#include "src/turbomind/utils/nccl_utils.h" #include #include #include @@ -1041,6 +1042,9 @@ LlamaBatch::LlamaBatch(const EngineParam& param, AllocateBuffer(max_batch_size_, session_len_, cache_block_seq_len); AllocatePersistantBuffer(max_batch_size_, cache_block_seq_len); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template @@ -1990,7 +1994,7 @@ void LlamaBatch::tune() nullptr, nullptr); // implicit barrier for TP - check_cuda_error(cudaStreamSynchronize(stream_)); + ftNcclStreamSynchronize(model_->tensor_para_, {}, stream_); } auto tock = std::chrono::steady_clock::now(); diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 
f6f9ab0efa..0a2a3be175 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -52,28 +52,21 @@ static bool is_fuse_silu_act() } template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - head_num_(head_num), - kv_head_num_(kv_head_num), - size_per_head_(size_per_head), - hidden_units_(hidden_units), - inter_size_(inter_size), - weight_type_(weight_type), - attn_bias_(attn_bias), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank): + head_num_(model.head_num), + kv_head_num_(model.kv_head_num), + size_per_head_(model.head_dim), + hidden_units_(model.hidden_units), + inter_size_(model.inter_size.at(layer_id)), + weight_type_(model.weight_type), + attn_bias_(model.attn_bias), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) { if (lora_param.policy == LoraPolicy::kPlora) { std::vector keys = { @@ -88,7 +81,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, auto& weight = *weights[i]; int rank = lora_param.r; float scale = lora_param.scale; - std::string full_name = "layers." + std::to_string(layer_idx) + "." + name; + std::string full_name = "layers." + std::to_string(layer_id) + "." + name; for (const auto& [re, pr] : lora_param.rank_pattern) { if (std::regex_search(full_name, pr.first)) { @@ -114,36 +107,44 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - self_attn_weights.qkv.input_dims = hidden_units_; - self_attn_weights.qkv.output_dims = (head_num + 2 * kv_head_num) * size_per_head / tensor_para_size_; - self_attn_weights.qkv.type = weight_type; - self_attn_weights.qkv.group_size = group_size; - - self_attn_weights.output.input_dims = (head_num * size_per_head) / tensor_para_size_; - self_attn_weights.output.output_dims = hidden_units_; - self_attn_weights.output.type = weight_type; - self_attn_weights.output.group_size = group_size; + self_attn_weights = LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + model.mla, + attn_bias_, + tensor_para_size_, + weight_type_, + model.group_size}; ffn_weights = LlamaFfnWeight{ hidden_units_, inter_size_, tensor_para_size_, weight_type_, - group_size, + model.group_size, weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), }; - moe_weights = MoeFfnWeight{hidden_units_, - moe_param.inter_size, - moe_param.expert_num, - moe_param.method, - moe_param.shared_gate, - tensor_para_size_, - weight_type, - group_size, - is_fuse_silu_act()}; - - mallocWeights(); + moe_weights = MoeFfnWeight{ + layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; +} + +template +void LlamaDecoderLayerWeight::malloc(cudaStream_t st) +{ + deviceMalloc((T**)&self_attn_norm_weights, hidden_units_, st); + deviceMalloc((T**)&ffn_norm_weights, hidden_units_, st); + + self_attn_weights.malloc(st); + + if (inter_size_) { + ffn_weights.malloc(st); + } + + if (!moe_weights.experts.empty()) { + 
moe_weights.malloc(st); + } } template @@ -168,52 +169,6 @@ size_t LlamaDecoderLayerWeight::workspace_size() const noexcept return size * sizeof(uint16_t); } -template -void freeWeights(LlamaDenseWeight& weights) -{ - cudaFree(weights.kernel); - cudaFree(weights.bias); - cudaFree(weights.scales); - cudaFree(weights.zeros); - - weights.kernel = nullptr; - weights.bias = nullptr; - weights.scales = nullptr; - weights.zeros = nullptr; - - { - cudaFree(weights.lora.a); - cudaFree(weights.lora.b); - weights.lora.a = nullptr; - weights.lora.b = nullptr; - } -} - -template -void LlamaDecoderLayerWeight::mallocWeights(LlamaDenseWeight& weights, bool bias) -{ - if (bias) { - deviceMalloc((T**)&weights.bias, weights.output_dims); - } - const size_t bit_size = getBitSize(weights.type); - if (bit_size >= 16) { // fp16, fp32 - deviceMalloc((T**)&weights.kernel, weights.input_dims * weights.output_dims); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - FT_CHECK(weights.input_dims % factor == 0); - deviceMalloc((int**)&weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMemSetZero((int*)weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMalloc((T**)&weights.scales, weights.input_dims / weights.group_size * weights.output_dims); - deviceMalloc((T**)&weights.zeros, weights.input_dims / weights.group_size * weights.output_dims); - } - - if (weights.lora.r > 0) { - deviceMalloc((T**)&weights.lora.a, weights.input_dims * weights.lora.r); - deviceMalloc((T**)&weights.lora.b, weights.lora.r * weights.output_dims); - } -} - template std::string concat(FirstArg&& first, Args&&... args) { @@ -342,64 +297,24 @@ void loadWeights(LlamaDenseWeight& w, std::string prefix, FtCudaDataType mode } template -void LlamaDecoderLayerWeight::mallocWeights() +void LlamaDecoderLayerWeight::free(cudaStream_t st) { - deviceMalloc((T**)&self_attn_norm_weights, hidden_units_); - deviceMalloc((T**)&ffn_norm_weights, hidden_units_); + deviceFree(self_attn_norm_weights, st); + deviceFree(ffn_norm_weights, st); - mallocWeights(self_attn_weights.qkv, attn_bias_); - mallocWeights(self_attn_weights.output, attn_bias_); + self_attn_weights.free(st); if (inter_size_) { - mallocWeights(ffn_weights.gating, false); - mallocWeights(ffn_weights.intermediate, false); - mallocWeights(ffn_weights.output, false); + ffn_weights.free(st); } if (!moe_weights.experts.empty()) { - mallocWeights(moe_weights.gate, false); - for (auto& e : moe_weights.experts) { - mallocWeights(e.gating, false); - mallocWeights(e.intermediate, false); - mallocWeights(e.output, false); - } - if (moe_weights.shared_gate.output_dims) { - mallocWeights(moe_weights.shared_gate, false); - } + moe_weights.free(st); } } template -LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() -{ - cudaFree((void*)self_attn_norm_weights); - cudaFree((void*)ffn_norm_weights); - self_attn_norm_weights = nullptr; - ffn_norm_weights = nullptr; - - freeWeights(self_attn_weights.qkv); - freeWeights(self_attn_weights.output); - - if (inter_size_) { - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); - } - - if (!moe_weights.experts.empty()) { - freeWeights(moe_weights.gate); - for (auto& e : moe_weights.experts) { - freeWeights(e.fused_gating_intermediate); - freeWeights(e.gating); - freeWeights(e.intermediate); - freeWeights(e.output); - } - if (moe_weights.shared_gate.kernel) { - 
freeWeights(moe_weights.shared_gate); - } - } -} +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; template void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) @@ -432,6 +347,24 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType } } +template +void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m, int tp_rank) +{ + if (w.q_proj.output_dims) { + getWeightTensor(w.q_proj, false, concat(p, "attention.q_proj", tp_rank), m); + } + else { + getWeightTensor(w.q_a_proj, false, concat(p, "attention.q_a_proj"), m); + getWeightTensor(w.q_b_proj, false, concat(p, "attention.q_b_proj", tp_rank), m); + m.insert(concat(p, "attention.q_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.q_b_proj.input_dims}, w.q_a_layernorm}); + } + getWeightTensor(w.kv_a_proj, false, concat(p, "attention.kv_a_proj"), m); + getWeightTensor(w.kv_b_proj, false, concat(p, "attention.kv_b_proj", tp_rank), m); + m.insert(concat(p, "attention.kv_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); +} + template TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) { @@ -445,7 +378,12 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) auto get_prefix = [=](std::string_view name) { return concat(prefix, name, tensor_para_rank_); }; - getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + if (self_attn_weights.qkv.output_dims) { + getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + } + else { + getMLATensor(self_attn_weights, prefix, output, tensor_para_rank_); + } getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); if (inter_size_) { @@ -478,7 +416,8 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) } // template -static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void convert_u4( + LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { FT_CHECK(weight.type == WeightType::kINT4); @@ -488,11 +427,11 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* get_weight_and_scales_layout(gemm::DataType::U4, is_fused_moe, getSMVersion(), use_simt); if (order_b == kColMajor) { - transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims); - cudaMemcpy(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault); + transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims, st); + cudaMemcpyAsync(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault, st); } - extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims); + extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims, st); sync_check_cuda_error(); MatrixLayout w_desc{ @@ -507,25 +446,22 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* k_desc.type = gemm::DataType::U4; k_desc.pack = pack_b; - cudaMemset(weight.kernel, 0, weight.input_dims * weight.output_dims / 2); + cudaMemsetAsync(weight.kernel, 0, weight.input_dims * weight.output_dims / 2, st); - FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, 0) == 0); + 
FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, st) == 0); sync_check_cuda_error(); const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims; // std::cout << "fuse_scales_and_zeros\n"; - fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count); + fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count, st); // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2); sync_check_cuda_error(); - cudaDeviceSynchronize(); - - cudaFree(weight.scales); - cudaFree(weight.zeros); - weight.scales = weight.zeros = nullptr; + deviceFree(weight.scales, st); + deviceFree(weight.zeros, st); - deviceMalloc((half**)&weight.scales_zeros, scale_count * 2); + deviceMalloc((half**)&weight.scales_zeros, scale_count * 2, st); MatrixLayout s_desc{ gemm::DataType::U32, @@ -538,7 +474,7 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* MatrixLayout q_desc = s_desc; q_desc.pack = pack_v; - FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, 0) == 0); + FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, st) == 0); sync_check_cuda_error(); weight.k_desc = k_desc; @@ -548,7 +484,8 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* } template -static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void +convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { using namespace gemm; @@ -563,12 +500,13 @@ static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* wor const int output_dim = weight.output_dims; if (order_b == kColMajor) { - invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, nullptr); + invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, st); sync_check_cuda_error(); // FT_CHECK(0); } else { - check_cuda_error(cudaMemcpy(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + check_cuda_error( + cudaMemcpyAsync(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); } MatrixLayout src{ @@ -583,35 +521,42 @@ static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* wor dst.pack = pack_b; if (pack_b) { - FT_CHECK(Convert(workspace, src, weight.kernel, dst, nullptr) == 0); + FT_CHECK(Convert(workspace, src, weight.kernel, dst, st) == 0); sync_check_cuda_error(); // FT_CHECK(0); } else { - check_cuda_error(cudaMemcpy(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + check_cuda_error( + cudaMemcpyAsync(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); } weight.k_desc = dst; } template -static void convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void +convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { if (weight.type == WeightType::kINT4) { if constexpr (std::is_same_v) { - convert_u4(weight, is_fused_moe, workspace, size, use_simt); + convert_u4(weight, is_fused_moe, workspace, size, use_simt, st); } else { FT_CHECK(0); } } else { - convert_fp(weight, is_fused_moe, workspace, size, use_simt); + convert_fp(weight, is_fused_moe, workspace, size, use_simt, st); } } template -void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, 
void* workspace, size_t size) +void interleave(LlamaDenseWeight& c, + LlamaDenseWeight& a, + LlamaDenseWeight& b, + void* workspace, + size_t size, + cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -628,18 +573,18 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight const auto sentinel = tmp_c + c.output_dims * c.input_dims; FT_CHECK(sentinel <= (uint8_t*)workspace + size); - extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims); - extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims); + extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims, st); + extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims, st); - interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, 0); + interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, st); - compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims); + compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims, st); - interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, 0); - interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, 0); + interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, st); + interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, st); } else { - interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, 0); + interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, st); } // Check at function level @@ -647,7 +592,7 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight } template -void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t) +void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t, cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -656,9 +601,11 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& FT_CHECK(c.group_size == a.group_size); FT_CHECK(c.group_size == b.group_size); - auto _chunks = [](auto c, auto a, auto b, int height, int width) { - check_cuda_error(cudaMemcpy2D((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault)); - check_cuda_error(cudaMemcpy2D((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault)); + auto _chunks = [&](auto c, auto a, auto b, int height, int width) { + check_cuda_error( + cudaMemcpy2DAsync((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault, st)); + check_cuda_error( + cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); }; if (c.type == WeightType::kINT4) { @@ -675,37 +622,37 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& } template -void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop) +void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st) { const bool is_16xx = is_16xx_series(prop.name); - convert(self_attn_weights.qkv, false, workspace, size, is_16xx); - convert(self_attn_weights.output, false, workspace, size, is_16xx); + convert(self_attn_weights.qkv, false, workspace, size, is_16xx, st); + 
convert(self_attn_weights.output, false, workspace, size, is_16xx, st); auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) { if (fused_up_and_gate_) { auto& fused_up_and_gate = ffn.fused_gating_intermediate; - mallocWeights(fused_up_and_gate, false); + fused_up_and_gate.malloc(st); if (ffn.is_fused_silu) { - interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } else { - chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } - convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx, st); - freeWeights(ffn.gating); - freeWeights(ffn.intermediate); + ffn.gating.free(st); + ffn.intermediate.free(st); } else { - convert(ffn.gating, is_fused_moe, workspace, size, is_16xx); - convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.gating, is_fused_moe, workspace, size, is_16xx, st); + convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx, st); } - convert(ffn.output, is_fused_moe, workspace, size, is_16xx); + convert(ffn.output, is_fused_moe, workspace, size, is_16xx, st); }; if (inter_size_) { @@ -722,7 +669,7 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud for (auto& e : moe_weights.experts) { - process_ffn(e, moe_weights.method); + process_ffn(e, moe_weights.method == MoeParam::kFused); const auto& fused = e.fused_gating_intermediate; const auto& output = e.output; @@ -743,12 +690,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud auto& output = moe_weights.block.output; // TODO: free these ptrs - fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, nullptr); - output.kernel = gemm::make_blocked_ptrs(output_ptrs, nullptr); + fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, st); + output.kernel = gemm::make_blocked_ptrs(output_ptrs, st); if (!fused_param_ptrs.empty()) { - fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, nullptr); - output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, nullptr); + fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, st); + output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, st); } fused.k_desc.ld = output.k_desc.ld = 0; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index f68a103dd5..9b204ed0dc 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -30,19 +30,14 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = delete; - LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + + LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); + ~LlamaDecoderLayerWeight(); LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete; @@ -51,17 +46,21 @@ struct 
LlamaDecoderLayerWeight { TensorMap getParams(std::string prefix); - void prepare(void* workspace, size_t size, const cudaDeviceProp& prop); + void prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st); size_t workspace_size() const noexcept; - void mallocWeights(LlamaDenseWeight& weights, bool bias); + void malloc(cudaStream_t st); + + void free(cudaStream_t st); + + T* self_attn_norm_weights{}; + T* ffn_norm_weights{}; - T* self_attn_norm_weights{}; - T* ffn_norm_weights{}; LlamaAttentionWeight self_attn_weights{}; - LlamaFfnWeight ffn_weights{}; - MoeFfnWeight moe_weights{}; + + LlamaFfnWeight ffn_weights{}; + MoeFfnWeight moe_weights{}; private: size_t head_num_; @@ -76,8 +75,6 @@ struct LlamaDecoderLayerWeight { size_t tensor_para_rank_; bool is_maintain_buffer_ = false; bool fused_up_and_gate_; - - void mallocWeights(); }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 169fb53bcf..944781bf5d 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -20,64 +20,14 @@ #pragma once #include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/llama/weight_type.h" #include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { -enum class WeightType : int -{ - kFP32, - kFP16, - kFP8, // not supported yet - kBF16, - kINT8, - kINT4 -}; - -template -constexpr WeightType get_default_weight_type() -{ - if constexpr (std::is_same_v) { - return WeightType::kFP16; - } - else if constexpr (std::is_same_v) { - return WeightType::kBF16; - } - else if constexpr (std::is_same_v) { - return WeightType::kFP32; - } - else { - static_assert(sizeof(T) != sizeof(T), "not implemented"); - return {}; - } -} - -inline size_t getBitSize(WeightType type) -{ - switch (type) { - case WeightType::kFP32: - return 32; - case WeightType::kFP16: - return 16; - case WeightType::kFP8: - return 8; - case WeightType::kBF16: - return 16; - case WeightType::kINT8: - return 8; - case WeightType::kINT4: - return 4; - } - return 0; -} - -enum class LoraPolicy : int -{ - kNull, - kPlora, -}; - inline LoraPolicy getLoraPolicy(const std::string& policy) { if (policy == "plora") { @@ -96,20 +46,31 @@ struct LoraWeight { template struct LlamaDenseWeight { - size_t input_dims; - size_t output_dims; - void* kernel; + size_t input_dims = 0; + size_t output_dims = 0; + WeightType type; // uninitialized + void* kernel = nullptr; + T* bias = nullptr; + T* scales = nullptr; + T* zeros = nullptr; + T* scales_zeros = nullptr; + int group_size = 1; + LoraWeight lora; - WeightType type; - T* bias; - T* scales; - T* zeros; - T* scales_zeros; - int group_size; gemm::MatrixLayout k_desc; gemm::MatrixLayout q_desc; + LlamaDenseWeight(): type{}, lora{}, k_desc{}, q_desc{} {} + + LlamaDenseWeight(size_t input_dim, size_t output_dim, WeightType type, int group_size): LlamaDenseWeight{} + { + this->input_dims = input_dim; + this->output_dims = output_dim; + this->type = type; + this->group_size = group_size; + } + size_t kernel_size() const noexcept { return getBitSize(type) * input_dims * output_dims / 8; @@ -129,12 +90,121 @@ struct LlamaDenseWeight { { return {sizeof(T) * input_dims * lora.r, sizeof(T) * lora.r * output_dims}; } + + void malloc(cudaStream_t st, bool with_bias = false) + { + if (with_bias) { + deviceMalloc((T**)&bias, output_dims, 
st); + } + const size_t bit_size = getBitSize(type); + if (bit_size >= 16) { // fp16, fp32 + deviceMalloc((T**)&kernel, input_dims * output_dims, st); + } + else { // int8, int4 + const int factor = sizeof(float) * 8 / bit_size; + FT_CHECK(input_dims % factor == 0); + deviceMalloc((int**)&kernel, input_dims * output_dims / factor, st); + deviceMalloc((T**)&scales, input_dims / group_size * output_dims, st); + deviceMalloc((T**)&zeros, input_dims / group_size * output_dims, st); + } + + if (lora.r > 0) { + deviceMalloc((T**)&lora.a, input_dims * lora.r, st); + deviceMalloc((T**)&lora.b, lora.r * output_dims, st); + } + } + + void free(cudaStream_t st) + { + deviceFree(kernel, st); + deviceFree(bias, st); + deviceFree(scales, st); + deviceFree(zeros, st); + deviceFree(lora.a, st); + deviceFree(lora.b, st); + } }; template struct LlamaAttentionWeight { + + LlamaAttentionWeight() = default; + + LlamaAttentionWeight(size_t hidden_dim, + size_t head_dim, + size_t head_num, + size_t kv_head_num, + MLAParam mla, + bool bias, + size_t tp, + WeightType weight_type, + int group_size) + { + this->bias = bias; + if (mla.kv_lora_rank == 0) { + qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size}; + } + else { + const int qk_nope_dim = head_dim - mla.qk_rope_dim; + if (mla.q_lora_rank) { + q_a_proj = {hidden_dim, mla.q_lora_rank, weight_type, group_size}; + q_b_proj = {mla.q_lora_rank, head_num * head_dim / tp, weight_type, group_size}; + } + else { + q_proj = {hidden_dim, head_num * head_dim / tp, weight_type, group_size}; + } + kv_a_proj = {hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, weight_type, group_size}; + kv_b_proj = {mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp, weight_type, group_size}; + } + output = {(head_num * head_dim) / tp, hidden_dim, weight_type, group_size}; + } + + void malloc(cudaStream_t st) + { + if (qkv.output_dims) { + qkv.malloc(st, bias); + } + else { + if (q_proj.output_dims) { + q_proj.malloc(st); + } + else { + q_a_proj.malloc(st); + q_b_proj.malloc(st); + deviceMalloc((T**)&q_a_layernorm, q_b_proj.input_dims, st); + } + kv_a_proj.malloc(st); + kv_b_proj.malloc(st); + deviceMalloc((T**)&kv_a_layernorm, kv_b_proj.input_dims, st); + } + output.malloc(st, bias); + } + + void free(cudaStream_t st) + { + qkv.free(st); + q_proj.free(st); + q_a_proj.free(st); + q_b_proj.free(st); + kv_a_proj.free(st); + kv_b_proj.free(st); + output.free(st); + deviceFree(q_a_layernorm, st); + deviceFree(kv_a_layernorm, st); + } + LlamaDenseWeight qkv; LlamaDenseWeight output; + bool bias{}; + + LlamaDenseWeight q_proj; + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + + T* q_a_layernorm{}; + T* kv_a_layernorm{}; }; template @@ -172,6 +242,21 @@ struct LlamaFfnWeight { output.group_size = group_size; } + void malloc(cudaStream_t st) + { + gating.malloc(st); + intermediate.malloc(st); + output.malloc(st); + } + + void free(cudaStream_t st) + { + gating.free(st); + intermediate.free(st); + output.free(st); + fused_gating_intermediate.free(st); + } + LlamaDenseWeight gating; LlamaDenseWeight intermediate; LlamaDenseWeight output; @@ -186,23 +271,27 @@ struct MoeFfnWeight { MoeFfnWeight() = default; - MoeFfnWeight(size_t hidden_dim, - int inter_size, - int expert_num, - int method, - bool has_shared_gate, - size_t tp, - WeightType weight_type, - int group_size, - bool fuse_silu_act) + MoeFfnWeight(int layer_id, + const MoeParam& param, + size_t hidden_dim, + WeightType 
weight_type, + int group_size, + size_t tp, + bool fuse_silu_act) { - // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (param.expert_num.size() <= layer_id) { + return; + } + + const int expert_num = param.expert_num[layer_id]; if (expert_num == 0) { return; } + // printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); + gate.input_dims = hidden_dim; gate.output_dims = expert_num; gate.type = get_default_weight_type(); @@ -210,15 +299,15 @@ struct MoeFfnWeight { experts.resize(expert_num); - this->method = method; - fuse_silu_act = fuse_silu_act && method; + method = param.method; + fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; for (auto& e : experts) { // inter size is divided by tp in `FfnWeight` - e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; + e = LlamaFfnWeight{hidden_dim, (size_t)param.inter_size, tp, weight_type, group_size, fuse_silu_act}; } - if (has_shared_gate) { + if (param.shared_gate) { shared_gate.input_dims = hidden_dim; shared_gate.output_dims = 1; shared_gate.type = get_default_weight_type(); @@ -229,14 +318,36 @@ struct MoeFfnWeight { } } + void malloc(cudaStream_t st) + { + gate.malloc(st); + if (shared_gate.output_dims) { + shared_gate.malloc(st); + } + for (auto& e : experts) { + e.malloc(st); + } + } + + void free(cudaStream_t st) + { + gate.free(st); + shared_gate.free(st); + for (auto& e : experts) { + e.free(st); + } + block.free(st); + } + LlamaDenseWeight gate; std::vector> experts; LlamaDenseWeight shared_gate; + // reference into `experts` LlamaFfnWeight block; - int method{}; + MoeParam::Method method{}; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 8cce207203..907467341a 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -27,21 +27,20 @@ namespace turbomind { template -void LlamaFfnLayer::allocateBuffer(size_t token_num, - int inter_size, - const LlamaDenseWeight* gating, - const LlamaDenseWeight* inter) +void LlamaFfnLayer::allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) { const size_t sz = token_num * inter_size; - const size_t sz_gate = token_num * gating->lora.r; - const size_t sz_inter = token_num * inter->lora.r; + const size_t sz_gate = token_num * gating_lora_r; + const size_t sz_inter = token_num * inter_lora_r; - gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * 2 + sz_gate + sz_inter), false); - inter_buf_ = gating_buf_ + sz; + gating_buf_ = + (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * inter_buf_factor + sz_gate + sz_inter), false); + inter_buf_ = gating_buf_ + sz; // gate & inter is not fused when lora is enabled - if (gating->lora.r) { + if (gating_lora_r) { inter_buf_ += sz_gate; } @@ -93,12 +92,16 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const int layer_id = input_tensors->getVal("layer_id"); const int inter_size = weights->inter_size; - allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); + const bool is_fused_silu = weights->fused_gating_intermediate.kernel && weights->is_fused_silu; + + allocateBuffer(token_num, inter_size, is_fused_silu ? 
1 : 2, weights->gating.lora.r, weights->intermediate.lora.r); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); int* lora_mask = input_tensors->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + const bool all_reduce = input_tensors->getVal("all_reduce", false); + if (weights->fused_gating_intermediate.kernel) { NvtxScope scope("fused_silu_ffn"); @@ -145,7 +148,8 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); - if (all_reduce_ && tensor_para_.world_size_ > 1) { + if (all_reduce && tensor_para_.world_size_ > 1) { + // std::cout << "ffn all reduce " << layer_id << "\n"; NcclGuard nccl_guard(tensor_para_, stream_); ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 2daca2cc95..a72a24701e 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -30,13 +30,12 @@ namespace turbomind { template class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): + LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx): hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()), - all_reduce_(all_reduce) + allocator_(ctx.allocator.get()) { } @@ -48,7 +47,8 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r); void freeBuffer(); @@ -59,7 +59,6 @@ class LlamaFfnLayer { cudaStream_t const stream_; LlamaLinear* const linear_; IAllocator* const allocator_; - const bool all_reduce_; bool is_free_buffer_after_forward_{}; T* gating_buf_{}; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 3d50910ad4..05b22deed5 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -72,7 +72,6 @@ LlamaV2::LlamaV2(const ModelParam& model, lora_param_(lora), head_num_(model.head_num), size_per_head_(model.head_dim), - inter_size_(model.inter_size), hidden_units_(model.hidden_units), layer_num_(model.layer_num), vocab_size_(model.vocab_size), diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 6321d09d7c..658282f5e5 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -113,7 +113,6 @@ class LlamaV2 { const size_t head_num_; const size_t size_per_head_; const size_t hidden_units_; - const size_t inter_size_; const size_t layer_num_; const size_t vocab_size_; const size_t vocab_size_padded_; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 9d62042d62..bcee150977 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -20,36 +20,24 @@ #include "src/turbomind/models/llama/LlamaWeight.h" #include 
"src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { template -LlamaWeight::LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t embedding_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - hidden_units_(hidden_units), - inter_size_(inter_size), - vocab_size_(vocab_size), - vocab_size_padded_(vocab_size), - embedding_size_(embedding_size), - num_layer_(num_layer), - weight_type_(weight_type), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LlamaWeight::LlamaWeight( + const ModelParam& model, const LoraParam& lora_param, const MoeParam& moe_param, size_t tp_size, size_t tp_rank): + hidden_units_(model.hidden_units), + inter_size_(model.inter_size), + vocab_size_(model.vocab_size), + vocab_size_padded_(model.vocab_size), + embedding_size_(model.embedding_size), + num_layer_(model.layer_num), + weight_type_(model.weight_type), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) { if (vocab_size_padded_ % tensor_para_size_ != 0) { vocab_size_padded_ = (vocab_size_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; @@ -61,49 +49,42 @@ LlamaWeight::LlamaWeight(size_t head_num, } FT_CHECK(hidden_units_ % tensor_para_size_ == 0); + check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + decoder_layer_weights.reserve(num_layer_); for (unsigned l = 0; l < num_layer_; ++l) { - decoder_layer_weights.push_back(new LlamaDecoderLayerWeight(l, - head_num, - kv_head_num, - size_per_head, - hidden_units_, - inter_size_, - weight_type_, - group_size, - lora_param, - attn_bias, - moe_param, - tensor_para_size_, - tensor_para_rank_)); + decoder_layer_weights.emplace_back( + new LlamaDecoderLayerWeight(l, model, lora_param, moe_param, tp_size, tp_rank)); + decoder_layer_weights.back()->malloc(stream_); } - mallocWeights(); + FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); + deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_, stream_); + deviceMalloc((T**)&output_norm_weight, hidden_units_, stream_); + deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_, stream_); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template LlamaWeight::~LlamaWeight() { - cudaFree((void*)pre_decoder_embedding_table); - cudaFree((void*)output_norm_weight); - cudaFree((void*)post_decoder_embedding_kernel); - - pre_decoder_embedding_table = nullptr; - output_norm_weight = nullptr; - post_decoder_embedding_kernel = nullptr; + deviceFree(pre_decoder_embedding_table, stream_); + deviceFree(output_norm_weight, stream_); + deviceFree(post_decoder_embedding_kernel, stream_); for (auto& p : decoder_layer_weights) { + p->free(stream_); delete p; } -} -template -void LlamaWeight::mallocWeights() -{ - FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_); - deviceMalloc((T**)&output_norm_weight, hidden_units_); - deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_); + decoder_layer_weights.clear(); + + // Wait for deallocations + 
check_cuda_error(cudaStreamSynchronize(stream_)); + check_cuda_error(cudaStreamDestroy(stream_)); + stream_ = {}; } template @@ -179,13 +160,19 @@ void LlamaWeight::prepare(const cudaDeviceProp& prop) TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d\n", workspace_size); + // Wait for the weights to be filled externally + check_cuda_error(cudaDeviceSynchronize()); + if (workspace_size) { - deviceMalloc((char**)&workspace, workspace_size); + deviceMalloc((char**)&workspace, workspace_size, stream_); } for (auto& layer : decoder_layer_weights) { - layer->prepare(workspace, workspace_size, prop); + layer->prepare(workspace, workspace_size, prop, stream_); } - deviceFree(workspace); + + deviceFree(workspace, stream_); + + check_cuda_error(cudaStreamSynchronize(stream_)); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index c30e753565..629cd56120 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -22,28 +22,18 @@ #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/memory_utils.h" namespace turbomind { template struct LlamaWeight { LlamaWeight() = default; - LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t embedding_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + + LlamaWeight(const ModelParam& model_param, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); ~LlamaWeight(); @@ -57,15 +47,13 @@ struct LlamaWeight { void prepare(const cudaDeviceProp& prop); std::vector*> decoder_layer_weights; - const T* pre_decoder_embedding_table{}; - const T* output_norm_weight{}; - const T* post_decoder_embedding_kernel{}; -private: - void mallocWeights(); + T* pre_decoder_embedding_table{}; + T* output_norm_weight{}; + T* post_decoder_embedding_kernel{}; +private: size_t hidden_units_; - size_t inter_size_; size_t vocab_size_; size_t vocab_size_padded_; size_t embedding_size_; @@ -73,6 +61,10 @@ struct LlamaWeight { WeightType weight_type_; size_t tensor_para_size_; size_t tensor_para_rank_; + + std::vector inter_size_; + + cudaStream_t stream_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_gemm.cc b/src/turbomind/models/llama/llama_gemm.cc index 62952cd715..f9a0191e4b 100644 --- a/src/turbomind/models/llama/llama_gemm.cc +++ b/src/turbomind/models/llama/llama_gemm.cc @@ -84,7 +84,7 @@ int main(int argc, char* argv[]) return -1; } else { - ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, false); + ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, nullptr, false); } if (0) {} diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h index 3b01dee60d..aaade1a513 100644 --- a/src/turbomind/models/llama/llama_kernels.h +++ b/src/turbomind/models/llama/llama_kernels.h @@ -154,7 +154,7 @@ template struct TempBuffer { TempBuffer(size_t size) { - deviceMalloc(&data, size, false); + cudaMalloc(&data, size); } T* data; }; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index e6b9d690ae..0a505b11a9 100644 --- a/src/turbomind/models/llama/llama_params.h +++ 
b/src/turbomind/models/llama/llama_params.h @@ -2,28 +2,41 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include #include #include #include +#include "src/turbomind/models/llama/weight_type.h" + namespace turbomind { +struct MLAParam { + size_t q_lora_rank; + size_t kv_lora_rank; + size_t qk_rope_dim; + size_t v_head_dim; +}; + struct ModelParam { - size_t head_num; - size_t head_dim; - size_t kv_head_num; - size_t hidden_units; - size_t layer_num; - size_t inter_size; - size_t vocab_size; - size_t embedding_size; - float norm_eps; - int quant_policy; - // - int start_id; - int end_id; + size_t head_num; + size_t head_dim; + size_t kv_head_num; + size_t hidden_units; + size_t layer_num; + size_t vocab_size; + size_t embedding_size; + float norm_eps; + int quant_policy; + bool attn_bias; + WeightType weight_type; + int group_size; + int start_id; + int end_id; + MLAParam mla; + int tune_layer_num; + + std::vector inter_size; }; struct MoeParam { @@ -32,17 +45,25 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; - bool norm_topk; - bool shared_gate; + + int experts_per_token; + int inter_size; + bool norm_topk_prob; + bool shared_gate; + float routed_scale; + + int topk_group; + std::string topk_method; + int n_group; + + std::vector expert_num; }; struct AttentionParam { int rotary_embedding_dim; float rotary_embedding_base; int max_position_embeddings; + float softmax_scale; std::string rope_scaling_type; int original_max_position_embeddings; float rope_scaling_factor; @@ -74,6 +95,12 @@ struct EngineParam { int max_prefill_iters; }; +enum class LoraPolicy : int +{ + kNull, + kPlora, +}; + struct LoraParam { int r; float scale; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 925c6b8831..eaa450ae20 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -1,47 +1,25 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/cuda_utils.h" #include #include #include #include +#include +#include + #include #include #include #include #include -#include + +#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { CmpMode compare_mode = kCmpRead; - -template -struct abs_diff_t { - using type = T; -}; - -template<> -struct abs_diff_t { - using type = float; -}; - -template<> -struct abs_diff_t<__nv_bfloat16> { - using type = float; -}; - -template -struct abs_diff: public thrust::unary_function, typename abs_diff_t::type> { - __host__ __device__ float operator()(thrust::tuple x) const - { - using R = typename abs_diff_t::type; - auto r = R(thrust::get<0>(x)) - R(thrust::get<1>(x)); - return r < R(0) ? 
-r : r; - } -}; +// CmpMode compare_mode = kCmpWrite; template void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) @@ -63,10 +41,8 @@ void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) template void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) { - // wait for b - check_cuda_error(cudaStreamSynchronize(stream)); // read a from file - thrust::host_vector h_a(size); + std::vector h_a(size); { const auto filename = "tmp/" + key + ".cmp"; std::ifstream ifs(filename, std::ios::binary); @@ -85,15 +61,30 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) } ifs.read((char*)h_a.data(), sizeof(T) * h_a.size()); } - // copy a to device - thrust::device_vector a = h_a; - // create abs(a - b) iterator - thrust::device_ptr dev_ptr(ptr); - auto zip_iter = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), dev_ptr)); - auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff{}); - // sum(abs(a - b)) - auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size); - std::cerr << key << ": " << asum << " " << asum / size << "\n"; + std::vector h_b(size); + check_cuda_error(cudaMemcpyAsync(h_b.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); + + using Tacc = std::conditional_t, int64_t, float>; + constexpr Tacc eps = std::is_integral_v ? 1 : 1e-8f; + + Tacc asum{}; + Tacc rsum{}; + Tacc amean{}; + for (size_t i = 0; i < size; ++i) { + Tacc x = (Tacc)h_b[i]; + Tacc r = (Tacc)h_a[i]; + Tacc abs_diff = std::abs(x - r); + Tacc rel_diff = abs_diff / std::max(std::max(std::abs(r), std::abs(x)), eps); + asum += abs_diff; + rsum += rel_diff; + amean += std::abs(r); + } + + std::cerr << key << ": " << amean / size << " " << asum << " " << asum / size << " " << rsum / size << "\n"; + + check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); } template diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu new file mode 100644 index 0000000000..2f9e786f2a --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.cu @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
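// ----------------------------------------------------------------------------
// A minimal host-side sketch of the buffer comparison that the new CmpRead in
// llama_utils.cu above performs, shown for float for clarity. compare_on_host
// is a hypothetical helper, not part of turbomind; the device-to-host copy and
// the check_cuda_error wrappers are omitted.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

inline void compare_on_host(const std::vector<float>& ref, const std::vector<float>& val, const char* key)
{
    const double eps = 1e-8;  // guards the relative error against division by zero
    double asum  = 0;         // sum of absolute differences
    double rsum  = 0;         // sum of relative differences
    double amean = 0;         // mean magnitude of the reference values
    for (size_t i = 0; i < ref.size(); ++i) {
        const double r        = ref[i];
        const double x        = val[i];
        const double abs_diff = std::abs(x - r);
        asum += abs_diff;
        rsum += abs_diff / std::max({std::abs(r), std::abs(x), eps});
        amean += std::abs(r);
    }
    const double n = static_cast<double>(ref.size());
    std::fprintf(stderr, "%s: %f %f %f %f\n", key, amean / n, asum, asum / n, rsum / n);
}
// ----------------------------------------------------------------------------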
+#include "src/turbomind/kernels/core/array_ops.h" + +namespace turbomind { + +template +__global__ void mla_copy_qkv_kernel(T* qkv, + const T* q, // [h, head_dim] + const T* kv_a, // [kv_lora_rank, rope_dim] + const T* kv_b, // [h, nope_dim + v_head_dim] + int head_num, + int head_dim, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim) +{ + const int type = blockIdx.y; + + const int64_t ti = blockIdx.x; + const int di = threadIdx.x; + + const int kv_b_dim = nope_dim + v_head_dim; + + // for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { + const int hi = threadIdx.y; + Array data{}; + if (type == 0) { // Q + if (di * vec_size < rope_dim) { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + nope_dim + di * vec_size]); + } + else { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size - rope_dim]); + } + } + else if (type == 1) { // K + if (di * vec_size < rope_dim) { + Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); + } + else { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); + } + } + else { // V + if (di * vec_size < v_head_dim) { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); + } + } + const int stride = 3 * head_num * head_dim; + Store(&qkv[ti * stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); + // } +} + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + constexpr int vec_size = 16 / sizeof(T); + const int head_dim = nope_dim + rope_dim; + + dim3 block(head_dim / vec_size, head_num); + // make sure block size <= 1024 + while (block.x * block.y > 1024) { + block.y /= 2; + } + const dim3 grid(token_num, 3); + + mla_copy_qkv_kernel<<>>( + qkv, q, kv_a, kv_b, head_num, head_dim, nope_dim, rope_dim, kv_lora_rank, v_head_dim); +} + +template void invokeMLACopyQKV(uint16_t* qkv, + const uint16_t* q, + const uint16_t* kv_a, + const uint16_t* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h new file mode 100644 index 0000000000..bc06a352f9 --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.h @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+#pragma once + +#include +#include + +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +template +void dispatchMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + auto invoke = [&](auto x) { + using type = decltype(x); + invokeMLACopyQKV((type*)qkv, + (const type*)q, + (const type*)kv_a, + (const type*)kv_b, + token_num, + head_num, + nope_dim, + rope_dim, + kv_lora_rank, + v_head_dim, + stream); + }; + if constexpr (sizeof(T) == 2) { + return invoke(uint16_t{}); + } + FT_CHECK(0); +} + +} // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index 1ad76839d1..390d147540 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -11,22 +11,21 @@ #include "src/turbomind/utils/nvtx_utils.h" #include "src/turbomind/utils/string_utils.h" #include -#include #include namespace turbomind { template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) +void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor) { char* base = 0; auto allocate = [&](void* base) { Monotonic alloc{base}; alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); - alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * 2); - alloc(&logits_, tokens * param_.expert_num); - alloc(&masks_, param_.expert_num * padded); + alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * inter_buf_factor); + alloc(&logits_, tokens * expert_num); + alloc(&masks_, expert_num * padded); alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); @@ -80,18 +79,42 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama template void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { - const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const int expert_num = moe.experts.size(); - AllocateBuffer(tokens, padded); + FT_CHECK(expert_num); + + const size_t inter_buf_factor = [&] { + if (param_.method == MoeParam::kNaive) { + return 0; // managed by ffn + } + else if (moe.block.is_fused_silu) { + return 1; + } + else { + return 2; + } + }(); + + AllocateBuffer(tokens, padded, expert_num, inter_buf_factor); gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); - check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); - sync_check_cuda_error(); + // if (tensor_para_.rank_ == 0) { + // Compare(logits_, tokens * expert_num, Concat("logit", layer_id), compare_mode, stream_); + // } + + check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); + check_cuda_error(cudaMemsetAsync(masks_, -1, sizeof(int8_t) * expert_num * padded, stream_)); // dump_logits(tokens, layer_id); + if (param_.topk_method == "group_limited_greedy") { + invokeMaskMoeTopKGroups(logits_, 
tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); + sync_check_cuda_error(); + } + /// TODO: fix illegal memory access even if NaN are present in logits invokeMoeGate_V2(f2n_, en2f_, @@ -102,25 +125,26 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id logits_, tokens, padded, - param_.expert_num, + expert_num, param_.experts_per_token, - param_.norm_topk, + param_.norm_topk_prob, + param_.routed_scale, stream_); sync_check_cuda_error(); if (isTuning()) { std::mt19937 g; - const auto expert_ids = SampleUniform(tokens, param_.expert_num, param_.experts_per_token, g); - std::vector cnt(param_.expert_num); + const auto expert_ids = SampleUniform(tokens, expert_num, param_.experts_per_token, g); + std::vector cnt(expert_num); for (const auto& x : expert_ids) { ++cnt[x]; } h_offsets_[0] = 0; - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { h_offsets_[i + 1] = h_offsets_[i] + cnt[i]; } check_cuda_error( - cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); } if (param_.method == MoeParam::kNaive) { @@ -129,15 +153,15 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id sync_check_cuda_error(); check_cuda_error( - cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); - if (h_offsets_[param_.expert_num] != tokens * param_.experts_per_token) { - FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[param_.expert_num], tokens * param_.experts_per_token)); + if (h_offsets_[expert_num] != tokens * param_.experts_per_token) { + FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[expert_num], tokens * param_.experts_per_token)); } - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { FT_CHECK(moe.experts[i].is_fused_silu == false); @@ -153,7 +177,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } } else { - context_->set_offsets(offsets_); + context_->update(expert_num, param_.experts_per_token, offsets_); auto& block = moe.block; @@ -217,7 +241,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } template -void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe) { invokeMoeReduce(output, inout_buf_, @@ -227,19 +251,21 @@ void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) tokens, param_.experts_per_token, hidden_dim_, + output_scale, stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { + // std::cout << "moe all reduce " << layer_id << "\n"; ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } } template -void MoeFfnLayer::dump_logits(int token_num, int layer_id) +void MoeFfnLayer::dump_logits(int token_num, int layer_id, int expert_num) { - std::vector logits(token_num * param_.expert_num); + std::vector logits(token_num * expert_num); check_cuda_error( cudaMemcpyAsync(logits.data(), logits_, sizeof(float) * logits.size(), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); @@ -247,7 
+273,7 @@ void MoeFfnLayer::dump_logits(int token_num, int layer_id) auto ptr = logits.data(); std::cout << "layer_id: " << layer_id << std::endl; for (int i = 0; i < token_num; ++i) { - for (int e = 0; e < param_.expert_num; ++e) { + for (int e = 0; e < expert_num; ++e) { std::cout << *ptr++ << " "; } std::cout << std::endl; diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 0f1713f7b5..74c62d004b 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -9,6 +9,7 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/nccl_utils.h" +#include namespace turbomind { @@ -26,23 +27,24 @@ class MoeFfnLayer { linear_(ctx.linear.get()), allocator_(ctx.allocator.get()) { - model.inter_size = param.inter_size; + FT_CHECK(!param.expert_num.empty()); + const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); if (param_.method == MoeParam::kFused) { context_ = std::make_unique( - param.expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); + max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); } else { - expert_ffn_ = std::make_unique>(model, tp, ctx, false); + expert_ffn_ = std::make_unique>(model, tp, ctx); } - h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1), false, true); + h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); - offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1)); - accum_ = (int*)allocator_->malloc(sizeof(int) * param_.expert_num * kMoeGateMaxTiles); + offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1)); + accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); } - void AllocateBuffer(size_t tokens, size_t padded); + void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor); void FreeBuffer(); @@ -53,11 +55,11 @@ class MoeFfnLayer { void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - void reduce(T* output, int tokens, const MoeFfnWeight& moe); + void reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); - void dump_logits(int token_num, int layer_id); + void dump_logits(int token_num, int layer_id, int expert_num); private: const size_t inter_size_; diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 2f99b0c2ce..7a6eddc4ba 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -19,21 +19,24 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc -#include "src/turbomind/models/llama/unified_attention_layer.h" +#include +#include + #include "src/turbomind/kernels/attention/attention.h" #include "src/turbomind/kernels/attention/decoding.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" +#include 
"src/turbomind/models/llama/mla_utils.h" +#include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" -#include -#include +#include "src/turbomind/utils/memory_utils.h" namespace turbomind { @@ -72,17 +75,14 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, } template -void UnifiedAttentionLayer::allocateBuffer(size_t q_count, - size_t k_count, - size_t batch_size, - const WeightType* weights) +void UnifiedAttentionLayer::allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - if (weights->qkv.lora.r) { - size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + weights->qkv.lora.r); + if (qkv_lora_rank) { + size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + qkv_lora_rank); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sz, false); } else { @@ -198,28 +198,38 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa allocateBuffer(token_num, // shared h_cu_k_len[batch_size] - h_cu_k_len[dc_batch_size], // prefill batch_size, - weights); + weights->qkv.lora.r); // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; - static int count = 0; + // static int count = 0; - // if (layer_id == 0 && count == 0) { - // Compare(attention_input, token_num * weights->qkv.input_dims, "qkv_input", compare_mode, stream_); + // if (tensor_para_.rank_ == 0) { + // Compare(attention_input, token_num * hidden_units_, Concat("qkv_input", layer_id), compare_mode, stream_); // } int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - ////////////////////////////////////////////// - /// qkv gemm - // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] - linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); - sync_check_cuda_error(); + + if (weights->qkv.output_dims) { + ////////////////////////////////////////////// + /// qkv gemm + // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] + linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); + } + else { + forward_mla(attention_input, token_num, *weights); + } + + // std::cerr << layer_id << " " << count << " " << tensor_para_.rank_ << "\n"; count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); - // if (layer_id == 0 && count == 0) { - // Compare(qkv_buf_, token_num * weights->qkv.output_dims, "qkv_buf", compare_mode, stream_); + // std::cerr << "token num: " << token_num << "\n"; + + // if (layer_id == 0 && count == 0 && tensor_para_.rank_ == 0) { + // Compare(qkv_buf_, token_num * (3 * local_head_num_ * size_per_head_), "qkv_buf", CMP_MODE, stream_); // } if constexpr (0) { @@ -290,8 +300,15 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.num_heads = local_head_num_; params.num_kv_heads = local_kv_head_num_; params.size_per_head = size_per_head_; + // MSVC does not have M_LOG2E - params.inv_sqrt_dh = (float)std::log2(expf(1.)) / std::sqrt((float)params.size_per_head); + params.inv_sqrt_dh = 
(float)std::log2(expf(1.)); + if (param_.softmax_scale) { // model predefined softmax scale + params.inv_sqrt_dh *= param_.softmax_scale; + } + else { // default value + params.inv_sqrt_dh /= std::sqrt((float)params.size_per_head); + } params.rotary_embedding_dim = param_.rotary_embedding_dim; params.rotary_embedding_base = param_.rotary_embedding_base; @@ -324,8 +341,9 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa }; float low, high; find_correction_range(param_.beta_fast, param_.beta_slow, low, high); + // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216 if (low == high) { - high += 0.01f; + high += 0.001f; } params.yarn_ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0; params.yarn_ramp_inv_factor_mul_min = 1.0 / (high - low) * low; @@ -415,8 +433,6 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); - // ++count; - count_and_fix(attention_out, token_num * weights->output.output_dims, Concat("wo", layer_id), 3); if (tensor_para_.world_size_ > 1) { @@ -425,10 +441,94 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa sync_check_cuda_error(); } + // if (tensor_para_.rank_ == 0) { + // Compare(attention_out, token_num * hidden_units_, Concat("attn_out", layer_id), compare_mode, stream_); + // // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); + // } + if (is_free_buffer_after_forward_ == true) { freeBuffer(); } sync_check_cuda_error(); + + // ++count; +} + +template +void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const WeightType& w) +{ + const int q_lora_rank = w.q_a_proj.output_dims; + const int kv_lora_rank = w.kv_b_proj.input_dims; + const int qk_rope_dim = w.kv_a_proj.output_dims - kv_lora_rank; + const int qk_nope_dim = std::max(w.q_b_proj.output_dims, w.q_proj.output_dims) / local_head_num_ - qk_rope_dim; + const int v_head_dim = w.kv_b_proj.output_dims / local_head_num_ - qk_nope_dim; + + T* q{}; + + if (w.q_proj.kernel) { + deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); + linear_->forward(q, inputs, token_num, w.q_proj); + sync_check_cuda_error(); + } + else { + T* q_a{}; + deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); + + linear_->forward(q_a, inputs, token_num, w.q_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm(q_a, + q_lora_rank, + q_a, + q_lora_rank, + w.q_a_layernorm, + q_lora_rank, + token_num, + model_param_.norm_eps, + stream_); + sync_check_cuda_error(); + + deviceMalloc((T**)&q, (size_t)token_num * w.q_b_proj.output_dims, stream_); + linear_->forward(q, q_a, token_num, w.q_b_proj); + sync_check_cuda_error(); + + deviceFree(q_a, stream_); + } + + T* kv_a{}; + const int kv_a_dim = w.kv_a_proj.output_dims; + deviceMalloc((T**)&kv_a, (size_t)token_num * kv_a_dim, stream_); + + linear_->forward(kv_a, inputs, token_num, w.kv_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm( + kv_a, kv_a_dim, kv_a, kv_a_dim, w.kv_a_layernorm, kv_lora_rank, token_num, model_param_.norm_eps, stream_); + sync_check_cuda_error(); + + T* kv_b{}; + deviceMalloc((T**)&kv_b, (size_t)token_num * w.kv_b_proj.output_dims, stream_); + sync_check_cuda_error(); + + linear_->forward(kv_b, {kv_a, kv_a_dim}, token_num, w.kv_b_proj); + sync_check_cuda_error(); + + dispatchMLACopyQKV(qkv_buf_, + 
q, + kv_a, + kv_b, + token_num, + local_head_num_, + qk_nope_dim, + qk_rope_dim, + kv_lora_rank, + v_head_dim, + stream_); + sync_check_cuda_error(); + + deviceFree(q, stream_); + deviceFree(kv_a, stream_); + deviceFree(kv_b, stream_); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index da0c0e6fc8..7d331b0e41 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -42,7 +42,7 @@ class UnifiedAttentionLayer { static constexpr int kMaxWorkspaceTokens = 4096; void freeBuffer(); - void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, const WeightType* weights); + void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank); void allocateWorkspace(); void freeWorkspace(); @@ -70,7 +70,7 @@ class UnifiedAttentionLayer { const NcclParam& tp, const Context& context); - void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight* weights); + void forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights); void prefill(T* output, T* tmp_kv_buffer, @@ -107,6 +107,9 @@ class UnifiedAttentionLayer { int max_split_k, const WeightType* weights); +private: + void forward_mla(const T* inputs, int token_num, const WeightType& weights); + private: const size_t head_num_; const size_t kv_head_num_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 28e8b5f649..ec0e75b7e5 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -1,13 +1,17 @@ -#include "src/turbomind/models/llama/unified_decoder.h" + +#include + +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" +#include "src/turbomind/models/llama/unified_decoder.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include namespace turbomind { @@ -23,17 +27,19 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), allocator_(ctx.allocator.get()), - dtype_(getTensorType()) + tp_(tp), + dtype_(getTensorType()), + tune_layer_num_(model.tune_layer_num) { attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - if (moe.expert_num) { + if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) { moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); } - if (model.inter_size) { - ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { + ffn_layer_ = std::make_unique>(model, tp, ctx); } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); @@ -65,13 +71,13 @@ void UnifiedDecoder::freeBuffer() } template -void UnifiedDecoder::forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight) +void UnifiedDecoder::forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + 
int layer_id, + const WeightType* weight) { TensorMap inputs(*_inputs); inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); @@ -84,7 +90,7 @@ void UnifiedDecoder::forwardSelfAttn(T* attn_io, TensorMap outputs(*_outputs); outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - attn_layer_->forward(&outputs, &inputs, weight); + attn_layer_->forward(&outputs, &inputs, &weight->self_attn_weights); } template @@ -141,19 +147,15 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con const int pf_offset = dc_batch_size; - // Compare(decoder_input_output, token_num * hidden_units_, "decoder_input", kCmpRead, stream_); - - // printf("%d %f\n", (int)token_num, rmsnorm_eps_); - ///////////////////////////////////////////// /// RMSNorm - invokeRootMeanSquareNorm(decoder_output, - decoder_input_output, - weights->at(0)->self_attn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeRMSNorm(decoder_output, + decoder_input_output, + weights->at(0)->self_attn_norm_weights, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", 0), 2); @@ -161,12 +163,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con for (size_t layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous - if (isTuning() && layer != 0) { + if (isTuning() && layer >= tune_layer_num_) { continue; } - // Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); - ///////////////////////////////////////////// /// self-attention forwardSelfAttn(decoder_output, // @@ -175,18 +175,18 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con token_num, batch_size, layer, - &weights->at(layer)->self_attn_weights); + weights->at(layer)); count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); - invokeFusedAddBiasResidualRMSNorm(decoder_input_output, - decoder_output, - weights->at(layer)->self_attn_weights.output.bias, - weights->at(layer)->ffn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeBiasResidualRMSNorm(decoder_input_output, + decoder_output, + weights->at(layer)->ffn_norm_weights, + weights->at(layer)->self_attn_weights.output.bias, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual0", layer), 2); @@ -195,14 +195,17 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con //////////////////////////////////////////// /// feed-forward network - if (!weights->at(layer)->moe_weights.experts.empty()) { + const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); + if (is_moe) { moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - if (ffn_layer_) { - int layer_id = layer; // int is needed + if (weights->at(layer)->ffn_weights.output.kernel) { + int layer_id = layer; // int is needed + bool all_reduce = !is_moe; TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; + {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}, + {"all_reduce", {MEMORY_CPU, TYPE_BOOL, {1}, &all_reduce}}}; TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, 
dtype_, {token_num, hidden_units_}, decoder_output}}}; if (inputs->isExist("lora_mask")) { ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); @@ -210,8 +213,8 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } - if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + if (is_moe) { + moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); } count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index f13b4ba842..e08567136d 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -22,7 +22,9 @@ class UnifiedDecoder { const float rmsnorm_eps_; cudaStream_t const stream_; IAllocator* const allocator_; + const NcclParam tp_; const DataType dtype_; + const int tune_layer_num_; bool is_free_buffer_after_forward_{}; int* cu_q_len_{}; @@ -39,13 +41,13 @@ class UnifiedDecoder { using WeightType = LlamaDecoderLayerWeight; - void forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight); + void forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + int layer_id, + const WeightType* weight); public: UnifiedDecoder(const ModelParam& model, diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h new file mode 100644 index 0000000000..bc2f49a08e --- /dev/null +++ b/src/turbomind/models/llama/weight_type.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include + +namespace turbomind { + +enum class WeightType : int +{ + kFP32, + kFP16, + kFP8, // not supported yet + kBF16, + kINT8, + kINT4 +}; + +template +constexpr WeightType get_default_weight_type() +{ + if constexpr (std::is_same_v) { + return WeightType::kFP16; + } + else if constexpr (std::is_same_v) { + return WeightType::kBF16; + } + else if constexpr (std::is_same_v) { + return WeightType::kFP32; + } + else { + static_assert(sizeof(T) != sizeof(T), "not implemented"); + return {}; + } +} + +inline size_t getBitSize(WeightType type) +{ + switch (type) { + case WeightType::kFP32: + return 32; + case WeightType::kFP16: + return 16; + case WeightType::kFP8: + return 8; + case WeightType::kBF16: + return 16; + case WeightType::kINT8: + return 8; + case WeightType::kINT4: + return 4; + } + return 0; +} + +} // namespace turbomind diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 4eb34249ff..5a344d9545 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -215,6 +215,51 @@ DLTensor GetDLTensor(py::object obj) return dlmt->dl_tensor; } +static void safe_memcpy(void* dst, const void* src, size_t size) +{ + cudaPointerAttributes dat{}; + cudaPointerAttributes sat{}; + ft::check_cuda_error(cudaPointerGetAttributes(&dat, dst)); + ft::check_cuda_error(cudaPointerGetAttributes(&sat, src)); + try { + if (dat.devicePointer && sat.devicePointer) { + // Both can be accessed from current context + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + else if (dat.type == cudaMemoryTypeDevice && sat.type == 
cudaMemoryTypeDevice) { + if (dat.device != sat.device) { + // On different devices, try peer memcpy + ft::check_cuda_error(cudaMemcpyPeer(dst, dat.device, src, sat.device, size)); + } + else { + // Same device, switch to the device first (this is unlikely) + ft::CudaDeviceGuard guard(dat.device); + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + else { + // Unknown case, give it a try anyway + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + catch (...) { + int device_id{-1}; + cudaGetDevice(&device_id); + TM_LOG_ERROR("cudaMemcpy failed: dst=(%d, %d, %p, %p), src=(%d, %d, %p, %p), size=%s, device=%d", + (int)dat.type, + dat.device, + dat.devicePointer, + dat.hostPointer, + (int)sat.type, + sat.device, + sat.devicePointer, + sat.hostPointer, + std::to_string(size).c_str(), + device_id); + throw; + } +} + PYBIND11_MODULE(_turbomind, m) { // nccl param @@ -293,8 +338,7 @@ PYBIND11_MODULE(_turbomind, m) std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - cudaMemcpy( - const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); + safe_memcpy(const_cast(self->data), src->data, num_bytes); break; } default: diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 2deca46380..1c7c5eb468 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -256,22 +256,30 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.kv_head_num = model_reader["kv_head_num"].as(0); model_param_.hidden_units = model_reader["hidden_units"].as(); model_param_.layer_num = model_reader["num_layer"].as(); - model_param_.inter_size = model_reader["inter_size"].as(); model_param_.vocab_size = model_reader["vocab_size"].as(); model_param_.embedding_size = model_reader["embedding_size"].as(); model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); + model_param_.tune_layer_num = model_reader["tune_layer_num"].as(1); + model_param_.mla.q_lora_rank = model_reader["q_lora_rank"].as(); + model_param_.mla.kv_lora_rank = model_reader["kv_lora_rank"].as(); + model_param_.mla.qk_rope_dim = model_reader["qk_rope_dim"].as(); + model_param_.mla.v_head_dim = model_reader["v_head_dim"].as(); attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); model_param_.quant_policy = engine_reader["quant_policy"].as(0); - + YAML::Node inter_size = model_reader["inter_size"]; + for (auto it = inter_size.begin(); it != inter_size.end(); ++it) { + model_param_.inter_size.push_back(it->as()); + } // Only weight classes need these - attn_bias_ = model_reader["attn_bias"].as(0); - group_size_ = model_reader["group_size"].as(0); + model_param_.attn_bias = model_reader["attn_bias"].as(0); + model_param_.group_size = model_reader["group_size"].as(0); // rotary embedding parameters attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); attn_param_.rotary_embedding_base = attention_reader["rope_theta"].as(10000.0f); + attn_param_.softmax_scale = attention_reader["softmax_scale"].as(0); attn_param_.attention_factor = attention_reader["attention_factor"].as(-1.f); attn_param_.beta_fast = 
attention_reader["beta_fast"].as(32.f); attn_param_.beta_slow = attention_reader["beta_slow"].as(1.f); @@ -297,19 +305,27 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); - lora_param_.r = lora_reader["lora_r"].as(0); - lora_param_.scale = lora_reader["lora_scale"].as(0); - lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); - lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), + lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.r = lora_reader["lora_r"].as(0); + lora_param_.scale = lora_reader["lora_scale"].as(0); + lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); + lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), [](const std::string& s) { return std::stoi(s); }); - lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), + lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), [](const std::string& s) { return std::stof(s); }); - moe_param_.expert_num = model_reader["expert_num"].as(0); + moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); - moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); - moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(); + moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(); + moe_param_.routed_scale = model_reader["routed_scale"].as(1.f); + moe_param_.topk_group = model_reader["topk_group"].as(1); + moe_param_.topk_method = model_reader["topk_method"].as("greedy"); + moe_param_.n_group = model_reader["moe_group_num"].as(1); + YAML::Node expert_num = model_reader["expert_num"]; + for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { + moe_param_.expert_num.push_back(it->as()); + } handleMissingParams(); @@ -321,19 +337,19 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - weight_type_ = ft::WeightType::kFP16; + model_param_.weight_type = ft::WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - weight_type_ = ft::WeightType::kBF16; + model_param_.weight_type = ft::WeightType::kBF16; } else if (weight_type_str == "fp32") { - weight_type_ = ft::WeightType::kFP32; + model_param_.weight_type = ft::WeightType::kFP32; } else if (weight_type_str == "int8") { - weight_type_ = ft::WeightType::kINT8; + model_param_.weight_type = ft::WeightType::kINT8; } else if (weight_type_str == "int4") { - weight_type_ = ft::WeightType::kINT4; + model_param_.weight_type = ft::WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; @@ -418,21 +434,8 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>(model_param_.head_num, - model_param_.kv_head_num, - model_param_.head_dim, - 
model_param_.hidden_units, - model_param_.inter_size, - model_param_.vocab_size, - model_param_.embedding_size, - model_param_.layer_num, - attn_bias_, - weight_type_, - group_size_, - lora_param_, - moe_param_, - tensor_para_size_, - tensor_para_rank); + weights_[device_id] = std::make_shared>( + model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -488,9 +491,11 @@ std::string LlamaTritonModel::toString() std::stringstream ss; ss << "Model: " // << "\nhead_num: " << model_param_.head_num << "\nkv_head_num: " << model_param_.kv_head_num - << "\nsize_per_head: " << model_param_.head_dim << "\ninter_size: " << model_param_.inter_size + << "\nsize_per_head: " + << model_param_.head_dim + // << "\ninter_size: " << model_param_.inter_size << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size - << "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_param_.max_batch_size + << "\nattn_bias: " << model_param_.attn_bias << "\nmax_batch_size: " << engine_param_.max_batch_size << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num << "\nmax_context_token_num: " << engine_param_.max_context_token_num << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter @@ -501,8 +506,9 @@ std::string LlamaTritonModel::toString() << "\nenable_prefix_caching: " << engine_param_.enable_prefix_caching << "\nstart_id: " << model_param_.start_id << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_ - << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy - << "\ngroup_size: " << group_size_ << "\nexpert_num: " << moe_param_.expert_num + << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy << "\ngroup_size: " + << model_param_.group_size + // << "\nexpert_num: " << moe_param_.expert_num << "\nexpert_per_token: " << moe_param_.experts_per_token << "\nmoe_method: " << moe_param_.method << std::endl; return ss.str(); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 19a143e721..a6c1b862ac 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -91,9 +91,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { ft::EngineParam engine_param_; size_t tensor_para_size_; size_t pipeline_para_size_; - ft::WeightType weight_type_; - bool attn_bias_; - int group_size_; std::shared_ptr shared_state_; // Weights & engine instances for the ranks diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index bdcb9bfc46..88c299c3de 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -281,7 +281,8 @@ class Allocator: public IAllocator { pointer_mapping_.erase(address); } else { - TM_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address); + FT_CHECK_WITH_INFO(0, + fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); } } *ptr = nullptr; diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 2148fcc164..8311e6eb9e 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -483,5 +483,24 @@ void compareTwoTensor( bool 
is_16xx_series(const char* name); +class CudaDeviceGuard { +public: + CudaDeviceGuard(int device) + { + cudaGetDevice(&last_device_id_); + if (device != last_device_id_) { + cudaSetDevice(device); + } + } + + ~CudaDeviceGuard() + { + cudaSetDevice(last_device_id_); + } + +private: + int last_device_id_{-1}; +}; + /* ************************** end of common utils ************************** */ } // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index f8bfb8efe0..e9a79ea5a1 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -26,77 +26,71 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize) +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize) { - FT_CHECK_WITH_INFO(size >= ((size_t)0), "Ask deviceMalloc size " + std::to_string(size) + "< 0 is invalid."); - check_cuda_error(cudaMalloc((void**)(ptr), sizeof(T) * size)); + check_cuda_error(cudaMallocAsync((void**)(ptr), sizeof(T) * size, st)); if (is_random_initialize) { - cudaRandomUniform(*ptr, size); + cudaRandomUniform(*ptr, size, st); } } -template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(float** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(half** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_BF16 -template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_bfloat16** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif -template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(uint16_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(bool** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(char** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int8_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_FP8 -template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif template -void deviceMemSetZero(T* ptr, size_t size) -{ - check_cuda_error(cudaMemset(static_cast(ptr), 0, sizeof(T) * size)); -} - -template void deviceMemSetZero(float* ptr, size_t size); -template void deviceMemSetZero(half* ptr, size_t size); -template void deviceMemSetZero(int* ptr, size_t size); -template void deviceMemSetZero(uint32_t* ptr, size_t size); -template void deviceMemSetZero(bool* ptr, size_t size); -#ifdef ENABLE_FP8 -template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size); -#endif -#ifdef ENABLE_BF16 -template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size); -#endif - -template -void deviceFree(T*& ptr) +void deviceFree(T*& ptr, cudaStream_t 
st) { if (ptr != NULL) { - check_cuda_error(cudaFree(ptr)); + check_cuda_error(cudaFreeAsync(ptr, st)); ptr = NULL; } } -template void deviceFree(float*& ptr); -template void deviceFree(half*& ptr); +template void deviceFree(float*& ptr, cudaStream_t); +template void deviceFree(half*& ptr, cudaStream_t); #ifdef ENABLE_BF16 -template void deviceFree(__nv_bfloat16*& ptr); +template void deviceFree(__nv_bfloat16*& ptr, cudaStream_t); #endif -template void deviceFree(unsigned short*& ptr); -template void deviceFree(int*& ptr); -template void deviceFree(bool*& ptr); -template void deviceFree(char*& ptr); -template void deviceFree(int8_t*& ptr); +template void deviceFree(unsigned short*& ptr, cudaStream_t); +template void deviceFree(int*& ptr, cudaStream_t); +template void deviceFree(bool*& ptr, cudaStream_t); +template void deviceFree(char*& ptr, cudaStream_t); +template void deviceFree(int8_t*& ptr, cudaStream_t); +template void deviceFree(void*& ptr, cudaStream_t); #ifdef ENABLE_FP8 -template void deviceFree(__nv_fp8_e4m3*& ptr); +template void deviceFree(__nv_fp8_e4m3*& ptr, cudaStream_t); #endif +namespace { + +template +__global__ void fill_kernel(T* devptr, size_t size, T value) +{ + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + for (size_t i = idx; i < size; i += blockDim.x * gridDim.x) { + devptr[i] = value; + } +} + +} // namespace + template void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream) { - T* arr = new T[size]; - std::fill(arr, arr + size, value); - check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream)); - delete[] arr; + constexpr int threads = 512; + const int blocks = (size + threads - 1) / threads; + fill_kernel<<>>(devptr, size, value); } template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream); @@ -280,23 +274,23 @@ __global__ void cuda_random_uniform_kernel(char* buffer, const size_t size } template -void cudaRandomUniform(T* buffer, const size_t size) +void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t st) { static int seq_offset = 0; - cuda_random_uniform_kernel<<<256, 256>>>(buffer, size, seq_offset); + cuda_random_uniform_kernel<<<256, 256, 0, st>>>(buffer, size, seq_offset); seq_offset += 256 * 256; } -template void cudaRandomUniform(float* buffer, const size_t size); -template void cudaRandomUniform(half* buffer, const size_t size); +template void cudaRandomUniform(float* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(half* buffer, const size_t size, cudaStream_t); #ifdef ENABLE_BF16 -template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size); +template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size, cudaStream_t); #endif -template void cudaRandomUniform(int* buffer, const size_t size); -template void cudaRandomUniform(bool* buffer, const size_t size); -template void cudaRandomUniform(char* buffer, const size_t size); +template void cudaRandomUniform(int* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(bool* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(char* buffer, const size_t size, cudaStream_t); #ifdef ENABLE_FP8 -template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size); +template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size, cudaStream_t); #endif // loads data from binary file. If it succeeds, returns a non-empty vector. 
If loading fails or @@ -366,10 +360,10 @@ int loadWeightFromBinFunc(T* ptr, std::vector shape, std::string filenam } else { T_IN* ptr_2 = nullptr; - deviceMalloc(&ptr_2, host_array.size(), false); + deviceMalloc(&ptr_2, host_array.size(), nullptr, false); cudaH2Dcpy(ptr_2, host_array.data(), host_array.size()); invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size()); - deviceFree(ptr_2); + deviceFree(ptr_2, nullptr); } return 0; } diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h index bb7a4f9c03..03a0ef7b33 100644 --- a/src/turbomind/utils/memory_utils.h +++ b/src/turbomind/utils/memory_utils.h @@ -23,16 +23,13 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true); +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize = false); template -void deviceMemSetZero(T* ptr, size_t size); +void deviceFree(T*& ptr, cudaStream_t st); template -void deviceFree(T*& ptr); - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0); +void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = {}); template void cudaD2Hcpy(T* tgt, const T* src, const size_t size); @@ -44,10 +41,10 @@ template void cudaD2Dcpy(T* tgt, const T* src, const size_t size); template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = NULL); +void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = {}); template -void cudaRandomUniform(T* buffer, const size_t size); +void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t stream = {}); template int loadWeightFromBin(T* ptr, From 01f82e09c11b6866b8ebe862de2595ebe87e9733 Mon Sep 17 00:00:00 2001 From: zhabuye <74179177+zhabuye@users.noreply.github.com> Date: Fri, 29 Nov 2024 16:37:29 +0800 Subject: [PATCH 092/122] Add Ascend installation adapter (#2817) --- requirements/runtime_ascend.txt | 22 ++++++++++++++++++++++ requirements_ascend.txt | 4 ++++ setup.py | 22 ++++++++++++++++++---- 3 files changed, 44 insertions(+), 4 deletions(-) create mode 100644 requirements/runtime_ascend.txt create mode 100644 requirements_ascend.txt diff --git a/requirements/runtime_ascend.txt b/requirements/runtime_ascend.txt new file mode 100644 index 0000000000..d87748e396 --- /dev/null +++ b/requirements/runtime_ascend.txt @@ -0,0 +1,22 @@ +accelerate>=0.29.3 +dlinfer-ascend +einops +fastapi +fire +mmengine-lite +numpy<2.0.0 +openai +outlines<0.1.0 +peft<=0.11.1 +pillow +protobuf +pydantic>2.0.0 +pynvml +safetensors +sentencepiece +shortuuid +tiktoken +torch<=2.4.0,>=2.0.0 +torchvision<=0.19.0,>=0.15.0 +transformers +uvicorn diff --git a/requirements_ascend.txt b/requirements_ascend.txt new file mode 100644 index 0000000000..e844853ab4 --- /dev/null +++ b/requirements_ascend.txt @@ -0,0 +1,4 @@ +-r requirements/build.txt +-r requirements/runtime_ascend.txt +-r requirements/lite.txt +-r requirements/serve.txt diff --git a/setup.py b/setup.py index 32a69c600c..7a08ac7919 100644 --- a/setup.py +++ b/setup.py @@ -4,6 +4,14 @@ from setuptools import find_packages, setup +npu_available = False +try: + import torch_npu + + npu_available = torch_npu.npu.is_available() +except ImportError: + pass + pwd = os.path.dirname(__file__) version_file = 'lmdeploy/version.py' @@ -145,11 +153,17 @@ def gen_packages_items(): include_package_data=True, setup_requires=parse_requirements('requirements/build.txt'), tests_require=parse_requirements('requirements/test.txt'), - 
install_requires=parse_requirements('requirements/runtime.txt'), + install_requires=parse_requirements( + 'requirements/runtime_ascend.txt' + if npu_available else 'requirements/runtime.txt'), extras_require={ - 'all': parse_requirements('requirements.txt'), - 'lite': parse_requirements('requirements/lite.txt'), - 'serve': parse_requirements('requirements/serve.txt') + 'all': + parse_requirements('requirements_ascend.txt' + if npu_available else 'requirements.txt'), + 'lite': + parse_requirements('requirements/lite.txt'), + 'serve': + parse_requirements('requirements/serve.txt') }, has_ext_modules=check_ext_modules, classifiers=[ From 0b6dd1f23aa9b2239fc6d9c24314ee25bec3990c Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Fri, 29 Nov 2024 17:54:46 +0800 Subject: [PATCH 093/122] [CI] add more testcase for mllm models (#2791) * update * update * update * update * update * update * update * update * update --- autotest/config-v100.yaml | 16 +- autotest/config.yaml | 20 +- .../test_pipeline_chat_pytorch_llm.py | 2 - .../test_pipeline_chat_pytorch_mllm.py | 4 - .../test_pipeline_chat_turbomind_llm.py | 2 - .../test_pipeline_chat_turbomind_mllm.py | 4 - .../test_restful_chat_hf_pytorch_llm.py | 3 +- .../test_restful_chat_hf_pytorch_mllm.py | 3 +- .../test_restful_chat_hf_turbomind_llm.py | 3 +- .../test_restful_chat_hf_turbomind_mllm.py | 3 +- autotest/utils/pipeline_chat.py | 348 ++++++++++++++++++ autotest/utils/run_restful_chat.py | 15 +- docs/en/supported_models/supported_models.md | 4 +- .../supported_models/supported_models.md | 4 +- 14 files changed, 401 insertions(+), 30 deletions(-) diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml index 41216cb730..507f81ceb6 100644 --- a/autotest/config-v100.yaml +++ b/autotest/config-v100.yaml @@ -1,4 +1,5 @@ model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log benchmark_path: /nvme/qa_test_models/benchmark-reports @@ -100,12 +101,22 @@ turbomind_quatization: - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm-xcomposer2d5-7b - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - THUDM/glm-4-9b-chat + - deepseek-ai/deepseek-coder-1.3b-instruct + - codellama/CodeLlama-7b-Instruct-hf gptq: - internlm/internlm2_5-7b-chat no_kvint4: - openbmb/MiniCPM-V-2_6 + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2-7B-Instruct-AWQ + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-0.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat @@ -120,6 +131,10 @@ pytorch_quatization: no_kvint4: - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-4B + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct @@ -128,7 +143,6 @@ pytorch_quatization: no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat - longtext_model: - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct diff --git a/autotest/config.yaml b/autotest/config.yaml index 88ca7c3127..b4fd4e1712 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -1,4 +1,5 @@ model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource dst_path: /nvme/qa_test_models/autotest_model log_path: 
/nvme/qa_test_models/autotest_model/log benchmark_path: /nvme/qa_test_models/benchmark-reports @@ -18,6 +19,7 @@ tp_config: Qwen2-7B-Instruct-GPTQ-Int4: 2 InternVL2-40B: 2 MiniCPM-V-2_6: 2 + Qwen2.5-72B-Instruct: 4 turbomind_chat_model: - meta-llama/Llama-3.2-1B-Instruct @@ -164,7 +166,11 @@ pytorch_base_model: turbomind_quatization: no_awq: + - Qwen/Qwen1.5-MoE-A2.7B-Chat + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mistral-Nemo-Instruct-2407 - deepseek-ai/deepseek-coder-1.3b-instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf @@ -172,6 +178,12 @@ turbomind_quatization: - internlm/internlm2_5-7b-chat no_kvint4: - openbmb/MiniCPM-V-2_6 + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2-7B-Instruct-AWQ + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-0.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat @@ -203,6 +215,10 @@ pytorch_quatization: no_kvint4: - OpenGVLab/InternVL2-1B - OpenGVLab/InternVL2-4B + - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - deepseek-ai/DeepSeek-V2-Lite-Chat - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct @@ -211,7 +227,6 @@ pytorch_quatization: no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat - longtext_model: - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct @@ -227,7 +242,8 @@ benchmark_model: - internlm/internlm2_5-7b-chat - internlm/internlm2_5-20b-chat - THUDM/glm-4-9b-chat - - Qwen/Qwen2-7B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-72B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index a828e17a09..58674fa173 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -67,8 +67,6 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, exclude_dup=True)) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) spawn_context = get_context('spawn') diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py index 276ced5bcb..8403ced94f 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_mllm.py @@ -50,8 +50,6 @@ def test_pipeline_chat_tp2(config, model, worker_id): quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) spawn_context = get_context('spawn') @@ -70,8 +68,6 @@ def test_pipeline_chat_kvint4_tp1(config, model, worker_id): quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp2(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) diff --git 
a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 17560e754d..d1865175cf 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -56,8 +56,6 @@ def test_pipeline_chat_tp2(config, common_case_config, model, worker_id): @pytest.mark.parametrize('model', get_all_model_list(tp_num=1, quant_policy=4)) def test_pipeline_chat_kvint4_tp1(config, common_case_config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) spawn_context = get_context('spawn') diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 8f1bc7d8b1..8c845fa77a 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -50,8 +50,6 @@ def test_pipeline_chat_tp2(config, model, worker_id): quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp1(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) spawn_context = get_context('spawn') @@ -70,8 +68,6 @@ def test_pipeline_chat_kvint4_tp1(config, model, worker_id): quant_policy=4, model_type='vl_model')) def test_pipeline_chat_kvint4_tp2(config, model, worker_id): - if 'Qwen2' in model: - return # kvint4 for qwen2 is not support if 'gw' in worker_id: os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, tp_num=2) diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index ab1f5595ae..fc95e288ca 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -67,8 +67,7 @@ def getKvintModelList(tp_num, quant_policy): 'tp_num': tp_num, 'extra': f'--quant-policy {quant_policy}' } for item in get_torch_model_list( - tp_num, quant_policy=quant_policy, exclude_dup=True) - if 'qwen2' not in item.lower() or quant_policy == 8] + tp_num, quant_policy=quant_policy, exclude_dup=True)] @pytest.mark.order(7) diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py index b210733db4..bf20c45e6e 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_mllm.py @@ -60,8 +60,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): 'tp_num': tp_num, 'extra': f'--quant-policy {quant_policy}' } for item in get_torch_model_list( - tp_num, quant_policy=quant_policy, model_type='vl_model') - if 'qwen2' not in item.lower() or quant_policy == 8] + tp_num, quant_policy=quant_policy, model_type='vl_model')] @pytest.mark.order(7) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 91e65ee51a..1c9131b32e 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -66,8 +66,7 @@ def getKvintModelList(tp_num, quant_policy): 'cuda_prefix': None, 'tp_num': tp_num, 'extra': f'--quant-policy {quant_policy}' - } for item in 
get_all_model_list(tp_num, quant_policy=quant_policy) - if 'qwen2' not in item.lower() or quant_policy == 8] + } for item in get_all_model_list(tp_num, quant_policy=quant_policy)] @pytest.mark.order(7) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py index 091e18e6e3..641f2f760f 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_mllm.py @@ -60,8 +60,7 @@ def getKvintModelList(tp_num, quant_policy: int = None): 'tp_num': tp_num, 'extra': f'--quant-policy {quant_policy}' } for item in get_all_model_list( - tp_num, quant_policy=quant_policy, model_type='vl_model') - if 'qwen2' not in item.lower() or quant_policy == 8] + tp_num, quant_policy=quant_policy, model_type='vl_model')] @pytest.mark.order(7) diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 562a707efe..023e4ac142 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -3,7 +3,10 @@ from subprocess import PIPE import allure +import numpy as np import torch +from decord import VideoReader, cpu +from PIL import Image from pytest_assume.plugin import assume from utils.get_run_config import get_model_name, get_tp_num from utils.rule_condition_assert import assert_result @@ -13,6 +16,7 @@ from lmdeploy.utils import is_bf16_supported from lmdeploy.vl import load_image from lmdeploy.vl.constants import IMAGE_TOKEN +from lmdeploy.vl.utils import encode_image_base64 def run_pipeline_chat_test(config, @@ -275,6 +279,12 @@ def assert_pipeline_single_element(output, PIC1 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 PIC2 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg' # noqa E501 +PIC_BEIJING = 'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Beijing_Small.jpeg' # noqa E501 +PIC_CHONGQING = 'https://raw.githubusercontent.com/QwenLM/Qwen-VL/master/assets/mm_tutorial/Chongqing_Small.jpeg' # noqa E501 +PIC_REDPANDA = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg' # noqa E501 +PIC_PANDA = 'https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg' # noqa E501 +DESC = 'What are the similarities and differences between these two images.' # noqa E501 +DESC_ZH = '两张图有什么相同和不同的地方.' 
# noqa E501 def run_pipeline_vl_chat_test(config, @@ -386,12 +396,350 @@ def run_pipeline_vl_chat_test(config, ', reason: Multi-turn example: ski not in ' + sess.response.text + '\n') + if 'internvl' in model_case.lower(): + internvl_vl_testcase(config, pipe, file) + internvl_vl_testcase(config, pipe, file, 'cn') + if 'minicpm' in model_case.lower(): + MiniCPM_vl_testcase(config, pipe, file) + if 'qwen' in model_case.lower(): + Qwen_vl_testcase(config, pipe, file) + file.close() del pipe torch.cuda.empty_cache() +def internvl_vl_testcase(config, pipe, file, lang='en'): + if lang == 'cn': + description = DESC_ZH + else: + description = DESC + # multi-image multi-round conversation, combined images + messages = [ + dict(role='user', + content=[ + dict(type='text', + text=f'{IMAGE_TOKEN}{IMAGE_TOKEN}\n{description}'), + dict(type='image_url', + image_url=dict(max_dynamic_patch=12, url=PIC_REDPANDA)), + dict(type='image_url', + image_url=dict(max_dynamic_patch=12, url=PIC_PANDA)) + ]) + ] + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images: panda not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=description)) + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images second: panda not in ' + + response.text + '\n') + + # multi-image multi-round conversation, separate images + messages = [ + dict( + role='user', + content=[ + dict( + type='text', + text=f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\n' + + # noqa E251,E501 + description), + dict(type='image_url', + image_url=dict(max_dynamic_patch=12, url=PIC_REDPANDA)), + dict(type='image_url', + image_url=dict(max_dynamic_patch=12, url=PIC_PANDA)) + ]) + ] + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: separate images: panda not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=description)) + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: separate images second: panda not in ' + + response.text + '\n') + + # video multi-round conversation + def get_index(bound, fps, max_frame, first_idx=0, num_segments=32): + if bound: + start, end = bound[0], bound[1] + else: + start, end = -100000, 100000 + start_idx = max(first_idx, round(start * fps)) + end_idx = min(round(end * fps), max_frame) + seg_size = float(end_idx - start_idx) / num_segments + frame_indices = np.array([ + int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) + for idx in range(num_segments) + ]) + return frame_indices + + def load_video(video_path, bound=None, num_segments=32): + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + max_frame = len(vr) - 1 + fps = float(vr.get_avg_fps()) + frame_indices = get_index(bound, + fps, + max_frame, + first_idx=0, + num_segments=num_segments) + imgs = [] + for frame_index in frame_indices: + img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB') + imgs.append(img) + return imgs + + resource_path = config.get('resource_path') + video_path = resource_path + 
'/red-panda.mp4' + imgs = load_video(video_path, num_segments=8) + + question = '' + for i in range(len(imgs)): + question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n' + + if lang == 'cn': + question += '小熊猫在做什么?' + else: + question += 'What is the red panda doing?' + + content = [{'type': 'text', 'text': question}] + for img in imgs: + content.append({ + 'type': 'image_url', + 'image_url': { + 'max_dynamic_patch': 1, + 'url': f'data:image/jpeg;base64,{encode_image_base64(img)}' + } + }) + + messages = [dict(role='user', content=content)] + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: video images: red panda not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + if lang == 'cn': + messages.append(dict(role='user', content='描述视频详情,不要重复')) + else: + messages.append( + dict(role='user', + content='Describe this video in detail. Don\'t repeat.')) + response = pipe(messages) + result = 'red panda' in response.text.lower( + ) or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: video images: red panda not in ' + + response.text + '\n') + + +def llava_vl_testcase(config, pipe, file): + # multi-image multi-round conversation, combined images + messages = [ + dict(role='user', + content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(url=PIC_BEIJING)), + dict(type='image_url', image_url=dict(url=PIC_CHONGQING)) + ]) + ] + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images: buildings not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=DESC)) + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images second: buildings not in ' + + response.text + '\n') + + +def MiniCPM_vl_testcase(config, pipe, file): + # Chat with multiple images + messages = [ + dict(role='user', + content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', + image_url=dict(max_slice_nums=9, url=PIC_REDPANDA)), + dict(type='image_url', + image_url=dict(max_slice_nums=9, url=PIC_PANDA)) + ]) + ] + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: multiple images: panda not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=DESC)) + response = pipe(messages) + result = 'panda' in response.text.lower() or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: multiple images second: panda not in ' + + response.text + '\n') + + # In-context few-shot learning + EXAMPLE1 = 'https://github.com/user-attachments/assets/405d9147-95f6-4f78-8879-606a0aed6707' # noqa E251,E501 + EXAMPLE2 = 'https://github.com/user-attachments/assets/9f2c6ed9-2aa5-4189-9c4f-0b9753024ba1' # noqa E251,E501 + EXAMPLE3 = 
'https://github.com/user-attachments/assets/f335b507-1957-4c22-84ae-ed69ff79df38' # noqa E251,E501 + question = 'production date' + messages = [ + dict(role='user', + content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url=EXAMPLE1)), + ]), + dict(role='assistant', content='2021.08.29'), + dict(role='user', + content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url=EXAMPLE2)), + ]), + dict(role='assistant', content='1999.05.15'), + dict(role='user', + content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url=EXAMPLE3)), + ]) + ] + response = pipe(messages) + result = '2021' in response.text.lower() or '14' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: in context learning: 2021 or 14 not in ' + + response.text + '\n') + + # Chat with video + MAX_NUM_FRAMES = 64 # if cuda OOM set a smaller number + + def encode_video(video_path): + + def uniform_sample(length, n): + gap = len(length) / n + idxs = [int(i * gap + gap / 2) for i in range(n)] + return [length[i] for i in idxs] + + vr = VideoReader(video_path, ctx=cpu(0)) + sample_fps = round(vr.get_avg_fps() / 1) # FPS + frame_idx = [i for i in range(0, len(vr), sample_fps)] + if len(frame_idx) > MAX_NUM_FRAMES: + frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES) + frames = vr.get_batch(frame_idx).asnumpy() + frames = [Image.fromarray(v.astype('uint8')) for v in frames] + print('num frames:', len(frames)) + return frames + + resource_path = config.get('resource_path') + video_path = resource_path + '/red-panda.mp4' + frames = encode_video(video_path) + question = 'Describe the video' + + content = [dict(type='text', text=question)] + for frame in frames: + content.append( + dict(type='image_url', + image_url=dict( + use_image_id=False, + max_slice_nums=2, + url=f'data:image/jpeg;base64,{encode_image_base64(frame)}' + ))) + + messages = [dict(role='user', content=content)] + response = pipe(messages) + result = 'red panda' in response.text.lower( + ) or '熊猫' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: video example: panda not in ' + response.text + + '\n') + + +def Qwen_vl_testcase(config, pipe, file): + # multi-image multi-round conversation, combined images + messages = [ + dict(role='user', + content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(url=PIC_BEIJING)), + dict(type='image_url', image_url=dict(url=PIC_CHONGQING)) + ]) + ] + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images: buildings not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=DESC)) + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: combined images second: buildings not in ' + + response.text + '\n') + + # image resolution for performance boost + min_pixels = 64 * 28 * 28 + max_pixels = 64 * 28 * 28 + messages = [ + dict(role='user', + content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', + 
image_url=dict(min_pixels=min_pixels, + max_pixels=max_pixels, + url=PIC_BEIJING)), + dict(type='image_url', + image_url=dict(min_pixels=min_pixels, + max_pixels=max_pixels, + url=PIC_CHONGQING)) + ]) + ] + response = pipe(messages) + result = 'ski' in response.text.lower() or '滑雪' in response.text.lower() + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: performance boost: buildings not in ' + + response.text + '\n') + + messages.append(dict(role='assistant', content=response.text)) + messages.append(dict(role='user', content=DESC)) + response = pipe(messages) + result = 'buildings' in response.text.lower( + ) or '楼' in response.text.lower() or 'skyline' in response.text.lower( + ) or 'cityscape' in response.text.lower() + file.writelines('result:' + str(result) + + ', reason: performance boost second: buildings not in ' + + response.text + '\n') + + def assert_pipeline_vl_chat_log(config, model_case, worker_id): log_path = config.get('log_path') diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 77af1975be..082a61bcda 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -282,6 +282,7 @@ def get_model(url): PIC = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 +PIC2 = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg' # noqa E501 def run_vl_testcase(config, port: int = DEFAULT_PORT): @@ -307,6 +308,11 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT): 'image_url': { 'url': PIC, }, + }, { + 'type': 'image_url', + 'image_url': { + 'url': PIC2, + }, }], }] @@ -315,8 +321,6 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT): temperature=0.8, top_p=0.8) file.writelines(str(response).lower() + '\n') - assert 'tiger' in str(response).lower() or '虎' in str( - response).lower(), response api_client = APIClient(http_url) model_name = api_client.available_models[0] @@ -324,7 +328,12 @@ def run_vl_testcase(config, port: int = DEFAULT_PORT): messages=prompt_messages): continue file.writelines(str(item) + '\n') - assert 'tiger' in str(item).lower() or '虎' in str(item).lower(), item allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT) + + assert 'tiger' in str(response).lower() or '虎' in str( + response).lower() or 'ski' in str(response).lower() or '滑雪' in str( + response).lower(), response + assert 'tiger' in str(item).lower() or '虎' in str(item).lower( + ) or 'ski' in str(item).lower() or '滑雪' in str(item).lower(), item diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index da52241253..cd43e79c94 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -19,7 +19,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | | Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | | Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | No | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | | Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | | DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | @@ -36,7 +36,7 @@ The following tables detail the models supported by 
LMDeploy's TurboMind engine | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | | GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | | CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | -| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | No | "-" means not verified yet. diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 502e91b6d3..7ec36d2351 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -19,7 +19,7 @@ | Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | | Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | | Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | No | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | | Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | | DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | @@ -36,7 +36,7 @@ | MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | | GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | | CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | -| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | NO | +| Molmo | 7B-D,72B | MLLM | Yes | Yes | Yes | No | “-” 表示还没有验证。 From 4ede6314aac338e3b141fe9c909233421d7b636f Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 29 Nov 2024 18:43:46 +0800 Subject: [PATCH 094/122] refactor turbomind (2/N) (#2818) --- CMakeLists.txt | 2 +- lmdeploy/turbomind/turbomind.py | 8 +- src/turbomind/models/llama/LlamaBatch.h | 4 +- src/turbomind/models/llama/LlamaV2.h | 6 +- src/turbomind/python/bind.cpp | 214 ++++++------- .../triton_backend/llama/LlamaTritonModel.cc | 165 +++++----- .../triton_backend/llama/LlamaTritonModel.h | 64 ++-- .../llama/LlamaTritonModelInstance.cc | 206 +++++-------- .../llama/LlamaTritonModelInstance.h | 36 +-- .../transformer_triton_backend.cpp | 52 ++-- .../transformer_triton_backend.hpp | 283 ++---------------- src/turbomind/utils/Tensor.h | 10 + src/turbomind/utils/instance_comm.h | 16 - 13 files changed, 370 insertions(+), 696 deletions(-) delete mode 100644 src/turbomind/utils/instance_comm.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ff2ac7dded..356da56f58 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,7 +304,7 @@ link_directories( # add_subdirectory(3rdparty) add_subdirectory(src) -add_subdirectory(examples) +# add_subdirectory(examples) if(BUILD_TEST) add_subdirectory(tests/csrc) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 05bc3e400e..a1b2fff944 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -358,12 +358,10 @@ def _forward_callback(self, result, ctx): self.que.put((False, result)) def _forward_thread(self, inputs): - instance_comm = self.tm_model.model_comm.create_instance_comm( - self.gpu_count) def _func(): try: - output = self.model_inst.forward(inputs, instance_comm) + output = self.model_inst.forward(inputs) except Exception as e: logger.error(f'unhandled exception: {e}') self.que.put((-1, None)) @@ -377,12 +375,10 @@ def _async_forward_callback(self, result, ctx, que: LifoQueue): que.put((False, result)) def _async_forward_thread(self, inputs, que: LifoQueue): - instance_comm = self.tm_model.model_comm.create_instance_comm( - self.gpu_count) def _func(): try: - output = self.model_inst.forward(inputs, instance_comm) + output = self.model_inst.forward(inputs) except Exception as e: logger.error(f'unhandled exception: {e}') que.put((-1, None)) diff 
--git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index 9c66948999..f952da6bae 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -12,7 +12,6 @@ #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/instance_comm.h" #include #include #include @@ -32,8 +31,7 @@ struct SharedState { }; struct Control { - AbstractInstanceComm* comm; - Request::Callback callback; + Request::Callback callback; }; struct BatchState { diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 658282f5e5..a0d35b887f 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -21,6 +21,9 @@ #pragma once +#include +#include + #include "src/turbomind/layers/DynamicDecodeLayer.h" #include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/LlamaBatch.h" @@ -31,10 +34,7 @@ #include "src/turbomind/models/llama/unified_decoder.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" -#include "src/turbomind/utils/instance_comm.h" #include "src/turbomind/utils/nccl_utils.h" -#include -#include namespace turbomind { diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 5a344d9545..71792a4be8 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -1,34 +1,38 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/python/dlpack.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/nccl_utils.h" -#include #include +#include + +#include + #include #include #include #include #include -#include + +#include "src/turbomind/python/dlpack.h" +#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" +#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" +#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/nccl_utils.h" namespace py = pybind11; namespace ft = turbomind; using namespace pybind11::literals; // prepare to bind container -using TensorVector = std::vector; +using TensorVector = std::vector; PYBIND11_MAKE_OPAQUE(TensorVector); -using TensorMap = std::unordered_map; +using TensorMap = std::unordered_map; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -DLDevice getDLDevice(triton::Tensor& tensor) +DLDevice getDLDevice(ft::Tensor& tensor) { int device_id = 0; - if (tensor.where == triton::MEMORY_GPU) { + if (tensor.where == ft::MEMORY_GPU) { cudaPointerAttributes ptr_attr; cudaPointerGetAttributes(&ptr_attr, tensor.data); device_id = ptr_attr.device; @@ -37,13 +41,13 @@ DLDevice getDLDevice(triton::Tensor& tensor) DLDevice device{kDLCPU, device_id}; switch (tensor.where) { - case triton::MEMORY_CPU: + case ft::MEMORY_CPU: device.device_type = DLDeviceType::kDLCPU; break; - case triton::MEMORY_CPU_PINNED: + case ft::MEMORY_CPU_PINNED: device.device_type = DLDeviceType::kDLCUDAHost; break; - case triton::MEMORY_GPU: + case ft::MEMORY_GPU: device.device_type = DLDeviceType::kDLCUDA; break; default: @@ -53,62 +57,62 @@ DLDevice getDLDevice(triton::Tensor& tensor) return device; } -DLManagedTensor* 
TritonTensorToDLManagedTensor(triton::Tensor& tensor) +DLManagedTensor* TritonTensorToDLManagedTensor(ft::Tensor& tensor) { DLDevice device = getDLDevice(tensor); DLDataType data_type{0, 0, 1}; switch (tensor.type) { - case triton::TYPE_BOOL: + case ft::TYPE_BOOL: data_type.code = DLDataTypeCode::kDLBool; data_type.bits = 8; break; - case triton::TYPE_UINT8: + case ft::TYPE_UINT8: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 8; break; - case triton::TYPE_UINT16: + case ft::TYPE_UINT16: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 16; break; - case triton::TYPE_UINT32: + case ft::TYPE_UINT32: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 32; break; - case triton::TYPE_UINT64: + case ft::TYPE_UINT64: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 64; break; - case triton::TYPE_INT8: - case triton::TYPE_BYTES: + case ft::TYPE_INT8: + case ft::TYPE_BYTES: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 8; break; - case triton::TYPE_INT16: + case ft::TYPE_INT16: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 16; break; - case triton::TYPE_INT32: + case ft::TYPE_INT32: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 32; break; - case triton::TYPE_INT64: + case ft::TYPE_INT64: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 64; break; - case triton::TYPE_FP16: + case ft::TYPE_FP16: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 16; break; - case triton::TYPE_FP32: + case ft::TYPE_FP32: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 32; break; - case triton::TYPE_FP64: + case ft::TYPE_FP64: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 64; break; - case triton::TYPE_BF16: + case ft::TYPE_BF16: data_type.code = DLDataTypeCode::kDLBfloat; data_type.bits = 16; break; @@ -125,78 +129,78 @@ DLManagedTensor* TritonTensorToDLManagedTensor(triton::Tensor& tensor) return new DLManagedTensor{dl_tensor, nullptr, [](DLManagedTensor* dlmt) { delete dlmt; }}; } -triton::MemoryType getMemoryType(DLDevice device) +ft::MemoryType getMemoryType(DLDevice device) { switch (device.device_type) { case DLDeviceType::kDLCUDAHost: - return triton::MemoryType::MEMORY_CPU_PINNED; + return ft::MemoryType::MEMORY_CPU_PINNED; case DLDeviceType::kDLCUDA: - return triton::MemoryType::MEMORY_GPU; + return ft::MemoryType::MEMORY_GPU; case DLDeviceType::kDLCPU: default: - return triton::MemoryType::MEMORY_CPU; + return ft::MemoryType::MEMORY_CPU; } } -triton::DataType getDataType(DLDataType data_type) +ft::DataType getDataType(DLDataType data_type) { switch (data_type.code) { case DLDataTypeCode::kDLUInt: switch (data_type.bits) { case 8: - return triton::TYPE_UINT8; + return ft::TYPE_UINT8; case 16: - return triton::TYPE_UINT16; + return ft::TYPE_UINT16; case 32: - return triton::TYPE_UINT32; + return ft::TYPE_UINT32; case 64: - return triton::TYPE_UINT64; + return ft::TYPE_UINT64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLInt: switch (data_type.bits) { case 8: - return triton::TYPE_INT8; + return ft::TYPE_INT8; case 16: - return triton::TYPE_INT16; + return ft::TYPE_INT16; case 32: - return triton::TYPE_INT32; + return ft::TYPE_INT32; case 64: - return triton::TYPE_INT64; + return ft::TYPE_INT64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLFloat: switch (data_type.bits) { case 16: - return triton::TYPE_FP16; + return ft::TYPE_FP16; case 32: - return triton::TYPE_FP32; + return 
ft::TYPE_FP32; case 64: - return triton::TYPE_FP64; + return ft::TYPE_FP64; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLBfloat: switch (data_type.bits) { case 16: - return triton::TYPE_BF16; + return ft::TYPE_BF16; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } break; case DLDataTypeCode::kDLBool: - return triton::TYPE_BOOL; + return ft::TYPE_BOOL; default: - return triton::TYPE_INVALID; + return ft::TYPE_INVALID; } } -std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) +std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) { auto& dl_tensor = tensor->dl_tensor; auto where = getMemoryType(dl_tensor.device); @@ -205,7 +209,7 @@ std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* t std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); auto data = dl_tensor.data; - return std::make_shared(where, dtype, shape, data); + return std::make_shared(where, dtype, shape, data); } DLTensor GetDLTensor(py::object obj) @@ -270,70 +274,65 @@ PYBIND11_MODULE(_turbomind, m) // custom comm py::class_>(m, "AbstractCustomComm"); - // instance comm - py::class_(m, "AbstractInstanceComm"); - // data type - py::enum_(m, "DataType") - .value("TYPE_INVALID", triton::DataType::TYPE_INVALID) - .value("TYPE_BOOL", triton::DataType::TYPE_BOOL) - .value("TYPE_UINT8", triton::DataType::TYPE_UINT8) - .value("TYPE_UINT16", triton::DataType::TYPE_UINT16) - .value("TYPE_UINT32", triton::DataType::TYPE_UINT32) - .value("TYPE_UINT64", triton::DataType::TYPE_UINT64) - .value("TYPE_INT8", triton::DataType::TYPE_INT8) - .value("TYPE_INT16", triton::DataType::TYPE_INT16) - .value("TYPE_INT32", triton::DataType::TYPE_INT32) - .value("TYPE_INT64", triton::DataType::TYPE_INT64) - .value("TYPE_FP16", triton::DataType::TYPE_FP16) - .value("TYPE_FP32", triton::DataType::TYPE_FP32) - .value("TYPE_FP64", triton::DataType::TYPE_FP64) - .value("TYPE_BYTES", triton::DataType::TYPE_BYTES) - .value("TYPE_BF16", triton::DataType::TYPE_BF16); + py::enum_(m, "DataType") + .value("TYPE_INVALID", ft::DataType::TYPE_INVALID) + .value("TYPE_BOOL", ft::DataType::TYPE_BOOL) + .value("TYPE_UINT8", ft::DataType::TYPE_UINT8) + .value("TYPE_UINT16", ft::DataType::TYPE_UINT16) + .value("TYPE_UINT32", ft::DataType::TYPE_UINT32) + .value("TYPE_UINT64", ft::DataType::TYPE_UINT64) + .value("TYPE_INT8", ft::DataType::TYPE_INT8) + .value("TYPE_INT16", ft::DataType::TYPE_INT16) + .value("TYPE_INT32", ft::DataType::TYPE_INT32) + .value("TYPE_INT64", ft::DataType::TYPE_INT64) + .value("TYPE_FP16", ft::DataType::TYPE_FP16) + .value("TYPE_FP32", ft::DataType::TYPE_FP32) + .value("TYPE_FP64", ft::DataType::TYPE_FP64) + .value("TYPE_BYTES", ft::DataType::TYPE_BYTES) + .value("TYPE_BF16", ft::DataType::TYPE_BF16); // memory type - py::enum_(m, "MemoryType") - .value("MEMORY_CPU", triton::MemoryType::MEMORY_CPU) - .value("MEMORY_CPU_PINNED", triton::MemoryType::MEMORY_CPU_PINNED) - .value("MEMORY_GPU", triton::MemoryType::MEMORY_GPU); + py::enum_(m, "MemoryType") + .value("MEMORY_CPU", ft::MemoryType::MEMORY_CPU) + .value("MEMORY_CPU_PINNED", ft::MemoryType::MEMORY_CPU_PINNED) + .value("MEMORY_GPU", ft::MemoryType::MEMORY_GPU); // tensor - py::class_>(m, "Tensor") - .def_readonly("where", &triton::Tensor::where) - .def_readonly("type", &triton::Tensor::type) - .def_readonly("shape", &triton::Tensor::shape) - .def_readonly("data", &triton::Tensor::data) - .def(py::init([](const triton::MemoryType where, - const triton::DataType type, 
- const std::vector& shape, - const long data) { - auto data_ptr = reinterpret_cast(data); - return new triton::Tensor(where, type, shape, data_ptr); - })) + py::class_>(m, "Tensor") + .def_readonly("where", &ft::Tensor::where) + .def_readonly("type", &ft::Tensor::type) + .def_readonly("shape", &ft::Tensor::shape) + .def_readonly("data", &ft::Tensor::data) + .def(py::init( + [](const ft::MemoryType where, const ft::DataType type, const std::vector& shape, const long data) { + auto data_ptr = reinterpret_cast(data); + return new ft::Tensor(where, type, shape, data_ptr); + })) .def( "view", - [](triton::Tensor* self, triton::DataType new_type) { - return new triton::Tensor(self->where, new_type, self->shape, self->data); + [](ft::Tensor* self, ft::DataType new_type) { + return new ft::Tensor(self->where, new_type, self->shape, self->data); }, "new_type"_a) .def( "view", - [](triton::Tensor* self, std::vector new_shape) { - return new triton::Tensor(self->where, self->type, new_shape, self->data); + [](ft::Tensor* self, std::vector new_shape) { + return new ft::Tensor(self->where, self->type, new_shape, self->data); }, "new_shape"_a) .def( "copy_from", - [](triton::Tensor* self, py::object obj) { + [](ft::Tensor* self, py::object obj) { py::capsule cap = obj.attr("__dlpack__")(); DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); switch (self->type) { - case triton::TYPE_FP16: - case triton::TYPE_FP32: - case triton::TYPE_INT32: - case triton::TYPE_BF16: { + case ft::TYPE_FP16: + case ft::TYPE_FP32: + case ft::TYPE_INT32: + case ft::TYPE_BF16: { auto num_element = std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; @@ -348,7 +347,7 @@ PYBIND11_MODULE(_turbomind, m) "tensor"_a) .def( "__dlpack__", - [](triton::Tensor* self, long stream) { + [](ft::Tensor* self, long stream) { DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(*self); return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { DLManagedTensor* dlmt = @@ -364,7 +363,7 @@ PYBIND11_MODULE(_turbomind, m) }); }, "stream"_a = 0) - .def("__dlpack_device__", [](triton::Tensor* self) { + .def("__dlpack_device__", [](ft::Tensor* self) { auto device = getDLDevice(*self); return std::tuple(int(device.device_type), device.device_id); }); @@ -380,19 +379,19 @@ PYBIND11_MODULE(_turbomind, m) "dl_managed_tensor"_a); // transformer model instance + using ft::AbstractTransformerModelInstance; py::bind_map>(m, "TensorMap"); py::class_(m, "AbstractTransformerModelInstance") .def( "forward", - [](AbstractTransformerModelInstance* model, - std::shared_ptr input_tensors, - ft::AbstractInstanceComm* inst_comm) { return model->forward(input_tensors, inst_comm); }, + [](AbstractTransformerModelInstance* model, std::shared_ptr input_tensors) { + return model->forward(input_tensors); + }, py::call_guard(), - "input_tensors"_a, - "inst_comm"_a = nullptr) + "input_tensors"_a) .def( "register_callback", - [](AbstractTransformerModelInstance* self, triton_stream_cb_t cb, py::object ctx) { + [](AbstractTransformerModelInstance* self, ft::triton_stream_cb_t cb, py::object ctx) { self->registerCallback(cb, ctx.ptr()); }, "callback"_a, @@ -400,6 +399,8 @@ PYBIND11_MODULE(_turbomind, m) .def("unregister_callback", &AbstractTransformerModelInstance::unRegisterCallback); // transformer model + using ft::AbstractTransformerModel; + using ft::LlamaTritonModel; py::class_>(m, 
"AbstractTransformerModel") .def_static( "create_llama_model", @@ -463,7 +464,6 @@ PYBIND11_MODULE(_turbomind, m) return ret; }, "world_size"_a) - .def("create_instance_comm", &AbstractTransformerModel::createInstanceComm, "size"_a) .def( "create_model_instance", [](AbstractTransformerModel* model, diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 1c7c5eb468..40c5ac8907 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -27,17 +27,18 @@ #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/allocator.h" +#include "src/turbomind/utils/cuda_utils.h" + #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cuda_utils.h" -namespace ft = turbomind; +namespace turbomind { -static std::optional get_moe_method() +static std::optional get_moe_method() { - static const auto value = []() -> std::optional { + static const auto value = []() -> std::optional { const auto p = std::getenv("TM_MOE_METHOD"); if (p) { std::string str(p); @@ -45,10 +46,10 @@ static std::optional get_moe_method() x = std::tolower(x); } if (str == "naive") { - return ft::MoeParam::kNaive; + return MoeParam::kNaive; } else if (str == "fused") { - return ft::MoeParam::kFused; + return MoeParam::kFused; } else { std::cerr << "[WARNING] unrecognised MoE method: " << str << "\n"; @@ -67,7 +68,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM } catch (const YAML::Exception& e) { std::cerr << "Error reading YAML config: " << e.what() << std::endl; - ft::FT_CHECK(false); + FT_CHECK(false); } const auto ft_instance_hyperparameter = reader["ft_instance_hyperparameter"]; @@ -91,7 +92,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); - ft::FT_CHECK(false); + FT_CHECK(false); #endif } else { @@ -103,7 +104,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF32"); - ft::FT_CHECK(false); + FT_CHECK(false); #endif } return nullptr; @@ -205,10 +206,10 @@ void LlamaTritonModel::handleMissingParams() template LlamaTritonModel::~LlamaTritonModel() { - ft::FT_CHECK(weights_.size() == engines_.size()); + FT_CHECK(weights_.size() == engines_.size()); for (int device_id = 0; device_id < (int)engines_.size(); ++device_id) { // Set device id before destructing CUDA resources - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); engines_[device_id].reset(); weights_[device_id].reset(); } @@ -222,7 +223,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, std::string config): tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), - weights_(ft::getDeviceCount()), + weights_(getDeviceCount()), enable_custom_all_reduce_(enable_custom_all_reduce) { FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options"); @@ -242,7 +243,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, } catch (const YAML::Exception& e) { std::cerr << "Error reading YAML 
config: " << e.what() << std::endl; - ft::FT_CHECK(false); + FT_CHECK(false); } const auto model_reader = reader["model_config"]; @@ -305,7 +306,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.policy = getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); lora_param_.r = lora_reader["lora_r"].as(0); lora_param_.scale = lora_reader["lora_scale"].as(0); lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); @@ -329,75 +330,75 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, handleMissingParams(); - shared_state_ = std::make_shared(); - shared_state_->barrier = std::make_shared(tensor_para_size); + shared_state_ = std::make_shared(); + shared_state_->barrier = std::make_shared(tensor_para_size); - const auto device_count = ft::getDeviceCount(); + const auto device_count = getDeviceCount(); engines_.resize(device_count); const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - model_param_.weight_type = ft::WeightType::kFP16; + model_param_.weight_type = WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - model_param_.weight_type = ft::WeightType::kBF16; + model_param_.weight_type = WeightType::kBF16; } else if (weight_type_str == "fp32") { - model_param_.weight_type = ft::WeightType::kFP32; + model_param_.weight_type = WeightType::kFP32; } else if (weight_type_str == "int8") { - model_param_.weight_type = ft::WeightType::kINT8; + model_param_.weight_type = WeightType::kINT8; } else if (weight_type_str == "int4") { - model_param_.weight_type = ft::WeightType::kINT4; + model_param_.weight_type = WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; - ft::FT_CHECK(0); + FT_CHECK(0); } if (auto method = get_moe_method()) { moe_param_.method = *method; } else { - moe_param_.method = ft::MoeParam::kFused; + moe_param_.method = MoeParam::kFused; } TM_LOG_INFO("%s", toString().c_str()); } template -std::unique_ptr> LlamaTritonModel::createSharedModelInstance( - int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm) +std::unique_ptr> +LlamaTritonModel::createSharedModelInstance(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_); - auto ctx = std::make_unique>(device_id); + auto ctx = std::make_unique>(device_id); - ft::NcclParam tensor_para = nccl_params.first[comms_rank]; - ft::NcclParam pipeline_para = nccl_params.second[comms_rank]; + NcclParam tensor_para = nccl_params.first[comms_rank]; + NcclParam pipeline_para = nccl_params.second[comms_rank]; - ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_); - ft::FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_); + FT_CHECK(tensor_para.world_size_ == tensor_para_size_); + FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_); - auto model = std::make_unique>(model_param_, // - attn_param_, - moe_param_, - lora_param_, - tensor_para, - *ctx, - engine_param_.max_batch_size, - 
weights_[device_id]); + auto model = std::make_unique>(model_param_, // + attn_param_, + moe_param_, + lora_param_, + tensor_para, + *ctx, + engine_param_.max_batch_size, + weights_[device_id]); - auto engine = std::make_unique>(engine_param_, // - std::move(model), - std::move(ctx), - shared_state_, - device_id); + auto engine = std::make_unique>(engine_param_, // + std::move(model), + std::move(ctx), + shared_state_, + device_id); // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang // due to concurrent kernel launch & cudaMallocHost @@ -413,14 +414,14 @@ std::unique_ptr LlamaTritonModel::createModelInstance(int device_id, int rank, cudaStream_t stream, - std::pair, std::vector>, - std::shared_ptr) + std::pair, std::vector>, + std::shared_ptr) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK(engines_[device_id] != nullptr); + FT_CHECK(engines_[device_id] != nullptr); - auto allocator = std::make_unique>(device_id, false); + auto allocator = std::make_unique>(device_id, false); allocator->setStream(stream); @@ -430,12 +431,12 @@ LlamaTritonModel::createModelInstance(int device_id, template void LlamaTritonModel::createSharedWeights(int device_id, int rank) { - ft::check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id)); const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; - ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>( - model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); + FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); + weights_[device_id] = + std::make_shared>(model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -444,37 +445,41 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) } template -TensorMap LlamaTritonModel::getParams(int deviceId, int rank) +std::unordered_map LlamaTritonModel::getParams(int deviceId, int rank) { - ft::check_cuda_error(cudaSetDevice(deviceId)); + check_cuda_error(cudaSetDevice(deviceId)); + // shared_weight should be created before getParams - ft::FT_CHECK(weights_[deviceId] != nullptr); - ft::TensorMap output = weights_[deviceId]->getParams(); - TensorMap result; + FT_CHECK(weights_[deviceId] != nullptr); + + TensorMap output = weights_[deviceId]->getParams(); + + std::unordered_map result; for (auto [name, tensor] : output) { - result.emplace(name, triton::Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}); + result.insert({{name, Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}}}); } + return result; } template void LlamaTritonModel::processWeights(int device_id, int rank) { - ft::check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK(weights_[device_id] != nullptr); + check_cuda_error(cudaSetDevice(device_id)); + FT_CHECK(weights_[device_id] != nullptr); cudaDeviceProp props{}; - ft::check_cuda_error(cudaGetDeviceProperties(&props, device_id)); + check_cuda_error(cudaGetDeviceProperties(&props, device_id)); weights_[device_id]->prepare(props); - ft::sync_check_cuda_error(); + sync_check_cuda_error(); } template -void LlamaTritonModel::createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm) +void LlamaTritonModel::createEngine(int 
device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) { auto engine = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); @@ -515,17 +520,11 @@ std::string LlamaTritonModel::toString() } template -void LlamaTritonModel::createCustomComms( - std::vector>* custom_all_reduce_comms, int world_size) +void LlamaTritonModel::createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) { - using commDataType = typename ft::CustomARCommTypeConverter::Type; - ft::initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); -} - -template -std::unique_ptr LlamaTritonModel::createInstanceComm(int size) -{ - return nullptr; + using commDataType = typename CustomARCommTypeConverter::Type; + initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); } template @@ -547,3 +546,5 @@ template struct LlamaTritonModel; #ifdef ENABLE_BF16 template struct LlamaTritonModel<__nv_bfloat16>; #endif + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index a6c1b862ac..8f473cd4cd 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -31,7 +31,7 @@ #include #include -namespace ft = turbomind; +namespace turbomind { template struct LlamaTritonModel: public AbstractTransformerModel { @@ -44,27 +44,25 @@ struct LlamaTritonModel: public AbstractTransformerModel { ~LlamaTritonModel() override; std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) override; + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) override; void createSharedWeights(int deviceId, int rank) override; - TensorMap getParams(int deviceId, int rank) override; + std::unordered_map getParams(int deviceId, int rank) override; void processWeights(int deviceId, int rank) override; - void createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr) override; + void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) override; - void createCustomComms(std::vector>* custom_all_reduce_comms, - int world_size) override; - - std::unique_ptr createInstanceComm(int size) override; + void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) override; void handleMissingParams(); @@ -78,24 +76,24 @@ struct LlamaTritonModel: public AbstractTransformerModel { int getPipelineParaSize() override; private: - std::unique_ptr> - createSharedModelInstance(int deviceId, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr); - - ft::ModelParam model_param_; - ft::AttentionParam attn_param_; - ft::MoeParam moe_param_; - ft::LoraParam lora_param_; - ft::EngineParam engine_param_; - size_t tensor_para_size_; - size_t pipeline_para_size_; - - std::shared_ptr shared_state_; + std::unique_ptr> + createSharedModelInstance(int deviceId, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr); + + ModelParam model_param_; + AttentionParam attn_param_; + MoeParam moe_param_; + LoraParam lora_param_; + EngineParam engine_param_; + size_t 
tensor_para_size_; + size_t pipeline_para_size_; + + std::shared_ptr shared_state_; // Weights & engine instances for the ranks - std::vector>> weights_; - std::vector>> engines_; + std::vector>> weights_; + std::vector>> engines_; bool is_fp16_; int enable_custom_all_reduce_ = 0; @@ -105,3 +103,5 @@ struct LlamaTritonModel: public AbstractTransformerModel { ffi_api_lock_ctrl_t ffi_lock_ = nullptr; }; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc index 8221f932ce..976fc9cc1d 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc @@ -31,78 +31,23 @@ #include #include -namespace ft = turbomind; +namespace turbomind { template -void triton_stream_callback(std::unordered_map* output_tensors, void* ctx) +void triton_stream_callback(std::unordered_map* outputs, void* ctx) { - LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); - auto result = LlamaTritonModelInstance::convert_outputs(*output_tensors); - - model->stream_cb_(result, model->stream_ctx_); + LlamaTritonModelInstance* model = reinterpret_cast*>(ctx); + model->stream_cb_(std::make_shared>(*outputs), model->stream_ctx_); } template -LlamaTritonModelInstance::LlamaTritonModelInstance(ft::Engine& instance, - std::unique_ptr> allocator, - int device_id): +LlamaTritonModelInstance::LlamaTritonModelInstance(Engine& instance, + std::unique_ptr> allocator, + int device_id): device_id_{device_id}, instance_(&instance), allocator_(std::move(allocator)) { } -template -std::unordered_map LlamaTritonModelInstance::convert_inputs( - std::shared_ptr> input_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - const size_t request_batch_size = input_tensors->at("input_ids").shape[0]; - const size_t input_data_len = input_tensors->at("input_ids").shape[1]; - h_total_output_lengths_ = - (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - - std::unordered_map ft_input_tensors{}; - - for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) { - if (ft_input_tensors.count(t->first) == 0) { - ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()}); - } - } - - return ft_input_tensors; -} - -template -std::shared_ptr> -LlamaTritonModelInstance::convert_outputs(const std::unordered_map& output_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - std::unordered_map* outputs_mapping = - new std::unordered_map(); - - for (auto it = output_tensors.begin(); it != output_tensors.end(); it++) { - outputs_mapping->insert({it->first, triton::Tensor::convertFtTensorToTriton(it->second)}); - } - - return std::shared_ptr>(outputs_mapping); -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) -{ - ft::FT_CHECK(false); - return nullptr; -} - -template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors) -{ - ft::FT_CHECK(false); - return nullptr; -} - template std::string format_vector(const std::vector& vec) { @@ -118,120 +63,109 @@ std::string format_vector(const std::vector& vec) } template -std::shared_ptr> -LlamaTritonModelInstance::forward(std::shared_ptr> input_tensors, - ft::AbstractInstanceComm* instance_comm) +std::shared_ptr> +LlamaTritonModelInstance::forward(std::shared_ptr> inputs) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); // In some cases, this is needed to trigger the creation of CUDA context, or 
later `cudaMallocAsync` will die - ft::check_cuda_error(cudaSetDevice(device_id_)); + check_cuda_error(cudaSetDevice(device_id_)); - FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2, - "input_tensors->at(\"input_ids\").shape.size() == 2"); - FT_CHECK_WITH_INFO(input_tensors->at("input_lengths").shape.size() == 1, - "input_tensors->at(\"input_lengths\").shape.size() == 1"); + FT_CHECK_WITH_INFO(inputs->at("input_ids").shape.size() == 2, "inputs->at(\"input_ids\").shape.size() == 2"); + FT_CHECK_WITH_INFO(inputs->at("input_lengths").shape.size() == 1, + "inputs->at(\"input_lengths\").shape.size() == 1"); - const uint32_t request_batch_size = input_tensors->at("input_ids").shape[0]; - const uint32_t max_request_output_len = (size_t)*std::max_element( - (int*)input_tensors->at("request_output_len").data, - (int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]); + const uint32_t request_batch_size = inputs->at("input_ids").shape[0]; + const uint32_t max_request_output_len = (size_t)*std::max_element((int*)inputs->at("request_output_len").data, + (int*)inputs->at("request_output_len").data + + inputs->at("request_output_len").shape[0]); // const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1]; - const uint32_t beam_width = - input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1; + const uint32_t beam_width = inputs->count("beam_width") ? (size_t)(*(uint*)inputs->at("beam_width").data) : 1; FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented"); - std::unordered_map ft_input_tensors = convert_inputs(input_tensors); + h_total_output_lengths_ = + (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t)); - const size_t max_input_len = input_tensors->at("input_ids").shape[1]; - const bool is_return_logits = - input_tensors->count("is_return_logits") && *(bool*)input_tensors->at("is_return_logits").data; + const size_t max_input_len = inputs->at("input_ids").shape[1]; + const bool is_return_logits = inputs->count("is_return_logits") && *(bool*)inputs->at("is_return_logits").data; const size_t vocab_size = instance_->model().vocab_size(); allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len(), is_return_logits); - std::unordered_map output_tensors = std::unordered_map{ + std::unordered_map outputs{ {"output_ids", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, - d_output_ids_}}, + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, + d_output_ids_}}, {"sequence_length", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width}, - d_sequence_lengths_}}}; - - if (input_tensors->count("is_return_log_probs") && *((bool*)input_tensors->at("is_return_log_probs").data)) { - output_tensors.insert({"output_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width, max_request_output_len}, - d_output_log_probs_}}); - output_tensors.insert({"cum_log_probs", - ft::Tensor{ft::MEMORY_GPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width}, - d_cum_log_probs_}}); + Tensor{MEMORY_CPU, TYPE_UINT32, std::vector{request_batch_size, beam_width}, d_sequence_lengths_}}}; + + if (inputs->count("is_return_log_probs") && 
*((bool*)inputs->at("is_return_log_probs").data)) { + outputs.insert({"output_log_probs", + Tensor{MEMORY_GPU, + TYPE_FP32, + std::vector{request_batch_size, beam_width, max_request_output_len}, + d_output_log_probs_}}); + outputs.insert( + {"cum_log_probs", + Tensor{MEMORY_GPU, TYPE_FP32, std::vector{request_batch_size, beam_width}, d_cum_log_probs_}}); } - if (input_tensors->count("logprobs")) { + if (inputs->count("logprobs")) { size_t max_logprob_length = std::min((int)max_request_output_len, instance_->session_len()) + 1; h_logprob_vals_ = (float*)std::realloc( - h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * ft::kMaxLogProb); - h_logprob_indexes_ = (uint32_t*)std::realloc(h_logprob_indexes_, - sizeof(uint32_t) * request_batch_size * beam_width - * max_logprob_length * ft::kMaxLogProb); - h_logprob_nums_ = (uint32_t*)std::realloc( + h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); + h_logprob_indexes_ = (uint32_t*)std::realloc( + h_logprob_indexes_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length * kMaxLogProb); + h_logprob_nums_ = (uint32_t*)std::realloc( h_logprob_nums_, sizeof(uint32_t) * request_batch_size * beam_width * max_logprob_length); - output_tensors.insert( - {{"logprob_vals", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_FP32, - std::vector{request_batch_size, beam_width, max_logprob_length, ft::kMaxLogProb}, - h_logprob_vals_}}}); - - output_tensors.insert( - {{"logprob_indexes", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length, ft::kMaxLogProb}, - h_logprob_indexes_}}}); - - output_tensors.insert({{"logprob_nums", - ft::Tensor{ft::MEMORY_CPU, - ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, max_logprob_length}, - h_logprob_nums_}}}); + outputs.insert({{"logprob_vals", + Tensor{MEMORY_CPU, + TYPE_FP32, + std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, + h_logprob_vals_}}}); + + outputs.insert({{"logprob_indexes", + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, max_logprob_length, kMaxLogProb}, + h_logprob_indexes_}}}); + + outputs.insert({{"logprob_nums", + Tensor{MEMORY_CPU, + TYPE_UINT32, + std::vector{request_batch_size, beam_width, max_logprob_length}, + h_logprob_nums_}}}); } if (is_return_logits) { - output_tensors.insert( - {"logits", - {ft::MEMORY_GPU, ft::TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}); + outputs.insert( + {{"logits", {MEMORY_GPU, TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}}}); } try { - ft::Request::Callback callback; + Request::Callback callback; if (stream_cb_) { - callback = [this](std::unordered_map* outputs) { + callback = [this](std::unordered_map* outputs) { triton_stream_callback(outputs, this); }; } - ft::check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); - instance_->Submit(&output_tensors, &ft_input_tensors, {instance_comm, callback}); + check_cuda_error(cudaStreamSynchronize(allocator_->returnStream())); + + instance_->Submit(&outputs, inputs.get(), {callback}); // ! stream synced by the model before returning } catch (...) 
{ h_exception_ = std::current_exception(); - output_tensors.insert({"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}}); + outputs.insert({"error_message", Tensor{MEMORY_CPU, TYPE_BYTES, {1}, &h_exception_}}); } - return convert_outputs(output_tensors); + return std::make_shared>(std::move(outputs)); } template @@ -278,3 +212,5 @@ template struct LlamaTritonModelInstance; #ifdef ENABLE_BF16 template struct LlamaTritonModelInstance<__nv_bfloat16>; #endif + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h index 08088c05d5..2cf69b9fa5 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h @@ -20,41 +20,29 @@ #pragma once +#include + #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include -namespace ft = turbomind; +namespace turbomind { template struct LlamaTritonModelInstance: AbstractTransformerModelInstance { - LlamaTritonModelInstance(ft::Engine& instance, - std::unique_ptr> allocator, - int device_id); - ~LlamaTritonModelInstance(); - - std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; + LlamaTritonModelInstance(Engine& instance, + std::unique_ptr> allocator, + int device_id); + ~LlamaTritonModelInstance() override; - std::shared_ptr> - forward(std::shared_ptr> input_tensors) override; - - std::shared_ptr> - forward(std::shared_ptr> input_tensors, - ft::AbstractInstanceComm*) override; - - static std::shared_ptr> - convert_outputs(const std::unordered_map& output_tensors); + virtual std::shared_ptr> + forward(std::shared_ptr> input_tensors) override; private: - ft::Engine* instance_; - const std::unique_ptr> allocator_; - - std::unordered_map - convert_inputs(std::shared_ptr> input_tensors); + Engine* instance_; + const std::unique_ptr> allocator_; void allocateBuffer(const size_t request_batch_size, const size_t max_input_len, @@ -88,3 +76,5 @@ struct LlamaTritonModelInstance: AbstractTransformerModelInstance { uint32_t* h_total_output_lengths_ = nullptr; std::exception_ptr h_exception_ = nullptr; }; + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.cpp b/src/turbomind/triton_backend/transformer_triton_backend.cpp index 16c64b17d5..acf5e06e88 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.cpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.cpp @@ -21,62 +21,66 @@ #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/nccl_utils.h" -std::pair, std::vector> +namespace turbomind { + +std::pair, std::vector> AbstractTransformerModel::createNcclParams(const int node_id, const int device_id_start, const bool multi_node) { - const int gpu_count = ft::getDeviceCount(); + const int gpu_count = getDeviceCount(); const int tensor_para_size = getTensorParaSize(); const int pipeline_para_size = getPipelineParaSize(); const int local_comm_size = multi_node ? 
gpu_count : tensor_para_size * pipeline_para_size; - ft::FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0); - ft::FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count); + FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0); + FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count); - std::vector nccl_ids; + std::vector nccl_ids; if (tensor_para_size > 1 || pipeline_para_size > 1) { nccl_ids.resize(tensor_para_size + pipeline_para_size); if (node_id == 0) { for (uint32_t i = 0; i < nccl_ids.size(); i++) { - ft::ftNcclGetUniqueId(nccl_ids[i]); + ftNcclGetUniqueId(nccl_ids[i]); } } } - std::vector tensor_para_params(local_comm_size); - std::vector pipeline_para_params(local_comm_size); + std::vector tensor_para_params(local_comm_size); + std::vector pipeline_para_params(local_comm_size); // Don't init comm when size == 1 if (tensor_para_size > 1) { - const auto group_id = ft::ftNcclNextGroupId(); - ft::ftNcclGroupStart(); + const auto group_id = ftNcclNextGroupId(); + ftNcclGroupStart(); for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) { int rank = node_id * gpu_count + gid - device_id_start; int tensor_para_rank = rank % tensor_para_size; int pipeline_para_rank = rank / tensor_para_size; - ft::NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank]; - ft::check_cuda_error(cudaSetDevice(gid)); - ft::ftNcclCommInitRank( + NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank]; + check_cuda_error(cudaSetDevice(gid)); + ftNcclCommInitRank( tensor_para_params[gid - device_id_start], tensor_para_rank, tensor_para_size, tensor_para_nccl_uid); tensor_para_params[gid - device_id_start].group_id_ = group_id; } - ft::ftNcclGroupEnd(); + ftNcclGroupEnd(); } if (pipeline_para_size > 1) { - const auto group_id = ft::ftNcclNextGroupId(); - ft::ftNcclGroupStart(); + const auto group_id = ftNcclNextGroupId(); + ftNcclGroupStart(); for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) { int rank = node_id * gpu_count + gid - device_id_start; int tensor_para_rank = rank % tensor_para_size; int pipeline_para_rank = rank / tensor_para_size; - ft::NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank]; - ft::check_cuda_error(cudaSetDevice(gid)); - ft::ftNcclCommInitRank(pipeline_para_params[gid - device_id_start], - pipeline_para_rank, - pipeline_para_size, - pipeline_para_nccl_uid); + NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank]; + check_cuda_error(cudaSetDevice(gid)); + ftNcclCommInitRank(pipeline_para_params[gid - device_id_start], + pipeline_para_rank, + pipeline_para_size, + pipeline_para_nccl_uid); pipeline_para_params[gid - device_id_start].group_id_ = group_id; } - ft::ftNcclGroupEnd(); + ftNcclGroupEnd(); } - return std::pair, std::vector>(tensor_para_params, pipeline_para_params); + return std::pair, std::vector>(tensor_para_params, pipeline_para_params); } + +} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 066d75a780..6d49df4578 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -30,242 +30,11 @@ #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/custom_ar_comm.h" -#include "src/turbomind/utils/instance_comm.h" #include "src/turbomind/utils/nccl_utils.h" -namespace ft = turbomind; +namespace turbomind { -namespace triton { 
-#ifdef USE_TRITONSERVER_DATATYPE - -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#ifndef TRITONSERVER_API_VERSION_MAJOR -#error TRITONSERVER_API_VERSION_MAJOR Undefined! -#endif - -#ifndef TRITONSERVER_API_VERSION_MINOR -#error TRITONSERVER_API_VERSION_MINOR Undefined! -#endif - -#if (TRITONSERVER_API_VERSION_MAJOR == 1 && TRITONSERVER_API_VERSION_MINOR >= 17) \ - || (TRITONSERVER_API_VERSION_MAJOR > 1) -#define ENABLE_TRITON_BF16 1 -#endif - -typedef TRITONSERVER_DataType DataType; -typedef TRITONSERVER_MemoryType MemoryType; - -constexpr TRITONSERVER_DataType TYPE_INVALID = TRITONSERVER_TYPE_INVALID; -constexpr TRITONSERVER_DataType TYPE_BOOL = TRITONSERVER_TYPE_BOOL; -constexpr TRITONSERVER_DataType TYPE_UINT8 = TRITONSERVER_TYPE_UINT8; -constexpr TRITONSERVER_DataType TYPE_UINT16 = TRITONSERVER_TYPE_UINT16; -constexpr TRITONSERVER_DataType TYPE_UINT32 = TRITONSERVER_TYPE_UINT32; -constexpr TRITONSERVER_DataType TYPE_UINT64 = TRITONSERVER_TYPE_UINT64; -constexpr TRITONSERVER_DataType TYPE_INT8 = TRITONSERVER_TYPE_INT8; -constexpr TRITONSERVER_DataType TYPE_INT16 = TRITONSERVER_TYPE_INT16; -constexpr TRITONSERVER_DataType TYPE_INT32 = TRITONSERVER_TYPE_INT32; -constexpr TRITONSERVER_DataType TYPE_INT64 = TRITONSERVER_TYPE_INT64; -constexpr TRITONSERVER_DataType TYPE_FP16 = TRITONSERVER_TYPE_FP16; -constexpr TRITONSERVER_DataType TYPE_FP32 = TRITONSERVER_TYPE_FP32; -constexpr TRITONSERVER_DataType TYPE_FP64 = TRITONSERVER_TYPE_FP64; -constexpr TRITONSERVER_DataType TYPE_BYTES = TRITONSERVER_TYPE_BYTES; - -#ifdef ENABLE_TRITON_BF16 -constexpr TRITONSERVER_DataType TYPE_BF16 = TRITONSERVER_TYPE_BF16; -#endif -constexpr TRITONSERVER_MemoryType MEMORY_CPU = TRITONSERVER_MEMORY_CPU; -constexpr TRITONSERVER_MemoryType MEMORY_CPU_PINNED = TRITONSERVER_MEMORY_CPU_PINNED; -constexpr TRITONSERVER_MemoryType MEMORY_GPU = TRITONSERVER_MEMORY_GPU; - -#else - -typedef ft::DataType DataType; -typedef ft::MemoryType MemoryType; - -constexpr DataType TYPE_INVALID = ft::TYPE_INVALID; -constexpr DataType TYPE_BOOL = ft::TYPE_BOOL; -constexpr DataType TYPE_UINT8 = ft::TYPE_UINT8; -constexpr DataType TYPE_UINT16 = ft::TYPE_UINT16; -constexpr DataType TYPE_UINT32 = ft::TYPE_UINT32; -constexpr DataType TYPE_UINT64 = ft::TYPE_UINT64; -constexpr DataType TYPE_INT8 = ft::TYPE_INT8; -constexpr DataType TYPE_INT16 = ft::TYPE_INT16; -constexpr DataType TYPE_INT32 = ft::TYPE_INT32; -constexpr DataType TYPE_INT64 = ft::TYPE_INT64; -constexpr DataType TYPE_FP16 = ft::TYPE_FP16; -constexpr DataType TYPE_FP32 = ft::TYPE_FP32; -constexpr DataType TYPE_FP64 = ft::TYPE_FP64; -constexpr DataType TYPE_BYTES = ft::TYPE_BYTES; -constexpr DataType TYPE_BF16 = ft::TYPE_BF16; -constexpr MemoryType MEMORY_CPU = ft::MEMORY_CPU; -constexpr MemoryType MEMORY_CPU_PINNED = ft::MEMORY_CPU_PINNED; -constexpr MemoryType MEMORY_GPU = ft::MEMORY_GPU; - -#endif - -struct Tensor { - const MemoryType where; - const DataType type; - const std::vector shape; - const void* data; - - Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data): - where(_where), type(_type), shape(_shape), data(_data) - { - } - - static ft::DataType convertTritonTypeToFt(DataType tmp_type) - { - ft::DataType ft_data_type; - switch (tmp_type) { - case TYPE_INVALID: - ft_data_type = ft::DataType::TYPE_INVALID; - break; - case TYPE_BOOL: - ft_data_type = ft::DataType::TYPE_BOOL; - break; - case TYPE_UINT8: - ft_data_type = ft::DataType::TYPE_UINT8; - break; - case TYPE_UINT16: - ft_data_type = 
ft::DataType::TYPE_UINT16; - break; - case TYPE_UINT32: - ft_data_type = ft::DataType::TYPE_UINT32; - break; - case TYPE_UINT64: - ft_data_type = ft::DataType::TYPE_UINT64; - break; - case TYPE_INT8: - ft_data_type = ft::DataType::TYPE_INT8; - break; - case TYPE_INT16: - ft_data_type = ft::DataType::TYPE_INT16; - break; - case TYPE_INT32: - ft_data_type = ft::DataType::TYPE_INT32; - break; - case TYPE_INT64: - ft_data_type = ft::DataType::TYPE_INT64; - break; - case TYPE_FP16: - ft_data_type = ft::DataType::TYPE_FP16; - break; - case TYPE_FP32: - ft_data_type = ft::DataType::TYPE_FP32; - break; - case TYPE_FP64: - ft_data_type = ft::DataType::TYPE_FP64; - break; -#ifdef ENABLE_TRITON_BF16 - case TYPE_BF16: - ft_data_type = ft::DataType::TYPE_BF16; - break; -#endif - case TYPE_BYTES: - ft_data_type = ft::DataType::TYPE_BYTES; - break; - default: - FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(tmp_type)); - break; - } - return ft_data_type; - } - - ft::Tensor convertTritonTensorToFt() - { - ft::DataType ft_data_type = convertTritonTypeToFt(type); - ft::MemoryType ft_memory_type; - switch (where) { - case MEMORY_CPU: - ft_memory_type = ft::MemoryType::MEMORY_CPU; - break; - case MEMORY_CPU_PINNED: - ft_memory_type = ft::MemoryType::MEMORY_CPU_PINNED; - break; - case MEMORY_GPU: - ft_memory_type = ft::MemoryType::MEMORY_GPU; - break; - } - return ft::Tensor{ft_memory_type, ft_data_type, shape, data}; - } - - static Tensor convertFtTensorToTriton(ft::Tensor ft_tensor) - { - DataType triton_data_type; - switch (ft_tensor.type) { - case TYPE_INVALID: - triton_data_type = TYPE_INVALID; - break; - case TYPE_BOOL: - triton_data_type = TYPE_BOOL; - break; - case TYPE_UINT8: - triton_data_type = TYPE_UINT8; - break; - case TYPE_UINT16: - triton_data_type = TYPE_UINT16; - break; - case TYPE_UINT32: - triton_data_type = TYPE_UINT32; - break; - case TYPE_UINT64: - triton_data_type = TYPE_UINT64; - break; - case TYPE_INT8: - triton_data_type = TYPE_INT8; - break; - case TYPE_INT16: - triton_data_type = TYPE_INT16; - break; - case TYPE_INT32: - triton_data_type = TYPE_INT32; - break; - case TYPE_INT64: - triton_data_type = TYPE_INT64; - break; - case TYPE_FP16: - triton_data_type = TYPE_FP16; - break; - case TYPE_FP32: - triton_data_type = TYPE_FP32; - break; - case TYPE_FP64: - triton_data_type = TYPE_FP64; - break; -#ifdef ENABLE_TRITON_BF16 - case TYPE_BF16: - triton_data_type = TYPE_BF16; - break; -#endif - case TYPE_BYTES: - triton_data_type = TYPE_BYTES; - break; - default: - FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(ft_tensor.type)); - break; - } - MemoryType triton_memory_type; - switch (ft_tensor.where) { - case MEMORY_CPU: - triton_memory_type = MEMORY_CPU; - break; - case MEMORY_CPU_PINNED: - triton_memory_type = MEMORY_CPU_PINNED; - break; - case MEMORY_GPU: - triton_memory_type = MEMORY_GPU; - break; - } - return Tensor{triton_memory_type, triton_data_type, ft_tensor.shape, ft_tensor.data}; - } -}; - -} // namespace triton - -using triton_stream_cb_t = std::function>, void*)>; +using triton_stream_cb_t = std::function>, void*)>; struct AbstractTransformerModel; struct AbstractTransformerModelInstance; @@ -273,17 +42,8 @@ struct AbstractTransformerModelInstance; struct AbstractTransformerModelInstance { virtual ~AbstractTransformerModelInstance() = default; - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - virtual 
std::shared_ptr> - forward(std::shared_ptr> input_tensors, ft::AbstractInstanceComm*) - { - return forward(input_tensors); - } + virtual std::shared_ptr> + forward(std::shared_ptr> input_tensors) = 0; void registerCallback(triton_stream_cb_t cb, void* ctx) { @@ -301,43 +61,38 @@ struct AbstractTransformerModelInstance { void* stream_ctx_ = nullptr; }; -using TensorMap = std::unordered_map; - struct AbstractTransformerModel { static std::shared_ptr createLlamaModel(std::string model_dir); virtual ~AbstractTransformerModel() = default; - virtual std::pair, std::vector> + virtual std::pair, std::vector> createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false); - virtual void createCustomComms(std::vector>* custom_all_reduce_comms, - int world_size) = 0; - - virtual std::unique_ptr createInstanceComm(int size) - { - return nullptr; - } + virtual void createCustomComms(std::vector>* custom_all_reduce_comms, + int world_size) = 0; virtual std::unique_ptr - createModelInstance(int deviceId, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm = nullptr) = 0; + createModelInstance(int deviceId, + int rank, + cudaStream_t stream, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) = 0; virtual void createSharedWeights(int deviceId, int rank) = 0; - virtual TensorMap getParams(int deviceId, int rank) = 0; + virtual std::unordered_map getParams(int deviceId, int rank) = 0; virtual void processWeights(int deviceId, int rank) = 0; - virtual void createEngine(int device_id, - int rank, - std::pair, std::vector> nccl_params, - std::shared_ptr) = 0; + virtual void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) = 0; virtual std::string toString() = 0; virtual int getTensorParaSize() = 0; virtual int getPipelineParaSize() = 0; }; + +} // namespace turbomind diff --git a/src/turbomind/utils/Tensor.h b/src/turbomind/utils/Tensor.h index 6214f6bbc2..b2b8524e09 100644 --- a/src/turbomind/utils/Tensor.h +++ b/src/turbomind/utils/Tensor.h @@ -515,6 +515,16 @@ class TensorMap { return tensor_map_.end(); } + int count(const std::string& key) const + { + return tensor_map_.count(key); + } + + bool empty() const + { + return tensor_map_.empty(); + } + std::string toString(); static TensorMap fromNpyFolder(const std::string& base_folder); void saveNpy(const std::string& base_folder); diff --git a/src/turbomind/utils/instance_comm.h b/src/turbomind/utils/instance_comm.h deleted file mode 100644 index 5a25360a05..0000000000 --- a/src/turbomind/utils/instance_comm.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -namespace turbomind { - -class AbstractInstanceComm { -public: - virtual ~AbstractInstanceComm() = default; - - virtual void barrier() = 0; - - virtual void setSharedObject(void*) = 0; - - virtual void* getSharedObject() = 0; -}; - -} // namespace turbomind From ad21c4d73ac856ddc1fc96b9b54231ae266199bd Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 2 Dec 2024 13:58:27 +0800 Subject: [PATCH 095/122] add openssh-server installation in dockerfile (#2830) * add openssh-server installation in dockerfile * add sudo --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 664dc7271f..caa58ee637 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -13,7 +13,7 @@ ARG PYTHON_VERSION=3.10 ARG TORCH_VERSION=2.3.0 ARG TORCHVISION_VERSION=0.18.0 -RUN 
apt-get update -y && apt-get install -y software-properties-common wget vim git curl &&\ +RUN apt-get update -y && apt-get install -y software-properties-common wget vim git curl openssh-server ssh sudo &&\ curl https://sh.rustup.rs -sSf | sh -s -- -y &&\ add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \ ninja-build rapidjson-dev libgoogle-glog-dev gdb python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ From 776677a43961cc985eb03c0197c0adf620b9ebc5 Mon Sep 17 00:00:00 2001 From: zhabuye <74179177+zhabuye@users.noreply.github.com> Date: Mon, 2 Dec 2024 14:01:05 +0800 Subject: [PATCH 096/122] Add version restrictions in runtime_ascend.txt to ensure functionality (#2836) --- requirements/runtime_ascend.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/runtime_ascend.txt b/requirements/runtime_ascend.txt index d87748e396..05d74bbe72 100644 --- a/requirements/runtime_ascend.txt +++ b/requirements/runtime_ascend.txt @@ -1,5 +1,5 @@ accelerate>=0.29.3 -dlinfer-ascend +dlinfer-ascend>=0.1.2 einops fastapi fire From b91ce9a259d3af4bba14c05b968fdf24373545d6 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:26:29 +0800 Subject: [PATCH 097/122] Fix gemma2 accuracy through the correct softcapping logic (#2842) * Fix gemma2 accuracy through the correct softcapping logic * remove debugging codes --- lmdeploy/pytorch/kernels/cuda/flashattention.py | 17 ++++++++++------- lmdeploy/pytorch/kernels/cuda/pagedattention.py | 6 ++++-- lmdeploy/pytorch/models/gemma.py | 9 ++++++++- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/lmdeploy/pytorch/kernels/cuda/flashattention.py b/lmdeploy/pytorch/kernels/cuda/flashattention.py index 7521a3e2bb..34a11ae030 100644 --- a/lmdeploy/pytorch/kernels/cuda/flashattention.py +++ b/lmdeploy/pytorch/kernels/cuda/flashattention.py @@ -49,7 +49,7 @@ def softcapping(qk, logit_softcapping: tl.constexpr): @triton.jit def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, - loop_start, loop_end, qk_scale, history_mask, + loop_start, loop_end, sm_scale, history_mask, kv_min_loc, causal_mask: tl.constexpr, window_size: tl.constexpr, logit_softcapping: tl.constexpr, BLOCK_N: tl.constexpr, @@ -71,8 +71,9 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, qk += tl.dot(q1, k1) if causal_mask: - qk *= qk_scale + qk *= sm_scale qk = softcapping(qk, logit_softcapping) + qk = qk * tl_log2(math.e) qk_mask = (history_mask[:, None]) >= (start_n + offs_n[None, :]) if window_size > 0: qk_mask = qk_mask and ( @@ -85,8 +86,9 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, m_i_new = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_i_new[:, None] elif window_size > 0: - qk *= qk_scale + qk *= sm_scale qk = softcapping(qk, logit_softcapping) + qk = qk * tl_log2(math.e) qk_mask = ((start_n + offs_n[None, :]) >= kv_min_loc[:, None]) qk = tl.where( qk_mask, @@ -96,11 +98,13 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, m_i_new = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_i_new[:, None] elif logit_softcapping > 0: - qk *= qk_scale + qk *= sm_scale qk = softcapping(qk, logit_softcapping) + qk = qk * tl_log2(math.e) m_i_new = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_i_new[:, None] else: + qk_scale = sm_scale * tl_log2(math.e) m_i_new = tl.maximum(m_i, tl.max(qk, 1) * qk_scale) qk = qk * qk_scale - m_i_new[:, None] @@ -256,7 +260,6 @@ def 
_flash_prefill_fwd_kernel( l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0 acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32) - qk_scale = sm_scale * tl_log2(math.e) history_mask = history_len + start_m * BLOCK_M + tl.arange(0, BLOCK_M) loop_end = (history_len + start_m * BLOCK_M) // BLOCK_N * BLOCK_N @@ -270,7 +273,7 @@ def _flash_prefill_fwd_kernel( k1_ptrs, loop_start, loop_end, - qk_scale, + sm_scale, history_mask, kv_min_loc, causal_mask=False, @@ -291,7 +294,7 @@ def _flash_prefill_fwd_kernel( k1_ptrs, loop_start, loop_end, - qk_scale, + sm_scale, history_mask, kv_min_loc, causal_mask=True, diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index bbd6d3cf78..fe44ca4344 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -205,11 +205,12 @@ def _fwd_grouped_split_kernel( qk += tl.dot(q, k) if BLOCK_DMODEL1 != 0: qk += tl.dot(q1, k1) - qk *= sm_scale * tl_log2(math.e) + qk *= sm_scale if logit_softcapping > 0.0: qk = qk / logit_softcapping qk = tanh(qk) qk = qk * logit_softcapping + qk = qk * tl_log2(math.e) # NOTE: inf - inf = nan, and nan will leads to error if start_n + BLOCK_N > history_len or window_size > 0: qk_mask = history_len >= (start_n + offs_n) @@ -491,11 +492,12 @@ def _fwd_grouped_split_quant_kernel( qk += tl.dot(q, k) if BLOCK_DMODEL1 != 0: qk += tl.dot(q1, k1) - qk *= sm_scale * tl_log2(math.e) + qk *= sm_scale if logit_softcapping > 0.0: qk = qk / logit_softcapping qk = tanh(qk) qk = qk * logit_softcapping + qk = qk * tl_log2(math.e) # NOTE: inf - inf = nan, and nan will leads to error if start_n + BLOCK_N > history_len or window_size > 0: qk_mask = history_len >= (start_n + offs_n) diff --git a/lmdeploy/pytorch/models/gemma.py b/lmdeploy/pytorch/models/gemma.py index 450767bda3..ca36f15651 100644 --- a/lmdeploy/pytorch/models/gemma.py +++ b/lmdeploy/pytorch/models/gemma.py @@ -383,6 +383,8 @@ def __init__(self, bias=False, dtype=dtype, device=device) + self.final_logit_softcapping = getattr(config, + 'final_logit_softcapping', None) def forward( self, @@ -405,7 +407,12 @@ def forward( def get_logits(self, hidden_states: torch.Tensor): """compute logits of the model output.""" - return self.lm_head(hidden_states) + logits = self.lm_head(hidden_states) + if self.final_logit_softcapping is not None: + logits = logits / self.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.final_logit_softcapping + return logits def get_input_embeddings(self): """get input embeddings.""" From c158d1877bc31aeb49f8e1b16536a882246bc130 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 2 Dec 2024 16:09:00 +0800 Subject: [PATCH 098/122] fix accessing before initialization (#2845) * fix accessing before initialization * fix linting --- .../models/llama/LlamaDecoderLayerWeight.cc | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 0a2a3be175..393a6a0e87 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -68,6 +68,28 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, tensor_para_size_(tp_size), tensor_para_rank_(tp_rank) { + self_attn_weights = LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + model.mla, + attn_bias_, + tensor_para_size_, + weight_type_, + 
model.group_size}; + + ffn_weights = LlamaFfnWeight{ + hidden_units_, + inter_size_, + tensor_para_size_, + weight_type_, + model.group_size, + weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), + }; + + moe_weights = MoeFfnWeight{ + layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; + if (lora_param.policy == LoraPolicy::kPlora) { std::vector keys = { "attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"}; @@ -106,28 +128,6 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, } fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - - self_attn_weights = LlamaAttentionWeight{hidden_units_, - size_per_head_, - head_num_, - kv_head_num_, - model.mla, - attn_bias_, - tensor_para_size_, - weight_type_, - model.group_size}; - - ffn_weights = LlamaFfnWeight{ - hidden_units_, - inter_size_, - tensor_para_size_, - weight_type_, - model.group_size, - weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), - }; - - moe_weights = MoeFfnWeight{ - layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; } template From 986ad17c173d2052cb9b6eb7a8e866cf917e6991 Mon Sep 17 00:00:00 2001 From: q yao Date: Mon, 2 Dec 2024 16:18:43 +0800 Subject: [PATCH 099/122] better kv allocate (#2814) * better allocate * update max session len --- lmdeploy/pytorch/engine/cache_engine.py | 135 +++++++++------------ lmdeploy/pytorch/engine/engine.py | 26 +++- lmdeploy/pytorch/engine/engine_instance.py | 13 +- lmdeploy/pytorch/engine/model_agent.py | 4 +- 4 files changed, 80 insertions(+), 98 deletions(-) diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py index 8eaa563947..e393adeed3 100644 --- a/lmdeploy/pytorch/engine/cache_engine.py +++ b/lmdeploy/pytorch/engine/cache_engine.py @@ -54,7 +54,7 @@ def __init__( self.cache_stream = torch.cuda.Stream() assert self.cache_stream != torch.cuda.current_stream() # Initialize the events for stream synchronization. 
- self.events = [torch.cuda.Event() for _ in range(self.num_layers)] + self.events = torch.cuda.Event() logger.debug( f'Initialize cache engine with {cache_config.num_gpu_blocks}' @@ -156,80 +156,60 @@ def get_value_block_shape(self, local=local, ) - def allocate_gpu_cache(self): - """allocate caches on GPU.""" - gpu_cache: List[KVCache] = [] + def _allocate_cache(self, num_blocks: int, device: torch.device): + """allocate cache implement.""" key_block_shape = self.get_key_block_shape(local=True) value_block_shape = self.get_value_block_shape(local=True) - for _ in range(self.num_layers): - key_blocks = torch.empty( - size=(self.num_gpu_blocks, *key_block_shape), - dtype=self.kv_cache_dtype, - device='cuda', + num_layers = self.num_layers + kv_cache_dtype = self.kv_cache_dtype + + key_cache = torch.empty( + size=(num_layers, num_blocks, *key_block_shape), + dtype=kv_cache_dtype, + device=device, + ) + value_cache = torch.empty( + size=(num_layers, num_blocks, *value_block_shape), + dtype=kv_cache_dtype, + device=device, + ) + + output = (key_cache, value_cache) + + if self.cache_config.quant_policy in (4, 8): + dtype = self.model_config.dtype + key_sz_cache = torch.empty( + size=(num_layers, num_blocks, *key_block_shape[:-1], 2), + dtype=dtype, + device=device, ) - value_blocks = torch.empty( - size=(self.num_gpu_blocks, *value_block_shape), - dtype=self.kv_cache_dtype, - device='cuda', + val_sz_cache = torch.empty( + size=(num_layers, num_blocks, *value_block_shape[:-1], 2), + dtype=dtype, + device=device, ) - if self.cache_config.quant_policy in (4, 8): - key_scales_zeros = torch.empty( - size=(self.num_gpu_blocks, *key_block_shape[:-1], 2), - dtype=self.model_config.dtype, - device='cuda', - ) - value_scales_zeros = torch.empty( - size=(self.num_gpu_blocks, *value_block_shape[:-1], 2), - dtype=self.model_config.dtype, - device='cuda', - ) - gpu_cache.append((key_blocks, value_blocks, key_scales_zeros, - value_scales_zeros)) - else: - gpu_cache.append((key_blocks, value_blocks)) - - return gpu_cache + output = output + (key_sz_cache, val_sz_cache) + + return output + + def allocate_gpu_cache(self): + """allocate caches on GPU.""" + caches = self._allocate_cache(self.num_gpu_blocks, 'cuda') + self.full_gpu_cache = caches + self.local_gpu_cache = list(zip(*caches)) + return self.local_gpu_cache def allocate_cpu_cache(self): """allocate caches on Host.""" - cpu_cache: List[KVCache] = [] - key_block_shape = self.get_key_block_shape(local=True) - value_block_shape = self.get_value_block_shape(local=True) - - # TODO: pin memory might need be banned on wsl - pin_memory = True + caches = self._allocate_cache(self.num_gpu_blocks, 'cpu') - for _ in range(self.num_layers): - key_blocks = torch.empty( - size=(self.num_cpu_blocks, *key_block_shape), - dtype=self.kv_cache_dtype, - pin_memory=pin_memory, - ) - value_blocks = torch.empty( - size=(self.num_cpu_blocks, *value_block_shape), - dtype=self.kv_cache_dtype, - pin_memory=pin_memory, - ) - if self.cache_config.quant_policy in (4, 8): - key_scales_zeros = torch.empty( - size=(self.num_cpu_blocks, *key_block_shape[:-1], 2), - dtype=self.model_config.dtype, - pin_memory=pin_memory, - ) - value_scales_zeros = torch.empty( - size=(self.num_cpu_blocks, *value_block_shape[:-1], 2), - dtype=self.model_config.dtype, - pin_memory=pin_memory, - ) - cpu_cache.append((key_blocks, value_blocks, key_scales_zeros, - value_scales_zeros)) - else: - cpu_cache.append((key_blocks, value_blocks)) - return cpu_cache + self.full_cpu_cache = caches + self.local_cpu_cache 
= list(zip(*caches)) + return self.local_cpu_cache @torch.inference_mode() - def _swap(self, src: List[KVCache], dst: List[KVCache], + def _swap(self, src: List[torch.Tensor], dst: List[torch.Tensor], src_to_dst: Dict[int, int]): """Move caches from src memory to dst memory. @@ -238,18 +218,19 @@ def _swap(self, src: List[KVCache], dst: List[KVCache], dst (List[KVCache]): Destination cache. src_to_dst (Dict[int, int]): Map between src and dst. """ + BLOCKS_PER_COPY = 2 + num_copy = len(src_to_dst) + src_idx, dst_idx = list(zip(*src_to_dst.items())) + src_idx = torch.tensor(src_idx, device=src[0].device) + dst_idx = torch.tensor(dst_idx, device=dst[0].device) with torch.cuda.stream(self.cache_stream): - for i in range(self.num_layers): - src_key_cache, src_value_cache = src[i] - dst_key_cache, dst_value_cache = dst[i] - - for src_id, dst_id in src_to_dst.items(): - if isinstance(dst_key_cache[dst_id], torch.Tensor): - dst_key_cache[dst_id].copy_(src_key_cache[src_id]) - dst_value_cache[dst_id].copy_(src_value_cache[src_id]) - - event = self.events[i] - event.record(stream=self.cache_stream) + for scache, dcache in zip(src, dst): + for idx in range(0, num_copy, BLOCKS_PER_COPY): + sidx = src_idx[idx:idx + BLOCKS_PER_COPY] + didx = dst_idx[idx:idx + BLOCKS_PER_COPY] + sdata = scache[:, sidx] + dcache.index_copy_(1, didx, sdata.to(dcache.device)) + self.events.record(stream=self.cache_stream) def swap_in(self, src_to_dst: Dict[int, int]) -> None: """Move cache from Host to Device. @@ -257,7 +238,7 @@ def swap_in(self, src_to_dst: Dict[int, int]) -> None: Args: src_to_dst (Dict[int, int]): Map between src and dst. """ - self._swap(self.local_cpu_cache, self.local_gpu_cache, src_to_dst) + self._swap(self.full_cpu_cache, self.full_gpu_cache, src_to_dst) def swap_out(self, src_to_dst: Dict[int, int]) -> None: """Move cache from Device to Host. @@ -265,7 +246,7 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: Args: src_to_dst (Dict[int, int]): Map between src and dst. 
""" - self._swap(self.local_gpu_cache, self.local_cpu_cache, src_to_dst) + self._swap(self.full_gpu_cache, self.full_cpu_cache, src_to_dst) @classmethod def get_cache_block_size(cls, diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index cffe13bbdb..26b507e9d4 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -164,6 +164,7 @@ def __init__(self, self.cache_config = cache_config self.backend_config = backend_config self.stream = self.model_agent.stream + self.max_session_len = self._get_max_session_len() self.req_manager = self._bind_request_manager() @@ -261,6 +262,20 @@ def _response(self, data=data, err_msg=err_msg)) + def _get_max_session_len(self): + """get max session len.""" + session_len = self.scheduler_config.max_session_len + max_tokens = (self.cache_config.num_gpu_blocks * + self.cache_config.block_size) + window_size = self.cache_config.window_size + if window_size > 0 and window_size <= max_tokens: + max_tokens = (1 << 63) - 1 + if session_len is None: + session_len = max_tokens + else: + session_len = min(max_tokens, session_len) + return session_len + def _on_add_session(self, reqs: Request, **kwargs): """on add session callback.""" for req in reqs: @@ -315,12 +330,11 @@ def __update_bad_words(msg): def __update_max_new_tokens(msg): """update max new tokens.""" - max_session_len = self.scheduler_config.max_session_len - if max_session_len is not None: - sampling_param = msg.sampling_param - sampling_param.max_new_tokens = min( - sampling_param.max_new_tokens, - max_session_len - msg.num_all_tokens()) + max_session_len = self.max_session_len + sampling_param = msg.sampling_param + sampling_param.max_new_tokens = min( + sampling_param.max_new_tokens, + max_session_len - msg.num_all_tokens()) for req in reqs: session_id = req.data['session_id'] diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index 3e741c7ba2..455ab1ccb3 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -89,21 +89,10 @@ class EngineInstance: """ def __init__(self, engine: Engine): - - def __get_max_input_len(engine): - """get max input len.""" - cache_config = engine.cache_config - max_input_len = (cache_config.block_size * - cache_config.num_gpu_blocks) - window_size = cache_config.window_size - if window_size > 0 and window_size <= max_input_len: - max_input_len = (1 << 63) - 1 - return max_input_len - self.engine = engine self.req_sender = engine.req_manager.build_sender() - self.max_input_len = __get_max_input_len(self.engine) + self.max_input_len = self.engine.max_session_len def __del__(self): """Destructor.""" diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 74938de812..2877f59375 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -120,9 +120,7 @@ def cache_swapping(cache_engine: CacheEngine, swap_in_map: dict, issued_cache_op = True if issued_cache_op: - cache_events = cache_engine.events - for event in cache_events: - event.wait() + cache_engine.events.wait() @torch.inference_mode() From 6734c71ffc0e94323854eb6ed139dbe621e71a9d Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Mon, 2 Dec 2024 19:22:01 +0800 Subject: [PATCH 100/122] Update internvl chat template (#2832) * Add internvl2-5 chat template * fix template, using original internlm2 template --- lmdeploy/model.py | 23 
+++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 47aaaa4e88..a4355ea131 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -597,9 +597,32 @@ def match(cls, model_path: str) -> Optional[str]: path = model_path.lower() if ('internvl2' in path and 'internvl2-4b' not in path) or 'mono-internvl' in path: + if 'internvl2.5' in path or 'internvl2_5' in path: + return None return 'internvl2-internlm2' +@MODELS.register_module(name='internvl2_5') +class InternVL2_5(InternLM2Chat7B): + + def __init__( + self, + meta_instruction='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。', # noqa + **kwargs): + super().__init__(meta_instruction=meta_instruction, **kwargs) + + @classmethod + def match(cls, model_path: str) -> Optional[str]: + """Return the model_name that was registered to MODELS. + + Args: + model_path (str): the model path used for matching. + """ + path = model_path.lower() + if 'internvl2.5' in path or 'internvl2_5' in path: + return 'internvl2_5' + + @MODELS.register_module(name=['internlm-xcomposer2', 'internlm-xcomposer2d5']) class InternLMXComposer2Chat7B(InternLMChat7B): """Chat template and generation parameters of InternLM-XComposer2-7b.""" From 8fbfed685f328c7fff6ec46a17dfcd0a50d2a685 Mon Sep 17 00:00:00 2001 From: q yao Date: Tue, 3 Dec 2024 11:14:35 +0800 Subject: [PATCH 101/122] profile throughput without new threads (#2826) * profile throughput without threads * optimize main loop * fix torch.event * fix python>3.11 * optimize tp * reduce cudagraph copy * optimize fill kv cache * optimize silu and mul * optimize apply rotary * remove executor * remove kernel * remove num_heads==1 --- benchmark/profile_throughput.py | 38 ++-- lmdeploy/pytorch/backends/cuda/attention.py | 5 +- lmdeploy/pytorch/engine/engine.py | 22 ++- lmdeploy/pytorch/engine/logits_process.py | 30 ++- lmdeploy/pytorch/engine/model_agent.py | 65 +------ lmdeploy/pytorch/kernels/cuda/activation.py | 105 +++------- .../kernels/cuda/apply_rotary_pos_emb.py | 43 +---- .../pytorch/kernels/cuda/fill_kv_cache.py | 182 ++++++------------ lmdeploy/pytorch/models/utils/cudagraph.py | 28 ++- tests/pytorch/engine/test_logits_process.py | 3 +- 10 files changed, 177 insertions(+), 344 deletions(-) diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 58786d9c80..4f06fad4f9 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -1,12 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse +import asyncio import csv import json import os import random import time from queue import Queue -from threading import Thread from typing import List, Tuple, Union import numpy as np @@ -86,15 +86,15 @@ def __init__(self, model_path: str, self.csv = csv self.pbar = None - def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, - temperature: float, top_p: float, top_k: int, - stream_output: bool): + async def _inference(self, req_queue: Queue, res_queue: Queue, + session_id: int, temperature: float, top_p: float, + top_k: int, stream_output: bool): model_inst = self.tm_model.create_instance() stats = [] # get each generated token's latency per_token_latency_stats = [] for prompt, input_seqlen, output_seqlen in iter( - req_queue.get, [None, None, None]): + req_queue.get_nowait, [None, None, None]): _per_token_latency_stats = [0] * (output_seqlen + 1) prev = time.perf_counter() n_prev_token = 0 @@ -102,7 +102,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, input_ids = self.tokenizer(prompt).input_ids state = DetokenizeState(len(input_ids)) - for outputs in model_inst.stream_infer( + async for outputs in model_inst.async_stream_infer( session_id, input_ids=input_ids, gen_config=GenerationConfig(max_new_tokens=output_seqlen, @@ -123,7 +123,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, prev = now # for pytorch engine to restart a session if isinstance(model_inst, EngineInstance): - model_inst.end(session_id) + await model_inst.async_end(session_id) assert output_seqlen <= n_token <= output_seqlen + 1, \ f'Error. session_id({session_id}) request {output_seqlen} ' \ f'tokens, but generate {n_token} tokens.\n' \ @@ -139,13 +139,12 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, # skip the first token latency per_token_latency_stats.append(_per_token_latency_stats[1:]) self.pbar.update(1) - res_queue.put((session_id, stats, per_token_latency_stats)) + res_queue.put_nowait((session_id, stats, per_token_latency_stats)) def process_request(self, requests, concurrency, temperature, top_p, top_k, stream_output): res_queue = Queue() req_queue = Queue() - threads = [] self.pbar = tqdm(total=len(requests)) @@ -157,18 +156,20 @@ def process_request(self, requests, concurrency, temperature, top_p, top_k, start = time.time() + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + # start threads + tasks = [] for i in range(concurrency): - t = Thread(target=self._inference, - args=(req_queue, res_queue, i, temperature, top_p, - top_k, stream_output), - daemon=True) - t.start() - threads.append(t) + task = self._inference(req_queue, res_queue, i, temperature, top_p, + top_k, stream_output) + tasks.append(task) + + async def _gather_tasks(tasks): + return await asyncio.gather(*tasks) - # wait for finish - for t in threads: - t.join() + event_loop.run_until_complete(_gather_tasks(tasks)) elapsed_time = time.time() - start @@ -333,7 +334,6 @@ def main(): block_size=args.cache_block_seq_len, max_batch_size=args.concurrency, tp=args.tp, - thread_safe=True, eager_mode=args.eager_mode, enable_prefix_caching=args.enable_prefix_caching, quant_policy=args.quant_policy, diff --git a/lmdeploy/pytorch/backends/cuda/attention.py b/lmdeploy/pytorch/backends/cuda/attention.py index d01d6fe9b4..8261b869f0 100644 --- a/lmdeploy/pytorch/backends/cuda/attention.py +++ b/lmdeploy/pytorch/backends/cuda/attention.py @@ -94,7 +94,10 @@ def forward( kv_seqlens = 
attn_metadata.kv_seqlens kv_flatten_size = attn_metadata.kv_flatten_size quant_policy = attn_metadata.quant_policy - max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2)) + if attn_metadata.is_decoding: + max_q_seqlen = 1 + else: + max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2)) fill_max_q_seqlen = max_q_seqlen if attn_metadata.fill_seqlens is not None: fill_seqlens = attn_metadata.fill_seqlens diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 26b507e9d4..b7a803a7a7 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -172,6 +172,7 @@ def __init__(self, self._start_loop() self._create_buffers() self.engine_instance = self.create_instance() + self._output_stream = torch.cuda.Stream() @classmethod def from_pretrained(cls, @@ -673,7 +674,8 @@ async def __long_context_single_forward(inputs): return ret def _make_infer_outputs(self, next_token_ids: torch.LongTensor, - logits: torch.Tensor, stopped: torch.Tensor): + logits: torch.Tensor, stopped: torch.Tensor, + event: torch.cuda.Event): """make infer output.""" def __get_out_token_ids(token: torch.Tensor, msg: SchedulerSequence, @@ -694,6 +696,11 @@ def __get_q_start_loc(): else: return seq_length.cumsum(0) - seq_length + with torch.cuda.stream(self._output_stream): + event.wait() + next_token_ids = next_token_ids.cpu() + stopped = stopped.cpu() + running = self._running is_run = [seq.status == MessageStatus.RUNNING for seq in running] stopped = stopped.tolist() @@ -755,6 +762,8 @@ def __update_inputs(next_token_ids): logger.debug(': ' f'batch_size={inputs.seq_length.size(0)} ' f'num_tokens={inputs.input_ids.size(-1)}') + if self.gpu_count == 1: + inputs = inputs.to_device('cuda') is_decoding = inputs.is_decoding if all_ids is not None: all_ids = all_ids.cuda() @@ -785,10 +794,11 @@ def __update_inputs(next_token_ids): next_token_ids, sampling_inputs.stop_words, num_appendable_ids) # send output - stopped = stopped.cpu() - finish = stopped.all().item() or (idx == loop_count - 1) + finish = (idx == loop_count - 1) finish = finish or _check_finish(self.scheduler, idx) - output = (next_token_ids.cpu(), logits, stopped) + event = torch.cuda.Event() + event.record() + output = (next_token_ids, logits, stopped, event) output_que.put_nowait((finish, output)) if finish: @@ -951,9 +961,9 @@ async def __step(): try: if isinstance(out, Exception): raise out - next_token_ids, logits, stopped = out + next_token_ids, logits, stopped, event = out step_outputs = self._make_infer_outputs( - next_token_ids, logits, stopped) + next_token_ids, logits, stopped, event) __send_resps(step_outputs) except Exception as e: raise e diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py index 54740a4fb3..24cb336d71 100644 --- a/lmdeploy/pytorch/engine/logits_process.py +++ b/lmdeploy/pytorch/engine/logits_process.py @@ -21,10 +21,9 @@ def _process_temperature_(scores: torch.Tensor, temperature: torch.Tensor): def _process_bad_words_(scores: torch.Tensor, bad_words: torch.LongTensor, + mask: torch.BoolTensor, filter_value: float = -float('inf')): """process bad words.""" - mask = bad_words >= 0 - bad_words = bad_words.where(mask, 0) filtered_scores = scores.gather(1, bad_words) filtered_scores[mask] = filter_value scores.scatter_(1, bad_words, filtered_scores) @@ -127,7 +126,9 @@ def _guided_sampling(response_formats: Tuple[Dict], scores: torch.Tensor, class SamplingInputs: temperature: torch.Tensor = None bad_words: 
torch.LongTensor = None + bad_mask: torch.BoolTensor = None stop_words: torch.LongTensor = None + stop_mask: torch.BoolTensor = None repetition_penalty: torch.Tensor = None top_k: torch.LongTensor = None top_p: torch.Tensor = None @@ -200,9 +201,11 @@ def __get_bad_words(bad_words): """get bad words.""" max_bw_len = max(len(bw) for bw in bad_words) if max_bw_len == 0: - return None + return None, None if all(len(bw) == max_bw_len for bw in bad_words): - return torch.tensor(bad_words) + ret = torch.tensor(bad_words) + mask = torch.ones_like(ret, dtype=bool) + return ret, mask ret = torch.full((batch_size, max_bw_len), -1, dtype=torch.int64) for idx, bw in enumerate(bad_words): bw_len = len(bw) @@ -210,7 +213,10 @@ def __get_bad_words(bad_words): continue bw = ret.new_tensor(bw) ret[idx, :bw_len] = bw - return ret + + mask = ret >= 0 + ret = ret.where(mask, 0) + return ret, mask __gather_params() @@ -221,8 +227,8 @@ def __get_bad_words(bad_words): temperature = torch.tensor(temperature) - bad_words = __get_bad_words(bad_words) - stop_words = __get_bad_words(stop_words) + bad_words, bad_mask = __get_bad_words(bad_words) + stop_words, stop_mask = __get_bad_words(stop_words) max_top_k = max(top_k) if min(top_k) <= 0: @@ -243,7 +249,9 @@ def __get_bad_words(bad_words): sampling_input = cls( temperature=temperature, bad_words=bad_words, + bad_mask=bad_mask, stop_words=stop_words, + stop_mask=stop_mask, repetition_penalty=repetition_penalty, top_k=top_k, top_p=top_p, @@ -326,12 +334,14 @@ def __call__(self, all_ids: torch.LongTensor, bad_words = sampling_inputs.bad_words if bad_words is not None: - scores = _process_bad_words_(scores, bad_words) + bad_mask = sampling_inputs.bad_mask + scores = _process_bad_words_(scores, bad_words, bad_mask) stop_words = sampling_inputs.stop_words if stop_words is not None: - stop_words = torch.where(self.ignore_eos[:, None], stop_words, -1) - scores = _process_bad_words_(scores, stop_words) + stop_mask = sampling_inputs.stop_mask + stop_mask = torch.where(self.ignore_eos[:, None], stop_mask, False) + scores = _process_bad_words_(scores, stop_words, stop_mask) scores = _guided_sampling(sampling_inputs.response_formats, scores, guided_input_ids, self.tokenizer) diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 2877f59375..59d77f264a 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -162,10 +162,6 @@ def __init__(self, model_config: ModelConfig, cache_config: CacheConfig): self.model_config = model_config self.cache_config = cache_config - def get_block_numel(self): - """get block nelement.""" - raise NotImplementedError('Not implemented') - async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """model forward. @@ -177,17 +173,6 @@ async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, """ raise NotImplementedError('Not implemented.') - def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, - swap_out_map: SwapMap): - """model forward. - - Args: - inputs (Dict): The input data comes from _make_inputs. - swap_in_map (SwapMap): Cache maps to swap in. - swap_out_map (SwapMap): Cache maps to swap out. 
- """ - raise NotImplementedError('Not implemented.') - def get_logits(self, hidden_states: torch.Tensor): """get logits of model output.""" raise NotImplementedError('Not implemented.') @@ -255,11 +240,6 @@ def _build_model(self, device=device) return patched_model - def get_block_numel(self): - """get block nelement.""" - k_cache = self.cache_engine.local_gpu_cache[0][0] - return k_cache[0].numel() - def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): cache_swapping(self.cache_engine, @@ -274,21 +254,6 @@ def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, ) return output - def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, - swap_out_map: SwapMap): - """model forward. - - Args: - inputs (Dict): The input data comes from _make_inputs. - swap_in_map (SwapMap): Cache maps to swap in. - swap_out_map (SwapMap): Cache maps to swap out. - """ - output = self._forward_impl(inputs, - swap_in_map=swap_in_map, - swap_out_map=swap_out_map) - self.stream.synchronize() - return output - async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """model forward. @@ -301,8 +266,9 @@ async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, output = self._forward_impl(inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map) - await asyncio.get_event_loop().run_in_executor(None, - self.stream.synchronize) + await asyncio.sleep(0) + while not self.stream.query(): + await asyncio.sleep(0) return output def get_logits(self, hidden_states: torch.Tensor): @@ -688,11 +654,6 @@ def _build_model( return model, cache_engine, cache_config - def get_block_numel(self): - """get block nelement.""" - k_cache = self.cache_engine.local_gpu_cache[0][0] - return k_cache[0].numel() - def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """forward impl.""" @@ -713,21 +674,6 @@ def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, ) return output - def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, - swap_out_map: SwapMap): - """model forward. - - Args: - inputs (Dict): The input data comes from _make_inputs. - swap_in_map (SwapMap): Cache maps to swap in. - swap_out_map (SwapMap): Cache maps to swap out. - """ - output = self._forward_impl(inputs, - swap_in_map=swap_in_map, - swap_out_map=swap_out_map) - self.stream.synchronize() - return output - async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """model forward. 
@@ -740,8 +686,9 @@ async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, output = self._forward_impl(inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map) - await asyncio.get_event_loop().run_in_executor(None, - self.stream.synchronize) + await asyncio.sleep(0) + while not self.stream.query(): + await asyncio.sleep(0) return output def get_logits(self, hidden_states: torch.Tensor): diff --git a/lmdeploy/pytorch/kernels/cuda/activation.py b/lmdeploy/pytorch/kernels/cuda/activation.py index 2533840a95..9a00e7354f 100644 --- a/lmdeploy/pytorch/kernels/cuda/activation.py +++ b/lmdeploy/pytorch/kernels/cuda/activation.py @@ -7,10 +7,8 @@ TRITON_VERSION = version.parse(triton.__version__) if TRITON_VERSION >= version.parse('3.0.0'): - fast_expf = tl.math.exp else: - tanh = tl.math.tanh fast_expf = tl.math.fast_expf @@ -26,63 +24,29 @@ def _silu_and_mul_kernel( BLOCK_SIZE_N: tl.constexpr, ): """silu and mul kernel.""" - m_id = tl.program_id(0) + n_block_id = tl.program_id(0) + m_id = tl.program_id(1) up_ptr = gateup_ptr + N * stride_gun - offs_n = tl.arange(0, BLOCK_SIZE_N) + offs_n = n_block_id * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) gate_ptrs = gateup_ptr + m_id * stride_gum + offs_n * stride_gun up_ptrs = up_ptr + m_id * stride_gum + offs_n * stride_gun out_ptrs = out_ptr + m_id * stride_om + offs_n * stride_on - for _ in range(0, N, BLOCK_SIZE_N): - gate = tl.load(gate_ptrs).to(tl.float32) - up = tl.load(up_ptrs).to(tl.float32) - - gate = gate / (1 + fast_expf(-gate)) - out = gate * up - - tl.store(out_ptrs, out) - - gate_ptrs += BLOCK_SIZE_N * stride_gun - up_ptrs += BLOCK_SIZE_N * stride_gun - out_ptrs += BLOCK_SIZE_N * stride_on - - -@triton.jit -def _silu_and_mul_no_align_kernel( - gateup_ptr, - out_ptr, - N: tl.constexpr, - stride_gum: tl.constexpr, - stride_gun: tl.constexpr, - stride_om: tl.constexpr, - stride_on: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, -): - """silu and mul kernel.""" - m_id = tl.program_id(0) - - up_ptr = gateup_ptr + N * stride_gun - - offs_n = tl.arange(0, BLOCK_SIZE_N) - gate_ptrs = gateup_ptr + m_id * stride_gum + offs_n * stride_gun - up_ptrs = up_ptr + m_id * stride_gum + offs_n * stride_gun - out_ptrs = out_ptr + m_id * stride_om + offs_n * stride_on - - for n in range(0, N, BLOCK_SIZE_N): - mask = n + offs_n < N - gate = tl.load(gate_ptrs, mask=mask).to(tl.float32) - up = tl.load(up_ptrs, mask=mask).to(tl.float32) - - gate = gate / (1 + fast_expf(-gate)) - out = gate * up + if N % BLOCK_SIZE_N == 0: + mask = None + else: + mask = offs_n < N + gate = tl.load(gate_ptrs, mask=mask) + up = tl.load(up_ptrs, mask=mask) + gate = gate.to(tl.float32) + up = up.to(tl.float32) - tl.store(out_ptrs, out, mask=mask) + gate = gate / (1 + fast_expf(-gate)) + out = gate * up - gate_ptrs += BLOCK_SIZE_N * stride_gun - up_ptrs += BLOCK_SIZE_N * stride_gun - out_ptrs += BLOCK_SIZE_N * stride_on + tl.store(out_ptrs, out, mask=mask) def silu_and_mul(gate_up: torch.Tensor, out: torch.Tensor = None): @@ -96,31 +60,22 @@ def silu_and_mul(gate_up: torch.Tensor, out: torch.Tensor = None): out = gate_up.new_empty(out_shape) BLOCK_SIZE_N = triton.next_power_of_2(N) - BLOCK_SIZE_N = min(BLOCK_SIZE_N, 1024) + BLOCK_SIZE_N = min(BLOCK_SIZE_N, 512) num_warps = 4 - num_stages = 2 - grid = (M, ) - if N % BLOCK_SIZE_N == 0: - _silu_and_mul_kernel[grid](gate_up, - out, - N, - stride_gum=gate_up.stride(0), - stride_gun=gate_up.stride(1), - stride_om=out.stride(0), - stride_on=out.stride(1), - BLOCK_SIZE_N=BLOCK_SIZE_N, - num_warps=num_warps, - 
num_stages=num_stages) - else: - _silu_and_mul_no_align_kernel[grid](gate_up, - out, - N, - stride_gum=gate_up.stride(0), - stride_gun=gate_up.stride(1), - stride_om=out.stride(0), - stride_on=out.stride(1), - BLOCK_SIZE_N=BLOCK_SIZE_N, - num_warps=num_warps, - num_stages=num_stages) + num_stages = 1 + grid = ( + triton.cdiv(N, BLOCK_SIZE_N), + M, + ) + _silu_and_mul_kernel[grid](gate_up, + out, + N, + stride_gum=gate_up.stride(0), + stride_gun=gate_up.stride(1), + stride_om=out.stride(0), + stride_on=out.stride(1), + BLOCK_SIZE_N=BLOCK_SIZE_N, + num_warps=num_warps, + num_stages=num_stages) return out diff --git a/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py index 9e14dc6a0c..f9d5f2f171 100644 --- a/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py +++ b/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py @@ -4,35 +4,9 @@ import triton.language as tl from torch import Tensor -from .triton_utils import get_kernel_meta, wrap_jit_func - - -@wrap_jit_func(type_hint=dict( - Q=Tensor, - K=Tensor, - COS=Tensor, - SIN=Tensor, - POS=Tensor, - Q_EMB=Tensor, - K_EMB=Tensor, - seq_len=int, - stride_qs=int, - stride_qh=int, - stride_qd=int, - stride_ks=int, - stride_kh=int, - stride_kd=int, - stride_qes=int, - stride_qeh=int, - stride_qed=int, - stride_kes=int, - stride_keh=int, - stride_ked=int, - half_size=torch.int32, - BLOCK=torch.int32, - BLOCK_QH=torch.int32, - BLOCK_N=torch.int32, -)) +from .triton_utils import get_kernel_meta + + @triton.jit(do_not_specialize=('seq_len', )) def apply_rotary_pos_emb_qk_kernel( Q, @@ -60,8 +34,8 @@ def apply_rotary_pos_emb_qk_kernel( BLOCK_N: tl.constexpr, ): """apply rotary on key AND query kernel.""" - seq_block_id = tl.program_id(0) - head_id = tl.program_id(1) + seq_block_id = tl.program_id(1) + head_id = tl.program_id(0) pos_offset = seq_block_id * BLOCK + tl.arange(0, BLOCK) pos_mask = pos_offset < seq_len @@ -158,10 +132,13 @@ def apply_rotary_pos_emb(q: Tensor, num_heads_q = q.size(-2) num_heads_k = k.size(-2) num_warps = 4 - num_stages = 4 + num_stages = 1 kernel_meta = get_kernel_meta(q) - grid = [triton.cdiv(seq_len, BLOCK), num_heads_q + num_heads_k] + grid = [ + num_heads_q + num_heads_k, + triton.cdiv(seq_len, BLOCK), + ] apply_rotary_pos_emb_qk_kernel[grid](q, k, cos, diff --git a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py index 9ef614fadd..93bd89f488 100644 --- a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py +++ b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py @@ -1,12 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from typing import Literal -import torch import triton import triton.language as tl from torch import Tensor -from .triton_utils import get_kernel_meta, wrap_jit_func +from .triton_utils import get_kernel_meta @triton.jit @@ -38,37 +37,6 @@ def _quant_int4(val1, val2): return q_val, scales, zeros -@wrap_jit_func(type_hint=dict( - KStates=Tensor, - VStates=Tensor, - KCaches=Tensor, - VCaches=Tensor, - QStartLoc=Tensor, - QSeqLens=Tensor, - KVSeqLens=Tensor, - BlockOffsets=Tensor, - num_heads=torch.int32, - head_dim=torch.int32, - stride_kss=int, - stride_ksh=int, - stride_ksd=int, - stride_vss=int, - stride_vsh=int, - stride_vsd=int, - stride_kcn=int, - stride_kcb=int, - stride_kch=int, - stride_kcd=int, - stride_vcn=int, - stride_vcb=int, - stride_vch=int, - stride_vcd=int, - stride_boff=int, - BLOCK=torch.int32, - BLOCK_D=torch.int32, - BLOCK_DV=torch.int32, - BLOCK_H=torch.int32, -)) @triton.jit def _fill_kv_cache_kernel( KStates, @@ -79,7 +47,7 @@ def _fill_kv_cache_kernel( QSeqLens, KVSeqLens, BlockOffsets, - num_heads: tl.constexpr, + is_decoding: tl.constexpr, head_dim: tl.constexpr, head_dim_v: tl.constexpr, stride_kss, @@ -100,108 +68,70 @@ def _fill_kv_cache_kernel( BLOCK: tl.constexpr, BLOCK_D: tl.constexpr, BLOCK_DV: tl.constexpr, - BLOCK_H: tl.constexpr, ): """fill kv cache kernel.""" - batch_id = tl.program_id(0) + batch_id = tl.program_id(2) + head_id = tl.program_id(0) block_id = tl.program_id(1) - # initialize - h_off = tl.arange(0, BLOCK_H) - d_off = tl.arange(0, BLOCK_D) - q_startloc = tl.load(QStartLoc + batch_id) q_seqlen = tl.load(QSeqLens + batch_id) kv_seqlen = tl.load(KVSeqLens + batch_id) history_seqlen = kv_seqlen - q_seqlen - block0_first_tokenloc = history_seqlen % BLOCK - - state_token_offset = tl.maximum(block_id * BLOCK - block0_first_tokenloc, - 0) - kv_block_id = _div_up(history_seqlen + 1, BLOCK) - 1 + block_id - kv_block_id = min(kv_block_id, stride_boff - 1) - block_off = tl.load(BlockOffsets + batch_id * stride_boff + kv_block_id) + kv_block_id = history_seqlen // BLOCK + block_id - cur_startloc = q_startloc + state_token_offset - ks_ptr = KStates + cur_startloc * stride_kss - vs_ptr = VStates + cur_startloc * stride_vss + if kv_seqlen <= 0: + return - kc_ptr = KCaches + block_off * stride_kcn - vc_ptr = VCaches + block_off * stride_vcn + if kv_block_id * BLOCK >= kv_seqlen: + return - c_first_tokenloc = block0_first_tokenloc - if block_id != 0: - c_first_tokenloc *= 0 - c_last_tokenloc = tl.minimum( - BLOCK, q_seqlen + block0_first_tokenloc - block_id * BLOCK) + if is_decoding: + page_offs = tl.full((1, ), history_seqlen % BLOCK, dtype=tl.int32) + kv_mask = tl.full((1, ), 1, dtype=tl.int1) + q_offs = tl.full((1, ), q_startloc, dtype=tl.int32) + else: + page_offs = tl.arange(0, BLOCK) + kv_offs = kv_block_id * BLOCK + page_offs + kv_mask = (kv_offs >= history_seqlen) & (kv_offs < kv_seqlen) + token_off = q_startloc + kv_block_id * BLOCK - history_seqlen + q_offs = token_off + page_offs - for bidx in range(c_first_tokenloc, c_last_tokenloc): - sidx = bidx - c_first_tokenloc - mask = (h_off[:, None] < num_heads) & (d_off[None, :] < head_dim) - k = tl.load(ks_ptr + sidx * stride_kss + h_off[:, None] * stride_ksh + - d_off[None, :] * stride_ksd, - mask=mask) - tl.store(kc_ptr + bidx * stride_kcb + h_off[:, None] * stride_kch + - d_off[None, :] * stride_kcd, - k, - mask=mask) + block_off = tl.load(BlockOffsets + batch_id * stride_boff + kv_block_id) - if BLOCK_DV > 0: - dv_off = tl.arange(0, BLOCK_DV) - maskv = (h_off[:, None] < num_heads) & (dv_off[None, :] 
< - head_dim_v) - v = tl.load(vs_ptr + sidx * stride_vss + - h_off[:, None] * stride_vsh + - dv_off[None, :] * stride_vsd, - mask=maskv) - tl.store(vc_ptr + bidx * stride_vcb + h_off[:, None] * stride_vch + - dv_off[None, :] * stride_vcd, - v, - mask=maskv) + d_off = tl.arange(0, BLOCK_D) + mask_ks = kv_mask[:, None] + mask_kc = mask_ks & (d_off[None, :] < head_dim) + d_off = d_off % head_dim + + ks_ptr = KStates + head_id * stride_ksh + ks_ptrs = ks_ptr + q_offs[:, + None] * stride_kss + d_off[None, :] * stride_ksd + kc_ptr = KCaches + block_off * stride_kcn + head_id * stride_kch + kc_ptrs = kc_ptr + page_offs[:, None] * stride_kcb + d_off[ + None, :] * stride_kcd + + if BLOCK_DV > 0: + dv_off = tl.arange(0, BLOCK_DV) + mask_vs = kv_mask[:, None] + mask_vc = mask_vs & (dv_off[None, :] < head_dim_v) + dv_off = dv_off % head_dim_v + vs_ptr = VStates + head_id * stride_vsh + vs_ptrs = vs_ptr + q_offs[:, None] * stride_vss + dv_off[ + None, :] * stride_vsd + vc_ptr = VCaches + block_off * stride_vcn + head_id * stride_vch + vc_ptrs = vc_ptr + page_offs[:, None] * stride_vcb + dv_off[ + None, :] * stride_vcd + + k = tl.load(ks_ptrs, mask=mask_ks) + if BLOCK_DV > 0: + v = tl.load(vs_ptrs, mask=mask_vs) + tl.store(kc_ptrs, k, mask=mask_kc) + if BLOCK_DV > 0: + tl.store(vc_ptrs, v, mask=mask_vc) -@wrap_jit_func(type_hint=dict( - KStates=Tensor, - VStates=Tensor, - KCaches=Tensor, - VCaches=Tensor, - KScalesZeros=Tensor, - VScalesZeros=Tensor, - QStartLoc=Tensor, - QSeqLens=Tensor, - KVSeqLens=Tensor, - BlockOffsets=Tensor, - num_heads=torch.int32, - head_dim=torch.int32, - stride_kss=int, - stride_ksh=int, - stride_ksd=int, - stride_vss=int, - stride_vsh=int, - stride_vsd=int, - stride_kcn=int, - stride_kcb=int, - stride_kch=int, - stride_kcd=int, - stride_vcn=int, - stride_vcb=int, - stride_vch=int, - stride_vcd=int, - stride_kszn=int, - stride_kszb=int, - stride_kszh=int, - stride_kszd=int, - stride_vszn=int, - stride_vszb=int, - stride_vszh=int, - stride_vszd=int, - stride_boff=int, - BLOCK=torch.int32, - BLOCK_D=torch.int32, - BLOCK_DV=torch.int32, - BLOCK_H=torch.int32, -)) @triton.jit def _fill_kv_cache_quant_kernel( KStates, @@ -394,15 +324,19 @@ def fill_kv_cache(k_states: Tensor, num_heads = k_caches.size(h_dim) head_dim = k_caches.size(d_dim) head_dim_v = v_states.size(-1) - max_num_blocks = triton.cdiv(max_q_seq_length, block_size) + 1 + if max_q_seq_length == 1: + max_num_blocks = 1 + else: + max_num_blocks = triton.cdiv(max_q_seq_length, block_size) + 1 BLOCK = block_size BLOCK_H = triton.next_power_of_2(num_heads) BLOCK_D = triton.next_power_of_2(head_dim) BLOCK_DV = triton.next_power_of_2(head_dim_v) - grid = [batch_size, max_num_blocks] kernel_meta = get_kernel_meta(k_states) if quant_policy == 0: + grid = [num_heads, max_num_blocks, batch_size] + is_decoding = max_num_blocks == 1 _fill_kv_cache_kernel[grid]( k_states, v_states, @@ -412,7 +346,7 @@ def fill_kv_cache(k_states: Tensor, q_seq_length, kv_seq_length, block_offsets, - num_heads=num_heads, + is_decoding=is_decoding, head_dim=head_dim, head_dim_v=head_dim_v, stride_kss=k_states.stride(-3), @@ -433,12 +367,12 @@ def fill_kv_cache(k_states: Tensor, BLOCK=BLOCK, BLOCK_D=BLOCK_D, BLOCK_DV=BLOCK_DV, - BLOCK_H=BLOCK_H, num_warps=4, num_stages=3, **kernel_meta, ) else: + grid = [batch_size, max_num_blocks] _fill_kv_cache_quant_kernel[grid]( k_states, v_states, diff --git a/lmdeploy/pytorch/models/utils/cudagraph.py b/lmdeploy/pytorch/models/utils/cudagraph.py index 149376e4be..74d090a9a3 100644 --- 
a/lmdeploy/pytorch/models/utils/cudagraph.py +++ b/lmdeploy/pytorch/models/utils/cudagraph.py @@ -70,15 +70,14 @@ def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, input_buffers['block_offsets'] = torch.zeros((max_batches, num_blocks), dtype=torch.int64, device=device) - input_buffers['q_start_loc'] = torch.zeros(max_batches, - dtype=torch.int64, - device=device) - input_buffers['q_seqlens'] = torch.zeros(max_batches, - dtype=torch.int64, - device=device) - input_buffers['kv_seqlens'] = torch.zeros(max_batches, - dtype=torch.int64, - device=device) + + input_buffers['qkv_lens'] = torch.zeros(3, + max_batches, + dtype=torch.int64, + device=device) + input_buffers['q_start_loc'] = input_buffers['qkv_lens'][0] + input_buffers['q_seqlens'] = input_buffers['qkv_lens'][1] + input_buffers['kv_seqlens'] = input_buffers['qkv_lens'][2] input_buffers['local_adapter_ids'] = torch.zeros(max_batches, dtype=torch.int64, device=device) @@ -111,13 +110,10 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, input_buffers['position_ids'][:, :num_tokens] = position_ids input_buffers[ 'block_offsets'][:batch_size, :num_blocks] = block_offsets - if q_seqlens.data_ptr() != input_buffers['q_seqlens'].data_ptr(): - input_buffers['q_seqlens'].zero_() - input_buffers['q_seqlens'][:batch_size] = q_seqlens - if kv_seqlens.data_ptr() != input_buffers['kv_seqlens'].data_ptr(): - input_buffers['kv_seqlens'].zero_() - input_buffers['kv_seqlens'][:batch_size] = kv_seqlens - input_buffers['q_start_loc'][:batch_size] = q_start_loc + + qkv = torch.stack((q_start_loc, q_seqlens, kv_seqlens)) + input_buffers['qkv_lens'].zero_() + input_buffers['qkv_lens'][:, :batch_size] = qkv if inputs_embeds is not None: emb_size = inputs_embeds.size(-1) if 'inputs_embeds' not in input_buffers: diff --git a/tests/pytorch/engine/test_logits_process.py b/tests/pytorch/engine/test_logits_process.py index 5c5fdbdc18..69c8315411 100644 --- a/tests/pytorch/engine/test_logits_process.py +++ b/tests/pytorch/engine/test_logits_process.py @@ -35,8 +35,9 @@ def test_process_bad_words(): [4, 4], [-1, -1], ]) + mask = bad_words >= 0 - out_scores = _process_bad_words_(scores, bad_words) + out_scores = _process_bad_words_(scores, bad_words.where(mask, 0), mask) for score, bw in zip(out_scores, bad_words): bw = bw.tolist() From 0dedd73e5727776e2392b6f7256e0f66d0c48c8b Mon Sep 17 00:00:00 2001 From: q yao Date: Tue, 3 Dec 2024 14:44:22 +0800 Subject: [PATCH 102/122] fix the logic to verify whether AutoAWQ has been successfully installed (#2844) --- lmdeploy/pytorch/backends/cuda/awq_modules.py | 2 -- lmdeploy/pytorch/backends/cuda/op_backend.py | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lmdeploy/pytorch/backends/cuda/awq_modules.py b/lmdeploy/pytorch/backends/cuda/awq_modules.py index f3cbf8bee4..8159bbf554 100644 --- a/lmdeploy/pytorch/backends/cuda/awq_modules.py +++ b/lmdeploy/pytorch/backends/cuda/awq_modules.py @@ -53,8 +53,6 @@ class AwqLinearW4A16Impl(LinearW4A16Impl): def __init__(self, in_features: int, out_features: int, w_bit: int, group_size: int): - from awq.modules.linear.gemm import AWQ_INSTALLED - assert AWQ_INSTALLED self.in_features = in_features self.out_features = out_features self.w_bit = w_bit diff --git a/lmdeploy/pytorch/backends/cuda/op_backend.py b/lmdeploy/pytorch/backends/cuda/op_backend.py index 3e7fc23728..d796f8e19f 100644 --- a/lmdeploy/pytorch/backends/cuda/op_backend.py +++ b/lmdeploy/pytorch/backends/cuda/op_backend.py @@ -48,7 +48,11 @@ def 
get_layer_impl_builder(cls, layer_type: OpType): from .activation import TritonSiluAndMulBuilder return TritonSiluAndMulBuilder elif layer_type == OpType.LinearW4A16: - from awq.modules.linear.gemm import AWQ_INSTALLED + try: + from awq.modules.linear.gemm import awq_ext # noqa: F401 + AWQ_INSTALLED = True + except Exception: + AWQ_INSTALLED = False if AWQ_INSTALLED: from .awq_modules import AwqLinearW4A16Builder return AwqLinearW4A16Builder From efa8ac032005091a17f6c4555917d400c44486ba Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Tue, 3 Dec 2024 14:45:57 +0800 Subject: [PATCH 103/122] check whether backend_config is None or not before accessing its attr (#2848) --- lmdeploy/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/api.py b/lmdeploy/api.py index 2b4204a53b..42b7c6e4c1 100644 --- a/lmdeploy/api.py +++ b/lmdeploy/api.py @@ -71,7 +71,7 @@ def pipeline(model_path: str, task, pipeline_class = get_task(model_path) if task == 'vlm': - if backend_config.enable_prefix_caching: + if backend_config and backend_config.enable_prefix_caching: backend_config.enable_prefix_caching = False logger.warning('VLM does not support prefix caching.') From a6645b228674a294eed00a73c508a00081c3dc6e Mon Sep 17 00:00:00 2001 From: zhoushenglong <87467364+Reinerzhou@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:44:35 +0800 Subject: [PATCH 104/122] [dlinfer] change dlinfer kv_cache layout and ajust paged_prefill_attention api. (#2847) * opt update_step_ctx on maca. * change kv_cache layout and ajust paged_prefill_attention. * opt maca update context. --- .../backends/dlinfer/maca/op_backend.py | 80 +++++++++---------- .../pytorch/kernels/dlinfer/pagedattention.py | 3 + 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py index 084cae1bfe..a68ed9ac3a 100644 --- a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py @@ -12,6 +12,7 @@ class MacaOpsBackend(DlinferOpsBackend): """maca layer backend.""" + total_slots = None @staticmethod def get_name() -> str: @@ -25,10 +26,8 @@ def get_k_block_shape( head_size: int, dtype: torch.dtype, ) -> Tuple[int, ...]: - if head_size == 576: - x = 16 - return (num_heads, head_size // x, block_size, x) - return (num_heads, block_size, head_size) + x = 16 + return (num_heads, head_size // x, block_size, x) @staticmethod def get_v_block_shape( @@ -42,11 +41,25 @@ def get_v_block_shape( @classmethod def update_step_context(cls, step_context): """update step context.""" + + def get_total_slots(): + if cls.total_slots is None: + cls.total_slots = torch.arange( + block_num * block_size, + dtype=torch.long, + device=step_context.block_offsets.device) + cls.total_slots = cls.total_slots.view(block_num, block_size) + return cls.total_slots + kv_start_indices, attention_mask = [], [] - block_num, _, block_size, _ = step_context.kv_caches[0][0].shape + block_num, _, block_size, _ = step_context.kv_caches[0][1].shape device = step_context.block_offsets.device is_unpaged_prefill = False + if not step_context.is_decoding: + is_unpaged_prefill = \ + all((step_context.q_seqlens == + step_context.kv_seqlens).tolist()) q_start_loc = torch.cat((torch.tensor([0], device=device), step_context.q_seqlens.cumsum(0))).int() q_seqlens = step_context.q_seqlens.int() @@ -54,43 +67,26 @@ def update_step_context(cls, step_context): max_q_seq_len = torch.max(q_seqlens).item() max_kv_seq_len = 
torch.max(kv_seqlens).item() - if not step_context.is_decoding: - is_unpaged_prefill = \ - all((step_context.q_seqlens == - step_context.kv_seqlens).tolist()) - if is_unpaged_prefill: - single_attention_mask = torch.logical_not( - torch.tril( - torch.ones(max_q_seq_len, - max_kv_seq_len, - dtype=torch.bool).cuda(), - diagonal=max_kv_seq_len - max_q_seq_len, - )) - attention_mask.append(single_attention_mask) - total_slots = torch.arange(block_num * block_size, - dtype=torch.long, - device=device) - total_slots = total_slots.view(block_num, block_size) - for i in range(step_context.q_start_loc.size(0)): - q_seq_len = int(step_context.q_seqlens[i]) - kv_seq_len = int(step_context.kv_seqlens[i]) - if not (step_context.is_decoding or is_unpaged_prefill): - single_attention_mask = torch.logical_not( - torch.tril( - torch.ones(step_context.q_seqlens[i], - step_context.block_offsets.shape[1] * - block_size, - dtype=torch.bool).cuda(), - diagonal=step_context.kv_seqlens[i] - - step_context.q_seqlens[i], - )) - attention_mask.append(single_attention_mask) - history_length = kv_seq_len - q_seq_len - slot_tables = total_slots[step_context.block_offsets[i]].flatten() - slot_indices = [p for p in range(history_length, kv_seq_len)] - slots = slot_tables[slot_indices].reshape((-1, 1)) - kv_start_indices.append(slots) - kv_start_indices = torch.cat(kv_start_indices) + if step_context.is_decoding: + # collect kv_start_indices without using a for-loop, + # (fill kv-cache for just ONE token during the decoding phase) + idx = (step_context.kv_seqlens - 1) % block_size + b_num = (step_context.kv_seqlens - 1) // block_size + last_block = step_context.block_offsets.gather( + 1, b_num.view(-1, 1)).view(-1) + kv_start_indices = (last_block * block_size + idx).reshape((-1, 1)) + else: + for i in range(step_context.q_start_loc.size(0)): + q_seq_len = int(step_context.q_seqlens[i]) + kv_seq_len = int(step_context.kv_seqlens[i]) + # collect kv start indices during the prefill phase. + history_length = kv_seq_len - q_seq_len + total_slots = get_total_slots() + slot_tables = total_slots[step_context.block_offsets[i]].view( + -1) + slots = slot_tables[history_length:kv_seq_len] + kv_start_indices.append(slots) + kv_start_indices = torch.cat(kv_start_indices) attn_meta_cls = cls.get_attention_metadata_cls() attn_metadata = attn_meta_cls( diff --git a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py index 21c72074a4..47bcb0cfff 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py @@ -39,6 +39,8 @@ def prefill_attention( else: return ext_ops.paged_prefill_attention( query_states, + key_states, + value_states, key_cache, value_cache, block_offsets, @@ -46,6 +48,7 @@ def prefill_attention( q_start_loc, q_seq_len, kv_seq_len, + max_q_seq_len, num_q_heads, num_kv_heads, attn_mask, From cc8cfb0b456cb05cf1089ff1f27706d395e70864 Mon Sep 17 00:00:00 2001 From: zhoushenglong <87467364+Reinerzhou@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:45:35 +0800 Subject: [PATCH 105/122] [maca] add env to support different mm layout on maca. (#2835) * add env to support different mm layout on maca. * rename env variable. 
--- lmdeploy/pytorch/backends/dlinfer/linear.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lmdeploy/pytorch/backends/dlinfer/linear.py b/lmdeploy/pytorch/backends/dlinfer/linear.py index 567a01dddf..5edfa7728d 100644 --- a/lmdeploy/pytorch/backends/dlinfer/linear.py +++ b/lmdeploy/pytorch/backends/dlinfer/linear.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os from typing import Optional import torch @@ -11,6 +12,14 @@ class DlinferLinearImpl(LinearImpl): """Dlinfer linear implementation api.""" + def update_weights(self, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None): + """update weights.""" + if os.getenv('DLINER_LINEAR_USE_NN_LAYOUT', '0') == '1': + weight = weight.data.t().contiguous() + return weight, bias + def forward(self, x, weight: torch.Tensor, From 69a4306d2cd7d8f7790e39edfb3b8266282767fa Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:39:05 +0800 Subject: [PATCH 106/122] Supports W8A8 quantization for more models (#2850) * Supports W8A8 quantization for more models * update supported models --- docs/en/supported_models/supported_models.md | 48 +- .../supported_models/supported_models.md | 48 +- lmdeploy/lite/apis/calibrate.py | 3 + lmdeploy/lite/apis/smooth_quant.py | 66 +- lmdeploy/lite/quantization/awq.py | 11 +- lmdeploy/pytorch/modeling/__init__.py | 1 - .../pytorch/modeling/convert_to_qmodules.py | 59 - .../pytorch/modeling/modeling_baichuan.py | 824 ------- .../pytorch/modeling/modeling_internlm.py | 1171 ---------- .../pytorch/modeling/modeling_internlm2.py | 1940 ----------------- lmdeploy/pytorch/modeling/modeling_llama.py | 1297 ----------- 11 files changed, 63 insertions(+), 5405 deletions(-) delete mode 100644 lmdeploy/pytorch/modeling/__init__.py delete mode 100644 lmdeploy/pytorch/modeling/convert_to_qmodules.py delete mode 100644 lmdeploy/pytorch/modeling/modeling_baichuan.py delete mode 100644 lmdeploy/pytorch/modeling/modeling_internlm.py delete mode 100644 lmdeploy/pytorch/modeling/modeling_internlm2.py delete mode 100644 lmdeploy/pytorch/modeling/modeling_llama.py diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index cd43e79c94..469ece487f 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -51,47 +51,47 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha | Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | No | | Baichuan2 | 13B | LLM | Yes | Yes | Yes | No | No | | ChatGLM2 | 6B | LLM | Yes | Yes | Yes | No | No | | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | No | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No | -| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | -| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | +| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | Yes | +| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | -| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | No | Yes | +| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | -| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | -| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | No | Yes | -| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | No | - | -| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | No | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | -| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | -| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | -| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | No | - | +| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes | +| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - | +| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - | +| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - | +| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | - | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | No | - | -| Phi-3.5-mini | 3.8B | LLM | Yes | 
Yes | No | No | - | -| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | ```{note} * Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index 7ec36d2351..d734523282 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -51,47 +51,47 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | No | - | -| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | No | - | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | - | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | Yes | +| Llama3.2-VL | 11B, 90B | MLLM | Yes | Yes | Yes | - | - | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | No | | Baichuan2 | 13B | LLM | Yes | Yes | Yes | No | No | | ChatGLM2 | 6B | LLM | Yes | Yes | Yes | No | No | | Falcon | 7B - 180B | LLM | Yes | Yes | Yes | No | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | No | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | No | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | Yes | Yes | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | No | No | -| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | No | Yes | -| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | No | Yes | +| QWen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | Yes | +| QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | -| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | No | Yes | +| QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | -| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | -| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | No | Yes | -| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | No | - | -| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | No | - | -| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | No | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | -| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | No | - | -| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | No | - | -| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | No | - | -| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | 
No | - | +| Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes | +| Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - | +| CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - | +| LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes | +| InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - | +| Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - | +| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - | +| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | - | - | | GLM4 | 9B | LLM | Yes | Yes | Yes | No | No | | GLM-4V | 9B | MLLM | Yes | Yes | Yes | No | No | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | No | - | -| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | No | - | -| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | No | - | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | ```{note} * Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 0780e93594..71f7a5900c 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -27,6 +27,7 @@ 'ChatGLMForConditionalGeneration': 'GLMBlock', 'MixtralForCausalLM': 'MixtralDecoderLayer', 'Qwen2VLForConditionalGeneration': 'Qwen2VLDecoderLayer', + 'MistralForCausalLM': 'MistralDecoderLayer', } NORM_TYPE_MAP = { @@ -44,6 +45,7 @@ 'ChatGLMForConditionalGeneration': 'RMSNorm', 'MixtralForCausalLM': 'MixtralRMSNorm', 'Qwen2VLForConditionalGeneration': 'Qwen2RMSNorm', + 'MistralForCausalLM': 'MistralRMSNorm', } HEAD_NAME_MAP = { @@ -61,6 +63,7 @@ 'ChatGLMForConditionalGeneration': 'output_layer', 'MixtralForCausalLM': 'lm_head', 'Qwen2VLForConditionalGeneration': 'lm_head', + 'MistralForCausalLM': 'lm_head', } diff --git a/lmdeploy/lite/apis/smooth_quant.py b/lmdeploy/lite/apis/smooth_quant.py index 45684602b2..c8df67355e 100644 --- a/lmdeploy/lite/apis/smooth_quant.py +++ b/lmdeploy/lite/apis/smooth_quant.py @@ -1,70 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. 
- -import os.path as osp -import shutil - import fire import torch from torch import nn -import lmdeploy -from lmdeploy.lite.apis.calibrate import calibrate +from lmdeploy.lite.apis.calibrate import (LAYER_TYPE_MAP, NORM_TYPE_MAP, + calibrate) from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP, awq_layers, smooth_layers) from lmdeploy.lite.utils import collect_target_modules from lmdeploy.pytorch.models import QLinear, QRMSNorm -LAYER_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMDecoderLayer', - 'InternLM2ForCausalLM': 'InternLM2DecoderLayer', - 'QWenLMHeadModel': 'QWenBlock', - 'BaiChuanForCausalLM': 'DecoderLayer', - 'LlamaForCausalLM': 'LlamaDecoderLayer', - 'ChatGLMForConditionalGeneration': 'GLMBlock', -} -NORM_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMRMSNorm', - 'InternLM2ForCausalLM': 'InternLM2RMSNorm', - 'QWenLMHeadModel': 'RMSNorm', - 'BaiChuanForCausalLM': 'RMSNorm', - 'LlamaForCausalLM': 'LlamaRMSNorm', - 'ChatGLMForConditionalGeneration': 'RMSNorm', -} - -LMDEPLOY_ROOT = lmdeploy.__path__[0] - -MODEL_PATH_MAP = { - 'InternLMForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm.py'), - 'InternLM2ForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm2.py'), - 'LlamaForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_llama.py'), - 'BaiChuanForCausalLM': - osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_baichuan.py') -} - -AUTO_MAP = { - 'InternLMForCausalLM': { - 'AutoConfig': 'configuration_internlm.InternLMConfig', - 'AutoModel': 'modeling_internlm.InternLMForCausalLM', - 'AutoModelForCausalLM': 'modeling_internlm.InternLMForCausalLM' - }, - 'InternLM2ForCausalLM': { - 'AutoConfig': 'configuration_internlm2.InternLMConfig', - 'AutoModelForCausalLM': 'modeling_internlm2.InternLM2ForCausalLM', - 'AutoModel': 'modeling_internlm2.InternLM2ForCausalLM' - }, - 'LlamaForCausalLM': { - 'AutoModel': 'modeling_llama.LlamaForCausalLM', - 'AutoModelForCausalLM': 'modeling_llama.LlamaForCausalLM' - }, - 'BaiChuanForCausalLM': { - 'AutoConfig': 'configuration_baichuan.BaiChuanConfig', - 'AutoModelForCausalLM': 'modeling_baichuan.BaiChuanForCausalLM' - } -} - def smooth_quant(model: str, work_dir: str = './work_dir', @@ -146,11 +91,6 @@ def smooth_quant(model: str, setattr(parent, child_name, q_norm) norm.to('cpu') - if hasattr(model.config, 'auto_map'): - model.config.auto_map.update(AUTO_MAP[type(model).__name__]) - else: - model.config.auto_map = AUTO_MAP[type(model).__name__] - if vl_model: from .auto_awq import save_vl_model save_vl_model(vl_model, model_path, work_dir) @@ -162,8 +102,6 @@ def smooth_quant(model: str, safe_serialization=False) tokenizer.save_pretrained(work_dir) - shutil.copy(MODEL_PATH_MAP[type(model).__name__], work_dir) - if __name__ == '__main__': fire.Fire(smooth_quant) diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index 2efe41b6da..cf03a75216 100644 --- a/lmdeploy/lite/quantization/awq.py +++ b/lmdeploy/lite/quantization/awq.py @@ -50,7 +50,12 @@ 'input_layernorm': ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj'] - } + }, + 'MistralDecoderLayer': { + 'input_layernorm': + ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], + 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj'] + }, } FC_FCS_MAP = { @@ -92,6 +97,10 @@ 'Qwen2VLDecoderLayer': { 'self_attn.v_proj': ['self_attn.o_proj'], 'mlp.up_proj': ['mlp.down_proj'] + }, + 'MistralDecoderLayer': { + 
'self_attn.v_proj': ['self_attn.o_proj'], + 'mlp.up_proj': ['mlp.down_proj'] } } diff --git a/lmdeploy/pytorch/modeling/__init__.py b/lmdeploy/pytorch/modeling/__init__.py deleted file mode 100644 index ef101fec61..0000000000 --- a/lmdeploy/pytorch/modeling/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/pytorch/modeling/convert_to_qmodules.py b/lmdeploy/pytorch/modeling/convert_to_qmodules.py deleted file mode 100644 index 4a95c9f165..0000000000 --- a/lmdeploy/pytorch/modeling/convert_to_qmodules.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn - -from lmdeploy.pytorch.models import QLinear, QRMSNorm - -LAYER_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMDecoderLayer', - 'InternLM2ForCausalLM': 'InternLM2DecoderLayer', - 'QWenLMHeadModel': 'QWenBlock', - 'BaiChuanForCausalLM': 'DecoderLayer', - 'LlamaForCausalLM': 'LlamaDecoderLayer', -} -NORM_TYPE_MAP = { - 'InternLMForCausalLM': 'InternLMRMSNorm', - 'InternLM2ForCausalLM': 'InternLM2RMSNorm', - 'QWenLMHeadModel': 'RMSNorm', - 'BaiChuanForCausalLM': 'RMSNorm', - 'LlamaForCausalLM': 'LlamaRMSNorm', -} - - -def convert_decoder_layer(module, norm_type): - """Converts a given module's child layers from regular Linear or RMSNorm to - their Quantized versions (QLinear, QRMSNorm). - - The conversion is done in place. - """ - for name, child in module.named_children(): - if isinstance(child, nn.Linear): - new_child = QLinear.from_float(child, initialization=False) - setattr(module, name, new_child) - elif type(child).__name__ == norm_type: - new_child = QRMSNorm.from_float(child, initialization=False) - setattr(module, name, new_child) - else: - convert_decoder_layer(child, norm_type) - - -def convert(module, layer_type, norm_type): - """Recursively traverses through given PyTorch module and identifies child - layers that match the specified layer_type and norm_type for conversion to - their Quantized counterparts. - - The conversion is done using the `convert_decoder_layer` function. - """ - for child in module.children(): - if type(child).__name__ == layer_type: - convert_decoder_layer(child, norm_type) - else: - convert(child, layer_type, norm_type) - - -def convert_to_qmodules(model): - """Convert all Linear and RMSNorm in the decoder layers of the model into - their Quantized versions (QLinear, QRMSNorm).""" - layer_type = LAYER_TYPE_MAP[type(model).__name__] - norm_type = NORM_TYPE_MAP[type(model).__name__] - convert(model, layer_type, norm_type) - return diff --git a/lmdeploy/pytorch/modeling/modeling_baichuan.py b/lmdeploy/pytorch/modeling/modeling_baichuan.py deleted file mode 100644 index a790e81d06..0000000000 --- a/lmdeploy/pytorch/modeling/modeling_baichuan.py +++ /dev/null @@ -1,824 +0,0 @@ -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers import PreTrainedModel -from transformers.activations import ACT2FN -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules -from lmdeploy.utils import get_logger - -from .configuration_baichuan import BaiChuanConfig - -logger = get_logger('lmdeploy') - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask(input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0): - """Make causal mask used for bi-directional self-attention.""" - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), - torch.tensor(torch.finfo(dtype).min, device=device), - device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device), - mask - ], - dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, - tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, - dtype: torch.dtype, - tgt_len: Optional[int] = None): - """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, - src_seq_len]`.""" - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, - src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), - torch.finfo(dtype).min) - - -class RMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """RMSNorm is equivalent to T5LayerNorm.""" - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - variance = hidden_states.to(torch.float32).pow(2).mean(-1, - keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -class RotaryEmbedding(torch.nn.Module): - """RotaryEmbedding for Baichuan Model. - - This module generates sine and cosine positional encodings based on - the paper "RoFormer: Enhanced Transformer with Rotary Position Embedding". - The purpose of this class is to provide positional embeddings to the - input tensors. It utilizes a cache mechanism to store precomputed - sine and cosine values for speedup. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. 
Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. - If None, defaults to the device of the model. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None): - super().__init__() - index = (torch.arange(0, dim, 2).float().to(device) / dim) - inv_freq = 1.0 / (base**index) - self.register_buffer('inv_freq', inv_freq) - - # Build here to make `torch.jit.trace` work. - self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, - device=self.inv_freq.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - - def forward(self, x, seq_len=None): - """Forward propagation method for the embedding layer. - - Generates positional embeddings for the given input tensor. - """ - # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in - # `__init__`. Keep the logic here just in case. - if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=x.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in - # order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Apply rotary positional embeddings to query and key tensors. - - This function applies the cosine and sine positional embeddings on the - input query (q) and key (k) tensors using element-wise multiplication and - addition. - """ - # The first two dimensions of cos and sin are always 1, - # so we can `squeeze` them. 
- cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class MLP(nn.Module): - """MLP for Baichuan Model.""" - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): - super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -class Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, config: BaiChuanConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError('hidden_size must be divisible by num_heads ' - f'(got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - self.W_pack = nn.Linear(self.hidden_size, - 3 * self.hidden_size, - bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=False) - self.rotary_emb = RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Forward propagation method for the attention layer.""" - bsz, q_len, _ = hidden_states.size() - - proj = self.W_pack(hidden_states) - proj = proj.unflatten(-1, - (3, self.hidden_size)).unsqueeze(0).transpose( - 0, -2).squeeze(-2) - query_states = proj[0].view( - bsz, q_len, self.num_heads, self.head_dim).transpose( - 1, 2) # batch_size x source_len x hidden_size - key_states = proj[1].view(bsz, q_len, - self.num_heads, self.head_dim).transpose( - 1, - 2) # batch_size x target_len x head_size - value_states = proj[2].view( - bsz, q_len, self.num_heads, self.head_dim).transpose( - 1, 2) # batch_size x source_len x hidden_size - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - attn_weights = torch.matmul(query_states, 
key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - 'Attention weights should be of size ' - f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}') - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError('Attention mask should be of size ' - f'{(bsz, 1, q_len, kv_seq_len)},' - f' but is {attention_mask.size()}') - attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - '`attn_output` should be of size ' - f'{(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class DecoderLayer(nn.Module): - """Decoder layer for Baichuan Model.""" - - def __init__(self, config: BaiChuanConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = Attention(config=config) - self.mlp = MLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - torch.FloatTensor]]]: - """ # noqa: E501 - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -class PreTrainedModel(PreTrainedModel): - config_class = BaiChuanConfig - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['DecoderLayer'] - _keys_to_ignore_on_load_unexpected = [r'decoder\.version'] - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, Model): - module.gradient_checkpointing = value - - -class Model(PreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. - Each layer is a [`DecoderLayer`] - - Args: - config: BaiChuanConfig - """ - - def __init__(self, config: BaiChuanConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, - self.padding_idx) - self.layers = nn.ModuleList( - [DecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder. 
- # prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to( - inputs_embeds.device) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask + - combined_attention_mask) - - return combined_attention_mask - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = (use_cache - if use_cache is not None else self.config.use_cache) - - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both decoder_input_ids ' - 'and decoder_inputs_embeds at the same time') - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError('You have to specify either decoder_input_ids ' - 'or decoder_inputs_embeds') - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = (seq_length_with_past + - past_key_values_length) - - if position_ids is None: - device = (input_ids.device - if input_ids is not None else inputs_embeds.device) - position_ids = torch.arange(past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, - past_key_values_length) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient ' - 'checkpointing. 
Setting `use_cache=False`...') - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - past_key_value = past_key_values[ - idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += ( - layer_outputs[2 if output_attentions else 1], ) - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class BaiChuanForCausalLM(PreTrainedModel): - """This class extends the `PreTrainedModel` to enable causal language - modeling. - - It wraps the basic Baichuan model (`Model`) and includes a linear layer as - a language model head (`lm_head`). The purpose is to predict token - probabilities, given the previous tokens in the sequence. 
- """ - - def __init__(self, config): - super().__init__(config) - self.model = Model(config) - - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - """Get the token embedding layer.""" - return self.model.embed_tokens - - def set_input_embeddings(self, value): - """Set the token embedding layer.""" - self.model.embed_tokens = value - - def get_output_embeddings(self): - """Get the output embedding layer.""" - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - """Set the output embedding layer.""" - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - """Set the decoder model.""" - self.model = decoder - - def get_decoder(self): - """Get the decoder model.""" - return self.model - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" # noqa: E501 - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, ModelForCausalLM - - >>> model = ModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you consciours? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." 
- ```""" - - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # decoder outputs consists of - # (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): - """Prepare inputs for generating sequences using the model. - - Args: - input_ids (torch.Tensor): Input token ids. - past_key_values (list[torch.Tensor], optional): List of past key - and value states. - attention_mask (torch.Tensor, optional): Mask indicating which - tokens should be attended to. - inputs_embeds (torch.FloatTensor, optional): Optionally, - the input embeddings instead of token ids. - - Returns: - dict: Dictionary containing prepared inputs for model generation. - """ - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, - # we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - model_inputs = {'input_ids': input_ids} - - model_inputs.update({ - 'position_ids': position_ids, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Reorder cached past key-values during generation using beam search. - - This function reorders the cached past key-values according to the - given indices. It's useful in beam search where the order of hypotheses - can change from one time-step to another. 
- """ - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past diff --git a/lmdeploy/pytorch/modeling/modeling_internlm.py b/lmdeploy/pytorch/modeling/modeling_internlm.py deleted file mode 100644 index c640641132..0000000000 --- a/lmdeploy/pytorch/modeling/modeling_internlm.py +++ /dev/null @@ -1,1171 +0,0 @@ -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch InternLM model.""" -import math -import queue -import threading -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.generation.streamers import BaseStreamer -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules -from lmdeploy.utils import get_logger - -from .configuration_internlm import InternLMConfig - -logger = get_logger('lmdeploy') - -_CONFIG_FOR_DOC = 'InternLMConfig' - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask(input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0): - """Make causal mask used for bi-directional self-attention.""" - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), - torch.tensor(torch.finfo(dtype).min, device=device), - device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device), - mask - ], - dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, - tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, - dtype: torch.dtype, - tgt_len: Optional[int] = None): - """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, - src_seq_len]`.""" - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, - src_len).to(dtype) 
- - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), - torch.finfo(dtype).min) - - -class InternLMRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """InternLMRMSNorm is equivalent to T5LayerNorm.""" - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - variance = hidden_states.to(torch.float32).pow(2).mean(-1, - keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -class InternLMRotaryEmbedding(torch.nn.Module): - """RotaryEmbedding for InternLM Model. - - This module generates sine and cosine positional encodings based on - the paper "RoFormer: Enhanced Transformer with Rotary Position Embedding". - The purpose of this class is to provide positional embeddings to the - input tensors. It utilizes a cache mechanism to store precomputed - sine and cosine values for speedup. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. - If None, defaults to the device of the model. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None): - super().__init__() - index = (torch.arange(0, dim, 2).float().to(device) / dim) - inv_freq = 1.0 / (base**index) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, - device=self.inv_freq.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - - def forward(self, x, seq_len=None): - """Forward propagation method for the embedding layer. - - Generates positional embeddings for the given input tensor. - """ - # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in - # `__init__`. Keep the logic here just in case. 
- if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=x.device, - dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in - # order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :], - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :], - persistent=False) - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Apply rotary positional embeddings to query and key tensors. - - This function applies the cosine and sine positional embeddings on the - input query (q) and key (k) tensors using element-wise multiplication and - addition. - """ - # The first two dimensions of cos and sin are always 1, so we can - # `squeeze` them. - cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class InternLMMLP(nn.Module): - """MLP for InternLM Model.""" - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): - super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -class InternLMAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, config: InternLMConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError('hidden_size must be divisible by num_heads ' - f'(got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - self.q_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=config.bias) - self.k_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=config.bias) - self.v_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=config.bias) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=config.bias) - self.rotary_emb = InternLMRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: 
Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Forward propagation method for the attention layer.""" - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states).view( - bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view( - bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view( - bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - 'Attention weights should be of size ' - f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}') - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError('Attention mask should be of size ' - f'{(bsz, 1, q_len, kv_seq_len)}, ' - f'but is {attention_mask.size()}') - attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - 'attn_output` should be of size ' - f'`{(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class InternLMDecoderLayer(nn.Module): - """Decoder layer for InternLM Model.""" - - def __init__(self, config: InternLMConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = InternLMAttention(config=config) - self.mlp = InternLMMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) - self.input_layernorm = InternLMRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = InternLMRMSNorm( - config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - 
torch.FloatTensor]]]: - """ # noqa: E501 - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -INTERNLM_START_DOCSTRING = r""" # noqa: E501 - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`InternLMConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - 'The bare InternLM Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - INTERNLM_START_DOCSTRING, -) -class InternLMPreTrainedModel(PreTrainedModel): - config_class = InternLMConfig - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['InternLMDecoderLayer'] - _keys_to_ignore_on_load_unexpected = [r'decoder\.version'] - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, InternLMModel): - module.gradient_checkpointing = value - - -INTERNLM_INPUTS_DOCSTRING = r""" # noqa: E501 - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" - - -@add_start_docstrings( - 'The bare InternLM Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - INTERNLM_START_DOCSTRING, -) -class InternLMModel(InternLMPreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. - Each layer is a [`InternLMDecoderLayer`] - - Args: - config: InternLMConfig - """ - _auto_class = 'AutoModel' - - def __init__(self, config: InternLMConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, - self.padding_idx) - self.layers = nn.ModuleList([ - InternLMDecoderLayer(config) - for _ in range(config.num_hidden_layers) - ]) - self.norm = InternLMRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder. - # prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to( - inputs_embeds.device) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask + - combined_attention_mask) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions or self.config.output_attentions - output_hidden_states = (output_hidden_states - or self.config.output_hidden_states) - use_cache = use_cache or self.config.use_cache - - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both decoder_input_ids ' - 'and decoder_inputs_embeds at the same time') - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError('You have to specify either decoder_input_ids ' - 'or decoder_inputs_embeds') - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = (seq_length_with_past + - 
past_key_values_length) - - if position_ids is None: - device = (input_ids.device - if input_ids is not None else inputs_embeds.device) - position_ids = torch.arange(past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, - past_key_values_length) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient ' - 'checkpointing. Setting `use_cache=False`...') - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - past_key_value = past_key_values[ - idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += ( - layer_outputs[2 if output_attentions else 1], ) - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class InternLMForCausalLM(InternLMPreTrainedModel): - """This class extends the `InternLMPreTrainedModel` to enable causal - language modeling. - - It wraps the basic InternLM model (`InternLMModel`) and includes a linear - layer as a language model head (`lm_head`). The purpose is to predict token - probabilities, given the previous tokens in the sequence. 
- """ - _auto_class = 'AutoModelForCausalLM' - - def __init__(self, config): - super().__init__(config) - self.model = InternLMModel(config) - - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - """Get the token embedding layer.""" - return self.model.embed_tokens - - def set_input_embeddings(self, value): - """Set the token embedding layer.""" - self.model.embed_tokens = value - - def get_output_embeddings(self): - """Get the output embedding layer.""" - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - """Set the output embedding layer.""" - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - """Set the decoder model.""" - self.model = decoder - - def get_decoder(self): - """Get the decoder model.""" - return self.model - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" # noqa: E501 - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, InternLMForCausalLM - - >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you consciours? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." 
- ```""" - - output_attentions = output_attentions or self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # decoder outputs consists of - # (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): - """Prepare inputs for generating sequences using the model. - - Args: - input_ids (torch.Tensor): Input token ids. - past_key_values (list[torch.Tensor], optional): List of past key - and value states. - attention_mask (torch.Tensor, optional): Mask indicating which - tokens should be attended to. - inputs_embeds (torch.FloatTensor, optional): Optionally, - the input embeddings instead of token ids. - - Returns: - dict: Dictionary containing prepared inputs for model generation. - """ - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, - # we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - model_inputs = {'input_ids': input_ids} - - model_inputs.update({ - 'position_ids': position_ids, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Reorder cached past key-values during generation using beam search. - - This function reorders the cached past key-values according to the - given indices. It's useful in beam search where the order of hypotheses - can change from one time-step to another. 
- """ - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - def build_inputs(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = []): - """Builds the input for the model.""" - prompt = '' - for record in history: - prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}\n""" # noqa: E501 - prompt += f"""<|User|>:{query}\n<|Bot|>:""" - return tokenizer([prompt], return_tensors='pt') - - @torch.no_grad() - def chat(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - streamer: Optional[BaseStreamer] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs): - """Provides a chatting functionality for the model.""" - inputs = self.build_inputs(tokenizer, query, history) - inputs = { - k: v.to(self.device) - for k, v in inputs.items() if torch.is_tensor(v) - } - outputs = self.generate(**inputs, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs) - outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):] - response = tokenizer.decode(outputs, skip_special_tokens=True) - response = response.split('')[0] - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs): - """Return a generator in format: (response, history) Eg. - - ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) ('你好,有什么可以帮助您的吗?', [('你好', - '你好,有什么可以帮助您的吗?')]) - """ - - response_queue = queue.Queue(maxsize=20) - - class ChatStreamer(BaseStreamer): - - def __init__(self, tokenizer) -> None: - super().__init__() - self.tokenizer = tokenizer - self.queue = response_queue - self.query = query - self.history = history - self.response = '' - self.received_inputs = False - self.queue.put( - (self.response, history + [(self.query, self.response)])) - - def put(self, value): - if len(value.shape) > 1 and value.shape[0] > 1: - raise ValueError('ChatStreamer only supports batch size 1') - elif len(value.shape) > 1: - value = value[0] - - if not self.received_inputs: - # The first received value is input_ids, ignore here - self.received_inputs = True - return - - token = self.tokenizer.decode([value[-1]], - skip_special_tokens=True) - if token.strip() != '': - self.response = self.response + token - history = self.history + [(self.query, self.response)] - self.queue.put((self.response, history)) - - def end(self): - self.queue.put(None) - - def stream_producer(): - return self.chat(tokenizer=tokenizer, - query=query, - streamer=ChatStreamer(tokenizer=tokenizer), - history=history, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs) - - def consumer(): - producer = threading.Thread(target=stream_producer) - producer.start() - while True: - res = response_queue.get() - if res is None: - return - yield res - - return consumer() - - -@add_start_docstrings( - """ # noqa: E501 - The InternLM Model transformer with a sequence classification head on top (linear layer). - - [`InternLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. 
- - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - INTERNLM_START_DOCSTRING, -) -class InternLMForSequenceClassification(InternLMPreTrainedModel): - _keys_to_ignore_on_load_missing = [r'lm_head.weight'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLMModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" # noqa: E501 - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError( - 'Cannot handle batch sizes > 1 if no padding token is defined.' 
- ) - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = ( - torch.ne(input_ids, self.config.pad_token_id).sum(-1) - - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), - sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), - labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits, ) + transformer_outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/lmdeploy/pytorch/modeling/modeling_internlm2.py b/lmdeploy/pytorch/modeling/modeling_internlm2.py deleted file mode 100644 index cb61df0256..0000000000 --- a/lmdeploy/pytorch/modeling/modeling_internlm2.py +++ /dev/null @@ -1,1940 +0,0 @@ -# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. -# -# This code is based on transformers/src/transformers/models/llama/modeling_llama.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""PyTorch InternLM2 model.""" -import math -import queue -import threading -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from einops import rearrange -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache, StaticCache -from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - QuestionAnsweringModelOutput, - SequenceClassifierOutputWithPast, - TokenClassifierOutput) -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, logging, - replace_return_docstrings) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules - -try: - from transformers.generation.streamers import BaseStreamer -except Exception: - BaseStreamer = None - -from .configuration_internlm2 import InternLM2Config - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import (index_first_axis, pad_input, - unpad_input) - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = 'InternLM2Config' - - -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, - dtype=torch.int32), (1, 0)) # pylint: disable=E1102 - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -class InternLM2RMSNorm(nn.Module): - """InternLM2RMSNorm is equivalent to T5LayerNorm.""" - - def __init__(self, hidden_size, eps=1e-6): - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -ALL_LAYERNORM_LAYERS.append(InternLM2RMSNorm) - - -class InternLM2RotaryEmbedding(nn.Module): - """Rotary Position Embedding for the InternLM2 model. - - Credits to the Reddit user /u/lucidrains. 
- """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): - super().__init__() - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**(torch.arange( - 0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings - - @torch.no_grad() - def forward(self, x, position_ids): - # x: [bs, num_attention_heads, seq_len, head_size] - inv_freq_expanded = self.inv_freq[None, :, None].float().expand( - position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if isinstance( - device_type, str) and device_type != 'mps' else 'cpu' - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() - @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): - """InternLM2RotaryEmbedding extended with linear scaling. - - Credits to the Reddit user /u/kaiokendev - """ - - def forward(self, x, position_ids): - # difference to the original RoPE: a scaling factor is applied to the position ids - position_ids = position_ids.float() / self.scaling_factor - cos, sin = super().forward(x, position_ids) - return cos, sin - - -class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): - """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. - - Credits to the Reddit users /u/bloc97 and /u/emozilla - """ - - def forward(self, x, position_ids): - # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_position_embeddings: - base = self.base * ((self.scaling_factor * seq_len / - self.max_position_embeddings) - - (self.scaling_factor - 1))**(self.dim / - (self.dim - 2)) - inv_freq = 1.0 / (base**(torch.arange( - 0, self.dim, 2, dtype=torch.int64).float().to(x.device) / - self.dim)) - self.register_buffer( - 'inv_freq', inv_freq, - persistent=False) # TODO joao: this may break with compilation - - cos, sin = super().forward(x, position_ids) - return cos, sin - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): # pylint: disable=unused-argument - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. 
- unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class InternLM2MLP(nn.Module): - """MLP for InternLM2 model.""" - - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.w1 = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.w3 = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.w2 = nn.Linear(self.intermediate_size, - self.hidden_size, - bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """This is the equivalent of torch.repeat_interleave(x, dim=1, - repeats=n_rep). - - The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, - None, :, :].expand(batch, - num_key_value_heads, - n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, - head_dim) - - -class InternLM2Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, - config: InternLM2Config, - layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f'Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will ' - 'lead to errors during the forward call if caching is used. 
Please make sure to provide a `layer_idx` ' - 'when creating this class.') - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - - self.wqkv = nn.Linear( - self.hidden_size, - (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, - bias=config.bias, - ) - self.wo = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=config.bias) - - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = InternLM2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling['type'] - scaling_factor = self.config.rope_scaling['factor'] - if scaling_type == 'linear': - self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == 'dynamic': - self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f'Unknown RoPE scaling type {scaling_type}') - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, # pylint: disable=unused-argument - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - # split qkv_states by tp size - key_value_slicing = (self.num_key_value_heads * - self.head_dim) // self.config.pretraining_tp - qkv_slices = self.wqkv.weight.split(key_value_slicing, dim=0) - qkv_states = torch.cat( - [ - F.linear(hidden_states, qkv_slice) - for qkv_slice in qkv_slices - ], - dim=-1 # pylint: disable=E1102 - ) - else: - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., :self.num_key_value_groups, :] - query_states = rearrange(query_states, - 'b q h gs d -> b q (h gs) d').transpose(1, 2) - key_states = qkv_states[..., -2, :].transpose(1, 2) - value_states = qkv_states[..., -1, :].transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, 
cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // - self.config.pretraining_tp, - dim=2) - o_proj_slices = self.wo.weight.split(self.hidden_size // - self.config.pretraining_tp, - dim=1) - attn_output = sum([ - F.linear(attn_output[i], o_proj_slices[i]) # pylint: disable=E1102 - for i in range(self.config.pretraining_tp) - ]) - else: - attn_output = self.wo(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class InternLM2FlashAttention2(InternLM2Attention): - """InternLM2 flash attention module. - - This module inherits from `InternLM2Attention` as the weights of the module - stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal - with padding tokens in case the input contains any of them. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, - # that was made default for flash_attn>=2.1. This attribute is used to handle this difference. - # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) - # produces a wrong mask (top-left). 
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10( - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - if isinstance(past_key_value, StaticCache): - raise ValueError( - '`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` ' - 'make sure to use `sdpa` in the mean time, and open an issue at ' - 'https://github.com/huggingface/transformers') - - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., :self.num_key_value_groups, :] - query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') - key_states = qkv_states[..., -2, :] - value_states = qkv_states[..., -1, :] - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs) - - # TODO: These transpose are quite inefficient but Flash Attention requires the layout - # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache - # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # dropout_rate = self.attention_dropout if self.training else 0.0 - dropout_rate = 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. (InternLM2RMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, '_pre_quantization_dtype'): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.wqkv.weight.dtype - - logger.warning_once( - f'The input hidden states seems to be silently casted in float32, this might be related to' - f' the fact you have upcasted embedding or layer norm layers in float32. 
We will cast back the input in' - f' {target_dtype}.') - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward(query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate) - - attn_output = attn_output.reshape(bsz, q_len, - self.hidden_size).contiguous() - attn_output = self.wo(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value # pylint: disable=E0606 - - def _flash_attention_forward(self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. - # For details, please see the comment in InternLM2FlashAttention2 __init__. 
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, - query_length) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( # pylint: disable=E0606 - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, - query_length) # pylint: disable=E0606 - else: - attn_output = flash_attn_func( # pylint: disable=E0606 - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, - query_length): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( - attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( # pylint: disable=E0606 - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - value_layer = index_first_axis( # pylint: disable=E0606 - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - if query_length == kv_seq_len: - query_layer = index_first_axis( # pylint: disable=E0606 - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, - head_dim), indices_k) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( # pylint: disable=E0606 - query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LllamaSdpaAttention with Llama->InternLM2 -class InternLM2SdpaAttention(InternLM2Attention): - """InternLM2 attention module using - torch.nn.functional.scaled_dot_product_attention. - - This module inherits from `InternLM2Attention` as the weights of the module - stays untouched. The only changes are on the forward pass to adapt to SDPA - API. - """ - - # Adapted from InternLM2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. 
`model.config.attn_implementation = "manual"` - # once this is implemented. - logger.warning_once( - 'InternLM2Model uses InternLM2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` ' - 'does not support `output_attentions=True`. Falling back to the manual attention implementation, ' - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. ' - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - - bsz, q_len, _ = hidden_states.size() - - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., :self.num_key_value_groups, :] - query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') - key_states = qkv_states[..., -2, :] - value_states = qkv_states[..., -1, :] - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - causal_mask = attention_mask - if attention_mask is not None: - causal_mask = causal_mask[:, :, :, :key_states.shape[-2]] - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with - # custom attn_mask, Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == 'cuda' and causal_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of - # an inline conditional assignment in SDPA to support both torch.compile's dynamic shapes and full graph - # options. An inline conditional prevents dynamic shapes from compiling. - is_causal = bool(causal_mask is None and q_len > 1) - - attn_output = torch.nn.functional.scaled_dot_product_attention( # pylint: disable=E1102 - query_states, - key_states, - value_states, - attn_mask=causal_mask, - dropout_p=0.0, - is_causal=is_causal, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) - - attn_output = self.wo(attn_output) - - return attn_output, None, past_key_value - - -INTERNLM2_ATTENTION_CLASSES = { - 'eager': InternLM2Attention, - 'flash_attention_2': InternLM2FlashAttention2, - 'sdpa': InternLM2SdpaAttention, -} - - -# Modified from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->InternLM2 -class InternLM2DecoderLayer(nn.Module): - """InternLM2 Decoder Layer. 
- - This module is a single layer of the InternLM2 model. - """ - - def __init__(self, config: InternLM2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - self.layer_idx = layer_idx - - self.attention = INTERNLM2_ATTENTION_CLASSES[ - config.attn_implementation](config=config, layer_idx=layer_idx) - - self.feed_forward = InternLM2MLP(config) - self.attention_norm = InternLM2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.ffn_norm = InternLM2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.attention_norm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.attention( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.ffn_norm(hidden_states) - hidden_states = self.feed_forward(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -InternLM2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`InternLM2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 -@add_start_docstrings( - 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', - InternLM2_START_DOCSTRING, -) -class InternLM2PreTrainedModel(PreTrainedModel): - """InternLM2 pretraiend model's base class.""" - - config_class = InternLM2Config - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['InternLM2DecoderLayer'] - _skip_keys_device_placement = ['past_key_values'] - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - _supports_quantized_cache = True - _supports_static_cache = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -InternLM2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. 
If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, - this tensor is not affected by padding. It is used to update the cache in the correct position and to infer - the complete sequence length. -""" - - -# Modified from transformers.models.llama.modeling_llama.LlamaModel with Llama->InternLM2 -@add_start_docstrings( - 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', - InternLM2_START_DOCSTRING, -) -class InternLM2Model(InternLM2PreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. 
- Each layer is a [`InternLM2DecoderLayer`] - - Args: - config: InternLM2Config - """ - - _auto_class = 'AutoModel' - - def __init__(self, config: InternLM2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.config = config - - self.tok_embeddings = nn.Embedding(config.vocab_size, - config.hidden_size, - self.padding_idx) - - self.layers = nn.ModuleList([ - InternLM2DecoderLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ]) - self.norm = InternLM2RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.tok_embeddings - - def set_input_embeddings(self, value): - self.tok_embeddings = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one' - ) - - if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.' 
- ) - use_cache = False - - if inputs_embeds is None: - inputs_embeds = self.tok_embeddings(input_ids) - - return_legacy_cache = False - if use_cache and not isinstance( - past_key_values, - Cache): # kept for BC (non `Cache` `past_key_values` inputs) - return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - - if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length( - ) if past_key_values is not None else 0 - cache_position = torch.arange(past_seen_tokens, - past_seen_tokens + - inputs_embeds.shape[1], - device=inputs_embeds.device) - if position_ids is None: - position_ids = cache_position.unsqueeze(0) - - causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, - cache_position, past_key_values, - output_attentions) - - # embed positions - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - causal_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - cache_position, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[ - 2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if return_legacy_cache: - next_cache = next_cache.to_legacy_cache() - - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - def _update_causal_mask( - self, - attention_mask: torch.Tensor, - input_tensor: torch.Tensor, - cache_position: torch.Tensor, - past_key_values: Cache, - output_attentions: bool, - ): - # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length - # even when the static KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at - # each decode steps due to the dynamic shapes. (`recording cudagraph tree for symint key 13`, etc.), which is - # VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using `fullgraph=True`. - # See more context in https://github.com/huggingface/transformers/pull/29114 - - if self.config.attn_implementation == 'flash_attention_2': - if attention_mask is not None and 0.0 in attention_mask: - return attention_mask - return None - - # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in - # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail - # to infer the attention mask. 
- past_seen_tokens = past_key_values.get_seq_length( - ) if past_key_values is not None else 0 - using_static_cache = isinstance(past_key_values, StaticCache) - - # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config.attn_implementation == 'sdpa' and not using_static_cache and not output_attentions: - if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, - inputs_embeds=input_tensor, - past_key_values_length=past_seen_tokens, - is_training=self.training, - ): - return None - - dtype, device = input_tensor.dtype, input_tensor.device - min_dtype = torch.finfo(dtype).min - sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_length() - else: - target_length = (attention_mask.shape[-1] if isinstance( - attention_mask, torch.Tensor) else past_seen_tokens + - sequence_length + 1) - - if attention_mask is not None and attention_mask.dim() == 4: - # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing - if attention_mask.max() != 0: - raise ValueError( - 'Custom 4D attention mask should be passed in inverted form with max==0`' - ) - causal_mask = attention_mask - else: - causal_mask = torch.full((sequence_length, target_length), - fill_value=min_dtype, - dtype=dtype, - device=device) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange( - target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand( - input_tensor.shape[0], 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone( - ) # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, : - mask_length] + attention_mask[:, - None, - None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, : - mask_length] = causal_mask[:, :, :, : - mask_length].masked_fill( - padding_mask, - min_dtype) - if (self.config.attn_implementation == 'sdpa' - and attention_mask is not None - and attention_mask.device.type == 'cuda' - and not output_attentions): - # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when - # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
- # Details: https://github.com/pytorch/pytorch/issues/110213 - causal_mask = AttentionMaskConverter._unmask_unattended( - causal_mask, min_dtype) # pylint: disable=E1120 - - return causal_mask - - -# Modified from transformers.models.llama.modeling_llama.LlamaForCausalLM -class InternLM2ForCausalLM(InternLM2PreTrainedModel): - """Causal language model (CLM) for InternLM2.""" - - _auto_class = 'AutoModelForCausalLM' - _tied_weights_keys = ['output.weight'] - - def __init__(self, config): - super().__init__(config) - self.model = InternLM2Model(config) - self.vocab_size = config.vocab_size - self.output = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - return self.model.tok_embeddings - - def set_input_embeddings(self, value): - self.model.tok_embeddings = value - - def get_output_embeddings(self): - return self.output - - def set_output_embeddings(self, new_embeddings): - self.output = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, InternLM2ForCausalLM - - >>> model = InternLM2ForCausalLM.from_pretrained("meta-InternLM2/InternLM2-2-7b-hf") - >>> tokenizer = AutoTokenizer.from_pretrained("meta-InternLM2/InternLM2-2-7b-hf") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - ) - - hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - output_slices = self.output.weight.split( - self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [ - F.linear(hidden_states, output_slices[i]) # pylint: disable=not-callable - for i in range(self.config.pretraining_tp) - ] - logits = torch.cat(logits, dim=-1) - else: - logits = self.output(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - use_cache=True, - **kwargs, - ): - past_length = 0 - if past_key_values is not None: - if isinstance(past_key_values, Cache): - past_length = cache_position[ - 0] if cache_position is not None else past_key_values.get_seq_length( - ) - max_cache_length = (torch.tensor( - past_key_values.get_max_length(), device=input_ids.device) - if past_key_values.get_max_length() - is not None else None) - cache_length = past_length if max_cache_length is None else torch.min( - max_cache_length, past_length) - # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input) - if attention_mask is not None and attention_mask.shape[ - 1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - - past_length):] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
- - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if (max_cache_length is not None and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length): - attention_mask = attention_mask[:, -max_cache_length:] # pylint: disable=E1130 - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1]:] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise - # recompiles graphs as the stride of the inputs is a guard. - # Ref: https://github.com/huggingface/transformers/pull/29114 - # TODO: use `next_tokens` directly instead. - model_inputs = {'input_ids': input_ids.contiguous()} - - input_length = position_ids.shape[ - -1] if position_ids is not None else input_ids.shape[-1] - if cache_position is None: - cache_position = torch.arange(past_length, - past_length + input_length, - device=input_ids.device) - elif use_cache: - cache_position = cache_position[-input_length:] - - model_inputs.update({ - 'position_ids': position_ids, - 'cache_position': cache_position, - 'past_key_values': past_key_values, - 'use_cache': use_cache, - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past), ) - return reordered_past - - def build_inputs(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - meta_instruction=''): - if history is None: - history = [] - if tokenizer.add_bos_token: - prompt = '' - else: - prompt = tokenizer.bos_token - if meta_instruction: - prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" - for record in history: - prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" - prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" - return tokenizer([prompt], return_tensors='pt') - - @torch.no_grad() - def chat( - self, - tokenizer, - query: str, - history: Optional[List[Tuple[str, str]]] = None, - streamer: Optional[BaseStreamer] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - meta_instruction: - str = 'You are an AI assistant whose name is InternLM (书生·浦语).\n' - '- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory ' - '(上海人工智能实验室). 
It is designed to be helpful, honest, and harmless.\n' - '- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such ' - 'as English and 中文.', - **kwargs, - ): - if history is None: - history = [] - inputs = self.build_inputs(tokenizer, query, history, meta_instruction) - inputs = { - k: v.to(self.device) - for k, v in inputs.items() if torch.is_tensor(v) - } - # also add end-of-assistant token in eos token id to avoid unnecessary generation - eos_token_id = [ - tokenizer.eos_token_id, - tokenizer.convert_tokens_to_ids(['<|im_end|>'])[0] - ] - outputs = self.generate( - **inputs, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - eos_token_id=eos_token_id, - **kwargs, - ) - outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):] - response = tokenizer.decode(outputs, skip_special_tokens=True) - response = response.split('<|im_end|>')[0] - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat( - self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs, - ): - if history is None: - history = [] - """ - Return a generator in format: (response, history) - Eg. - ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) - ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) - """ - if BaseStreamer is None: - raise ModuleNotFoundError( - 'The version of `transformers` is too low. Please make sure ' - 'that you have installed `transformers>=4.28.0`.') - - response_queue = queue.Queue(maxsize=20) - - class ChatStreamer(BaseStreamer): - """Streamer used in generate to print words one by one.""" - - def __init__(self, tokenizer) -> None: - super().__init__() - self.tokenizer = tokenizer - self.queue = response_queue - self.query = query - self.history = history - self.response = '' - self.cache = [] - self.received_inputs = False - self.queue.put( - (self.response, history + [(self.query, self.response)])) - - def put(self, value): - if len(value.shape) > 1 and value.shape[0] > 1: - raise ValueError('ChatStreamer only supports batch size 1') - elif len(value.shape) > 1: - value = value[0] - - if not self.received_inputs: - # The first received value is input_ids, ignore here - self.received_inputs = True - return - - self.cache.extend(value.tolist()) - token = self.tokenizer.decode(self.cache, - skip_special_tokens=True) - if token.strip() != '<|im_end|>': - self.response = self.response + token - history = self.history + [(self.query, self.response)] - self.queue.put((self.response, history)) - self.cache = [] - else: - self.end() - - def end(self): - self.queue.put(None) - - def stream_producer(): - return self.chat( - tokenizer=tokenizer, - query=query, - streamer=ChatStreamer(tokenizer=tokenizer), - history=history, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs, - ) - - def consumer(): - producer = threading.Thread(target=stream_producer) - producer.start() - while True: - res = response_queue.get() - if res is None: - return - yield res - - return consumer() - - -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 -@add_start_docstrings( - """ - The InternLM2 Model transformer with a sequence classification head on top (linear layer). 
- - [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - InternLM2_START_DOCSTRING, -) -class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): - """Sequence Classification Head for InternLM2 Model.""" - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLM2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.tok_embeddings - - def set_input_embeddings(self, value): - self.model.tok_embeddings = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError( - 'Cannot handle batch sizes > 1 if no padding token is defined.' 
- ) - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq( - input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), - sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype - in (torch.long, torch.int)): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), - labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits, ) + transformer_outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaForQuestionAnswering with Llama->InternLM2 -@add_start_docstrings( - """ -The InternLM2 Model transformer with a span classification head on top for extractive question-answering tasks like -SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
- """, - InternLM2_START_DOCSTRING, -) -class InternLM2ForQuestionAnswering(InternLM2PreTrainedModel): - """Question Answering model for InternLM2.""" - - base_model_prefix = 'transformer' - - def __init__(self, config): - super().__init__(config) - self.transformer = InternLM2Model(config) - self.qa_outputs = nn.Linear(config.hidden_size, 2) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.transformer.embed_tokens - - def set_input_embeddings(self, value): - self.transformer.embed_tokens = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, - List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - end_positions: Optional[torch.LongTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1).to( - start_logits.device) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1).to(end_logits.device) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss, ) + - output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->InternLM2 -@add_start_docstrings( - """ - The InternLM2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states - output) e.g. for Named-Entity-Recognition (NER) tasks. 
- """, - InternLM2_START_DOCSTRING, -) -class InternLM2ForTokenClassification(InternLM2PreTrainedModel): - """Token classification model for InternLM2.""" - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLM2Model(config) - if getattr(config, 'classifier_dropout', None) is not None: - classifier_dropout = config.classifier_dropout - elif getattr(config, 'hidden_dropout', None) is not None: - classifier_dropout = config.hidden_dropout - else: - classifier_dropout = 0.1 - self.dropout = nn.Dropout(classifier_dropout) - self.score = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.score(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/lmdeploy/pytorch/modeling/modeling_llama.py b/lmdeploy/pytorch/modeling/modeling_llama.py deleted file mode 100644 index c37d1e9aee..0000000000 --- a/lmdeploy/pytorch/modeling/modeling_llama.py +++ /dev/null @@ -1,1297 +0,0 @@ -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast) -from transformers.modeling_utils import PreTrainedModel -from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) - -from lmdeploy.pytorch.modeling.convert_to_qmodules import convert_to_qmodules -from lmdeploy.utils import get_logger - -logger = get_logger('lmdeploy') - -_CONFIG_FOR_DOC = 'LlamaConfig' - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask(input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0): - """Make causal mask used for bi-directional self-attention.""" - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), - torch.finfo(dtype).min, - device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device), - mask - ], - dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, - tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, - dtype: torch.dtype, - tgt_len: Optional[int] = None): - """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, - src_seq_len]`.""" - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, - src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), - torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """LlamaRMSNorm is equivalent to T5LayerNorm.""" - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - """RotaryEmbedding for Llama Model. - - This module generates sine and cosine positional encodings based on - the paper "RoFormer: Enhanced Transformer with Rotary Position Embedding". - The purpose of this class is to provide positional embeddings to the - input tensors. 
It utilizes a cache mechanism to store precomputed - sine and cosine values for speedup. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. - If None, defaults to the device of the model. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, - device=self.inv_freq.device, - dtype=torch.get_default_dtype()) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - """Sets the cached sine and cosine values for the specified sequence - length. - - Args: - seq_len (int): The sequence length for which to set the cache. - device (str): The device to use for computation. - dtype (torch.dtype): The data type to be used for tensors. - """ - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :].to(dtype), - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :].to(dtype), - persistent=False) - - def forward(self, x, seq_len=None): - """Forward propagation method for the embedding layer. Generates - positional embeddings for the given input tensor. - - If the sequence length is larger than the cache, it resets the cache. - - Args: - x (torch.Tensor): Input tensor of shape - [batch_size, num_attention_heads, seq_len, head_size]. - seq_len (int, optional): Sequence length. If None, it is obtained - from `x`. - - Returns: - tuple: Tuple containing cosine and sine positional embeddings. - """ - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, - device=x.device, - dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """This class extends the `LlamaRotaryEmbedding` with linear scaling. - - It provides a mechanism for adjusting the scale of the positional - embeddings by dividing the tensor generated by the range of sequence length - with a scaling factor. This is useful when dealing with sequences of - varying lengths. - - Credits to Reddit User /u/kaiokendev for this extension. - - Args: - dim (int): The dimensionality of the embeddings. - max_position_embeddings (int, optional): The maximum number of - position embeddings. Default is 2048. - base (int, optional): The base value for the inverse frequency - calculation. Default is 10000. - device (str, optional): The device to run operations on. If None, - defaults to the device of the model. 
- scaling_factor (float, optional): Scaling factor used in adjusting - the scale of positional embeddings. Default is 1.0. - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - """Sets the cached sine and cosine values for the specified sequence - length. - - Args: - seq_len (int): The sequence length for which to set the cache. - device (str): The device to use for computation. - dtype (torch.dtype): The data type to use for tensors. - """ - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :].to(dtype), - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :].to(dtype), - persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. - - Credits to the Reddit users /u/bloc97 and /u/emozilla - """ - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ((self.scaling_factor * seq_len / - self.max_position_embeddings) - - (self.scaling_factor - 1))**(self.dim / - (self.dim - 2)) - inv_freq = 1.0 / (base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', - emb.cos()[None, None, :, :].to(dtype), - persistent=False) - self.register_buffer('sin_cached', - emb.sin()[None, None, :, :].to(dtype), - persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Apply rotary positional embeddings to query and key tensors. - - This function applies the cosine and sine positional embeddings on the - input query (q) and key (k) tensors using element-wise multiplication and - addition. - """ - # The first two dimensions of cos and sin are always 1, - # so we can `squeeze` them. 
- cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - """MLP for Llama Model.""" - - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.up_proj = nn.Linear(self.hidden_size, - self.intermediate_size, - bias=False) - self.down_proj = nn.Linear(self.intermediate_size, - self.hidden_size, - bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([ - F.linear(x, gate_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ], - dim=-1) - up_proj = torch.cat([ - F.linear(x, up_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ], - dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split( - slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj( - self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """This is the equivalent of torch.repeat_interleave(x, dim=1, - repeats=n_rep). 
- - The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, - None, :, :].expand(batch, - num_key_value_heads, - n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, - head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper.""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError('hidden_size must be divisible by num_heads ' - f'(got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') - self.q_proj = nn.Linear(self.hidden_size, - self.num_heads * self.head_dim, - bias=False) - self.k_proj = nn.Linear(self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False) - self.v_proj = nn.Linear(self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, - self.hidden_size, - bias=False) - self._init_rope() - - def _init_rope(self): - """Initialize the Rotary Embedding Module.""" - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling['type'] - scaling_factor = self.config.rope_scaling['factor'] - if scaling_type == 'linear': - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == 'dynamic': - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f'Unknown RoPE scaling type {scaling_type}') - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Forward propagation method for the attention layer.""" - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * - self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, - dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [ - 
F.linear(hidden_states, query_slices[i]) - for i in range(self.config.pretraining_tp) - ] - query_states = torch.cat(query_states, dim=-1) - - key_states = [ - F.linear(hidden_states, key_slices[i]) - for i in range(self.config.pretraining_tp) - ] - key_states = torch.cat(key_states, dim=-1) - - value_states = [ - F.linear(hidden_states, value_slices[i]) - for i in range(self.config.pretraining_tp) - ] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - 'Attention weights should be of size ' - f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}') - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError('Attention mask should be of size ' - f'{(bsz, 1, q_len, kv_seq_len)}, ' - f'but is {attention_mask.size()}') - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - '`attn_output` should be of size ' - f'{(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}') - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // - self.config.pretraining_tp, - dim=2) - o_proj_slices = self.o_proj.weight.split( - self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([ - F.linear(attn_output[i], o_proj_slices[i]) - for i in range(self.config.pretraining_tp) - ]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - """Decoder layer for Llama Model.""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - 
self.input_layernorm = LlamaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, - torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape - `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask - of size `(batch, 1, tgt_len, src_len)` where padding elements - are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all - attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are - returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached - past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (self_attn_weights, ) - - if use_cache: - outputs += (present_key_value, ) - - return outputs - - -LLAMA_START_DOCSTRING = r""" # noqa: E501 - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['LlamaDecoderLayer'] - _skip_keys_device_placement = 'past_key_values' - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" # noqa: E501 - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', # noqa: E501 - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """Transformer decoder consisting of *config.num_hidden_layers* layers. - Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, - self.padding_idx) - self.layers = nn.ModuleList([ - LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers) - ]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask # noqa - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1]).to( - inputs_embeds.device) - combined_attention_mask = (expanded_attn_mask - if combined_attention_mask is None else - expanded_attn_mask + - combined_attention_mask) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = (output_attentions if 
output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = (use_cache - if use_cache is not None else self.config.use_cache) - - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both decoder_input_ids' - 'and decoder_inputs_embeds at the same time') - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError('You have to specify either decoder_input_ids' - 'or decoder_inputs_embeds') - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = (seq_length_with_past + - past_key_values_length) - - if position_ids is None: - device = (input_ids.device - if input_ids is not None else inputs_embeds.device) - position_ids = torch.arange(past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, - past_key_values_length) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient' - ' checkpointing. 
Setting `use_cache=False`...') - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - past_key_value = past_key_values[ - idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += ( - layer_outputs[2 if output_attentions else 1], ) - - if output_attentions: - all_self_attns += (layer_outputs[1], ) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states, ) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v for v in - [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - """This class extends the `LlamaPreTrainedModel` to enable causal language - modeling. - - It wraps the basic Llama model (`LlamaModel`) and includes a linear layer - as a language model head (`lm_head`). The purpose is to predict token - probabilities, given the previous tokens in the sequence. 
- """ - _tied_weights_keys = ['lm_head.weight'] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) - - # Initialize weights and apply final processing - self.post_init() - convert_to_qmodules(self) - - def get_input_embeddings(self): - """Get the token embedding layer.""" - return self.model.embed_tokens - - def set_input_embeddings(self, value): - """Set the token embedding layer.""" - self.model.embed_tokens = value - - def get_output_embeddings(self): - """Get the output embedding layer.""" - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - """Set the output embedding layer.""" - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - """Set the decoder model.""" - self.model = decoder - - def get_decoder(self): - """Get the decoder model.""" - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" # noqa: E501 - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) # noqa: E501 - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split( - self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [ - F.linear(hidden_states, lm_head_slices[i]) - for i in range(self.config.pretraining_tp) - ] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits, ) + outputs[1:] - return (loss, ) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): - """Prepare inputs for generating sequences using the model. - - Args: - input_ids (torch.Tensor): Input token ids. - past_key_values (list[torch.Tensor], optional): List of past key - and value states. - attention_mask (torch.Tensor, optional): Mask indicating which - tokens should be attended to. - inputs_embeds (torch.FloatTensor, optional): Optionally, - the input embeddings instead of token ids. - - Returns: - dict: Dictionary containing prepared inputs for model generation. - """ - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them - # in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - model_inputs = {'input_ids': input_ids} - - model_inputs.update({ - 'position_ids': position_ids, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, - }) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Reorder cached past key-values during generation using beam search. 
- - This function reorders the cached past key-values according to the - given indices. It's useful in beam search where the order of hypotheses - can change from one time-step to another. - """ - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """ # noqa: E501 - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" # noqa: E501 - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError( - 'Cannot handle batch sizes > 1 if no padding token is defined.' 
- ) - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.eq( - input_ids, self.config.pad_token_id).long().argmax(-1) - - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), - sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), - labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits, ) + transformer_outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) From 9bfdeaece9b5c6138a46f662f090e3e7ee926ea5 Mon Sep 17 00:00:00 2001 From: tangzhiyi11 Date: Wed, 4 Dec 2024 19:48:31 +0800 Subject: [PATCH 107/122] convert kv cache to nd format in ascend graph mode (#2853) --- .../backends/dlinfer/ascend/graph_runner.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py index b69cb1dca5..f9664f13ff 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py @@ -1,15 +1,20 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings from importlib import import_module +from typing import List import torch import torch.distributed +import torch_npu from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig +from lmdeploy.pytorch.model_inputs import StepContext from lmdeploy.utils import get_logger from ...graph_runner import GraphRunner +ACL_FORMAT_ND = 2 + logger = get_logger('lmdeploy') @@ -110,3 +115,31 @@ def allocate_gpu_cache_mark_static(self): return gpu_cache setattr(cache_engine_class, func_str, allocate_gpu_cache_mark_static) + + def _convert_kv_format(self, + past_key_values: List[List[torch.Tensor]]) -> None: + """Convert key/value caches to ACL_FORMAT_ND format if needed.""" + # Check format of first KV cache + if torch_npu.get_npu_format(past_key_values[0][0]) == ACL_FORMAT_ND: + return + + # Convert all KV caches to ACL_FORMAT_ND + for layer_kv in past_key_values: + key_cache, value_cache = layer_kv + torch_npu.npu_format_cast(key_cache, ACL_FORMAT_ND) + torch_npu.npu_format_cast(value_cache, ACL_FORMAT_ND) + + def prepare_inputs_for_generation( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: torch.Tensor = None, + context: StepContext = None, + ): + """prepare inputs.""" + if self.enable_graph: + self._convert_kv_format(past_key_values) + return self.model.prepare_inputs_for_generation( + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + context=context, + ) From 866bfa53598b939970028ce6bd0be8783f90109c Mon Sep 17 00:00:00 2001 From: yaofengchen <67218893+yao-fengchen@users.noreply.github.com> Date: Fri, 6 Dec 2024 10:49:57 +0800 Subject: [PATCH 108/122] [ascend]feat: support kv int8 (#2736) * [ascend]feat: support kv int8 quant * update code * fix error of argument missing * update params * fix not iterable error when quant_meta is None. * Update ascend_en_get_started.md for kvcache quant * Update ascend_cn_get_started.md for kvcache quant --------- Co-authored-by: jinminxi104 --- docs/en/get_started/ascend/get_started.md | 6 ++ docs/zh_cn/get_started/ascend/get_started.md | 6 ++ lmdeploy/messages.py | 7 +- .../backends/dlinfer/ascend/op_backend.py | 88 ++++++++++++++++++- .../pytorch/backends/dlinfer/attention.py | 37 +++++++- lmdeploy/pytorch/config.py | 1 + lmdeploy/pytorch/engine/cache_engine.py | 8 +- lmdeploy/pytorch/engine/engine.py | 1 + lmdeploy/pytorch/engine/model_agent.py | 1 + .../pytorch/kernels/dlinfer/fill_kv_cache.py | 15 +++- .../pytorch/kernels/dlinfer/pagedattention.py | 33 ++++++- lmdeploy/pytorch/model_inputs.py | 6 ++ lmdeploy/pytorch/tools/make_inputs.py | 1 + 13 files changed, 199 insertions(+), 11 deletions(-) diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md index 23b86afa61..d104477ca1 100644 --- a/docs/en/get_started/ascend/get_started.md +++ b/docs/en/get_started/ascend/get_started.md @@ -136,3 +136,9 @@ lmdeploy lite auto_awq $HF_MODEL --work-dir $WORK_DIR --device npu ``` Please check [supported_models](../../supported_models/supported_models.md) before use this feature. + +### int8 KV-cache Quantization + +Ascend backend has supported offline int8 KV-cache Quantization on eager mode. + +Please refer this [doc](https://github.com/DeepLink-org/dlinfer/blob/main/docs/quant/ascend_kv_quant.md) for details. 
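
(Editor's note, not part of the patch: the documentation hunk above only points to the dlinfer guide. The sketch below illustrates how the feature this commit introduces might be exercised from Python. `PytorchEngineConfig`, `device_type='ascend'`, `eager_mode`, `quant_policy=8` and the `ASCEND_QUANT_RECORD_FILE` environment variable come from the changes in this patch series; the model id and record-file path are placeholders.)

```python
# Editor's sketch, assuming an Ascend environment with lmdeploy installed:
# enable offline int8 KV-cache quantization in eager mode.
import os

from lmdeploy import pipeline, PytorchEngineConfig

# Calibration scales/offsets produced as described in the dlinfer guide;
# the Ascend op backend reads this variable when quant_policy == 8.
os.environ['ASCEND_QUANT_RECORD_FILE'] = '/path/to/kv_cache_quant_record.txt'  # placeholder

backend_config = PytorchEngineConfig(device_type='ascend',
                                     eager_mode=True,
                                     quant_policy=8)
# Model id is a placeholder; any model supported on the Ascend backend applies.
pipe = pipeline('internlm/internlm2_5-7b-chat', backend_config=backend_config)
print(pipe(['Hi, please introduce yourself']))
```
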
diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md index b137c458be..9f0a7b1f90 100644 --- a/docs/zh_cn/get_started/ascend/get_started.md +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -133,3 +133,9 @@ lmdeploy lite auto_awq $HF_MODEL --work-dir $WORK_DIR --device npu ``` 支持的模型列表请参考[支持的模型](../../supported_models/supported_models.md)。 + +### int8 KV-cache 量化 + +昇腾后端现在支持了在eager模式下的离线int8 KV-cache量化。 + +详细使用方式请请参考这篇[文章](https://github.com/DeepLink-org/dlinfer/blob/main/docs/quant/ascend_kv_quant.md)。 diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 90823598ea..2336d10752 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -293,8 +293,11 @@ def __post_init__(self): assert self.device_type in [ 'cuda', 'ascend', 'maca' ], (f'invalid device_type: {self.device_type}') - if self.quant_policy > 0 and self.device_type != 'cuda': - assert False, 'kv cache quantization only works for CUDA.' + if self.quant_policy > 0 and self.device_type not in [ + 'cuda', 'ascend' + ]: + assert False, \ + 'kv cache quantization only works for CUDA and ASCEND.' class ResponseType(enum.Enum): diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index b6f544510b..588558f0d5 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -1,5 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple +import itertools +import os +import re +from pathlib import Path +from typing import Dict, Tuple import torch @@ -11,6 +15,71 @@ logger = get_logger('lmdeploy') +class AscendKVQuantMeta: + has_set_value: bool = False + quant_meta: Dict = {} + + @classmethod + def set_value(cls, device: str, dtype: torch.dtype, record_file: str, + total_layers: int): + with open(record_file, 'r') as file: + data = file.read() + scale_offset_pairs = re.findall( + r'scale:\s*([\d\.\-]+)\s*offset:\s*(-?\d+)', data) + scale_offset_pairs = [(float(scale), float(offset)) + for scale, offset in scale_offset_pairs] + k_scales, v_scales, kv_scales = [], [], [] + k_zeros, v_zeros, kv_zeros = [], [], [] + if len(scale_offset_pairs) == total_layers: + for scale, offset in scale_offset_pairs: + k_scales.append( + torch.tensor([scale], device=device, dtype=dtype)) + v_scales.append( + torch.tensor([scale], device=device, dtype=dtype)) + kv_scales.append( + torch.tensor([scale, scale], device=device, dtype=dtype)) + k_zeros.append( + torch.tensor([offset], device=device, dtype=dtype)) + v_zeros.append( + torch.tensor([offset], device=device, dtype=dtype)) + kv_zeros.append( + torch.tensor([offset, offset], device=device, dtype=dtype)) + elif len(scale_offset_pairs) == total_layers * 2: + for i in range(total_layers): + scale_k, offset_k = scale_offset_pairs[2 * i] + scale_v, offset_v = scale_offset_pairs[2 * i + 1] + k_scales.append( + torch.tensor([scale_k], device=device, dtype=dtype)) + v_scales.append( + torch.tensor([scale_v], device=device, dtype=dtype)) + kv_scales.append( + torch.tensor([scale_k, scale_v], + device=device, + dtype=dtype)) + k_zeros.append( + torch.tensor([offset_k], device=device, dtype=dtype)) + v_zeros.append( + torch.tensor([offset_v], device=device, dtype=dtype)) + kv_zeros.append( + torch.tensor([offset_k, offset_v], + device=device, + dtype=dtype)) + else: + raise ValueError( + f'num of scale_offset_pairs({len(scale_offset_pairs)}) ' + f'must match num of 
total_layers({total_layers})') + + cls.quant_meta.update({ + 'k_scales': itertools.cycle(k_scales), + 'k_zeros': itertools.cycle(k_zeros), + 'v_scales': itertools.cycle(v_scales), + 'v_zeros': itertools.cycle(v_zeros), + 'kv_scales': itertools.cycle(kv_scales), + 'kv_zeros': itertools.cycle(kv_zeros) + }) + cls.has_set_value = True + + class AscendOpsBackend(DlinferOpsBackend): """ascend layer backend.""" enable_graph = False @@ -164,6 +233,21 @@ def get_total_slots(): .repeat_interleave(step_context.q_seqlens, 0) kv_seqlens = kv_seqlens_cpu + if not cls.enable_graph and step_context.kv_quant_policy == 8: + record_file = os.getenv('ASCEND_QUANT_RECORD_FILE') + assert record_file, 'please specify valid ASCEND_QUANT_RECORD_FILE' + path = Path(record_file) + is_path = path.is_absolute() or path.is_relative_to('/') + exists = path.exists() + if not (is_path and exists): + raise ValueError( + 'please specify valid ASCEND_QUANT_RECORD_FILE') + if not AscendKVQuantMeta.has_set_value: + total_layers = len(step_context.kv_caches) + AscendKVQuantMeta.set_value(step_context.block_offsets.device, + step_context.model_config.dtype, + record_file, total_layers) + attn_meta_cls = cls.get_attention_metadata_cls() attn_metadata = attn_meta_cls( step_context.is_decoding, @@ -177,6 +261,8 @@ def get_total_slots(): is_unpaged_prefill=is_unpaged_prefill, max_q_seq_len=max_q_seq_len, max_kv_seq_len=max_kv_seq_len, + quant_policy=step_context.kv_quant_policy, + quant_meta=AscendKVQuantMeta.quant_meta, ) step_context.attn_metadata = attn_metadata diff --git a/lmdeploy/pytorch/backends/dlinfer/attention.py b/lmdeploy/pytorch/backends/dlinfer/attention.py index 0d666c9130..d1b5b619d0 100644 --- a/lmdeploy/pytorch/backends/dlinfer/attention.py +++ b/lmdeploy/pytorch/backends/dlinfer/attention.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from dataclasses import dataclass -from typing import Optional, Sequence +from typing import Dict, Optional, Sequence from torch import Tensor @@ -15,6 +15,7 @@ class DlinferAttentionMetadata(AttentionMetadata): is_unpaged_prefill: Optional[bool] = None max_q_seq_len: int = 1 max_kv_seq_len: int = 1 + quant_meta: Dict = None class DlinferAttentionImpl(AttentionImpl[DlinferAttentionMetadata]): @@ -74,10 +75,37 @@ def forward( is_unpaged_prefill = attn_metadata.is_unpaged_prefill max_q_seq_len = attn_metadata.max_q_seq_len max_kv_seq_len = attn_metadata.max_kv_seq_len + quant_bits = attn_metadata.quant_policy + if attn_metadata.quant_meta is not None: + k_scales_zeros = [ + next(attn_metadata.quant_meta['k_scales']), + next(attn_metadata.quant_meta['k_zeros']) + ] if 'k_scales' in attn_metadata.quant_meta else [] + v_scales_zeros = [ + next(attn_metadata.quant_meta['v_scales']), + next(attn_metadata.quant_meta['v_zeros']) + ] if 'v_scales' in attn_metadata.quant_meta else [] + kv_scales = next( + attn_metadata.quant_meta['kv_scales'] + ) if 'kv_scales' in attn_metadata.quant_meta else None + kv_zeros = next( + attn_metadata.quant_meta['kv_zeros'] + ) if 'kv_zeros' in attn_metadata.quant_meta else None + else: + k_scales_zeros = [] + v_scales_zeros = [] + kv_scales = None + kv_zeros = None # fill kv cache - k_cache, v_cache = self.fill_kv_cache(key, value, k_cache, v_cache, - kv_start_indices) + k_cache, v_cache = self.fill_kv_cache(key, + value, + k_cache, + v_cache, + kv_start_indices, + k_scales_zeros=k_scales_zeros, + v_scales_zeros=v_scales_zeros, + quant_bits=quant_bits) if inplace: attn_output = query[..., :self.v_head_size] @@ -103,6 +131,9 @@ def forward( block_size=block_size, attn_mask=attn_mask, is_unpaged_prefill=is_unpaged_prefill, + kv_scales=kv_scales, + kv_zeros=kv_zeros, + quant_bits=quant_bits, ) return attn_output diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py index c350f4b4cf..da2ac35c0e 100644 --- a/lmdeploy/pytorch/config.py +++ b/lmdeploy/pytorch/config.py @@ -77,6 +77,7 @@ class CacheConfig: max_prefill_token_num: int = 4096 enable_prefix_caching: bool = False quant_policy: Literal[0, 4, 8] = 0 + device_type: str = 'cuda' def __post_init__(self): """post init.""" diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py index e393adeed3..ffaeafa90e 100644 --- a/lmdeploy/pytorch/engine/cache_engine.py +++ b/lmdeploy/pytorch/engine/cache_engine.py @@ -44,7 +44,13 @@ def __init__( self.num_layers = model_config.num_layers self.kv_cache_dtype = model_config.dtype if cache_config.quant_policy > 0: - self.kv_cache_dtype = torch.uint8 + if self.cache_config.device_type in ['cuda']: + self.kv_cache_dtype = torch.uint8 + elif self.cache_config.device_type in ['ascend', 'npu']: + self.kv_cache_dtype = torch.int8 + else: + raise ValueError( + f'unsupported device_type {self.cache_config.device_type}') # Initialize the cache. 
self.local_gpu_cache = self.allocate_gpu_cache() diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index b7a803a7a7..715291a901 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -130,6 +130,7 @@ def __init__(self, max_prefill_token_num=engine_config.max_prefill_token_num, enable_prefix_caching=engine_config.enable_prefix_caching, quant_policy=engine_config.quant_policy, + device_type=engine_config.device_type, ) if not os.path.exists(model_path): diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 59d77f264a..8e47df70b5 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -139,6 +139,7 @@ def model_forward( ctx_mgr = model.ctx_mgr context = ctx_mgr.build_context( inputs=inputs, + model_config=cache_engine.model_config, world_size=world_size, kv_caches=cache_engine.gpu_cache, kv_quant_policy=cache_engine.cache_config.quant_policy, diff --git a/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py b/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py index fb2eee9d41..63564d7ed8 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence + import dlinfer.ops as ext_ops from torch import Tensor @@ -9,7 +11,16 @@ def fill_kv_cache( key_caches: Tensor, value_caches: Tensor, kv_start_indices: Tensor, + k_scales_zeros: Sequence[Optional[Tensor]], + v_scales_zeros: Sequence[Optional[Tensor]], + quant_bits: int = 0, ): """fill key/value state to cache for paged attention.""" - return ext_ops.fill_kv_cache(key_states, value_states, key_caches, - value_caches, kv_start_indices) + return ext_ops.fill_kv_cache(key_states, + value_states, + key_caches, + value_caches, + kv_start_indices, + k_scales_zeros=k_scales_zeros, + v_scales_zeros=v_scales_zeros, + quant_bits=quant_bits) diff --git a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py index 47bcb0cfff..ded85d476d 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py @@ -19,6 +19,9 @@ def prefill_attention( block_size: int, attn_mask: Sequence[Optional[Tensor]], is_unpaged_prefill: Optional[bool], + kv_scales: Optional[Tensor], + kv_zeros: Optional[Tensor], + quant_bits: Optional[int], ) -> Tensor: num_q_heads = query_states.shape[1] num_kv_heads = value_states.shape[1] @@ -53,11 +56,25 @@ def prefill_attention( num_kv_heads, attn_mask, attn_output=attn_output, + kv_scales=kv_scales, + kv_zeros=kv_zeros, + quant_bits=quant_bits, ) -def paged_token_attention(q, k_cache, v_cache, attn_output, kv_seq_len, - max_kv_seq_len, block_offsets, block_size): +def paged_token_attention( + q, + k_cache, + v_cache, + attn_output, + kv_seq_len, + max_kv_seq_len, + block_offsets, + block_size, + kv_scales: Optional[Tensor], + kv_zeros: Optional[Tensor], + quant_bits: Optional[int], +): num_q_heads, q_head_dim = q.shape[1:3] num_kv_heads = k_cache.shape[-1] // q_head_dim return ext_ops.paged_decode_attention( @@ -71,6 +88,9 @@ def paged_token_attention(q, k_cache, v_cache, attn_output, kv_seq_len, num_q_heads, num_kv_heads, attn_output=attn_output, + kv_scales=kv_scales, + kv_zeros=kv_zeros, + quant_bits=quant_bits, ) @@ -91,6 +111,9 @@ def paged_attention_fwd( block_size: int, attn_mask: Sequence[Optional[Tensor]] = (), 
is_unpaged_prefill: Optional[bool] = None, + kv_scales: Optional[Tensor] = None, + kv_zeros: Optional[Tensor] = None, + quant_bits: Optional[int] = 0, ): if not is_decoding: return prefill_attention( @@ -108,6 +131,9 @@ def paged_attention_fwd( block_size, attn_mask, is_unpaged_prefill, + kv_scales=kv_scales, + kv_zeros=kv_zeros, + quant_bits=quant_bits, ) else: return paged_token_attention( @@ -119,4 +145,7 @@ def paged_attention_fwd( max_kv_seq_len, block_offsets, block_size, + kv_scales=kv_scales, + kv_zeros=kv_zeros, + quant_bits=quant_bits, ) diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index 669625d43d..d95aa6fafc 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -6,6 +6,7 @@ import torch from lmdeploy.pytorch.backends import get_backend +from lmdeploy.pytorch.config import ModelConfig @dataclass @@ -198,6 +199,7 @@ class StepContext: dataclass provide these infos and tools. """ input_ids: torch.LongTensor + model_config: ModelConfig block_offsets: torch.LongTensor position_ids: torch.LongTensor attention_mask: torch.LongTensor @@ -224,6 +226,7 @@ class StepContext: def new( cls, inputs: ModelInputs, + model_config: ModelConfig, world_size: int = 1, kv_caches: List = None, kv_quant_policy: Literal[0, 4, 8] = 0, @@ -273,6 +276,7 @@ def new( ret = StepContext( input_ids=inputs.input_ids, + model_config=model_config, block_offsets=inputs.block_offsets, position_ids=position_ids, input_embeddings=input_embeddings, @@ -318,6 +322,7 @@ def __init__(self): @staticmethod def build_context( inputs: ModelInputs, + model_config: ModelConfig, world_size: int = 1, kv_caches: List = None, kv_quant_policy: Literal[0, 4, 8] = 0, @@ -325,6 +330,7 @@ def build_context( """build context.""" return StepContext.new( inputs, + model_config, world_size, kv_caches, kv_quant_policy, diff --git a/lmdeploy/pytorch/tools/make_inputs.py b/lmdeploy/pytorch/tools/make_inputs.py index f2d23830b7..053e7d0918 100644 --- a/lmdeploy/pytorch/tools/make_inputs.py +++ b/lmdeploy/pytorch/tools/make_inputs.py @@ -135,6 +135,7 @@ def __fill_kv_caches(kv_caches, past_key_values, block_offsets): return StepContext.new( inputs=model_inputs, + model_config=model_config, world_size=world_size, kv_caches=kv_caches, ) From 4f7e50b86a1c99b706f78efa0543ce4ccc5f5628 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Fri, 6 Dec 2024 18:39:01 +0800 Subject: [PATCH 109/122] update supported models (#2849) * update supported models * update deepseek-v2.5 * update --- README.md | 3 +++ README_ja.md | 3 +++ README_zh-CN.md | 3 +++ docs/en/supported_models/supported_models.md | 19 +++++++++++------ .../supported_models/supported_models.md | 21 ++++++++++++------- 5 files changed, 36 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index d160338aa6..8ef7b7994f 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,8 @@ For detailed inference benchmarks in more devices and more settings, please refe
   <li>Qwen1.5 (0.5B - 110B)</li>
   <li>Qwen1.5 - MoE (0.5B - 72B)</li>
   <li>Qwen2 (0.5B - 72B)</li>
+  <li>Qwen2-MoE (57BA14B)</li>
+  <li>Qwen2.5 (0.5B - 32B)</li>
   <li>Baichuan (7B)</li>
   <li>Baichuan2 (7B-13B)</li>
   <li>Code Llama (7B - 34B)</li>
@@ -136,6 +138,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
   <li>Mistral (7B)</li>
   <li>DeepSeek-MoE (16B)</li>
   <li>DeepSeek-V2 (16B, 236B)</li>
+  <li>DeepSeek-V2.5 (236B)</li>
   <li>Mixtral (8x7B, 8x22B)</li>
   <li>Gemma (2B - 7B)</li>
   <li>Dbrx (132B)</li>
diff --git a/README_ja.md b/README_ja.md
index fda176229e..77badaac36 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -122,6 +122,8 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
   <li>Qwen1.5 (0.5B - 110B)</li>
   <li>Qwen1.5 - MoE (0.5B - 72B)</li>
   <li>Qwen2 (0.5B - 72B)</li>
+  <li>Qwen2-MoE (57BA14B)</li>
+  <li>Qwen2.5 (0.5B - 32B)</li>
   <li>Baichuan (7B)</li>
   <li>Baichuan2 (7B-13B)</li>
   <li>Code Llama (7B - 34B)</li>
@@ -133,6 +135,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
   <li>Mistral (7B)</li>
   <li>DeepSeek-MoE (16B)</li>
   <li>DeepSeek-V2 (16B, 236B)</li>
+  <li>DeepSeek-V2.5 (236B)</li>
   <li>Mixtral (8x7B, 8x22B)</li>
   <li>Gemma (2B - 7B)</li>
   <li>Dbrx (132B)</li>
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 6c24b2e500..9f3cd40a64 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -126,6 +126,8 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
   <li>Qwen1.5 (0.5B - 110B)</li>
   <li>Qwen1.5 - MoE (0.5B - 72B)</li>
   <li>Qwen2 (0.5B - 72B)</li>
+  <li>Qwen2-MoE (57BA14B)</li>
+  <li>Qwen2.5 (0.5B - 32B)</li>
   <li>Baichuan (7B)</li>
   <li>Baichuan2 (7B-13B)</li>
   <li>Code Llama (7B - 34B)</li>
@@ -137,6 +139,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
   <li>Mistral (7B)</li>
   <li>DeepSeek-MoE (16B)</li>
   <li>DeepSeek-V2 (16B, 236B)</li>
+  <li>DeepSeek-V2.5 (236B)</li>
   <li>Mixtral (8x7B, 8x22B)</li>
   <li>Gemma (2B - 7B)</li>
   <li>Dbrx (132B)</li>
  • diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 469ece487f..dd8ceb4ffa 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -10,7 +10,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes\* | Yes\* | Yes | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | @@ -18,9 +18,13 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | | Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | | Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | Yes\* | Yes | +| Qwen2-MoE | 57BA14B | LLM | Yes | Yes | Yes | Yes | +| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | | Mistral | 7B | LLM | Yes | Yes | Yes | No | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | +| DeepSeek-V2 | 16B, 236B | LLM | Yes | Yes | Yes | No | +| DeepSeek-V2.5 | 236B | LLM | Yes | Yes | Yes | No | | Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | | DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | | Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | @@ -29,7 +33,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine | YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | | LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | | InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes\* | Yes\* | Yes | | ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | @@ -41,7 +45,8 @@ The following tables detail the models supported by LMDeploy's TurboMind engine "-" means not verified yet. ```{note} -The TurboMind engine doesn't support window attention. Therefore, for models that have applied window attention and have the corresponding switch "use_sliding_window" enabled, such as Mistral, Qwen1.5 and etc., please choose the PyTorch engine for inference. +* The TurboMind engine doesn't support window attention. Therefore, for models that have applied window attention and have the corresponding switch "use_sliding_window" enabled, such as Mistral, Qwen1.5 and etc., please choose the PyTorch engine for inference. +* When the head_dim of a model is not 128, such as llama3.2-1B, qwen2-0.5B and internvl2-1B, turbomind doesn't support its kv cache 4/8 bit quantization and inference ``` ## PyTorchEngine on CUDA Platform @@ -68,11 +73,13 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha | QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | | QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | +| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | +| DeepSeek-V2.5 | 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | -| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | @@ -81,7 +88,7 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha | CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - | | CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - | | LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - | | Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - | | ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - | diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index d734523282..3ec3688e1b 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -10,7 +10,7 @@ | Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | | Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | | Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.2 | 1B, 3B | LLM | Yes | Yes | Yes | Yes | +| Llama3.2 | 1B, 3B | LLM | Yes | Yes\* | Yes\* | Yes | | InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | | InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | @@ -18,9 +18,13 @@ | InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | | Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | | Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen2 | 0.5B - 72B | LLM | Yes | Yes\* | Yes\* | Yes | +| Qwen2-MoE | 57BA14B | LLM | Yes | Yes | Yes | Yes | +| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | Yes | Yes | | Mistral | 7B | LLM | Yes | Yes | Yes | No | | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes | +| DeepSeek-V2 | 16B, 236B | LLM | Yes | Yes | Yes | No | +| DeepSeek-V2.5 | 236B | LLM | Yes | Yes | Yes | No | | Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | | DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | | Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | @@ -29,7 +33,7 @@ | YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | | LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | | InternVL | v1.1 - v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes\* | Yes\* | Yes | | ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | | MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | @@ -41,7 +45,8 @@ “-” 表示还没有验证。 ```{note} -turbomind 引擎不支持 window attention。所以,对于应用了 window attention,并开启了对应的开关"use_sliding_window"的模型,比如 Mistral、Qwen1.5 等,在推理时,请选择 pytorch engine +* turbomind 引擎不支持 window 
attention。所以,对于应用了 window attention,并开启了对应的开关"use_sliding_window"的模型,比如 Mistral、Qwen1.5 等,在推理时,请选择 pytorch engine +* 当模型的 head_dim 非 128 时,turbomind 不支持它的 kv cache 4/8 bit 量化和推理。比如,llama3.2-1B,qwen2-0.5B,internvl2-1B 等等 ``` ## PyTorchEngine CUDA 平台 @@ -68,11 +73,13 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | QWen1.5 | 0.5B - 110B | LLM | Yes | Yes | Yes | Yes | Yes | | QWen1.5-MoE | A2.7B | LLM | Yes | Yes | Yes | No | No | | QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | +| Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes | | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | No | | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No | | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No | +| DeepSeek-V2.5 | 236B | LLM | Yes | No | No | No | No | | MiniCPM3 | 4B | LLM | Yes | Yes | Yes | No | No | -| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | Yes | Yes | +| MiniCPM-V-2_6 | 8B | LLM | Yes | No | No | No | Yes | | Gemma | 2B-7B | LLM | Yes | Yes | Yes | No | No | | Dbrx | 132B | LLM | Yes | Yes | Yes | No | No | | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No | @@ -81,7 +88,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - | | CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - | | LLaVA(1.5,1.6) | 7B-34B | MLLM | Yes | Yes | Yes | - | - | -| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | Yes | Yes | +| InternVL(v1.5) | 2B-26B | MLLM | Yes | Yes | Yes | No | Yes | | InternVL2 | 1B-40B | MLLM | Yes | Yes | Yes | - | - | | Mono-InternVL | 2B | MLLM | Yes\* | Yes | Yes | - | - | | ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - | @@ -94,7 +101,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - | ```{note} -* Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead. +* 目前,Mono-InternVL不支持FP16,因为数值不稳定。请改用BF16。 ``` ## PyTorchEngine 华为昇腾平台 From af0d95be0aeedfd135b3929f0377e53ef9a581f9 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Mon, 9 Dec 2024 11:39:39 +0800 Subject: [PATCH 110/122] Update dlinfer-ascend version in runtime_ascend.txt (#2865) --- requirements/runtime_ascend.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/runtime_ascend.txt b/requirements/runtime_ascend.txt index 05d74bbe72..c5d44cc995 100644 --- a/requirements/runtime_ascend.txt +++ b/requirements/runtime_ascend.txt @@ -1,5 +1,5 @@ accelerate>=0.29.3 -dlinfer-ascend>=0.1.2 +dlinfer-ascend>=0.1.3 einops fastapi fire From 14b64c769247ae082cc541233dde59f65747a714 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 9 Dec 2024 20:07:44 +0800 Subject: [PATCH 111/122] bump version to v0.6.4 (#2864) --- docs/en/get_started/installation.md | 2 +- docs/zh_cn/get_started/installation.md | 2 +- lmdeploy/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md index b3e8bb8abd..c00111c2ab 100644 --- a/docs/en/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy The default prebuilt package is compiled on **CUDA 12**. 
If CUDA 11+ (>=11.3) is required, you can install lmdeploy by: ```shell -export LMDEPLOY_VERSION=0.6.3 +export LMDEPLOY_VERSION=0.6.4 export PYTHON_VERSION=38 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md index 12562c51d5..0213fa6d15 100644 --- a/docs/zh_cn/get_started/installation.md +++ b/docs/zh_cn/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy 默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy: ```shell -export LMDEPLOY_VERSION=0.6.3 +export LMDEPLOY_VERSION=0.6.4 export PYTHON_VERSION=38 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/lmdeploy/version.py b/lmdeploy/version.py index d9f4307a78..f705fcb332 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple -__version__ = '0.6.3' +__version__ = '0.6.4' short_version = __version__ From 47fa7cf9baf8c3f68e1a2ab9c89c91d2d62bc841 Mon Sep 17 00:00:00 2001 From: Galaxy-Husky <598756381@qq.com> Date: Mon, 9 Dec 2024 21:43:07 +0800 Subject: [PATCH 112/122] Support for loading lora adapter weights in safetensors format (#2860) Co-authored-by: Ping --- lmdeploy/pytorch/models/patch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/models/patch.py b/lmdeploy/pytorch/models/patch.py index 9da1b9f4ea..a7fe4431ed 100644 --- a/lmdeploy/pytorch/models/patch.py +++ b/lmdeploy/pytorch/models/patch.py @@ -8,6 +8,7 @@ import torch from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import load_state_dict from lmdeploy.utils import get_logger @@ -295,7 +296,9 @@ def add_adapters(model: torch.nn.Module, for name, path in adapters.items(): adapter_id = adapter_id_map[name] checkpoint_path = f'{path}/adapter_model.bin' - state_dict = torch.load(checkpoint_path, map_location=device) + if not osp.exists(checkpoint_path): + checkpoint_path = f'{path}/adapter_model.safetensors' + state_dict = load_state_dict(checkpoint_path, map_location=device) if hasattr(model, 'load_lora_weights'): model.load_lora_weights(state_dict.items(), adapter_id=adapter_id) From af7157aee3503f8a5f7fb5aba1241ab9155bdf3d Mon Sep 17 00:00:00 2001 From: q yao Date: Thu, 12 Dec 2024 12:17:59 +0800 Subject: [PATCH 113/122] fix cpu cache (#2881) --- lmdeploy/pytorch/engine/cache_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py index ffaeafa90e..50723b0351 100644 --- a/lmdeploy/pytorch/engine/cache_engine.py +++ b/lmdeploy/pytorch/engine/cache_engine.py @@ -208,7 +208,7 @@ def allocate_gpu_cache(self): def allocate_cpu_cache(self): """allocate caches on Host.""" - caches = self._allocate_cache(self.num_gpu_blocks, 'cpu') + caches = self._allocate_cache(self.num_cpu_blocks, 'cpu') self.full_cpu_cache = caches self.local_cpu_cache = list(zip(*caches)) From b99a5da255631ea4d24f03bdefccf44bf89aa108 Mon Sep 17 00:00:00 2001 From: q yao Date: Thu, 12 Dec 2024 17:45:14 +0800 
Subject: [PATCH 114/122] refactor PyTorchEngine check env (#2870) * refactor checker * config builder * fix * fix * update triton * remove dockerfile update * update torch version --- lmdeploy/pytorch/check_env/__init__.py | 273 +-------------------- lmdeploy/pytorch/check_env/adapter.py | 31 +++ lmdeploy/pytorch/check_env/base.py | 62 +++++ lmdeploy/pytorch/check_env/deeplink.py | 25 ++ lmdeploy/pytorch/check_env/model.py | 117 +++++++++ lmdeploy/pytorch/check_env/torch.py | 21 ++ lmdeploy/pytorch/check_env/transformers.py | 29 +++ lmdeploy/pytorch/check_env/triton.py | 60 +++++ lmdeploy/pytorch/engine/engine.py | 78 +++--- lmdeploy/pytorch/engine/engine_checker.py | 77 ++++++ requirements/runtime.txt | 4 +- 11 files changed, 474 insertions(+), 303 deletions(-) create mode 100644 lmdeploy/pytorch/check_env/adapter.py create mode 100644 lmdeploy/pytorch/check_env/base.py create mode 100644 lmdeploy/pytorch/check_env/deeplink.py create mode 100644 lmdeploy/pytorch/check_env/model.py create mode 100644 lmdeploy/pytorch/check_env/torch.py create mode 100644 lmdeploy/pytorch/check_env/transformers.py create mode 100644 lmdeploy/pytorch/check_env/triton.py create mode 100644 lmdeploy/pytorch/engine/engine_checker.py diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index 7d72438224..bc95a32be6 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -1,277 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. -from logging import Logger -from typing import List - -from lmdeploy.utils import get_logger - - -def _handle_exception(e: Exception, - mod_name: str, - logger: Logger, - message: str = None): - red_color = '\033[31m' - reset_color = '\033[0m' - if message is None: - message = 'Please ensure it has been installed correctly.' 
- logger.debug('Exception', exc_info=1) - logger.error(f'{type(e).__name__}: {e}') - logger.error(f'{red_color}' - f'<{mod_name}> test failed!\n' - f'{message}' - f'{reset_color}') - exit(1) +from .base import BaseChecker # noqa: F401 def check_env_deeplink(device_type: str): """check Deeplink environment.""" - try_import_deeplink(device_type) + from .deeplink import DeeplinkChecker + checker = DeeplinkChecker(device_type) + checker.handle() def try_import_deeplink(device_type: str): - """import dlinfer if specific device_type is set.""" - deeplink_device_type_list = [ - 'ascend', - 'npu', - 'maca', - ] - if device_type in deeplink_device_type_list: - logger = get_logger('lmdeploy') - try: - import dlinfer.framework.lmdeploy_ext # noqa: F401 - except Exception as e: - _handle_exception(e, 'PyTorch', logger) - - -def check_env_torch(): - """check PyTorch environment.""" - logger = get_logger('lmdeploy') - - try: - logger.debug('Checking environment.') - import torch - - a = torch.tensor([1, 2], device='cuda') - b = a.new_tensor([3, 4], device='cuda') - c = a + b - torch.testing.assert_close(c, a.new_tensor([4, 6])) - except Exception as e: - _handle_exception(e, 'PyTorch', logger) - - -MAX_TRITON_VERSION = '3.0.0' - - -def check_env_triton(device: str): - """check OpenAI Triton environment.""" - from packaging import version - logger = get_logger('lmdeploy') - - msg = ( - 'Please ensure that your device is functioning properly with .\n' # noqa: E501 - 'You can verify your environment by running ' - '`python -m lmdeploy.pytorch.check_env.triton_custom_add`.') - try: - logger.debug('Checking environment.') - import torch - import triton - triton_version = version.parse(triton.__version__) - if triton_version > version.parse(MAX_TRITON_VERSION): - logger.warning( - f'Engine has not been tested on triton>{MAX_TRITON_VERSION}.') - - from .triton_custom_add import custom_add - a = torch.tensor([1, 2], device='cuda') - b = a.new_tensor([3, 4], device='cuda') - c = custom_add(a, b) - torch.testing.assert_close(c, a + b) - except RuntimeError as e: - ptxas_error = 'device kernel image is invalid' - if len(e.args) > 0 and ptxas_error in e.args[0]: - msg = ( - 'This Error might caused by mismatching between NVIDIA Driver and nvcc compiler. \n' # noqa: E501 - 'Try solution https://github.com/triton-lang/triton/issues/1955#issuecomment-1929908209' # noqa: E501 - ' or reinstall the driver.') - _handle_exception(e, 'Triton', logger, msg) - except Exception as e: - _handle_exception(e, 'Triton', logger, msg) - - if device == 'cuda': - device_cap = torch.cuda.get_device_capability() - TRITON_VER_231 = version.parse('2.3.1') - - if device_cap[0] <= 7: - if triton_version <= TRITON_VER_231: - err = RuntimeError( - 'Attention triton kernel does not fully support ' - 'triton<3.0.0 on device with capability<8. 
' - 'Please upgrade your triton version.') - _handle_exception(err, 'Triton', logger) - - -def check_env(device_type: str): - """check all environment.""" - logger = get_logger('lmdeploy') - logger.info('Checking environment for PyTorch Engine.') + """check Deeplink environment.""" check_env_deeplink(device_type) - check_env_torch() - if device_type == 'cuda': - check_env_triton('cuda') - - -MIN_TRANSFORMERS_VERSION = '4.33.0' -MAX_TRANSFORMERS_VERSION = '4.44.1' - - -def check_awq(hf_config, device_type): - """check awq support.""" - logger = get_logger('lmdeploy') - if device_type == 'cuda': - quantization_config = getattr(hf_config, 'quantization_config', dict()) - quant_method = quantization_config.get('quant_method', None) - if quant_method != 'awq': - return - try: - import awq # noqa - except Exception as e: - _handle_exception(e, 'autoawq', logger) - - try: - import awq_ext # noqa - except Exception: - logger.debug('Exception:', exc_info=1) - logger.warning('Failed to import `awq_ext`. ' - 'Try reinstall it from source: ' - 'https://github.com/casper-hansen/AutoAWQ_kernels') - - -def check_transformers_version(model_path: str, - trust_remote_code: bool = True, - dtype: str = 'auto', - device_type: str = 'cuda'): - """check transformers version.""" - from packaging import version - logger = get_logger('lmdeploy') - - def __check_transformers_version(): - """check transformers version.""" - logger.debug('Checking version.') - trans_version = None - try: - import transformers - trans_version = version.parse(transformers.__version__) - min_version = version.parse(MIN_TRANSFORMERS_VERSION) - max_version = version.parse(MAX_TRANSFORMERS_VERSION) - if trans_version < min_version or trans_version > max_version: - logger.warning('LMDeploy requires transformers version: ' - f'[{MIN_TRANSFORMERS_VERSION} ~ ' - f'{MAX_TRANSFORMERS_VERSION}], ' - 'but found version: ' - f'{transformers.__version__}') - except Exception as e: - _handle_exception(e, 'transformers', logger) - return transformers, trans_version - - def __check_config(trans_version): - """check config.""" - logger.debug('Checking AutoConfig.from_pretrained.') - try: - from transformers import AutoConfig - config = AutoConfig.from_pretrained( - model_path, trust_remote_code=trust_remote_code) - except Exception as e: - message = ( - f'Load model config with transformers=={trans_version}' - ' failed. ' - 'Please make sure model can be loaded with transformers API.') - _handle_exception(e, 'transformers', logger, message=message) - return config - - def __check_model_transformers_version(config, trans_version): - """check model transformers version.""" - logger.debug('Checking required transformers version.') - try: - model_trans_version = getattr(config, 'transformers_version', None) - if model_trans_version is not None: - model_trans_version = version.parse(model_trans_version) - assert trans_version >= model_trans_version, \ - 'Version mismatch.' 
- except Exception as e: - message = (f'model `{model_path}` requires ' - f'transformers version {model_trans_version} ' - f'but transformers {trans_version} is installed.') - _handle_exception(e, 'transformers', logger, message=message) - - def __check_model_dtype_support(config, device_type): - """Checking model dtype support.""" - logger.debug('Checking dtype support.') - - import torch - - from lmdeploy.pytorch.config import ModelConfig - from lmdeploy.utils import is_bf16_supported - - try: - model_config = ModelConfig.from_hf_config(config, - model_path=model_path, - dtype=dtype) - if model_config.dtype == torch.bfloat16: - assert is_bf16_supported(device_type), ( - 'bf16 is not supported on your device') - except AssertionError as e: - message = ( - f'Your device does not support `{model_config.dtype}`. ' - 'You can set `dtype` to float16 in PyTorchEngineConfig or ' - '`--dtype float16` to api_server.\n' - 'Note that this might have negative effect!') - _handle_exception(e, 'Model', logger, message=message) - except Exception as e: - message = (f'Checking failed with error {e}', - 'Please send issue to LMDeploy with error logs.') - _handle_exception(e, 'Model', logger, message=message) - - return model_config - - _, trans_version = __check_transformers_version() - config = __check_config(trans_version) - __check_model_transformers_version(config, trans_version) - __check_model_dtype_support(config, device_type) - check_awq(config, device_type) - - -def check_model(model_path: str, - trust_remote_code: bool = True, - dtype: str = 'auto', - device_type: str = 'cuda'): - """check model requirements.""" - logger = get_logger('lmdeploy') - logger.info('Checking model.') - check_transformers_version(model_path, trust_remote_code, dtype, - device_type) - - -def check_adapter(path: str): - """check adapter.""" - logger = get_logger('lmdeploy') - logger.debug(f'Checking : {path}.') - - try: - from peft import PeftConfig - PeftConfig.from_pretrained(path) - except Exception as e: - message = ('Please make sure the adapter can be loaded with ' - '`peft.PeftConfig.from_pretrained`\n') - err_msg = '' if len(e.args) == 0 else e.args[0] - if 'got an unexpected keyword argument' in err_msg: - message += ('Or try remove all unexpected keywords ' - 'in `adapter_config.json`.') - _handle_exception(e, 'Model', logger, message=message) - - -def check_adapters(adapter_paths: List[str]): - """check adapters.""" - if len(adapter_paths) <= 0: - return - logger = get_logger('lmdeploy') - logger.info('Checking adapters.') - for path in adapter_paths: - check_adapter(path) diff --git a/lmdeploy/pytorch/check_env/adapter.py b/lmdeploy/pytorch/check_env/adapter.py new file mode 100644 index 0000000000..bcaf5fd0e3 --- /dev/null +++ b/lmdeploy/pytorch/check_env/adapter.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base import BaseChecker + + +class AdapterChecker(BaseChecker): + """check adapter is available.""" + + def __init__(self, adapter_path: str, logger=None): + super().__init__(logger) + self.adapter_path = adapter_path + + def check(self): + """check.""" + path = self.adapter_path + + try: + import peft # noqa: F401 + except Exception as e: + self.log_and_exit(e, 'Adapter', message='Failed to import peft.') + + try: + from peft import PeftConfig + PeftConfig.from_pretrained(path) + except Exception as e: + message = ('Please make sure the adapter can be loaded with ' + '`peft.PeftConfig.from_pretrained`\n') + err_msg = '' if len(e.args) == 0 else e.args[0] + if 'got an unexpected keyword argument' in err_msg: + message += ('Or try remove all unexpected keywords ' + 'in `adapter_config.json`.') + self.log_and_exit(e, 'Adapter', message=message) diff --git a/lmdeploy/pytorch/check_env/base.py b/lmdeploy/pytorch/check_env/base.py new file mode 100644 index 0000000000..ed5e5a600f --- /dev/null +++ b/lmdeploy/pytorch/check_env/base.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from logging import Logger +from typing import List + +from lmdeploy.utils import get_logger + +RED_COLOR = '\033[31m' +RESET_COLOR = '\033[0m' + + +def _red_text(text: str): + """red text.""" + return f'{RED_COLOR}{text}{RESET_COLOR}' + + +class BaseChecker: + """base checker.""" + + def __init__(self, logger: Logger = None): + if logger is None: + logger = get_logger('lmdeploy') + self.logger = logger + self._is_passed = False + self._required_checker: List[BaseChecker] = list() + + def get_logger(self): + """get logger.""" + return self.logger + + def register_required_checker(self, checker: 'BaseChecker'): + """register_required.""" + self._required_checker.append(checker) + + def handle(self): + """handle check.""" + is_passed = getattr(self, '_is_passed', False) + if not is_passed: + checker_name = type(self).__name__ + self.logger.debug(f'Checking <{checker_name}>:') + for checker in self._required_checker: + checker.handle() + self.check() + self.is_passed = True + + def log_and_exit(self, + e: Exception = None, + mod_name: str = None, + message: str = None): + logger = self.logger + if mod_name is None: + mod_name = type(self).__name__ + if message is None: + message = 'Please check your environment.' + logger.debug('Exception', exc_info=1) + if e is not None: + logger.error(f'{type(e).__name__}: {e}') + logger.error(f'<{mod_name}> check failed!\n{_red_text(message)}') + exit(1) + + def check(self): + """check.""" + raise NotImplementedError('check not implemented.') diff --git a/lmdeploy/pytorch/check_env/deeplink.py b/lmdeploy/pytorch/check_env/deeplink.py new file mode 100644 index 0000000000..74ab5a7b87 --- /dev/null +++ b/lmdeploy/pytorch/check_env/deeplink.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base import BaseChecker + +deeplink_device_type_list = [ + 'ascend', + 'npu', + 'maca', +] + + +class DeeplinkChecker(BaseChecker): + """check pytorch is available.""" + + def __init__(self, device_type: str, logger=None) -> None: + super().__init__(logger=logger) + self.device_type = device_type + + def check(self): + """check.""" + device_type = self.device_type + if device_type in deeplink_device_type_list: + try: + import dlinfer.framework.lmdeploy_ext # noqa: F401 + except Exception as e: + self.log_and_exit(e, 'dlinfer', 'dlinfer is not available.') diff --git a/lmdeploy/pytorch/check_env/model.py b/lmdeploy/pytorch/check_env/model.py new file mode 100644 index 0000000000..4b721e50e2 --- /dev/null +++ b/lmdeploy/pytorch/check_env/model.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from packaging import version + +from .base import BaseChecker + + +class ModelChecker(BaseChecker): + """check model is available.""" + + def __init__(self, + model_path: str, + trust_remote_code: bool, + dtype: str, + device_type: str, + logger=None) -> None: + super().__init__(logger=logger) + self.model_path = model_path + self.trust_remote_code = trust_remote_code + self.device_type = device_type + self.dtype = dtype + + def check_config(self, trans_version): + """check config.""" + model_path = self.model_path + trust_remote_code = self.trust_remote_code + try: + from transformers import AutoConfig + config = AutoConfig.from_pretrained( + model_path, trust_remote_code=trust_remote_code) + except Exception as e: + message = ( + f'Load model config with transformers=={trans_version}' + ' failed. ' + 'Please make sure model can be loaded with transformers API.') + self.log_and_exit(e, 'transformers', message=message) + return config + + def check_trans_version(self, config, trans_version): + """check transformers version.""" + model_path = self.model_path + try: + model_trans_version = getattr(config, 'transformers_version', None) + if model_trans_version is not None: + model_trans_version = version.parse(model_trans_version) + assert trans_version >= model_trans_version, ( + 'Version mismatch.') + except Exception as e: + message = (f'model `{model_path}` requires ' + f'transformers version {model_trans_version} ' + f'but transformers {trans_version} is installed.') + self.log_and_exit(e, 'transformers', message=message) + + def check_dtype(self, config): + """check dtype.""" + logger = self.get_logger() + model_path = self.model_path + device_type = self.device_type + dtype = self.dtype + try: + import torch + + from lmdeploy.pytorch.config import ModelConfig + from lmdeploy.utils import is_bf16_supported + model_config = ModelConfig.from_hf_config(config, + model_path=model_path, + dtype=dtype) + if model_config.dtype == torch.bfloat16: + if not is_bf16_supported(device_type): + logger.warning('Device does not support bfloat16.') + except Exception as e: + message = (f'Checking failed with error {e}', + 'Please send issue to LMDeploy with error logs.') + self.log_and_exit(e, 'Model', message=message) + + def check_awq(self, config): + """check awq.""" + logger = self.get_logger() + device_type = self.device_type + if device_type != 'cuda': + return + + quantization_config = getattr(config, 'quantization_config', dict()) + quant_method = quantization_config.get('quant_method', None) + if quant_method != 'awq': + return + try: + import awq # noqa + except Exception as e: + self.log_and_exit(e, 'autoawq', logger) + + try: + import awq_ext # noqa + except Exception as e: + 
logger.debug('Exception:', exc_info=1) + self.log_and_exit( + e, + 'awq_ext', + message='Failed to import `awq_ext`. ' + 'Try reinstall it from source: ' + 'https://github.com/casper-hansen/AutoAWQ_kernels') + + def check(self): + """check.""" + import transformers + trans_version = version.parse(transformers.__version__) + + # config + config = self.check_config(trans_version) + + # transformers version + self.check_trans_version(config, trans_version) + + # dtype check + self.check_dtype(config) + + # awq + self.check_awq(config) diff --git a/lmdeploy/pytorch/check_env/torch.py b/lmdeploy/pytorch/check_env/torch.py new file mode 100644 index 0000000000..14b24e04a0 --- /dev/null +++ b/lmdeploy/pytorch/check_env/torch.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseChecker + + +class TorchChecker(BaseChecker): + """check pytorch is available.""" + + def __init__(self, device: str = 'cuda', logger=None) -> None: + super().__init__(logger=logger) + self.device = device + + def check(self): + """check.""" + try: + import torch + a = torch.tensor([1, 2], device=self.device) + b = a.new_tensor([3, 4], device=self.device) + c = a + b + torch.testing.assert_close(c, a.new_tensor([4, 6])) + except Exception as e: + self.log_and_exit(e, 'PyTorch', 'PyTorch is not available.') diff --git a/lmdeploy/pytorch/check_env/transformers.py b/lmdeploy/pytorch/check_env/transformers.py new file mode 100644 index 0000000000..9d97cd6dca --- /dev/null +++ b/lmdeploy/pytorch/check_env/transformers.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from packaging import version + +from .base import BaseChecker + +MIN_TRANSFORMERS_VERSION = '4.33.0' +MAX_TRANSFORMERS_VERSION = '4.46.1' + + +class TransformersChecker(BaseChecker): + """check transformers is available.""" + + def check(self): + """check.""" + import transformers + logger = self.get_logger() + try: + trans_version = version.parse(transformers.__version__) + min_version = version.parse(MIN_TRANSFORMERS_VERSION) + max_version = version.parse(MAX_TRANSFORMERS_VERSION) + if trans_version < min_version or trans_version > max_version: + logger.warning('LMDeploy requires transformers version: ' + f'[{MIN_TRANSFORMERS_VERSION} ~ ' + f'{MAX_TRANSFORMERS_VERSION}], ' + 'but found version: ' + f'{transformers.__version__}') + except Exception as e: + self.log_and_exit(e, 'transformers', + 'transformers is not available.') diff --git a/lmdeploy/pytorch/check_env/triton.py b/lmdeploy/pytorch/check_env/triton.py new file mode 100644 index 0000000000..4cc58c5492 --- /dev/null +++ b/lmdeploy/pytorch/check_env/triton.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from packaging import version + +from .base import BaseChecker + +MAX_TRITON_VERSION = '3.1.0' +MIN_TRITON_VERSION = '3.0.0' + + +class TritonChecker(BaseChecker): + """check triton is available.""" + + def check_version(self): + """check version.""" + logger = self.get_logger() + + # version check + import triton + max_version = version.parse(MAX_TRITON_VERSION) + min_version = version.parse(MIN_TRITON_VERSION) + triton_version = version.parse(triton.__version__) + + if triton_version > max_version: + logger.warning('PytorchEngine has not been tested on ' + f'triton>{MAX_TRITON_VERSION}.') + if triton_version < min_version: + msg = (f'triton>={MIN_TRITON_VERSION} is required. 
' + f'Found triton=={triton_version}') + self.log_and_exit(mod_name='Triton', message=msg) + + def check(self): + """check.""" + logger = self.get_logger() + + msg = ( + 'Please ensure that your device is functioning properly with .\n' # noqa: E501 + 'You can verify your environment by running ' + '`python -m lmdeploy.pytorch.check_env.triton_custom_add`.') + try: + logger.debug('Checking environment.') + import torch + + from .triton_custom_add import custom_add + a = torch.tensor([1, 2], device='cuda') + b = a.new_tensor([3, 4], device='cuda') + c = custom_add(a, b) + torch.testing.assert_close(c, a + b) + except RuntimeError as e: + ptxas_error = 'device kernel image is invalid' + if len(e.args) > 0 and ptxas_error in e.args[0]: + msg = ( + 'This Error might caused by mismatching between NVIDIA Driver and nvcc compiler. \n' # noqa: E501 + 'Try solution https://github.com/triton-lang/triton/issues/1955#issuecomment-1929908209' # noqa: E501 + ' or reinstall the driver.') + self.log_and_exit(e, 'Triton', msg) + except Exception as e: + self.log_and_exit(e, 'Triton', msg) + + # version check + self.check_version() diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 715291a901..b74c0f64a6 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -14,13 +14,13 @@ logging_timer) from ..adapter.adapter import AdapterManager -from ..check_env import check_adapters, check_env, check_model from ..config import BackendConfig, CacheConfig, SchedulerConfig from ..devices import DeviceContext, get_device_manager from ..messages import (InputEmbeddingRangeType, InputEmbeddingType, MessageStatus, SchedulerSequence) from ..model_inputs import ModelInputs, MRopeModelInputs, VisionModelInputs from ..paging import Scheduler +from .engine_checker import EngineChecker from .logits_process import FusedLogitsProcessor, SamplingInputs from .model_agent import build_model_agent from .request import Request, RequestManager, RequestType, Response @@ -78,6 +78,40 @@ def _check_finish(scheduler: Scheduler, current_iter: int): return False +def _build_scheduler_config(engine_config: PytorchEngineConfig): + """build scheduler config.""" + scheduler_config = SchedulerConfig( + max_batches=engine_config.max_batch_size, + max_session_len=engine_config.session_len, + prefill_interval=engine_config.prefill_interval) + return scheduler_config + + +def _build_cache_config(engine_config: PytorchEngineConfig): + """build cache config.""" + cache_config = CacheConfig( + max_batches=engine_config.max_batch_size, + block_size=engine_config.block_size, + num_cpu_blocks=engine_config.num_cpu_blocks, + num_gpu_blocks=engine_config.num_gpu_blocks, + cache_max_entry_count=engine_config.cache_max_entry_count, + max_prefill_token_num=engine_config.max_prefill_token_num, + enable_prefix_caching=engine_config.enable_prefix_caching, + quant_policy=engine_config.quant_policy, + device_type=engine_config.device_type, + ) + return cache_config + + +def _build_backend_config(engine_config: PytorchEngineConfig): + """build backend config.""" + backend_config = BackendConfig( + eager_mode=engine_config.eager_mode, + device_type=engine_config.device_type, + ) + return backend_config + + class Engine: """The inference engine of lmdeploy pytorch. 
@@ -95,44 +129,23 @@ def __init__(self, engine_config = PytorchEngineConfig() else: engine_config = copy.deepcopy(engine_config) - check_env(engine_config.device_type) - check_model(model_path, trust_remote_code, engine_config.dtype, - engine_config.device_type) if engine_config.max_batch_size is None: engine_config.max_batch_size = get_max_batch_size( engine_config.device_type) - adapters = engine_config.adapters - if adapters is not None: - check_adapters(list(adapters.values())) - assert engine_config.max_batch_size > 0, 'max_batch_size should be' \ - f' greater than 0, but got {engine_config.max_batch_size}' - assert engine_config.dtype in ['auto', 'float16', 'bfloat16'], \ - f'unsupported specified data type {engine_config.dtype}' + checker = EngineChecker(model_path=model_path, + engine_config=engine_config, + trust_remote_code=trust_remote_code, + logger=logger) + checker.handle() + + adapters = engine_config.adapters self.engine_config = engine_config self.tp = engine_config.tp self.device_context = DeviceContext( device_type=engine_config.device_type) - scheduler_config = SchedulerConfig( - max_batches=engine_config.max_batch_size, - max_session_len=engine_config.session_len, - prefill_interval=engine_config.prefill_interval) - - # block_size = 1 to enable unified paging - cache_config = CacheConfig( - max_batches=engine_config.max_batch_size, - block_size=engine_config.block_size, - num_cpu_blocks=engine_config.num_cpu_blocks, - num_gpu_blocks=engine_config.num_gpu_blocks, - cache_max_entry_count=engine_config.cache_max_entry_count, - max_prefill_token_num=engine_config.max_prefill_token_num, - enable_prefix_caching=engine_config.enable_prefix_caching, - quant_policy=engine_config.quant_policy, - device_type=engine_config.device_type, - ) - if not os.path.exists(model_path): model_path = get_model(model_path, engine_config.download_dir, engine_config.revision) @@ -141,10 +154,9 @@ def __init__(self, if adapters is not None and len(adapters) > 0: adapters = self._download_adapters(adapters, engine_config) - backend_config = BackendConfig( - eager_mode=engine_config.eager_mode, - device_type=engine_config.device_type, - ) + scheduler_config = _build_scheduler_config(engine_config) + cache_config = _build_cache_config(engine_config) + backend_config = _build_backend_config(engine_config) with get_device_manager().context(self.device_context): self.model_agent = build_model_agent( diff --git a/lmdeploy/pytorch/engine/engine_checker.py b/lmdeploy/pytorch/engine/engine_checker.py new file mode 100644 index 0000000000..1654ece4b5 --- /dev/null +++ b/lmdeploy/pytorch/engine/engine_checker.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from lmdeploy.messages import PytorchEngineConfig + +from ..check_env.adapter import AdapterChecker +from ..check_env.base import BaseChecker +from ..check_env.model import ModelChecker +from ..check_env.torch import TorchChecker +from ..check_env.transformers import TransformersChecker + + +class EngineChecker(BaseChecker): + """check transformers is available.""" + + def __init__(self, + model_path: str, + engine_config: PytorchEngineConfig, + trust_remote_code: bool = True, + logger=None): + super().__init__(logger) + logger = self.get_logger() + + self.engine_config = engine_config + + dtype = engine_config.dtype + device_type = engine_config.device_type + + # pytorch + torch_checker = TorchChecker(logger=logger) + self.register_required_checker(torch_checker) + + if device_type == 'cuda': + # triton + from ..check_env.triton import TritonChecker + triton_checker = TritonChecker(logger=logger) + triton_checker.register_required_checker(torch_checker) + self.register_required_checker(triton_checker) + else: + # deeplink + from ..check_env.deeplink import DeeplinkChecker + dl_checker = DeeplinkChecker(device_type, logger=logger) + self.register_required_checker(dl_checker) + + # transformers + + # model + trans_checker = TransformersChecker() + model_checker = ModelChecker(model_path=model_path, + trust_remote_code=trust_remote_code, + dtype=dtype, + device_type=device_type, + logger=logger) + model_checker.register_required_checker(torch_checker) + model_checker.register_required_checker(trans_checker) + self.register_required_checker(model_checker) + + # adapters + adapters = engine_config.adapters + if adapters is not None: + adapter_paths = list(adapters.values()) + for adapter in adapter_paths: + adapter_checker = AdapterChecker(adapter, logger=logger) + self.register_required_checker(adapter_checker) + + def check(self): + """check.""" + engine_config = self.engine_config + logger = self.get_logger() + + if engine_config.thread_safe: + logger.warning('thread safe mode has been deprecated and' + ' it would be removed in the future.') + + if engine_config.max_batch_size <= 0: + self.log_and_exit( + mod_name='Engine', + message='max_batch_size should be' + f' greater than 0, but got {engine_config.max_batch_size}') diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 400c492b09..a11a749424 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -15,8 +15,8 @@ safetensors sentencepiece shortuuid tiktoken -torch<=2.4.0,>=2.0.0 +torch<=2.5.1,>=2.0.0 torchvision<=0.19.0,>=0.15.0 transformers -triton>=2.2.0,<=3.0.0; sys_platform == "linux" +triton==3.0.0; sys_platform == "linux" uvicorn From 8f34eb10bb45c8b41442e8f72f31caf91104a9d2 Mon Sep 17 00:00:00 2001 From: Galaxy-Husky <598756381@qq.com> Date: Fri, 13 Dec 2024 11:26:33 +0800 Subject: [PATCH 115/122] Fix args type in docstring (#2888) * Fix args type in docstring * fix linting --- lmdeploy/serve/async_engine.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index f3c3432328..78574a38b1 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -223,9 +223,10 @@ def __call__(self, """Inference a batch of prompts. Args: - prompts (List[str] | str | List[Dict] | List[Dict]): a batch of - prompts. It accepts: string prompt, a list of string prompts, - a chat history in OpenAI format or a list of chat history. 
+ prompts (List[str] | str | List[Dict] | List[List[Dict]]]): a + batch of prompts. It accepts: string prompt, a list of string + prompts, a chat history in OpenAI format or a list of chat + history. gen_config (GenerationConfig | None): a instance of GenerationConfig. Default to None. do_preprocess (bool): whether pre-process the messages. Default to @@ -297,9 +298,10 @@ def batch_infer(self, """Inference a batch of prompts. Args: - prompts (List[str] | str | List[Dict] | List[Dict]): a batch of - prompts. It accepts: string prompt, a list of string prompts, - a chat history in OpenAI format or a list of chat history. + prompts (List[str] | str | List[Dict] | List[List[Dict]]]): a + batch of prompts. It accepts: string prompt, a list of string + prompts, a chat history in OpenAI format or a list of chat + history. gen_config (GenerationConfig | None): a instance of or a list of GenerationConfig. Default to None. do_preprocess (bool): whether pre-process the messages. Default to @@ -374,9 +376,10 @@ def stream_infer( """Inference a batch of prompts with stream mode. Args: - prompts (List[str] | str | List[Dict] | List[Dict]): a batch of - prompts. It accepts: string prompt, a list of string prompts, - a chat history in OpenAI format or a list of chat history. + prompts (List[str] | str | List[Dict] | List[List[Dict]]]):a + batch of prompts. It accepts: string prompt, a list of string + prompts, a chat history in OpenAI format or a list of chat + history. gen_config (GenerationConfig | None): a instance of or a list of GenerationConfig. Default to None. do_preprocess (bool): whether pre-process the messages. Default to From 0749ca5c64df0c663320f78d067e40f3abf40d97 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Fri, 13 Dec 2024 11:32:58 +0800 Subject: [PATCH 116/122] refine multi-backend setup.py (#2880) * refine multi-backend setup.py * fix ci requirments * add maca requirements * add maca runtime requirements * Update Dockerfile_aarch64_ascend * Update runtime_maca.txt * change env name * fix cuda requirements --- .github/workflows/pr_ete_test.yml | 4 ++-- .github/workflows/unit-test.yml | 6 ++--- docker/Dockerfile_aarch64_ascend | 2 +- requirements/runtime_ascend.txt | 3 ++- .../{runtime.txt => runtime_cuda.txt} | 0 requirements/runtime_maca.txt | 22 +++++++++++++++++ requirements.txt => requirements_cuda.txt | 2 +- requirements_maca.txt | 4 ++++ setup.py | 24 +++++++------------ 9 files changed, 43 insertions(+), 24 deletions(-) rename requirements/{runtime.txt => runtime_cuda.txt} (100%) create mode 100644 requirements/runtime_maca.txt rename requirements.txt => requirements_cuda.txt (70%) create mode 100644 requirements_maca.txt diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index 3a19ebe870..2d1c4b63f5 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -10,7 +10,7 @@ on: - "3rdparty/**" - "lmdeploy/**" - "requirements/**" - - "requirements.txt" + - "requirements_cuda.txt" - "CMakeLists.txt" - "setup.py" workflow_dispatch: @@ -68,7 +68,7 @@ jobs: export PATH=$PATH:/usr/local/openmpi/bin export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib python3 -m pip install cmake packaging wheel transformers_stream_generator transformers datasets openai einops timm decord - python3 -m pip install -r requirements.txt -r requirements/test.txt -r requirements/build.txt + python3 -m pip install -r requirements_cuda.txt -r requirements/test.txt -r requirements/build.txt mkdir -p build && cd build &&\ sh ../generate.sh 
&&\ ninja -j$(nproc) && ninja install &&\ diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index ec6db0682d..ec70f61f4c 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -10,7 +10,7 @@ on: - "3rdparty/**" - "lmdeploy/**" - "requirements/**" - - "requirements.txt" + - "requirements_cuda.txt" - "CMakeLists.txt" - "setup.py" push: @@ -24,7 +24,7 @@ on: - "3rdparty/**" - "lmdeploy/**" - "requirements/**" - - "requirements.txt" + - "requirements_cuda.txt" - "CMakeLists.txt" - "setup.py" tags: @@ -78,7 +78,7 @@ jobs: python3 -m pip install pynvml packaging protobuf transformers_stream_generator # manually install flash attn python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp38-cp38-linux_x86_64.whl - python3 -m pip install -r requirements.txt -r requirements/test.txt + python3 -m pip install -r requirements_cuda.txt -r requirements/test.txt python3 -m pip install . - name: Check env run: | diff --git a/docker/Dockerfile_aarch64_ascend b/docker/Dockerfile_aarch64_ascend index 1c9591197b..ecc2d1334e 100644 --- a/docker/Dockerfile_aarch64_ascend +++ b/docker/Dockerfile_aarch64_ascend @@ -122,4 +122,4 @@ WORKDIR /opt/lmdeploy RUN --mount=type=cache,target=/root/.cache/pip \ sed -i '/triton/d' requirements/runtime.txt && \ - pip3 install -v --no-build-isolation -e . + LMDEPLOY_TARGET_DEVICE=ascend pip3 install -v --no-build-isolation -e . diff --git a/requirements/runtime_ascend.txt b/requirements/runtime_ascend.txt index c5d44cc995..965175faf3 100644 --- a/requirements/runtime_ascend.txt +++ b/requirements/runtime_ascend.txt @@ -16,7 +16,8 @@ safetensors sentencepiece shortuuid tiktoken -torch<=2.4.0,>=2.0.0 +torch<=2.4.0,>=2.3.1 +torch-npu==2.3.1 torchvision<=0.19.0,>=0.15.0 transformers uvicorn diff --git a/requirements/runtime.txt b/requirements/runtime_cuda.txt similarity index 100% rename from requirements/runtime.txt rename to requirements/runtime_cuda.txt diff --git a/requirements/runtime_maca.txt b/requirements/runtime_maca.txt new file mode 100644 index 0000000000..f65b3827cd --- /dev/null +++ b/requirements/runtime_maca.txt @@ -0,0 +1,22 @@ +accelerate==0.32.1 +einops +fastapi +fire +mmengine-lite +numpy<2.0.0 +openai +outlines<0.1.0 +peft<=0.11.1 +pillow +protobuf +pydantic>2.0.0 +pynvml +safetensors +sentencepiece +shortuuid +tiktoken +torch<=2.4.0,>=2.0.0 +torchvision<=0.19.0,>=0.15.0 +transformers +triton>=2.1.0; sys_platform == "linux" +uvicorn diff --git a/requirements.txt b/requirements_cuda.txt similarity index 70% rename from requirements.txt rename to requirements_cuda.txt index 91d38808f1..7c1d387dfb 100644 --- a/requirements.txt +++ b/requirements_cuda.txt @@ -1,4 +1,4 @@ -r requirements/build.txt --r requirements/runtime.txt +-r requirements/runtime_cuda.txt -r requirements/lite.txt -r requirements/serve.txt diff --git a/requirements_maca.txt b/requirements_maca.txt new file mode 100644 index 0000000000..075b132c8c --- /dev/null +++ b/requirements_maca.txt @@ -0,0 +1,4 @@ +-r requirements/build.txt +-r requirements/runtime_maca.txt +-r requirements/lite.txt +-r requirements/serve.txt diff --git a/setup.py b/setup.py index 7a08ac7919..52e180d8a2 100644 --- a/setup.py +++ b/setup.py @@ -4,18 +4,14 @@ from setuptools import find_packages, setup -npu_available = False -try: - import torch_npu - - npu_available = torch_npu.npu.is_available() -except ImportError: - pass - pwd = os.path.dirname(__file__) version_file = 'lmdeploy/version.py' +def get_target_device(): + return 
os.getenv('LMDEPLOY_TARGET_DEVICE', 'cuda') + + def readme(): with open(os.path.join(pwd, 'README.md'), encoding='utf-8') as f: content = f.read() @@ -154,16 +150,12 @@ def gen_packages_items(): setup_requires=parse_requirements('requirements/build.txt'), tests_require=parse_requirements('requirements/test.txt'), install_requires=parse_requirements( - 'requirements/runtime_ascend.txt' - if npu_available else 'requirements/runtime.txt'), + f'requirements/runtime_{get_target_device()}.txt'), extras_require={ 'all': - parse_requirements('requirements_ascend.txt' - if npu_available else 'requirements.txt'), - 'lite': - parse_requirements('requirements/lite.txt'), - 'serve': - parse_requirements('requirements/serve.txt') + parse_requirements(f'requirements_{get_target_device()}.txt'), + 'lite': parse_requirements('requirements/lite.txt'), + 'serve': parse_requirements('requirements/serve.txt') }, has_ext_modules=check_ext_modules, classifiers=[ From 422b9f221079f9d6ed3973f032b5898db954a914 Mon Sep 17 00:00:00 2001 From: tangzhiyi11 Date: Fri, 13 Dec 2024 14:50:22 +0800 Subject: [PATCH 117/122] [dlinfer] fix engine checker (#2891) --- lmdeploy/pytorch/engine/engine_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/pytorch/engine/engine_checker.py b/lmdeploy/pytorch/engine/engine_checker.py index 1654ece4b5..7276a51fbc 100644 --- a/lmdeploy/pytorch/engine/engine_checker.py +++ b/lmdeploy/pytorch/engine/engine_checker.py @@ -26,7 +26,6 @@ def __init__(self, # pytorch torch_checker = TorchChecker(logger=logger) - self.register_required_checker(torch_checker) if device_type == 'cuda': # triton @@ -39,6 +38,7 @@ def __init__(self, from ..check_env.deeplink import DeeplinkChecker dl_checker = DeeplinkChecker(device_type, logger=logger) self.register_required_checker(dl_checker) + self.register_required_checker(torch_checker) # transformers From 96e82ebce580c69fcc83f0ffe8c77cd549d61d97 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Fri, 13 Dec 2024 19:05:24 +0800 Subject: [PATCH 118/122] Refactor VLM modules (#2810) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor VL modules for internvl and qwen2-vl (#2764) * qwen2-vl * internvl * qwen2 * Refactor VL modules for glm4v, deepseek-vl, llava-hf, cogvlm (#2772) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * upate internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * Refactor VL modules for qwen-vl, llava and llava_next (#2773) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * upate internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * Refactor VL modules for qwen2-vl (#2777) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * upate internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * qwen2 * Fix side-effect to internvl (#2778) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * 
deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * upate internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * qwen2 * fix internvl * Refactor VL modules for phi3-vision (#2779) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * upate internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * qwen2 * fix internvl * phi3-vision * Refactor VL modules for mllama and yi-vl (#2781) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * upate internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * qwen2 * fix internvl * phi3-vision * refactor yi-vl * refactor mllama * Refactor VLM module for minicpm and molmo (#2794) * Refactor VLM modules for xcomposer series (#2796) * Refactor VLM modules for internvl-llava (#2797) * Refactor VLM modules v2 (#2806) * internvl2 v2 * cogvlm * deepseek-vl * glm-4v * llava-hf * llava-next * llava * internvl-llava * mllama * phi3-vision * qwen * qwen2 * yi-vl * xcomposer * minicpm * molmo * update * update * Remove vl template (#2809) * Resolve conflicts (#2811) * feature: support qwen2.5 fuction_call (#2737) * feat: support qwen2.5 tools_call * fix: npe bug * fix: 模版不一致 * fix: adopting review suggestions * fix: adopting review suggestions * fix: adopting review suggestions * fix: adopting review suggestions * feat: Support multi tools calling * feat: Support multi tools calling * fix: Add '\n' between each tool * fix: Add ensure_ascii=False * bugfix: rfind * bugfix: tools_call -> tool_calls * bugfix: add toolName in tool_response * fix: some '\n' error * fix: remove toolname * fix: replace '\n' to self.separator * feat: add doc with multiple tool calling * fix:update doc * feat: add qwen2.5 prompt template test * feat: add qwen2.5 no tool call prompt test --------- Co-authored-by: gaozixiang * Update supported models & Ascend doc (#2765) * update ascend supported model list * fix markdown * fix markdown * fix lint * Update get_started.md * Update get_started.md * [CI] Split vl testcases into turbomind and pytorch backend (#2751) * updaet * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * [Feature] support minicpm-v_2_6 for pytorch engine. (#2767) * support minicpmv_2_6. * update supported_models. * update supported_models. 
* Support qwen2-vl AWQ quantization (#2787) * Support qwen2-vl AWQ quantization * Update config.yaml --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> * [dlinfer] Fix qwenvl rope error for dlinfer backend (#2795) * Optimize update_step_ctx on Ascend (#2804) * opt update_ctx for ascend * fix lint --------- Co-authored-by: 逝夜长歌 <928926035@qq.com> Co-authored-by: gaozixiang Co-authored-by: jinminxi104 Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Co-authored-by: zhoushenglong <87467364+Reinerzhou@users.noreply.github.com> Co-authored-by: AllentDan <41138331+AllentDan@users.noreply.github.com> Co-authored-by: Wei Tao <1136862851@qq.com> * PytorchEngine refactor multimodal (#2742) * WIP * support mrope * support long context * support causal=false * fix mask * flash attn bound * optimize * Moskau, Moskau, wirf die Gläser an die Wand * YMCA * optimize mllama * update processor * support cogvlm * all work and no play make jack a dull boy * upgrade triton * support qwen2vl * support internvl * phi3-v WIP * glm4v WIP * support chatglm and cogvlm * use image tokens * support llava * support internvl-mono * phi3v, mllama * add llavanext * use img token ids * support multiimage chatglm cogvlm * fix ut * minor-fix * minor-fix (#2813) * fix * fix mono * fix docs * read norm_type * super().collect_images->self.collect_images * add note in supported models * define the parameters clearly * better streaming * fix molmo * Fix vision model batch inference (#2868) * remove forward from vl models that are not supported by tm * support max_batch_size * fix * warn glm4v does not support multi images * unconst * fix deepseek-vl * fix internvl * fix llava * fix minicpm 2.6 * fix callback * fix minicpm v2.5 * fix minicpm v2.6 * update llava_next.py * remove hardcode from xcomposer2.py * rollback supported_models * change to staticmethod * fix vlm quantization * update doc * update --------- Co-authored-by: q yao --- docs/en/multi_modal/llava.md | 10 +- docs/en/multi_modal/qwen2_vl.md | 2 +- docs/zh_cn/multi_modal/llava.md | 10 +- docs/zh_cn/multi_modal/qwen2_vl.md | 2 +- lmdeploy/lite/apis/calibrate.py | 31 +- lmdeploy/pytorch/backends/attention.py | 3 + lmdeploy/pytorch/backends/base.py | 3 +- lmdeploy/pytorch/backends/cuda/attention.py | 6 + .../pytorch/backends/cuda/flash_attention.py | 101 ++ lmdeploy/pytorch/backends/cuda/op_backend.py | 51 +- .../pytorch/backends/dlinfer/attention.py | 5 + .../pytorch/backends/dlinfer/op_backend.py | 2 +- lmdeploy/pytorch/backends/flash_attention.py | 40 + lmdeploy/pytorch/backends/graph_runner.py | 23 + lmdeploy/pytorch/configurations/llava.py | 26 - lmdeploy/pytorch/engine/engine.py | 259 +++-- lmdeploy/pytorch/engine/engine_instance.py | 274 ++---- lmdeploy/pytorch/engine/input_process.py | 44 + lmdeploy/pytorch/engine/model_agent.py | 21 +- .../pytorch/kernels/cuda/flashattention.py | 34 +- .../pytorch/kernels/cuda/flatten_kv_cache.py | 4 +- lmdeploy/pytorch/messages.py | 141 ++- lmdeploy/pytorch/model_inputs.py | 176 ++-- lmdeploy/pytorch/models/chatglm2.py | 552 +++++++++-- lmdeploy/pytorch/models/cogvlm.py | 546 +++++++++-- lmdeploy/pytorch/models/internvl.py | 482 +++++++++- lmdeploy/pytorch/models/internvl_patch.py | 96 ++ lmdeploy/pytorch/models/llama.py | 19 - lmdeploy/pytorch/models/llava.py | 889 +++++++++++++++++- lmdeploy/pytorch/models/mistral.py | 19 - lmdeploy/pytorch/models/mllama.py | 849 ++++++++++++++++- lmdeploy/pytorch/models/module_map.py | 8 +- lmdeploy/pytorch/models/phi3.py 
| 4 - lmdeploy/pytorch/models/phi3_v.py | 476 ++++++++++ lmdeploy/pytorch/models/qwen2_vl.py | 549 ++++++++++- lmdeploy/pytorch/models/utils/model.py | 46 + lmdeploy/pytorch/models/utils/multimodal.py | 14 + lmdeploy/pytorch/multimodal/__init__.py | 4 + lmdeploy/pytorch/multimodal/data_type.py | 51 + lmdeploy/pytorch/multimodal/image_type.py | 15 + lmdeploy/pytorch/nn/__init__.py | 2 +- lmdeploy/pytorch/nn/attention.py | 105 ++- lmdeploy/pytorch/supported_models.py | 4 +- lmdeploy/serve/vl_async_engine.py | 313 +++--- lmdeploy/vl/engine.py | 238 ++--- lmdeploy/vl/model/base.py | 221 ++++- lmdeploy/vl/model/builder.py | 39 +- lmdeploy/vl/model/cogvlm.py | 119 +-- lmdeploy/vl/model/deepseek.py | 153 ++- lmdeploy/vl/model/glm_4v.py | 121 ++- lmdeploy/vl/model/internvl.py | 238 +++-- lmdeploy/vl/model/internvl_llava.py | 100 +- lmdeploy/vl/model/llava.py | 401 +++++--- lmdeploy/vl/model/llava_hf.py | 167 +++- lmdeploy/vl/model/llava_next.py | 222 +++-- lmdeploy/vl/model/mini_gemeni.py | 97 +- lmdeploy/vl/model/minicpmv.py | 368 +++++--- lmdeploy/vl/model/mllama.py | 312 +----- lmdeploy/vl/model/molmo.py | 258 ++--- lmdeploy/vl/model/phi3_vision.py | 204 +--- lmdeploy/vl/model/qwen.py | 121 ++- lmdeploy/vl/model/qwen2.py | 161 ++-- lmdeploy/vl/model/xcomposer2.py | 230 +++-- lmdeploy/vl/model/yi.py | 34 +- lmdeploy/vl/templates.py | 550 ----------- tests/pytorch/kernel/test_flash_attention.py | 51 +- .../{ => test_vl}/test_vl_encode.py | 0 tests/test_lmdeploy/test_vl_template.py | 132 --- 68 files changed, 7597 insertions(+), 3251 deletions(-) create mode 100644 lmdeploy/pytorch/backends/cuda/flash_attention.py create mode 100644 lmdeploy/pytorch/backends/flash_attention.py delete mode 100644 lmdeploy/pytorch/configurations/llava.py create mode 100644 lmdeploy/pytorch/engine/input_process.py create mode 100644 lmdeploy/pytorch/models/internvl_patch.py create mode 100644 lmdeploy/pytorch/models/phi3_v.py create mode 100644 lmdeploy/pytorch/models/utils/model.py create mode 100644 lmdeploy/pytorch/models/utils/multimodal.py create mode 100644 lmdeploy/pytorch/multimodal/__init__.py create mode 100644 lmdeploy/pytorch/multimodal/data_type.py create mode 100644 lmdeploy/pytorch/multimodal/image_type.py delete mode 100644 lmdeploy/vl/templates.py rename tests/test_lmdeploy/{ => test_vl}/test_vl_encode.py (100%) delete mode 100644 tests/test_lmdeploy/test_vl_template.py diff --git a/docs/en/multi_modal/llava.md b/docs/en/multi_modal/llava.md index 8f052227d5..c374b67121 100644 --- a/docs/en/multi_modal/llava.md +++ b/docs/en/multi_modal/llava.md @@ -6,11 +6,17 @@ LMDeploy supports the following llava series of models, which are detailed in th | :----------------------------------: | :--: | :------------------------: | | llava-hf/Llava-interleave-qwen-7b-hf | 7B | TurboMind, PyTorch | | llava-hf/llava-1.5-7b-hf | 7B | TurboMind, PyTorch | -| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind, PyTorch | -| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind, PyTorch | +| llava-hf/llava-v1.6-mistral-7b-hf | 7B | PyTorch | +| llava-hf/llava-v1.6-vicuna-7b-hf | 7B | PyTorch | +| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind | +| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind | The next chapter demonstrates how to deploy an Llava model using LMDeploy, with [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) as an example. +```{note} +PyTorch engine removes the support of original llava models after v0.6.4. 
Please use their corresponding transformers models instead, which can be found in https://huggingface.co/llava-hf +``` + ## Installation Please install LMDeploy by following the [installation guide](../get_started/installation.md). diff --git a/docs/en/multi_modal/qwen2_vl.md b/docs/en/multi_modal/qwen2_vl.md index 8b59f84545..fd9f02abaa 100644 --- a/docs/en/multi_modal/qwen2_vl.md +++ b/docs/en/multi_modal/qwen2_vl.md @@ -4,7 +4,7 @@ LMDeploy supports the following Qwen-VL series of models, which are detailed in | Model | Size | Supported Inference Engine | | :----------: | :----: | :------------------------: | -| Qwen-VL-Chat | - | TurboMind, Pytorch | +| Qwen-VL-Chat | - | TurboMind | | Qwen2-VL | 2B, 7B | PyTorch | The next chapter demonstrates how to deploy an Qwen-VL model using LMDeploy, with [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example. diff --git a/docs/zh_cn/multi_modal/llava.md b/docs/zh_cn/multi_modal/llava.md index c40f37308a..6538d1b861 100644 --- a/docs/zh_cn/multi_modal/llava.md +++ b/docs/zh_cn/multi_modal/llava.md @@ -6,11 +6,17 @@ LMDeploy 支持以下 LLaVA 系列模型,具体如下表所示: | :----------------------------------: | :--: | :----------------: | | llava-hf/Llava-interleave-qwen-7b-hf | 7B | TurboMind, PyTorch | | llava-hf/llava-1.5-7b-hf | 7B | TurboMind, PyTorch | -| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind, PyTorch | -| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind, PyTorch | +| llava-hf/llava-v1.6-mistral-7b-hf | 7B | PyTorch | +| llava-hf/llava-v1.6-vicuna-7b-hf | 7B | PyTorch | +| liuhaotian/llava-v1.6-vicuna-7b | 7B | TurboMind | +| liuhaotian/llava-v1.6-mistral-7b | 7B | TurboMind | 接下来的章节将演示如何使用 LMDeploy 部署 LLaVA 模型,并以 [llava-hf/llava-interleave](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf) 为例。 +```{note} +自 0.6.4 之后,PyTorch 引擎移除了对 llava 原始模型的支持。我们建议使用它们对应的 transformers 格式的模型。这些模型可以在 https://huggingface.co/llava-hf 中找到 +``` + ## 安装 请按照[安装指南](../get_started/installation.md)安装 LMDeploy。 diff --git a/docs/zh_cn/multi_modal/qwen2_vl.md b/docs/zh_cn/multi_modal/qwen2_vl.md index f62d2de74c..7cb7efe93b 100644 --- a/docs/zh_cn/multi_modal/qwen2_vl.md +++ b/docs/zh_cn/multi_modal/qwen2_vl.md @@ -4,7 +4,7 @@ LMDeploy 支持 Qwen-VL 系列模型,具体如下: | Model | Size | Supported Inference Engine | | :----------: | :----: | :------------------------: | -| Qwen-VL-Chat | - | TurboMind, Pytorch | +| Qwen-VL-Chat | - | TurboMind | | Qwen2-VL | 2B, 7B | PyTorch | 本文将以[Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)为例,演示使用 LMDeploy 部署 Qwen2-VL 系列模型的方法 diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 71f7a5900c..5f72c53ab0 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -239,20 +239,23 @@ def calibrate(model: str, model_type, _ = get_task(model) make_compatible_internvl_config(model) - if model_type == 'llm': - # Load tokenizer and configuration - tokenizer = AutoTokenizer.from_pretrained(model, - trust_remote_code=True) - - model = load_hf_from_pretrained(model, - torch_dtype=torch.float16, - trust_remote_code=True) - vl_model = None - elif model_type == 'vlm': - from lmdeploy.vl.model.builder import vl_model_with_tokenizer - vl_model, model, tokenizer = vl_model_with_tokenizer(model_path=model) - - model.config.use_cache = False + + # Load tokenizer and configuration + tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + + model = load_hf_from_pretrained(model, + torch_dtype=torch.float16, + trust_remote_code=True) + 
vl_model = None + if model_type == 'vlm': + vl_model = model + if hasattr(model, 'language_model'): + model = model.language_model + if hasattr(model, 'llm'): + model = model.llm + model.config.use_cache = False + model = model.half().eval() + model_type = type(model).__name__ if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP: raise RuntimeError( diff --git a/lmdeploy/pytorch/backends/attention.py b/lmdeploy/pytorch/backends/attention.py index 92a0befbf4..f0e60d86ac 100644 --- a/lmdeploy/pytorch/backends/attention.py +++ b/lmdeploy/pytorch/backends/attention.py @@ -34,6 +34,7 @@ def __init__( alibi: bool = None, sliding_window: int = None, logit_softcapping: float = None, + causal: bool = True, **kwargs, ) -> None: if scale is None: @@ -53,6 +54,7 @@ def __init__( self.alibi = alibi self.sliding_window = sliding_window self.logit_softcapping = logit_softcapping + self.causal = causal @abstractmethod def forward( @@ -82,6 +84,7 @@ def build( alibi: bool = False, sliding_window: int = None, logical_softcapping: float = None, + causal: bool = True, **kwargs, ) -> AttentionImpl[T]: """build.""" diff --git a/lmdeploy/pytorch/backends/base.py b/lmdeploy/pytorch/backends/base.py index ef538f7a3d..c8623666dc 100644 --- a/lmdeploy/pytorch/backends/base.py +++ b/lmdeploy/pytorch/backends/base.py @@ -12,7 +12,8 @@ class OpType(Enum): """Layer type enumerate.""" - Attention = auto() + PagedAttention = auto() + FlashAttention = auto() Linear = auto() RotaryEmbedding = auto() ApplyRotaryEmb = auto() diff --git a/lmdeploy/pytorch/backends/cuda/attention.py b/lmdeploy/pytorch/backends/cuda/attention.py index 8261b869f0..f9227497f2 100644 --- a/lmdeploy/pytorch/backends/cuda/attention.py +++ b/lmdeploy/pytorch/backends/cuda/attention.py @@ -41,6 +41,7 @@ def __init__( alibi: bool = False, sliding_window: int = None, logit_softcapping: float = None, + causal: bool = True, **kwargs, ): super().__init__( @@ -52,8 +53,10 @@ def __init__( alibi=alibi, sliding_window=sliding_window, logit_softcapping=logit_softcapping, + causal=causal, **kwargs, ) + assert not (alibi and not causal) from lmdeploy.pytorch.kernels.cuda import (alibi_paged_attention_fwd, fill_kv_cache, @@ -172,6 +175,7 @@ def forward( window_size=self.sliding_window, sm_scale=self.scale, logit_softcapping=self.logit_softcapping, + causal=self.causal, ) else: self.alibi_paged_attention_fwd( @@ -207,6 +211,7 @@ def build( alibi: bool = False, sliding_window: int = None, logical_softcapping: float = None, + causal: bool = True, **kwargs, ) -> TritonAttentionImpl: """build.""" @@ -218,4 +223,5 @@ def build( alibi=alibi, sliding_window=sliding_window, logical_softcapping=logical_softcapping, + causal=causal, **kwargs) diff --git a/lmdeploy/pytorch/backends/cuda/flash_attention.py b/lmdeploy/pytorch/backends/cuda/flash_attention.py new file mode 100644 index 0000000000..5d3925b744 --- /dev/null +++ b/lmdeploy/pytorch/backends/cuda/flash_attention.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from torch import Tensor + +from ..flash_attention import FlashAttentionBuilder, FlashAttentionImpl + + +class TritonFlashAttentionImpl(FlashAttentionImpl): + """triton flash attention implementation.""" + + def __init__( + self, + num_heads: int, + head_dim: int, + scale: float = None, + num_kv_heads: int = None, + v_head_dim: int = None, + causal: bool = True, + sliding_window: int = None, + logical_softcapping: float = None, + ): + if scale is None: + scale = 1.0 / (head_dim**0.5) + + if num_kv_heads is None: + num_kv_heads = num_heads + + if v_head_dim is None: + v_head_dim = head_dim + + self.num_heads = num_heads + self.head_dim = head_dim + self.scale = scale + self.num_kv_heads = num_kv_heads + self.v_head_dim = v_head_dim + self.causal = causal + self.sliding_window = sliding_window + self.logical_softcapping = logical_softcapping + + from lmdeploy.pytorch.kernels.cuda import flash_attention_fwd + self.flash_attention_fwd = flash_attention_fwd + + def forward(self, + query: Tensor, + key: Tensor, + value: Tensor, + q_start_loc: Tensor, + q_seqlens: Tensor, + kv_start_loc: Tensor, + kv_seqlens: Tensor, + max_q_seqlen: int = None): + """forward.""" + + q_shape = query.shape + o_shape = q_shape[:-1] + (self.v_head_dim, ) + out = query.new_empty(o_shape) + self.flash_attention_fwd( + query, + key, + value, + out, + q_start_loc=q_start_loc, + q_seqlens=q_seqlens, + kv_start_loc=kv_start_loc, + kv_seqlens=kv_seqlens, + max_seqlen=max_q_seqlen, + window_size=self.sliding_window, + sm_scale=self.scale, + logit_softcapping=self.logical_softcapping, + causal=self.causal, + kv_layout='shd', + ) + + return out + + +class TritonFlashAttentionBuilder(FlashAttentionBuilder): + """triton attention builder.""" + + @staticmethod + def build( + num_heads: int, + head_dim: int, + scale: float = None, + num_kv_heads: int = None, + v_head_dim: int = None, + causal: bool = True, + sliding_window: int = None, + logical_softcapping: float = None, + **kwargs, + ) -> FlashAttentionImpl: + """build.""" + return TritonFlashAttentionImpl( + num_heads=num_heads, + head_dim=head_dim, + scale=scale, + num_kv_heads=num_kv_heads, + v_head_dim=v_head_dim, + causal=causal, + sliding_window=sliding_window, + logical_softcapping=logical_softcapping, + ) diff --git a/lmdeploy/pytorch/backends/cuda/op_backend.py b/lmdeploy/pytorch/backends/cuda/op_backend.py index d796f8e19f..bfe89dc63d 100644 --- a/lmdeploy/pytorch/backends/cuda/op_backend.py +++ b/lmdeploy/pytorch/backends/cuda/op_backend.py @@ -23,9 +23,12 @@ def get_name() -> str: @classmethod def get_layer_impl_builder(cls, layer_type: OpType): """get cuda layer builder.""" - if layer_type == OpType.Attention: + if layer_type == OpType.PagedAttention: from .attention import TritonAttentionBuilder return TritonAttentionBuilder + elif layer_type == OpType.FlashAttention: + from .flash_attention import TritonFlashAttentionBuilder + return TritonFlashAttentionBuilder elif layer_type == OpType.ApplyRotaryEmb: from .apply_rotary_emb import TritonApplyRotaryEmbBuilder return TritonApplyRotaryEmbBuilder @@ -125,30 +128,30 @@ def update_step_context(cls, step_context): quant_policy=step_context.kv_quant_policy, ) - cross_attn_metadata = None - fill_seqlens = None - if step_context.cross_attention_states is not None: - fill_seqlens = torch.zeros_like(q_seqlens) - for idx, state in enumerate(step_context.cross_attention_states): - if state is not None: - fill_seqlens[idx] = state.shape[-2] + cross_seqlens = step_context.cross_seqlens cross_kv_seqlens = 
step_context.cross_kv_seqlens - cross_kv_start_loc = None - cross_kv_flatten_size = None - if not step_context.is_decoding and cross_kv_seqlens is not None: - cross_kv_start_loc = cross_kv_seqlens.cumsum(0) - cross_kv_seqlens - cross_kv_flatten_size = cross_kv_seqlens.sum().item() - cross_attn_metadata = attn_meta_cls( - step_context.is_decoding, - step_context.block_offsets, - q_start_loc=q_start_loc, - q_seqlens=q_seqlens, - kv_start_loc=cross_kv_start_loc, - kv_seqlens=cross_kv_seqlens, - kv_flatten_size=cross_kv_flatten_size, - fill_seqlens=fill_seqlens, - quant_policy=step_context.kv_quant_policy, - ) + cross_attn_metadata = None + if cross_seqlens is not None: + fill_seqlens = cross_seqlens + if fill_seqlens.sum().item() == 0: + fill_seqlens = None + cross_kv_start_loc = None + cross_kv_flatten_size = None + if not step_context.is_decoding and cross_kv_seqlens is not None: + cross_kv_start_loc = cross_kv_seqlens.cumsum( + 0) - cross_kv_seqlens + cross_kv_flatten_size = cross_kv_seqlens.sum().item() + cross_attn_metadata = attn_meta_cls( + step_context.is_decoding, + step_context.block_offsets, + q_start_loc=q_start_loc, + q_seqlens=q_seqlens, + kv_start_loc=cross_kv_start_loc, + kv_seqlens=cross_kv_seqlens, + kv_flatten_size=cross_kv_flatten_size, + fill_seqlens=fill_seqlens, + quant_policy=step_context.kv_quant_policy, + ) step_context.attn_metadata = attn_metadata step_context.cross_attn_metadata = cross_attn_metadata diff --git a/lmdeploy/pytorch/backends/dlinfer/attention.py b/lmdeploy/pytorch/backends/dlinfer/attention.py index d1b5b619d0..6b03403c84 100644 --- a/lmdeploy/pytorch/backends/dlinfer/attention.py +++ b/lmdeploy/pytorch/backends/dlinfer/attention.py @@ -31,8 +31,10 @@ def __init__( alibi: bool = None, sliding_window: int = None, logit_softcapping: float = None, + causal: bool = True, **kwargs, ): + assert causal super().__init__( num_heads, head_size, @@ -42,6 +44,7 @@ def __init__( alibi, sliding_window, logit_softcapping, + causal=causal, **kwargs, ) @@ -152,6 +155,7 @@ def build( alibi_scale: float = None, sliding_window: int = None, logical_softcapping: float = None, + causal: bool = True, **kwargs, ) -> DlinferAttentionImpl: """build.""" @@ -163,4 +167,5 @@ def build( alibi_scale=alibi_scale, sliding_window=sliding_window, logical_softcapping=logical_softcapping, + causal=causal, **kwargs) diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py index 52a8830595..93733fbf57 100644 --- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -22,7 +22,7 @@ def get_name() -> str: @classmethod def get_layer_impl_builder(cls, layer_type: OpType): """get dlinfer layer builder.""" - if layer_type == OpType.Attention: + if layer_type == OpType.PagedAttention: from .attention import DlinferAttentionBuilder return DlinferAttentionBuilder elif layer_type == OpType.ApplyRotaryEmb: diff --git a/lmdeploy/pytorch/backends/flash_attention.py b/lmdeploy/pytorch/backends/flash_attention.py new file mode 100644 index 0000000000..bed3af8d68 --- /dev/null +++ b/lmdeploy/pytorch/backends/flash_attention.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABC, abstractmethod + +from torch import Tensor + + +class FlashAttentionImpl(ABC): + """FlashAttention implementation.""" + + def forward(self, + query: Tensor, + key: Tensor, + value: Tensor, + q_start_loc: Tensor, + q_seqlens: Tensor, + kv_start_loc: Tensor, + kv_seqlens: Tensor, + max_q_seqlen: int = None): + """forward.""" + raise NotImplementedError + + +class FlashAttentionBuilder(ABC): + """FlashAttention implementation builder.""" + + @staticmethod + @abstractmethod + def build( + num_heads: int, + head_dim: int, + scale: float = None, + num_kv_heads: int = None, + v_head_dim: int = None, + causal: bool = True, + sliding_window: int = None, + logical_softcapping: float = None, + **kwargs, + ) -> FlashAttentionImpl: + """build.""" + raise NotImplementedError diff --git a/lmdeploy/pytorch/backends/graph_runner.py b/lmdeploy/pytorch/backends/graph_runner.py index 9ab66b26a2..9347995e0b 100644 --- a/lmdeploy/pytorch/backends/graph_runner.py +++ b/lmdeploy/pytorch/backends/graph_runner.py @@ -46,3 +46,26 @@ def prepare_inputs_for_generation( inputs_embeds, context, ) + + def update_model_metas( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: torch.Tensor = None, + context: StepContext = None, + ): + """prepare inputs.""" + if hasattr(self.model, 'update_model_metas'): + return self.model.update_model_metas( + past_key_values, + inputs_embeds, + context, + ) + + return None + + def get_input_processor(self): + """get input processor.""" + if hasattr(self.model, 'get_input_processor'): + return self.model.get_input_processor() + else: + return None diff --git a/lmdeploy/pytorch/configurations/llava.py b/lmdeploy/pytorch/configurations/llava.py deleted file mode 100644 index aaeeeeadfe..0000000000 --- a/lmdeploy/pytorch/configurations/llava.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .builder import AutoModelConfigBuilder -from .default import DefaultModelConfigBuilder - - -class LlavaModelConfigBuilder(AutoModelConfigBuilder): - - @classmethod - def condition(cls, hf_config): - """config.""" - return hf_config.architectures[0] in [ - 'LlavaLlamaForCausalLM', 'LlavaMistralForCausalLM' - ] - - @classmethod - def build(cls, hf_config, model_path: str = None): - """build.""" - arch = hf_config.architectures[0] - if arch in ['LlavaLlamaForCausalLM', 'LlavaMistralForCausalLM']: - from llava.model.language_model.llava_llama import LlavaConfig - - # reload hf_config due to model_type='llava' is already - # registered in transformers - hf_config = LlavaConfig.from_pretrained(model_path) - cfg = DefaultModelConfigBuilder.build(hf_config) - return cfg diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index b74c0f64a6..afa350330c 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -8,17 +8,15 @@ import numpy as np import torch -from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, - ResponseType) +from lmdeploy.messages import PytorchEngineConfig, ResponseType from lmdeploy.utils import (get_logger, get_max_batch_size, get_model, logging_timer) from ..adapter.adapter import AdapterManager from ..config import BackendConfig, CacheConfig, SchedulerConfig from ..devices import DeviceContext, get_device_manager -from ..messages import (InputEmbeddingRangeType, InputEmbeddingType, - MessageStatus, SchedulerSequence) -from ..model_inputs import ModelInputs, MRopeModelInputs, VisionModelInputs +from ..messages import MessageStatus, SchedulerSequence +from ..model_inputs import ModelInputs, VisionModelInputs from ..paging import Scheduler from .engine_checker import EngineChecker from .logits_process import FusedLogitsProcessor, SamplingInputs @@ -169,6 +167,8 @@ def __init__(self, dtype=engine_config.dtype, custom_module_map=engine_config.custom_module_map) + self.input_processor = self.model_agent.get_input_processor() + cache_config = self.model_agent.cache_config self.adapter_manager = self._build_adapter_manager(adapters) self.scheduler = Scheduler(scheduler_config, cache_config) @@ -184,7 +184,6 @@ def __init__(self, # create main thread self._start_loop() self._create_buffers() - self.engine_instance = self.create_instance() self._output_stream = torch.cuda.Stream() @classmethod @@ -329,6 +328,10 @@ def _on_end_session(self, reqs: Request, **kwargs): def _on_add_message(self, reqs: Request, **kwargs): """on add message callback.""" + self._msg_preprocess_inque.put_nowait(reqs) + + def _add_message(self, que): + def __update_bad_words(msg): """update bad words.""" sampling_param = msg.sampling_param @@ -350,6 +353,11 @@ def __update_max_new_tokens(msg): sampling_param.max_new_tokens, max_session_len - msg.num_all_tokens()) + if que.qsize() == 0: + return + + reqs = que.get_nowait() + for req in reqs: session_id = req.data['session_id'] if session_id not in self.scheduler.sessions: @@ -367,11 +375,8 @@ def __update_max_new_tokens(msg): sampling_param=req.data['sampling_param'], adapter_name=req.data['adapter_name'], return_logits=req.data.get('return_logits', False), + multimodals=req.data.get('input_multimodals'), input_embeddings=req.data.get('input_embeddings'), - mrope_position_ids=req.data.get('mrope_position_ids'), - mrope_position_delta=req.data.get('mrope_position_delta'), - cross_attention_states=req.data.get( - 'cross_attention_states'), ) msg = next(iter(sess.sequences.values())) 
__update_bad_words(msg) @@ -379,9 +384,11 @@ def __update_max_new_tokens(msg): self.scheduler.add_sequence(msg) else: msg = next(iter(sess.sequences.values())) - msg.update_token_ids(req.data['token_ids'], - req.data.get('input_embeddings'), - req.data.get('cross_attention_states')) + msg.update_token_ids( + req.data['token_ids'], + multimodals=req.data.get('input_multimodals'), + embeddings=req.data.get('input_embeddings'), + ) msg.num_new_tokens = 0 msg.sampling_param = req.data['sampling_param'] msg.return_logits = req.data.get('return_logits', False) @@ -427,7 +434,6 @@ def create_model_inputs(self, messages: SeqList, is_prefill: bool): seq_length = self._seq_length_buf[:batch_size] max_q_seq_length = seq_length.max().item() - # TODO: get block offsets is slow when block_size = 1 block_offsets = self.scheduler.get_block_tables(messages) block_offsets = _tensorlize_block_offsets(block_offsets) @@ -445,13 +451,7 @@ def create_model_inputs(self, messages: SeqList, is_prefill: bool): num_ignored_history = [msg.num_ignored_history for msg in messages] num_ignored_history = torch.tensor(num_ignored_history) - def __get_cogvlm_image_info(): - """Get cogvlm history image info for position ids.""" - history_image_nums = torch.LongTensor( - [msg.history_image_num for msg in messages]) - history_image_token_lengths = torch.LongTensor( - [msg.history_image_token_len for msg in messages]) - return history_image_nums, history_image_token_lengths + model_metas = [msg.model_meta for msg in messages] def __get_vlm_embeddings(): """get vlm input embeddings and indexings.""" @@ -476,25 +476,9 @@ def __get_vlm_embeddings(): return (input_embeddings, input_embedding_indexing, input_embedding_ranges) - def __get_mrope_inputs(): - """get multimodal rotary position inputs.""" - position_ids = [msg.mrope_position_ids for msg in messages] - deltas = [msg.mrope_position_delta for msg in messages] - return MRopeModelInputs(position_ids=position_ids, deltas=deltas) - # for inputs with embeddings history_image_nums = None history_image_token_lengths = None - # only for cogvlm - if self.model_config.cogvlm_style: - (history_image_nums, - history_image_token_lengths) = __get_cogvlm_image_info() - # only for qwen2_vl - mrope_inputs = None - has_mrope_params = any( - [msg.mrope_position_ids is not None for msg in messages]) - if has_mrope_params: - mrope_inputs = __get_mrope_inputs() input_embeddings = None input_embedding_indexing = None @@ -505,25 +489,40 @@ def __get_mrope_inputs(): (input_embeddings, input_embedding_indexing, input_embedding_ranges) = __get_vlm_embeddings() + input_multimodals = None + has_multimodal = any( + [not msg.history_multimodals.empty() for msg in messages]) + if has_multimodal: + has_multimodal = False + input_multimodals = [ + msg.get_input_multimodals() for msg in messages + ] + for input_mm in input_multimodals: + for val in input_mm.values(): + if len(val) > 0: + has_multimodal = True + break + if has_multimodal: + break + vision_embedding_inputs = None - if has_embedding or history_image_nums is not None: + if has_embedding or has_multimodal or history_image_nums is not None: vision_embedding_inputs = VisionModelInputs( history_lengths=history_lengths, history_image_nums=history_image_nums, history_image_token_lengths=history_image_token_lengths, input_embeddings=input_embeddings, input_embedding_indexing=input_embedding_indexing, - input_embedding_ranges=input_embedding_ranges) - - # only for mllama - cross_attention_states = None - history_cross_kv_seqlens = None - if 
any([msg.cross_attention_states is not None for msg in messages]): - cross_attention_states = [ - msg.cross_attention_states for msg in messages - ] - history_cross_kv_seqlens = torch.tensor( - [msg.history_cross_kv_seqlens for msg in messages]) + input_embedding_ranges=input_embedding_ranges, + input_multimodals=input_multimodals) + + # cross + cross_length = torch.tensor([msg.num_cross for msg in messages]) + history_cross_length = torch.tensor( + [msg.num_history_cross for msg in messages]) + if (cross_length + history_cross_length).max().item() == 0: + cross_length = None + history_cross_length = None return ModelInputs( input_ids=input_ids, @@ -534,9 +533,9 @@ def __get_mrope_inputs(): num_ignored_history=num_ignored_history, local_adapter_ids=local_adapter_ids, vision_inputs=vision_embedding_inputs, - mrope_inputs=mrope_inputs, - cross_attention_states=cross_attention_states, - history_cross_kv_seqlens=history_cross_kv_seqlens, + cross_length=cross_length, + history_cross_length=history_cross_length, + model_metas=model_metas, ) def _batch_stopping_criteria(self, token_ids: torch.Tensor, @@ -580,11 +579,15 @@ def __get_last_logits(): @logging_timer('UpdateRunning', logger) def update_running(self, running: SeqList, next_token_ids: torch.Tensor, - stopped: torch.Tensor): + stopped: torch.Tensor, model_metas: List[Dict[str, + Any]]): """update scheduler.""" + if model_metas is None: + model_metas = [None] * len(running) next_token_ids = next_token_ids.numpy() eos_token_id = self.model_config.eos_token_id - for token, msg, stop in zip(next_token_ids, running, stopped): + for token, msg, stop, model_meta in zip(next_token_ids, running, + stopped, model_metas): if msg.status != MessageStatus.RUNNING: continue update_token = token @@ -593,7 +596,7 @@ def update_running(self, running: SeqList, next_token_ids: torch.Tensor, update_token = _EMPTY_TOKEN else: msg.num_new_tokens += 1 - msg.update_token_ids(update_token) + msg.update_token_ids(update_token, model_meta=model_meta) if stop: msg.status = MessageStatus.STOPPED @@ -659,12 +662,14 @@ async def __long_context_single_forward(inputs): batch_size = seq_len.size(0) assert batch_size == 1 - new_inputs = inputs.split(max_prefill_token_num, - self.cache_config.block_size) + new_inputs = inputs.split(max_prefill_token_num) + model_metas = new_inputs[0].model_metas output_gather = _OutputGather(max_seq_len) for inp in new_inputs: + inp.model_metas = model_metas tmp_out = await __forward(inp) + model_metas = tmp_out.get('model_metas') output_gather.gather(tmp_out) tmp_out.pop('hidden_states', None) tmp_out['hidden_states'] = output_gather.get_output() @@ -686,9 +691,10 @@ async def __long_context_single_forward(inputs): ret['logits'] = logits return ret - def _make_infer_outputs(self, next_token_ids: torch.LongTensor, - logits: torch.Tensor, stopped: torch.Tensor, - event: torch.cuda.Event): + async def _make_infer_outputs(self, next_token_ids: torch.LongTensor, + logits: torch.Tensor, stopped: torch.Tensor, + model_metas: List[Dict[str, Any]], + event: torch.cuda.Event): """make infer output.""" def __get_out_token_ids(token: torch.Tensor, msg: SchedulerSequence, @@ -709,15 +715,16 @@ def __get_q_start_loc(): else: return seq_length.cumsum(0) - seq_length + while not event.query(): + await asyncio.sleep(0.001) with torch.cuda.stream(self._output_stream): - event.wait() next_token_ids = next_token_ids.cpu() stopped = stopped.cpu() running = self._running is_run = [seq.status == MessageStatus.RUNNING for seq in running] stopped = stopped.tolist() 
- self.update_running(running, next_token_ids, stopped) + self.update_running(running, next_token_ids, stopped, model_metas) # generate output next_token_ids = next_token_ids.tolist() @@ -807,13 +814,16 @@ def __update_inputs(next_token_ids): next_token_ids, sampling_inputs.stop_words, num_appendable_ids) # send output + model_metas = output.get('model_metas') finish = (idx == loop_count - 1) finish = finish or _check_finish(self.scheduler, idx) event = torch.cuda.Event() event.record() - output = (next_token_ids, logits, stopped, event) + output = (next_token_ids, logits, stopped, model_metas, event) output_que.put_nowait((finish, output)) + inputs.model_metas = model_metas + if finish: break @@ -823,6 +833,36 @@ def __update_inputs(next_token_ids): swap_out_map = dict() __update_inputs(next_token_ids) + @torch.inference_mode() + async def _async_loop_preprocess_message(self, inque, outque): + """preprocess msg.""" + while True: + reqs = await inque.get() + + for req in reqs: + req_data = req.data + if req_data.get('input_multimodals', None) is None: + continue + elif self.input_processor is None: + logger.warning('Do not support Multimodal inputs.') + continue + input_ids = req_data['token_ids'] + input_multimodals = req_data['input_multimodals'] + if len(input_multimodals) == 0: + req_data['input_multimodals'] = None + continue + result = self.input_processor.preprocess_input( + input_ids, input_multimodals) + + input_ids = result.input_ids + input_multimodals = result.input_multimodals + + req_data['token_ids'] = input_ids + req_data['input_multimodals'] = input_multimodals + + if len(reqs) > 0: + outque.put_nowait(reqs) + @torch.inference_mode() async def _async_loop_background(self, in_que: asyncio.Queue, out_que: asyncio.Queue): @@ -931,6 +971,10 @@ async def _async_loop(self): Each engine instance would communicate with the engine by queue. 
""" + + self._msg_preprocess_inque = asyncio.Queue() + self._msg_preprocess_outque = asyncio.Queue() + prefill_interval = self.scheduler_config.prefill_interval in_que = asyncio.Queue() out_que = asyncio.Queue() @@ -939,6 +983,12 @@ async def _async_loop(self): name='MainLoopBackground') loop_background.add_done_callback(_raise_exception_on_finish) + loop_msg_proc = asyncio.get_event_loop().create_task( + self._async_loop_preprocess_message(self._msg_preprocess_inque, + self._msg_preprocess_outque), + name='MainLoopPreprocessMessage') + loop_msg_proc.add_done_callback(_raise_exception_on_finish) + def __send_resp(out: InferOutput): """send response.""" resp_type = (ResponseType.FINISH @@ -970,13 +1020,14 @@ async def __step(): while not finish: if self.req_manager.has_requests(): self.req_manager.step() + self._add_message(self._msg_preprocess_outque) finish, out = await out_que.get() try: if isinstance(out, Exception): raise out - next_token_ids, logits, stopped, event = out - step_outputs = self._make_infer_outputs( - next_token_ids, logits, stopped, event) + (next_token_ids, logits, stopped, model_metas, event) = out + step_outputs = await self._make_infer_outputs( + next_token_ids, logits, stopped, model_metas, event) __send_resps(step_outputs) except Exception as e: raise e @@ -986,6 +1037,7 @@ async def __step(): while True: if self.req_manager.has_requests(): self.req_manager.step() + self._add_message(self._msg_preprocess_outque) if not self.scheduler.has_unfinished(): await asyncio.sleep(0.01) @@ -1009,78 +1061,3 @@ def create_instance(self, cuda_stream_id=0): """ from .engine_instance import EngineInstance return EngineInstance(self) - - async def async_batched_infer( - self, - session_ids: List[int], - token_ids: List[List[int]] = None, - gen_config: GenerationConfig = None, - adapter_names: List[str] = None, - keep_cache: bool = False, - input_embeddings: List[InputEmbeddingType] = None, - input_embedding_ranges: List[InputEmbeddingRangeType] = None): - """Send inference request. - - Args: - session_ids (List[int]): The session id. - token_ids (List[int]): The input token ids. - gen_config (GenerationConfig): The sampling parameters. - adapter_names (List[str]): The name of the adapters. - keep_cache (bool): Keep kv cache after infer. - - Returns: - int: Error flags. 0 if success. - List[int]: The streaming output tokens. - int: The number of the output tokens. 
- """ - return await self.engine_instance.async_batched_infer( - session_ids=session_ids, - token_ids=token_ids, - gen_config=gen_config, - adapter_names=adapter_names, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - keep_cache=keep_cache) - - def batched_infer( - self, - session_ids: List[int], - token_ids: List[List[int]] = None, - gen_config: GenerationConfig = None, - adapter_names: List[str] = None, - keep_cache: bool = False, - input_embeddings: List[InputEmbeddingType] = None, - input_embedding_ranges: List[InputEmbeddingRangeType] = None): - """batched infer.""" - return self.engine_instance.batched_infer( - session_ids=session_ids, - token_ids=token_ids, - gen_config=gen_config, - adapter_names=adapter_names, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - keep_cache=keep_cache) - - async def async_add_session(self, session_id: int): - """Add new session.""" - return await self.engine_instance._async_try_add_session(session_id) - - def add_session(self, session_id: int): - """Add new session.""" - return self.engine_instance._try_add_session(session_id) - - async def async_cancel(self, session_id: int): - """Stop the given session.""" - return await self.engine_instance.async_cancel(session_id) - - def cancel(self, session_id: int): - """Add new session.""" - return self.engine_instance.cancel(session_id) - - async def async_end(self, session_id: int): - """End the given session.""" - return await self.engine_instance.async_end(session_id) - - def end(self, session_id: int): - """Add new session.""" - return self.engine_instance.end(session_id) diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index 455ab1ccb3..dff9667eb4 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -1,16 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List +from typing import Any, Dict, List from lmdeploy.messages import EngineOutput, GenerationConfig from lmdeploy.utils import get_logger -from ..messages import (InputEmbeddingRangeType, InputEmbeddings, - InputEmbeddingType, SamplingParam) +from ..messages import SamplingParam from .engine import Engine from .request import RequestSender, RequestType, Response, ResponseType logger = get_logger('lmdeploy') +InputMultiModalType = List[Dict[str, Any]] + def _check_resp(resp: Response, state: ResponseType, warning_msg: str = None): """check if response has state.""" @@ -114,15 +115,13 @@ def _try_add_session(self, session_id: int): """ return try_add_session(self.req_sender, session_id) - async def async_stream_infer( - self, - session_id: int, - input_ids: List[int], - gen_config: GenerationConfig = None, - adapter_name: str = None, - input_embeddings: InputEmbeddingType = None, - input_embedding_ranges: InputEmbeddingRangeType = None, - **kwargs): + async def async_stream_infer(self, + session_id: int, + input_ids: List[int], + gen_config: GenerationConfig = None, + multimodal: InputMultiModalType = None, + adapter_name: str = None, + **kwargs): """Send stream inference request. 
Args: @@ -144,21 +143,13 @@ async def async_stream_infer( await self.req_sender.async_send_async( RequestType.ADD_SESSION, dict(session_id=session_id, response=False)) - input_embeddings_new: List[InputEmbeddings] = None - if input_embeddings is not None and len(input_embeddings) > 0: - assert len(input_embeddings) == len(input_embedding_ranges) - input_embeddings_new = [ - InputEmbeddings(emb, rg[0], rg[1]) - for emb, rg in zip(input_embeddings, input_embedding_ranges) - ] - msg = dict(token_ids=input_ids, - session_id=session_id, - sampling_param=sampling_param, - adapter_name=adapter_name, - input_embeddings=input_embeddings_new, - mrope_position_ids=kwargs.get('mrope_position_ids'), - mrope_position_delta=kwargs.get('mrope_position_delta'), - cross_attention_states=kwargs.get('cross_attention_states')) + msg = dict( + token_ids=input_ids, + session_id=session_id, + sampling_param=sampling_param, + adapter_name=adapter_name, + input_multimodals=multimodal, + ) req_id = await self.req_sender.async_send_async( RequestType.ADD_MESSAGE, msg) @@ -179,14 +170,12 @@ async def async_stream_infer( yield EngineOutput(resp.type, [], 0) break - async def async_infer( - self, - session_id: int, - input_ids: List[int] = None, - gen_config: GenerationConfig = None, - input_embeddings: InputEmbeddingType = None, - input_embedding_ranges: InputEmbeddingRangeType = None, - **kwargs): + async def async_infer(self, + session_id: int, + input_ids: List[int] = None, + multimodal: InputMultiModalType = None, + gen_config: GenerationConfig = None, + **kwargs): """Send inference request. Args: @@ -200,13 +189,11 @@ async def async_infer( int: The number of the output tokens. """ token_ids = [] - async for outputs in self.async_stream_infer( - session_id, - input_ids, - gen_config=gen_config, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - **kwargs): + async for outputs in self.async_stream_infer(session_id, + input_ids, + multimodal=multimodal, + gen_config=gen_config, + **kwargs): status, tmp_ids = outputs.status, outputs.token_ids if status not in [ResponseType.SUCCESS, ResponseType.FINISH]: return EngineOutput(status, token_ids, len(token_ids)) @@ -217,10 +204,9 @@ async def async_infer( def stream_infer(self, session_id: int, input_ids: List[int], + multimodal: InputMultiModalType = None, gen_config: GenerationConfig = None, adapter_name: str = None, - input_embeddings: InputEmbeddingType = None, - input_embedding_ranges: InputEmbeddingRangeType = None, **kwargs): """Send stream inference request. 
@@ -241,14 +227,12 @@ def stream_infer(self, def __call_async(): """call async.""" - coro_gen = self.async_stream_infer( - session_id, - input_ids, - gen_config, - adapter_name, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - **kwargs) + coro_gen = self.async_stream_infer(session_id, + input_ids, + multimodal=multimodal, + gen_config=gen_config, + adapter_name=adapter_name, + **kwargs) while True: try: yield self.req_sender.run_until_complete( @@ -264,19 +248,12 @@ def __call_async(): sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) self.req_sender.send_async(RequestType.ADD_SESSION, dict(session_id=session_id, response=False)) - input_embeddings_new: List[InputEmbeddings] = None - if input_embeddings is not None and len(input_embeddings) > 0: - assert len(input_embeddings) == len(input_embedding_ranges) - input_embeddings_new = [ - InputEmbeddings(emb, rg[0], rg[1]) - for emb, rg in zip(input_embeddings, input_embedding_ranges) - ] msg = dict( token_ids=input_ids, session_id=session_id, sampling_param=sampling_param, adapter_name=adapter_name, - input_embeddings=input_embeddings_new, + input_multimodals=multimodal, ) req_id = self.req_sender.send_async(RequestType.ADD_MESSAGE, msg) @@ -300,9 +277,8 @@ def __call_async(): def infer(self, session_id: int, input_ids: List[int] = None, + multimodal: InputMultiModalType = None, gen_config: GenerationConfig = None, - input_embeddings: InputEmbeddingType = None, - input_embedding_ranges: InputEmbeddingRangeType = None, **kwargs): """Send inference request. @@ -317,13 +293,11 @@ def infer(self, int: The number of the output tokens. """ token_ids = [] - for outputs in self.stream_infer( - session_id, - input_ids, - gen_config=gen_config, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - **kwargs): + for outputs in self.stream_infer(session_id, + input_ids, + multimodal=multimodal, + gen_config=gen_config, + **kwargs): status, tmp_ids = outputs.status, outputs.token_ids if status not in [ResponseType.SUCCESS, ResponseType.FINISH]: return EngineOutput(status, token_ids, len(token_ids)) @@ -331,127 +305,6 @@ def infer(self, return EngineOutput(0, token_ids, len(token_ids)) - async def async_batched_infer( - self, - session_ids: List[int], - token_ids: List[List[int]] = None, - gen_config: GenerationConfig = None, - adapter_names: List[str] = None, - keep_cache: bool = False, - input_embeddings: List[InputEmbeddingType] = None, - input_embedding_ranges: List[InputEmbeddingRangeType] = None, - ): - """Send inference request. - - Args: - session_ids (List[int]): The session id. - token_ids (List[int]): The input token ids. - gen_config (GenerationConfig): The sampling parameters. - adapter_names (List[str]): The name of the adapters. - keep_cache (bool): Keep kv cache after infer. - - Returns: - int: Error flags. 0 if success. - List[int]: The streaming output tokens. - int: The number of the output tokens. 
- """ - batch_size = len(token_ids) - assert len(session_ids) == batch_size - if adapter_names is not None: - assert len(adapter_names) == batch_size - else: - adapter_names = [None for _ in range(batch_size)] - - if input_embeddings is not None: - assert len(input_embeddings) == batch_size - assert len(input_embedding_ranges) == batch_size - else: - input_embeddings = [None] * batch_size - input_embedding_ranges = [None] * batch_size - - async def _add_sessions(session_ids): - for session_id in session_ids: - await self._async_try_add_session(session_id) - - async def _add_messages(session_ids, token_ids, adapter_names, - input_embeddings, input_embedding_ranges): - add_msgs = [] - sampling_param = SamplingParam.from_gen_config(gen_config) - for session_id, token_id, adapter_name, input_emb, input_ranges in zip( # noqa: E501 - session_ids, token_ids, adapter_names, input_embeddings, - input_embedding_ranges): - cur_input_embeddings: List[InputEmbeddings] = None - if input_emb is not None and len(input_emb) > 0: - assert len(input_emb) == len(input_ranges) - cur_input_embeddings = [ - InputEmbeddings(emb, rg[0], rg[1]) - for emb, rg in zip(input_emb, input_ranges) - ] - msg = dict( - token_ids=token_id, - session_id=session_id, - sampling_param=sampling_param, - adapter_name=adapter_name, - input_embeddings=cur_input_embeddings, - ) - add_msgs.append(msg) - req_types = [RequestType.ADD_MESSAGE] * batch_size - req_ids = await self.req_sender.async_batched_send_async( - req_types, data=add_msgs) - return req_ids - - await _add_sessions(session_ids) - req_ids = await _add_messages(session_ids, token_ids, adapter_names, - input_embeddings, input_embedding_ranges) - - # receive messages - req_idx_map = dict(zip(req_ids, range(len(req_ids)))) - output_token_ids = [list() for _ in req_ids] - status = 0 - finish_count = batch_size - while finish_count: - resp = await self.req_sender.async_recv_any() - if resp.req_id not in req_ids: - continue - idx = req_idx_map[resp.req_id] - token_ids = output_token_ids[idx] - if resp.type == ResponseType.SUCCESS: - token_ids += resp.data['token_ids'] - elif resp.type == ResponseType.FINISH: - token_ids += resp.data['token_ids'] - if not keep_cache: - session_id = session_ids[idx] - await self.async_end(session_id=session_id) - finish_count -= 1 - else: - logger.error(f'Unexpected response: {resp.type}') - status = 1 - break - - output_token_len = [len(token_ids) for token_ids in output_token_ids] - return EngineOutput(status, output_token_ids, output_token_len) - - def batched_infer( - self, - session_ids: List[int], - token_ids: List[List[int]] = None, - gen_config: GenerationConfig = None, - adapter_names: List[str] = None, - keep_cache: bool = False, - input_embeddings: List[InputEmbeddingType] = None, - input_embedding_ranges: List[InputEmbeddingRangeType] = None, - ): - """batched infer.""" - coro = self.async_batched_infer( - session_ids, - token_ids, - gen_config=gen_config, - adapter_names=adapter_names, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - keep_cache=keep_cache) - return self.req_sender.run_until_complete(coro) - async def async_end(self, session_id: int): """End the given session.""" return await async_end(self.req_sender, session_id) @@ -470,8 +323,7 @@ def cancel(self, session_id: int): def decode(self, input_ids, - input_embeddings: List[InputEmbeddingType] = None, - input_embedding_ranges: List[InputEmbeddingRangeType] = None, + multimodal: List[InputMultiModalType] = None, steps: List[int] = 
None, sequence_start: bool = True, sequence_end: bool = True, @@ -481,10 +333,8 @@ def decode(self, Args: input_ids (numpy.ndarray): the batch of input token ids steps (List[int]): the offset of the k/v cache - input_embeddings (List[List[Union[torch.Tensor, np.ndarray]]]): - embeddings features - input_embedding_ranges: (List[List[Tuple[int, int]]]): - the begin/end offsets of input_embeddings to input_ids + multimodal (List[InputMultiModalType]): + multimodals inputs. sequence_start (bool): indicator for starting a sequence sequence_end (bool): indicator for ending a sequence adapter_names (List[str]): The name of the adapters. @@ -494,33 +344,24 @@ def decode(self, batch_size = len(input_ids) def __add_messages(session_ids, input_ids, adapter_names, - input_embeddings, input_embedding_ranges): + input_multimodals): add_msgs = [] sampling_param = SamplingParam(max_new_tokens=0) batch_size = len(input_ids) - if input_embeddings is None: - input_embeddings = [None] * batch_size - input_embedding_ranges = [None] * batch_size - for (session_id, token_id, adapter_name, input_emb, - input_ranges) in zip(session_ids, input_ids, adapter_names, - input_embeddings, - input_embedding_ranges): + if input_multimodals is None: + input_multimodals = [None] * batch_size + for (session_id, token_id, adapter_name, + in_mm) in zip(session_ids, input_ids, adapter_names, + input_multimodals): if len(token_id) > self.max_input_len: raise RuntimeError( f'Expect input length<={self.max_input_len} ' f'but get {len(token_id)}') - cur_input_embeddings: List[InputEmbeddings] = None - if input_emb is not None and len(input_emb) > 0: - assert len(input_emb) == len(input_ranges) - cur_input_embeddings = [ - InputEmbeddings(emb, rg[0], rg[1]) - for emb, rg in zip(input_emb, input_ranges) - ] msg = dict(token_ids=token_id, session_id=session_id, sampling_param=sampling_param, adapter_name=adapter_name, - input_embeddings=cur_input_embeddings, + input_multimodals=in_mm, return_logits=True) add_msgs.append(msg) req_types = [RequestType.ADD_MESSAGE] * batch_size @@ -536,13 +377,6 @@ def __add_messages(session_ids, input_ids, adapter_names, else: adapter_names = [None] * batch_size - if input_embeddings is not None: - assert len(input_embeddings) == batch_size - assert len(input_embedding_ranges) == batch_size - else: - input_embeddings = [None] * batch_size - input_embedding_ranges = [None] * batch_size - session_ids = tuple(range(batch_size)) if sequence_start: for sid in session_ids: @@ -551,7 +385,7 @@ def __add_messages(session_ids, input_ids, adapter_names, self._try_add_session(sid) req_ids = __add_messages(session_ids, input_ids, adapter_names, - input_embeddings, input_embedding_ranges) + multimodal) req_idx_map = dict(zip(req_ids, range(len(req_ids)))) finish_count = batch_size diff --git a/lmdeploy/pytorch/engine/input_process.py b/lmdeploy/pytorch/engine/input_process.py new file mode 100644 index 0000000000..7f442e153b --- /dev/null +++ b/lmdeploy/pytorch/engine/input_process.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs + +TypeModelMetas = Dict[str, Any] + +InputMultiModalType = List[Dict[str, Any]] + + +@dataclass +class PreprocessInputResult: + """results of preprocess input.""" + input_ids: List[int] + input_multimodals: Optional[MultiModalInputs] = None + model_metas: Optional[TypeModelMetas] = None + + +class BaseModelInputProcessor(ABC): + """processor of model inputs.""" + + @abstractmethod + def preprocess_input(self, + input_ids: List[int], + input_mms: InputMultiModalType = None, + **kwargs) -> PreprocessInputResult: + """preprocess input.""" + raise NotImplementedError('Not implemented.') + + +class DefaultModelInputProcessor(BaseModelInputProcessor): + """default model input processor.""" + + def preprocess_input(self, + input_ids: List[int], + input_mms: MultiModalInputs = None, + **kwargs) -> PreprocessInputResult: + """preprocess input.""" + return PreprocessInputResult( + input_ids=input_ids, + input_multimodals=input_mms, + ) diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 8e47df70b5..999fa135cc 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -145,12 +145,17 @@ def model_forward( kv_quant_policy=cache_engine.cache_config.quant_policy, ) with ctx_mgr.context(context): + model_metas = None + model_metas = model.update_model_metas( + past_key_values=cache_engine.gpu_cache, + context=context, + ) input_dict = model.prepare_inputs_for_generation( past_key_values=cache_engine.gpu_cache, context=context, ) output = model(**input_dict) - return dict(hidden_states=output) + return dict(hidden_states=output, model_metas=model_metas) SwapMap = Dict[int, int] @@ -178,6 +183,10 @@ def get_logits(self, hidden_states: torch.Tensor): """get logits of model output.""" raise NotImplementedError('Not implemented.') + def get_input_processor(self): + """get input processor.""" + raise NotImplementedError('Not implemented.') + class BaseModelAgent(AutoModelAgent): """Base model agent. 
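[Editor's note, not part of the patch] As a quick illustration of the `BaseModelInputProcessor` contract introduced in the new `input_process.py` module above, here is a minimal sketch of a model-specific processor. The dict keys ('pixel_values', 'offset', 'image_tokens', 'image_token_id') and the `MultiModalTensor(data, start, end, meta)` fields mirror the ChatGLM processor added later in this series; treat them as assumptions rather than a fixed API.

# Editor's illustrative sketch (not part of the patch): a custom input
# processor that a model could return from its get_input_processor() hook.
from typing import Any, Dict, List

from lmdeploy.pytorch.engine.input_process import (BaseModelInputProcessor,
                                                   PreprocessInputResult)
from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor


class ToyImageInputProcessor(BaseModelInputProcessor):
    """Wrap raw image dicts into 'image' MultiModalTensor entries."""

    def preprocess_input(self,
                         input_ids: List[int],
                         input_mms: List[Dict[str, Any]] = None,
                         **kwargs) -> PreprocessInputResult:
        if not input_mms:
            return PreprocessInputResult(input_ids=input_ids)
        images = []
        for mm in input_mms:
            offset = mm['offset']          # first image-placeholder position
            num_pad = mm['image_tokens']   # number of placeholder tokens
            images.append(
                MultiModalTensor(data=mm['pixel_values'],
                                 start=offset,
                                 end=offset + num_pad,
                                 meta=dict(image_token_id=mm.get(
                                     'image_token_id', 0))))
        return PreprocessInputResult(input_ids=input_ids,
                                     input_multimodals=dict(image=images))

A model class would return an instance of such a processor from the `get_input_processor()` hook that the following hunks add to the model agents.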
@@ -268,14 +277,16 @@ async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_in_map=swap_in_map, swap_out_map=swap_out_map) await asyncio.sleep(0) - while not self.stream.query(): - await asyncio.sleep(0) return output def get_logits(self, hidden_states: torch.Tensor): """get logits of model output.""" return self.patched_model.get_logits(hidden_states) + def get_input_processor(self): + """get input processor..""" + return self.patched_model.get_input_processor() + @torch.inference_mode() def _tp_build_model( @@ -696,6 +707,10 @@ def get_logits(self, hidden_states: torch.Tensor): """get logits of model output.""" return self.patched_model.get_logits(hidden_states) + def get_input_processor(self): + """get input processor..""" + return self.patched_model.get_input_processor() + def _exit_handler(agent: TPModelAgent): if hasattr(agent, 'patched_model'): diff --git a/lmdeploy/pytorch/kernels/cuda/flashattention.py b/lmdeploy/pytorch/kernels/cuda/flashattention.py index 34a11ae030..3d07225e43 100644 --- a/lmdeploy/pytorch/kernels/cuda/flashattention.py +++ b/lmdeploy/pytorch/kernels/cuda/flashattention.py @@ -47,6 +47,17 @@ def softcapping(qk, logit_softcapping: tl.constexpr): return qk +@triton.jit +def _load_kv(ptrs, causal_mask: tl.constexpr, boundary_check: tl.constexpr): + """load kv.""" + if causal_mask: + return tl.load(ptrs, + boundary_check=boundary_check, + padding_option='zero') + else: + return tl.load(ptrs) + + @triton.jit def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, loop_start, loop_end, sm_scale, history_mask, @@ -63,11 +74,11 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, for start_n in range(loop_start, loop_end, BLOCK_N): start_n = tl.multiple_of(start_n, BLOCK_N) - k = tl.load(k_ptrs) + k = _load_kv(k_ptrs, causal_mask, boundary_check=(1, )) qk = tl.dot(q, k) if BLOCK_DK1 != 0: - k1 = tl.load(k1_ptrs) + k1 = _load_kv(k1_ptrs, causal_mask, boundary_check=(1, )) qk += tl.dot(q1, k1) if causal_mask: @@ -117,7 +128,7 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, acc = acc * alpha[:, None] # update acc - v = tl.load(v_ptrs) + v = _load_kv(v_ptrs, causal_mask, boundary_check=(0, )) p = p.to(v.dtype) acc += tl.dot(p, v) # update m_i and l_i @@ -172,6 +183,7 @@ def _flash_prefill_fwd_kernel( kv_group_num, head_dim_k, head_dim_v, + causal: tl.constexpr, window_size: tl.constexpr, logit_softcapping: tl.constexpr, BLOCK_M: tl.constexpr, @@ -260,9 +272,13 @@ def _flash_prefill_fwd_kernel( l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0 acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32) - history_mask = history_len + start_m * BLOCK_M + tl.arange(0, BLOCK_M) + if causal: + history_mask = history_len + start_m * BLOCK_M + tl.arange(0, BLOCK_M) + loop_end = (history_len + start_m * BLOCK_M) // BLOCK_N * BLOCK_N + else: + history_mask = tl.full([BLOCK_M], kv_seqlen - 1, dtype=tl.int32) + loop_end = kv_seqlen // BLOCK_N * BLOCK_N - loop_end = (history_len + start_m * BLOCK_M) // BLOCK_N * BLOCK_N acc, l_i, m_i = _prefill_fwd_inner(acc, l_i, m_i, @@ -283,7 +299,10 @@ def _flash_prefill_fwd_kernel( BLOCK_DK1=BLOCK_DK1) loop_start = loop_end - loop_end = tl.minimum(kv_seqlen, loop_start + BLOCK_M + BLOCK_N) + if causal: + loop_end = tl.minimum(kv_seqlen, loop_start + BLOCK_M + BLOCK_N) + else: + loop_end = kv_seqlen acc, l_i, m_i = _prefill_fwd_inner(acc, l_i, m_i, @@ -333,6 +352,7 @@ def flash_attention_fwd( window_size: int = None, sm_scale: float = None, logit_softcapping: float = None, 
+ causal: bool = True, kv_layout: str = 'hsd', ): """varlen flash Attention forward. @@ -383,6 +403,7 @@ def grid(args): BLOCK_M = max(16, 8192 // BLOCK_DK) else: BLOCK_M = max(16, 16384 // BLOCK_DK) + BLOCK_M = min(128, BLOCK_M) num_warps = 4 num_stages = min(4, max(2, 1024 // BLOCK_DK)) if BLOCK_DK >= 512: @@ -416,6 +437,7 @@ def grid(args): kv_group_num=kv_group_num, head_dim_k=head_dim_k, head_dim_v=head_dim_v, + causal=causal, window_size=window_size, logit_softcapping=logit_softcapping, BLOCK_DK=BLOCK_DK, diff --git a/lmdeploy/pytorch/kernels/cuda/flatten_kv_cache.py b/lmdeploy/pytorch/kernels/cuda/flatten_kv_cache.py index 90b135743e..3a77164046 100644 --- a/lmdeploy/pytorch/kernels/cuda/flatten_kv_cache.py +++ b/lmdeploy/pytorch/kernels/cuda/flatten_kv_cache.py @@ -31,7 +31,7 @@ def _flatten_kv_cache( stride_vos: tl.constexpr, stride_vod: tl.constexpr, stride_boff, - OUT_SIZE: tl.constexpr, + OUT_SIZE, HEAD_DIM_K: tl.constexpr, HEAD_DIM_V: tl.constexpr, BLOCK_BS: tl.constexpr, @@ -124,7 +124,7 @@ def _flatten_kv_cache_quant( stride_vod: tl.constexpr, stride_boff, quant_policy: tl.constexpr, - OUT_SIZE: tl.constexpr, + OUT_SIZE, HEAD_DIM_K: tl.constexpr, HEAD_DIM_V: tl.constexpr, BLOCK_BS: tl.constexpr, diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py index b16a78f1f4..0aaba98c94 100644 --- a/lmdeploy/pytorch/messages.py +++ b/lmdeploy/pytorch/messages.py @@ -8,6 +8,7 @@ from torch import Tensor from lmdeploy.messages import GenerationConfig, LogitsProcessor +from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs from lmdeploy.utils import get_logger from .block import LogicalTokenBlocks @@ -205,10 +206,9 @@ def add_sequence( sampling_param: SamplingParam = None, adapter_name: str = None, return_logits: bool = False, - input_embeddings: List[InputEmbeddings] = None, - mrope_position_ids: Tensor = None, - mrope_position_delta: Tensor = None, - cross_attention_states: Tensor = None) -> 'SchedulerSequence': + multimodals: MultiModalInputs = None, + input_embeddings: List[InputEmbeddings] = None + ) -> 'SchedulerSequence': """Add a new message.""" if isinstance(token_ids, Tensor): token_ids = token_ids.numpy() @@ -228,10 +228,8 @@ def add_sequence( adapter_name=adapter_name, arrive_time=time.time(), history_embeddings=HistoryEmbeddings(input_embeddings), + history_multimodals=HistoryMultiModals(multimodals), return_logits=return_logits, - mrope_position_ids=mrope_position_ids, - mrope_position_delta=mrope_position_delta, - cross_attention_states=cross_attention_states, ) self.sequences[seq.seq_id] = seq if self.seq_manager is not None: @@ -361,6 +359,66 @@ def copy(self): return self.clone() +class HistoryMultiModals: + + def __init__(self, multimodals: MultiModalInputs): + if multimodals is None: + multimodals = dict() + self.multimodals = multimodals + + def get_datas(self, start=0, end=-1): + """get multimodals from prompts position [start, end).""" + outs = dict() + test_range = range(start, end) + for modal_type, modal_datas in self.multimodals.items(): + data = [] + for modal_data in modal_datas: + if (modal_data.start not in test_range + and modal_data.end not in test_range): + continue + data.append(modal_data) + if len(data) > 0: + outs[modal_type] = data + return outs + + def add_inputs(self, input_mms: MultiModalInputs): + """add new inputs.""" + for modal_type, vals in input_mms.items(): + if modal_type in self.multimodals: + self.multimodals[modal_type] += vals + else: + self.multimodals[modal_type] = vals + + def empty(self): + if 
len(self.multimodals) == 0: + return 0 + + return all(len(vals) == 0 for vals in self.multimodals) + + @staticmethod + def update_multimodals(input_mms: MultiModalInputs, prev_len: int): + """update multimodals.""" + for vals in input_mms.values(): + for val in vals: + val.start += prev_len + val.end += prev_len + return input_mms + + def get_encoder_len(self, start=0, end=-1): + """get lens of encoder.""" + test_range = range(start, end) + out_len = 0 + for _, modal_datas in self.multimodals.items(): + for modal_data in modal_datas: + if modal_data.encoder_len is None: + continue + if (modal_data.start not in test_range + and modal_data.end not in test_range): + continue + out_len += modal_data.encoder_len + return out_len + + @dataclass class SchedulerSequence: """Scheduler message.""" @@ -369,6 +427,8 @@ class SchedulerSequence: history_cache: HistoryTokenIds = field(default_factory=HistoryTokenIds) history_embeddings: HistoryEmbeddings = field( default_factory=HistoryEmbeddings) + history_multimodals: HistoryMultiModals = field( + default_factory=HistoryMultiModals) num_new_tokens: int = 0 sampling_param: SamplingParam = field(default_factory=SamplingParam) logical_blocks: LogicalTokenBlocks = field( @@ -382,10 +442,7 @@ class SchedulerSequence: random_offsets: int = 0 _status: MessageStatus = field(default=MessageStatus.WAITING, init=False) num_ignored_history: int = 0 - mrope_position_ids: Optional[Tensor] = None - mrope_position_delta: Optional[int] = None - cross_attention_states: Optional[Tensor] = None - history_cross_kv_seqlens: int = 0 + model_meta: Dict[str, Any] = None def __post_init__(self): """post init.""" @@ -394,6 +451,10 @@ def __post_init__(self): self._num_images: int = len(self.history_embeddings) self._num_token_ids: int = len(self.history_cache) + self._num_history_cross: int = 0 + self._num_cross: int = self.history_multimodals.get_encoder_len( + 0, self._num_token_ids) + @property def block_size(self) -> int: """block size.""" @@ -464,6 +525,16 @@ def num_all_ids(self): """num all tokens.""" return self.history_len + self._num_token_ids + @property + def num_cross(self): + """num cross.""" + return self._num_cross + + @property + def num_history_cross(self): + """num history cross.""" + return self._num_history_cross + @property def num_blocks(self): """num blocks.""" @@ -489,22 +560,22 @@ def num_all_tokens(self): def num_all_cross_tokens(self): """num of all cross tokens.""" - if self.cross_attention_states is None: - self.history_cross_kv_seqlens = 0 - else: - self.history_cross_kv_seqlens = self.cross_attention_states.shape[ - -2] - return self.history_cross_kv_seqlens + return self._num_cross + self._num_history_cross + + def get_input_multimodals(self): + """get input multimodals.""" + start = self.num_history_ids + end = self.num_all_ids + return self.history_multimodals.get_datas(start, end) def update_token_ids(self, token_ids: Tensor, + multimodals: MultiModalInputs = None, embeddings: List[InputEmbeddings] = None, - cross_attention_states: List[Tensor] = None): + model_meta: Dict[str, Any] = None): """Update token ids, old token ids will be added to history.""" - # cross attention - if cross_attention_states is not None: - self.history_cross_kv_seqlens += cross_attention_states.shape[-2] - self.cross_attention_states = cross_attention_states + old_num_history_ids = self._num_history_ids + self._num_history_ids += self._num_token_ids # update history image nums self._num_history_images += self._num_images @@ -516,6 +587,23 @@ def update_token_ids(self, 
self._num_images = len(new_embeddings) self.history_embeddings.append(new_embeddings) + # update multimodals + if multimodals is not None: + multimodals = HistoryMultiModals.update_multimodals( + multimodals, self.num_all_ids) + self.history_multimodals.add_inputs(multimodals) + + # cross + self._num_history_cross += self._num_cross + if multimodals is not None: + self._num_cross = self.history_multimodals.get_encoder_len( + old_num_history_ids, self._num_history_ids) + else: + self._num_cross = 0 + + if model_meta is not None: + self.model_meta = model_meta + if isinstance(token_ids, Tensor): token_ids = token_ids.numpy() elif not isinstance(token_ids, np.ndarray): @@ -539,3 +627,12 @@ def set_step(self, step: int): self._num_history_ids = step self._num_token_ids = num_all_ids - step self.num_ignored_history = min(step, self.num_ignored_history) + + self.model_meta = None + + # cross + if self.history_multimodals is not None: + self._num_history_cross = self.history_multimodals.get_encoder_len( + 0, self.num_history_ids) + self._num_cross = self.history_multimodals.get_encoder_len( + self._num_history_ids, num_all_ids) diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index d95aa6fafc..b5b74e4f02 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -7,45 +7,7 @@ from lmdeploy.pytorch.backends import get_backend from lmdeploy.pytorch.config import ModelConfig - - -@dataclass -class MRopeModelInputs: - """Multimodal rotary position inputs.""" - position_ids: List[torch.LongTensor] = None - deltas: List[torch.LongTensor] = None - - def get_inputs(self, history_lengths: torch.Tensor, - seq_lengths: torch.Tensor): - mrope_position_ids = [] - for (his_len, seq_len, pos_ids, - delta) in zip(history_lengths, seq_lengths, self.position_ids, - self.deltas): - assert pos_ids.dim() == 2, 'invalid mrope_position_ids' - if his_len + seq_len <= pos_ids.shape[1]: - mrope_position_ids.append(pos_ids[:, - his_len:his_len + seq_len]) - else: - mrope_position_ids.append( - torch.tensor([his_len], device=delta.device).expand(3, -1) - + delta) - - mrope_position_ids = torch.cat(mrope_position_ids, dim=-1) - return mrope_position_ids - - def to_device(self, device: str): - """to device.""" - out_dict = dict() - for f in fields(self): - k = f.name - v = getattr(self, k) - if isinstance(v, torch.Tensor): - v = v.to(device) - elif isinstance(v, list): - v = [x.to(device) for x in v] - out_dict[k] = v - - return MRopeModelInputs(**out_dict) +from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor @dataclass @@ -57,6 +19,7 @@ class VisionModelInputs: input_embeddings: List[List[torch.Tensor]] = None input_embedding_ranges: List[torch.LongTensor] = None input_embedding_indexing: torch.BoolTensor = None + input_multimodals: List[MultiModalTensor] = None def to_device(self, device: str): """to device.""" @@ -64,12 +27,19 @@ def to_device(self, device: str): for f in fields(self): k = f.name v = getattr(self, k) + if v is None: + continue if isinstance(v, torch.Tensor): v = v.to(device) - elif k == 'input_embedding_ranges' and v is not None: + elif k == 'input_embedding_ranges': v = [e.to(device) for e in v] - elif k == 'input_embeddings' and v is not None: + elif k == 'input_embeddings': v = [[e.to(device) for e in li] for li in v] + elif k == 'input_multimodals': + for mm_datas in v: + for modal_type, data in mm_datas.items(): + data = [d.to_device(device) for d in data] + mm_datas[modal_type] = data out_dict[k] = v return 
VisionModelInputs(**out_dict) @@ -120,9 +90,9 @@ class ModelInputs: num_ignored_history: torch.LongTensor local_adapter_ids: torch.LongTensor = None vision_inputs: VisionModelInputs = None - mrope_inputs: MRopeModelInputs = None - cross_attention_states: torch.Tensor = None - history_cross_kv_seqlens: torch.LongTensor = None + cross_length: torch.LongTensor = None + history_cross_length: torch.LongTensor = None + model_metas: List[Dict[str, Any]] = None def update(self, input_ids: torch.LongTensor): """update input ids.""" @@ -133,44 +103,88 @@ def update(self, input_ids: torch.LongTensor): self.input_ids = input_ids return self - def split(self, split_size: int, block_size: int): + def split(self, split_size: int): """split inputs.""" assert len( self.seq_length) == 1, ('Can not perform split on batched input.') - assert split_size % block_size == 0, ( - 'split_size should be multi of block_size.') input_ids = self.input_ids if input_ids.numel() < split_size: return self - num_blocks = split_size // block_size - overlap = (self.history_lengths[0] % block_size != 0) + flatten_mms = [] + vision_inputs = self.vision_inputs + if vision_inputs is not None: + if vision_inputs.input_multimodals is not None: + input_mms = vision_inputs.input_multimodals[0] + + flatten_mms = [] + for k, mms in input_mms.items(): + mms = [(k, mm) for mm in mms] + flatten_mms += mms + + flatten_mms = sorted(flatten_mms, key=lambda mm: mm[1].start) + max_seq_len = self.seq_length[0].item() ret = [] - block_start = 0 - for i in range(0, max_seq_len, split_size): - start = i - end = min(max_seq_len, i + split_size) - block_end = block_start + num_blocks - if overlap: - block_end += 1 - - block_offsets = self.block_offsets + start = 0 + history_cross_length = self.history_cross_length + cross_length = None + if history_cross_length is not None: + cross_length = self.history_cross_length.clone() + while start < max_seq_len: + vision_inputs = None + if len(flatten_mms) > 0: + mm_start = flatten_mms[0][1].start + mm_end = flatten_mms[0][1].end + if mm_start > self.history_lengths + start: + end = min(mm_start - self.history_lengths, + start + split_size) + else: + input_mms = dict() + key, mm = flatten_mms.pop(0) + input_mms.setdefault(key, []) + input_mms[key].append(mm) + end = start + mm.end - mm.start + while len(flatten_mms) > 0: + next_mm = flatten_mms[0] + next_start = next_mm[1].start + next_end = next_mm[1].end + if next_start < mm_end: + key = next_mm[0] + input_mms.setdefault(key, []) + input_mms[key].append(next_mm[1]) + end += max(0, next_end - mm_end) + flatten_mms.pop(0) + + if cross_length is not None: + encoder_len = next_mm[1].encoder_len + if encoder_len is not None: + cross_length += encoder_len + else: + break + vision_inputs = VisionModelInputs( + input_multimodals=[input_mms], ) + else: + end = min(max_seq_len, start + split_size) + inp = ModelInputs( input_ids=self.input_ids[:, start:end], seq_length=input_ids.new_tensor([end - start]), - block_offsets=block_offsets, + block_offsets=self.block_offsets, history_lengths=self.history_lengths + start, is_decoding=self.is_decoding, num_ignored_history=self.num_ignored_history, local_adapter_ids=self.local_adapter_ids, - vision_inputs=self.vision_inputs, - mrope_inputs=self.mrope_inputs, - cross_attention_states=self.cross_attention_states, + vision_inputs=vision_inputs, + model_metas=self.model_metas, + cross_length=cross_length, + history_cross_length=history_cross_length, ) ret.append(inp) - block_start += num_blocks + history_cross_length = 
cross_length + + start = end return ret @@ -184,8 +198,6 @@ def to_device(self, device: str): v = v.to(device) elif isinstance(v, VisionModelInputs): v = v.to_device(device) - elif isinstance(v, MRopeModelInputs): - v = v.to_device(device) out_dict[k] = v return ModelInputs(**out_dict) @@ -212,13 +224,14 @@ class StepContext: local_adapter_ids: torch.LongTensor = None input_embeddings: torch.Tensor = None input_embedding_indexing: torch.Tensor = None + input_multimodals: List[MultiModalTensor] = None vision_inputs: VisionModelInputs = None - mrope_position_ids: torch.Tensor = None attn_metadata: Any = None - cross_attn_metadata: Any = None - cross_attention_states: torch.Tensor = None + cross_seqlens: torch.LongTensor = None cross_kv_seqlens: torch.LongTensor = None + cross_attn_metadata: Any = None kv_quant_policy: Literal[0, 4, 8] = 0 + model_metas: List[Dict[str, Any]] = None _outputs: Dict = field(default_factory=dict) @@ -242,24 +255,21 @@ def new( history_seqlens = inputs.history_lengths device = q_seqlens.device + input_multimodals = None + if inputs.vision_inputs is not None: + input_multimodals = inputs.vision_inputs.input_multimodals + # for vlm input_embeddings, input_embedding_indexing = None, None if (inputs.vision_inputs is not None and inputs.vision_inputs.input_embeddings is not None): input_embeddings, input_embedding_indexing = \ inputs.vision_inputs.get_inputs(history_seqlens, q_seqlens) - # for mrope - mrope_position_ids = None - if inputs.mrope_inputs is not None: - mrope_position_ids = inputs.mrope_inputs.get_inputs( - history_seqlens, q_seqlens) # kv_seqlens - cross_attention_states = inputs.cross_attention_states if inputs.is_decoding: attention_mask = torch.ones_like(q_seqlens)[:, None] - position_ids = history_seqlens.unsqueeze(-1) - cross_attention_states = None + position_ids = history_seqlens.unsqueeze(-1).clone() else: max_q_seqlen = q_seqlens.max().item() mask_range = torch.arange(max_q_seqlen, device=device)[None, :] @@ -268,6 +278,13 @@ def new( position_ids += history_seqlens.unsqueeze(-1) q_start_loc = q_seqlens.cumsum(0) - q_seqlens + # cross + cross_seqlens = inputs.cross_length + cross_kv_seqlens = None + if inputs.cross_length is not None: + cross_kv_seqlens = (inputs.cross_length + + inputs.history_cross_length) + # position ids 1d position_ids = cls.get_position_ids_1d(position_ids, q_seqlens)[None] # seq_len + history_length @@ -281,6 +298,7 @@ def new( position_ids=position_ids, input_embeddings=input_embeddings, input_embedding_indexing=input_embedding_indexing, + input_multimodals=input_multimodals, attention_mask=attention_mask, q_seqlens=q_seqlens, kv_seqlens=kv_seqlens, @@ -290,10 +308,10 @@ def new( world_size=world_size, local_adapter_ids=inputs.local_adapter_ids, vision_inputs=inputs.vision_inputs, - mrope_position_ids=mrope_position_ids, - cross_attention_states=cross_attention_states, - cross_kv_seqlens=inputs.history_cross_kv_seqlens, kv_quant_policy=kv_quant_policy, + model_metas=inputs.model_metas, + cross_seqlens=cross_seqlens, + cross_kv_seqlens=cross_kv_seqlens, ) ret = get_backend().update_step_context(ret) diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py index 8d7a21a0a6..ac69fea2a1 100644 --- a/lmdeploy/pytorch/models/chatglm2.py +++ b/lmdeploy/pytorch/models/chatglm2.py @@ -1,101 +1,29 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Any, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple import torch from torch import nn +from torch.nn import functional as F from transformers.configuration_utils import PretrainedConfig +from lmdeploy.pytorch.engine.input_process import (BaseModelInputProcessor, + PreprocessInputResult) from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding) -from lmdeploy.pytorch.nn.linear import (build_merged_colwise_linear, +from lmdeploy.pytorch.nn.linear import (build_colwise_linear, + build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin +from .utils.model import DeployModelMixin LANGUAGE_TOKEN_TYPE = 0 VISION_TOKEN_TYPE = 1 -def get_vision_expert_mask(token_type_ids: torch.LongTensor): - vision_token_mask = torch.zeros_like(token_type_ids, dtype=torch.bool) - vision_token_mask[:, :-1] = (token_type_ids[:, :-1] - == VISION_TOKEN_TYPE) & (token_type_ids[:, 1:] - == VISION_TOKEN_TYPE) - language_token_mask = ~vision_token_mask - return vision_token_mask, language_token_mask - - -def build_position_ids(x: torch.BoolTensor) -> torch.LongTensor: - tmp = x.clone() - # image boi eoi token as LANGUAGE_TOKEN_TYPE - is_boi_eoi = torch.zeros_like(x, dtype=torch.bool) - is_boi_eoi[:, 1:] |= (tmp[:, 1:] == VISION_TOKEN_TYPE) & ( - tmp[:, :-1] == LANGUAGE_TOKEN_TYPE) - is_boi_eoi[:, 0] |= (tmp[:, 0] == VISION_TOKEN_TYPE) - is_boi_eoi[:, :-1] |= (tmp[:, :-1] == VISION_TOKEN_TYPE) & ( - tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) - is_boi_eoi[:, -1] |= (tmp[:, -1] == VISION_TOKEN_TYPE) - tmp[is_boi_eoi] = LANGUAGE_TOKEN_TYPE - # final position ids - y = torch.zeros_like(x, dtype=torch.long) - y[:, 1:] = (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) | ( - (tmp[:, 1:] == VISION_TOKEN_TYPE) & - (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE)) - y = y.cumsum(dim=-1) - return y - - -def _get_cogvlm_position_ids(context): - """get cogvlm position_ids.""" - q_seqlens = context.q_seqlens - history_lengths = context.kv_seqlens - q_seqlens - vision_input_info = context.vision_inputs - position_id_offsets = (vision_input_info.history_image_token_lengths - - vision_input_info.history_image_nums * 3) - lang_ids = None - vis_ids = None - if context.is_decoding: - position_ids = history_lengths - position_id_offsets - else: - if vision_input_info.input_embeddings is not None and len( - vision_input_info.input_embeddings) > 0: - starts = history_lengths - vision_input_info.history_lengths - ends = starts + q_seqlens - token_type_ids = vision_input_info.input_embedding_indexing.to( - torch.int) - history_position_lengths = (vision_input_info.history_lengths - - position_id_offsets) - position_ids_all = (history_position_lengths[:, None] + - build_position_ids(token_type_ids)) - position_ids = torch.cat([ - pids[s:e] - for (pids, s, e) in zip(position_ids_all, starts, ends) - ]) - vision_token_mask_all, _ = get_vision_expert_mask(token_type_ids) - vision_token_mask = torch.cat([ - masks[s:e] - for (masks, s, e) in zip(vision_token_mask_all, starts, ends) - ]) - mask_indexing = torch.arange(vision_token_mask.shape[-1], - device=vision_token_mask.device) - vis_ids = mask_indexing[vision_token_mask] - lang_ids = mask_indexing[~vision_token_mask] - - else: - position_ids = 
context.attention_mask.long().cumsum(-1) - 1 - position_ids += (history_lengths - - position_id_offsets).unsqueeze(-1) - device = position_ids.device - position_ids_1d = [ - ids[:l] for ids, l in zip(position_ids.cpu(), q_seqlens.cpu()) - ] - position_ids = torch.cat(position_ids_1d).to(device) - - return position_ids, lang_ids, vis_ids - - class SelfAttention(torch.nn.Module): """Parallel self-attention layer abstract class. @@ -410,6 +338,286 @@ def forward(self, input_ids): return embeddings +class PatchEmbedding(nn.Module): + """vision embedding.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.proj = nn.Conv2d(config.in_channels, + config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size, + dtype=dtype, + device=device) + self.cls_embedding = nn.Parameter( + torch.empty(1, config.hidden_size, dtype=dtype, device=device)) + self.position_embedding = nn.Embedding(config.num_positions, + config.hidden_size, + dtype=dtype, + device=device) + + def forward(self, images): + """forward.""" + x = self.proj(images) + x = x.flatten(2).transpose(1, 2) + cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), dim=1) + x += self.position_embedding.weight.unsqueeze(0) + return x + + +class EVA2CLIPAttention(nn.Module): + """vision attention.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', None) + hidden_size = config.hidden_size + num_heads = config.num_heads + head_dim = config.hidden_size // config.num_heads + self.scale = head_dim**-0.5 + + # packed qkv + self.query_key_value = build_qkv_proj( + hidden_size, + num_q_heads=num_heads, + num_kv_heads=num_heads, + head_size=head_dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + + # o_proj + self.dense = build_rowwise_linear(hidden_size, + hidden_size, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """forward.""" + # qkv proj + qkv_states = self.query_key_value(hidden_states) + q, k, v = self.query_key_value.split_qkv(qkv_states) + + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + attn_output = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + + # o proj + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.flatten(-2, -1) + attn_output = self.dense(attn_output) + return attn_output + + +class EVA2CLIPMLP(nn.Module): + """vision MLP.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + from transformers.activations import ACT2FN + + # gate up + quantization_config = getattr(config, 'quantization_config', None) + self.fc1 = build_colwise_linear( + config.hidden_size, + config.intermediate_size, + bias=True, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + + # silu and mul + if config.hidden_act in [ + 'gelu', 'gelu_fast', 'quick_gelu', 'gelu_python' + ]: + self.activation_fn = nn.GELU() + else: + self.activation_fn = ACT2FN[config.hidden_act] + + # down + self.fc2 = build_rowwise_linear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + 
is_tp=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """forward.""" + x = self.fc1(x) + x = self.activation_fn(x) + x = self.fc2(x) + return x + + +class EVA2CLIPTransformerLayer(nn.Module): + """vision trans layer.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + self.attention = EVA2CLIPAttention(config, dtype=dtype, device=device) + self.mlp = EVA2CLIPMLP(config, dtype=dtype, device=device) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + + def forward(self, hidden_states): + """forward.""" + attention_input = hidden_states + attention_output = self.input_layernorm( + self.attention(attention_input)) + hidden_states = attention_input + attention_output + mlp_input = hidden_states + mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) + output = mlp_input + mlp_output + return output + + +class EVA2CLIPTransformer(nn.Module): + """vision transformer.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.layers = nn.ModuleList([ + EVA2CLIPTransformerLayer(config, dtype=dtype, device=device) + for _ in range(config.num_hidden_layers) + ]) + + def forward(self, hidden_states): + """forward.""" + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class GLU(nn.Module): + """GLU.""" + + def __init__(self, + config: PretrainedConfig, + in_features: int, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.linear_proj = nn.Linear(in_features, + config.hidden_size, + bias=False, + dtype=dtype, + device=device) + self.norm1 = nn.LayerNorm(config.hidden_size, + dtype=dtype, + device=device) + self.act1 = nn.GELU() + self.act2 = nn.functional.silu + self.dense_h_to_4h = nn.Linear(config.hidden_size, + config.ffn_hidden_size, + bias=False, + dtype=dtype, + device=device) + self.gate_proj = nn.Linear(config.hidden_size, + config.ffn_hidden_size, + bias=False, + dtype=dtype, + device=device) + self.dense_4h_to_h = nn.Linear(config.ffn_hidden_size, + config.hidden_size, + bias=False, + dtype=dtype, + device=device) + + def forward(self, x): + x = self.linear_proj(x) + x = self.act1(self.norm1(x)) + x = self.act2(self.gate_proj(x)) * self.dense_h_to_4h(x) + x = self.dense_4h_to_h(x) + return x + + +class EVA2CLIPModel(nn.Module): + """vision model.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + from argparse import Namespace + vision_config = Namespace(**config.vision_config) + + self.patch_embedding = PatchEmbedding(vision_config, + dtype=dtype, + device=device) + self.transformer = EVA2CLIPTransformer(vision_config, + dtype=dtype, + device=device) + self.linear_proj = GLU(config, + in_features=config.hidden_size, + dtype=dtype, + device=device) + self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, + out_channels=config.hidden_size, + kernel_size=2, + stride=2, + dtype=dtype, + device=device) + self.boi = nn.Parameter( + torch.empty(1, 1, config.hidden_size, dtype=dtype, device=device)) + self.eoi = nn.Parameter( + torch.empty(1, 1, config.hidden_size, dtype=dtype, device=device)) + self.scaling_factor = 
vision_config.scaling_factor + + def forward(self, images): + """forward.""" + x = self.patch_embedding(images) + x = self.transformer(x) + + x = x[:, 1:] + + b, s, h = x.shape + grid_size = int(s**0.5) + x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2) + x = self.conv(x) + + x = x.flatten(2).transpose(1, 2) + x = self.linear_proj(x) + boi = self.boi.expand(x.shape[0], -1, -1) + eoi = self.eoi.expand(x.shape[0], -1, -1) + x = torch.cat((boi, x, eoi), dim=1) + x = x / self.scaling_factor + return x + + class ChatGLMModel(nn.Module): def __init__(self, @@ -442,19 +650,32 @@ def __init__(self, dtype=dtype, device=device) + self.vision = None + if hasattr(config, 'vision_config'): + self.vision = EVA2CLIPModel(config, dtype=dtype, device=device) + def forward( self, input_ids: torch.LongTensor = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, + images: torch.Tensor = None, + image_mask: torch.Tensor = None, inputs_embeds: Optional[torch.FloatTensor] = None, ): """forward.""" # token embedding if inputs_embeds is None: + images_features = None + if images is not None: + images_features = self.vision(images) + images_features = images_features.flatten(0, 1)[None] inputs_embeds = self.embedding(input_ids) + if images is not None: + inputs_embeds.masked_scatter_(image_mask[..., None], + images_features) hidden_states = inputs_embeds @@ -477,7 +698,8 @@ def get_input_embeddings(self): return self.embedding -class ChatGLMForConditionalGeneration(nn.Module, CudaGraphMixin): +class ChatGLMForConditionalGeneration(nn.Module, DeployModelMixin, + CudaGraphMixin): """rewrote model of LlamaForCausalLM.""" def __init__(self, @@ -491,12 +713,16 @@ def __init__(self, # build Model self.transformer = ChatGLMModel(config, dtype=dtype, device=device) + self.input_processor = ChatGLMInputProcessor(self.config, dtype) + def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, + images: torch.Tensor = None, + image_mask: torch.Tensor = None, inputs_embeds: torch.Tensor = None, **kwargs, ): @@ -506,6 +732,8 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + images=images, + image_mask=image_mask, inputs_embeds=inputs_embeds, ) return hidden_states @@ -529,8 +757,23 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata - if context.vision_inputs is not None: - position_ids = _get_cogvlm_position_ids(context)[0][None] + + images = None + image_mask = None + if context.input_multimodals is not None: + images = [ + input_mm.get('image', []) + for input_mm in context.input_multimodals + ] + # flatten batch + images = [data for im_data in images for data in im_data] + if len(images) != 0: + image_token_id = images[0].meta['image_token_id'] + image_mask = input_ids == image_token_id + images = torch.stack([data.data for data in images]) + else: + images = None + image_mask = None # process vision embeddings vision_embeddings = context.input_embeddings @@ -548,9 +791,92 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + images=images, + image_mask=image_mask, inputs_embeds=inputs_embeds, ) + def update_model_metas(self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: 
StepContext = None): + """update model meta.""" + model_metas = context.model_metas + if not hasattr(self.config, 'vision_config'): + return model_metas + + input_multimodals = context.input_multimodals + if input_multimodals is None: + input_imgs = [[] for _ in model_metas] + else: + input_imgs = [] + for mm in input_multimodals: + if mm is None: + input_imgs.append([]) + else: + input_imgs.append(mm.get('image', [])) + + config = self.config + image_size: int = config.vision_config['image_size'] + patch_size: int = config.vision_config['patch_size'] + vision_token_num = ((image_size // patch_size // 2) * + (image_size // patch_size // 2) + 2) + num_pad = vision_token_num - 3 + + batched_num_img_tokens = [] + new_model_metas = [] + for meta, imgs in zip(model_metas, input_imgs): + if meta is None: + num_img_tokens = 0 + else: + num_img_tokens = meta.get('num_img_tokens', 0) + + batched_num_img_tokens.append(num_img_tokens) + + num_img_tokens += num_pad * len(imgs) + new_model_metas.append(dict(num_img_tokens=num_img_tokens)) + + # prepare cogvlm position_ids + q_seqlens = context.q_seqlens + position_ids = context.position_ids + + if context.is_decoding or all(len(imgs) == 0 for imgs in input_imgs): + num_img_tokens = torch.tensor(batched_num_img_tokens, + device=position_ids.device) + position_ids -= num_img_tokens[None] + else: + batched_position_ids = position_ids[0].split(q_seqlens) + for pos_ids, num_img_tok, imgs in zip(batched_position_ids, + batched_num_img_tokens, + input_imgs): + pos_ids -= num_img_tok + if len(imgs) == 0: + continue + + seq_len = pos_ids.size(0) + start = pos_ids[0].cpu().item() + new_pos_ids = [] + + imgs = sorted(imgs, key=lambda img: img.start) + for img in imgs: + img_pad_pos = img.start + 1 - num_img_tok + num_pad = img.end - img.start - 2 + new_pos_ids += list(range(start, img_pad_pos)) + new_pos_ids += [img_pad_pos] * num_pad + start = img_pad_pos + 1 + num_img_tok += num_pad + + remain = seq_len - len(new_pos_ids) + new_pos_ids += list(range(start, start + remain)) + + new_pos_ids = pos_ids.new_tensor(new_pos_ids) + pos_ids[:] = new_pos_ids + + position_ids = torch.cat(batched_position_ids)[None] + context.position_ids = position_ids + + return new_model_metas + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): """load weights.""" # modify from vllm @@ -558,7 +884,17 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if 'transformer.vision' in name: + if '.query_key_value' in name: + param = params_dict[name] + q, k, v = param.weight_spliter(loaded_weight) + load_weight(param, q, shard_id='q') + load_weight(param, k, shard_id='k') + load_weight(param, v, shard_id='v') + else: + param = params_dict[name] + load_weight(param, loaded_weight) continue + if 'rotary_pos_emb.inv_freq' in name: continue if ('rotary_pos_emb.cos_cached' in name @@ -581,3 +917,53 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): else: param = params_dict[name] load_weight(param, loaded_weight) + + def get_input_processor(self) -> BaseModelInputProcessor: + """get input processor.""" + return self.input_processor + + +class ChatGLMInputProcessor(BaseModelInputProcessor): + """input processor.""" + + def __init__(self, config: PretrainedConfig, dtype) -> None: + self.config = config + self.dtype = dtype + + if hasattr(config, 'vision_config'): + vision_config = config.vision_config + self.image_size = vision_config['image_size'] + 
self.patch_size = vision_config['patch_size'] + self.num_patches = (self.image_size // self.patch_size)**2 + self.num_positions = self.num_patches + 1 + self.vision_token_num = self.num_patches // 4 + + def preprocess_input(self, + input_ids: List[int], + input_multimodals: List[Dict[str, Any]] = None, + **kwargs) -> PreprocessInputResult: + """prepare multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_imgs = [] + for input_mm in input_multimodals: + pixel_values = input_mm['pixel_values'].to(self.dtype) + offset = input_mm['offset'] + num_pad = input_mm['image_tokens'] + image_token_id = input_mm.get('image_token_id', 0) + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalTensor( + data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) + input_imgs.append(mm_data) + + result = PreprocessInputResult( + input_ids=input_ids, + input_multimodals=dict(image=input_imgs), + ) + return result diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py index 6caf10df00..f4f1baaff3 100644 --- a/lmdeploy/pytorch/models/cogvlm.py +++ b/lmdeploy/pytorch/models/cogvlm.py @@ -1,20 +1,27 @@ # Copyright (c) OpenMMLab. All rights reserved. +from argparse import Namespace from typing import Any, Iterable, List, Optional, Tuple import torch import torch.distributed as dist +import torch.nn.functional as F from torch import nn from transformers.configuration_utils import PretrainedConfig from lmdeploy.pytorch.distributed import get_world_rank +from lmdeploy.pytorch.engine.input_process import (BaseModelInputProcessor, + PreprocessInputResult) from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding) -from lmdeploy.pytorch.nn.linear import (build_merged_colwise_linear, +from lmdeploy.pytorch.nn.linear import (build_colwise_linear, + build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin +from .utils.model import DeployModelMixin class VisionExpertAttention(nn.Module): @@ -322,6 +329,283 @@ def forward( return outputs +class PatchEmbedding(nn.Module): + """vision embedding.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.proj = nn.Conv2d(config.in_channels, + config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size, + dtype=dtype, + device=device) + self.cls_embedding = nn.Parameter( + torch.empty(1, config.hidden_size, dtype=dtype, device=device)) + self.position_embedding = nn.Embedding(config.num_positions, + config.hidden_size, + dtype=dtype, + device=device) + + def forward(self, images): + """forward.""" + x = self.proj(images) + x = x.flatten(2).transpose(1, 2) + cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), dim=1) + x += self.position_embedding.weight.unsqueeze(0) + return x + + +class EVA2CLIPAttention(nn.Module): + """vision attention.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', 
None) + hidden_size = config.hidden_size + num_heads = config.num_heads + head_dim = config.hidden_size // config.num_heads + self.scale = head_dim**-0.5 + + # packed qkv + self.query_key_value = build_qkv_proj( + hidden_size, + num_q_heads=num_heads, + num_kv_heads=num_heads, + head_size=head_dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + + # o_proj + self.dense = build_rowwise_linear(hidden_size, + hidden_size, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """forward.""" + # qkv proj + qkv_states = self.query_key_value(hidden_states) + q, k, v = self.query_key_value.split_qkv(qkv_states) + + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + attn_output = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + + # o proj + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.flatten(-2, -1) + attn_output = self.dense(attn_output) + return attn_output + + +class EVA2CLIPMLP(nn.Module): + """vision MLP.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + from transformers.activations import ACT2FN + + # gate up + quantization_config = getattr(config, 'quantization_config', None) + self.fc1 = build_colwise_linear( + config.hidden_size, + config.intermediate_size, + bias=True, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + + # silu and mul + if config.hidden_act in [ + 'gelu', 'gelu_fast', 'quick_gelu', 'gelu_python' + ]: + self.activation_fn = nn.GELU() + else: + self.activation_fn = ACT2FN[config.hidden_act] + + # down + self.fc2 = build_rowwise_linear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """forward.""" + x = self.fc1(x) + x = self.activation_fn(x) + x = self.fc2(x) + return x + + +class EVA2CLIPTransformerLayer(nn.Module): + """vision trans layer.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + self.attention = EVA2CLIPAttention(config, dtype=dtype, device=device) + self.mlp = EVA2CLIPMLP(config, dtype=dtype, device=device) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + + def forward(self, hidden_states): + """forward.""" + attention_input = hidden_states + attention_output = self.input_layernorm( + self.attention(attention_input)) + hidden_states = attention_input + attention_output + mlp_input = hidden_states + mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) + output = mlp_input + mlp_output + return output + + +class EVA2CLIPTransformer(nn.Module): + """vision transformer.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.layers = nn.ModuleList([ + EVA2CLIPTransformerLayer(config, dtype=dtype, device=device) + for _ in range(config.num_hidden_layers) + ]) + + def forward(self, hidden_states): + """forward.""" + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) + return 
hidden_states + + +class GLU(nn.Module): + """GLU.""" + + def __init__(self, + config: PretrainedConfig, + in_features: int, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.linear_proj = nn.Linear(in_features, + config.hidden_size, + bias=False, + dtype=dtype, + device=device) + self.norm1 = nn.LayerNorm(config.hidden_size, + dtype=dtype, + device=device) + self.act1 = nn.GELU() + self.act2 = nn.functional.silu + self.dense_h_to_4h = nn.Linear(config.hidden_size, + config.intermediate_size, + bias=False, + dtype=dtype, + device=device) + self.gate_proj = nn.Linear(config.hidden_size, + config.intermediate_size, + bias=False, + dtype=dtype, + device=device) + self.dense_4h_to_h = nn.Linear(config.intermediate_size, + config.hidden_size, + bias=False, + dtype=dtype, + device=device) + + def forward(self, x): + x = self.linear_proj(x) + x = self.act1(self.norm1(x)) + x = self.act2(self.gate_proj(x)) * self.dense_h_to_4h(x) + x = self.dense_4h_to_h(x) + return x + + +class EVA2CLIPModel(nn.Module): + """vision model.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + vision_config = Namespace(**config.vision_config) + + self.patch_embedding = PatchEmbedding(vision_config, + dtype=dtype, + device=device) + self.transformer = EVA2CLIPTransformer(vision_config, + dtype=dtype, + device=device) + self.linear_proj = GLU(config, + in_features=vision_config.hidden_size, + dtype=dtype, + device=device) + self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, + out_channels=vision_config.hidden_size, + kernel_size=2, + stride=2, + dtype=dtype, + device=device) + self.boi = nn.Parameter( + torch.empty(1, 1, config.hidden_size, dtype=dtype, device=device)) + self.eoi = nn.Parameter( + torch.empty(1, 1, config.hidden_size, dtype=dtype, device=device)) + + def forward(self, images): + """forward.""" + x = self.patch_embedding(images) + x = self.transformer(x) + + x = x[:, 1:] + + b, s, h = x.shape + grid_size = int(s**0.5) + x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2) + x = self.conv(x) + + x = x.flatten(2).transpose(1, 2) + x = self.linear_proj(x) + boi = self.boi.expand(x.shape[0], -1, -1) + eoi = self.eoi.expand(x.shape[0], -1, -1) + x = torch.cat((boi, x, eoi), dim=1) + return x + + class CogVLMModel(nn.Module): """model.""" @@ -353,6 +637,9 @@ def __init__(self, dtype=dtype, device=device) + # vision model + self.vision = EVA2CLIPModel(config, dtype=dtype, device=device) + # build rotary embedding emb_type = RopeType.LinearScaling rope_dim = config.hidden_size // config.num_attention_heads @@ -371,6 +658,7 @@ def forward( position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, attn_metadata: Any = None, + images: torch.Tensor = None, inputs_embeds: Optional[torch.FloatTensor] = None, lang_ids: torch.LongTensor = None, vision_ids: torch.LongTensor = None, @@ -379,7 +667,12 @@ def forward( # token embedding if inputs_embeds is None: + if images is not None: + images_features = self.vision(images) + inputs_embeds = self.embed_tokens(input_ids) + if vision_ids is not None: + inputs_embeds[0, vision_ids] = images_features.flatten(0, 1) hidden_states = inputs_embeds @@ -416,85 +709,7 @@ def get_input_embeddings(self): VISION_TOKEN_TYPE = 1 -def get_vision_expert_mask(token_type_ids: torch.LongTensor): - vision_token_mask = torch.zeros_like(token_type_ids, dtype=torch.bool) - vision_token_mask[:, :-1] = 
(token_type_ids[:, :-1] - == VISION_TOKEN_TYPE) & (token_type_ids[:, 1:] - == VISION_TOKEN_TYPE) - language_token_mask = ~vision_token_mask - return vision_token_mask, language_token_mask - - -def build_position_ids(x: torch.BoolTensor) -> torch.LongTensor: - tmp = x.clone() - # image boi eoi token as LANGUAGE_TOKEN_TYPE - is_boi_eoi = torch.zeros_like(x, dtype=torch.bool) - is_boi_eoi[:, 1:] |= (tmp[:, 1:] == VISION_TOKEN_TYPE) & ( - tmp[:, :-1] == LANGUAGE_TOKEN_TYPE) - is_boi_eoi[:, 0] |= (tmp[:, 0] == VISION_TOKEN_TYPE) - is_boi_eoi[:, :-1] |= (tmp[:, :-1] == VISION_TOKEN_TYPE) & ( - tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) - is_boi_eoi[:, -1] |= (tmp[:, -1] == VISION_TOKEN_TYPE) - tmp[is_boi_eoi] = LANGUAGE_TOKEN_TYPE - # final position ids - y = torch.zeros_like(x, dtype=torch.long) - y[:, 1:] = (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) | ( - (tmp[:, 1:] == VISION_TOKEN_TYPE) & - (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE)) - y = y.cumsum(dim=-1) - return y - - -def _get_cogvlm_position_ids(context): - """get cogvlm position_ids.""" - q_seqlens = context.q_seqlens - history_lengths = context.kv_seqlens - q_seqlens - vision_input_info = context.vision_inputs - position_id_offsets = (vision_input_info.history_image_token_lengths - - vision_input_info.history_image_nums * 3) - lang_ids = None - vis_ids = None - if context.is_decoding: - position_ids = history_lengths - position_id_offsets - else: - if vision_input_info.input_embeddings is not None and len( - vision_input_info.input_embeddings) > 0: - starts = history_lengths - vision_input_info.history_lengths - ends = starts + q_seqlens - token_type_ids = vision_input_info.input_embedding_indexing.to( - torch.int) - history_position_lengths = (vision_input_info.history_lengths - - position_id_offsets) - position_ids_all = (history_position_lengths[:, None] + - build_position_ids(token_type_ids)) - position_ids = torch.cat([ - pids[s:e] - for (pids, s, e) in zip(position_ids_all, starts, ends) - ]) - vision_token_mask_all, _ = get_vision_expert_mask(token_type_ids) - vision_token_mask = torch.cat([ - masks[s:e] - for (masks, s, e) in zip(vision_token_mask_all, starts, ends) - ]) - mask_indexing = torch.arange(vision_token_mask.shape[-1], - device=vision_token_mask.device) - vis_ids = mask_indexing[vision_token_mask] - lang_ids = mask_indexing[~vision_token_mask] - - else: - position_ids = context.attention_mask.long().cumsum(-1) - 1 - position_ids += (history_lengths - - position_id_offsets).unsqueeze(-1) - device = position_ids.device - position_ids_1d = [ - ids[:l] for ids, l in zip(position_ids.cpu(), q_seqlens.cpu()) - ] - position_ids = torch.cat(position_ids_1d).to(device) - - return position_ids, lang_ids, vis_ids - - -class CogVLMForCausalLM(nn.Module, CudaGraphMixin): +class CogVLMForCausalLM(nn.Module, CudaGraphMixin, DeployModelMixin): """ModelForCausalLM.""" packed_modules_mapping = { @@ -512,6 +727,8 @@ def __init__(self, super().__init__() self.config = config self.ctx_mgr = ctx_mgr + # preprocessor + self.input_processor = CogVLMInputProcessor(self.config, dtype) # build model self.model = CogVLMModel(config, dtype=dtype, device=device) # build lm_head @@ -527,6 +744,7 @@ def forward( position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, + images: torch.Tensor = None, inputs_embeds: torch.Tensor = None, lang_ids: torch.LongTensor = None, vision_ids: torch.LongTensor = None, @@ -538,6 +756,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + 
images=images, inputs_embeds=inputs_embeds, lang_ids=lang_ids, vision_ids=vision_ids, @@ -561,8 +780,36 @@ def prepare_inputs_for_generation( """prepare input.""" # get input_ids, position_ids and attention metadatas input_ids = context.input_ids - position_ids, lang_ids, vis_ids = _get_cogvlm_position_ids(context) - position_ids = position_ids[None] + + # position_ids, lang_ids, vis_ids = _get_cogvlm_position_ids(context) + position_ids = context.position_ids + lang_ids = None + vis_ids = None + + # vision inputs + images = None + if context.input_multimodals is not None: + images = [ + input_mm.get('image', []) + for input_mm in context.input_multimodals + ] + # flatten batch + images = [data for im_data in images for data in im_data] + if len(images) == 0: + images = None + + if images is not None: + image_token_id = images[0].meta['image_token_id'] + vis_mask = input_ids[0] == image_token_id + images = torch.stack([data.data for data in images]) + + # get lang_ids + vis_range = torch.arange(0, + input_ids.size(-1), + device=input_ids.device) + vis_ids = vis_range[vis_mask] + lang_ids = vis_range[~vis_mask] + attn_metadata = context.attn_metadata # process vision embeddings @@ -581,6 +828,7 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + images=images, inputs_embeds=inputs_embeds, lang_ids=lang_ids, vision_ids=vis_ids, @@ -597,8 +845,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: - if 'model.vision' in name: - continue if 'rotary_emb.inv_freq' in name: continue if ('rotary_emb.cos_cached' in name @@ -607,6 +853,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if self.config.tie_word_embeddings and 'lm_head.weight' in name: continue for (param_name, weight_name, shard_id) in stacked_params_mapping: + if '.vision.' 
in name: + continue if weight_name not in name: continue name = name.replace(weight_name, param_name) @@ -620,6 +868,136 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): load_weight(param, q, shard_id='q') load_weight(param, k, shard_id='k') load_weight(param, v, shard_id='v') + elif '.query_key_value' in name: + param = params_dict[name] + q, k, v = param.weight_spliter(loaded_weight) + load_weight(param, q, shard_id='q') + load_weight(param, k, shard_id='k') + load_weight(param, v, shard_id='v') else: param = params_dict[name] load_weight(param, loaded_weight) + + def update_model_metas(self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: StepContext = None): + """update model meta.""" + model_metas = context.model_metas + input_multimodals = context.input_multimodals + if input_multimodals is None: + input_imgs = [[] for _ in model_metas] + else: + input_imgs = [] + for mm in input_multimodals: + if mm is None: + input_imgs.append([]) + else: + input_imgs.append(mm.get('image', [])) + + config = self.config + image_size: int = config.vision_config['image_size'] + patch_size: int = config.vision_config['patch_size'] + vision_token_num = ((image_size // patch_size // 2) * + (image_size // patch_size // 2) + 2) + num_pad = vision_token_num - 3 + + batched_num_img_tokens = [] + new_model_metas = [] + for meta, imgs in zip(model_metas, input_imgs): + if meta is None: + num_img_tokens = 0 + else: + num_img_tokens = meta.get('num_img_tokens', 0) + + batched_num_img_tokens.append(num_img_tokens) + + num_img_tokens += num_pad * len(imgs) + new_model_metas.append(dict(num_img_tokens=num_img_tokens)) + + # prepare cogvlm position_ids + q_seqlens = context.q_seqlens + position_ids = context.position_ids + + if context.is_decoding or all(len(imgs) == 0 for imgs in input_imgs): + num_img_tokens = torch.tensor(batched_num_img_tokens, + device=position_ids.device) + position_ids -= num_img_tokens[None] + else: + batched_position_ids = position_ids[0].split(q_seqlens) + for pos_ids, num_img_tok, imgs in zip(batched_position_ids, + batched_num_img_tokens, + input_imgs): + pos_ids -= num_img_tok + if len(imgs) == 0: + continue + + seq_len = pos_ids.size(0) + start = pos_ids[0].cpu().item() + new_pos_ids = [] + + imgs = sorted(imgs, key=lambda img: img.start) + for img in imgs: + img_pad_pos = img.start + 1 - num_img_tok + num_pad = img.end - img.start - 2 + new_pos_ids += list(range(start, img_pad_pos)) + new_pos_ids += [img_pad_pos] * num_pad + start = img_pad_pos + 1 + num_img_tok += num_pad + + remain = seq_len - len(new_pos_ids) + new_pos_ids += list(range(start, start + remain)) + + new_pos_ids = pos_ids.new_tensor(new_pos_ids) + pos_ids[:] = new_pos_ids + + position_ids = torch.cat(batched_position_ids)[None] + context.position_ids = position_ids + + return new_model_metas + + def get_input_processor(self) -> BaseModelInputProcessor: + """get input processor.""" + return self.input_processor + + +class CogVLMInputProcessor(BaseModelInputProcessor): + """input processor.""" + + def __init__(self, config: PretrainedConfig, dtype) -> None: + self.config = config + self.dtype = dtype + image_size: int = config.vision_config['image_size'] + patch_size: int = config.vision_config['patch_size'] + self.vision_token_num = ((image_size // patch_size // 2) * + (image_size // patch_size // 2) + 2) + + def preprocess_input(self, + input_ids: List[int], + input_multimodals=None, + **kwargs) -> PreprocessInputResult: + """prepare 
multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_imgs = [] + for input_mm in input_multimodals: + pixel_values = input_mm['pixel_values'].to(self.dtype) + offset = input_mm['offset'] + image_token_id = input_mm.get('image_token_id', 0) + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalTensor( + data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) + input_imgs.append(mm_data) + + result = PreprocessInputResult( + input_ids=input_ids, + input_multimodals=dict(image=input_imgs), + ) + return result diff --git a/lmdeploy/pytorch/models/internvl.py b/lmdeploy/pytorch/models/internvl.py index 70dd8f2159..79a796f7a2 100644 --- a/lmdeploy/pytorch/models/internvl.py +++ b/lmdeploy/pytorch/models/internvl.py @@ -1,17 +1,310 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple import torch +import torch.nn.functional as F from torch import nn from transformers.configuration_utils import PretrainedConfig +from lmdeploy.pytorch.engine.input_process import (BaseModelInputProcessor, + PreprocessInputResult) from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.nn import LayerNorm, RMSNorm +from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_qkv_proj, + build_rowwise_linear) +from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .patch import build_model_from_hf_config from .utils.cudagraph import CudaGraphMixin +from .utils.model import DeployModelMixin -class InternVLChatModel(nn.Module, CudaGraphMixin): +class InternVisionEmbeddings(nn.Module): + """intern vision embedding.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter( + torch.empty(1, 1, self.embed_dim, dtype=dtype, device=device), ) + + self.patch_embedding = nn.Conv2d(in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + dtype=dtype, + device=device) + + self.num_patches = (self.image_size // self.patch_size)**2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.empty(1, + self.num_positions, + self.embed_dim, + dtype=dtype, + device=device)) + + def _get_pos_embed(self, pos_embed, H, W): + target_dtype = pos_embed.dtype + pos_embed = pos_embed.float().reshape( + 1, self.image_size // self.patch_size, + self.image_size // self.patch_size, -1).permute(0, 3, 1, 2) + pos_embed = F.interpolate(pos_embed, + size=(H, W), + mode='bicubic', + align_corners=False).reshape( + 1, -1, H * W).permute(0, 2, + 1).to(target_dtype) + return pos_embed + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding( + pixel_values) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, + 
-1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + position_embedding = torch.cat([ + self.position_embedding[:, :1, :], + self._get_pos_embed(self.position_embedding[:, 1:, :], height, + width) + ], + dim=1) + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings + + +NORM2FN = { + 'rms_norm': RMSNorm, + 'layer_norm': LayerNorm, +} + + +class InternAttention(nn.Module): + """intern vl attention.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + quantization_config = getattr(config, 'quantization_config', None) + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + + self.qkv = build_qkv_proj( + self.embed_dim, + num_q_heads=self.num_heads, + num_kv_heads=self.num_heads, + head_size=self.head_dim, + bias=config.qkv_bias, + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = RMSNorm( + self.embed_dim, + eps=config.layer_norm_eps, + dtype=dtype, + device=device, + ) + self.k_norm = RMSNorm( + self.embed_dim, + eps=config.layer_norm_eps, + dtype=dtype, + device=device, + ) + + self.scale = self.head_dim**-0.5 + + # o_proj + self.proj = build_rowwise_linear(self.embed_dim, + self.embed_dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward(self, hidden_states): + """forward.""" + + # qkv proj + qkv_states = self.qkv(hidden_states) + q, k, v = self.qkv.split_qkv(qkv_states) + + if self.qk_normalization: + q_shape = q.shape + q = self.q_norm(q.flatten(-2, -1)).view(q_shape) + k = self.k_norm(k.flatten(-2, -1)).view(q_shape) + + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + attn_output = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + + # o proj + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.flatten(-2, -1) + attn_output = self.proj(attn_output) + return attn_output + + +class InternMLP(nn.Module): + """intern vl mlp.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + from transformers.activations import ACT2FN + self.config = config + quantization_config = getattr(config, 'quantization_config', None) + self.act = ACT2FN[config.hidden_act] + + self.fc1 = build_colwise_linear( + config.hidden_size, + config.intermediate_size, + bias=True, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + + self.fc2 = build_rowwise_linear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class InternVisionEncoderLayer(nn.Module): + """intern vision encoder layer.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.intermediate_size = config.intermediate_size + self.norm_type = getattr(config, 'norm_type', 'rms_norm') + + self.attn = 
InternAttention(config, dtype=dtype, device=device) + self.mlp = InternMLP(config, dtype=dtype, device=device) + self.norm1 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + self.norm2 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + + self.ls1 = nn.Parameter( + torch.empty(self.embed_dim, dtype=dtype, device=device)) + self.ls2 = nn.Parameter( + torch.empty(self.embed_dim, dtype=dtype, device=device)) + + def forward( + self, + hidden_states: torch.Tensor, + ): + """forward.""" + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1 + + hidden_states = hidden_states + self.mlp( + self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2 + + return hidden_states + + +class InternVisionEncoder(nn.Module): + """intern vision encoder.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + InternVisionEncoderLayer(config, dtype=dtype, device=device) + for idx in range(config.num_hidden_layers) + ]) + + def forward( + self, + inputs_embeds, + ): + """forward.""" + hidden_states = inputs_embeds + for _, encoder_layer in enumerate(self.layers): + layer_outputs = encoder_layer(hidden_states, ) + hidden_states = layer_outputs + return hidden_states + + +class InternVisionModel(nn.Module): + """intern vision model.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + + self.embeddings = InternVisionEmbeddings(config, + dtype=dtype, + device=device) + self.encoder = InternVisionEncoder(config, dtype=dtype, device=device) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + ): + """forward.""" + assert pixel_values.dim() == 4 + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder(inputs_embeds=hidden_states) + last_hidden_state = encoder_outputs + + return last_hidden_state + + +class InternVLChatModel(nn.Module, DeployModelMixin, CudaGraphMixin): def __init__(self, config: PretrainedConfig, @@ -21,31 +314,106 @@ def __init__(self, super().__init__() self.config = config self.ctx_mgr = ctx_mgr + self.select_layer = config.select_layer + llm_config = config.llm_config + self.llm_arch_name = llm_config.architectures[0] + self.is_mono = self.llm_arch_name == 'InternLM2VEForCausalLM' + + vision_config = config.vision_config + if self.is_mono: + from .internvl_patch import InternVisionPatchModel + self.vision_model = InternVisionPatchModel( + vision_config, + dtype=dtype, + device=device, + ) + else: + self.vision_model = InternVisionModel(vision_config, + dtype=dtype, + device=device) + self.language_model = build_model_from_hf_config(llm_config, dtype=dtype, device=device) - self.llm_arch_name = llm_config.architectures[0] + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.llm_config.hidden_size + self.downsample_ratio = config.downsample_ratio + self.mlp1 = nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2, + dtype=dtype, + device=device), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2, + llm_hidden_size, + dtype=dtype, + device=device), nn.GELU(), + nn.Linear(llm_hidden_size, + llm_hidden_size, + dtype=dtype, + device=device)) # for Mono-InternVL - self.is_mono = 
self.llm_arch_name == 'InternLM2VEForCausalLM' if self.is_mono: assert dtype != torch.float16, ( 'Currently Mono-InternVL does not support FP16 due to' 'numerical instability. Please use BF16 instead.') + self.input_processor = InternVLInputProcessor(self.config, dtype) + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + # N, H * scale, W, C // scale --> + # N, H * scale, W * scale, C // (scale ** 2) + x = x.view(n, int(h * scale_factor), int(w * scale_factor), + int(c / (scale_factor * scale_factor))) + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def extract_feature(self, pixel_values): + """extract vision feature.""" + assert self.select_layer == -1 + vit_embeds = self.vision_model(pixel_values) + if self.is_mono: + if int(vit_embeds.shape[1]**0.5)**2 != vit_embeds.shape[1]: + vit_embeds = vit_embeds[:, 1:, :] + else: + vit_embeds = vit_embeds[:, 1:, :] + + h = w = int(vit_embeds.shape[1]**0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, + scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, + vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) + return vit_embeds + def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, + pixel_values: torch.Tensor = None, + image_mask: torch.Tensor = None, inputs_embeds: torch.Tensor = None, vision_embedding_indexing: torch.Tensor = None, text_embedding_indexing: torch.Tensor = None, **kwargs, ): + if inputs_embeds is None and pixel_values is not None: + # extract feature + vit_embeds = self.extract_feature(pixel_values) + lang_embeds = self.language_model.get_input_embeddings()(input_ids) + lang_embeds.masked_scatter_(image_mask[..., None], vit_embeds) + + inputs_embeds = lang_embeds + if self.is_mono: return self.language_model.forward( input_ids=input_ids, @@ -80,11 +448,38 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata - # get inputs from context vision_embeddings = context.input_embeddings - vision_embedding_indexing = context.input_embedding_indexing + vision_embedding_indexing = None + # vision inputs + pixel_values = None + image_mask = None + if context.input_multimodals is not None: + pixel_values = [ + input_mm.get('image', []) + for input_mm in context.input_multimodals + ] + # flatten batch + pixel_values = [ + data for im_data in pixel_values for data in im_data + ] + if len(pixel_values) > 0: + image_token_id = pixel_values[0].meta['image_token_id'] + image_mask = input_ids == image_token_id + pixel_values = torch.cat([data.data for data in pixel_values]) + else: + pixel_values = None + image_mask = None + + if self.is_mono and pixel_values is not None: + vision_embedding_indexing = torch.arange(input_ids.shape[1], + device=input_ids.device) + vision_embedding_indexing = vision_embedding_indexing[ + image_mask[0]] + + # get inputs from context if vision_embeddings is not None and len(vision_embeddings) > 0: + vision_embedding_indexing = context.input_embedding_indexing if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) inputs_embeds[:, @@ -104,6 +499,8 @@ def 
prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + pixel_values=pixel_values, + image_mask=image_mask, inputs_embeds=inputs_embeds, vision_embedding_indexing=vision_embedding_indexing, text_embedding_indexing=text_embedding_indexing, @@ -114,18 +511,85 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + pixel_values=pixel_values, + image_mask=image_mask, inputs_embeds=inputs_embeds, ) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): """load weights.""" - prefix_length = len('language_model.') + lang_prefix = 'language_model.' + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if name.startswith(lang_prefix): + continue + + if 'qkv' in name: + param = params_dict[name] + q, k, v = param.weight_spliter(loaded_weight) + load_weight(param, q, shard_id='q') + load_weight(param, k, shard_id='k') + load_weight(param, v, shard_id='v') + else: + param = params_dict[name] + load_weight(param, loaded_weight) + + lang_prefix_length = len(lang_prefix) new_weights = dict() for key, val in weights: - if not key.startswith('language_model.'): + if not key.startswith(lang_prefix): continue - new_key = key[prefix_length:] + new_key = key[lang_prefix_length:] new_weights[new_key] = val self.language_model.load_weights(new_weights.items()) + + def get_input_processor(self) -> BaseModelInputProcessor: + """get input processor.""" + return self.input_processor + + +class InternVLInputProcessor(BaseModelInputProcessor): + """internvl input processor.""" + + def __init__(self, config: PretrainedConfig, dtype) -> None: + self.config = config + self.dtype = dtype + + vision_config = config.vision_config + self.image_size = vision_config.image_size + self.patch_size = vision_config.patch_size + self.num_patches = (self.image_size // self.patch_size)**2 + self.num_positions = self.num_patches + 1 + self.vision_token_num = self.num_patches // 4 + + def preprocess_input(self, + input_ids: List[int], + input_multimodals: List[Dict[str, Any]] = None, + **kwargs) -> PreprocessInputResult: + """prepare multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_imgs = [] + for input_mm in input_multimodals: + pixel_values = input_mm['pixel_values'].to(self.dtype) + offset = input_mm['offset'] + image_token_id = input_mm.get('image_token_id', 0) + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalTensor( + data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) + input_imgs.append(mm_data) + + result = PreprocessInputResult( + input_ids=input_ids, + input_multimodals=dict(image=input_imgs), + ) + return result diff --git a/lmdeploy/pytorch/models/internvl_patch.py b/lmdeploy/pytorch/models/internvl_patch.py new file mode 100644 index 0000000000..d13ad2d39b --- /dev/null +++ b/lmdeploy/pytorch/models/internvl_patch.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import torch +import torch.nn.functional as F +from torch import nn +from transformers.configuration_utils import PretrainedConfig + + +class InternVisionEmbeddings(nn.Module): + """mono vision.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter( + torch.empty(1, 1, self.embed_dim, dtype=dtype, device=device), ) + + self.patch_embedding = nn.Conv2d(in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + dtype=dtype, + device=device) + + self.num_patches = (self.image_size // self.patch_size)**2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.empty(1, + self.num_positions, + self.embed_dim, + dtype=dtype, + device=device)) + + def _get_pos_embed(self, pos_embed, H, W): + target_dtype = pos_embed.dtype + pos_embed = pos_embed.float().reshape( + 1, self.image_size // self.patch_size, + self.image_size // self.patch_size, -1).permute(0, 3, 1, 2) + pos_embed = F.interpolate(pos_embed, + size=(H, W), + mode='bicubic', + align_corners=False) + pos_embed = pos_embed.reshape(1, -1, H * W).permute(0, 2, + 1).to(target_dtype) + return pos_embed + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding( + pixel_values) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, + -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + position_embedding = torch.cat([ + self.position_embedding[:, :1, :], + self._get_pos_embed(self.position_embedding[:, 1:, :], height, + width) + ], + dim=1) + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings + + +class InternVisionPatchModel(nn.Module): + """mono vision.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.embeddings = InternVisionEmbeddings(config, + dtype=dtype, + device=device) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + ): + if len(pixel_values.shape) != 4: + raise ValueError(f'wrong pixel_values size: {pixel_values.shape}') + + hidden_states = self.embeddings(pixel_values)[:, 1:] + return hidden_states diff --git a/lmdeploy/pytorch/models/llama.py b/lmdeploy/pytorch/models/llama.py index f38c5ef02b..8acd20a8d5 100644 --- a/lmdeploy/pytorch/models/llama.py +++ b/lmdeploy/pytorch/models/llama.py @@ -450,22 +450,3 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): else: param = params_dict[name] load_weight(param, loaded_weight) - - -class LlavaLlamaForCausalLM(LlamaForCausalLM): - """llava llama for causallm.""" - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - """load weights.""" - - new_weights = dict() - for key, val in weights: - if key.startswith('model.vision_tower'): - continue - if key.startswith('model.mm_projector'): - continue - if key.startswith('model.image_newline'): - continue - new_weights[key] = val - - super().load_weights(new_weights.items()) diff 
--git a/lmdeploy/pytorch/models/llava.py b/lmdeploy/pytorch/models/llava.py index 56cb5ca675..751f7343ec 100644 --- a/lmdeploy/pytorch/models/llava.py +++ b/lmdeploy/pytorch/models/llava.py @@ -1,17 +1,443 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple import torch +import torch.nn.functional as F from torch import nn from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_outputs import BaseModelOutputWithPooling +from transformers.models.llava.configuration_llava import LlavaConfig +from lmdeploy.pytorch.engine.input_process import (BaseModelInputProcessor, + PreprocessInputResult) from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_qkv_proj, + build_rowwise_linear) +from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .patch import build_model_from_hf_config from .utils.cudagraph import CudaGraphMixin +from .utils.model import DeployModelMixin -class LlavaForConditionalGeneration(nn.Module, CudaGraphMixin): +class LlavaMultiModalProjector(nn.Module): + + def __init__(self, + config: LlavaConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + from transformers.activations import ACT2FN + + self.linear_1 = nn.Linear(config.vision_config.hidden_size, + config.text_config.hidden_size, + bias=True, + dtype=dtype, + device=device) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear(config.text_config.hidden_size, + config.text_config.hidden_size, + bias=True, + dtype=dtype, + device=device) + + def forward(self, image_features): + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class CLIPVisionEmbeddings(nn.Module): + """clip vision embedding.""" + + def __init__(self, + config, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter( + torch.empty(self.embed_dim, dtype=dtype, device=device)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + dtype=dtype, + device=device, + ) + + self.num_patches = (self.image_size // self.patch_size)**2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding( + self.num_positions, + self.embed_dim, + dtype=dtype, + device=device, + ) + self.register_buffer('position_ids', + torch.arange(self.num_positions, + device=device).expand((1, -1)), + persistent=False) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, + width: int) -> torch.Tensor: + """This method allows to interpolate the pre-trained position + encodings, to be able to use the model on higher resolution images. + + This method is also adapted to support torch.jit tracing. 
+ """ + + num_patches = embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 + + # always interpolate when tracing + # to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing( + ) and num_patches == num_positions and height == width: + return self.position_embedding(self.position_ids) + + from transformers.utils import torch_int + + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, + sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode='bicubic', + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, + pixel_values: torch.FloatTensor, + interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size + or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f' ({self.image_size}*{self.image_size}).') + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding( + embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding( + self.position_ids) + return embeddings + + +class CLIPAttention(nn.Module): + """clip attention.""" + + def __init__(self, + config, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + quantization_config = getattr(config, 'quantization_config', None) + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + + self.qkv_proj = build_qkv_proj( + self.embed_dim, + num_q_heads=self.num_heads, + num_kv_heads=self.num_heads, + head_size=self.head_dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + + self.scale = self.head_dim**-0.5 + + # o_proj + self.out_proj = build_rowwise_linear(self.embed_dim, + self.embed_dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward( + self, + hidden_states, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + ): + """forward.""" + # qkv proj + qkv_states = self.qkv_proj(hidden_states) + q, k, v = self.qkv_proj.split_qkv(qkv_states) + + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + if attention_mask is not None and causal_attention_mask is not None: + attn_mask = attention_mask + causal_attention_mask + elif causal_attention_mask is not None: + attn_mask = causal_attention_mask + else: + attn_mask = attention_mask + + attn_output 
= F.scaled_dot_product_attention(q, + k, + v, + attn_mask=attn_mask, + scale=self.scale) + + # o proj + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.flatten(-2, -1) + attn_output = self.out_proj(attn_output) + return attn_output + + +class CLIPMLP(nn.Module): + """clip mlp.""" + + def __init__(self, + config, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + quantization_config = getattr(config, 'quantization_config', None) + from transformers.activations import ACT2FN + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = build_colwise_linear( + config.hidden_size, + config.intermediate_size, + bias=True, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + self.fc2 = build_rowwise_linear( + config.intermediate_size, + config.hidden_size, + bias=True, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """forward.""" + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class CLIPEncoderLayer(nn.Module): + """clip encoder layer.""" + + def __init__(self, + config, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLIPAttention(config, dtype=dtype, device=device) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + self.mlp = CLIPMLP(config, dtype=dtype, device=device) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + ): + """forward.""" + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class CLIPEncoder(nn.Module): + """clip encoder.""" + + def __init__(self, + config, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + CLIPEncoderLayer(config, dtype=dtype, device=device) + for _ in range(config.num_hidden_layers) + ]) + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + vision_feature_layer: int = -1, + ): + """forward.""" + hidden_states = inputs_embeds + num_vision_layers = len(self.layers) + vision_feature_layer + 1 + for _, encoder_layer in enumerate(self.layers[:num_vision_layers]): + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask=causal_attention_mask, + ) + + hidden_states = layer_outputs + + return hidden_states + + +class CLIPVisionTransformer(nn.Module): + """clip vision transformer.""" + + def __init__(self, + config, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + 
self.embeddings = CLIPVisionEmbeddings(config, + dtype=dtype, + device=device) + self.pre_layrnorm = nn.LayerNorm(embed_dim, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + self.encoder = CLIPEncoder(config, dtype=dtype, device=device) + self.post_layernorm = nn.LayerNorm(embed_dim, + eps=config.layer_norm_eps, + dtype=dtype, + device=device) + + def forward( + self, + pixel_values: torch.FloatTensor, + interpolate_pos_encoding: bool = False, + vision_feature_layer: int = -1, + ) -> BaseModelOutputWithPooling: + """forward.""" + hidden_states = self.embeddings( + pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + vision_feature_layer=vision_feature_layer) + + last_hidden_state = encoder_outputs + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=None, + attentions=None, + ) + + +class CLIPVisionModel(nn.Module): + """clip vision model.""" + + def __init__(self, + config, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.vision_model = CLIPVisionTransformer(config, + dtype=dtype, + device=device) + + def forward(self, + pixel_values: torch.FloatTensor, + interpolate_pos_encoding: bool = False, + vision_feature_layer: int = -1, + **kwargs): + """forward.""" + return self.vision_model( + pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + vision_feature_layer=vision_feature_layer) + + +def build_vision_model(vision_config, + dtype: torch.dtype = None, + device: torch.device = None): + """build vision model.""" + model_type = vision_config.model_type + + if model_type == 'clip_vision_model': + return CLIPVisionModel(vision_config, dtype, device) + else: + raise NotImplementedError(f'<{model_type}> is not implemented.') + + +class LlavaForConditionalGeneration(nn.Module, CudaGraphMixin, + DeployModelMixin): def __init__(self, config: PretrainedConfig, @@ -22,19 +448,67 @@ def __init__(self, self.config = config self.ctx_mgr = ctx_mgr text_config = config.text_config + + self.vision_tower = build_vision_model(config.vision_config, + dtype=dtype, + device=device) + self.language_model = build_model_from_hf_config(text_config, dtype=dtype, device=device) + self.multi_modal_projector = LlavaMultiModalProjector(config, + dtype=dtype, + device=device) + + self.input_processor = LLavaInputProcessor(config, dtype) + + def get_image_features(self, + pixel_values, + vision_feature_layer: int = -1, + vision_feature_select_strategy: str = 'default'): + """get image features.""" + selected_image_feature = self.vision_tower( + pixel_values, vision_feature_layer=vision_feature_layer)[0] + if vision_feature_select_strategy == 'default': + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == 'full': + selected_image_feature = selected_image_feature + else: + raise ValueError( + f'Unexpected select feature strategy: {vision_feature_select_strategy}' # noqa: E501 + ) + image_features = self.multi_modal_projector(selected_image_feature) + image_features = image_features.flatten(0, 1)[None] + + return image_features + def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], attn_metadata: Any = None, + pixel_values: torch.Tensor = None, + 
image_mask: torch.Tensor = None, inputs_embeds: torch.Tensor = None, **kwargs, ): + if inputs_embeds is None: + image_features = None + if pixel_values is not None: + vision_feature_layer = self.config.vision_feature_layer + select_strategy = self.config.vision_feature_select_strategy + image_features = self.get_image_features( + pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=select_strategy) + inputs_embeds = self.language_model.get_input_embeddings()( + input_ids) + if pixel_values is not None: + inputs_embeds.masked_scatter_(image_mask[..., None], + image_features) + return self.language_model.forward(input_ids=input_ids, inputs_embeds=inputs_embeds, past_key_values=past_key_values, @@ -59,6 +533,27 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata + + # vision inputs + pixel_values = None + image_mask = None + if context.input_multimodals is not None: + pixel_values = [ + input_mm.get('image', []) + for input_mm in context.input_multimodals + ] + # flatten batch + pixel_values = [ + data for im_data in pixel_values for data in im_data + ] + if len(pixel_values) > 0: + image_token_id = pixel_values[0].meta['image_token_id'] + image_mask = input_ids == image_token_id + pixel_values = torch.cat([data.data for data in pixel_values]) + else: + pixel_values = None + image_mask = None + # get inputs from context vision_embeddings = context.input_embeddings vision_embedding_indexing = context.input_embedding_indexing @@ -75,18 +570,404 @@ def prepare_inputs_for_generation( position_ids=position_ids, past_key_values=past_key_values, attn_metadata=attn_metadata, + pixel_values=pixel_values, + image_mask=image_mask, inputs_embeds=inputs_embeds, ) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): """load weights.""" - prefix_length = len('language_model.') + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ('.qkv_proj', '.q_proj', 'q'), + ('.qkv_proj', '.k_proj', 'k'), + ('.qkv_proj', '.v_proj', 'v'), + ] + + # vis model + lang_prefix = 'language_model.' 
+ params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if name.startswith(lang_prefix): + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + load_weight(param, loaded_weight, shard_id=shard_id) + break + else: + param = params_dict[name] + load_weight(param, loaded_weight) + + # language model + prefix_length = len(lang_prefix) new_weights = dict() for key, val in weights: - if not key.startswith('language_model.'): + if not key.startswith(lang_prefix): continue new_key = key[prefix_length:] new_weights[new_key] = val self.language_model.load_weights(new_weights.items()) + + def get_input_processor(self) -> BaseModelInputProcessor: + """get input processor.""" + return self.input_processor + + +class LLavaInputProcessor(BaseModelInputProcessor): + """llava input processor.""" + + def __init__(self, config: PretrainedConfig, dtype) -> None: + self.config = config + self.dtype = dtype + + def preprocess_input(self, + input_ids: List[int], + input_multimodals: List[Dict[str, Any]] = None, + **kwargs) -> PreprocessInputResult: + """prepare multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_imgs = [] + for input_mm in input_multimodals: + pixel_values = input_mm['pixel_values'].to(self.dtype) + offset = input_mm['offset'] + image_token_id = input_mm.get('image_token_id', 0) + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalTensor( + data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) + input_imgs.append(mm_data) + + result = PreprocessInputResult( + input_ids=input_ids, + input_multimodals=dict(image=input_imgs), + ) + return result + + +def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): + + from transformers.image_processing_utils import select_best_resolution + + if not isinstance(grid_pinpoints, list): + raise TypeError('grid_pinpoints should be a list of tuples or lists') + + if not isinstance(image_size, (list, tuple)): + image_size = image_size.tolist() + + height, width = select_best_resolution(image_size, grid_pinpoints) + return height // patch_size, width // patch_size + + +def unpad_image(tensor, original_size): + """Unpads a PyTorch tensor of a padded and resized image.""" + if not isinstance(original_size, (list, tuple)): + original_size = original_size.tolist() + original_height, original_width = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(round(original_height * scale_factor, 7)) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding:current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(round(original_width * scale_factor, 7)) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding:current_width - padding] + + return unpadded_tensor + + +def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): + """Calculate the number of patches after the preprocessing for images of + any resolution.""" + from 
transformers.image_processing_utils import select_best_resolution + if not isinstance(grid_pinpoints, list): + raise TypeError('grid_pinpoints should be a list of tuples or lists') + + if not isinstance(image_size, (list, tuple)): + image_size = image_size.tolist() + + best_resolution = select_best_resolution(image_size, grid_pinpoints) + height, width = best_resolution + + num_patches = (height // patch_size) * (width // patch_size) + # add the base patch + num_patches += 1 + return num_patches + + +class LlavaNextForConditionalGeneration(LlavaForConditionalGeneration): + + def __init__(self, + config: PretrainedConfig, + ctx_mgr: StepContextManager, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__(config=config, + ctx_mgr=ctx_mgr, + dtype=dtype, + device=device) + self.image_newline = nn.Parameter( + torch.empty(config.text_config.hidden_size, + dtype=dtype, + device=device)) + self.input_processor = LLavaNextInputProcessor(config, dtype) + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: int, + vision_feature_select_strategy: str, + ): + # ! infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.config.image_grid_pinpoints, + patch_size=self.config.vision_config.image_size, + ) for imsize in image_sizes + ] + if pixel_values.dim() == 5: + # stacked if input is + # (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [ + pix_val[:num_patch] + for pix_val, num_patch in zip(pixel_values, image_num_patches) + ] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of + # (num_patches, num_channels, height, width) + raise ValueError(f'pixel_values of shape {pixel_values.shape}, ' + 'expect to be of 4 or 5 dimensions') + + selected_image_feature = self.vision_tower( + pixel_values, vision_feature_layer=vision_feature_layer)[0] + if vision_feature_select_strategy == 'default': + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == 'full': + selected_image_feature = selected_image_feature + image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.split(image_features, image_num_patches, dim=0) + return image_features + + def pack_image_features(self, + image_features, + image_sizes, + vision_feature_select_strategy, + image_newline=None): + + new_image_features = [] + feature_lens = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + height = width = (self.config.vision_config.image_size // + self.config.vision_config.patch_size) + + if vision_feature_select_strategy == 'default': + expected_num_patches = height * width + elif vision_feature_select_strategy == 'full': + expected_num_patches = height * width + 1 + if expected_num_patches != base_image_feature.shape[0]: + raise ValueError('The number of patches is ' + 'not consistent with the image size.') + + (num_patch_height, + num_patch_width) = get_anyres_image_grid_shape( + image_sizes[image_idx], + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, + ) + image_feature = image_feature.view(num_patch_height, + num_patch_width, height, + width, -1) + image_feature = image_feature.permute(4, 0, 2, 1, + 3).contiguous() + image_feature = 
image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, + image_sizes[image_idx]) + if image_newline is not None: + image_feature = torch.cat( + ( + image_feature, + image_newline[:, None, None].expand( + *image_feature.shape[:-1], 1).to( + image_feature.dtype), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat((base_image_feature, image_feature), + dim=0) + else: + image_feature = image_feature[0] + if image_newline is not None: + image_feature = torch.cat( + (image_feature, image_newline[None].to(image_feature)), + dim=0) + new_image_features.append(image_feature) + feature_lens.append(image_feature.size(0)) + image_features = torch.cat(new_image_features, dim=0) + return image_features + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + past_key_values: List[List[torch.Tensor]], + attn_metadata: Any = None, + pixel_values: torch.Tensor = None, + image_sizes: torch.Tensor = None, + image_mask: torch.Tensor = None, + inputs_embeds: torch.Tensor = None, + **kwargs, + ): + if inputs_embeds is None: + image_features = None + if pixel_values is not None: + vision_feature_layer = self.config.vision_feature_layer + select_strategy = self.config.vision_feature_select_strategy + image_sizes = image_sizes.tolist() + image_features = self.get_image_features( + pixel_values, + image_sizes, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=select_strategy) + image_features = self.pack_image_features( + image_features, + image_sizes, + vision_feature_select_strategy=select_strategy, + image_newline=self.image_newline, + ) + image_features = image_features[None] + inputs_embeds = self.language_model.get_input_embeddings()( + input_ids) + if pixel_values is not None: + inputs_embeds.masked_scatter_(image_mask[..., None], + image_features) + + return self.language_model.forward(input_ids=input_ids, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + position_ids=position_ids, + attn_metadata=attn_metadata) + + def get_input_processor(self) -> BaseModelInputProcessor: + """get input processor.""" + return self.input_processor + + def prepare_inputs_for_generation( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: torch.Tensor = None, + context: StepContext = None, + ): + """prepare input.""" + input_ids = context.input_ids + position_ids = context.position_ids + attn_metadata = context.attn_metadata + + # vision inputs + pixel_values = None + image_sizes = None + image_mask = None + if context.input_multimodals is not None: + img_mms = [ + input_mm.get('image', []) + for input_mm in context.input_multimodals + ] + # flatten batch + img_mms = [data for im_data in img_mms for data in im_data] + if len(img_mms) > 0: + image_token_id = img_mms[0].meta['image_token_id'] + image_mask = input_ids == image_token_id + pixel_values = torch.cat( + [data.data.flatten(0, 1) for data in img_mms]) + image_sizes = torch.cat( + [data.meta['image_sizes'] for data in img_mms]) + else: + pixel_values = None + image_sizes = None + + # get inputs from context + vision_embeddings = context.input_embeddings + vision_embedding_indexing = context.input_embedding_indexing + + if vision_embeddings is not None and len(vision_embeddings) > 0: + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds[:, + vision_embedding_indexing, :] = vision_embeddings.to( + inputs_embeds) + + return dict( + 
input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + pixel_values=pixel_values, + image_sizes=image_sizes, + image_mask=image_mask, + inputs_embeds=inputs_embeds, + ) + + +class LLavaNextInputProcessor(BaseModelInputProcessor): + """llava input processor.""" + + def __init__(self, config: PretrainedConfig, dtype) -> None: + self.config = config + self.dtype = dtype + + def preprocess_input(self, + input_ids: List[int], + input_multimodals: List[Dict[str, Any]] = None, + **kwargs) -> PreprocessInputResult: + """prepare multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_imgs = [] + for input_mm in input_multimodals: + pixel_values = input_mm['pixel_values'].to(self.dtype) + image_sizes = input_mm['image_sizes'] + offset = input_mm['offset'] + image_token_id = input_mm.get('image_token_id', 0) + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalTensor(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict( + image_sizes=image_sizes, + image_token_id=image_token_id)) + input_imgs.append(mm_data) + + result = PreprocessInputResult( + input_ids=input_ids, + input_multimodals=dict(image=input_imgs), + ) + return result diff --git a/lmdeploy/pytorch/models/mistral.py b/lmdeploy/pytorch/models/mistral.py index 04af4c8526..ad27963093 100644 --- a/lmdeploy/pytorch/models/mistral.py +++ b/lmdeploy/pytorch/models/mistral.py @@ -420,22 +420,3 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): else: param = params_dict[name] load_weight(param, loaded_weight) - - -class LlavaMistralForCausalLM(MistralForCausalLM): - """llava forcausallm.""" - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - """load weights.""" - - new_weights = dict() - for key, val in weights: - if key.startswith('model.vision_tower'): - continue - if key.startswith('model.mm_projector'): - continue - if key.startswith('model.image_newline'): - continue - new_weights[key] = val - - super().load_weights(new_weights.items()) diff --git a/lmdeploy/pytorch/models/mllama.py b/lmdeploy/pytorch/models/mllama.py index 2596fe5299..bbe9b3a1fc 100644 --- a/lmdeploy/pytorch/models/mllama.py +++ b/lmdeploy/pytorch/models/mllama.py @@ -3,23 +3,61 @@ import torch from torch import nn +from torch.nn import functional as F from transformers.models.llama import LlamaConfig -from transformers.models.mllama.modeling_mllama import MllamaTextConfig +from transformers.models.mllama.modeling_mllama import (MllamaTextConfig, + MllamaVisionConfig) +from lmdeploy.pytorch.engine.input_process import (BaseModelInputProcessor, + PreprocessInputResult) from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, - SiluAndMul, build_rotary_embedding) -from lmdeploy.pytorch.nn.linear import (build_merged_colwise_linear, +from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, LayerNorm, RMSNorm, + RopeType, SiluAndMul, build_rotary_embedding) +from lmdeploy.pytorch.nn.linear import (build_colwise_linear, + build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear) from lmdeploy.pytorch.nn.rotary_embedding import Llama3Parameters from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight -from 
.utils.cudagraph import CudaGraphMixin +from .utils.cudagraph import CudaGraphMeta, CudaGraphMixin, next_power_of_2 +from .utils.model import DeployModelMixin MLLAMA_IMAGE_TOKEN_ID = 128256 MLLAMA_IMAGE_TOKEN = '<|image|>' +def _prepare_aspect_ratio_attention_mask( + aspect_ratio_mask: torch.Tensor, + num_patches: int, + target_length: int, + dtype: torch.dtype, +) -> torch.Tensor: + # Expand aspect ratio mask to target_length + batch_size, max_num_tiles = aspect_ratio_mask.shape + attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, + 1).to(dtype) + attention_mask = attention_mask.repeat(1, 1, target_length, 1) + + # Mask padding patches + pad_patches = target_length - num_patches + attention_mask[:, :, -pad_patches:] = 0 + + # Invert the mask (0 -> 1, 1 -> 0) + attention_mask = 1 - attention_mask + + # Reshape to 2D and create 4D attention mask + # (batch_size, 1, max_num_tiles * target_length, + # max_num_tiles * target_length) + attention_mask = attention_mask.reshape(batch_size, + max_num_tiles * target_length, 1) + attention_mask = attention_mask * attention_mask.transpose( + -1, -2) * torch.finfo(dtype).min + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + class LlamaAttention(nn.Module): """Rewrite module of LlamaAttention.""" @@ -157,6 +195,7 @@ def __init__(self, self.head_dim, num_kv_heads=self.num_key_value_heads, v_head_size=self.head_dim, + causal=False, ) self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) @@ -579,7 +618,542 @@ def get_logits(self, hidden_states: torch.Tensor): return self.lm_head(hidden_states) -class MllamaForConditionalGeneration(nn.Module, CudaGraphMixin): +class MllamaPrecomputedPositionEmbedding(nn.Module): + """vis position embedding.""" + + def __init__(self, + config: MllamaVisionConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.max_num_tiles = config.max_num_tiles + self.max_aspect_ratio_id = config.max_aspect_ratio_id + self.config = config + self.num_patches = (config.image_size // config.patch_size)**2 + 1 + self.hidden_size = config.hidden_size + + self.gate = nn.Parameter(torch.empty(1, dtype=dtype, device=device)) + + # position embedding + self.embedding = nn.Parameter( + torch.empty(self.num_patches, + self.hidden_size, + dtype=dtype, + device=device)) + + # tile position embedding + self.tile_embedding = nn.Embedding(self.max_aspect_ratio_id + 1, + self.max_num_tiles * + self.num_patches * self.hidden_size, + dtype=dtype, + device=device) + + self._weight_inited = False + + def _init_weight(self): + """init weight.""" + if self._weight_inited: + return + + gate_tanh = self.gate.tanh() + gated_position_embedding = (1 - gate_tanh) * self.embedding + self.gate_tanh = gate_tanh + self.gated_position_embedding = gated_position_embedding.view( + 1, 1, self.num_patches, self.hidden_size) + + self._weight_inited = True + + def forward(self, hidden_state: torch.Tensor, + aspect_ratio_ids: torch.Tensor) -> torch.Tensor: + """forward.""" + self._init_weight() + + # position embeddings + hidden_state = hidden_state + self.gated_position_embedding + + # precomputed tile position embeddings + tile_position_embedding = self.tile_embedding(aspect_ratio_ids) + batch_size = hidden_state.shape[0] + tile_position_embedding = tile_position_embedding.reshape( + batch_size, self.max_num_tiles, self.num_patches, self.hidden_size) + gated_tile_position_embedding = (self.gate_tanh * + tile_position_embedding) + hidden_state = hidden_state + 
gated_tile_position_embedding + + return hidden_state + + +class MllamaPrecomputedAspectRatioEmbedding(nn.Module): + + def __init__(self, + config: MllamaVisionConfig, + is_gated: bool = True, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.max_num_tiles = config.max_num_tiles + self.hidden_size = config.hidden_size + self.max_aspect_ratio_id = config.max_aspect_ratio_id + self.is_gated = is_gated + + self.embedding = nn.Embedding(self.max_aspect_ratio_id + 1, + self.max_num_tiles * self.hidden_size, + dtype=dtype, + device=device) + if is_gated: + self.gate = nn.Parameter(torch.empty(1, dtype=dtype, + device=device)) + + self._weight_inited = False + + def _init_weight(self): + """init weight.""" + if self._weight_inited: + return + + gate_tanh = self.gate.tanh() + self.gate_tanh = gate_tanh + + self._weight_inited = True + + def forward(self, hidden_state: torch.Tensor, + aspect_ratio_ids: torch.Tensor) -> torch.Tensor: + self._init_weight() + embeddings = self.embedding(aspect_ratio_ids) + embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, + self.hidden_size) + + if self.is_gated: + embeddings = embeddings * self.gate_tanh + + hidden_state = hidden_state + embeddings + return hidden_state + + +class MllamaVisionAttention(nn.Module): + """mllama vision attention.""" + + def __init__(self, + config: MllamaVisionConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', None) + self.embed_dim = config.hidden_size + self.num_heads = config.attention_heads + self.head_dim = config.hidden_size // config.attention_heads + + # packed qkv + self.qkv_proj = build_qkv_proj( + self.embed_dim, + num_q_heads=self.num_heads, + num_kv_heads=self.num_heads, + head_size=self.head_dim, + bias=False, + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + + # o_proj + self.o_proj = build_rowwise_linear(self.num_heads * self.head_dim, + self.embed_dim, + bias=False, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward( + self, + hidden_state: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size = hidden_state.size(0) + qkv_states = self.qkv_proj(hidden_state) + qkv_states = qkv_states.flatten(0, -2) + query, key, value = self.qkv_proj.split_qkv(qkv_states) + + query = query.unflatten(0, (batch_size, -1)) + key = key.unflatten(0, (batch_size, -1)) + value = value.unflatten(0, (batch_size, -1)) + q_seq_len = query.shape[1] + + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + attn_output = F.scaled_dot_product_attention(query, + key, + value, + attn_mask=attention_mask) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_seq_len, -1) + + output = self.o_proj(attn_output) + + return output + + +class MllamaVisionMLP(nn.Module): + """mllama vision mlp.""" + + def __init__(self, + config: MllamaVisionConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + from transformers.activations import ACT2FN + self.config = config + quantization_config = getattr(config, 'quantization_config', None) + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = build_colwise_linear( + config.hidden_size, + config.intermediate_size, + bias=True, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + self.fc2 = 
build_rowwise_linear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class MllamaVisionEncoderLayer(nn.Module): + """vision encoder layer.""" + + def __init__(self, + config: MllamaVisionConfig, + is_gated: bool, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.hidden_size = config.hidden_size + self.is_gated = is_gated + self.self_attn = MllamaVisionAttention(config, + dtype=dtype, + device=device) + self.mlp = MllamaVisionMLP(config, dtype=dtype, device=device) + + self.input_layernorm = LayerNorm(self.hidden_size, + eps=config.norm_eps, + dtype=dtype, + device=device) + self.post_attention_layernorm = LayerNorm(self.hidden_size, + eps=config.norm_eps, + dtype=dtype, + device=device) + + if is_gated: + self.gate_attn = nn.Parameter( + torch.empty(1, dtype=dtype, device=device)) + self.gate_ffn = nn.Parameter( + torch.empty(1, dtype=dtype, device=device)) + + self._weight_inited = not is_gated + + def _init_weight(self): + """init weight.""" + if self._weight_inited: + return + + self.gate_attn_tanh = self.gate_attn.tanh() + self.gate_ffn_tanh = self.gate_ffn.tanh() + + self._weight_inited = True + + def forward( + self, + hidden_state: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + """forward.""" + self._init_weight() + + # Self Attention + residual = hidden_state + hidden_state = self.input_layernorm(hidden_state) + hidden_state = self.self_attn(hidden_state, + attention_mask=attention_mask) + if self.is_gated: + hidden_state = self.gate_attn_tanh * hidden_state + hidden_state = residual + hidden_state + + # Feed forward + residual = hidden_state + hidden_state = self.post_attention_layernorm(hidden_state) + hidden_state = self.mlp(hidden_state) + if self.is_gated: + hidden_state = self.gate_ffn_tanh * hidden_state + hidden_state = residual + hidden_state + + outputs = hidden_state + + return outputs + + +class MllamaVisionEncoder(nn.Module): + """vision encoder.""" + + def __init__(self, + config: MllamaVisionConfig, + num_layers=32, + is_gated=False, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + MllamaVisionEncoderLayer(config, + is_gated, + dtype=dtype, + device=device) for _ in range(num_layers) + ]) + self.gradient_checkpointing = False + self.config = config + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + """forward.""" + encoder_states = () + for encoder_layer in self.layers: + encoder_states = encoder_states + (hidden_states, ) + hidden_states = encoder_layer( + hidden_state=hidden_states, + attention_mask=attention_mask, + ) + encoder_states = encoder_states + (hidden_states, ) + + return hidden_states, encoder_states + + +class MllamaVisionModel(nn.Module): + """vision model.""" + + def __init__(self, + config: MllamaVisionConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + + self.config = config + self.image_size = config.image_size + self.patch_size = config.patch_size + self.hidden_size = config.hidden_size + self.intermediate_layers_indices = config.intermediate_layers_indices + self.dtype = dtype 
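+        # one tile yields (image_size // patch_size) ** 2 patches plus a CLS token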
+ + self.num_patches = (self.image_size // self.patch_size)**2 + 1 + self.scale = config.hidden_size**-0.5 + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.hidden_size, + kernel_size=self.patch_size, + stride=self.patch_size, + padding='valid', + bias=False, + dtype=dtype, + device=device, + ) + + self.class_embedding = nn.Parameter( + torch.empty(self.hidden_size, dtype=dtype, device=device)) + self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding( + config, + dtype=dtype, + device=device, + ) + + self.pre_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( # noqa: E501 + config, + is_gated=True, + dtype=dtype, + device=device, + ) + self.post_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( # noqa: E501 + config, + is_gated=True, + dtype=dtype, + device=device, + ) + + # layer norms + self.layernorm_pre = nn.LayerNorm( + self.hidden_size, + dtype=dtype, + device=device, + ) + self.layernorm_post = nn.LayerNorm( + self.hidden_size, + dtype=dtype, + device=device, + ) + + # encoders + self.transformer = MllamaVisionEncoder( + config, + config.num_hidden_layers, + is_gated=False, + dtype=dtype, + device=device, + ) + self.global_transformer = MllamaVisionEncoder( + config, + config.num_global_layers, + is_gated=True, + dtype=dtype, + device=device, + ) + + def apply_class_embedding(self, + hidden_state: torch.Tensor) -> torch.Tensor: + batch_size, _, hidden_size = hidden_state.shape + class_embedding = self.class_embedding.expand(batch_size, 1, + hidden_size) + hidden_state = torch.cat([class_embedding, hidden_state], dim=1) + return hidden_state + + def forward( + self, + pixel_values: torch.Tensor, + aspect_ratio_ids: torch.Tensor, + aspect_ratio_mask: torch.Tensor, + ): + """forward.""" + (batch_size, num_concurrent_media, num_tiles, num_channels, height, + width) = pixel_values.shape + + pixel_values = pixel_values.reshape( + batch_size * num_concurrent_media * num_tiles, num_channels, + height, width) + aspect_ratio_ids = aspect_ratio_ids.reshape( + batch_size * num_concurrent_media, -1) + + # Patch embedding + patch_embeds = self.patch_embedding(pixel_values.to(self.dtype)) + hidden_state = patch_embeds.flatten(2).transpose(1, 2) + + # Tile embeddings + _, num_patches, dim = hidden_state.shape + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, + num_tiles, -1, dim) + hidden_state = self.pre_tile_positional_embedding( + hidden_state, aspect_ratio_ids) + + # Add cls token + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media * num_tiles, num_patches, dim) + hidden_state = self.apply_class_embedding(hidden_state) + num_patches += 1 + + # Position embeddings + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, + num_tiles, num_patches, dim) + hidden_state = self.gated_positional_embedding(hidden_state, + aspect_ratio_ids) + + hidden_state = self.layernorm_pre(hidden_state) + + # Compute the number of tokens to pad + num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 + # Compute padding tuple for pad function + padding = ( + 0, 0, 0, num_padding_patches + ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) + # Pad the tensor + hidden_state = F.pad(hidden_state, padding, mode='constant', value=0) + slice_index = -num_padding_patches if num_padding_patches > 0 else None + + # Prepare attention mask + attention_mask = aspect_ratio_mask.reshape( + batch_size * num_concurrent_media, -1) + attention_mask = 
_prepare_aspect_ratio_attention_mask( + aspect_ratio_mask=attention_mask, + num_patches=self.num_patches, + target_length=hidden_state.shape[2], + dtype=self.dtype, + ) + + # Apply encoder + hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, + dim) + output = self.transformer( + hidden_state, + attention_mask=attention_mask, + ) + hidden_state = output[0] + + hidden_state = self.layernorm_post(hidden_state) + + # Apply global encoder + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + dim) + hidden_state = self.post_tile_positional_embedding( + hidden_state, aspect_ratio_ids) + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, + num_tiles * (num_patches + num_padding_patches), dim) + global_output = self.global_transformer( + hidden_state, + attention_mask=attention_mask, + ) + hidden_state = global_output[0] + + # Remove padding form hidden state + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + dim) + hidden_state = hidden_state[:, :, :slice_index] + hidden_state = hidden_state.reshape(batch_size, num_concurrent_media, + num_tiles, num_patches, dim) + + # Collect intermediate layer outputs from encoder output + all_intermediate_hidden_states = output[1] + all_intermediate_hidden_states = [ + all_intermediate_hidden_states[i] + for i in self.intermediate_layers_indices + ] + intermediate_hidden_states = torch.stack( + all_intermediate_hidden_states, dim=-1) + + # Remove padding from intermediate hidden states + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size * num_concurrent_media, num_tiles, + num_patches + num_padding_patches, -1) + intermediate_hidden_states = intermediate_hidden_states[:, :, : + slice_index] + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size, num_concurrent_media, num_tiles, num_patches, -1) + + # Concatenate final hidden state and intermediate hidden states + hidden_state = torch.cat([hidden_state, intermediate_hidden_states], + dim=-1) + + return hidden_state + + +class MllamaForConditionalGeneration(nn.Module, CudaGraphMixin, + DeployModelMixin): """rewrote model of MllamaForConditionalGeneration.""" packed_modules_mapping = { @@ -602,16 +1176,32 @@ def __init__(self, super().__init__() self.config = config self.ctx_mgr = ctx_mgr + + self.vision_model = MllamaVisionModel( + config.vision_config, + dtype=dtype, + device=device, + ) # build MllamaForCausalLM self.language_model = MllamaForCausalLM(config.text_config, dtype=dtype, device=device) + + self.multi_modal_projector = build_rowwise_linear( + config.vision_config.vision_output_dim, + config.text_config.hidden_size, + bias=True, + dtype=dtype, + device=device, + ) self.dtype = dtype - def flat_encoder_result(self, cross_attention_states: torch.Tensor, - attn_metadata: Any, input_ids: torch.LongTensor): + # preprocessor + self.input_processor = MLlamaInputProcessor(self.config, dtype) + + def flat_encoder_result(self, attn_metadata: Any, + input_ids: torch.LongTensor): # since every state share the same shape - cross_attention_states = torch.cat(cross_attention_states, 0) full_text_row_masked_out_mask = torch.ones( (attn_metadata.q_seqlens.sum(), 1), dtype=torch.bool) start_pos = 0 @@ -621,39 +1211,51 @@ def flat_encoder_result(self, cross_attention_states: torch.Tensor, full_text_row_masked_out_mask[start_pos:img_id] = False start_pos += q_seq_len 
full_text_row_masked_out_mask = full_text_row_masked_out_mask.to( - cross_attention_states.device) + input_ids.device) - return cross_attention_states, full_text_row_masked_out_mask + return full_text_row_masked_out_mask def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List[List[torch.Tensor]], - cross_attention_states: Optional[torch.Tensor] = None, + pixel_values: torch.Tensor = None, + aspect_ratio_ids: torch.Tensor = None, + aspect_ratio_mask: torch.Tensor = None, attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, cross_attn_metadata: Any = None, **kwargs, ): """model forward, return logits.""" + if cross_attn_metadata is None: full_text_row_masked_out_mask = None # FIXME basically, we want to inference # text requests and image requests separately - elif cross_attention_states is None and ( - cross_attn_metadata.kv_seqlens is None - or int(cross_attn_metadata.kv_seqlens.sum()) == 0): + elif pixel_values is None and (cross_attn_metadata.kv_seqlens is None): full_text_row_masked_out_mask = None elif cross_attn_metadata.is_decoding: - cross_attention_states = None - full_text_row_masked_out_mask = torch.ones( - (attn_metadata.q_seqlens.sum(), 1), - dtype=torch.bool, - device=input_ids.device) + full_text_row_masked_out_mask = input_ids.new_ones( + input_ids.size(-1), 1) else: - cross_attention_states, full_text_row_masked_out_mask = \ - self.flat_encoder_result(cross_attention_states, cross_attn_metadata, input_ids) # noqa + full_text_row_masked_out_mask = self.flat_encoder_result( + cross_attn_metadata, input_ids) # noqa + + cross_attention_states = None + if pixel_values is not None: + cross_attention_states = self.vision_model( + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + ) + cross_attention_states = self.multi_modal_projector( + cross_attention_states) + _, bsz, _, _, image_token_dim = tuple(cross_attention_states.shape) + cross_attention_states = cross_attention_states.view( + bsz, -1, image_token_dim) + hidden_states = self.language_model( input_ids=input_ids, position_ids=position_ids, @@ -670,15 +1272,6 @@ def get_logits(self, hidden_states: torch.Tensor): """compute logits of the model output.""" return self.language_model.get_logits(hidden_states) - def support_cuda_graph( - self, - input_ids: torch.Tensor, - **kwargs, - ): - """support cudagraph.""" - - return False - def get_input_embeddings(self): """get input embeddings.""" return self.language_model.model.get_input_embeddings() @@ -694,14 +1287,32 @@ def prepare_inputs_for_generation( input_ids = context.input_ids position_ids = context.position_ids attn_metadata = context.attn_metadata - cross_attention_states = context.cross_attention_states - if cross_attention_states is not None: - cross_attention_states = [ - t.to(input_ids.device) for t in cross_attention_states - if t is not None - ] cross_attn_metadata = context.cross_attn_metadata + if int(cross_attn_metadata.kv_seqlens.sum()) == 0: + cross_attn_metadata.kv_seqlens = None + device = input_ids.device + + # process image input + pixel_values = None + aspect_ratio_ids = None + aspect_ratio_mask = None + if context.input_multimodals is not None: + pixel_values = [] + aspect_ratio_ids = [] + aspect_ratio_mask = [] + batched_image_data = [ + input_mm['image'] for input_mm in context.input_multimodals + ] + for image_data in batched_image_data: + for data in image_data: + pixel_values.append(data.data) + aspect_ratio_ids.append(data.meta['aspect_ratio_ids']) 
+ aspect_ratio_mask.append(data.meta['aspect_ratio_mask']) + pixel_values = torch.cat(pixel_values, dim=0).to(device) + aspect_ratio_ids = torch.cat(aspect_ratio_ids, dim=0).to(device) + aspect_ratio_mask = torch.cat(aspect_ratio_mask, dim=0).to(device) + # process vision embeddings vision_embeddings = context.input_embeddings vision_embedding_indexing = context.input_embedding_indexing @@ -719,7 +1330,9 @@ def prepare_inputs_for_generation( past_key_values=past_key_values, attn_metadata=attn_metadata, inputs_embeds=inputs_embeds, - cross_attention_states=cross_attention_states, + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, cross_attn_metadata=cross_attn_metadata, ) @@ -742,8 +1355,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if ('rotary_emb.cos_cached' in name or 'rotary_emb.sin_cached' in name): continue - if 'vision_model' in name or 'multi_modal_projector' in name: - continue if self.config.text_config.tie_word_embeddings and 'lm_head.weight' in name: # noqa continue for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -756,3 +1367,161 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): else: param = params_dict[name] load_weight(param, loaded_weight) + + def support_cuda_graph( + self, + input_ids: torch.Tensor, + attn_metadata: Any, + cross_attn_metadata: Any, + **kwargs, + ): + """support cudagraph.""" + + if not attn_metadata.is_decoding: + return False + + if cross_attn_metadata is None: + return False + + if cross_attn_metadata.kv_seqlens is None: + return False + + return True + + def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, **kwargs): + """make cudagraph buffers from forward inputs.""" + input_buffers = super().make_buffers_cudagraph(graph_meta=graph_meta, + **kwargs) + + device = graph_meta.device + max_batches = graph_meta.max_batchs + input_buffers['cross_kv_seqlens'] = torch.zeros(max_batches, + dtype=torch.int64, + device=device) + + return input_buffers + + def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, **kwargs): + """fill cudagraph buffers from forward inputs.""" + input_buffers = graph_meta.input_buffers + + new_inputs = super().fill_buffers_cudagraph(graph_meta=graph_meta, + **kwargs) + + attn_metadata = new_inputs['attn_metadata'] + cross_attn_metadata = new_inputs['cross_attn_metadata'] + block_offsets = attn_metadata.block_offsets + batch_size, _ = block_offsets.size() + + kv_seqlens = cross_attn_metadata.kv_seqlens + if kv_seqlens.data_ptr() != input_buffers['cross_kv_seqlens'].data_ptr( + ): + input_buffers['cross_kv_seqlens'].zero_() + input_buffers['cross_kv_seqlens'][:batch_size] = kv_seqlens + + new_batch_size = next_power_of_2(batch_size) + cross_attn_metadata.block_offsets = input_buffers[ + 'block_offsets'][:new_batch_size] + cross_attn_metadata.q_start_loc = input_buffers[ + 'q_start_loc'][:new_batch_size] + cross_attn_metadata.q_seqlens = input_buffers[ + 'q_seqlens'][:new_batch_size] + cross_attn_metadata.kv_seqlens = input_buffers[ + 'cross_kv_seqlens'][:new_batch_size] + + new_inputs['cross_attn_metadata'] = cross_attn_metadata + return new_inputs + + def update_model_metas(self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: StepContext = None): + """update model meta.""" + model_metas = context.model_metas + if model_metas is None: + batch_size = context.q_seqlens.size(0) + model_metas = [dict(cross_kv_len=0) for _ in range(batch_size)] + + if 
context.is_decoding: + return model_metas + + vision_inputs = context.vision_inputs + if vision_inputs is None: + return model_metas + + input_mms = vision_inputs.input_multimodals + if input_mms is None: + return model_metas + + config = self.config.vision_config + image_size = config.image_size + patch_size = config.patch_size + wh = image_size // patch_size + img_kv_len = wh * wh + 1 + img_kv_len = img_kv_len * 4 + + new_model_metas = [] + for idx, input_mm in enumerate(input_mms): + if input_mm is None: + new_model_metas.append(model_metas[idx]) + images = input_mm['image'] + num_img = len(images) + + cross_kv_len = 0 + if model_metas[idx] is not None: + cross_kv_len = model_metas[idx].get('cross_kv_len', + cross_kv_len) + cross_kv_len += img_kv_len * num_img + new_model_metas.append(dict(cross_kv_len=cross_kv_len)) + + return model_metas + + def get_input_processor(self) -> BaseModelInputProcessor: + """get input processor.""" + return self.input_processor + + +class MLlamaInputProcessor(BaseModelInputProcessor): + """mllama input processor.""" + + def __init__(self, config: LlamaConfig, dtype: torch.dtype) -> None: + self.config = config + self.dtype = dtype + + vision_config = self.config.vision_config + image_size = vision_config.image_size + patch_size = vision_config.patch_size + wh = image_size // patch_size + encoder_len = wh * wh + 1 + encoder_len = encoder_len * 4 + self.encoder_len = encoder_len + + def preprocess_input(self, input_ids, input_multimodals, **kwargs): + """prepare multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_imgs = [] + for input_mm in input_multimodals: + pixel_values = input_mm['pixel_values'] + aspect_ratio_ids = input_mm['aspect_ratio_ids'] + aspect_ratio_mask = input_mm['aspect_ratio_mask'] + offset = input_mm['offset'] + + if pixel_values.dtype != self.dtype: + pixel_values = pixel_values.to(self.dtype) + + mm_data = MultiModalTensor( + data=pixel_values, + start=offset, + end=offset + 1, + encoder_len=self.encoder_len, + meta=dict(aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask)) + input_imgs.append(mm_data) + + result = PreprocessInputResult( + input_ids=input_ids, + input_multimodals=dict(image=input_imgs), + ) + return result diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py index 1059bfee4e..e7b460026a 100644 --- a/lmdeploy/pytorch/models/module_map.py +++ b/lmdeploy/pytorch/models/module_map.py @@ -85,14 +85,10 @@ # llava MODULE_MAP.update( { - 'LlavaLlamaForCausalLM': - f'{LMDEPLOY_PYTORCH_MODEL_PATH}.llama.LlavaLlamaForCausalLM', - 'LlavaMistralForCausalLM': - f'{LMDEPLOY_PYTORCH_MODEL_PATH}.mistral.LlavaMistralForCausalLM', 'LlavaForConditionalGeneration': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.llava.LlavaForConditionalGeneration', # noqa: E501 'LlavaNextForConditionalGeneration': # noqa: E501 - f'{LMDEPLOY_PYTORCH_MODEL_PATH}.llava.LlavaForConditionalGeneration' + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.llava.LlavaNextForConditionalGeneration' # noqa: E501 }) # qwen @@ -158,7 +154,7 @@ # phi3 vision MODULE_MAP.update({ 'Phi3VForCausalLM': - f'{LMDEPLOY_PYTORCH_MODEL_PATH}.phi3.Phi3VForCausalLM', + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.phi3_v.Phi3VForCausalLM', }) # phi-3.5-moe diff --git a/lmdeploy/pytorch/models/phi3.py b/lmdeploy/pytorch/models/phi3.py index f9477fdab8..288fdf3b19 100644 --- a/lmdeploy/pytorch/models/phi3.py +++ b/lmdeploy/pytorch/models/phi3.py @@ -435,7 +435,3 @@ def load_weights(self, weights: 
Iterable[Tuple[str, torch.Tensor]]): else: param = params_dict[name] load_weight(param, loaded_weight) - - -class Phi3VForCausalLM(Phi3ForCausalLM): - ... diff --git a/lmdeploy/pytorch/models/phi3_v.py b/lmdeploy/pytorch/models/phi3_v.py new file mode 100644 index 0000000000..c4bf72c767 --- /dev/null +++ b/lmdeploy/pytorch/models/phi3_v.py @@ -0,0 +1,476 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict, Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig + +from lmdeploy.pytorch.engine.input_process import (BaseModelInputProcessor, + PreprocessInputResult) +from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.nn.linear import build_rowwise_linear +from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight + +from .phi3 import Phi3ForCausalLM, Phi3Model +from .utils.model import DeployModelMixin + +CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(attention_dropout=0.0, + dropout=0.0, + hidden_act='quick_gelu', + hidden_size=1024, + image_size=336, + initializer_factor=1.0, + initializer_range=0.02, + intermediate_size=4096, + layer_norm_eps=1e-05, + num_attention_heads=16, + num_channels=3, + num_hidden_layers=24, + patch_size=14, + projection_dim=768) + + +class Phi3ImageEmbedding(nn.Module): + """image embedding.""" + + def __init__(self, + config: PretrainedConfig, + wte=None, + dtype: torch.dtype = None, + device: torch.device = None, + **kwargs): + super().__init__() + self.config = config + hidden_size = config.n_embd if hasattr( + config, 'n_embd') else config.hidden_size + + self.wte = wte + + if (isinstance(config.img_processor, dict) and + config.img_processor.get('name', None) == 'clip_vision_model'): + assert 'model_name' in config.img_processor, ( + 'model_name must be provided for CLIPVisionModel') + assert 'image_dim_out' in config.img_processor, ( + 'image_dim_out must be provided for CLIPVisionModel') + assert 'num_img_tokens' in config.img_processor, ( + 'num_img_tokens must be provided for CLIPVisionModel') + assert config.img_processor[ + 'model_name'] == 'openai/clip-vit-large-patch14-336' + clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG + self.img_processor = CLIPVisionModel(clip_config).to(device).to( + dtype) + image_dim_out = config.img_processor['image_dim_out'] + self.num_img_tokens = config.img_processor['num_img_tokens'] + else: + raise NotImplementedError( + f'img_processor = {config.img_processor}, not implemented') + + self.image_dim_out = image_dim_out + self.img_sizes = None + + self.use_hd_transform = kwargs.get('use_hd_transform', False) + self.with_learnable_separator = kwargs.get('with_learnable_separator', + False) + self.hd_transform_order = kwargs.get('hd_transform_order', 'glb_sub') + # with_hd_transform and with_learnable_separator should have same value + assert (self.use_hd_transform == self.with_learnable_separator), ( + 'use_hd_transform and with_learnable_separator ' + 'should have same value') + if self.with_learnable_separator: + assert self.use_hd_transform, ( + 'learnable separator is only for hd transform') + # 1024 * 4, merge spatial to channel dimension + self.glb_GN = nn.Parameter( + torch.empty([1, 1, self.image_dim_out * 4], + dtype=dtype, + device=device)) + self.sub_GN = nn.Parameter( + torch.empty([1, 1, 1, self.image_dim_out * 4], + dtype=dtype, + device=device)) + + 
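+        # build the projector from CLIP vision features to the language model hidden size ('linear' or 'mlp')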
projection_cls = kwargs.get('projection_cls', 'linear') + if projection_cls == 'linear': + self.img_projection = nn.Linear(image_dim_out, + hidden_size, + dtype=dtype, + device=device) + elif projection_cls == 'mlp' and self.use_hd_transform: + dim_projection = hidden_size + depth = 2 + layers = [ + nn.Linear(image_dim_out * 4, + dim_projection, + dtype=dtype, + device=device) + ] + for _ in range(1, depth): + layers.extend([ + nn.GELU(), + nn.Linear(dim_projection, + dim_projection, + dtype=dtype, + device=device) + ]) + self.img_projection = nn.Sequential(*layers) + elif projection_cls == 'mlp': + dim_projection = hidden_size + depth = 2 + layers = [ + nn.Linear(image_dim_out, + dim_projection, + dtype=dtype, + device=device) + ] + for _ in range(1, depth): + layers.extend([ + nn.GELU(), + nn.Linear(dim_projection, + dim_projection, + dtype=dtype, + device=device) + ]) + self.img_projection = nn.Sequential(*layers) + else: + raise NotImplementedError( + f'projection_cls = {projection_cls}, not implemented') + + self.vocab_size = config.vocab_size + self.img_features = None + + if isinstance(config.img_processor, dict): + self.layer_idx = config.img_processor.get('layer_idx', -2) + self.type_feature = config.img_processor.get( + 'type_feature', 'patch') + else: + self.layer_idx = -2 + self.type_feature = 'patch' + + def get_img_features(self, + img_embeds: torch.FloatTensor) -> torch.FloatTensor: + LAYER_IDX = self.layer_idx + TYPE_FEATURE = self.type_feature + + img_processor_output = self.img_processor(img_embeds, + output_hidden_states=True) + img_feature = img_processor_output.hidden_states[LAYER_IDX] + + if TYPE_FEATURE == 'patch': + patch_feature = img_feature[:, 1:] + return patch_feature + + if TYPE_FEATURE == 'cls_patch': + return img_feature + + raise NotImplementedError + + def forward( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + image_sizes=None, + image_mask: torch.Tensor = None, + ) -> torch.FloatTensor: + """forward.""" + + target_device = pixel_values.device + target_dtype = pixel_values.dtype + + img_embeds = pixel_values + img_sizes = image_sizes + img_sizes = img_sizes.cpu() + + if self.use_hd_transform and img_sizes is not None and len(img_sizes): + assert img_embeds.ndim == 5, f'img_embeds size: {img_embeds.size()}, expect 5D tensor for hd transform' # noqa E501 + # img_embeds: (num_images, max_num_crops, 3, H, W) + # img_sizes: (num_images, 2).view(1, -1) + + bs = img_embeds.shape[0] + # Nx(HW)xC + img_features = self.get_img_features(img_embeds.flatten(0, 1)) + base_feat_height = base_feat_width = int( + img_features.shape[1]**0.5) + + assert base_feat_height == 24 and base_feat_width == 24, f'base_feat_height: {base_feat_height}, base_feat_width: {base_feat_width}, expect 24x24 features for hd transform' # noqa E501 + + # bs x max_num_crops x (24x24) x C + img_features = img_features.view( + bs, -1, base_feat_height * base_feat_width, self.image_dim_out) + C = self.image_dim_out + H = base_feat_height + + output_imgs = [] + output_len = [] + # training is tensor, inference is list + if isinstance(img_sizes, torch.Tensor): + img_sizes = img_sizes.view(-1, 2) + for _bs in range(bs): + h, w = img_sizes[_bs] + h = h // 336 + w = w // 336 + B_ = h * w + + # 1 x (24x24) x 1024 + global_img_feature = img_features[_bs, :1] + + # 1 x 12 x 12 x 4096 + glb_img = global_img_feature.reshape( + 1, H // 2, 2, H // 2, 2, + C).permute(0, 1, 3, 2, 4, + 5).reshape(1, H // 2, H // 2, 4 * C) + temp_glb_GN = self.sub_GN.repeat(1, H // 2, 1, 1) + + # 1 x 
156 x 4096 + glb_img = torch.cat([glb_img, temp_glb_GN], + dim=2).reshape(1, -1, 4 * C) + + # (max_num_crops-1) x (12x12) x C + sub_img = img_features[_bs, 1:] + # 16x574x1024 + # get rid of padding sub_img + sub_img = sub_img[:B_] + + # (num_crops, 12, 2, 12, 2, 1024) + # ->(num_crops, 12, 12, 2, 2, 1024) + # -> (num_crops, 12*12, 4*1024) + sub_img = (sub_img.reshape(B_, H // 2, 2, H // 2, 2, + C).permute(0, 1, 3, 2, 4, 5)) + sub_img = sub_img.reshape(1, h, w, 12, 12, -1).permute( + 0, 1, 3, 2, 4, 5).reshape(1, h * 12, w * 12, 4 * C) + temp_sub_GN = self.sub_GN.repeat(1, h * 12, 1, 1) + sub_img = torch.cat([sub_img, temp_sub_GN], + dim=2).reshape(1, -1, 4 * C) + # (1, num_img_tokens, 1024*4) + + # glb + sub + if self.hd_transform_order == 'glb_sub': + output_imgs.append( + torch.cat([glb_img, self.glb_GN, sub_img], dim=1)) + elif self.hd_transform_order == 'sub_glb': + output_imgs.append( + torch.cat([sub_img, self.glb_GN, glb_img], dim=1)) + else: + raise NotImplementedError( + f'hd_transform_order = {self.hd_transform_order}' + ) # noqa E501 + + temp_len = int((h * w + 1) * 144 + 1 + (h + 1) * 12) + assert temp_len == output_imgs[-1].shape[ + 1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}' # noqa E501 + output_len.append(temp_len) + + img_set_tensor = [] + for _output_img in output_imgs: + img_feature_proj = self.img_projection( + _output_img.to(target_device).to(target_dtype)) + img_feature_proj = img_feature_proj.flatten(0, 1) + img_set_tensor.append(img_feature_proj) + img_set_tensor = torch.cat(img_set_tensor)[None] + elif img_embeds.ndim == 4: + tt = (self.get_img_features(img_embeds).to(target_device).to( + target_dtype).reshape(-1, self.image_dim_out)) + img_set_tensor = self.img_projection( + tt) # adapted visual features. + elif img_embeds.ndim == 3: + tt = (img_embeds.to(target_device).to(target_dtype).view( + -1, self.image_dim_out)) + img_set_tensor = self.img_projection( + tt) # adapted visual features. 
+ else: + raise NotImplementedError + + hidden_states = self.wte(input_ids) + + hidden_states.masked_scatter_(image_mask[..., None], img_set_tensor) + + return hidden_states + + +class Phi3VModel(Phi3Model): + """phi3v model.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__(config=config, dtype=dtype, device=device) + + self.vision_embed_tokens = None + if isinstance(config.embd_layer, dict): + # vision embedding layer + embedding_config = { + 'embedding_cls': config.embd_layer['embedding_cls'], + **config.embd_layer + } + self.vision_embed_tokens = Phi3ImageEmbedding( + config, + wte=self.embed_tokens, + dtype=dtype, + device=device, + **embedding_config) + + def forward( + self, + input_ids: torch.LongTensor = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + attn_metadata: Any = None, + pixel_values: Optional[torch.FloatTensor] = None, + image_sizes: Optional[torch.LongTensor] = None, + image_mask: torch.Tensor = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ): + """Rewrite of LlamaModel.forward.""" + + if inputs_embeds is None and pixel_values is not None: + inputs_embeds = self.vision_embed_tokens( + input_ids, + pixel_values, + image_sizes, + image_mask, + ) + + return super().forward( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + ) + + +class Phi3VForCausalLM(Phi3ForCausalLM, DeployModelMixin): + + def __init__(self, + config: PretrainedConfig, + ctx_mgr: StepContextManager, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__(config, ctx_mgr, dtype=dtype, device=device) + self.config = config + self.ctx_mgr = ctx_mgr + # build model + self.model = Phi3VModel(config, dtype=dtype, device=device) + # build lm_head + self.lm_head = build_rowwise_linear(config.hidden_size, + config.vocab_size, + bias=False, + dtype=dtype, + device=device) + + self.input_processor = Phi3VInputProcessor(config, dtype) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + past_key_values: List[List[torch.Tensor]], + attn_metadata: Any = None, + pixel_values: torch.Tensor = None, + image_sizes: torch.Tensor = None, + image_mask: torch.Tensor = None, + inputs_embeds: torch.Tensor = None, + **kwargs, + ): + """forward.""" + hidden_states = self.model( + input_ids=input_ids, + position_ids=position_ids, + past_key_values=past_key_values, + attn_metadata=attn_metadata, + pixel_values=pixel_values, + image_sizes=image_sizes, + image_mask=image_mask, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + def prepare_inputs_for_generation( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: torch.Tensor = None, + context: StepContext = None, + ): + """prepare input.""" + output = super().prepare_inputs_for_generation( + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + context=context) + + # vision inputs + pixel_values = None + if context.input_multimodals is not None: + input_mms = [ + input_mm.get('image', []) + for input_mm in context.input_multimodals + ] + # flatten batch + input_mms = [data for im_data in input_mms for data in im_data] + if len(input_mms) > 0: + pixel_values = torch.cat([data.data for data in input_mms]) + image_sizes = torch.cat( + [data.meta['image_sizes'] for data in input_mms]) + image_token_id = 
input_mms[0].meta['image_token_id'] + image_mask = output['input_ids'] == image_token_id + output['pixel_values'] = pixel_values + output['image_sizes'] = image_sizes + output['image_mask'] = image_mask + + return output + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + """load weights.""" + super().load_weights(weights) + + vis_prefix = 'vision_embed_tokens.' + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if not (vis_prefix in name): + continue + param = params_dict[name] + load_weight(param, loaded_weight) + + def get_input_processor(self) -> BaseModelInputProcessor: + """get input processor.""" + return self.input_processor + + +class Phi3VInputProcessor(BaseModelInputProcessor): + """Phi3V input processor.""" + + def __init__(self, config: PretrainedConfig, dtype) -> None: + self.config = config + self.dtype = dtype + + def preprocess_input(self, + input_ids: List[int], + input_multimodals: List[Dict[str, Any]] = None, + **kwargs) -> PreprocessInputResult: + """prepare multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_imgs = [] + for input_mm in input_multimodals: + pixel_values = input_mm['pixel_values'].to(self.dtype) + image_sizes = input_mm['image_sizes'] + offset = input_mm['offset'] + image_token_id = input_mm.get('image_token_id', 0) + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalTensor(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict( + image_sizes=image_sizes, + image_token_id=image_token_id)) + input_imgs.append(mm_data) + + result = PreprocessInputResult( + input_ids=input_ids, + input_multimodals=dict(image=input_imgs), + ) + return result diff --git a/lmdeploy/pytorch/models/qwen2_vl.py b/lmdeploy/pytorch/models/qwen2_vl.py index b10baaa4d5..4e2b1017b5 100644 --- a/lmdeploy/pytorch/models/qwen2_vl.py +++ b/lmdeploy/pytorch/models/qwen2_vl.py @@ -1,18 +1,24 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Any, Callable, Iterable, List, Optional, Tuple +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple import torch from torch import nn from transformers.configuration_utils import PretrainedConfig +from lmdeploy.pytorch.engine.input_process import (BaseModelInputProcessor, + PreprocessInputResult) from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, - SiluAndMul, build_rotary_embedding) -from lmdeploy.pytorch.nn.linear import (build_merged_colwise_linear, +from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, FlashAttention, + LayerNorm, RMSNorm, RopeType, SiluAndMul, + build_rotary_embedding) +from lmdeploy.pytorch.nn.linear import (build_colwise_linear, + build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMeta, CudaGraphMixin, next_power_of_2 +from .utils.model import DeployModelMixin def _apply_mrope_selection(hidden_states: torch.Tensor, @@ -337,7 +343,337 @@ def get_input_embeddings(self): return self.embed_tokens -class Qwen2VLForConditionalGeneration(nn.Module, CudaGraphMixin): +class PatchEmbed(nn.Module): + """Patch Embed.""" + + def __init__(self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_channels: int = 3, + embed_dim: int = 1152, + dtype: torch.dtype = None, + device: torch.device = None) -> None: + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.in_channels = in_channels + self.embed_dim = embed_dim + + kernel_size = [temporal_patch_size, patch_size, patch_size] + self.proj = nn.Conv3d(in_channels, + embed_dim, + kernel_size=kernel_size, + stride=kernel_size, + bias=False, + dtype=dtype, + device=device) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + target_dtype = self.proj.weight.dtype + hidden_states = hidden_states.view(-1, self.in_channels, + self.temporal_patch_size, + self.patch_size, self.patch_size) + hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view( + -1, self.embed_dim) + return hidden_states + + +class VisionRotaryEmbedding(nn.Module): + """vision rotary embedding.""" + + def __init__(self, + dim: int, + theta: float = 10000.0, + device: torch.device = None) -> None: + super().__init__() + inv_freq = 1.0 / (theta**( + torch.arange(0, dim, 2, dtype=torch.float, device=device) / dim)) + self.register_buffer('inv_freq', inv_freq, persistent=False) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + return freqs + + +class VisionAttention(nn.Module): + """Vision attention.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + quantization_config = getattr(config, 'quantization_config', None) + dim = config.embed_dim + num_heads = config.num_heads + head_dim = dim // num_heads + self.head_dim = head_dim + + # packed qkv + self.qkv = build_qkv_proj( + dim, + num_q_heads=num_heads, + num_kv_heads=num_heads, + head_size=head_dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + ) + + # rotary embedding + self.apply_rotary_pos_emb = ApplyRotaryEmb() + + # attention + 
self.attention = FlashAttention( + num_heads, + head_dim, + causal=False, + ) + + # o_proj + self.proj = build_rowwise_linear(dim, + dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward( + self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, + rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor] + ) -> torch.Tensor: + seq_length = hidden_states.shape[0] + # qkv proj + qkv_states = self.qkv(hidden_states) + # (-1, heads, head_dim) + qkv_states = qkv_states.flatten(0, -2) + q, k, v = self.qkv.split_qkv(qkv_states) + + cos, sin = rotary_pos_emb + q, k = self.apply_rotary_pos_emb(q, k, cos, sin) + + attn_output = self.attention( + q, + k, + v, + q_start_loc=cu_seqlens[:-1], + q_seqlens=cu_seqlens[1:] - cu_seqlens[:-1], + ) + + attn_output = attn_output.reshape(seq_length, -1) + + # o proj + attn_output = self.proj(attn_output) + return attn_output + + +class VisionMlp(nn.Module): + """Vision mlp.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + from transformers.activations import ACT2FN + dim = config.embed_dim + hidden_dim = int(config.embed_dim * config.mlp_ratio) + quantization_config = getattr(config, 'quantization_config', None) + # gate up + self.fc1 = build_colwise_linear( + dim, + hidden_dim, + bias=True, + dtype=dtype, + device=device, + quant_config=quantization_config, + is_tp=True, + ) + + # silu and mul + if config.hidden_act in [ + 'gelu', 'gelu_fast', 'quick_gelu', 'gelu_python' + ]: + self.act = nn.GELU() + else: + self.act = ACT2FN[config.hidden_act] + + # down + self.fc2 = build_rowwise_linear(hidden_dim, + dim, + bias=True, + quant_config=quantization_config, + dtype=dtype, + device=device, + is_tp=True) + + def forward(self, x): + """forward.""" + return self.fc2(self.act(self.fc1(x))) + + +class Qwen2VLVisionBlock(nn.Module): + """Vision block.""" + + def __init__(self, + config: PretrainedConfig, + layer_idx: int, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.layer_idx = layer_idx + self.norm1 = LayerNorm(config.embed_dim, + eps=1e-6, + dtype=dtype, + device=device) + self.norm2 = LayerNorm(config.embed_dim, + eps=1e-6, + dtype=dtype, + device=device) + + self.attn = VisionAttention(config, dtype=dtype, device=device) + + self.mlp = VisionMlp(config, dtype=dtype, device=device) + + def forward(self, + hidden_states, + cu_seqlens, + rotary_pos_emb, + residual: Optional[torch.Tensor] = None) -> torch.Tensor: + if residual is None: + residual = hidden_states + hidden_states = self.norm1(hidden_states) + else: + hidden_states, residual = self.norm1(hidden_states, residual) + + hidden_states = self.attn(hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb) + + hidden_states, residual = self.norm2(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class PatchMerger(nn.Module): + """PatchMerger.""" + + def __init__(self, + dim: int, + context_dim: int, + spatial_merge_size: int = 2, + dtype: torch.dtype = None, + device: torch.device = None) -> None: + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size**2) + self.ln_q = nn.LayerNorm(context_dim, + eps=1e-6, + dtype=dtype, + device=device) + self.mlp = nn.Sequential( + nn.Linear(self.hidden_size, + self.hidden_size, + dtype=dtype, + device=device), + nn.GELU(), + nn.Linear(self.hidden_size, dim, dtype=dtype, device=device), + ) 
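+        # ln_q + MLP: spatial_merge_size**2 neighbouring patch features are flattened to hidden_size and projected to dim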
+ + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.mlp(self.ln_q(x).view(-1, self.hidden_size)) + return x + + +class Qwen2VisionTransformerPretrainedModel(nn.Module): + """Vision transformer.""" + + def __init__(self, + config: PretrainedConfig, + dtype: torch.dtype = None, + device: torch.device = None): + super().__init__() + self.config = config + self.spatial_merge_size = config.spatial_merge_size + + self.patch_embed = PatchEmbed( + patch_size=config.patch_size, + temporal_patch_size=config.temporal_patch_size, + in_channels=config.in_channels, + embed_dim=config.embed_dim, + dtype=dtype, + device=device, + ) + + head_dim = config.embed_dim // config.num_heads + self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2, + device=device) + + self.blocks = nn.ModuleList([ + Qwen2VLVisionBlock(config, layer_idx, dtype=dtype, device=device) + for layer_idx in range(config.depth) + ]) + self.merger = PatchMerger(dim=config.hidden_size, + context_dim=config.embed_dim, + spatial_merge_size=config.spatial_merge_size, + dtype=dtype, + device=device) + + def rot_pos_emb(self, grid_thw): + """rotary position embedding.""" + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor) -> torch.Tensor: + """forward.""" + hidden_states = self.patch_embed(hidden_states) + cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0) + + residual = None + for blk in self.blocks: + hidden_states, residual = blk(hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + residual=residual) + + hidden_states = hidden_states + residual + + return self.merger(hidden_states) + + +class Qwen2VLForConditionalGeneration(nn.Module, DeployModelMixin, + CudaGraphMixin): """ModelForCausalLM.""" packed_modules_mapping = { @@ -360,6 +696,16 @@ def __init__(self, super().__init__() self.config = config self.ctx_mgr = ctx_mgr + + # preprocessor + self.input_processor = Qwen2VLInputProcessor(self.config) + + # build vision model + self.visual = Qwen2VisionTransformerPretrainedModel( + config.vision_config, + dtype=dtype, + device=device, + ) # build model self.model = Qwen2Model(config, dtype=dtype, device=device) # build lm_head @@ -377,9 +723,26 @@ def forward( attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, mrope_position_ids: torch.Tensor = None, + pixel_values: torch.Tensor = None, + vis_cu_seqlens: torch.Tensor = None, + vis_pos_emb: torch.Tensor = None, + image_mask: torch.Tensor = None, **kwargs, ): """model forward, return logits.""" + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + if 
pixel_values is not None: + dtype = inputs_embeds.dtype + pixel_values = pixel_values.to(dtype) + vis_pos_emb = (vis_pos_emb[0].to(dtype), + vis_pos_emb[1].to(dtype)) + image_embeds = self.visual(pixel_values, + cu_seqlens=vis_cu_seqlens, + rotary_pos_emb=vis_pos_emb) + inputs_embeds = inputs_embeds.masked_scatter( + image_mask[..., None], image_embeds) + hidden_states = self.model( input_ids=input_ids, position_ids=position_ids, @@ -416,6 +779,36 @@ def prepare_inputs_for_generation( position_ids = context.position_ids attn_metadata = context.attn_metadata + pixel_values = None + vis_cu_seqlens = None + vis_pos_emb = None + image_mask = None + if context.input_multimodals is not None: + image_data = [ + input_mm['image'] for input_mm in context.input_multimodals + ] + + if len(image_data) > 0: + # flatten batch + image_data = [ + data for im_data in image_data for data in im_data + ] + pixel_values = torch.cat([data.data for data in image_data]) + image_token_id = image_data[0].meta['image_token_id'] + image_mask = input_ids == image_token_id + grid_thw = torch.cat( + [data.meta['grid_thw'] for data in image_data]).cpu() + vis_pos_emb = self.visual.rot_pos_emb(grid_thw) + vis_cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, 0]).to(pixel_values.device) + vis_cu_seqlens = vis_cu_seqlens.cumsum(dim=0, + dtype=torch.int32) + vis_pos_emb = vis_pos_emb.repeat(1, 2) + vis_pos_emb = (vis_pos_emb.cos(), vis_pos_emb.sin()) + + mrope_position_ids = getattr(context, 'mrope_position_ids', None) + # process vision embeddings vision_embeddings = context.input_embeddings vision_embedding_indexing = context.input_embedding_indexing @@ -433,7 +826,11 @@ def prepare_inputs_for_generation( past_key_values=past_key_values, attn_metadata=attn_metadata, inputs_embeds=inputs_embeds, - mrope_position_ids=context.mrope_position_ids, + mrope_position_ids=mrope_position_ids, + pixel_values=pixel_values, + vis_cu_seqlens=vis_cu_seqlens, + vis_pos_emb=vis_pos_emb, + image_mask=image_mask, ) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): @@ -450,8 +847,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: - if 'visual' in name: - continue if 'rotary_emb.inv_freq' in name: continue if ('rotary_emb.cos_cached' in name @@ -467,8 +862,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): load_weight(param, loaded_weight, shard_id=shard_id) break else: - param = params_dict[name] - load_weight(param, loaded_weight) + if '.qkv.' 
in name: + param = params_dict[name] + q, k, v = param.weight_spliter(loaded_weight) + load_weight(param, q, shard_id='q') + load_weight(param, k, shard_id='k') + load_weight(param, v, shard_id='v') + else: + param = params_dict[name] + load_weight(param, loaded_weight) def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, **kwargs): """make cudagraph buffers from forward inputs.""" @@ -510,3 +912,130 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, **kwargs): 'mrope_position_ids'] return new_inputs + + def _update_model_meta_decoding(self, context: StepContext): + """update model meta for decoding.""" + model_metas = context.model_metas + position_ids = context.position_ids + + mrope_deltas = [meta['mrope_delta'] for meta in model_metas] + mrope_deltas = position_ids.new_tensor(mrope_deltas) + mrope_position_ids = position_ids + mrope_deltas[None] + mrope_position_ids = mrope_position_ids.expand(3, -1) + + context.mrope_position_ids = mrope_position_ids + return model_metas + + def _get_multimodal_pos_ids(self, grid_thw: list, device: torch.device): + """get mrope ids.""" + t, h, w = grid_thw + h //= 2 + w //= 2 + stride = torch.tensor([h * w, w, 1], device=device)[:, None] + size = torch.tensor([t, h, w], device=device)[:, None] + pos_ids = torch.arange(t * h * w, device=device)[None].expand(3, -1) + pos_ids = pos_ids // stride % size + return pos_ids + + def _update_model_meta_prefilling(self, context: StepContext): + """update model meta for prefilling.""" + model_metas = context.model_metas + input_multimodals = context.input_multimodals + if input_multimodals is None: + input_multimodals = [None] * len(model_metas) + position_ids = context.position_ids + batched_pos_ids = position_ids[0].split(context.q_seqlens.tolist()) + mrope_position_ids = [] + new_model_metas = [] + for pos_ids, model_meta, input_mm in zip(batched_pos_ids, model_metas, + input_multimodals): + images = [] + if input_mm is not None: + images = input_mm['image'] + if model_meta is None or 'mrope_delta' not in model_meta: + mrope_delta = 0 + else: + mrope_delta = model_meta['mrope_delta'] + + pos_start = pos_ids[0].item() + mrope_pos_ids = pos_ids + mrope_delta + mrope_pos_ids = mrope_pos_ids[None].expand(3, -1).clone() + for img in images: + grid_thw = img.meta['grid_thw'][0].tolist() + _, h, w = grid_thw + h //= 2 + w //= 2 + num_pad = img.end - img.start - max(h, w) + mrope_delta -= num_pad + fill_start = img.start - pos_start + fill_end = img.end - pos_start + img_pos_ids = self._get_multimodal_pos_ids( + grid_thw, pos_ids.device) + img_pos_ids += mrope_pos_ids[:, fill_start:fill_start + 1] + mrope_pos_ids[:, fill_end:] -= num_pad + mrope_pos_ids[:, fill_start:fill_end] = img_pos_ids + + mrope_position_ids.append(mrope_pos_ids) + new_model_metas.append(dict(mrope_delta=mrope_delta)) + + mrope_position_ids = torch.cat(mrope_position_ids, dim=1) + context.mrope_position_ids = mrope_position_ids + + return new_model_metas + + def update_model_metas(self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: StepContext = None): + """update model meta.""" + if context.is_decoding: + return self._update_model_meta_decoding(context) + else: + return self._update_model_meta_prefilling(context) + + def get_input_processor(self) -> BaseModelInputProcessor: + """get input processor.""" + return self.input_processor + + +InputMultiModalType = List[Dict[str, Any]] + + +class Qwen2VLInputProcessor(BaseModelInputProcessor): + """qwen2 input processor.""" + 
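+    # wraps raw image inputs into MultiModalTensor entries with token offsets consumed by the engine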
+ def __init__(self, config: PretrainedConfig) -> None: + self.config = config + + def preprocess_input(self, + input_ids: List[int], + input_multimodals: List[Dict[str, Any]] = None, + **kwargs) -> PreprocessInputResult: + """prepare multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_imgs = [] + for input_mm in input_multimodals: + pixel_values = input_mm['pixel_values'] + image_grid_thw = input_mm['image_grid_thw'] + offset = input_mm['offset'] + start = offset + image_token_id = input_mm.get('image_token_id', 0) + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalTensor(data=pixel_values, + start=start, + end=start + num_pad, + meta=dict( + grid_thw=image_grid_thw, + image_token_id=image_token_id)) + input_imgs.append(mm_data) + + result = PreprocessInputResult( + input_ids=input_ids, + input_multimodals=dict(image=input_imgs), + ) + return result diff --git a/lmdeploy/pytorch/models/utils/model.py b/lmdeploy/pytorch/models/utils/model.py new file mode 100644 index 0000000000..99bd4c4bfb --- /dev/null +++ b/lmdeploy/pytorch/models/utils/model.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Iterable, List, Optional, Tuple + +import torch + +from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor +from lmdeploy.pytorch.model_inputs import StepContext + + +class DeployModelMixin: + + def forward(self, *args, **kwargs): + """forward of model.""" + raise NotImplementedError('Not Implemented') + + def prepare_inputs_for_generation( + self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: StepContext = None, + ): + """prepare input.""" + raise NotImplementedError('Not Implemented') + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + """load weights.""" + raise NotImplementedError('Not Implemented') + + def get_logits(self, hidden_states: torch.Tensor): + """compute logits of the model output.""" + return hidden_states + + def update_weights(self): + """update weights.""" + pass + + def update_model_metas(self, + past_key_values: List[List[torch.Tensor]], + inputs_embeds: Optional[torch.Tensor] = None, + context: StepContext = None): + """update model meta.""" + return None + + def get_input_processor(self) -> BaseModelInputProcessor: + """get input processor.""" + return None diff --git a/lmdeploy/pytorch/models/utils/multimodal.py b/lmdeploy/pytorch/models/utils/multimodal.py new file mode 100644 index 0000000000..aebcaf4073 --- /dev/null +++ b/lmdeploy/pytorch/models/utils/multimodal.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs + +PreparedInputs = Tuple[List[int], MultiModalInputs] + + +class MultiModalMixin: + + def prepare_multimodal_input(self, input_ids, input_multimodals, + **kwargs) -> PreparedInputs: + """prepare multimodals inputs.""" + raise NotImplementedError('prepare input not implemented.') diff --git a/lmdeploy/pytorch/multimodal/__init__.py b/lmdeploy/pytorch/multimodal/__init__.py new file mode 100644 index 0000000000..c3e8c6a16f --- /dev/null +++ b/lmdeploy/pytorch/multimodal/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
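Roughly how the `MultiModalTensor` container added in this patch is meant to be filled by an input processor: `start` and `end` mark the placeholder-token span that the image occupies in the token sequence. The shapes, offsets and token count below are illustrative only:

    import torch
    from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor

    # one image occupying 576 placeholder tokens starting at token offset 5
    mm = MultiModalTensor(data=torch.zeros(1, 3, 448, 448),
                          start=5,
                          end=5 + 576,
                          meta=dict(image_token_id=0))
    inputs = dict(image=[mm])   # MultiModalInputs: Dict[str, List[MultiModalTensor]]
    mm = mm.to_device('cpu')    # the engine moves data onto the GPU the same way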
+from .data_type import MultiModalData, MultiModalTensor + +__all__ = ['MultiModalData', 'MultiModalTensor'] diff --git a/lmdeploy/pytorch/multimodal/data_type.py b/lmdeploy/pytorch/multimodal/data_type.py new file mode 100644 index 0000000000..95ec72d26e --- /dev/null +++ b/lmdeploy/pytorch/multimodal/data_type.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from dataclasses import dataclass +from typing import Any, Dict, List, Union + +from torch import Tensor + + +class MultiModalData: + pass + + +MultiModalDataList = List[MultiModalData] + +NestedTensor = Union[Tensor, List[Tensor]] + + +@dataclass +class MultiModalTensor: + data: NestedTensor + start: int + end: int = None + encoder_len: int = None + meta: Dict[str, Any] = None + + def __post_init__(self): + if self.end is None: + self.end = self.start + + def to_device(self, device: str, non_blocking: bool = False): + """to device.""" + if isinstance(self.data, Tensor): + self.data = self.data.to(device=device, non_blocking=non_blocking) + else: + data = [ + d.to(device=device, non_blocking=non_blocking) + for d in self.data + ] + self.data = data + + if self.meta is not None: + for k, v in self.meta.items(): + if isinstance(v, Tensor): + v = v.to(device=device, non_blocking=non_blocking) + self.meta[k] = v + elif hasattr(v, 'to_device'): + v = v.to_device(device=device, non_blocking=non_blocking) + self.meta[k] = v + return self + + +MultiModalInputs = Dict[str, List[MultiModalTensor]] diff --git a/lmdeploy/pytorch/multimodal/image_type.py b/lmdeploy/pytorch/multimodal/image_type.py new file mode 100644 index 0000000000..19211a381f --- /dev/null +++ b/lmdeploy/pytorch/multimodal/image_type.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from dataclasses import dataclass +from typing import Any, ClassVar, Dict + +from PIL import Image + +from .data_type import MultiModalData + + +@dataclass +class ImageData(MultiModalData): + data: Image + loc: int + meta: Dict[str, Any] = None + type: ClassVar[str] = 'image' diff --git a/lmdeploy/pytorch/nn/__init__.py b/lmdeploy/pytorch/nn/__init__.py index 63df9a5ae9..4705115bf4 100644 --- a/lmdeploy/pytorch/nn/__init__.py +++ b/lmdeploy/pytorch/nn/__init__.py @@ -2,7 +2,7 @@ # attention module is modified from: # https://github.com/vllm-project/vllm/blob/main/vllm/attention/ from .activation import GeluAndMul, SiluAndMul # noqa: F401 -from .attention import Attention # noqa: F401 +from .attention import Attention, FlashAttention # noqa: F401 from .norm import LayerNorm, RMSNorm # noqa: F401 from .rotary_embedding import ApplyRotaryEmb # noqa: F401 from .rotary_embedding import RopeType # noqa: F401 diff --git a/lmdeploy/pytorch/nn/attention.py b/lmdeploy/pytorch/nn/attention.py index 26f1034d36..484041dfcc 100644 --- a/lmdeploy/pytorch/nn/attention.py +++ b/lmdeploy/pytorch/nn/attention.py @@ -9,6 +9,15 @@ from .utils import get_distribute_size +def _update_num_heads(num_heads: int, num_kv_heads: int, replicate_kv: bool): + """update heads.""" + world_size, rank = get_world_rank() + num_heads = get_distribute_size(num_heads, world_size, rank) + if not replicate_kv: + num_kv_heads = get_distribute_size(num_kv_heads, world_size, rank) + return num_heads, num_kv_heads + + class Attention(nn.Module): """Attention layer.""" @@ -23,14 +32,20 @@ def __init__( sliding_window: int = None, logit_softcapping: float = None, replicate_kv: bool = False, + causal: bool = True, **kwargs, ): super().__init__() - num_heads, num_kv_heads = self._update_num_heads( - 
num_heads, num_kv_heads, replicate_kv) + if num_kv_heads is None: + num_kv_heads = num_heads + if v_head_size is None: + v_head_size = head_size + num_heads, num_kv_heads = _update_num_heads(num_heads, num_kv_heads, + replicate_kv) layer_backend = get_backend() - impl_builder = layer_backend.get_layer_impl_builder(OpType.Attention) + impl_builder = layer_backend.get_layer_impl_builder( + OpType.PagedAttention) self.impl = impl_builder.build( num_heads=num_heads, @@ -41,18 +56,10 @@ def __init__( alibi=alibi, sliding_window=sliding_window, logit_softcapping=logit_softcapping, + causal=causal, **kwargs, ) - def _update_num_heads(self, num_heads: int, num_kv_heads: int, - replicate_kv: bool): - """update heads.""" - world_size, rank = get_world_rank() - num_heads = get_distribute_size(num_heads, world_size, rank) - if not replicate_kv: - num_kv_heads = get_distribute_size(num_kv_heads, world_size, rank) - return num_heads, num_kv_heads - def forward( self, query: torch.Tensor, @@ -77,3 +84,77 @@ def forward( v_scales_zeros=v_scales_zeros, inplace=inplace, ) + + +class FlashAttention(nn.Module): + """flash attention w/o paging.""" + + def __init__( + self, + num_heads: int, + head_dim: int, + scale: float = None, + num_kv_heads: int = None, + v_head_dim: int = None, + causal: bool = True, + sliding_window: int = None, + logit_softcapping: float = None, + replicate_kv: bool = False, + **kwargs, + ): + super().__init__() + if num_kv_heads is None: + num_kv_heads = num_heads + if v_head_dim is None: + v_head_dim = head_dim + num_heads, num_kv_heads = _update_num_heads(num_heads, num_kv_heads, + replicate_kv) + + layer_backend = get_backend() + + impl_builder = layer_backend.get_layer_impl_builder( + OpType.FlashAttention) + + self.impl = impl_builder.build( + num_heads=num_heads, + head_dim=head_dim, + scale=scale, + num_kv_heads=num_kv_heads, + v_head_dim=v_head_dim, + causal=causal, + sliding_window=sliding_window, + logit_softcapping=logit_softcapping, + **kwargs, + ) + + def forward(self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + q_start_loc: torch.Tensor, + q_seqlens: torch.Tensor, + kv_start_loc: torch.Tensor = None, + kv_seqlens: torch.Tensor = None, + max_q_seqlen: int = None) -> torch.Tensor: + """forward.""" + + if max_q_seqlen is None: + max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2)) + + if kv_start_loc is None and kv_seqlens is None: + kv_start_loc = q_start_loc + kv_seqlens = q_seqlens + + assert kv_start_loc is not None + assert kv_seqlens is not None + + return self.impl.forward( + query, + key, + value, + q_start_loc=q_start_loc, + q_seqlens=q_seqlens, + kv_start_loc=kv_start_loc, + kv_seqlens=kv_seqlens, + max_q_seqlen=max_q_seqlen, + ) diff --git a/lmdeploy/pytorch/supported_models.py b/lmdeploy/pytorch/supported_models.py index 7fa568651b..67452f78e3 100644 --- a/lmdeploy/pytorch/supported_models.py +++ b/lmdeploy/pytorch/supported_models.py @@ -47,9 +47,9 @@ # cogvlm-chat CogVLMForCausalLM=True, # llava - LlavaLlamaForCausalLM=True, + LlavaLlamaForCausalLM=False, # llava mistral - LlavaMistralForCausalLM=True, + LlavaMistralForCausalLM=False, # deepseekvl MultiModalityCausalLM=False, # StarCoder2 diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py index c293cd71c8..c6f140a113 100644 --- a/lmdeploy/serve/vl_async_engine.py +++ b/lmdeploy/serve/vl_async_engine.py @@ -1,148 +1,208 @@ # Copyright (c) OpenMMLab. All rights reserved. 
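For reference, the packed ("varlen") layout that `FlashAttention.forward` above expects can be derived from per-sequence lengths with plain torch ops; the sequence lengths here are made up:

    import torch
    import torch.nn.functional as F

    # two sequences of length 3 and 5 concatenated along the token dimension
    q_seqlens = torch.tensor([3, 5])
    q_start_loc = F.pad(q_seqlens.cumsum(0)[:-1], (1, 0))   # tensor([0, 3])
    max_q_seqlen = int(q_seqlens.max())                     # 5
    # when kv_start_loc/kv_seqlens are omitted, self-attention over the same
    # packed sequences is assumed (the kv_* arguments default to the q_* values)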
-from typing import Dict, List, Optional, Union +import asyncio +from typing import Dict, List, Literal, Optional, Tuple, Union -import numpy as np +import PIL +from lmdeploy.messages import (PytorchEngineConfig, TurbomindEngineConfig, + VisionConfig) from lmdeploy.pytorch.check_env import try_import_deeplink from lmdeploy.serve.async_engine import AsyncEngine from lmdeploy.utils import get_logger -from lmdeploy.vl.constants import IMAGE_DUMMY_TOKEN_INDEX, IMAGE_TOKEN from lmdeploy.vl.engine import ImageEncoder -from lmdeploy.vl.templates import VLPromptType, get_vl_prompt_template +from lmdeploy.vl.utils import load_image logger = get_logger('lmdeploy') +VLPromptType = Union[str, Tuple[str, PIL.Image.Image], + Tuple[str, List[PIL.Image.Image]]] + class VLAsyncEngine(AsyncEngine): """Visual Language Async inference engine.""" - def __init__(self, model_path: str, **kwargs) -> None: - vision_config = kwargs.pop('vision_config', None) - backend_config = kwargs.get('backend_config', None) - if kwargs.get('backend', '') == 'pytorch': + def __init__(self, + model_path: str, + backend: Literal['turbomind', 'pytorch'] = 'turbomind', + backend_config: Optional[Union[TurbomindEngineConfig, + PytorchEngineConfig]] = None, + vision_config: Optional[VisionConfig] = None, + **kwargs) -> None: + if backend == 'pytorch': try_import_deeplink(backend_config.device_type) self.vl_encoder = ImageEncoder(model_path, + backend, vision_config, backend_config=backend_config) - super().__init__(model_path, **kwargs) + super().__init__(model_path, + backend=backend, + backend_config=backend_config, + **kwargs) if self.model_name == 'base': raise RuntimeError( 'please specify chat template as guided in https://lmdeploy.readthedocs.io/en/latest/inference/vl_pipeline.html#set-chat-template' # noqa: E501 ) - self.vl_prompt_template = get_vl_prompt_template( - model_path, self.chat_template, self.model_name) - def _convert_prompts(self, + @classmethod + def _convert_prompts(cls, prompts: Union[VLPromptType, List[Dict], List[VLPromptType], List[List[Dict]]]): - """convert prompts to openai format.""" + """convert prompts to openai GPT4V format.""" if isinstance(prompts, str) or isinstance(prompts, tuple): - _prompts = self.vl_prompt_template.prompt_to_messages(prompts) + _prompts = cls.prompt_to_messages(prompts) elif isinstance(prompts[0], tuple) or isinstance(prompts[0], str): - _prompts = [ - self.vl_prompt_template.prompt_to_messages(x) for x in prompts - ] + _prompts = [cls.prompt_to_messages(x) for x in prompts] else: _prompts = prompts return _prompts async def _get_prompt_input(self, - prompt: Dict, + messages: Union[str, List[Dict]], do_preprocess: bool, sequence_start: bool, adapter_name: str, tools: Optional[List[object]] = None, **kwargs): - """get input_ids, embeddings and offsets.""" - if do_preprocess: - decorated = self.vl_prompt_template.messages2prompt( - prompt, sequence_start) - else: - decorated = prompt - segs = decorated.split(IMAGE_TOKEN) - - results = {} - input_ids = [] - from lmdeploy.vl.templates import (MllamaTempateWrapper, - MolmoChatTemplateWrapper, - Qwen2VLChatTemplateWrapper) - ranges = None - grid_thws = None - if len(segs) > 1: - # yapf: disable - images_with_kwargs = await self.vl_prompt_template.async_collect_pil_images(prompt) # noqa: E501 - # yapf: enable - features = [] - if len(images_with_kwargs) > 0: - images, image_kwargs = list(zip(*images_with_kwargs)) - features = await self.vl_encoder.async_infer( - images, image_kwargs) - - from lmdeploy.vl.templates import 
MiniCPMVTempateWrapper - if isinstance(self.vl_prompt_template, MiniCPMVTempateWrapper): - decorated, features = self.vl_prompt_template.update_image_token( # noqa: E501 - decorated, features) - segs = decorated.split(IMAGE_TOKEN) - - if isinstance(self.vl_prompt_template, - Qwen2VLChatTemplateWrapper): - grid_thws = [x['grid_thw'] for x in features] - features = [x['embeddings'] for x in features] - - if isinstance(self.vl_prompt_template, MllamaTempateWrapper): - # llama3.2 just encode <|image|> and inference - decorated = decorated.replace(IMAGE_TOKEN, '<|image|>') - input_ids = self.tokenizer.encode(decorated, - add_bos=sequence_start) - results['input_ids'] = input_ids - results['prompt'] = decorated - assert len(features) - results['cross_attention_states'] = features[0] - return results - - if isinstance(self.vl_prompt_template, - MolmoChatTemplateWrapper): - return features[0] - - features = [x.cpu().numpy() for x in features] - input_ids = [] - begins = [] - ends = [] - if len(segs) != len(features) + 1: - logger.error( - f'the number of {IMAGE_TOKEN} is not equal ' - f'to input images, {len(segs) - 1} vs {len(features)}') - features = features[:len(segs) - 1] - for i, seg in enumerate(segs): - if i > 0 and i <= len(features): - image_dim = features[i - 1].shape[0] - begins.append(len(input_ids)) - ends.append(begins[-1] + image_dim) - input_ids.extend([IMAGE_DUMMY_TOKEN_INDEX] * image_dim) - seg_ids = self.tokenizer.encode(seg, - add_bos=((i == 0) - and sequence_start)) - input_ids.extend(seg_ids) - ranges = np.stack([begins, ends], axis=1).tolist() - results['input_embeddings'] = features or None - results['input_embedding_ranges'] = ranges or None + """process messages and return the required data for the inference + engines. + + Refer to pytorch.engine.EngineInstance.async_stream_infer and + turbomind.TurboMindInstance.async_stream_infer for the argument + specification. + """ + if isinstance(messages, str): + return await super()._get_prompt_input(messages, do_preprocess, + sequence_start, + adapter_name, tools, + **kwargs) + elif isinstance(messages, List): + has_multimodal_input = any( + isinstance(message['content'], list) and any( + item['type'] in ['image_url', 'image_data'] + for item in message['content']) for message in messages) + if not has_multimodal_input: + return await super()._get_prompt_input(messages, do_preprocess, + sequence_start, + adapter_name, tools, + **kwargs) else: - input_ids = self.tokenizer.encode(decorated, - add_bos=sequence_start) - - if isinstance(self.vl_prompt_template, Qwen2VLChatTemplateWrapper): - # TODO: refactor _get_prompt_input function - mrope_position_ids, mrope_position_delta = \ - self.vl_prompt_template.get_mrope_info( - len(input_ids), grid_thws=grid_thws, - embedding_ranges=ranges) - results['mrope_position_ids'] = mrope_position_ids - results['mrope_position_delta'] = mrope_position_delta - - results['input_ids'] = input_ids - results['prompt'] = decorated + raise RuntimeError(f'unsupported messages {messages}') + + messages = await self.async_convert_to_pil_images(messages) + results = await self.vl_encoder.preprocess(messages) + if self.backend == 'turbomind': + # for tm engine, this module perform vision embedding after image + # preprocessing. It utilizes the hf model's vision embeddings + # functions and returns the input_ids, input_embeddings, + # embedding_ranges and so on. 
All the returned values are passed + # to tm engine for token generation + results = await self.vl_encoder.async_infer(results) + results = await self.vl_encoder.wrap_for_turbomind( + results, self.chat_template, self.tokenizer, sequence_start) + elif self.backend == 'pytorch': + # for pt engine, this module only conduct the image preprocessing + # It leaves the vision embedding to the pt engine + results = await self.vl_encoder.wrap_for_pytorch( + results, self.chat_template, self.tokenizer, sequence_start) return results + @classmethod + async def async_convert_to_pil_images(cls, + messages: List[Dict]) -> List[Dict]: + """Scan the provided messages to find image URLs or base64-encoded + image data. Loads the images into Pillow image objects. + + Args: + messages (List[Dict]): a user request of GPT4V message format + """ + if isinstance(messages, Dict): + messages = [messages] + assert isinstance(messages, List) + + out_messages = [None] * len(messages) + + def _inner_call(i, in_messages, out_messages): + role = in_messages[i]['role'] + content = in_messages[i]['content'] + assert role in ['sytem', 'user', 'assistant'], \ + f'unsupported role "{role}"' + if role != 'user' or isinstance(content, str): + # the content is a user's prompt or an assistant's prompt, + # returning it directly + out_messages[i] = in_messages[i] + return + # the role is a user and the content is a list, in which there + # might be image_url or image_data + assert isinstance(content, List) + message = dict(role=role, content=[]) + for item in content: + # image url or base64-encoded image data + if item['type'] == 'image_url': + """ + convert the following item: + { + 'type': 'image_url', + 'image_url': { + 'url': 'image url or base64-encoded image data', + 'key': 'value' # parameters used in image processing + ... + } + } + to: + { + 'type': 'image', + 'image': Pillow.Image, + 'key': 'value' # parameters used in image processing + ... + } + """ # noqa + data = item['image_url'].copy() + try: + url = data.pop('url') + image = load_image(url) + data.update(type='image', image=image) + message['content'].append(data) + except KeyError: + logger.error(f'invalid format {message}') + elif item['type'] == 'image_data': + """ + convert the following item: + { + 'type': 'image_data', + 'image_data': { + 'data': Pillow.Image, + 'key': 'value' # parameters used in image processing + ... + } + } + to: + { + 'type': 'image', + 'image': Pillow.Image, + 'key': 'value' # parameters used in image processing + ... 
+ } + """ # noqa + data = item['image_data'].copy() + try: + image = data.pop('data') + data.update(type='image', image=image) + message['content'].append(data) + except KeyError: + logger.error(f'invalid format {message}') + elif item['type'] == 'text': + message['content'].append(item) + else: + logger.error(f'unexpected content type {message}') + out_messages[i] = message + + await asyncio.gather(*[ + asyncio.get_event_loop().run_in_executor(None, _inner_call, i, + messages, out_messages) + for i in range(len(messages)) + ]) + return out_messages + def batch_infer(self, prompts: Union[VLPromptType, List[Dict], List[VLPromptType], List[List[Dict]]], **kwargs): @@ -173,3 +233,46 @@ def chat(self, prompts: VLPromptType, **kwargs): last_round = sess.history[-1] sess.history[-1] = (prompts, last_round[-1]) return sess + + @classmethod + def prompt_to_messages(cls, prompt: VLPromptType): + """convert prompt to GTP4V format.""" + messages = { + 'role': 'user', + 'content': [{ + 'type': 'text', + 'text': '', + }] + } + if isinstance(prompt, str): + messages['content'][0]['text'] = prompt + else: + prompt, images = prompt + if not isinstance(images, list): + images = [images] + messages['content'][0]['text'] = prompt + for image in images: + # 'image_url': means url or local path to image. + # 'image_data': means PIL.Image.Image object. + if isinstance(image, str): + image = load_image(image) + item = { + 'type': 'image_data', + 'image_data': { + 'data': image + } + } + elif isinstance(image, PIL.Image.Image): + item = { + 'type': 'image_data', + 'image_data': { + 'data': image + } + } + else: + raise ValueError( + 'image should be a str(url/path) or PIL.Image.Image') + + messages['content'].append(item) + + return [messages] diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py index 124fd537c6..7f786d5f90 100644 --- a/lmdeploy/vl/engine.py +++ b/lmdeploy/vl/engine.py @@ -1,13 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
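The GPT-4V style message structure that `prompt_to_messages` above builds, and that `async_convert_to_pil_images` rewrites before preprocessing; the text and image values are illustrative:

    # produced for a ('describe this image', PIL.Image) prompt
    messages = [{
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'describe this image'},
            {'type': 'image_data', 'image_data': {'data': '<PIL.Image.Image>'}},
        ],
    }]
    # async_convert_to_pil_images() then turns each image item into
    # {'type': 'image', 'image': <PIL.Image.Image>, ...} for the vision encoder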
import asyncio -import inspect -import queue -import time -from threading import Thread from typing import Dict, List, Optional, Union import torch -from PIL.Image import Image from lmdeploy.messages import (PytorchEngineConfig, TurbomindEngineConfig, VisionConfig) @@ -27,169 +22,94 @@ def _raise_exception_on_finish(task: asyncio.Task) -> None: raise e -class Record: - """Batching manager.""" - - def __init__(self, thread_safe): - self.thread_safe = thread_safe - self.number = [] - self.waiting = [] - self.kwargs = [] - self.done = [] - self.res_que = [] - self.total = 0 - - def enqueue(self, images: List[Image], kwargs: List[Dict], - que: Union[queue.Queue, asyncio.Queue]): - """add ith request to manager.""" - self.number.append(len(images)) - self.waiting.extend(images) - self.kwargs.extend(kwargs) - self.res_que.append(que) - self.total += len(images) - self.log('received', len(images)) - - def dequeue(self, max_batch_size): - """try to dequeue max batch size images.""" - inputs = self.waiting[:max_batch_size] - kwargs = self.kwargs[:max_batch_size] - self.waiting = self.waiting[max_batch_size:] - self.kwargs = self.kwargs[max_batch_size:] - self.total -= len(inputs) - self.log('process', len(inputs)) - return inputs, kwargs - - def notify(self): - """set result if request i is finished.""" - if len(self.number) == 0 or self.number[0] > len(self.done): - return False - num_images = self.number.pop(0) - outputs = self.done[:num_images] - self.done = self.done[num_images:] - que = self.res_que.pop(0) - self.log('done', num_images) - if self.thread_safe: - que._loop.call_soon_threadsafe(que.put_nowait, outputs) - else: - que.put_nowait(outputs) - return True - - def log(self, task: str, num: int): - logger.info(f'ImageEncoder {task} {num} images, ' - f'left {self.total} images.') - - class ImageEncoder: """Image encoder.""" - def __init__(self, - model_path: str, - vision_config: VisionConfig = None, - backend_config: Optional[Union[TurbomindEngineConfig, - PytorchEngineConfig]] = None): - self.model = load_vl_model(model_path, backend_config=backend_config) + def __init__( + self, + model_path: str, + backend: str, + vision_config: VisionConfig = None, + backend_config: Optional[Union[TurbomindEngineConfig, + PytorchEngineConfig]] = None, + ): + self.model = load_vl_model(model_path, + backend, + backend_config=backend_config) if vision_config is None: vision_config = VisionConfig() self.vision_config = vision_config self.max_batch_size = vision_config.max_batch_size torch.cuda.empty_cache() - self._que: asyncio.Queue = None - self._loop_task: asyncio.Task = None - if vision_config.thread_safe: - self._create_thread_safe_task() - - def _create_thread_safe_task(self): - """thread safe loop task.""" - self._loop = asyncio.new_event_loop() - def _work_thread(): - asyncio.set_event_loop(self._loop) - self._que = asyncio.Queue() - self._loop.run_until_complete(self._forward_loop()) - - thread = Thread(target=_work_thread, daemon=True) - thread.start() - self._loop_thread = thread - - def _create_event_loop_task(self): - """event loop task.""" - task = asyncio.get_event_loop().create_task(self._forward_loop()) - self._loop_task = task - self._loop = task.get_loop() - - @property - def req_que(self): - if self.vision_config.thread_safe: - return self._que - if self._que is None: - self._que = asyncio.Queue() - if self._loop_task is None: - self._create_event_loop_task() - if asyncio.get_event_loop() != self._loop: - raise RuntimeError('Current event loop is different from' - ' the one bound to 
loop task!') - return self._que - - async def _forward_loop(self): - """working loop to process images.""" - logger.info('start ImageEncoder._forward_loop') - record = Record(self.vision_config.thread_safe) - while True: - while record.total == 0 or (self._que.qsize() and - record.total < self.max_batch_size): - while self._que.qsize() == 0: - await asyncio.sleep(0.01) - item = await self._que.get() - record.enqueue(item[0], item[1], item[2]) - inputs, kwargs = record.dequeue(self.max_batch_size) - future = asyncio.get_event_loop().run_in_executor( - None, self.forward, inputs, kwargs) - future.add_done_callback(_raise_exception_on_finish) - outputs = await future - record.done.extend(outputs) - while record.notify(): - pass - - def _init_input_params(self, - inputs: List[Image], - params: List[Dict] = None): - """Check and init inputs params.""" - if params is None: - params = [{}] * len(inputs) - assert len(params) == len(inputs), \ - 'different length of inputs and kwargs' - return params - - def forward(self, inputs: List[Image], params: List[Dict] = None): - """Model forward.""" - params = self._init_input_params(inputs, params) - time_start = time.perf_counter() - func_params = inspect.signature(self.model.forward).parameters - func_inputs = [inputs, params] if len(func_params) > 1 else [inputs] - outputs = self.model.forward(*func_inputs) - if isinstance(outputs[0], torch.Tensor): - outputs = [x.cpu() for x in outputs] - time_end = time.perf_counter() - logger.info(f'ImageEncoder forward {len(inputs)} images, ' - f'cost {time_end - time_start:.3f}s') + async def preprocess(self, messages: List[Dict]) -> List[Dict]: + """preprocess multimodal data in the messages.""" + future = asyncio.get_event_loop().run_in_executor( + None, self.model.preprocess, messages) + future.add_done_callback(_raise_exception_on_finish) + outputs = await future return outputs - def infer(self, inputs: List[Image], params: List[Dict] = None): - """infer.""" - params = self._init_input_params(inputs, params) - results = self.forward(inputs, params) - return results + async def async_infer(self, messages: List[Dict]) -> List[Dict]: + """get multimodal embedding. + + Args: + messages (List[Dict]): a list of message, which is the output + of `preprocess()` + """ + future = asyncio.get_event_loop().run_in_executor( + None, self.model.forward, messages, self.max_batch_size) + future.add_done_callback(_raise_exception_on_finish) + outputs = await future + return outputs - async def async_infer(self, - inputs: List[Image], - params: List[Dict] = None): - """async infer.""" - params = self._init_input_params(inputs, params) - outputs = asyncio.Queue() - item = (inputs, params, outputs) - if self.vision_config.thread_safe: - self._loop.call_soon_threadsafe(self._que.put_nowait, item) - else: - self.req_que.put_nowait(item) - results = await outputs.get() - return results + async def wrap_for_pytorch(self, messages: List[Dict], chat_template, + tokenizer, sequence_start) -> List[Dict]: + """ + Args: + messages (List[Dict]): a list of message, which is supposed to be + the output of `preprocess` + Returns: + a dict which will be passed to pytorch engine_instance's forward. + The dict is like the following: + Dict( + 'prompt': 'the prompt after applying chat template' + 'input_ids': [], + 'multimodal': { + 'pixel_values': torch.Tensor, + ... 
+ ] + ) + """ + result = self.model.to_pytorch(messages, chat_template, tokenizer, + sequence_start) + # clear data + for i, message in enumerate(messages): + if isinstance(message['content'], List): + messages[i]['preprocess'] = None + return result + + async def wrap_for_turbomind(self, messages: List[Dict], chat_template, + tokenizer, sequence_start) -> Dict: + """ + Args: + messages (List[Dict]): a list of message, which is supposed to be + the output of `async_infer` + Returns: + a dict which will be passed to pytorch engine_instance's forward. + The dict is like the following: + Dict( + 'prompt': 'the prompt after applying chat template' + 'input_ids': [], + 'input_embeddings': list[torch.Tensor], + 'input_embedding_ranges': list[torch.Tensor], + ... + """ + result = self.model.to_turbomind(messages, chat_template, tokenizer, + sequence_start) + # clear data + for i, message in enumerate(messages): + if isinstance(message['content'], List): + messages[i]['preprocess'] = None + messages[i]['forward'] = None + return result diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py index 9c5f5f6e6a..a350e70a94 100644 --- a/lmdeploy/vl/model/base.py +++ b/lmdeploy/vl/model/base.py @@ -2,8 +2,7 @@ from abc import ABC, abstractmethod from typing import Dict, List, Union -import PIL -import torch +import numpy as np from mmengine import Registry from transformers import AutoConfig @@ -18,37 +17,227 @@ class VisonModel(ABC): def __init__(self, model_path: str, - with_llm: bool = False, max_memory: Dict[int, int] = None, - hf_config: AutoConfig = None): + hf_config: AutoConfig = None, + backend: str = ''): """init.""" self.model_path = model_path - self.with_llm = with_llm self.max_memory = max_memory + self.backend = backend if hf_config is None: _, hf_config = get_model_arch(model_path) self.hf_config = hf_config - self.build_model() @abstractmethod - def build_model(): - """build model.""" + def build_preprocessor(self, ): + """build the preprocessor. + + NOTE: When the derived class implements this method, try not to + introduce the upper stream model repo as a thirdparty package + """ raise NotImplementedError() + def build_model(self, ): + """build model. + + ONLY implement it when the backend is turbomind engine + """ + if self.backend == 'turbomind': + raise NotImplementedError() + @abstractmethod + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """preprocess multimodal data in the messages. The derived class, + i.e., a specific vision model, takes the charge of image preprocessing + and the result management. + It can integrate the result into the messages list, or insert it to + the individual image item. + Args: + message(Dict): multimodal data in a dict, which is as follows: + [ + {'role': 'user', 'content': 'user prompt'}, + {'role': 'assisant', 'content': 'AI reponse'}, + { + 'role': 'user', + 'content': [ + { + 'type': 'text', + 'text': 'string', + }, + { + 'type': 'image', + 'image': pillow.Image, + 'key1': value1, + ... + }, + { + 'type': 'image', + 'image': pillow.Image, + 'key1': value1, + ... + }, + ... + ] + } + {....} + ] + Returns: + the message list with preprocessing results included, which is + determined by the derived classes + """ # noqa + raise NotImplementedError() + def forward(self, - images: List[PIL.Image.Image], - image_kwargs: List[Dict] = None) -> List[torch.Tensor]: - """extract image feature. + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. 
ONLY implement it when the backend is + turbomind engine. Args: - images (List[PIL.Image.Image]): input images - image_kwargs (List[Dict]): input kwargs for each images - + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model Return: - List[torch.Tensor]: extract image feature for each input image + the message list with forwarding results included, which is + determined by the derived classes """ - raise NotImplementedError() + if self.backend == 'turbomind': + raise NotImplementedError() + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + """pack the preprocessing results in a format compatible with what is + required by pytorch engine. ONLY implement it when the backend is + pytorch engine. + + Args: + messages(List[Dict]): the output of `preprocess` + chat_template: the chat template defined in `lmdeploy/model.py` + tokenzer: the tokenizer model + sequence_start: starting flag of a sequence + """ + if self.backend == 'pytorch': + raise NotImplementedError() + + def to_turbomind(self, messages, chat_template, tokenizer, sequence_start): + """pack the forwarding results in a format compatible with what is + required by turbomind engine. ONLY implement it when the backend is + turbomind engine. + + Args: + messages(List[Dict]): the output of `preprocess` + chat_template: the chat template defined in `lmdeploy/model.py` + tokenzer: the tokenizer model + sequence_start: starting flag of a sequence + """ + if self.backend == 'turbomind': + raise NotImplementedError() + + @staticmethod + def collect_images(messages): + """gather all images along with their respective parameters from the + messages and compile them into a single list. Each image is converted + to RGB color space. + + Args: + messages (List[Tuple[Image, Dict]]): a list of images with their + corresponding parameters + """ # noqa + images = [] + for message in messages: + content = message['content'] + if not isinstance(content, List): + continue + images.extend([ + (x['image'], + {k: v + for k, v in x.items() if k not in {'type', 'image'}}) + for x in content if x['type'] == 'image' + ]) + return images + + @staticmethod + def to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start): + """auxiliary function to pack the preprocessing results in a format + compatible with what is required by pytorch engine. 
+ + Args: + messages(List[Dict]): the output of `preprocess` + prompt(str): the prompt after applying chat template + IMAGE_TOKEN(str): a placeholder where image tokens will be + inserted + tokenzer: the tokenizer model + sequence_start: starting flag of a sequence + """ + # collect all preprocessing result from messages + preps = [x['content'] for x in messages if x['role'] == 'preprocess'] + assert len(preps) == 1 + preps = preps[0] + + # split prompt into segments and validate data + segs = prompt.split(IMAGE_TOKEN) + assert len(segs) == len(preps) + 1, ( + f'the number of {IMAGE_TOKEN} is not equal ' + f'to input images, {len(segs) - 1} vs {len(preps)}') + + # calculate the image token offset for each image + input_ids = [] + for i, seg in enumerate(segs): + if i > 0 and i <= len(preps): + preps[i - 1].update(offset=len(input_ids)) + image_tokens = preps[i - 1]['image_tokens'] + image_token_id = preps[i - 1]['image_token_id'] + input_ids.extend([image_token_id] * image_tokens) + token_ids = tokenizer.encode(seg, + add_bos=((i == 0) and sequence_start)) + input_ids.extend(token_ids) + + return dict(prompt=prompt, input_ids=input_ids, multimodal=preps) + + @staticmethod + def to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start): + """auxiliary function to pack the forwarding results in a format + compatible with what is required by turbomind engine. + + Args: + messages(List[Dict]): the output of `preprocess` + prompt(str): the prompt after applying chat template + IMAGE_TOKEN(str): a placeholder where image tokens will be + inserted + tokenzer: the tokenizer model + sequence_start: starting flag of a sequence + """ + # collect image features from messages + features = [x['content'] for x in messages if x['role'] == 'forward'] + features = features[0] + features = [x.cpu().numpy() for x in features] + + # split prompt into segments and validate data + segs = prompt.split(IMAGE_TOKEN) + assert len(segs) == len(features) + 1, ( + f'the number of {IMAGE_TOKEN} is not equal ' + f'to input images, {len(segs) - 1} vs {len(features)}') + + # tokenizer prompt, and get input_embeddings and input_embedding_ranges + input_ids = [] + begins = [] + ends = [] + IMAGE_DUMMY_TOKEN_INDEX = 0 + for i, seg in enumerate(segs): + if i > 0 and i <= len(features): + image_dim = features[i - 1].shape[0] + begins.append(len(input_ids)) + ends.append(begins[-1] + image_dim) + input_ids.extend([IMAGE_DUMMY_TOKEN_INDEX] * image_dim) + seg_ids = tokenizer.encode(seg, + add_bos=((i == 0) and sequence_start)) + input_ids.extend(seg_ids) + ranges = np.stack([begins, ends], axis=1).tolist() + return dict(prompt=prompt, + input_ids=input_ids, + input_embeddings=features, + input_embedding_ranges=ranges) @classmethod def match(cls, config: AutoConfig): diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/vl/model/builder.py index 2401b42259..24ec4ccb60 100644 --- a/lmdeploy/vl/model/builder.py +++ b/lmdeploy/vl/model/builder.py @@ -2,6 +2,8 @@ import os from typing import Optional, Union +import torch + from lmdeploy.archs import get_model_arch from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig from lmdeploy.utils import get_logger, get_model @@ -29,15 +31,14 @@ def load_vl_model(model_path: str, - with_llm: bool = False, + backend: str, backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None): """load visual model. Args: model_path(str): the path or repo_id from model hub of the model - with_llm(bool): whether to remove the LLM part from the model. 
- When it is False, it means removing LLM part + backend(str): the name of inference backend backend_config: the config of the inference engine """ if not os.path.exists(model_path): @@ -48,39 +49,25 @@ def load_vl_model(model_path: str, download_dir=download_dir) max_memory = None - if not with_llm: - import torch - tp = getattr(backend_config, 'tp', 1) - max_memory = {i: torch.cuda.mem_get_info(i)[0] for i in range(tp)} + tp = getattr(backend_config, 'tp', 1) + max_memory = {i: torch.cuda.mem_get_info(i)[0] for i in range(tp)} _, hf_config = get_model_arch(model_path) kwargs = dict(model_path=model_path, - with_llm=with_llm, max_memory=max_memory, - hf_config=hf_config) + hf_config=hf_config, + backend=backend) for name, module in VISION_MODELS.module_dict.items(): try: if module.match(hf_config): logger.info(f'matching vision model: {name}') - return module(**kwargs) + model = module(**kwargs) + model.build_preprocessor() + if backend == 'turbomind': + model.build_model() + return model except Exception: logger.error(f'matching vision model: {name} failed') raise raise ValueError(f'unsupported vl model with config {hf_config}') - - -def vl_model_with_tokenizer(model_path: str, with_llm: bool = True): - """load visual model.""" - vl_model = load_vl_model(model_path, with_llm).vl_model - llm = vl_model - if hasattr(vl_model, 'language_model'): # deepseek vl - llm = vl_model.language_model - if hasattr(vl_model, 'llm'): # MiniCPMV - llm = vl_model.llm - llm.config.use_cache = False - llm.half().eval() - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model_path, - trust_remote_code=True) - return vl_model, llm, tokenizer diff --git a/lmdeploy/vl/model/cogvlm.py b/lmdeploy/vl/model/cogvlm.py index ea5a06159e..abeeff31ce 100644 --- a/lmdeploy/vl/model/cogvlm.py +++ b/lmdeploy/vl/model/cogvlm.py @@ -1,13 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
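A stripped-down sketch of the placeholder expansion performed by `to_pytorch_aux` above, with a toy stand-in for the tokenizer; the token ids and counts are made up:

    IMAGE_TOKEN = '<IMAGE_TOKEN>'
    prompt = f'USER: {IMAGE_TOKEN}\ndescribe the image ASSISTANT:'
    preps = [dict(image_tokens=4, image_token_id=0)]   # one image, 4 pad tokens

    input_ids = []
    for i, seg in enumerate(prompt.split(IMAGE_TOKEN)):
        if 0 < i <= len(preps):
            preps[i - 1]['offset'] = len(input_ids)
            input_ids += [preps[i - 1]['image_token_id']] * preps[i - 1]['image_tokens']
        input_ids += [ord(c) for c in seg]   # stand-in for tokenizer.encode(seg, ...)
    # input_ids now contains a run of 4 image_token_id values where the image goes,
    # and preps[0]['offset'] records where that run starts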
-import warnings -from typing import List - -import torch -from PIL.Image import Image -from transformers import AutoModelForCausalLM +from typing import Dict, List +from lmdeploy.utils import get_logger from lmdeploy.vl.model.base import VISION_MODELS, VisonModel -from lmdeploy.vl.model.utils import disable_logging + +logger = get_logger('lmdeploy') @VISION_MODELS.register_module() @@ -16,7 +13,7 @@ class CogVLMVisionModel(VisonModel): _arch = 'CogVLMForCausalLM' - def build_model(self): + def build_preprocessor(self): from torchvision import transforms self.image_transform = transforms.Compose([ transforms.Resize( @@ -26,57 +23,65 @@ def build_model(self): transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), ]) + image_size = self.hf_config.vision_config['image_size'] + patch_size = self.hf_config.vision_config['patch_size'] + self.n_token_per_image = 2 + (image_size // patch_size // 2)**2 - from accelerate import init_empty_weights, load_checkpoint_and_dispatch - from accelerate.utils import get_balanced_memory, infer_auto_device_map - with init_empty_weights(), warnings.catch_warnings(): - model = AutoModelForCausalLM.from_config(self.hf_config, - trust_remote_code=True) - if not self.with_llm: - del model.lm_head - for key in ['layers', 'norm', 'embed_tokens']: - setattr(model.model, key, None) - else: - self.vl_model = model + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refer to the spec of `super().preprocess`""" + images = self.collect_images(messages) + outputs = [] + for image, _ in images: + image = image.convert('RGB') + pixel_values = self.image_transform(image) + outputs.append( + dict(pixel_values=pixel_values, + image_size=image.size, + image_tokens=self.n_token_per_image, + image_token_id=0)) + messages.append(dict(role='preprocess', content=outputs)) + return messages - no_split_module_classes = ['TransformerLayer'] - max_memory = get_balanced_memory( - model, - max_memory=self.max_memory, - dtype=torch.half, - no_split_module_classes=no_split_module_classes) - device_map = infer_auto_device_map( - model, - no_split_module_classes=no_split_module_classes, - max_memory=max_memory, - dtype=torch.half) - same_device_keys = [('model.vision.linear_proj', 'model.vision.boi', - 'model.vision.eoi')] - for keys in same_device_keys: - keys = [k for k in keys if k in device_map] - if len(keys) <= 1: + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) continue - for k in keys[1:]: - device_map[k] = device_map[keys[0]] + elif message['role'] in ['images', 'preprocess', 'forward']: + continue + content = [ + x['text'] for x in message['content'] if x['type'] == 'text' + ] + n_images = len( + [1 for x in message['content'] if x['type'] == 'image']) + + prompt_messages.append( + dict(role='user', content=content[0], num_images=n_images)) - with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map=device_map if not self.with_llm else {'': 'cpu'}, - no_split_module_classes=no_split_module_classes, - dtype=torch.half) - self.model = model.model.vision - self.model.eval() + from lmdeploy.model import Vicuna + llm_chat_template = Vicuna(eoa=chat_template.eoa, + stop_words=chat_template.stop_words) + prompt = '' + IMAGE_TOKEN = '' + for i, msg in enumerate(prompt_messages): + 
num_images = msg.pop('num_images', 0) + if num_images == 0: + role = msg['role'] + msg = llm_chat_template.messages2prompt([msg], sequence_start + and i == 0) + msg = dict(role=role, content=msg) + prompt_i = chat_template.messages2prompt([msg], sequence_start + and i == 0) + if num_images > 0: + prompt_i = (IMAGE_TOKEN * num_images) + prompt_i + prompt += prompt_i + return prompt, IMAGE_TOKEN - @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" - outputs = [x.convert('RGB') for x in images] - outputs = [self.image_transform(x) for x in outputs] - outputs = torch.stack(outputs, dim=0).to(device='cuda:0', - dtype=torch.half) - outputs = self.model(outputs) - outputs = torch.split(outputs, 1, dim=0) - outputs = [x.squeeze() for x in outputs] - return outputs + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/deepseek.py b/lmdeploy/vl/model/deepseek.py index bfbf03f01e..ec5f2e21f7 100644 --- a/lmdeploy/vl/model/deepseek.py +++ b/lmdeploy/vl/model/deepseek.py @@ -1,15 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. - import warnings -from typing import List +from typing import Dict, List import torch -from PIL.Image import Image from transformers import AutoModelForCausalLM +from lmdeploy.utils import get_logger from lmdeploy.vl.model.base import VISION_MODELS, VisonModel from lmdeploy.vl.model.utils import disable_logging +logger = get_logger('lmdeploy') + def check_deepseek_vl_install(): """check deepseek_vl install.""" @@ -18,8 +19,8 @@ def check_deepseek_vl_install(): except ImportError: raise ImportError( 'To use DeepSeekVLModel, please install deepseek_vl by ' - 'pip install git+https://github.com/deepseek-ai/DeepSeek-VL.git' - ' --no-deps') + '`pip install git+https://github.com/deepseek-ai/DeepSeek-VL.git' + ' --no-deps`') @VISION_MODELS.register_module() @@ -28,18 +29,18 @@ class DeepSeekVisionModel(VisonModel): _arch = 'MultiModalityCausalLM' - def build_model(self): + def build_preprocessor(self): check_deepseek_vl_install() - # empty init - from accelerate import init_empty_weights from deepseek_vl.models import VLChatProcessor + self.image_processor = VLChatProcessor.from_pretrained( + self.model_path).image_processor + + def build_model(self): + from accelerate import init_empty_weights with init_empty_weights(): warnings.simplefilter('ignore') model = AutoModelForCausalLM.from_pretrained(self.model_path) - if not self.with_llm: - del model.language_model - else: - self.vl_model = model + del model.language_model from accelerate.utils import get_balanced_memory, infer_auto_device_map max_memory = get_balanced_memory(model, @@ -73,29 +74,115 @@ def build_model(self): from accelerate import load_checkpoint_and_dispatch with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map=device_map if not self.with_llm else {'': 'cpu'}, - dtype=torch.half) + load_checkpoint_and_dispatch(model=model, + checkpoint=self.model_path, + device_map=device_map, + dtype=torch.half) self.vision_model = model.vision_model.eval() self.aligner = model.aligner.eval() - self.image_processor = VLChatProcessor.from_pretrained( - self.model_path).image_processor + + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refers to the spec of `super.preprocess()""" + 
images = self.collect_images(messages) + outputs = [] + for image, _ in images: + image = image.convert('RGB') + pixel_values = self.image_processor( + [image], return_tensors='pt').pixel_values + outputs.append( + dict( + pixel_values=pixel_values, + image_size=image.size, + # refer to https://github.com/deepseek-ai/DeepSeek-VL/blob/main/deepseek_vl/models/processing_vlm.py # noqa + # which is hardcoded 576 + image_tokens=576, + image_token_id=0)) + messages.append(dict(role='preprocess', content=outputs)) + return messages @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" - outputs = [x.convert('RGB') for x in images] - pixel_values = self.image_processor(outputs, - return_tensors='pt').pixel_values - pixel_values = pixel_values.to(device=next( - self.vision_model.parameters()).device, - dtype=torch.float16) - # [b x n_images, T2, D] - images_embeds = self.aligner(self.vision_model(pixel_values)) - - outputs = torch.split(images_embeds, 1, dim=0) - outputs = [x.squeeze() for x in outputs] - return outputs + def forward(self, + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. + + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + inputs = [x['content'] for x in messages if x['role'] == 'preprocess'] + inputs = inputs[0] + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] + ] + pixel_values = torch.cat(pixel_values, dim=0) + pixel_values = pixel_values.to(device=next( + self.vision_model.parameters()).device, + dtype=torch.float16) + # [b x n_images, T2, D] + logger.info(f'vision forward shape: {pixel_values.shape}') + feats = self.aligner(self.vision_model(pixel_values)) + feats = torch.split(feats, 1, dim=0) + outputs.extend([x.squeeze() for x in feats]) + messages.append(dict(role='forward', content=outputs)) + return messages + + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + # apply chat template to get the prompt + prompt_messages = [] + IMAGE_TOKEN = '' + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) + continue + elif message['role'] in ['images', 'preprocess', 'forward']: + continue + content = [ + x['text'] for x in message['content'] if x['type'] == 'text' + ] + content = content[0] + n_image = sum( + [1 for x in message['content'] if x['type'] == 'image']) + n_placeholder = content.count(IMAGE_TOKEN) + if n_placeholder == 0: + logger.warning( + f"""for deepseek-vl model, the user should insert the {IMAGE_TOKEN} + to user prompt manually, please read https://lmdeploy.readthedocs.io/en/latest/inference/vl_pipeline.html + for more details.""") # noqa + if n_placeholder != 0 and n_placeholder != n_image: + logger.error( + f'unmatched placeholder and image: {n_placeholder} vs ' + f'{n_image}. 
Ignore the placeholder') + content = content.replace(IMAGE_TOKEN, '') + n_placeholder = 0 + if n_placeholder == 0: + if n_image == 1: + content = f'{IMAGE_TOKEN}{content}' + else: + content = ''.join([ + f'{IMAGE_TOKEN} is Figure {str(i)}.\n' + for i in range(n_image) + ]) + content + prompt_messages.append(dict(role='user', content=content)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) + + def to_turbomind(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/glm_4v.py b/lmdeploy/vl/model/glm_4v.py index 34e060f4c9..7cdd96d5dc 100644 --- a/lmdeploy/vl/model/glm_4v.py +++ b/lmdeploy/vl/model/glm_4v.py @@ -1,14 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List -import warnings -from typing import List - -import torch -from PIL.Image import Image from transformers import AutoConfig +from lmdeploy.utils import get_logger from lmdeploy.vl.model.base import VISION_MODELS, VisonModel -from lmdeploy.vl.model.utils import disable_logging + +logger = get_logger('lmdeploy') @VISION_MODELS.register_module() @@ -25,53 +23,8 @@ def match(cls, config: AutoConfig): return True return False - def build_model(self): - from accelerate import init_empty_weights, load_checkpoint_and_dispatch - from accelerate.utils import infer_auto_device_map + def build_preprocessor(self): from torchvision import transforms - - with init_empty_weights(), warnings.catch_warnings(): - warnings.simplefilter('ignore') - from transformers import AutoModelForCausalLM - model = AutoModelForCausalLM.from_config(self.hf_config, - trust_remote_code=True) - if not self.with_llm: - del model.transformer.embedding - del model.transformer.rotary_pos_emb - del model.transformer.encoder - del model.transformer.output_layer - else: - self.vl_model = model - - no_split_module_classes = ['TransformerLayer'] - - device_map = infer_auto_device_map( - model, - no_split_module_classes=no_split_module_classes, - max_memory=self.max_memory, - dtype=torch.half) - - same_device_keys = [ - ('transformer.vision.linear_proj', 'transformer.vision.boi', - 'transformer.vision.eoi') - ] - for keys in same_device_keys: - keys = [k for k in keys if k in device_map] - if len(keys) <= 1: - continue - for k in keys[1:]: - device_map[k] = device_map[keys[0]] - - with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map=device_map if not self.with_llm else {'': 'cpu'}, - no_split_module_classes=no_split_module_classes, - dtype=torch.half) - - model.eval() - self.model = model self.image_transform = transforms.Compose([ transforms.Resize( (self.hf_config.vision_config['image_size'], ) * 2, @@ -80,15 +33,57 @@ def build_model(self): transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), ]) + image_size = self.hf_config.vision_config['image_size'] + patch_size = self.hf_config.vision_config['patch_size'] + self.n_token_per_image = 2 + (image_size // patch_size // 2)**2 + + def preprocess(self, messages: List[Dict]) -> 
List[Dict]: + """refers to the spec of `super.preprocess()""" + outputs = [] + for message in messages: + if not isinstance(message['content'], List): + continue + images = [ + x['image'] for x in message['content'] if x['type'] == 'image' + ] + if len(images) > 1: + logger.warning( + f'glm4v does not support the input of multiple images' + f' in a single chat round, but got {len(images)} images.') + # we still pass all the images to the model and let the + # model decide what to do + images = [x.convert('RGB') for x in images] + pixel_values = [self.image_transform(x) for x in images] + outputs.extend([ + dict(pixel_values=_2, + image_size=_1.size, + image_tokens=self.n_token_per_image, + image_token_id=0) for _1, _2 in zip(images, pixel_values) + ]) + messages.append(dict(role='preprocess', content=outputs)) + return messages + + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + IMAGE_TOKEN = '' + for message in messages: + content = message['content'] + if isinstance(content, str): + prompt_messages.append(message) + continue + elif message['role'] in ['preprocess', 'forward']: + continue + prompt = [x['text'] for x in content if x['type'] == 'text'] + n_images = len([1 for x in content if x['type'] == 'image']) + prompt = ''.join([f'{IMAGE_TOKEN}\n'] * n_images) + prompt[0] + prompt_messages.append(dict(role='user', content=prompt)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN - @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" - outputs = [x.convert('RGB') for x in images] - outputs = [self.image_transform(x) for x in outputs] - outputs = torch.stack(outputs, dim=0).to(device='cuda:0', - dtype=torch.half) - outputs = self.model.transformer.vision(outputs) - outputs = torch.split(outputs, 1, dim=0) - outputs = [x.squeeze() for x in outputs] - return outputs + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py index fa67192f11..af4fba30ca 100644 --- a/lmdeploy/vl/model/internvl.py +++ b/lmdeploy/vl/model/internvl.py @@ -1,10 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
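The per-image token count registered in `preprocess` above comes straight from the vision config; for instance, assuming the commonly published GLM-4V values (image_size 1120, patch_size 14), which are an assumption here rather than something read from this patch:

    # 2 + (image_size // patch_size // 2) ** 2:
    # two image-boundary tokens plus the 2x-merged patch grid
    n_token_per_image = 2 + (1120 // 14 // 2) ** 2
    print(n_token_per_image)  # 1602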
- from typing import Dict, List import torch -from PIL.Image import Image -from transformers import AutoModel, CLIPImageProcessor +from transformers import AutoConfig, AutoModel, CLIPImageProcessor from lmdeploy.utils import get_logger from lmdeploy.vl.model.base import VISION_MODELS, VisonModel @@ -80,34 +78,15 @@ class InternVLVisionModel(VisonModel): _arch = 'InternVLChatModel' - def build_model(self): - """Load model.""" - from accelerate import init_empty_weights - with init_empty_weights(): - config = self.hf_config - # transformers below 4.37.0 may raise error about flash_attn - config.llm_config.attn_implementation = 'eager' - model = AutoModel.from_config(config, trust_remote_code=True) - if not self.with_llm: - del model.language_model - else: - self.vl_model = model - model.half() + def __init__(self, + model_path: str, + max_memory: Dict[int, int] = None, + hf_config: AutoConfig = None, + backend: str = ''): + super().__init__(model_path, max_memory, hf_config, backend) - from accelerate import load_checkpoint_and_dispatch - with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map='auto' if not self.with_llm else {'': 'cpu'}, - max_memory=self.max_memory, - no_split_module_classes=['InternVisionEncoderLayer'], - dtype=torch.half) - - # We need eval mode to freeze the weights in model, thus, - # avoid randomness in inference. - self.model = model.eval() - self.config = config + def build_preprocessor(self): + self.config = self.hf_config dynamic_image_size = getattr(self.config, 'dynamic_image_size', False) image_processor = None try: @@ -131,62 +110,177 @@ def build_model(self): T.ToTensor(), T.Normalize(mean=MEAN, std=STD) ]) + self.processor = self._preprocess_v1_5 self._forward_func = self._forward_v1_5 else: + self.processor = self._preprocess self.image_processor = image_processor self._forward_func = self._forward - def _preprocess_v1_5(self, images: List[Image], params: List[Dict] = None): - if params is not None: - assert len(images) == len( - params), 'different length of images and params' - else: - params = [{}] * len(images) + force_image_size = self.hf_config.force_image_size + patch_size = self.hf_config.vision_config.patch_size + downsample_ratio = self.hf_config.downsample_ratio + self.image_tokens_per_patch = int( + (force_image_size // patch_size)**2 * (downsample_ratio**2)) - image_res = {'low': 6, 'medium': 12, 'high': 24} + def build_model(self): + """Load model.""" + from accelerate import init_empty_weights + with init_empty_weights(): + # transformers below 4.37.0 may raise error about flash_attn + self.config.llm_config.attn_implementation = 'eager' + model = AutoModel.from_config(self.config, trust_remote_code=True) + del model.language_model - outputs = [] - for image, param in zip(images, params): - max_num = param.get('max_dynamic_patch') - if max_num is None or not isinstance(max_num, int): - res_key = param.get('detail', 'default') - max_num = image_res.get(res_key, self.config.max_dynamic_patch) - out = dynamic_preprocess( - image, - min_num=self.config.min_dynamic_patch, - max_num=max_num, - image_size=self.config.vision_config.image_size, - use_thumbnail=self.config.use_thumbnail) - out = [self.transform(x) for x in out] - out = torch.stack(out) # (patch) x c x h x w - outputs.append(out) - return outputs + model.half() + from accelerate import load_checkpoint_and_dispatch + with disable_logging(): + load_checkpoint_and_dispatch( + model=model, + checkpoint=self.model_path, + 
device_map='auto', + max_memory=self.max_memory, + no_split_module_classes=['InternVisionEncoderLayer'], + dtype=torch.half) + + # We need eval mode to freeze the weights in model, thus, + # avoid randomness in inference. + self.model = model.eval() - def _forward_v1_5(self, images: List[Image], params: List[Dict] = None): + def _preprocess_v1_5(self, image, params=None): + image_res = {'low': 6, 'medium': 12, 'high': 24} + max_num = params.get('max_dynamic_patch') + if max_num is None or not isinstance(max_num, int): + res_key = params.get('detail', 'default') + max_num = image_res.get(res_key, self.config.max_dynamic_patch) + out = dynamic_preprocess( + image, + min_num=self.config.min_dynamic_patch, + max_num=max_num, + image_size=self.config.vision_config.image_size, + use_thumbnail=self.config.use_thumbnail) + pixel_values = [self.transform(x) for x in out] + # (patch) x c x h x w + pixel_values = torch.stack(pixel_values) + return pixel_values + + def _forward_v1_5(self, inputs, max_batch_size): """forward for internvl-chat-v1-5.""" - outputs = self._preprocess_v1_5(images, params) - split = [x.shape[0] for x in outputs] - outputs = torch.cat(outputs, dim=0) - outputs = outputs.to(self.model.device, dtype=torch.float16) - outputs = self.model.extract_feature(outputs) - outputs = torch.split(outputs, split, dim=0) - outputs = [x.reshape(-1, x.shape[-1]) for x in outputs] + assert all(x.get('pixel_values') is not None for x in inputs) + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] + ] + split = [x.shape[0] for x in pixel_values] + pixel_values = torch.cat(pixel_values, dim=0) + pixel_values = pixel_values.to(self.model.device, + dtype=torch.float16) + logger.info(f'vision forward shape: {pixel_values.shape}') + feats = self.model.extract_feature(pixel_values) + feats = torch.split(feats, split, dim=0) + outputs.extend([x.reshape(-1, x.shape[-1]) for x in feats]) return outputs - def _forward(self, images: List[Image], params: List[Dict] = None): + def _preprocess(self, image, params=None): """forward for internvl-chat-v1-1, internvl-chat-v1-2.""" - pixel_values = self.image_processor(images=images, + pixel_values = self.image_processor(images=image, return_tensors='pt').pixel_values - pixel_values = pixel_values.to(self.model.device, dtype=torch.float16) - outputs = self.model.extract_feature(pixel_values) - outputs = torch.split(outputs, 1, dim=0) - outputs = [x.squeeze() for x in outputs] + return pixel_values + + def _forward(self, inputs, max_batch_size): + """forward for internvl-chat-v1-1, internvl-chat-v1-2.""" + assert all(x.get('pixel_values') is not None for x in inputs) + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] + ] + pixel_values = torch.cat(pixel_values, dim=0) + pixel_values = pixel_values.to(self.model.device, + dtype=torch.float16) + logger.info(f'vision forward shape: {pixel_values.shape}') + feats = self.model.extract_feature(pixel_values) + feats = torch.split(feats, 1, dim=0) + outputs.extend([x.squeeze() for x in feats]) return outputs + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refers to `super.preprocess() for spec.""" + images = self.collect_images(messages) + outputs = [] + for image, params in images: + image = image.convert('RGB') + pixel_values = self.processor(image, params) + image_tokens = (pixel_values.shape[0] * + 
self.image_tokens_per_patch) + outputs.append( + dict(pixel_values=pixel_values, + image_tokens=image_tokens, + image_token_id=0, + image_size=image.size)) + messages.append(dict(role='preprocess', content=outputs)) + return messages + @torch.no_grad() def forward(self, - images: List[Image], - params: List[Dict] = None) -> List[torch.Tensor]: - """forward.""" - images = [x.convert('RGB') for x in images] - return self._forward_func(images, params) + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. + + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + inputs = [x['content'] for x in messages if x['role'] == 'preprocess'] + inputs = inputs[0] + outputs = self._forward_func(inputs, max_batch_size) + messages.append(dict(role='forward', content=outputs)) + return messages + + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + IMAGE_TOKEN = '' + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) + continue + elif message['role'] in ['preprocess', 'forward']: + continue + n_images = len( + [1 for x in message['content'] if x['type'] == 'image']) + content = [ + x['text'] for x in message['content'] if x['type'] == 'text' + ] + prompt = content[0] + if IMAGE_TOKEN in prompt and f'{IMAGE_TOKEN}' not in prompt: + prompt = prompt.replace(f'{IMAGE_TOKEN}', + f'{IMAGE_TOKEN}') + prompt = prompt.replace('', '') + prompt = prompt.replace('', '') + prompt = prompt.replace('', '') + elif IMAGE_TOKEN not in prompt: + prompt = f'{IMAGE_TOKEN * n_images}\n' + prompt + else: + pass + prompt_messages.append(dict(role='user', content=prompt)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) + + def to_turbomind(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/internvl_llava.py b/lmdeploy/vl/model/internvl_llava.py index f607082b18..44693dd1e7 100644 --- a/lmdeploy/vl/model/internvl_llava.py +++ b/lmdeploy/vl/model/internvl_llava.py @@ -2,14 +2,13 @@ import warnings from contextlib import contextmanager -from typing import List, Union +from typing import Dict, List import torch -from PIL.Image import Image from transformers import AutoConfig, AutoModelForCausalLM from lmdeploy.utils import get_logger -from lmdeploy.vl.model.base import VISION_MODELS, VisonModel +from lmdeploy.vl.model.llava import VISION_MODELS, LlavaVisionModel from lmdeploy.vl.model.utils import rewrite_ctx from .utils import disable_logging, disable_transformers_logging @@ -18,14 +17,13 @@ def check_llava_install(): - """check llava install.""" try: from llava.model.multimodal_encoder.clip_encoder import \ InternVisionModel # noqa: F401 except ImportError: raise ImportError( 'To use LlavaVLModel, please install llava by ' - 'pip 
install "git+https://github.com/OpenGVLab/InternVL#subdirectory=internvl_chat_llava" --no-deps' # noqa: E501 + '`pip install git+https://github.com/OpenGVLab/InternVL#subdirectory=internvl_chat_llava --no-deps`' # noqa: E501 ) @@ -65,7 +63,7 @@ def init_empty_vit(): @VISION_MODELS.register_module() -class InternVLLlavaVisionModel(VisonModel): +class InternVLLlavaVisionModel(LlavaVisionModel): """Llava visual model.""" @classmethod @@ -78,9 +76,11 @@ def match(cls, config: AutoConfig): return True return False + def build_preprocessor(self): + return super().build_preprocessor() + def build_model(self): """build model & load weights.""" - # check llava install check_llava_install() # currently, only support llava llama from llava.model.language_model.llava_llama import ( # noqa @@ -98,13 +98,10 @@ def build_model(self): } # disable vision part quantization model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) - if not self.with_llm: - del model.lm_head - del model.model.embed_tokens - del model.model.layers - del model.model.norm - else: - self.vl_model = model + del model.lm_head + del model.model.embed_tokens + del model.model.layers + del model.model.norm with init_empty_vit(): vision_tower = model.get_vision_tower() @@ -129,7 +126,7 @@ def build_model(self): model=model, max_memory=self.max_memory, checkpoint=self.model_path, - device_map='auto' if not self.with_llm else {'': 'cpu'}, + device_map='auto', no_split_module_classes=['InternVisionEncoderLayer'], dtype=torch.half) @@ -137,42 +134,43 @@ def build_model(self): self.vision_tower = model.model.vision_tower.eval() self.mm_projector = model.model.mm_projector.eval() - def encode_images(self, images: torch.Tensor) -> torch.Tensor: - """encode images.""" - image_features = self.vision_tower(images) - image_features = self.mm_projector(image_features) - return image_features - - def preprocess( - self, - images: List[Image]) -> Union[torch.Tensor, List[torch.Tensor]]: - """preprocess.""" - # TODO: gpu processor - from llava.mm_utils import process_images - images = [x.convert('RGB') for x in images] - image_processor = self.vision_tower.image_processor - outputs = process_images(images, image_processor, self.config) - return outputs + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refer to `super().preprocess() for spec.""" + return super().preprocess(messages) @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" - images = self.preprocess(images) - if isinstance(images, list): - images = [ - x.to(self.vision_tower.device, dtype=torch.float16) - for x in images + def forward(self, + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. 
+ + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + inputs = [x['content'] for x in messages if x['role'] == 'preprocess'] + inputs = inputs[0] + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] ] - else: - images = images.to(self.vision_tower.device, dtype=torch.float16) - - if type(images) is list or images.ndim == 5: - concat_images = torch.cat([image for image in images], dim=0) - image_features = self.encode_images(concat_images) - split_sizes = [image.shape[0] for image in images] - image_features = torch.split(image_features, split_sizes, dim=0) - image_features = [x.flatten(0, 1) for x in image_features] - else: - image_features = self.encode_images(images) - image_features = [x for x in image_features] - return image_features + split_sizes = [x.shape[0] for x in pixel_values] + pixel_values = torch.cat(pixel_values, dim=0) + pixel_values = pixel_values.to(device=self.vision_tower.device, + dtype=torch.float16) + logger.info(f'vision forward shape: {pixel_values.shape}') + if pixel_values.ndim == 5: + feats = self.encode_images(pixel_values) + feats = torch.split(feats, split_sizes, dim=0) + feats = [x.flatten(0, 1) for x in feats] + else: + feats = self.encode_images(pixel_values) + feats = [x for x in feats] + outputs.extend(feats) + messages.append(dict(role='forward', content=outputs)) + return messages diff --git a/lmdeploy/vl/model/llava.py b/lmdeploy/vl/model/llava.py index 0b18f460cd..4ef62a4bb6 100644 --- a/lmdeploy/vl/model/llava.py +++ b/lmdeploy/vl/model/llava.py @@ -1,16 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. -# Modified from -# https://github.com/haotian-liu/LLaVA.git +# Modified from https://github.com/haotian-liu/LLaVA.git +import ast +import math import warnings from contextlib import contextmanager -from typing import List, Union +from typing import Dict, List import torch -from PIL.Image import Image +from PIL import Image from transformers import AutoConfig, AutoModelForCausalLM from lmdeploy.utils import get_logger -from lmdeploy.vl.model.base import VISION_MODELS, VisonModel +from lmdeploy.vl.model.llava_hf import VISION_MODELS, LlavaHfVisionModel from lmdeploy.vl.model.utils import disable_logging, rewrite_ctx logger = get_logger('lmdeploy') @@ -23,16 +24,14 @@ def check_llava_install(): except ImportError: raise ImportError( 'To use LlavaVLModel, please install llava by ' - 'pip install git+https://github.com/haotian-liu/LLaVA.git --no-deps' # noqa: E501 + '`pip install git+https://github.com/haotian-liu/LLaVA.git --no-deps`' # noqa: E501 ) def _clip_vision_tower_load_model(self, **kwargs): logger.info(f'CLIPVisionTower.load_model: {self.vision_tower_name}') - from transformers import (CLIPImageProcessor, CLIPVisionConfig, - CLIPVisionModel) - self.image_processor = CLIPImageProcessor.from_pretrained( - self.vision_tower_name) + from transformers import CLIPVisionConfig, CLIPVisionModel + config = CLIPVisionConfig.from_pretrained(self.vision_tower_name) self.vision_tower = CLIPVisionModel._from_config(config=config) self.vision_tower.requires_grad_(False) @@ -53,8 +52,166 @@ def init_llava_vision_tower(config): yield +def select_best_resolution(original_size, possible_resolutions): + """Selects the best resolution from a list of possible resolutions based on + the original size. 
+ + Args: + original_size (tuple): The original size of the image in the format (width, height). + possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + + Returns: + tuple: The best fit resolution in the format (width, height). + """ # noqa + original_width, original_height = original_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float('inf') + + for width, height in possible_resolutions: + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int( + original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, + original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or ( + effective_resolution == max_effective_resolution + and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + + return best_fit + + +def resize_and_pad_image(image, target_resolution): + """Resize and pad an image to a target resolution while maintaining aspect + ratio. + + Args: + image (PIL.Image.Image): The input image. + target_resolution (tuple): The target resolution (width, height) of the image. + + Returns: + PIL.Image.Image: The resized and padded image. + """ # noqa + original_width, original_height = image.size + target_width, target_height = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + # Resize the image + resized_image = image.resize((new_width, new_height)) + + new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0)) + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + new_image.paste(resized_image, (paste_x, paste_y)) + + return new_image + + +def divide_to_patches(image, patch_size): + """Divides an image into patches of a specified size. + + Args: + image (PIL.Image.Image): The input image. + patch_size (int): The size of each patch. + + Returns: + list: A list of PIL.Image.Image objects representing the patches. + """ + patches = [] + width, height = image.size + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + box = (j, i, j + patch_size, i + patch_size) + patch = image.crop(box) + patches.append(patch) + + return patches + + +def process_anyres_image(image, processor, grid_pinpoints): + """Process an image with variable resolutions. + + Args: + image (PIL.Image.Image): The input image to be processed. + processor: The image processor object. + grid_pinpoints (str): A string representation of a list of possible resolutions. + + Returns: + torch.Tensor: A tensor containing the processed image patches. 
+ """ # noqa + if type(grid_pinpoints) is list: + possible_resolutions = grid_pinpoints + else: + possible_resolutions = ast.literal_eval(grid_pinpoints) + best_resolution = select_best_resolution(image.size, possible_resolutions) + image_padded = resize_and_pad_image(image, best_resolution) + + patches = divide_to_patches(image_padded, processor.crop_size['height']) + + image_original_resize = image.resize( + (processor.size['shortest_edge'], processor.size['shortest_edge'])) + + image_patches = [image_original_resize] + patches + image_patches = [ + processor.preprocess(image_patch, + return_tensors='pt')['pixel_values'][0] + for image_patch in image_patches + ] + return torch.stack(image_patches, dim=0) + + +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + +def process_images(images, image_processor, model_cfg): + image_aspect_ratio = getattr(model_cfg, 'image_aspect_ratio', None) + new_images = [] + if image_aspect_ratio == 'pad': + for image in images: + image = expand2square( + image, tuple(int(x * 255) for x in image_processor.image_mean)) + image = image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + new_images.append(image) + elif image_aspect_ratio == 'anyres': + for image in images: + image = process_anyres_image(image, image_processor, + model_cfg.image_grid_pinpoints) + new_images.append(image) + else: + return image_processor(images, return_tensors='pt')['pixel_values'] + if all(x.shape == new_images[0].shape for x in new_images): + new_images = torch.stack(new_images, dim=0) + return new_images + + @VISION_MODELS.register_module() -class LlavaVisionModel(VisonModel): +class LlavaVisionModel(LlavaHfVisionModel): """Llava visual model.""" @classmethod @@ -73,9 +230,19 @@ def match(cls, config: AutoConfig): return True return False + def build_preprocessor(self): + from transformers import CLIPImageProcessor + self.image_processor = CLIPImageProcessor.from_pretrained( + self.hf_config.mm_vision_tower) + config = AutoConfig.from_pretrained(self.hf_config.mm_vision_tower) + image_size = config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.n_token_per_image = (image_size // patch_size)**2 + if self.hf_config.mm_vision_select_feature == 'cls_patch': + self.n_token_per_image += 1 + def build_model(self): """build model & load weights.""" - # check llava install check_llava_install() self.arch = self.hf_config.architectures[0] @@ -104,15 +271,11 @@ def build_model(self): model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) - if not self.with_llm: - # remove the LLM part from llava model. - # Instead, Load the LLM part to turbomind engine - del model.lm_head - del model.model.embed_tokens - del model.model.layers - del model.model.norm - else: - self.vl_model = model + # remove the LLM part from llava model. 
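
# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the helpers
# added above (`select_best_resolution`, `divide_to_patches`,
# `process_anyres_image`) implement llava's "anyres" tiling. The numbers
# below are example values only -- the real grid comes from
# `model_cfg.image_grid_pinpoints` and the crop size from the CLIP processor.
grid_pinpoints = [(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]
orig_w, orig_h = 800, 600
crop = 336


def fitness(res):
    """Rank a candidate grid the way select_best_resolution does:
    most effective pixels first, least wasted pixels second."""
    w, h = res
    scale = min(w / orig_w, h / orig_h)
    eff = min(int(orig_w * scale) * int(orig_h * scale), orig_w * orig_h)
    wasted = w * h - eff
    return (eff, -wasted)


best_w, best_h = max(grid_pinpoints, key=fitness)
num_tiles = (best_w // crop) * (best_h // crop)
# +1 for the thumbnail of the full image that process_anyres_image prepends
print(best_w, best_h, num_tiles + 1)  # 672 672 5
# ---------------------------------------------------------------------------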
+ del model.lm_head + del model.model.embed_tokens + del model.model.layers + del model.model.norm # init empty vision_tower, the embedding layer in CLIPVisionModel # can't init right under init_empty_weights @@ -129,7 +292,7 @@ def build_model(self): model=model, max_memory=self.max_memory, checkpoint=self.model_path, - device_map='auto' if not self.with_llm else {'': 'cpu'}, + device_map='auto', no_split_module_classes=['CLIPEncoderLayer'], dtype=torch.half) @@ -143,101 +306,113 @@ def encode_images(self, images: torch.Tensor) -> torch.Tensor: image_features = self.mm_projector(image_features) return image_features - def preprocess( - self, - images: List[Image]) -> Union[torch.Tensor, List[torch.Tensor]]: - """preprocess.""" - # TODO: gpu processor - from llava.mm_utils import process_images - images = [x.convert('RGB') for x in images] - image_processor = self.vision_tower.image_processor - outputs = process_images(images, image_processor, self.config) - return outputs + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refer to `super().preprocess() for spec.""" + images = self.collect_images(messages) + outputs = [] + for image, params in images: + image = image.convert('RGB') + pixel_values = process_images([image], self.image_processor, + self.config) + outputs.append( + dict(pixel_values=pixel_values, + image_size=image.size, + image_tokens=self.n_token_per_image, + image_token_id=0)) + messages.append(dict(role='preprocess', content=outputs)) + return messages @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" + def forward(self, + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. + + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ from llava.model.llava_arch import (get_anyres_image_grid_shape, unpad_image) - image_sizes = [x.size for x in images] - images = self.preprocess(images) - if isinstance(images, list): - images = [ - x.to(device=self.vision_tower.device, dtype=torch.float16) - for x in images + inputs = [x['content'] for x in messages if x['role'] == 'preprocess'] + inputs = inputs[0] + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + image_sizes = [ + x['image_size'] for x in inputs[idx:idx + max_batch_size] ] - else: - images = images.to(device=self.vision_tower.device, - dtype=torch.float16) - if type(images) is list or images.ndim == 5: - if type(images) is list: - images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images] - concat_images = torch.cat([image for image in images], dim=0) - image_features = self.encode_images(concat_images) - split_sizes = [image.shape[0] for image in images] - image_features = torch.split(image_features, split_sizes, dim=0) - mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', - 'flat') - image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', - 'square') - if mm_patch_merge_type == 'flat': - image_features = [x.flatten(0, 1) for x in image_features] - elif mm_patch_merge_type.startswith('spatial'): - new_image_features = [] - for image_idx, image_feature in enumerate(image_features): - if image_feature.shape[0] > 1: - base_image_feature = image_feature[0] - image_feature = image_feature[1:] - height = width = self.vision_tower.num_patches_per_side - assert height * width == 
base_image_feature.shape[0] - if image_aspect_ratio == 'anyres': - num_patch_width, num_patch_height = \ - get_anyres_image_grid_shape( - image_sizes[image_idx], - self.config.image_grid_pinpoints, - self.vision_tower.config.image_size) - image_feature = image_feature.view( - num_patch_height, num_patch_width, height, - width, -1) - else: - raise NotImplementedError - if 'unpad' in mm_patch_merge_type: - image_feature = image_feature.permute( - 4, 0, 2, 1, 3).contiguous() - image_feature = image_feature.flatten(1, - 2).flatten( - 2, 3) - image_feature = unpad_image( - image_feature, image_sizes[image_idx]) - image_feature = torch.cat(( - image_feature, - self.model.image_newline[:, None, None].expand( - *image_feature.shape[:-1], 1).to( - image_feature.device)), - dim=-1) - image_feature = image_feature.flatten(1, - 2).transpose( - 0, 1) + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] + ] + if pixel_values[0].ndim == 5: + split_sizes = [x.shape[1] for x in pixel_values] + pixel_values = torch.cat([x for x in pixel_values], dim=1) + logger.info(f'vision forward shape: {pixel_values.shape}') + pixel_values = pixel_values.squeeze(0) + pixel_values = pixel_values.to(device=self.vision_tower.device, + dtype=torch.float16) + feats = self.encode_images(pixel_values) + feats = torch.split(feats, split_sizes, dim=0) + mm_patch_merge_type = getattr(self.config, + 'mm_patch_merge_type', 'flat') + image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', + 'square') + if mm_patch_merge_type == 'flat': + outputs.expand([x.flatten(0, 1) for x in feats]) + elif mm_patch_merge_type.startswith('spatial'): + for img_idx, feat in enumerate(feats): + if feat.shape[0] > 1: + base_feat = feat[0] + feat = feat[1:] + height = self.vision_tower.num_patches_per_side + width = self.vision_tower.num_patches_per_side + assert height * width == base_feat.shape[0] + if image_aspect_ratio == 'anyres': + num_patch_width, num_patch_height = \ + get_anyres_image_grid_shape( + image_sizes[img_idx], + self.config.image_grid_pinpoints, + self.vision_tower.config.image_size) + feat = feat.view(num_patch_height, + num_patch_width, height, + width, -1) + else: + raise NotImplementedError + if 'unpad' in mm_patch_merge_type: + feat = feat.permute(4, 0, 2, 1, 3).contiguous() + feat = feat.flatten(1, 2).flatten(2, 3) + feat = unpad_image(feat, image_sizes[img_idx]) + feat = torch.cat( + (feat, self.model. 
+ image_newline[:, None, None].expand( + *feat.shape[:-1], 1).to(feat.device)), + dim=-1) + feat = feat.flatten(1, 2).transpose(0, 1) + else: + feat = feat.permute(0, 2, 1, 3, 4).contiguous() + feat = feat.flatten(0, 3) + feat = torch.cat((base_feat, feat), dim=0) else: - image_feature = image_feature.permute( - 0, 2, 1, 3, 4).contiguous() - image_feature = image_feature.flatten(0, 3) - image_feature = torch.cat( - (base_image_feature, image_feature), dim=0) - else: - image_feature = image_feature[0] - if 'unpad' in mm_patch_merge_type: - image_feature = torch.cat( - (image_feature, - self.model.image_newline[None].to( - image_feature.device)), - dim=0) - new_image_features.append(image_feature) - image_features = new_image_features + feat = feat[0] + if 'unpad' in mm_patch_merge_type: + feat = torch.cat( + (feat, self.model.image_newline[None].to( + feat.device)), + dim=0) + outputs.append(feat) + else: + raise ValueError('Unexpected mm_patch_merge_type: ' + f'{self.config.mm_patch_merge_type}') else: - raise ValueError('Unexpected mm_patch_merge_type: ' - f'{self.config.mm_patch_merge_type}') - else: - image_features = self.encode_images(images) - image_features = [x for x in image_features] - return image_features + pixel_values = torch.cat(pixel_values, dim=0) + pixel_values = pixel_values.to(device=self.vision_tower.device, + dtype=torch.float16) + logger.info(f'vision forward shape: {pixel_values.shape}') + feats = self.encode_images(pixel_values) + outputs.extend([x for x in feats]) + messages.append(dict(role='forward', content=outputs)) + return messages diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/vl/model/llava_hf.py index 31be101ae8..0bbeea78ec 100644 --- a/lmdeploy/vl/model/llava_hf.py +++ b/lmdeploy/vl/model/llava_hf.py @@ -1,15 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. 
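
# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the rewritten
# `forward()` implementations in this series walk the preprocessed inputs in
# slices of `max_batch_size`, so the vision tower never sees more images than
# that at once. The helper below shows the slicing pattern on plain lists;
# the names and values are made up for demonstration.
from typing import List


def batched(inputs: List, max_batch_size: int) -> List[List]:
    """Split `inputs` into consecutive chunks of at most `max_batch_size`."""
    return [
        inputs[idx:idx + max_batch_size]
        for idx in range(0, len(inputs), max_batch_size)
    ]


if __name__ == '__main__':
    print(batched(list(range(5)), 2))  # [[0, 1], [2, 3], [4]]
# ---------------------------------------------------------------------------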
- import warnings -from typing import List +from typing import Dict, List import torch -from PIL.Image import Image from transformers import AutoProcessor +from lmdeploy.utils import get_logger from lmdeploy.vl.model.base import VISION_MODELS, VisonModel from lmdeploy.vl.model.utils import disable_logging +logger = get_logger('lmdeploy') + @VISION_MODELS.register_module() class LlavaHfVisionModel(VisonModel): @@ -17,6 +18,19 @@ class LlavaHfVisionModel(VisonModel): _arch = 'LlavaForConditionalGeneration' + def build_preprocessor(self): + processor = AutoProcessor.from_pretrained(self.model_path, + trust_remote_code=True) + if hasattr(processor, 'tokenizer'): + del processor.tokenizer + processor.prtokenizer = None + self.processor = processor.image_processor + image_size = self.hf_config.vision_config.image_size + patch_size = self.hf_config.vision_config.patch_size + self.n_token_per_image = (image_size // patch_size)**2 + if self.hf_config.vision_feature_select_strategy == 'full': + self.n_token_per_image += 1 + def build_model(self): from accelerate import init_empty_weights, load_checkpoint_and_dispatch @@ -24,56 +38,113 @@ def build_model(self): warnings.simplefilter('ignore') from transformers import LlavaForConditionalGeneration model = LlavaForConditionalGeneration._from_config(self.hf_config) - if not self.with_llm: - del model.language_model - for key in ['language_model']: - setattr(model, key, None) - else: - self.vl_model = model + del model.language_model # fix for llava-hf/llava-interleave-qwen-7b-hf setattr(model.config, 'tie_word_embeddings', False) with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - max_memory=self.max_memory, - checkpoint=self.model_path, - device_map='auto' if not self.with_llm else {'': 'cpu'}, - no_split_module_classes=[ - 'CLIPEncoderLayer', 'SiglipEncoderLayer' - ], - dtype=torch.half) + load_checkpoint_and_dispatch(model=model, + max_memory=self.max_memory, + checkpoint=self.model_path, + device_map='auto', + no_split_module_classes=[ + 'CLIPEncoderLayer', + 'SiglipEncoderLayer' + ], + dtype=torch.half) model.eval() self.model = model - # processor - processor = AutoProcessor.from_pretrained(self.model_path, - trust_remote_code=True) - if hasattr(processor, 'tokenizer'): - del processor.tokenizer - processor.prtokenizer = None - self.processor = processor.image_processor + + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refers to `super.preprocess() for spec.""" + images = self.collect_images(messages) + outputs = [] + for image, params in images: + image = image.convert('RGB') + pixel_values = self.processor( + image, return_tensors='pt', + input_data_format='channels_last').pixel_values + outputs.append( + dict(pixel_values=pixel_values, + image_size=image.size, + image_tokens=self.n_token_per_image, + image_token_id=0)) + messages.append(dict(role='preprocess', content=outputs)) + return messages @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" - pixel_values = self.processor( - images, return_tensors='pt', - input_data_format='channels_last')['pixel_values'] - pixel_values = pixel_values.to(device=self.model.device, - dtype=self.model.dtype) - image_outputs = self.model.vision_tower.forward( - pixel_values, output_hidden_states=True) - image_features = image_outputs.hidden_states[ - self.hf_config.vision_feature_layer] - if self.hf_config.vision_feature_select_strategy == 'default': - image_features = image_features[:, 1:] - elif 
self.hf_config.vision_feature_select_strategy == 'full': - image_features = image_features - else: - raise ValueError( - 'Unexpected select feature strategy: ' - f'{self.hf_config.vision_feature_select_strategy}') - image_features = self.model.multi_modal_projector(image_features) - outputs = torch.split(image_features, 1, dim=0) - outputs = [x.squeeze() for x in outputs] - return outputs + def forward(self, + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. + + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + inputs = [x['content'] for x in messages if x['role'] == 'preprocess'] + inputs = inputs[0] + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] + ] + pixel_values = torch.cat(pixel_values, dim=0) + pixel_values = pixel_values.to(device=self.model.device, + dtype=self.model.dtype) + logger.info(f'vision forward shape: {pixel_values.shape}') + image_outputs = self.model.vision_tower.forward( + pixel_values, output_hidden_states=True) + image_features = image_outputs.hidden_states[ + self.hf_config.vision_feature_layer] + if self.hf_config.vision_feature_select_strategy == 'default': + image_features = image_features[:, 1:] + elif self.hf_config.vision_feature_select_strategy == 'full': + image_features = image_features + else: + raise ValueError( + 'Unexpected select feature strategy: ' + f'{self.hf_config.vision_feature_select_strategy}') + image_features = self.model.multi_modal_projector(image_features) + image_features = torch.split(image_features, 1, dim=0) + outputs.extend([x.squeeze() for x in image_features]) + messages.append(dict(role='forward', content=outputs)) + return messages + + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + IMAGE_TOKEN = '' + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) + continue + elif message['role'] in ['images', 'preprocess', 'forward']: + continue + n_images = len( + [1 for x in message['content'] if x['type'] == 'image']) + content = [ + item['text'] for item in message['content'] + if item['type'] == 'text' + ] + prompt = (IMAGE_TOKEN + '\n') * n_images + content[0] + prompt_messages.append(dict(role='user', content=prompt)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) + + def to_turbomind(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/llava_next.py b/lmdeploy/vl/model/llava_next.py index 9223ebea4f..e58d1ff46c 100644 --- a/lmdeploy/vl/model/llava_next.py +++ b/lmdeploy/vl/model/llava_next.py @@ -1,46 +1,47 @@ # Copyright (c) OpenMMLab. All rights reserved. 
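
# ---------------------------------------------------------------------------
# Editor's note (illustrative arithmetic, not part of the patch): the
# `n_token_per_image` fields computed in the new preprocessors follow the
# pattern below. The 336/14 figures are the usual CLIP ViT-L/14-336 values
# and are assumptions here, not read from any config.
image_size = 336  # e.g. hf_config.vision_config.image_size
patch_size = 14   # e.g. hf_config.vision_config.patch_size

n_token_per_image = (image_size // patch_size) ** 2  # 24 * 24 = 576
# 'full' keeps the CLS token as well; 'default' drops it (features[:, 1:])
n_token_full = n_token_per_image + 1                 # 577
print(n_token_per_image, n_token_full)
# ---------------------------------------------------------------------------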
- +import itertools import warnings -from typing import List +from typing import Dict, List import torch -from PIL.Image import Image -from transformers import AutoProcessor -from lmdeploy.vl.model.base import VISION_MODELS, VisonModel +from lmdeploy.utils import get_logger +from lmdeploy.vl.model.llava_hf import VISION_MODELS, LlavaHfVisionModel from lmdeploy.vl.model.utils import disable_logging +logger = get_logger('lmdeploy') + @VISION_MODELS.register_module() -class LlavaNextVisionModel(VisonModel): +class LlavaNextVisionModel(LlavaHfVisionModel): """Llava hf vision model.""" _arch = 'LlavaNextForConditionalGeneration' - def build_model(self): - from accelerate import init_empty_weights, load_checkpoint_and_dispatch - from accelerate.utils import get_balanced_memory, infer_auto_device_map - + def build_preprocessor(self): + super().build_preprocessor() + # build the model with empty weights. The model will be used in + # `preprocess` to get the image token number + from accelerate import init_empty_weights with init_empty_weights(), warnings.catch_warnings(): warnings.simplefilter('ignore') from transformers import LlavaNextForConditionalGeneration - model = LlavaNextForConditionalGeneration._from_config( + self.model = LlavaNextForConditionalGeneration._from_config( self.hf_config) - if not self.with_llm: - del model.language_model - for key in ['language_model']: - setattr(model, key, None) - else: - self.vl_model = model + del self.model.language_model + + def build_model(self): + from accelerate import load_checkpoint_and_dispatch + from accelerate.utils import get_balanced_memory, infer_auto_device_map no_split_module_classes = ['CLIPEncoderLayer'] max_memory = get_balanced_memory( - model, + self.model, max_memory=self.max_memory, dtype=torch.half, no_split_module_classes=no_split_module_classes) device_map = infer_auto_device_map( - model, + self.model, no_split_module_classes=no_split_module_classes, max_memory=max_memory, dtype=torch.half) @@ -55,75 +56,128 @@ def build_model(self): with disable_logging(): load_checkpoint_and_dispatch( - model=model, + model=self.model, checkpoint=self.model_path, - device_map=device_map if not self.with_llm else {'': 'cpu'}, + device_map=device_map, no_split_module_classes=no_split_module_classes, dtype=torch.half) - model.eval() - self.model = model - # processor - processor = AutoProcessor.from_pretrained(self.model_path, - trust_remote_code=True) - if hasattr(processor, 'tokenizer'): - del processor.tokenizer - processor.prtokenizer = None - self.processor = processor.image_processor + self.model.eval() - @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refers to the spec of `super.preprocess()""" from transformers.models.llava_next.modeling_llava_next import \ image_size_to_num_patches - """forward.""" - processed_inputs = self.processor(images, - return_tensors='pt', - input_data_format='channels_last') - pixel_values = processed_inputs['pixel_values'].to( - device=self.model.device, dtype=self.model.dtype) - image_sizes = processed_inputs['image_sizes'].to( - device=self.model.device, dtype=self.model.dtype) - # ! 
infer image_num_patches from image_sizes - image_num_patches = [ - image_size_to_num_patches( - image_size=imsize, - grid_pinpoints=self.hf_config.image_grid_pinpoints, - patch_size=self.hf_config.vision_config.image_size, - ) for imsize in image_sizes - ] - # figure out if pixel_values is concatenated or stacked - if pixel_values.dim() == 5: - # stacking when input is - # (batch_size, num_patches, num_channels, height, width) - _pixel_values_list = [ - pix_val[:num_patch] - for pix_val, num_patch in zip(pixel_values, image_num_patches) + images = self.collect_images(messages) + outputs = [] + for image, params in images: + image = image.convert('RGB') + result = self.processor(image, + return_tensors='pt', + input_data_format='channels_last') + # ! infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.hf_config.image_grid_pinpoints, + patch_size=self.hf_config.vision_config.image_size, + ) for imsize in result['image_sizes'] ] - pixel_values = torch.cat(_pixel_values_list, dim=0) - elif pixel_values.dim() != 4: - # otherwise has to be stacked from list of - # (num_patches, num_channels, height, width) - raise ValueError(f'pixel_values of shape {pixel_values.shape}, ' - 'expect to be of 4 or 5 dimensions') - image_outputs = self.model.vision_tower.forward( - pixel_values, output_hidden_states=True) - image_features = image_outputs.hidden_states[ - self.hf_config.vision_feature_layer] - if self.hf_config.vision_feature_select_strategy == 'default': - image_features = image_features[:, 1:] - elif self.hf_config.vision_feature_select_strategy == 'full': - image_features = image_features - else: - raise ValueError( - 'Unexpected select feature strategy: ' - f'{self.hf_config.vision_feature_select_strategy}') - image_features = self.model.multi_modal_projector(image_features) - image_features = torch.split(image_features, image_num_patches, dim=0) - image_features, feature_lens = self.model.pack_image_features( - image_features, - image_sizes, - image_newline=self.model.image_newline, - ) - outputs = torch.split(image_features, - feature_lens.cpu().numpy().tolist(), - dim=0) - return outputs + + hidden_size = self.hf_config.text_config.hidden_size + fake_image_features = torch.zeros( + [image_num_patches[0], self.n_token_per_image, hidden_size]) + image_sizes = result['image_sizes'] + image_newline = torch.randn(self.hf_config.text_config.hidden_size) + strategy = self.hf_config.vision_feature_select_strategy + _, image_tokens = self.model.pack_image_features( + [fake_image_features], + image_sizes, + vision_feature_select_strategy=strategy, + image_newline=image_newline) + result.update( + dict(image_size=image.size, + image_patches=image_num_patches, + image_tokens=image_tokens, + image_token_id=0)) + outputs.append(result) + messages.append(dict(role='preprocess', content=outputs)) + return messages + + @torch.no_grad() + def forward(self, + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. 
+ + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + inputs = [x['content'] for x in messages if x['role'] == 'preprocess'] + inputs = inputs[0] + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + pixel_values = [ + x['pixel_values'].to(device=self.model.device, + dtype=self.model.dtype) + for x in inputs[idx:idx + max_batch_size] + ] + pixel_values = torch.cat(pixel_values, dim=0) + image_sizes = [ + x['image_sizes'].to(device=self.model.device, + dtype=self.model.dtype) + for x in inputs[idx:idx + max_batch_size] + ] + image_sizes = torch.cat(image_sizes, dim=0) + image_num_patches = [ + x['num_patch'] for x in inputs[idx:idx + max_batch_size] + ] + image_num_patches = list(itertools.chain(*image_num_patches)) + # figure out if pixel_values is concatenated or stacked + if pixel_values.dim() == 5: + # stacking when input is + # (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [ + pix_val[:num_patch] for pix_val, num_patch in zip( + pixel_values, image_num_patches) + ] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of + # (num_patches, num_channels, height, width) + raise ValueError( + f'pixel_values of shape {pixel_values.shape}, ' + 'expect to be of 4 or 5 dimensions') + logger.info(f'vision forward shape: {pixel_values.shape}') + image_outputs = self.model.vision_tower.forward( + pixel_values, output_hidden_states=True) + image_features = image_outputs.hidden_states[ + self.hf_config.vision_feature_layer] + strategy = self.hf_config.vision_feature_select_strategy + if strategy == 'default': + image_features = image_features[:, 1:] + elif strategy == 'full': + image_features = image_features + else: + raise ValueError('Unexpected select feature strategy: ' + f'{strategy}') + image_features = self.model.multi_modal_projector(image_features) + image_features = torch.split(image_features, + image_num_patches, + dim=0) + image_features, feature_lens = self.model.pack_image_features( + image_features, + image_sizes, + vision_feature_select_strategy=strategy, + image_newline=self.model.image_newline, + ) + image_features = torch.split(image_features, + feature_lens.cpu().numpy().tolist(), + dim=0) + outputs.extend(image_features) + messages.append(dict(role='forward', content=outputs)) + return messages diff --git a/lmdeploy/vl/model/mini_gemeni.py b/lmdeploy/vl/model/mini_gemeni.py index 0565daeba5..3c6fc7d5cd 100644 --- a/lmdeploy/vl/model/mini_gemeni.py +++ b/lmdeploy/vl/model/mini_gemeni.py @@ -3,16 +3,18 @@ import os.path as osp import warnings from contextlib import contextmanager -from typing import List +from typing import Dict, List import torch -from PIL.Image import Image +from lmdeploy.utils import get_logger from lmdeploy.vl.model.base import VISION_MODELS, VisonModel from lmdeploy.vl.model.utils import (add_device_hook, disable_logging, disable_transformers_logging, hack_import_with) +logger = get_logger('lmdeploy') + def check_mini_gemini_install(): """check mini gemini install.""" @@ -22,8 +24,8 @@ def check_mini_gemini_install(): except ImportError: raise ImportError( 'To use MiniGeminiVisionModel, please install minigemini by ' - 'pip install git+https://github.com/dvlab-research/MGM.git' - ' --no-deps') + '`pip install git+https://github.com/dvlab-research/MGM.git' + ' --no-deps`') def 
_build_vision_tower(vision_tower_cfg, **kwargs): @@ -169,6 +171,12 @@ class MiniGeminiVisionModel(VisonModel): _arch = ['MiniGeminiLlamaForCausalLM', 'MGMLlamaForCausalLM'] + def build_preprocessor(self): + # pytorch engine will not support mini-gemini. Therefore, in order to + # reuse the previous code as much as possible, we do not extract image + # preprocessor from `build_model` function. + pass + def build_model(self): check_mini_gemini_install() # empty init @@ -193,13 +201,10 @@ def build_model(self): vision_tower.load_model() vision_tower_aux = model.get_vision_tower_aux() vision_tower_aux.load_model() - if not self.with_llm: - del model.lm_head - del model.model.embed_tokens - del model.model.layers - del model.model.norm - else: - self.vl_model = model + del model.lm_head + del model.model.embed_tokens + del model.model.layers + del model.model.norm from accelerate.utils import get_balanced_memory, infer_auto_device_map max_memory = get_balanced_memory( @@ -225,7 +230,7 @@ def build_model(self): load_checkpoint_and_dispatch( model=model, checkpoint=self.model_path, - device_map=device_map if not self.with_llm else {'': 'cpu'}, + device_map=device_map, no_split_module_classes=['CLIPEncoderLayer', 'ConvNeXtStage'], dtype=torch.half) @@ -246,11 +251,35 @@ def build_model(self): self.image_processor = image_processor self.process_images = process_images + def preprocess(self, messages: List[Dict]) -> List[Dict]: + return messages + @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" - outputs = [x.convert('RGB') for x in images] - image_tensor = self.process_images(outputs, self.image_processor, + def forward(self, + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. 
+ + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + images = [] + for message in messages: + if not isinstance(message['content'], List): + continue + _ = [ + x['image'] for x in message['content'] if x['type'] == 'image' + ] + assert len(_) == 1, f'MiniGeminiLlama accepts ONE input ' \ + f'image, but got {len(images)} images' + images.extend(_) + + image_tensor = self.process_images(images, self.image_processor, self.model.config) image_grid = getattr(self.model.config, 'image_grid', 1) if hasattr(self.model.config, 'image_size_aux'): @@ -301,15 +330,47 @@ def forward(self, images: List[Image]) -> List[torch.Tensor]: image.to(self.model.device, dtype=torch.float16) for image in image_tensor_aux ] + logger.info(f'vision forward bs: {len(image_tensor)}') else: image_tensor = image_tensor.to(self.model.device, dtype=torch.float16) image_tensor_aux = image_tensor_aux.to(self.model.device, dtype=torch.float16) - + logger.info(f'vision forward shape: {image_tensor.shape}') images_embeds = self.model.encode_images(image_tensor, image_tensor_aux) outputs = torch.split(images_embeds, 1, dim=0) outputs = [x.squeeze() for x in outputs] - return outputs + messages.append(dict(role='forward', cotent=outputs)) + + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + IMAGE_TOKEN = '' + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) + continue + elif message['role'] in ['images', 'preprocess', 'forward']: + continue + n_images = len( + [1 for x in message['content'] if x['type'] == 'image']) + content = [ + item['text'] for item in message['content'] + if item['type'] == 'text' + ] + prompt = (IMAGE_TOKEN + '\n') * n_images + content[0] + prompt_messages.append(dict(role='user', content=prompt)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + assert 0, 'cogvlm is not supported by pytorch engine' + + def to_turbomind(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/minicpmv.py b/lmdeploy/vl/model/minicpmv.py index 4e30190c1d..7986ff7954 100644 --- a/lmdeploy/vl/model/minicpmv.py +++ b/lmdeploy/vl/model/minicpmv.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +import itertools import warnings from typing import Dict, List import torch from PIL.Image import Image -from transformers import AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM from lmdeploy.utils import get_logger from lmdeploy.vl.model.base import VISION_MODELS, VisonModel @@ -19,6 +20,29 @@ class MiniCPMVModel(VisonModel): _arch = 'MiniCPMV' + def __init__(self, + model_path: str, + max_memory: Dict[int, int] = None, + hf_config: AutoConfig = None, + backend: str = ''): + super().__init__(model_path, max_memory, hf_config, backend) + if not hasattr(self.hf_config, 'version'): + raise ValueError('Can not find `version` in config.json. 
' + 'Please checkout the latest model') + version = str(self.hf_config.version) + if version not in ['2.5', '2.6']: + raise ValueError( + f'Only support v2.5 and v2.6, but got version {version}') + self.version = version + + def build_preprocessor(self): + from transformers import AutoProcessor + self.processor = AutoProcessor.from_pretrained(self.model_path, + trust_remote_code=True) + self.image_processor = self.processor.image_processor + self._preprocess_func = (self._preprocess_v2_5 if self.version == '2.5' + else self._preprocess_v2_6) + def build_model(self): """build model & load weights.""" from accelerate import init_empty_weights @@ -29,67 +53,29 @@ def build_model(self): config.quantization_config = {} # disable vision part quantization model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) - if not self.with_llm: - del model.llm - else: - self.vl_model = model + del model.llm from accelerate import load_checkpoint_and_dispatch with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - max_memory=self.max_memory, - checkpoint=self.model_path, - device_map='auto' if not self.with_llm else {'': 'cpu'}, - no_split_module_classes=[ - 'Idefics2EncoderLayer', 'Resampler', 'SiglipEncoderLayer' - ], - dtype=torch.half) + load_checkpoint_and_dispatch(model=model, + max_memory=self.max_memory, + checkpoint=self.model_path, + device_map='auto', + no_split_module_classes=[ + 'Idefics2EncoderLayer', + 'Resampler', 'SiglipEncoderLayer' + ], + dtype=torch.half) model.resampler.pos_embed = model.resampler.pos_embed.to( device=model.resampler.proj.device) self.config = config self.model = model.eval() - self.init_forward_func() - - def init_forward_func(self): - if not hasattr(self.config, 'version'): - msg = 'LMDeploy only support `MiniCPM-V-2_6` and '\ - '`MiniCPM-Llama3-V-2_5`.\nCan not find `version` in config, ' \ - 'please consider update the huggingface model.' 
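
# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): in the MiniCPM-V
# preprocessor added in this series, every key of an image item other than
# 'type'/'image' is forwarded as a preprocessing parameter, e.g.
# `max_slice_nums` and `use_image_id` for the 2.6 pipeline. The message below
# shows how a caller could pass them; the image value is a placeholder, not a
# real PIL image.
messages = [
    dict(role='user',
         content=[
             dict(type='text', text='describe this picture'),
             dict(type='image',
                  image='<PIL.Image placeholder>',
                  max_slice_nums=9,     # overrides the processor default
                  use_image_id=False),
         ])
]

item = messages[0]['content'][1]
params = {k: v for k, v in item.items() if k not in {'type', 'image'}}
print(params)  # {'max_slice_nums': 9, 'use_image_id': False}
# ---------------------------------------------------------------------------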
- logger.warn(msg) - - self._forward_func = self._forward_v2_5 - if hasattr(self.config, 'version'): - version = str(self.config.version) - if version == '2.6': - self._forward_func = self._forward_v2_6 - - if self._forward_func == self._forward_v2_5: - logger.info('using _forward_v2_5') - if not hasattr(self.model, 'slice_image'): - # adapt new code commit 287e3f85 (MiniCPM-Llama3-V-2_5) - from transformers import AutoProcessor - processor = AutoProcessor.from_pretrained( - self.model_path, trust_remote_code=True) - self.model.slice_image = processor.image_processor.slice_image - - def _reshape_by_patch(x): - out = x.cpu().numpy() - out = processor.image_processor.reshape_by_patch(out) - return torch.from_numpy(out).to(device=x.device) - - self.model.reshape_by_patch = _reshape_by_patch - - if self._forward_func == self._forward_v2_6: - logger.info('using _forward_v2_6') - from transformers import AutoProcessor - self.model.processor = AutoProcessor.from_pretrained( - self.model_path, trust_remote_code=True) def _get_slice_image(self, image: Image): slice_images = [] - source_image, patches, best_grid = self.model.slice_image(image) + source_image, patches, best_grid = self.image_processor.slice_image( + image) slice_images.append(source_image) if len(patches) > 0: for i in range(len(patches)): @@ -103,114 +89,198 @@ def _reshape_by_patch(self, slice_images): for slice_image in slice_images: slice_image = self.model.transform(slice_image) H, W = slice_image.shape[1:] - patches.append(self.model.reshape_by_patch(slice_image)) + slice_image = slice_image.numpy() + slice_image = self.image_processor.reshape_by_patch(slice_image) + slice_image = torch.from_numpy(slice_image) + patches.append(slice_image) H //= self.config.patch_size W //= self.config.patch_size tgt_sizes.append(torch.Tensor([H, W]).type(torch.int32)) return patches, tgt_sizes - def _forward_v2_5(self, images: List[Image], params: List[Dict] = None): - """forward for MiniCPM-Llama3-V-2_5.""" - patches = [] - tgt_sizes = [] - best_grids = [] - num_patches = [] - for image in images: - slice_images, best_grid = self._get_slice_image(image) - _patches, _tgt_sizes = self._reshape_by_patch(slice_images) - num_patches.append(len(_patches)) - patches.extend(_patches) - tgt_sizes.extend(_tgt_sizes) - best_grids.append(best_grid) - - patches = [ - x.to(dtype=torch.half, device=self.model.device) for x in patches - ] - patches = [x.flatten(end_dim=1).permute(1, 0) for x in patches] - tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) - max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) - all_pixel_values = torch.nn.utils.rnn.pad_sequence(patches, - batch_first=True, - padding_value=0.0) - B, L, _ = all_pixel_values.shape - all_pixel_values = all_pixel_values.permute(0, 2, - 1).reshape(B, 3, -1, L) - patch_attn_mask = torch.zeros((B, 1, max_patches), - dtype=torch.bool, - device=self.model.device) - for i in range(B): - patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True - vision_embedding = self.model.vpm( - all_pixel_values.type(torch.half), - patch_attention_mask=patch_attn_mask).last_hidden_state - vision_embedding = self.model.resampler(vision_embedding, tgt_sizes) - vision_embedding = torch.split(vision_embedding, num_patches, 0) + def _preprocess_v2_5(self, image: Image, params: Dict = None) -> Dict: + """image preprocessing for MiniCPM-Llama3-V-2_5.""" + slice_images, best_grid = self._get_slice_image(image) + # pixel_values, tgt_sizes are list of torch tensors + pixel_values, tgt_sizes = 
self._reshape_by_patch(slice_images) + num_patches = len(pixel_values) + return dict( + pixel_values=pixel_values, # a list + tgt_sizes=tgt_sizes, # a list + best_grid=best_grid, + num_patches=num_patches, + image_tokens=1, + image_token_id=0) + + def _preprocess_v2_6(self, image: Image, params: Dict = None) -> Dict: + """image preprocessing for MiniCPM-V-2_6.""" + max_slice_nums = self.image_processor.max_slice_nums + use_image_id = self.image_processor.use_image_id + max_slice_nums = params.get('max_slice_nums', max_slice_nums) + use_image_id = params.get('use_image_id', use_image_id) + outputs = self.image_processor(image, max_slice_nums=max_slice_nums) + pixel_values = outputs['pixel_values'][0] + num_patches = len(pixel_values) + pixel_values = [torch.as_tensor(x) for x in pixel_values] + tgt_sizes = outputs['tgt_sizes'][0] + tgt_sizes = [torch.as_tensor(x) for x in tgt_sizes] + grid = self.image_processor.get_sliced_grid( + image_size=image.size, max_slice_nums=max_slice_nums) + return dict( + pixel_values=pixel_values, # a list + tgt_sizes=tgt_sizes, # a list + best_grid=grid, + num_patches=num_patches, + image_tokens=1, + image_token_id=0, + use_image_id=use_image_id) + + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refer to `super().preprocess() for spec.""" outputs = [] - for embeddings, grid in zip(vision_embedding, best_grids): - embeddings = embeddings.cpu() # n x d x h - outputs.append(dict(embeddings=embeddings, grid=grid)) + for i, message in enumerate(messages): + if message['role'] != 'user' or not isinstance( + message['content'], List): + continue + for item in message['content']: + if item['type'] == 'image': + image = item['image'].convert('RGB') + params = { + k: v + for k, v in item.items() if k not in {'type', 'image'} + } + result = self._preprocess_func(image, params) + outputs.append(result) + messages[i].update(dict(preprocess=outputs)) + return messages - return outputs + @torch.no_grad() + def forward(self, + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. 
- def _forward_v2_6(self, images: List[Image], params: List[Dict] = None): - """forward for MiniCPM-V-2_6.""" - patches = [] - tgt_sizes = [] - best_grids = [] - num_patches = [] - max_slice_nums = self.model.processor.image_processor.max_slice_nums - use_image_id = self.model.processor.image_processor.use_image_id - for image, param in zip(images, params): - max_slice_nums = param.get('max_slice_nums', max_slice_nums) - use_image_id = param.get('use_image_id', use_image_id) - outputs = self.model.processor.image_processor( - image, max_slice_nums=max_slice_nums) - patches.extend(outputs['pixel_values'][0]) - num_patches.append(len(outputs['pixel_values'][0])) - tgt_sizes.extend(outputs['tgt_sizes'][0]) - grid = self.model.processor.image_processor.get_sliced_grid( - image_size=image.size, max_slice_nums=max_slice_nums) - best_grids.append(grid) - - patches = [ - torch.as_tensor(x).to(dtype=torch.half, device=self.model.device) - for x in patches + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + # collect preprocess results into a list + inputs = [] + inputs = [ + x['preprocess'] for x in messages if 'preprocess' in x.keys() ] - patches = [x.flatten(end_dim=1).permute(1, 0) for x in patches] - tgt_sizes = [torch.as_tensor(x) for x in tgt_sizes] - tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) - max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) - all_pixel_values = torch.nn.utils.rnn.pad_sequence(patches, + # flatten the list + inputs = list(itertools.chain(*inputs)) + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + tgt_sizes = [ + x['tgt_sizes'] for x in inputs[idx:idx + max_batch_size] + ] + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] + ] + num_patches = [ + x['num_patches'] for x in inputs[idx:idx + max_batch_size] + ] + # flatten the list + tgt_sizes = list(itertools.chain(*tgt_sizes)) + pixel_values = list(itertools.chain(*pixel_values)) + pixel_values = [ + x.to(dtype=torch.half, device=self.model.device) + for x in pixel_values + ] + pixel_values = [ + x.flatten(end_dim=1).permute(1, 0) for x in pixel_values + ] + pixel_values = torch.nn.utils.rnn.pad_sequence(pixel_values, batch_first=True, padding_value=0.0) - B, L, _ = all_pixel_values.shape - all_pixel_values = all_pixel_values.permute(0, 2, - 1).reshape(B, 3, -1, L) - patch_attn_mask = torch.zeros((B, 1, max_patches), - dtype=torch.bool, - device=self.model.device) - for i in range(B): - patch_attn_mask[i, 0, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True - vision_embedding = self.model.vpm( - all_pixel_values.type(torch.half), - patch_attention_mask=patch_attn_mask, - tgt_sizes=tgt_sizes).last_hidden_state - vision_embedding = self.model.resampler(vision_embedding, tgt_sizes) - vision_embedding = torch.split(vision_embedding, num_patches, 0) - outputs = [] - for embeddings, grid in zip(vision_embedding, best_grids): - embeddings = embeddings.cpu() # n x d x h - outputs.append( - dict(embeddings=embeddings, - grid=grid, - use_image_id=use_image_id)) + B, L, _ = pixel_values.shape + pixel_values = pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L) + tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) + max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) + patch_attn_mask = torch.zeros((B, 1, max_patches), + dtype=torch.bool, + device=self.model.device) + logger.info(f'vision forward shape: 
{pixel_values.shape}') + if self.version == '2.5': + for j in range(B): + patch_attn_mask[j, :tgt_sizes[j][0] * + tgt_sizes[j][1]] = True + embeddings = self.model.vpm( + pixel_values.type(torch.half), + patch_attention_mask=patch_attn_mask).last_hidden_state + else: + for j in range(B): + patch_attn_mask[j, 0, :tgt_sizes[j][0] * + tgt_sizes[j][1]] = True + embeddings = self.model.vpm( + pixel_values.type(torch.half), + patch_attention_mask=patch_attn_mask, + tgt_sizes=tgt_sizes).last_hidden_state - return outputs + embeddings = self.model.resampler(embeddings, tgt_sizes) + embeddings = torch.split(embeddings, num_patches, 0) + for embedding in embeddings: + embedding = embedding.split(1, dim=0) + outputs.extend([x.squeeze() for x in embedding]) + messages.append(dict(role='forward', content=outputs)) + return messages - @torch.no_grad() - def forward(self, - images: List[Image], - params: List[Dict] = None) -> List[torch.Tensor]: - """forward.""" - images = [x.convert('RGB') for x in images] - return self._forward_func(images, params) + def proc_messages(self, messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + IMAGE_TOKEN = '' + idx = 0 + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) + continue + if 'preprocess' not in message.keys(): + continue + prompts = [] + for x in message['preprocess']: + prompt = f'{IMAGE_TOKEN}' + if x.get('use_image_id', False): + prompt = f'{idx}' + prompt + idx += 1 + grid = x['best_grid'] + if grid is not None: + if self.version == '2.5': + slice = '\n'.join( + [f'{IMAGE_TOKEN}' * grid[0]] * + grid[1]) + prompt = f'{prompt}{slice}\n' + elif self.version == '2.6': + slice = '\n'.join( + [f'{IMAGE_TOKEN}' * grid[0]] * + grid[1]) + prompt = prompt + slice + prompt += '\n' + else: + prompt = (prompt + + '\n' if self.version == '2.6' else prompt) + prompts.append(prompt) + content = [ + x['text'] for x in message['content'] if x['type'] == 'text' + ] + prompt = ''.join(prompts) + content[0] + prompt_messages.append(dict(role='user', content=prompt)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) + + def to_turbomind(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/mllama.py b/lmdeploy/vl/model/mllama.py index db0a0e9cbf..13a6b3a480 100644 --- a/lmdeploy/vl/model/mllama.py +++ b/lmdeploy/vl/model/mllama.py @@ -2,198 +2,7 @@ from typing import Dict, List -import torch -import torch.nn.functional as F -from PIL.Image import Image -from transformers.modeling_outputs import BaseModelOutput -from transformers.models.mllama.modeling_mllama import MllamaPreTrainedModel - from lmdeploy.vl.model.base import VISION_MODELS, VisonModel -from lmdeploy.vl.model.utils import disable_logging - - -class MllamaVisionModelPatch(MllamaPreTrainedModel): - - def apply_class_embedding(self, - hidden_state: torch.Tensor) -> torch.Tensor: - batch_size, _, hidden_size = hidden_state.shape - class_embedding = self.class_embedding.expand(batch_size, 1, - 
hidden_size) - class_embedding = class_embedding.to(hidden_state.device) - hidden_state = torch.cat([class_embedding, hidden_state], dim=1) - return hidden_state - - def forward( - self, - pixel_values: torch.Tensor, - aspect_ratio_ids: torch.Tensor, - aspect_ratio_mask: torch.Tensor, - output_attentions: bool = None, - output_hidden_states: bool = None, - return_dict: bool = None, - ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions # noqa - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict # noqa - - batch_size, num_concurrent_media, num_tiles, num_channels, height, width = pixel_values.shape # noqa - - pixel_values = pixel_values.reshape( - batch_size * num_concurrent_media * num_tiles, num_channels, - height, width) - aspect_ratio_ids = aspect_ratio_ids.reshape( - batch_size * num_concurrent_media, -1) - - # Patch embedding - patch_embeds = self.patch_embedding( - pixel_values.to(self.dtype).to(self.device)) - hidden_state = patch_embeds.flatten(2).transpose(1, 2) - - # Tile embeddings - _, num_patches, dim = hidden_state.shape - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, -1, dim) - hidden_state = self.pre_tile_positional_embedding( - hidden_state, aspect_ratio_ids) - - # Add cls token - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media * num_tiles, num_patches, dim) - hidden_state = self.apply_class_embedding(hidden_state) - num_patches += 1 - - # Position embeddings - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, num_patches, dim) - hidden_state = self.gated_positional_embedding(hidden_state, - aspect_ratio_ids) - - hidden_state = self.layernorm_pre(hidden_state) - - # Compute the number of tokens to pad - num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 - # Compute padding tuple for pad function - padding = ( - 0, 0, 0, num_padding_patches - ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) - # Pad the tensor - hidden_state = F.pad(hidden_state, padding, mode='constant', value=0) - slice_index = -num_padding_patches if num_padding_patches > 0 else None - - # Prepare attention mask - attention_mask = aspect_ratio_mask.reshape( - batch_size * num_concurrent_media, -1) - from transformers.models.mllama.modeling_mllama import \ - _prepare_aspect_ratio_attention_mask - attention_mask = _prepare_aspect_ratio_attention_mask( - aspect_ratio_mask=attention_mask, - num_patches=self.num_patches, - target_length=hidden_state.shape[2], - dtype=self.dtype, - ) - - # Apply encoder - hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, - dim) - output = self.transformer( - hidden_state, - attention_mask=attention_mask, - output_hidden_states=True, - output_attentions=output_attentions, - ) - hidden_state = output[0] - - hidden_state = self.layernorm_post(hidden_state) - - # Apply global encoder - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim) - hidden_state = self.post_tile_positional_embedding( - hidden_state, aspect_ratio_ids) - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, - num_tiles * (num_patches + num_padding_patches), dim) - global_output = self.global_transformer( - hidden_state, - attention_mask=attention_mask, - 
output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - ) - hidden_state = global_output[0] - - # Remove padding form hidden state - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim) - hidden_state = hidden_state[:, :, :slice_index] - hidden_state = hidden_state.reshape(batch_size, num_concurrent_media, - num_tiles, num_patches, dim) - - # Collect intermediate layer outputs from encoder output - all_intermediate_hidden_states = output[1] - # rewrite to sync device during accelerate pipeline parallel - device = hidden_state.device - all_intermediate_hidden_states = [ - s.to(device) for s in all_intermediate_hidden_states - ] - intermediate_hidden_states = torch.stack( - all_intermediate_hidden_states, dim=-1) - intermediate_hidden_states = intermediate_hidden_states[ - ..., self.intermediate_layers_indices] - - # Remove padding from intermediate hidden states - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size * num_concurrent_media, num_tiles, - num_patches + num_padding_patches, -1) - intermediate_hidden_states = intermediate_hidden_states[:, :, : - slice_index] - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size, num_concurrent_media, num_tiles, num_patches, -1) - - # Concatenate final hidden state and intermediate hidden states - hidden_state = torch.cat([hidden_state, intermediate_hidden_states], - dim=-1) - - if output_hidden_states: - hidden_states = tuple(all_intermediate_hidden_states) + tuple( - global_output[1]) - else: - hidden_states = None - - if output_attentions: - # global transformer in contrast to `self.transformer` doesn't - # always return hidden states so we might go index out-of-range - global_attn = tuple( - global_output[2]) if output_hidden_states else tuple( - global_output[1]) - attentions = tuple(output[2]) + global_attn - else: - attentions = None - - if not return_dict: - return tuple(v for v in [hidden_state, hidden_states, attentions] - if v is not None) - - return BaseModelOutput( - last_hidden_state=hidden_state, - hidden_states=hidden_states, - attentions=attentions, - ) - - -def check_transformers(): - """check qwen_vl_utils.""" - try: - from transformers import MllamaForConditionalGeneration # noqa: F401 - except ImportError: - raise ImportError( - 'please install latest transformers by ' - 'pip install git+https://github.com/huggingface/transformers.git') @VISION_MODELS.register_module() @@ -202,85 +11,50 @@ class MllamaVLModel(VisonModel): _arch = 'MllamaForConditionalGeneration' - def build_model(self): - check_transformers() - - from transformers.models.mllama.modeling_mllama import \ - MllamaVisionModel - MllamaVisionModel.forward = MllamaVisionModelPatch.forward - MllamaVisionModel.apply_class_embedding = MllamaVisionModelPatch.apply_class_embedding # noqa - from accelerate import init_empty_weights - with init_empty_weights(): - config = self.hf_config - config.quantization_config = {} # disable vision part quantization - # disable accelerate check_tied_parameters_in_config - config.tie_word_embeddings = False - from transformers import MllamaForConditionalGeneration - model = MllamaForConditionalGeneration._from_config(config) - if not self.with_llm: - del model.language_model - else: - self.vl_model = model - - from accelerate import load_checkpoint_and_dispatch - with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map='auto' if 
not self.with_llm else {'': 'cpu'}, - max_memory=self.max_memory, - no_split_module_classes=[ - 'MllamaPrecomputedPositionEmbedding', - 'MllamaPrecomputedAspectRatioEmbedding', - 'MllamaVisionEncoderLayer' - ], - dtype=config.torch_dtype) - - self.model = model.eval() - - # processor + def build_preprocessor(self): from transformers import AutoProcessor self.processor = AutoProcessor.from_pretrained(self.model_path) self.image_token_id = 128256 - @torch.no_grad() - def forward(self, - images: List[Image], - params: List[Dict] = None) -> List[torch.Tensor]: - """forward.""" - # only support image input - if params is not None: - assert len(images) == len( - params), 'different length of images and params' - else: - params = [{}] * len(images) - # resize images with abnormal shape - # TODO try catch image feature extraction in pipeline and - # throw error back to users - for i, image in enumerate(images): - size = image.size - if any([s < 3 for s in size]): - images[i] = image.resize([s * 3 for s in size]) - image_inputs = self.processor.image_processor(images=images, - return_tensors='pt') - pixel_values = image_inputs['pixel_values'].to( - self.model.vision_model.device) - pixel_values = pixel_values.type(self.model.vision_model.dtype) - aspect_ratio_ids = image_inputs['aspect_ratio_ids'].to( - self.model.vision_model.device) - aspect_ratio_mask = image_inputs['aspect_ratio_mask'].to( - self.model.vision_model.device) - vision_outputs = self.model.vision_model( - pixel_values=pixel_values, - aspect_ratio_ids=aspect_ratio_ids, - aspect_ratio_mask=aspect_ratio_mask, - output_hidden_states=False, - output_attentions=False, - return_dict=True) - cross_attention_states = vision_outputs[0] - cross_attention_states = self.model.multi_modal_projector( - cross_attention_states) - _, bsz, _, _, image_token_dim = tuple(cross_attention_states.shape) - cross_attention_states = cross_attention_states.view( - bsz, -1, image_token_dim).split([1] * len(images)) - return cross_attention_states + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refer to the spec of `super().preprocess`""" + images = self.collect_images(messages) + outputs = [] + for image, params in images: + image = image.convert('RGB') + results = self.processor.image_processor(images=image, + return_tensors='pt') + results.update(image_size=image.size, + image_tokens=1, + image_token_id=self.image_token_id) + outputs.append(results) + messages.append(dict(role='preprocess', content=outputs)) + return messages + + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + IMAGE_TOKEN = '<|image|>' + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) + continue + elif message['role'] in ['images', 'preprocess', 'forward']: + continue + n_images = len( + [1 for x in message['content'] if x['type'] == 'image']) + content = [ + item['text'] for item in message['content'] + if item['type'] == 'text' + ] + prompt = (IMAGE_TOKEN) * n_images + content[0] + prompt_messages.append(dict(role='user', content=prompt)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git 
a/lmdeploy/vl/model/molmo.py b/lmdeploy/vl/model/molmo.py index 9abae7a309..01a3bed2a1 100644 --- a/lmdeploy/vl/model/molmo.py +++ b/lmdeploy/vl/model/molmo.py @@ -3,11 +3,9 @@ from typing import Dict, List import torch -from PIL.Image import Image from transformers import AutoModelForCausalLM, AutoProcessor from lmdeploy.utils import get_logger -from lmdeploy.vl.constants import IMAGE_TOKEN from lmdeploy.vl.model.base import VISION_MODELS, VisonModel from lmdeploy.vl.model.utils import disable_logging @@ -20,141 +18,187 @@ class MolmoVisionModel(VisonModel): _arch = 'MolmoForCausalLM' + def build_preprocessor(self): + self.processor = AutoProcessor.from_pretrained(self.model_path, + trust_remote_code=True, + torch_dtype=torch.half, + device_map='auto') + def build_model(self): """Load model.""" from accelerate import init_empty_weights, load_checkpoint_and_dispatch with init_empty_weights(): - config = self.hf_config - model = AutoModelForCausalLM.from_config(config, + model = AutoModelForCausalLM.from_config(self.hf_config, trust_remote_code=True) - if not self.with_llm: - # Remove nn modules other than embedding from the LLM model - for key in ['emb_drop', 'ln_f', 'blocks', 'ff_out']: - del model.model.transformer[key] - self.token_embedding = model.model.transformer.wte - else: - self.vl_model = model + + # Remove nn modules other than embedding from the LLM model + for key in ['emb_drop', 'ln_f', 'blocks', 'ff_out']: + del model.model.transformer[key] + self.token_embedding = model.model.transformer.wte with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map='auto' if not self.with_llm else {'': 'cpu'}, - max_memory=self.max_memory, - no_split_module_classes=[ - 'ResidualAttentionBlock', 'Embedding' - ]) + load_checkpoint_and_dispatch(model=model, + checkpoint=self.model_path, + device_map='auto', + max_memory=self.max_memory, + no_split_module_classes=[ + 'ResidualAttentionBlock', + 'Embedding' + ], + dtype=torch.half) # We need eval mode to freeze the weights in model, thus, # avoid randomness in inference. self.model = model.eval() - self.config = config - self.processor = AutoProcessor.from_pretrained(self.model_path, - trust_remote_code=True, - torch_dtype='auto', - device_map='auto') + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refer to the `super.preprocess() for spec.""" + for i, message in enumerate(messages): + if not isinstance(message['content'], List): + continue + images = [ + x['image'] for x in message['content'] if x['type'] == 'image' + ] + content = [ + x['text'] for x in message['content'] if x['type'] == 'text' + ] + prompt = f' User: {content[0]}' + tokens = self.processor.tokenizer.encode(prompt, + add_special_tokens=False) + # preprocess images. The output is a dict, which is + # { + # 'input_ids': torch.Tensor, + # 'images': torch.Tensor, # (n_patch, d_model) + # 'image_input_idx': torch.Tensor, # (n_patch, d_model) + # 'image_masks': torch.Tensor, # (n_patch, d_model) + # } + result = self.processor.process(images=images, tokens=tokens) + # remove the bos from input_ids which is prepended by molmo's + # processor + input_ids = result['input_ids'][1:] + result.update(input_ids=input_ids) + messages[i].update(preprocess=result) + return messages @torch.no_grad() def forward(self, - images: List[Image], - params: List[Dict] = None) -> List[Dict]: - """forward the model with given input. + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. 
ONLY implement it when the backend is + turbomind engine. Args: - images (List): [None] it is not used - params (List): the inputs after precessing GPT4V messages in - `MolmoChatTemplateWrapper`. Its format is like the following: - [[ - {'role': 'user', 'content': 'user prompt'}, - {'role': 'asssistant', 'content': 'assistant prompt'}, - {'role': 'user', 'content': 'user prompt', 'images': [PIL image list]}, - ... - ]] - """ # noqa - - messages = params[0] - assert isinstance(messages, List) - # append an assistant message to `messages` - messages.append(dict(role='assistant', content='')) + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + for i, message in enumerate(messages): + if 'preprocess' not in message.keys(): + continue + inputs = message['preprocess'] + # get input_ids of embedding + inputs = { + k: v.to(self.model.device).unsqueeze(0) + for k, v in inputs.items() + } + input_ids = inputs['input_ids'] + # (batch_size, num_image, num_patch, d_model) + images = inputs['images'] + # (batch_size, num_image, num_patch) + image_input_idx = inputs['image_input_idx'] + image_masks = inputs['image_masks'] + batch_size, seq_len = input_ids.size() + assert batch_size == 1 + input_ids = input_ids * (input_ids != -1).to(input_ids.dtype) + embeddings = self.model.model.transformer.wte(input_ids) + images = images.to(self.model.dtype) + image_masks = image_masks.to(self.model.dtype) + logger.info(f'vision forward shape: {images.shape}') + image_features, _ = self.model.model.vision_backbone( + images, image_masks) + num_image, num_patch = image_features.shape[1:3] + assert image_input_idx.shape == (batch_size, num_image, num_patch) + + # insert the image feature into the embedding. 
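+            # `image_input_idx` maps each vision patch feature to its target
+            # position in `input_ids`; negative entries mark padding patches
+            # and are filtered out by the `valid` mask below, then the patch
+            # features are added onto the text embeddings at those positions.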
+ image_features = image_features.view(batch_size, + num_image * num_patch, -1) + image_input_idx = image_input_idx.view(batch_size, + num_image * num_patch) + valid = image_input_idx >= 0 + batch_idx = torch.arange(batch_size, device=embeddings.device) + batch_idx = torch.tile(batch_idx[:, None], + [1, image_features.shape[1]]) + image_features = image_features.to(embeddings.device) + # Since we remove bos_id from input_ids during `preprocess`, + # the index `image_input_idx[valid]` should be shift to left + # by subtracting 1 + index = image_input_idx[valid] - 1 + embeddings[batch_idx[valid], index] += image_features[valid] + assert embeddings.shape[:2] == (batch_size, seq_len) + messages[i].update( + dict(forward=dict(input_ids=input_ids.flatten(), + embeddings=embeddings))) + return messages + + @staticmethod + def proc_messages(messages): + prompt = [] + IMAGE_TOKEN = '' + for message in messages: + role, content = message['role'], message['content'] + if isinstance(content, List): + n_images = len([1 for x in content if x['type'] == 'image']) + content = [x['text'] for x in content if x['type'] == 'text'] + prompt.append(' User: ' + (IMAGE_TOKEN + '\n') * n_images + + content[0]) + else: + if role == 'user': + prompt.append(f' User: {content}') + elif role == 'assistant': + prompt.append(f' Assistant:{content}') + else: + assert 0, f'molmo does not support role {role}, message is {message}' # noqa + prompt.append(' Assistant:') + return ''.join(prompt) + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + assert 0, 'molmo is not supported by pytorch engine' + + def to_turbomind(self, messages, chat_template, tokenizer, sequence_start): # results is a list of tuple(input_ids, embeddings) results = [] - # the concat prompt. It is not used during inference but to adhere the - # interface definition of `_get_prompt_input` in `class VLAsyncEngine` - prompts = '' # Prepend BOS # qwen2 and olmo do not have a BOS, and instead use EOS as a generic # separator token. bos = (self.processor.tokenizer.bos_token_id or self.processor.tokenizer.eos_token_id) results.append(([bos], None)) + for i, message in enumerate(messages): - if 'images' in message.keys(): - prompts += ' User: ' + (IMAGE_TOKEN + '\n') * len( - message['images']) + message['content'] - prompt = f' User: {message["content"]}' - tokens = self.processor.tokenizer.encode( - prompt, add_special_tokens=False) - # preprocess images. The output is a dict - inputs = self.processor.process(images=message['images'], - tokens=tokens) - inputs = { - k: v.to(self.model.device).unsqueeze(0) - for k, v in inputs.items() - } - input_ids = inputs['input_ids'] - # remove the bos from input_ids which is prepended by molmo's - # processor - input_ids = input_ids[:, 1:] - images = inputs[ - 'images'] # (batch_size, num_image, num_patch, d_model) - image_input_idx = inputs[ - 'image_input_idx'] # (batch_size, num_image, num_patch) - image_masks = inputs['image_masks'] - batch_size, seq_len = input_ids.size() - assert batch_size == 1 - - # Get embeddings of input. - if input_ids is not None: - input_ids = input_ids * (input_ids != -1).to( - input_ids.dtype) - embeddings = self.model.model.transformer.wte(input_ids) - image_features, _ = self.model.model.vision_backbone( - images, image_masks) - num_image, num_patch = image_features.shape[1:3] - assert image_input_idx.shape == (batch_size, num_image, - num_patch) - - # insert the image feature into the embedding. 
- image_features = image_features.view(batch_size, - num_image * num_patch, -1) - image_input_idx = image_input_idx.view(batch_size, - num_image * num_patch) - - valid = image_input_idx >= 0 - batch_idx = torch.arange(batch_size, device=embeddings.device) - batch_idx = torch.tile(batch_idx[:, None], - [1, image_features.shape[1]]) - image_features = image_features.to(embeddings.device) - embeddings[batch_idx[valid], - image_input_idx[valid]] += image_features[valid] - assert embeddings.shape[:2] == (batch_size, seq_len) - results.append((input_ids.flatten().tolist(), embeddings)) + prompt = '' + role, content = message['role'], message['content'] + if isinstance(content, List): + forward_result = message.pop('forward') + input_ids = forward_result['input_ids'] + embeddings = forward_result['embeddings'] + results.append((input_ids.tolist(), embeddings)) else: - role = message['role'] - content = message['content'] - assert isinstance(content, str) - prompt = '' if role == 'user': prompt = f' User: {content}' elif role == 'assistant': prompt = f' Assistant:{content}' else: assert 0, f'molmo does not support role {role}, message is {message}' # noqa + if i == len(messages) - 1: + # the last message + assert role == 'user', f'the role of last message is expected to be user, but got {role}' # noqa + prompt += ' Assistant:' + if prompt: input_ids = self.processor.tokenizer.encode( prompt, add_special_tokens=False) results.append((input_ids, None)) - prompts += prompt # concat input_ids from results, calculate the range in the input_ids # where embeddings will be copied to @@ -169,9 +213,9 @@ def forward(self, input_embedding_ranges.append((start, end)) input_ids += _input_ids start += len(_input_ids) - return [ - dict(prompt=prompts, - input_ids=input_ids, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges) - ] + + prompt = self.proc_messages(messages) + return dict(prompt=prompt, + input_ids=input_ids, + input_embeddings=input_embeddings, + input_embedding_ranges=input_embedding_ranges) diff --git a/lmdeploy/vl/model/phi3_vision.py b/lmdeploy/vl/model/phi3_vision.py index 032b8404da..80204a2dee 100644 --- a/lmdeploy/vl/model/phi3_vision.py +++ b/lmdeploy/vl/model/phi3_vision.py @@ -1,198 +1,40 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import warnings -from typing import List +from typing import Dict, List -import torch -from PIL.Image import Image from transformers import AutoProcessor -from lmdeploy.vl.model.base import VISION_MODELS, VisonModel -from lmdeploy.vl.model.utils import disable_logging - - -# from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_embedding_phi3_v.py # noqa E501 -def _process_image_embedding(self, pixel_values: torch.Tensor, - image_sizes: torch.Tensor): - """process image embedding.""" - img_embeds = pixel_values - img_sizes = image_sizes - target_device = pixel_values.device - target_dtype = pixel_values.dtype - if self.use_hd_transform and img_sizes is not None and len(img_sizes): - assert img_embeds.ndim == 5, f'img_embeds size: {img_embeds.size()}, expect 5D tensor for hd transform' # noqa E501 - # img_embeds: (num_images, max_num_crops, 3, H, W) - # img_sizes: (num_images, 2).view(1, -1) - - bs = img_embeds.shape[0] - # Nx(HW)xC - img_features = self.get_img_features(img_embeds.flatten(0, 1)) - base_feat_height = base_feat_width = int(img_features.shape[1]**0.5) - - assert base_feat_height == 24 and base_feat_width == 24, f'base_feat_height: {base_feat_height}, base_feat_width: {base_feat_width}, expect 24x24 features for hd transform' # noqa E501 - - # bs x max_num_crops x (24x24) x C - img_features = img_features.view(bs, -1, - base_feat_height * base_feat_width, - self.image_dim_out) - C = self.image_dim_out - H = base_feat_height - - output_imgs = [] - output_len = [] - # training is tensor, inference is list - if isinstance(img_sizes, torch.Tensor): - img_sizes = img_sizes.view(-1, 2) - for _bs in range(bs): - h, w = img_sizes[_bs] - h = h // 336 - w = w // 336 - B_ = h * w - - # 1 x (24x24) x 1024 - global_img_feature = img_features[_bs, :1] - - # 1 x 12 x 12 x 4096 - glb_img = global_img_feature.reshape(1, H, H, C).reshape( - 1, H // 2, 2, H // 2, 2, - C).contiguous().permute(0, 1, 3, 2, 4, - 5).reshape(1, H // 2, H // 2, - 4 * C).contiguous() - temp_glb_GN = self.sub_GN.repeat(1, H // 2, 1, 1) - - # 1 x 156 x 4096 - glb_img = torch.cat([glb_img, temp_glb_GN], - dim=2).reshape(1, -1, 4 * C) - - # (max_num_crops-1) x (12x12) x C - sub_img = img_features[_bs, 1:] - # 16x574x1024 - # get rid of padding sub_img - sub_img = sub_img[:B_] - - # (num_crops, 12, 2, 12, 2, 1024)->(num_crops, 12, 12, 2, 2, 1024) - # -> (num_crops, 12*12, 4*1024) - sub_img = sub_img.reshape(B_, H, H, C).reshape( - B_, H // 2, 2, H // 2, 2, - C).contiguous().permute(0, 1, 3, 2, 4, - 5).reshape(B_, -1, 4 * C).contiguous() - sub_img = sub_img.reshape(1, h, w, 12, 12, -1).permute( - 0, 1, 3, 2, 4, 5).reshape(1, h * 12, w * 12, 4 * C) - temp_sub_GN = self.sub_GN.repeat(1, h * 12, 1, 1) - sub_img = torch.cat([sub_img, temp_sub_GN], - dim=2).reshape(1, -1, 4 * C) - # (1, num_img_tokens, 1024*4) - - # glb + sub - if self.hd_transform_order == 'glb_sub': - output_imgs.append( - torch.cat([glb_img, self.glb_GN, sub_img], dim=1)) - elif self.hd_transform_order == 'sub_glb': - output_imgs.append( - torch.cat([sub_img, self.glb_GN, glb_img], dim=1)) - else: - raise NotImplementedError( - f'hd_transform_order = {self.hd_transform_order}' - ) # noqa E501 - - temp_len = int((h * w + 1) * 144 + 1 + (h + 1) * 12) - assert temp_len == output_imgs[-1].shape[ - 1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}' # noqa E501 - output_len.append(temp_len) - - img_set_tensor = [] - for _output_img in output_imgs: - img_feature_proj = self.img_projection( - 
_output_img.to(target_device).to(target_dtype)) - img_set_tensor.append(img_feature_proj) - elif img_embeds.ndim == 4: - tt = (self.get_img_features(img_embeds).to(target_device).to( - target_dtype).reshape(-1, self.image_dim_out)) - img_set_tensor = self.img_projection(tt) # adapted visual features. - elif img_embeds.ndim == 3: - tt = (img_embeds.to(target_device).to(target_dtype).view( - -1, self.image_dim_out)) - img_set_tensor = self.img_projection(tt) # adapted visual features. - else: - raise NotImplementedError - return img_set_tensor +from lmdeploy.vl.model.llava_hf import VISION_MODELS, LlavaHfVisionModel @VISION_MODELS.register_module() -class Phi3VisionModel(VisonModel): - """Llava hf vision model.""" +class Phi3VisionModel(LlavaHfVisionModel): + """Phi3-vision model.""" _arch = 'Phi3VForCausalLM' - def build_model(self): - from accelerate import init_empty_weights, load_checkpoint_and_dispatch - from accelerate.utils import get_balanced_memory, infer_auto_device_map - - with init_empty_weights(), warnings.catch_warnings(): - warnings.simplefilter('ignore') - from transformers import AutoModelForCausalLM - model = AutoModelForCausalLM.from_config(self.hf_config, - trust_remote_code=True) - if not self.with_llm: - del model.lm_head - del model.model.layers - del model.model.norm - del model.model.embed_tokens - del model.model.vision_embed_tokens.wte - else: - self.vl_model = model - - no_split_module_classes = ['CLIPEncoderLayer'] - max_memory = get_balanced_memory( - model, - max_memory=self.max_memory, - dtype=torch.half, - no_split_module_classes=no_split_module_classes) - device_map = infer_auto_device_map( - model, - no_split_module_classes=no_split_module_classes, - max_memory=max_memory, - dtype=torch.half) - same_device_keys = [('model.vision_embed_tokens.img_projection', - 'model.vision_embed_tokens.sub_GN', - 'model.vision_embed_tokens.glb_GN')] - for keys in same_device_keys: - keys = [k for k in keys if k in device_map] - if len(keys) <= 1: - continue - for k in keys[1:]: - device_map[k] = device_map[keys[0]] - - with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map=device_map if not self.with_llm else {'': 'cpu'}, - no_split_module_classes=no_split_module_classes, - dtype=torch.half) - - model.eval() - self.model = model - # processor + def build_preprocessor(self): processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True) if hasattr(processor, 'tokenizer'): del processor.tokenizer - processor.prtokenizer = None - self.processor = processor.image_processor + processor.tokenizer = None self.processor = processor - @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" - process_outputs = self.processor.image_processor( - images, return_tensors='pt').to(device=self.model.device, - dtype=self.model.dtype) - pixel_values = process_outputs['pixel_values'] - image_sizes = process_outputs['image_sizes'] - image_features = _process_image_embedding( - self.model.model.vision_embed_tokens, - pixel_values=pixel_values, - image_sizes=image_sizes) - outputs = [x.squeeze() for x in image_features] - return outputs + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refers to `super.preprocess() for spec.""" + images = self.collect_images(messages) + outputs = [] + for image, params in images: + image = image.convert('RGB') + result = self.processor.image_processor(image, return_tensors='pt') + h = result['image_sizes'][0][0].item() // 336 + 
w = result['image_sizes'][0][1].item() // 336 + image_tokens = int((h * w + 1) * 144 + 1 + (h + 1) * 12) + result.update( + dict(image_size=image.size, + image_tokens=image_tokens, + image_token_id=0)) + outputs.append(result) + messages.append(dict(role='preprocess', content=outputs)) + return messages diff --git a/lmdeploy/vl/model/qwen.py b/lmdeploy/vl/model/qwen.py index 3968f27d97..2e4f32862c 100644 --- a/lmdeploy/vl/model/qwen.py +++ b/lmdeploy/vl/model/qwen.py @@ -1,14 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List +from typing import Dict, List import torch -from PIL.Image import Image from transformers import AutoModelForCausalLM +from lmdeploy.utils import get_logger from lmdeploy.vl.model.base import VISION_MODELS, VisonModel from lmdeploy.vl.model.utils import disable_logging +logger = get_logger('lmdeploy') + @VISION_MODELS.register_module() class QwenVisionModel(VisonModel): @@ -16,6 +18,19 @@ class QwenVisionModel(VisonModel): _arch = 'QWenLMHeadModel' + def build_preprocessor(self): + from torchvision import transforms + from torchvision.transforms import InterpolationMode + mean = (0.48145466, 0.4578275, 0.40821073) + std = (0.26862954, 0.26130258, 0.27577711) + image_size = self.hf_config.visual['image_size'] + self.image_transform = transforms.Compose([ + transforms.Resize((image_size, image_size), + interpolation=InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std), + ]) + def build_model(self): from accelerate import init_empty_weights with init_empty_weights(): @@ -23,12 +38,9 @@ def build_model(self): config.quantization_config = {} # disable vision part quantization model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) - if not self.with_llm: - del model.lm_head - for key in ['wte', 'h', 'ln_f']: - setattr(model.transformer, key, None) - else: - self.vl_model = model + del model.lm_head + for key in ['wte', 'h', 'ln_f']: + setattr(model.transformer, key, None) from accelerate.utils import get_balanced_memory, infer_auto_device_map max_memory = get_balanced_memory( @@ -54,19 +66,92 @@ def build_model(self): load_checkpoint_and_dispatch( model=model, checkpoint=self.model_path, - device_map=device_map if not self.with_llm else {'': 'cpu'}, + device_map=device_map, no_split_module_classes=['VisualAttentionBlock'], dtype=torch.half) self.model = model.transformer.visual.eval() + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refers to `super.preprocess() for spec.""" + images = self.collect_images(messages) + outputs = [] + for image, params in images: + image = image.convert('RGB') + pixel_values = self.image_transform(image) + outputs.append( + dict(pixel_values=pixel_values, + image_size=image.size, + image_tokens=256, + image_token_id=0)) + messages.append(dict(role='preprocess', content=outputs)) + return messages + @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" - outputs = [x.convert('RGB') for x in images] - outputs = [self.model.image_transform(x) for x in outputs] - outputs = torch.stack(outputs, dim=0) - outputs = self.model(outputs) - outputs = torch.split(outputs, 1, dim=0) - outputs = [x.squeeze() for x in outputs] - return outputs + def forward(self, + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. 
+ + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + inputs = [x['content'] for x in messages if x['role'] == 'preprocess'] + inputs = inputs[0] + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] + ] + pixel_values = torch.stack(pixel_values, dim=0) + logger.info(f'vision forward shape: {pixel_values.shape}') + feats = self.model(pixel_values) + feats = torch.split(feats, 1, dim=0) + outputs.extend([x.squeeze() for x in feats]) + messages.append(dict(role='forward', content=outputs)) + return messages + + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + IMAGE_TOKEN = '' + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) + continue + elif message['role'] in ['images', 'preprocess', 'forward']: + continue + n_images = len( + [1 for x in message['content'] if x['type'] == 'image']) + content = [ + x['text'] for x in message['content'] if x['type'] == 'text' + ] + prompt = content[0] + if IMAGE_TOKEN in prompt: + pass + else: + prompt = ''.join([ + f'Picture {str(i)}:{IMAGE_TOKEN}\n' + for i in range(n_images) + ]) + prompt + prompt_messages.append(dict(role='user', content=prompt)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) + + def to_turbomind(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py index 3eb3c1541c..51bb1f8ccb 100644 --- a/lmdeploy/vl/model/qwen2.py +++ b/lmdeploy/vl/model/qwen2.py @@ -1,12 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
- from typing import Dict, List import torch -from PIL.Image import Image from lmdeploy.vl.model.base import VISION_MODELS, VisonModel -from lmdeploy.vl.model.utils import disable_logging def check_qwen_vl_deps_install(): @@ -15,7 +12,7 @@ def check_qwen_vl_deps_install(): import qwen_vl_utils # noqa: F401 except ImportError: raise ImportError( - 'please install qwen_vl_utils by pip install qwen_vl_utils' # noqa: E501 + 'please install qwen_vl_utils by `pip install qwen_vl_utils`' # noqa: E501 ) try: from transformers import Qwen2VLForConditionalGeneration # noqa: F401 @@ -31,85 +28,95 @@ class Qwen2VLModel(VisonModel): _arch = 'Qwen2VLForConditionalGeneration' - def build_model(self): + def build_preprocessor(self): check_qwen_vl_deps_install() - from transformers import Qwen2VLForConditionalGeneration - if self.with_llm: - model = Qwen2VLForConditionalGeneration.from_pretrained( - self.hf_config._name_or_path, trust_remote_code=True) - model.half() - self.vl_model = model - else: - from accelerate import init_empty_weights - with init_empty_weights(): - config = self.hf_config - config.quantization_config = { - } # disable vision part quantization - # disable accelerate check_tied_parameters_in_config - # for Qwen2-VL-2B-Instruct - config.tie_word_embeddings = False + from transformers import AutoProcessor + self.processor = AutoProcessor.from_pretrained(self.model_path) - model = Qwen2VLForConditionalGeneration._from_config(config) - del model.model - del model.lm_head - model.half() - from accelerate import load_checkpoint_and_dispatch - with disable_logging(): - load_checkpoint_and_dispatch( - model=model, - checkpoint=self.model_path, - device_map='auto' if not self.with_llm else {'': 'cpu'}, - max_memory=self.max_memory, - no_split_module_classes=['Qwen2VLVisionBlock'], - dtype=torch.half) + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refer to `super().preprocess()` for spec.""" + from qwen_vl_utils import process_vision_info - self.model = model.eval() + images = self.collect_images(messages) + optional_keys = { + 'resized_height', 'resized_width', 'min_pixels', 'max_pixels' + } + outputs = [] + for image, params in images: + image = image.convert('RGB') - # processor - from transformers import AutoProcessor - self.processor = AutoProcessor.from_pretrained(self.model_path) + item = dict(type='image', image=image) + item.update({ + key: params[key] + for key in params.keys() if key in optional_keys + }) + image_inputs, _ = process_vision_info([dict(content=[item])]) + result = self.processor.image_processor(images=image_inputs, + videos=None, + return_tensors='pt') + merge_length = self.processor.image_processor.merge_size**2 + image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length + result.update( + dict(image_size=image.size, + image_tokens=image_tokens, + image_token_id=0)) + outputs.append(result) + messages.append(dict(role='preprocess', content=outputs)) + return messages @torch.no_grad() def forward(self, - images: List[Image], - params: List[Dict] = None) -> List[torch.Tensor]: - """forward.""" - # only support image input - if params is not None: - assert len(images) == len( - params), 'different length of images and params' - else: - params = [{}] * len(images) + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. 
- from qwen_vl_utils import process_vision_info - images = [x.convert('RGB') for x in images] - content = [] - optional_keys = [ - 'resized_height', 'resized_width', 'min_pixels', 'max_pixels' - ] - for image, param in zip(images, params): - item = dict(type='image', image=image) - item.update({k: param[k] for k in optional_keys if k in param}) - content.append(item) - messages = [dict(content=content)] - image_inputs, _ = process_vision_info(messages) - image_inputs = self.processor.image_processor(images=image_inputs, - videos=None, - return_tensors='pt') - pixel_values = image_inputs['pixel_values'].to( - self.model.visual.get_device()) - image_grid_thw = image_inputs['image_grid_thw'].to( - self.model.visual.get_device()) - pixel_values = pixel_values.type(self.model.visual.get_dtype()) - image_embeds = self.model.visual(pixel_values, - grid_thw=image_grid_thw).cpu() - merge_length = self.processor.image_processor.merge_size**2 - split_size = image_inputs['image_grid_thw'].prod(dim=1) // merge_length - image_embeds = image_embeds.split(split_size.tolist()) + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + assert 0, 'TODO: support turbomind engine' - outputs = [] - for i, embeddings in enumerate(image_embeds): - outputs.append( - dict(embeddings=embeddings, - grid_thw=image_inputs['image_grid_thw'][i].tolist())) - return outputs + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + IMAGE_TOKEN = '' + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) + continue + elif message['role'] in ['images', 'preprocess', 'forward']: + continue + n_images = len( + [1 for x in message['content'] if x['type'] == 'image']) + content = [ + item['text'] for item in message['content'] + if item['type'] == 'text' + ] + prompt = content[0] + if IMAGE_TOKEN in prompt and '<|vision_start|>' not in prompt: + prompt = prompt.replace( + IMAGE_TOKEN, + f'<|vision_start|>{IMAGE_TOKEN}<|vision_end|>') + else: + # Qwen2-VL-2B-Instruct will concat image and user prompt + # according to their order in the content list + # we insert image token before user prompt by default. The + # user can use custom image token position if they want the + # same decorated prompt as Qwen2-VL + prompt = f'<|vision_start|>{IMAGE_TOKEN}<|vision_end|>' * \ + n_images + prompt + prompt_messages.append(dict(role='user', content=prompt)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + """return to the information needed by pytorch engine.""" + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py index 96bc900c02..620fd800b6 100644 --- a/lmdeploy/vl/model/xcomposer2.py +++ b/lmdeploy/vl/model/xcomposer2.py @@ -5,7 +5,7 @@ import sys import warnings from contextlib import contextmanager -from typing import Any, List, Tuple +from typing import Any, Dict, List, Tuple import torch from PIL.Image import Image @@ -19,6 +19,17 @@ logger = get_logger('lmdeploy') +def check_xcomposer_install(): + try: + # WARNING! 
we have to do this otherwise the model_type is wrong for + # xcomposer2d5 + import decord # noqa: F401 + except ImportError: + raise ImportError( + "No module named 'decord'. Please install decord by `pip install decord`" # noqa + ) + + class ModelType(enum.Enum): """Request type.""" XCOMPOSER2 = enum.auto() @@ -83,6 +94,16 @@ def init_empty_vit(model_path): class Xcomposer2VisionModel(VisonModel): """InternLM-Xcomposer2 vision model.""" + def __init__(self, + model_path: str, + max_memory: Dict[int, int] = None, + hf_config: AutoConfig = None, + backend: str = ''): + super().__init__(model_path, max_memory, hf_config, backend) + check_xcomposer_install() + self.model_type, self.module = get_xcomposer_type(self.model_path) + logger.info(f'matching type of {self.model_type}') + @classmethod def match(cls, config: AutoConfig): """check whether the config match the model.""" @@ -94,6 +115,34 @@ def match(cls, config: AutoConfig): return True return False + def build_preprocessor(self): + + import torchvision.transforms as transforms + from torchvision.transforms.functional import InterpolationMode + + if self.model_type in [ + ModelType.XCOMPOSER2D5, ModelType.XCOMPOSER2_4KHD + ]: + self.HD_transform = self.module + self.vis_processor = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + self.preprocess_func = (self._preprocess_2d5 if self.model_type + == ModelType.XCOMPOSER2D5 else + self._preprocess_4khd_7b) + else: + self.vis_processor = transforms.Compose([ + transforms.Resize( + (self.hf_config.img_size, self.hf_config.img_size), + interpolation=InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + self.preprocess_func = self._preprocess_7b + def build_model(self): from accelerate import init_empty_weights with init_empty_weights(), warnings.catch_warnings(), \ @@ -106,23 +155,8 @@ def build_model(self): model.vit.resize_pos() model.vit.vision_tower.vision_model.post_layernorm.to_empty( device='cpu').half() - if not self.with_llm: - del model.model - del model.output - else: - self.vl_model = model - - # additional components. 
- model_type, module = get_xcomposer_type(self.model_path) - logger.info(f'matching type of {model_type}') - if model_type == ModelType.XCOMPOSER2D5: - self.HD_transform = module - self._forward_func = self._forward_2d5 - elif model_type == ModelType.XCOMPOSER2_4KHD: - self.HD_transform = module - self._forward_func = self._forward_4khd_7b - else: - self._forward_func = self._forward_7b + del model.model + del model.output from accelerate.utils import get_balanced_memory, infer_auto_device_map max_memory = get_balanced_memory( @@ -144,7 +178,7 @@ def build_model(self): load_checkpoint_and_dispatch( model=model, checkpoint=self.model_path, - device_map=device_map if not self.with_llm else {'': 'cpu'}, + device_map=device_map, no_split_module_classes=['CLIPEncoderLayer'], dtype=torch.half) @@ -156,51 +190,117 @@ def build_model(self): self.model = model.eval() - def _forward_2d5(self, images: List[Image]) -> List[torch.Tensor]: - """internlm-xcomposer2d5-7b vit forward.""" - outputs = [x.convert('RGB') for x in images] - hd_num = 6 if len(images) > 1 else 24 - outputs = [self.HD_transform(x, hd_num=hd_num) for x in outputs] - outputs = [ - self.model.vis_processor(x).unsqueeze(0).to(dtype=torch.half) - for x in outputs - ] - embeds, split = self.model.vit(outputs, self.model.plora_glb_GN, - self.model.plora_sub_GN) - embeds = self.model.vision_proj(embeds) - embeds = torch.split(embeds, split, dim=1) - embeds = [x.squeeze() for x in embeds] - return embeds - - def _forward_7b(self, images: List[Image]) -> List[torch.Tensor]: - """internlm-xcomposer2-7b vit forward.""" - outputs = [x.convert('RGB') for x in images] - outputs = [ - self.model.vis_processor(x).unsqueeze(0).half() for x in outputs - ] - outputs = torch.cat(outputs, dim=0) - outputs = self.model.vit(outputs) - outputs = self.model.vision_proj(outputs) - outputs = torch.split(outputs, 1, dim=0) - outputs = [x.squeeze() for x in outputs] - return outputs - - def _forward_4khd_7b(self, images: List[Image]) -> List[torch.Tensor]: - """internlm-xcomposer2-4khd-7b vit forward.""" - outputs = [x.convert('RGB') for x in images] - outputs = [self.HD_transform(x, hd_num=25) for x in outputs] - outputs = [ - self.model.vis_processor(x).unsqueeze(0).to(dtype=torch.half) - for x in outputs - ] - embeds, split = self.model.vit(outputs, self.model.plora_glb_GN, - self.model.plora_sub_GN) - embeds = self.model.vision_proj(embeds) - embeds = torch.split(embeds, split, dim=1) - embeds = [x.squeeze() for x in embeds] - return embeds + def _preprocess_2d5(self, image: Image, params: Dict) -> Dict: + """image preprocessing for internlm-xcomposer2d5-7b.""" + hd_num = params.get('hd_num', 24) + image = self.HD_transform(image, hd_num=hd_num) + pixel_values = self.vis_processor(image).unsqueeze(0).half() + w, h = image.size + n_token_per_image = int((h * w + 1) * 400 + 1 + (h + 1) * 20) + return pixel_values, n_token_per_image + + def _preprocess_7b(self, image: Image, params: Dict) -> Dict: + """image preprocessing for internlm-xcomposer2-7b.""" + pixel_values = self.vis_processor(image).unsqueeze(0).half() + return pixel_values, 256 + + def _preprocess_4khd_7b(self, image: Image, params: Dict) -> Dict: + """image preprocessing for internlm-xcomposer2-4khd-7b.""" + image = self.HD_transform(image, hd_num=25) + pixel_values = self.vis_processor(image).unsqueeze(0).half() + w, h = image.size + n_token_per_image = int((h * w + 1) * 144 + 1 + (h + 1) * 12) + return pixel_values, n_token_per_image + + def preprocess(self, messages: List[Dict]) -> 
List[Dict]: + """refer to `super().preprocess() for spec.""" + images = self.collect_images(messages) + outputs = [] + for image, params in images: + image = image.convert('RGB') + pixel_values, n_token = self.preprocess_func(image, params) + outputs.append( + dict(pixel_values=pixel_values, + image_size=image.size, + image_tokens=n_token, + image_token_id=0)) + messages.append(dict(role='preprocess', content=outputs)) + return messages @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: - """forward.""" - return self._forward_func(images) + def forward(self, + messages: List[Dict], + max_batch_size: int = 1) -> List[Dict]: + """extract image feature. ONLY implement it when the backend is + turbomind engine. + + Args: + messages(List[Dict]): the outputs of `preprocess` + max_batch_size(int): the max batch size when forwarding vision + model + Return: + the message list with forwarding results included + """ + inputs = [x['content'] for x in messages if x['role'] == 'preprocess'] + inputs = inputs[0] + outputs = [] + for idx in range(0, len(inputs), max_batch_size): + if self.model_type in [ + ModelType.XCOMPOSER2D5, ModelType.XCOMPOSER2_4KHD + ]: + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] + ] + embeds, split = self.model.vit(pixel_values, + self.model.plora_glb_GN, + self.model.plora_sub_GN) + embeds = self.model.vision_proj(embeds) + embeds = torch.split(embeds, split, dim=1) + embeds = [x.squeeze() for x in embeds] + else: + pixel_values = [ + x['pixel_values'] for x in inputs[idx:idx + max_batch_size] + ] + pixel_values = torch.cat(pixel_values, dim=0) + logger.info(f'vision forward shape: {pixel_values.shape}') + embeds = self.model.vit(pixel_values) + embeds = self.model.vision_proj(embeds) + embeds = torch.split(embeds, 1, dim=0) + embeds = [x.squeeze() for x in embeds] + outputs.extend(embeds) + messages.append(dict(role='forward', content=outputs)) + return messages + + @staticmethod + def proc_messages(messages, chat_template, sequence_start): + """apply chat template to get the prompt.""" + prompt_messages = [] + IMAGE_TOKEN = '' + for message in messages: + if isinstance(message['content'], str): + prompt_messages.append(message) + continue + elif message['role'] in ['images', 'preprocess', 'forward']: + continue + n_images = len( + [1 for x in message['content'] if x['type'] == 'image']) + content = [ + item['text'] for item in message['content'] + if item['type'] == 'text' + ] + prompt = ' '.join([IMAGE_TOKEN] * n_images) + content[0] + prompt_messages.append(dict(role='user', content=prompt)) + prompt = chat_template.messages2prompt(prompt_messages, sequence_start) + return prompt, IMAGE_TOKEN + + def to_pytorch(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) + + def to_turbomind(self, messages, chat_template, tokenizer, sequence_start): + prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, + sequence_start) + return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, + sequence_start) diff --git a/lmdeploy/vl/model/yi.py b/lmdeploy/vl/model/yi.py index 34b993322e..1c02ed767a 100644 --- a/lmdeploy/vl/model/yi.py +++ b/lmdeploy/vl/model/yi.py @@ -1,12 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import os from contextlib import contextmanager +from os import path as osp +from typing import Dict, List import torch.nn as nn from transformers import AutoConfig from lmdeploy.vl.model.base import VISION_MODELS -from lmdeploy.vl.model.llava import LlavaVisionModel, check_llava_install +from lmdeploy.vl.model.llava import (LlavaVisionModel, check_llava_install, + process_images) from .utils import disable_transformers_logging, rewrite_ctx @@ -96,6 +99,19 @@ def match(cls, config: AutoConfig): return True return False + def build_preprocessor(self): + from transformers import CLIPImageProcessor + vision_tower_name = osp.join(self.model_path, + self.hf_config.mm_vision_tower) + self.image_processor = CLIPImageProcessor.from_pretrained( + vision_tower_name) + config = AutoConfig.from_pretrained(vision_tower_name) + image_size = config.image_size + patch_size = config.patch_size + self.n_token_per_image = (image_size // patch_size)**2 + if self.hf_config.mm_vision_select_feature == 'cls_patch': + self.n_token_per_image += 1 + def build_model(self): """build model & load weights.""" check_llava_install() @@ -105,3 +121,19 @@ def build_model(self): with init_yi_model(), disable_transformers_logging(): super().build_model() + + def preprocess(self, messages: List[Dict]) -> List[Dict]: + """refer to `super().preprocess() for spec.""" + images = self.collect_images(messages) + outputs = [] + for image, params in images: + image = image.convert('RGB') + pixel_values = process_images([image], self.image_processor, + self.config) + outputs.append( + dict(pixel_values=pixel_values, + image_size=image.size, + image_tokens=self.n_token_per_image, + image_token_id=0)) + messages.append(dict(role='preprocess', content=outputs)) + return messages diff --git a/lmdeploy/vl/templates.py b/lmdeploy/vl/templates.py deleted file mode 100644 index cdf398868a..0000000000 --- a/lmdeploy/vl/templates.py +++ /dev/null @@ -1,550 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import asyncio -from typing import Dict, List, Tuple, Union - -import PIL -import PIL.Image - -from lmdeploy.archs import get_model_arch -from lmdeploy.model import BaseModel -from lmdeploy.utils import get_logger -from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import load_image - -logger = get_logger('lmdeploy') - -VLPromptType = Union[str, Tuple[str, PIL.Image.Image], - Tuple[str, List[PIL.Image.Image]]] - - -class VLChatTemplateWrapper: - """vl chat template wrapper.""" - - def __init__(self, chat_template: BaseModel): - self.chat_template = chat_template - - def prompt_to_messages(self, prompt: VLPromptType): - """convert prompt to GTP4V format.""" - messages = { - 'role': 'user', - 'content': [{ - 'type': 'text', - 'text': '', - }] - } - if isinstance(prompt, str): - messages['content'][0]['text'] = prompt - else: - prompt, images = prompt - if not isinstance(images, list): - images = [images] - messages['content'][0]['text'] = prompt - for image in images: - # 'image_url': means url or local path to image. - # 'image_data': means PIL.Image.Image object. 
- if isinstance(image, str): - image = load_image(image) - item = { - 'type': 'image_data', - 'image_data': { - 'data': image - } - } - elif isinstance(image, PIL.Image.Image): - item = { - 'type': 'image_data', - 'image_data': { - 'data': image - } - } - else: - raise ValueError( - 'image should be a str(url/path) or PIL.Image.Image') - - messages['content'].append(item) - - return [messages] - - async def async_collect_pil_images( - self, messages: Dict) -> List[Tuple[PIL.Image.Image, Dict]]: - """collect image from messages.""" - images_with_kwargs = [] - for message in messages: - role = message['role'] - content = message['content'] - if role != 'user' or isinstance(content, str): - continue - for item in content: - # 'image_url': means url or local path to image. - # 'image_data': means PIL.Image.Image object. - if item['type'] == 'image_url': - item_copy = item['image_url'].copy() - try: - url = item_copy.pop('url') - images_with_kwargs.append([url, item_copy]) - except KeyError: - logger.error(f'invalid format {message}') - elif item['type'] == 'image_data': - item_copy = item['image_data'].copy() - try: - data = item_copy.pop('data') - images_with_kwargs.append([data, item_copy]) - except KeyError: - logger.error(f'invalid format {message}') - - def _inner_call(i, images): - url_or_data = images[i][0] - images[i][0] = load_image(url_or_data) - - await asyncio.gather(*[ - asyncio.get_event_loop().run_in_executor(None, _inner_call, i, - images_with_kwargs) - for i in range(len(images_with_kwargs)) - ]) - - return images_with_kwargs - - def append_image_token(self, prompt, num_images: int): - """append image token to user prompt.""" - if IMAGE_TOKEN in prompt: - return prompt - return (IMAGE_TOKEN + '\n') * num_images + prompt - - def convert_messages(self, messages, sequence_start=True): - """convert GPT4V message format to GPT4 text format.""" - new_messages = [] - for message in messages: - role = message['role'] - content = message['content'] - if role != 'user' or isinstance(content, str): - if isinstance(content, list): - text = content[0]['text'] - message = {'role': role, 'content': text} - new_messages.append(message) - continue - num_images = 0 - for item in content: - # 'image_url': means url or local path to image. - # 'image_data': means PIL.Image.Image object. 
- if item['type'] == 'image_url': - num_images += 1 - elif item['type'] == 'image_data': - num_images += 1 - elif item['type'] == 'text': - prompt = item['text'] - if num_images > 0: - # add IMAGE_TOKEN to user prompt - prompt = self.append_image_token(prompt, num_images) - new_item = {'role': 'user', 'content': prompt} - new_messages.append(new_item) - return new_messages - - def messages2prompt(self, messages, sequence_start=True, **kwargs) -> str: - """convert messages to decorated prompt.""" - if isinstance(messages, str): - return self.chat_template.messages2prompt(messages, sequence_start) - new_messages = self.convert_messages(messages, sequence_start) - return self.chat_template.messages2prompt(new_messages, sequence_start) - - -class LlavaVLChatTemplateWrapper(VLChatTemplateWrapper): - """Llava vl chat template.""" - pass - - -class YiVLChatTemplateWrapper(VLChatTemplateWrapper): - """Yi vl chat template.""" - pass - - -class InternVLChatTemplateWrapper(VLChatTemplateWrapper): - """InternVL chat template.""" - - def append_image_token(self, prompt, num_images: int): - """append image tokens to user prompt.""" - # lmdeploy uses as image token - # internvl uses special tags - if IMAGE_TOKEN in prompt and f'{IMAGE_TOKEN}' not in prompt: - prompt = prompt.replace(f'{IMAGE_TOKEN}', - f'{IMAGE_TOKEN}') - prompt = prompt.replace('', '') - prompt = prompt.replace('', '') - prompt = prompt.replace('', '') - elif IMAGE_TOKEN not in prompt: - prompt = f'{IMAGE_TOKEN * num_images}\n' + prompt - return prompt - - -class DeepSeekVLChatTemplateWrapper(VLChatTemplateWrapper): - """DeepSeek vl chat template.""" - - def append_image_token(self, prompt, num_images: int): - """append image tokens to user prompt.""" - if IMAGE_TOKEN in prompt: - return prompt - logger.error( - f'for deepseek-vl model, the user should insert the {IMAGE_TOKEN} ' - 'to user prompt manually, please read https://lmdeploy.readthedocs' - '.io/en/latest/inference/vl_pipeline.html for more details.') - if num_images == 1: - return f'{IMAGE_TOKEN}{prompt}' - res = '' - for i in range(num_images): - res += f'{IMAGE_TOKEN} is Figure {str(i)}.\n' - res = res + prompt - return res - - -class QwenVLChatTemplateWrapper(VLChatTemplateWrapper): - """Qwen vl chat template.""" - - def append_image_token(self, prompt, num_images: int): - """append image tokens to user prompt.""" - if IMAGE_TOKEN in prompt: - return prompt - res = '' - for i in range(num_images): - res += f'Picture {str(i)}:{IMAGE_TOKEN}\n' - res = res + prompt - return res - - -class Qwen2VLChatTemplateWrapper(VLChatTemplateWrapper): - """qwen2 vl.""" - - def append_image_token(self, prompt, num_images: int): - """append image tokens to user prompt.""" - if IMAGE_TOKEN in prompt and '<|vision_start|>' not in prompt: - prompt = prompt.replace( - IMAGE_TOKEN, f'<|vision_start|>{IMAGE_TOKEN}<|vision_end|>') - else: - # Qwen2-VL-2B-Instruct will concat image and user prompt according - # to their order in the content list - # we insert image token before user prompt by default. 
The user can - # use custom image token position if they want the same decorated - # prompt as Qwen2-VL - prompt = f'<|vision_start|>{IMAGE_TOKEN}<|vision_end|>' * \ - num_images + prompt - return prompt - - def get_mrope_info(self, - seq_len: int, - grid_thws: List[Tuple[int, int, int]] = None, - embedding_ranges: List[Tuple[int, int]] = None): - import torch - if grid_thws is None: - mrope_position_ids = torch.arange(seq_len).expand(3, -1) - mrope_position_delta = torch.tensor([0], dtype=torch.long) - else: - mrope_position_ids = [ - torch.arange(embedding_ranges[0][0]).expand(3, -1) - ] - st_idx = embedding_ranges[0][0] - for i, (grid_thw, embedding_range) in enumerate( - zip(grid_thws, embedding_ranges)): - llm_grid_t, llm_grid_h, llm_grid_w = grid_thw - llm_grid_h //= 2 - llm_grid_w //= 2 - t_index = torch.arange(llm_grid_t).view(-1, 1).expand( - -1, llm_grid_h * llm_grid_w).flatten() - h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( - llm_grid_t, -1, llm_grid_w).flatten() - w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( - llm_grid_t, llm_grid_h, -1).flatten() - mrope_position_ids.append( - torch.stack([t_index, h_index, w_index]) + st_idx) - st_idx += max(llm_grid_h, llm_grid_w) - if i < len(embedding_ranges) - 1: - text_len = embedding_ranges[i + - 1][0] - embedding_ranges[i][1] - else: - text_len = seq_len - embedding_range[1] - mrope_position_ids.append( - torch.arange(text_len).expand(3, -1) + st_idx) - st_idx += text_len - mrope_position_ids = torch.cat(mrope_position_ids, dim=-1) - mrope_position_delta = torch.tensor([st_idx - seq_len], - dtype=torch.long) - - return mrope_position_ids, mrope_position_delta - - -class CogVLMChatTemplateWrapper(VLChatTemplateWrapper): - """cogvlm chat template wrapper.""" - - def __init__(self, chat_template: BaseModel): - from lmdeploy.model import Vicuna - self.chat_template = chat_template - self.llm_chat_template = Vicuna(eoa=chat_template.eoa, - stop_words=chat_template.stop_words) - - def convert_messages(self, messages, sequence_start=True): - """convert GPT4V message format to GPT4 text format.""" - new_messages = [] - for message in messages: - role = message['role'] - content = message['content'] - if role != 'user' or isinstance(content, str): - new_messages.append(message) - continue - num_images = 0 - for item in content: - if item['type'] == 'image_url': - num_images += 1 - elif item['type'] == 'image_data': - num_images += 1 - elif item['type'] == 'text': - prompt = item['text'] - - new_item = { - 'role': 'user', - 'content': prompt, - 'num_images': num_images - } - new_messages.append(new_item) - return new_messages - - def messages2prompt(self, messages, sequence_start=True, **kwargs) -> str: - """convert messages to decorated prompt.""" - if isinstance(messages, str): - return self.chat_template.messages2prompt(messages, sequence_start) - new_messages = self.convert_messages(messages, sequence_start) - prompt = '' - for i, msg in enumerate(new_messages): - num_images = msg.pop('num_images', 0) - if num_images == 0: - role = msg['role'] - msg = self.llm_chat_template.messages2prompt([msg], - sequence_start - and i == 0) - msg = dict(role=role, content=msg) - prompt_i = self.chat_template.messages2prompt([msg], sequence_start - and i == 0) - if num_images > 0: - prompt_i = (IMAGE_TOKEN * num_images) + prompt_i - prompt += prompt_i - return prompt - - -class InternLMXComposer2TemplateWrapper(VLChatTemplateWrapper): - """InternLM-XComposer2 chat template.""" - - def append_image_token(self, prompt, num_images: 
int): - if IMAGE_TOKEN in prompt: - return prompt - logger.warning(f'auto append {IMAGE_TOKEN} at the beginning, ' - 'the user can manually insert the token to prompt') - return ' '.join([IMAGE_TOKEN] * num_images) + prompt - - -class MiniGeminiLlamaTempateWrapper(VLChatTemplateWrapper): - """Qwen vl chat template.""" - - def append_image_token(self, prompt, num_images: int): - """append image tokens to user prompt.""" - if num_images == 0: - return prompt - if IMAGE_TOKEN in prompt: - return prompt - res = f'{IMAGE_TOKEN}\n' - assert num_images <= 1, 'MiniGeminiLlama accepts 1 input image' - res = res + prompt - return res - - -class MllamaTempateWrapper(VLChatTemplateWrapper): - """Mllama chat template.""" - - def append_image_token(self, prompt, num_images: int): - """append image tokens to user prompt.""" - return f'{IMAGE_TOKEN * num_images}{prompt}' - - -class MiniCPMVTempateWrapper(VLChatTemplateWrapper): - """MiniCPM-Llama3-V-2_5 chat template.""" - - def append_image_token(self, prompt, num_images: int): - if IMAGE_TOKEN in prompt: - return prompt - prompt = f'{IMAGE_TOKEN}\n' * num_images + prompt - return prompt - - def update_image_token(self, prompt, features): - _features = [] - _prompt = [] - segs = prompt.split(f'{IMAGE_TOKEN}\n') - for i, seg in enumerate(segs): - if i > 0 and i <= len(features): - _feat = features[i - 1]['embeddings'].split(1) - _feat = [x.squeeze() for x in _feat] - _features.extend(_feat) - _seg = f'{IMAGE_TOKEN}' - if len(_feat) > 1: - grid = features[i - 1]['grid'] - if grid is not None: - _slice = '\n'.join( - [f'{IMAGE_TOKEN}' * grid[0]] * - grid[1]) - _seg = f'{_seg}{_slice}\n' - _prompt.append(_seg) - _prompt.append(seg) - _prompt = ''.join(_prompt) - return _prompt, _features - - -class MiniCPMV26TempateWrapper(MiniCPMVTempateWrapper): - """MiniCPM-V-2_6 chat template.""" - - def update_image_token(self, prompt, features): - _features = [] - _prompt = [] - segs = prompt.split(f'{IMAGE_TOKEN}\n') - idx = 0 - for i, seg in enumerate(segs): - if i > 0 and i <= len(features): - _feat = features[i - 1]['embeddings'].split(1) - _feat = [x.squeeze() for x in _feat] - _features.extend(_feat) - _seg = f'{IMAGE_TOKEN}' - if features[i - 1].get('use_image_id', False): - _seg = f'{idx}' + _seg - idx += 1 - if len(_feat) > 1: - grid = features[i - 1]['grid'] - if grid is not None: - _slice = '\n'.join( - [f'{IMAGE_TOKEN}' * grid[0]] * - grid[1]) - _seg = _seg + _slice - _seg += '\n' - _prompt.append(_seg) - _prompt.append(seg) - _prompt = ''.join(_prompt) - return _prompt, _features - - -class GLM4VChatTemplateWrapper(VLChatTemplateWrapper): - """glm-4v chat template.""" - pass - - -class MolmoChatTemplateWrapper(VLChatTemplateWrapper): - - async def async_collect_pil_images( - self, messages: List[Dict]) -> List[Tuple[PIL.Image.Image, Dict]]: - """collect images from messages. 
- - Args: - messages (List[Dict]): a user request of GPT4V message format - """ - if isinstance(messages, Dict): - messages = [messages] - assert isinstance(messages, List) - - out_messages = [None] * len(messages) - - def _inner_call(i, in_messages, out_messages): - role = in_messages[i]['role'] - content = in_messages[i]['content'] - if role != 'user' or isinstance(content, str): - # means message is user's prompt input or assistant's prompt, - # returning it directory - out_messages[i] = in_messages[i] - return - # the role is a user and the content is a list - assert isinstance(content, List) - message = dict(role=role, content='', images=[]) - for item in content: - # 'image_url': means url or local path to image. - # 'image_data': means PIL.Image.Image object. - if item['type'] == 'image_url': - try: - image = load_image(item['image_url']['url']) - message['images'].append(image) - except KeyError: - logger.error(f'invalid format {message}') - elif item['type'] == 'image_data': - try: - image = load_image(item['image_data']['data']) - message['images'].append(image) - except KeyError: - logger.error(f'invalid format {message}') - elif item['type'] == 'text': - message['content'] = item['text'] - else: - logger.error(f'unexpected content type {message}') - out_messages[i] = message - - await asyncio.gather(*[ - asyncio.get_event_loop().run_in_executor(None, _inner_call, i, - messages, out_messages) - for i in range(len(messages)) - ]) - return [(None, out_messages)] - - def messages2prompt(self, messages, sequence_start=True, **kwargs) -> str: - """Return a placeholder "IMAGE_TOKEN" so that - `vl_asyn_engine._get_prompt_input` can know that it.""" - if isinstance(messages, str): - return self.chat_template.messages2prompt(messages, sequence_start) - else: - _messages = [] - for message in messages: - role, content = message['role'], message['content'] - if role != 'user' or isinstance(content, str): - _messages.append(message) - continue - for item in content: - item_type = item['type'] - if item_type in ['image_url', 'image_data']: - # Return the image placeholder so that - # `vl_asyn_engine._get_prompt_input` can know that the - # request contains images - return IMAGE_TOKEN - _messages.append(dict(role=role, content=item[item_type])) - return self.chat_template.messages2prompt(_messages, - sequence_start) - - -def get_vl_prompt_template(model_path: str, chat_template: BaseModel, - model_name: str) -> VLChatTemplateWrapper: - """get vision language prompt template.""" - assert type(chat_template) != type(BaseModel()), 'failed to match ' \ - 'chat template, please explicit set chat_template_config' # noqa E721 - if model_name == 'yi-vl': - return YiVLChatTemplateWrapper(chat_template) - arch, cfg = get_model_arch(model_path) - if arch == 'QWenLMHeadModel': - return QwenVLChatTemplateWrapper(chat_template) - elif arch in [ - 'LlavaLlamaForCausalLM', 'LlavaMistralForCausalLM', - 'LlavaForConditionalGeneration', - 'LlavaNextForConditionalGeneration', 'Phi3VForCausalLM' - ]: - return LlavaVLChatTemplateWrapper(chat_template) - elif arch == 'MultiModalityCausalLM': # deepseek-vl - return DeepSeekVLChatTemplateWrapper(chat_template) - elif arch == 'MllamaForConditionalGeneration': # llama 3.2 - return MllamaTempateWrapper(chat_template) - elif arch == 'CogVLMForCausalLM': - return CogVLMChatTemplateWrapper(chat_template) - elif arch in ['InternLMXComposer2ForCausalLM', 'InternLM2ForCausalLM']: - return InternLMXComposer2TemplateWrapper(chat_template) - elif arch == 'InternVLChatModel': - 
return InternVLChatTemplateWrapper(chat_template) - elif arch in ['MiniGeminiLlamaForCausalLM', 'MGMLlamaForCausalLM']: - return MiniGeminiLlamaTempateWrapper(chat_template) - elif arch == 'MiniCPMV': - version_map = { - '2.5': MiniCPMVTempateWrapper, - '2.6': MiniCPMV26TempateWrapper - } - version = str(getattr(cfg, 'version', '2.5')) - return version_map[version](chat_template) - elif arch == 'ChatGLMModel': - return GLM4VChatTemplateWrapper(chat_template) - elif arch == 'Qwen2VLForConditionalGeneration': - return Qwen2VLChatTemplateWrapper(chat_template) - elif arch == 'MolmoForCausalLM': - return MolmoChatTemplateWrapper(chat_template) - raise ValueError(f'unsupported vl_prompt_template with arch {arch}') diff --git a/tests/pytorch/kernel/test_flash_attention.py b/tests/pytorch/kernel/test_flash_attention.py index 7d4b7a7f3a..e56de44b37 100644 --- a/tests/pytorch/kernel/test_flash_attention.py +++ b/tests/pytorch/kernel/test_flash_attention.py @@ -10,20 +10,26 @@ def _conti_input(data, q_seqlens): return data -def _make_bias(q_seqlens, history_lens, neg_val): - full_seq_lens = q_seqlens + history_lens +def _make_bias(q_seqlens, history_lens, neg_val, causal): + kv_seqlens = q_seqlens + history_lens max_seq_len = q_seqlens.max().item() - max_full_len = full_seq_lens.max().item() - seq_ranges = [torch.arange(max_seq_len) for _ in q_seqlens] - for r, l in zip(seq_ranges, q_seqlens): - r[l:] = -max_full_len - seq_ranges = torch.stack(seq_ranges, dim=0).cuda() - kv_ranges = [torch.arange(max_full_len) for _ in full_seq_lens] - kv_ranges = torch.stack(kv_ranges, 0).cuda() - mask = kv_ranges[:, None, :] - seq_ranges[:, :, None] > history_lens[:, - None, - None] - return mask.float() * neg_val + max_kv_len = kv_seqlens.max().item() + if causal: + seq_ranges = [torch.arange(max_seq_len) for _ in q_seqlens] + for r, l in zip(seq_ranges, q_seqlens): + r[l:] = -max_kv_len + seq_ranges = torch.stack(seq_ranges, dim=0).cuda() + kv_ranges = [torch.arange(max_kv_len) for _ in kv_seqlens] + kv_ranges = torch.stack(kv_ranges, 0).cuda() + mask = (kv_ranges[:, None, :] - seq_ranges[:, :, None] > + history_lens[:, None, None]) + return mask.float() * neg_val + else: + q_mask = torch.arange(max_seq_len)[None].cuda() < q_seqlens[:, None] + k_mask = torch.arange(max_kv_len)[None].cuda() < kv_seqlens[:, None] + mask = q_mask[:, :, None] & k_mask[:, None, :] + + return (~mask).float() * neg_val def _naive_attention(batched_q, batched_kv, bias): @@ -100,6 +106,10 @@ def num_heads_q(self, request): def num_heads_k(self, request): yield request.param + @pytest.fixture + def causal(self, request): + yield request.param + @pytest.fixture def q_seqlens(self, request): yield torch.tensor(request.param, device='cuda') @@ -138,8 +148,8 @@ def batched_kv(self, q_seqlens, history_lens, num_heads_k, head_dim_k, head_dim_v, dtype): torch.manual_seed(123) batch_size = len(q_seqlens) - full_seq_lens = q_seqlens + history_lens - max_seq_len = full_seq_lens.max().item() + kv_seqlens = q_seqlens + history_lens + max_seq_len = kv_seqlens.max().item() k = torch.rand(batch_size, max_seq_len, num_heads_k, @@ -167,9 +177,9 @@ def conti_kv(self, kv_seqlens, batched_kv): yield (conti_k, conti_v) @pytest.fixture - def mask(self, q_seqlens, history_lens): + def mask(self, q_seqlens, history_lens, causal): neg_val = -1e30 - yield _make_bias(q_seqlens, history_lens, neg_val) + yield _make_bias(q_seqlens, history_lens, neg_val, causal) @pytest.fixture def gt(self, batched_q, batched_kv, mask): @@ -183,11 +193,13 @@ def conti_gt(self, gt, 
q_seqlens): @pytest.mark.parametrize('head_dim_v', [32], indirect=True) @pytest.mark.parametrize('num_heads_q', [8, 2], indirect=True) @pytest.mark.parametrize('num_heads_k', [2], indirect=True) + @pytest.mark.parametrize('causal', [True, False], indirect=True) @pytest.mark.parametrize(['q_seqlens', 'history_lens'], [([30, 50, 70, 90], [50, 40, 30, 20])], indirect=True) def test_flash_attention(self, conti_q, conti_kv, q_start_loc, q_seqlens, - kv_start_loc, kv_seqlens, head_dim_v, conti_gt): + kv_start_loc, kv_seqlens, head_dim_v, causal, + conti_gt): from lmdeploy.pytorch.kernels.cuda.flashattention import \ flash_attention_fwd max_seq_len = q_seqlens.max().item() @@ -202,7 +214,8 @@ def test_flash_attention(self, conti_q, conti_kv, q_start_loc, q_seqlens, q_seqlens=q_seqlens, kv_start_loc=kv_start_loc, kv_seqlens=kv_seqlens, - max_seqlen=max_seq_len) + max_seqlen=max_seq_len, + causal=causal) torch.testing.assert_close(out, conti_gt, atol=1e-3, rtol=1e-5) @pytest.fixture diff --git a/tests/test_lmdeploy/test_vl_encode.py b/tests/test_lmdeploy/test_vl/test_vl_encode.py similarity index 100% rename from tests/test_lmdeploy/test_vl_encode.py rename to tests/test_lmdeploy/test_vl/test_vl_encode.py diff --git a/tests/test_lmdeploy/test_vl_template.py b/tests/test_lmdeploy/test_vl_template.py deleted file mode 100644 index cf8abf9e44..0000000000 --- a/tests/test_lmdeploy/test_vl_template.py +++ /dev/null @@ -1,132 +0,0 @@ -import PIL - -from lmdeploy.model import MODELS -from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.templates import VLChatTemplateWrapper - - -def test_prompt_to_messages(): - model = MODELS.get('llava-v1')() - templtae = VLChatTemplateWrapper(model) - out = templtae.prompt_to_messages('hi') - assert isinstance(out, list) and isinstance(out[0], dict) - im = PIL.Image.new(mode='RGB', size=(200, 200)) - out = templtae.prompt_to_messages(('hi', [im])) - assert isinstance(out, list) and isinstance(out[0], dict) - - -def test_messages2prompt(): - model = MODELS.get('llava-v1')() - templtae = VLChatTemplateWrapper(model) - messages = [ - dict(role='user', - content=[ - dict(type='text', text='q1'), - dict(type='image_url', image_url=dict(url='xxx')) - ]) - ] - prompt = templtae.messages2prompt(messages) - assert isinstance(prompt, str) - assert prompt.count(IMAGE_TOKEN) == 1 - expected = ( - 'A chat between a curious human and an artificial intelligence ' - 'assistant. The assistant gives helpful, detailed, and polite ' - "answers to the human's questions. USER: " - '\nq1 ASSISTANT:') - assert prompt == expected - - messages.append({'role': 'assistant', 'content': 'a1'}) - messages.append({'role': 'user', 'content': 'q2'}) - prompt = templtae.messages2prompt(messages) - expected = ( - 'A chat between a curious human and an artificial intelligence ' - 'assistant. The assistant gives helpful, detailed, and polite ' - "answers to the human's questions. 
USER: " - '\nq1 ASSISTANT: a1USER: q2 ASSISTANT:') - assert prompt == expected - - -def test_internvl2_conv(): - # https://huggingface.co/OpenGVLab/InternVL2-8B/blob/3bfd3664dea4f3da628785f5125d30f889701253/conversation.py - from transformers.dynamic_module_utils import get_class_from_dynamic_module - get_conv_template = get_class_from_dynamic_module( - 'conversation.get_conv_template', 'OpenGVLab/InternVL2-8B') - template = get_conv_template('internlm2-chat') - question1 = 'question1' - template.append_message(template.roles[0], question1) - template.append_message(template.roles[1], None) - model = MODELS.get('internvl2-internlm2')() - messages = [dict(role='user', content=question1)] - assert template.get_prompt() == model.messages2prompt(messages) - - answer1 = 'answer1' - template.messages[-1][1] = answer1 - question2 = 'question2' - template.append_message(template.roles[0], question2) - template.append_message(template.roles[1], None) - messages.append(dict(role='assistant', content=answer1)) - messages.append(dict(role='user', content=question2)) - assert template.get_prompt() == model.messages2prompt(messages) - - -def test_llava_conv_chatml_direct(): - model = MODELS.get('llava-chatml')() - templtae = VLChatTemplateWrapper(model) - messages = [ - dict(role='user', - content=[ - dict(type='text', text='q1'), - dict(type='image_url', image_url=dict(url='xxx')) - ]) - ] - - prompt = templtae.messages2prompt(messages) - expected = ('<|im_start|>system\nAnswer the questions.<|im_end|>' - '<|im_start|>user\n\nq1<|im_end|>' - '<|im_start|>assistant\n') - assert prompt == expected - - messages.append({'role': 'assistant', 'content': 'a1'}) - messages.append({'role': 'user', 'content': 'q2'}) - prompt = templtae.messages2prompt(messages) - expected = ('<|im_start|>system\nAnswer the questions.<|im_end|>' - '<|im_start|>user\n\nq1<|im_end|>' - '<|im_start|>assistant\na1<|im_end|>' - '<|im_start|>user\nq2<|im_end|>' - '<|im_start|>assistant\n') - assert prompt == expected - - -def test_custom_image_token(): - from lmdeploy.vl.templates import DeepSeekVLChatTemplateWrapper - model = MODELS.get('deepseek-vl')() - template = DeepSeekVLChatTemplateWrapper(model) - - def create_user(query: str): - item = dict(role='user', content=[dict(type='text', text=query)]) - num = query.count(IMAGE_TOKEN) - for _ in range(num): - item['content'].append( - dict(type='image_url', image_url=dict(url='xxx'))) - return item - - def create_assistant(response: str): - return dict(role='assistant', content=response) - - messages = [create_user(f'{IMAGE_TOKEN} q1')] - prompt = template.messages2prompt(messages) - expected = ('You are a helpful language and vision assistant. You are able' - ' to understand the visual content that the user provides, and' - ' assist the user with a variety of tasks using natural ' - 'language.\n\nUser: q1\n\nAssistant:') - assert prompt == expected - - messages.append(create_assistant('a1')) - messages.append(create_user(f'q2 {IMAGE_TOKEN}')) - prompt = template.messages2prompt(messages) - expected = ('You are a helpful language and vision assistant. 
You are able' - ' to understand the visual content that the user provides, and' - ' assist the user with a variety of tasks using natural ' - 'language.\n\nUser: q1\n\nAssistant: ' - 'a1<|end▁of▁sentence|>User: q2 \n\nAssistant:') - assert prompt == expected From abd90dbc90d263a054246aaa7e5f6ff92af27d22 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Mon, 16 Dec 2024 12:14:40 +0800 Subject: [PATCH 119/122] Fix llama3.1 chat template (#2862) * Update model.py * Update test_model.py --- lmdeploy/model.py | 2 +- tests/test_lmdeploy/test_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lmdeploy/model.py b/lmdeploy/model.py index a4355ea131..4bbbca2298 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -847,7 +847,7 @@ def __init__( - Only call one function at a time - Put the entire function call reply on one line" - Always add your sources when using search results to answer the user query\n\n""", # noqa - knowledge='Cutting Knowledge Date: December 2023\nToday Date: 23 Jul 2024\n\n', + knowledge='Cutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n', meta_instruction='You are a helpful assistant.', ipython='<|start_header_id|>ipython<|end_header_id|>\n\n', eoi='<|eot_id|>', diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py index 3b78053a74..0e53283a87 100644 --- a/tests/test_lmdeploy/test_model.py +++ b/tests/test_lmdeploy/test_model.py @@ -220,7 +220,7 @@ def test_llama3_1(): }, }] actual_prompt = model.messages2prompt(messages, tools=tools) - expected_prompt = '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 23 Jul 2024\n\n# Tool Instructions\n- Always execute python code in messages that you share.\n- When looking for real time information use relevant functions if available else fallback to brave_search\n\n\n\nYou have access to the following functions:\n\nUse the function \'spotify_trending_songs\' to: Get top trending songs on Spotify\n{"name": "spotify_trending_songs", "description": "Get top trending songs on Spotify", "parameters": {"n": {"param_type": "int", "description": "Number of trending songs to get", "required": true}}}\n\n\nIf a you choose to call a function ONLY reply in the following format:\n<{start_tag}={function_name}>{parameters}{end_tag}\nwhere\n\nstart_tag => ` a JSON dict with the function argument name as key and function argument value as value.\nend_tag => ``\n\nHere is an example,\n{"example_name": "example_value"}\n\nReminder:\n- Function calls MUST follow the specified format\n- Required parameters MUST be specified\n- Only call one function at a time\n- Put the entire function call reply on one line"\n- Always add your sources when using search results to answer the user query\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCan you check the top 5 trending songs on spotify?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa + expected_prompt = '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n# Tool Instructions\n- Always execute python code in messages that you share.\n- When looking for real time information use relevant functions if available else fallback to brave_search\n\n\n\nYou have access to the following functions:\n\nUse the function \'spotify_trending_songs\' to: Get top trending songs on Spotify\n{"name": 
"spotify_trending_songs", "description": "Get top trending songs on Spotify", "parameters": {"n": {"param_type": "int", "description": "Number of trending songs to get", "required": true}}}\n\n\nIf a you choose to call a function ONLY reply in the following format:\n<{start_tag}={function_name}>{parameters}{end_tag}\nwhere\n\nstart_tag => ` a JSON dict with the function argument name as key and function argument value as value.\nend_tag => ``\n\nHere is an example,\n{"example_name": "example_value"}\n\nReminder:\n- Function calls MUST follow the specified format\n- Required parameters MUST be specified\n- Only call one function at a time\n- Put the entire function call reply on one line"\n- Always add your sources when using search results to answer the user query\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCan you check the top 5 trending songs on spotify?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa assert actual_prompt == expected_prompt From 1efed796eeb2555e5194b7a99356100aaeac980e Mon Sep 17 00:00:00 2001 From: tangzhiyi11 Date: Mon, 16 Dec 2024 15:46:27 +0800 Subject: [PATCH 120/122] [dlinfer] only compile language_model in vl models (#2893) Co-authored-by: jinminxi104 --- .../backends/dlinfer/ascend/graph_runner.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py index f9664f13ff..e3c5dc4d5e 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/graph_runner.py @@ -33,10 +33,17 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, dlinfer.graph.config.enable_graph_mode = True self.patch_kernels_custom_op() self.patch_kvcache_static_shape() - self.model = torch.compile(self.model, - fullgraph=True, - dynamic=True, - backend='atbgraph') + if hasattr(self.model, 'language_model'): + self.model.language_model = torch.compile( + self.model.language_model, + fullgraph=True, + dynamic=True, + backend='atbgraph') + else: + self.model = torch.compile(self.model, + fullgraph=True, + dynamic=True, + backend='atbgraph') def check_enable_graph(self): """check enable graph.""" From 8afb84c1243395c1573bf90bdf7c73fd3549e2be Mon Sep 17 00:00:00 2001 From: q yao Date: Tue, 17 Dec 2024 21:40:09 +0800 Subject: [PATCH 121/122] Optimize tp broadcast (#2889) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor VL modules for internvl and qwen2-vl (#2764) * qwen2-vl * internvl * qwen2 * Refactor VL modules for glm4v, deepseek-vl, llava-hf, cogvlm (#2772) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * upate internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * Refactor VL modules for qwen-vl, llava and llava_next (#2773) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * upate internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * Refactor VL modules for qwen2-vl (#2777) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * 
update internvl * internvl_llava * llava * glm4v * update internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * qwen2 * Fix side-effect to internvl (#2778) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * update internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * qwen2 * fix internvl * Refactor VL modules for phi3-vision (#2779) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * update internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * qwen2 * fix internvl * phi3-vision * Refactor VL modules for mllama and yi-vl (#2781) * qwen2-vl * internvl * qwen2 * get image_tokens_per_patch for internvl2 * deepseek-vl * cogvlm * glm4v * update internvl * internvl_llava * llava * glm4v * update internvl * cogvlm * deepseek * llava_hf * rollback llava, internvl-llava * refactor qwen * update internvl * update llava_hf * update qwen2-vl * llava_next * update llava_next * update llava * update llava * update llava * qwen2 * fix internvl * phi3-vision * refactor yi-vl * refactor mllama * Refactor VLM module for minicpm and molmo (#2794) * Refactor VLM modules for xcomposer series (#2796) * Refactor VLM modules for internvl-llava (#2797) * Refactor VLM modules v2 (#2806) * internvl2 v2 * cogvlm * deepseek-vl * glm-4v * llava-hf * llava-next * llava * internvl-llava * mllama * phi3-vision * qwen * qwen2 * yi-vl * xcomposer * minicpm * molmo * update * update * Remove vl template (#2809) * Resolve conflicts (#2811) * feature: support qwen2.5 function_call (#2737) * feat: support qwen2.5 tools_call * fix: npe bug * fix: template mismatch * fix: adopting review suggestions * fix: adopting review suggestions * fix: adopting review suggestions * fix: adopting review suggestions * feat: Support multi tools calling * feat: Support multi tools calling * fix: Add '\n' between each tool * fix: Add ensure_ascii=False * bugfix: rfind * bugfix: tools_call -> tool_calls * bugfix: add toolName in tool_response * fix: some '\n' error * fix: remove toolname * fix: replace '\n' to self.separator * feat: add doc with multiple tool calling * fix: update doc * feat: add qwen2.5 prompt template test * feat: add qwen2.5 no tool call prompt test --------- Co-authored-by: gaozixiang * Update supported models & Ascend doc (#2765) * update ascend supported model list * fix markdown * fix markdown * fix lint * Update get_started.md * Update get_started.md * [CI] Split vl testcases into turbomind and pytorch backend (#2751) * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * [Feature] support minicpm-v_2_6 for pytorch engine. (#2767) * support minicpmv_2_6. * update supported_models. * update supported_models.
* Support qwen2-vl AWQ quantization (#2787) * Support qwen2-vl AWQ quantization * Update config.yaml * [dlinfer] Fix qwenvl rope error for dlinfer backend (#2795) * Optimize update_step_ctx on Ascend (#2804) * opt update_ctx for ascend * fix lint * PytorchEngine refactor multimodal (#2742) * WIP * support mrope * support long context * support causal=false * fix mask * flash attn bound * optimize * Moskau, Moskau, wirf die Gläser an die Wand * YMCA * optimize mllama * update processor * support cogvlm * all work and no play make jack a dull boy * upgrade triton * support qwen2vl * support internvl * phi3-v WIP * glm4v WIP * support chatglm and cogvlm * use image tokens * support llava * support internvl-mono * phi3v, mllama * add llavanext * use img token ids * support multiimage chatglm cogvlm * fix ut * minor-fix * minor-fix (#2813) * fix * fix mono * fix docs * read norm_type * super().collect_images->self.collect_images * add note in supported models * define the parameters clearly * better streaming * fix molmo * Fix vision model batch inference (#2868) * remove forward from vl models that are not supported by tm * support max_batch_size * fix * warn glm4v does not support multi images * unconst * fix deepseek-vl * fix internvl * fix llava * fix minicpm 2.6 * fix callback * fix minicpm v2.5 * fix minicpm v2.6 * update llava_next.py * remove hardcode from xcomposer2.py * rollback supported_models * change to staticmethod * optimize tp * fix vlm quantization * update doc * update --- lmdeploy/pytorch/engine/engine.py | 3 +- lmdeploy/pytorch/engine/model_agent.py | 36 +++++++++---- lmdeploy/pytorch/model_inputs.py | 64 +++++++++++++++++++++++- lmdeploy/pytorch/multimodal/data_type.py | 63 +++++++++++++++++++++-- 4 files changed, 149 insertions(+), 17 deletions(-) diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index afa350330c..e06e0cf80a 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -782,8 +782,7 @@ def __update_inputs(next_token_ids): logger.debug(': ' f'batch_size={inputs.seq_length.size(0)} ' f'num_tokens={inputs.input_ids.size(-1)}') - if self.gpu_count == 1: - inputs = inputs.to_device('cuda') + inputs = inputs.to_device('cuda') is_decoding = inputs.is_decoding if all_ids is not None: all_ids = all_ids.cuda() diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 999fa135cc..7df0eeb021 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -135,7 +135,6 @@ def model_forward( stream = stream or torch.cuda.current_stream() with torch.cuda.stream(stream): # forward - inputs = inputs.to_device('cuda') ctx_mgr = model.ctx_mgr context = ctx_mgr.build_context( inputs=inputs, @@ -372,14 +371,26 @@ def _broadcast_config(cache_config): return patched_model, cache_engine, cache_config -def _broadcast_inputs(rank: int, inputs: Any, stream: torch.cuda.Stream): +def _broadcast_inputs(rank: int, inputs: Any, group: dist.group, + stream: torch.cuda.Stream): """get input tensor parallel.""" # broadcast meta info if rank != 0: inputs = [None, None, None] + else: + device_inputs = inputs[0] + meta_inputs = device_inputs.to_device('meta') + inputs[0] = meta_inputs with torch.cuda.stream(stream): - dist.broadcast_object_list(inputs) + dist.broadcast_object_list(inputs, group=group) + if rank == 0: + device_inputs.broadcast() + else: + device_inputs = inputs[0].broadcast() + + inputs[0] = device_inputs + return inputs @@ -392,6 +403,7 @@ 
def _tp_model_loop( adapters: Dict[str, str], world_size: int, barrier: mp.Barrier, + cpu_group: dist.group, ): """Start model loops for tensor parallel model inference. @@ -417,11 +429,12 @@ def _tp_model_loop( while True: barrier.wait() inputs, swap_in_map, swap_out_map = _broadcast_inputs( - rank, None, stream) + rank, None, cpu_group, stream) cache_swapping(cache_engine, swap_in_map=swap_in_map, swap_out_map=swap_out_map) + inputs = inputs.to_device('cuda') model_forward( patched_model, @@ -453,10 +466,13 @@ def _start_tp_process(proc_id: int, try: from lmdeploy.pytorch.check_env import check_env_deeplink check_env_deeplink(device_context.device_type) + timeout = timedelta(days=35600) dist.init_process_group('nccl', rank=rank, world_size=world_size, - timeout=timedelta(days=35600)) + timeout=timeout) + cpu_group = dist.new_group(timeout=timeout, backend='gloo') + kwargs['cpu_group'] = cpu_group dist_ctx = DistContext(rank=rank, world_size=world_size) torch.cuda.set_device(rank) with get_dist_manager().context(dist_ctx), get_device_manager( @@ -626,12 +642,15 @@ def _start_sub_process(self, model_path: str, model_config: ModelConfig, rank = 0 try: + timeout = timedelta(days=35600) dist.init_process_group('nccl', rank=rank, world_size=world_size, - timeout=timedelta(days=35600)) + timeout=timeout) + cpu_group = dist.new_group(timeout=timeout, backend='gloo') dist_ctx = DistContext(rank=rank, world_size=world_size) self._dist_ctx = dist_ctx + self._cpu_group = cpu_group except Exception as e: from traceback import print_exc logger.error(f'Rank[{rank}] failed.') @@ -673,7 +692,8 @@ def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, self.mp_bar.wait() rank = 0 _broadcast_inputs(rank, [inputs, swap_in_map, swap_out_map], - self.stream) + self._cpu_group, self.stream) + cache_swapping(self.cache_engine, swap_in_map=swap_in_map, swap_out_map=swap_out_map) @@ -699,8 +719,6 @@ async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_in_map=swap_in_map, swap_out_map=swap_out_map) await asyncio.sleep(0) - while not self.stream.query(): - await asyncio.sleep(0) return output def get_logits(self, hidden_states: torch.Tensor): diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index b5b74e4f02..d10da8557a 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -4,12 +4,21 @@ from typing import Any, Dict, List, Literal import torch +from torch import distributed as dist from lmdeploy.pytorch.backends import get_backend from lmdeploy.pytorch.config import ModelConfig from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +def _broadcast_tensor(value: torch.Tensor, src: int = 0, device: str = 'cuda'): + """broadcast tensor.""" + if value.device.type == 'meta': + value = torch.empty_like(value, device=device) + dist.broadcast(value, src) + return value + + @dataclass class VisionModelInputs: """Vision model inputs.""" @@ -36,10 +45,45 @@ def to_device(self, device: str): elif k == 'input_embeddings': v = [[e.to(device) for e in li] for li in v] elif k == 'input_multimodals': + new_v = [] for mm_datas in v: + new_mm_datas = dict() for modal_type, data in mm_datas.items(): data = [d.to_device(device) for d in data] - mm_datas[modal_type] = data + new_mm_datas[modal_type] = data + new_v.append(new_mm_datas) + v = new_v + out_dict[k] = v + + return VisionModelInputs(**out_dict) + + def broadcast(self): + """broadcast inputs. 
+ + Do `dist.broadcast_object_list(inputs.to_device('meta'))` + before broadcast tensors. + """ + out_dict = dict() + for f in fields(self): + k = f.name + v = getattr(self, k) + if v is None: + continue + if isinstance(v, torch.Tensor): + v = _broadcast_tensor(v) + elif k == 'input_embedding_ranges': + v = [_broadcast_tensor(e) for e in v] + elif k == 'input_embeddings': + v = [[_broadcast_tensor(e) for e in li] for li in v] + elif k == 'input_multimodals': + new_v = [] + for mm_datas in v: + new_mm_datas = dict() + for modal_type, data in mm_datas.items(): + data = [d.broadcast() for d in data] + new_mm_datas[modal_type] = data + new_v.append(new_mm_datas) + v = new_v out_dict[k] = v return VisionModelInputs(**out_dict) @@ -202,6 +246,24 @@ def to_device(self, device: str): return ModelInputs(**out_dict) + def broadcast(self): + """broadcast inputs. + + Do `dist.broadcast_object_list(inputs.to_device('meta'))` + before broadcast tensors. + """ + out_dict = dict() + for f in fields(self): + k = f.name + v = getattr(self, k) + if isinstance(v, torch.Tensor): + v = _broadcast_tensor(v) + elif isinstance(v, VisionModelInputs): + v = v.broadcast() + out_dict[k] = v + + return ModelInputs(**out_dict) + @dataclass class StepContext: diff --git a/lmdeploy/pytorch/multimodal/data_type.py b/lmdeploy/pytorch/multimodal/data_type.py index 95ec72d26e..886c7ffbd0 100644 --- a/lmdeploy/pytorch/multimodal/data_type.py +++ b/lmdeploy/pytorch/multimodal/data_type.py @@ -1,8 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -from dataclasses import dataclass +from dataclasses import dataclass, fields from typing import Any, Dict, List, Union +import torch from torch import Tensor +from torch import distributed as dist class MultiModalData: @@ -14,6 +16,14 @@ class MultiModalData: NestedTensor = Union[Tensor, List[Tensor]] +def _broadcast_tensor(value: torch.Tensor, src: int = 0, device: str = 'cuda'): + """broadcast tensor.""" + if value.device.type == 'meta': + value = torch.empty_like(value, device=device) + dist.broadcast(value, src) + return value + + @dataclass class MultiModalTensor: data: NestedTensor @@ -28,24 +38,67 @@ def __post_init__(self): def to_device(self, device: str, non_blocking: bool = False): """to device.""" + out_dict = dict() + for f in fields(self): + k = f.name + if k in ('data', 'meta'): + continue + v = getattr(self, k) + out_dict[k] = v + if isinstance(self.data, Tensor): - self.data = self.data.to(device=device, non_blocking=non_blocking) + data = self.data.to(device=device, non_blocking=non_blocking) else: data = [ d.to(device=device, non_blocking=non_blocking) for d in self.data ] - self.data = data + out_dict['data'] = data + new_meta = None if self.meta is not None: + new_meta = dict() for k, v in self.meta.items(): if isinstance(v, Tensor): v = v.to(device=device, non_blocking=non_blocking) - self.meta[k] = v elif hasattr(v, 'to_device'): v = v.to_device(device=device, non_blocking=non_blocking) + new_meta[k] = v + + out_dict['meta'] = new_meta + return MultiModalTensor(**out_dict) + + def broadcast(self): + """broadcast inputs tensors.""" + out_dict = dict() + for f in fields(self): + k = f.name + if k in ('data', 'meta'): + continue + v = getattr(self, k) + out_dict[k] = v + + if isinstance(self.data, Tensor): + data = _broadcast_tensor(self.data) + else: + data = [_broadcast_tensor(d) for d in self.data] + out_dict['data'] = data + + new_meta = None + if self.meta is not None: + new_meta = dict() + for k, v in self.meta.items(): + if isinstance(v, Tensor): + v = 
_broadcast_tensor(v) + self.meta[k] = v + elif hasattr(v, 'to_device'): + assert hasattr(v, 'broadcast') + v = v.broadcast() self.meta[k] = v - return self + new_meta[k] = v + + out_dict['meta'] = new_meta + return MultiModalTensor(**out_dict) MultiModalInputs = Dict[str, List[MultiModalTensor]] From bafa3d25f45f3d0dccb0ac9ae0c34e5e776c0b03 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Wed, 18 Dec 2024 11:52:41 +0800 Subject: [PATCH 122/122] unfreeze torch version in dockerfile (#2906) --- docker/Dockerfile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index caa58ee637..24b2b055da 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -10,9 +10,6 @@ FROM ${CUDA_VERSION} AS final ARG PYTHON_VERSION=3.10 -ARG TORCH_VERSION=2.3.0 -ARG TORCHVISION_VERSION=0.18.0 - RUN apt-get update -y && apt-get install -y software-properties-common wget vim git curl openssh-server ssh sudo &&\ curl https://sh.rustup.rs -sSf | sh -s -- -y &&\ add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \ @@ -43,7 +40,6 @@ ENV LD_LIBRARY_PATH=/usr/local/nccl/lib:$LD_LIBRARY_PATH RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install --upgrade pip setuptools==69.5.1 &&\ - python3 -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} --index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT} &&\ python3 -m pip install cmake packaging wheel ENV NCCL_LAUNCH_MODE=GROUP @@ -54,7 +50,7 @@ COPY . /opt/lmdeploy WORKDIR /opt/lmdeploy RUN --mount=type=cache,target=/root/.cache/pip cd /opt/lmdeploy &&\ - python3 -m pip install -r requirements.txt &&\ + python3 -m pip install -r requirements_cuda.txt --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT} &&\ mkdir -p build && cd build &&\ sh ../generate.sh &&\ ninja -j$(nproc) && ninja install &&\