diff --git a/autotest/config.yaml b/autotest/config.yaml index 6c92d2cf0b..30d54fa952 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -62,6 +62,7 @@ turbomind_chat_model: - liuhaotian/llava-v1.6-vicuna-7b - deepseek-ai/deepseek-vl-1.3b-chat - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf - THUDM/glm-4-9b-chat - openbmb/MiniCPM-Llama3-V-2_5 @@ -167,6 +168,7 @@ turbomind_quatization: - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf gptq: - internlm/internlm2_5-7b-chat diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index b0e513410e..1fb5fa0964 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -114,14 +114,14 @@ broadCastRequest(const std::vector& v_start_ids, } else { // conditional case. - ft::deviceMalloc(&d_input_ids, size_1, false); + ft::deviceMalloc(&d_input_ids, size_1, nullptr, false); // ft::deviceMalloc(&d_input_lengths, size_2, false); ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1); // ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2); } if (!v_input_bad_words.empty()) { - ft::deviceMalloc(&d_input_bad_words, size_bad_words, false); + ft::deviceMalloc(&d_input_bad_words, size_bad_words, nullptr, false); ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words); } else { diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index c724b085a0..e483500e96 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -2,6 +2,7 @@ import inspect import json from dataclasses import asdict, fields +from typing import List # use pydantic.dataclasses.dataclass to check data type from pydantic.dataclasses import dataclass @@ -43,22 +44,33 @@ class ModelConfig: # of token_embedding embedding_size: int = 0 num_layer: int = None - inter_size: int = None + inter_size: List[int] = None norm_eps: float = None attn_bias: int = 0 start_id: int = None end_id: int = None size_per_head: int = 128 - group_size: int = 0 + group_size: int = 64 weight_type: str = None session_len: int = None tp: int = 1 model_format: str = 'hf' - expert_num: int = 0 + expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 - moe_shared_gate: int = False - moe_norm_topk: int = False + moe_shared_gate: bool = False + norm_topk_prob: bool = False + routed_scale: float = 1.0 + topk_group: int = 1 + topk_method: str = 'greedy' + moe_group_num: int = 1 + # MLA + q_lora_rank: int = 0 + kv_lora_rank: int = 0 + qk_rope_dim: int = 0 + v_head_dim: int = 0 + # tuning + tune_layer_num: int = 1 def verify(self): invalid = {} @@ -72,6 +84,7 @@ def verify(self): class AttentionConfig: rotary_embedding: int = 128 rope_theta: float = 10000.0 + softmax_scale: float = 0 attention_factor: float = None max_position_embeddings: int = 0 original_max_position_embeddings: int = 0 diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 1c847ede01..77f0bc8dc8 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -241,11 +241,10 @@ def get_tm_model(model_path, engine_config.model_format = quant_method group_size = _group_size - # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) - if not 
group_size: - group_size = 128 - if engine_config.model_format in ['awq', 'gptq']: + # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) + if not group_size: + group_size = 128 assert group_size == 128, \ f'model format is "{engine_config.model_format}" ' \ f'but group_size is {group_size}. Currently, only 128 ' \ diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py index e3d79b164a..94e779b6b7 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/deploy/loader.py @@ -88,6 +88,27 @@ def items(self): yield (-1, {k: f.get_tensor(k) for k in misc}) assert not params + # def items(self): + # params = defaultdict(dict) + # for shard in self.shards: + # # with safe_open(shard, 'pt') as f: + # with open(shard, 'rb') as f: + # w = safetensors.torch.load(f.read()) + # misc = [] + # for k in w.keys(): + # match = re.findall(self.pattern, k) + # if not match: + # misc.append(k) + # else: + # idx = int(match[0]) + # param = params[idx] + # param[k] = w[k] + # if len(param) == self.item_count[idx]: + # yield (idx, params.pop(idx)) + # if misc: + # yield (-1, {k: w[k] for k in misc}) + # assert not params + class PytorchLoader(BaseLoader): diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 8d998abe2b..52497175ef 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -96,10 +96,13 @@ class Ffn(Module): def __init__(self, model: BaseOutputModel): self.model = model self.tp = model.tensor_para_size + # inter_sizes in config are padded and may be different from what's + # in the weights self.inter_size = model.model_config.inter_size self.group_size = max(1, model.model_config.group_size) def _export(self, + inter_size: int, fmt: str, idx: int, w123, @@ -110,11 +113,11 @@ def _export(self, w1, w2, w3 = map(transpose, w123) if not is_lora_a: - w1 = pad_out_dims(w1, self.inter_size) - w3 = pad_out_dims(w3, self.inter_size) + w1 = pad_out_dims(w1, inter_size) + w3 = pad_out_dims(w3, inter_size) if not is_lora_b: group_size = self.group_size if apply_gs else 1 - w2 = pad_in_dims(w2, self.inter_size // group_size) + w2 = pad_in_dims(w2, inter_size // group_size) w1, w2, w3 = map(pack_fn, (w1, w2, w3)) self.model.save_split(w1, @@ -132,7 +135,8 @@ def _export(self, def apply(self, i: int, r: BaseReader): for e in get_params(r.ffn(i, None)): - e(partial(self._export, self._ffn), partial(r.ffn, i), i) + e(partial(self._export, self.inter_size[i], self._ffn), + partial(r.ffn, i), i) class MoeFfn(Ffn): @@ -154,11 +158,13 @@ def __init__(self, model: BaseOutputModel): self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): + if self.expert_num[i] == 0: + return for p in get_params(r.moe_ffn_expert()): - for e in range(self.expert_num): + for e in range(self.expert_num[i]): fmt = self._moe_ffn_expert.replace('E', str(e)) - p(partial(self._export, fmt), partial(r.moe_ffn_expert, e, i), - i) + p(partial(self._export, self.inter_size, fmt), + partial(r.moe_ffn_expert, e, i), i) gate = transpose(r.moe_ffn_gate(i)) self.model.save_split(gate, self._moe_ffn_gate.format(i)) @@ -218,6 +224,62 @@ def apply(self, i: int, r: BaseReader): e(self._export, partial(r.attn, i), i) +class MLA(Module): + """ + requires: + r.mla(i, kind) + r.mla_norm(i) + """ + + _mla = 'layers.{0}.attention.{1}.{2}' + + def __init__(self, model: BaseOutputModel): + self.model = model + + def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): + if all(x is 
None for x in xs): + return + q_a, q_b, q, kv_a, kv_b, o = map(transpose, xs) + + if q is not None: + q_b = q + + cfg = self.model.model_config + + o = o.reshape(cfg.head_num, cfg.v_head_dim, -1) + o = torch.nn.functional.pad( + o, (0, 0, 0, cfg.size_per_head - cfg.v_head_dim, 0, 0)) + o = o.view(cfg.head_num * cfg.size_per_head, cfg.hidden_units) + + if q_a is not None: + self.model.save_split(pack_fn(q_a), + self._mla.format(idx, 'q_a_proj', kind)) + q_b_name = 'q_proj' if q_a is None else 'q_b_proj' + self.model.save_split(pack_fn(q_b), + self._mla.format(idx, q_b_name, kind), + split_dim=-1) + self.model.save_split(pack_fn(kv_a), + self._mla.format(idx, 'kv_a_proj', kind)) + self.model.save_split(pack_fn(kv_b), + self._mla.format(idx, 'kv_b_proj', kind), + split_dim=-1) + self.model.save_split(pack_fn(o), + self._mla.format(idx, 'wo', kind), + split_dim=0) + + _layernorm = 'layers.{0}.attention.{1}_a_layernorm' + + def apply(self, i: int, r: BaseReader): + + for f in get_params(r.attn(i, None), bias=False): + f(self._export, partial(r.mla, i), i) + + q, k = r.mla_norm(i) + if q is not None: + self.model.save_split(q, self._layernorm.format(i, 'q')) + self.model.save_split(k, self._layernorm.format(i, 'kv')) + + class Misc(Module): """ requires: @@ -258,7 +320,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - modules = [Attn, LayerNorm] + modules = [LayerNorm] + if model.model_config.kv_lora_rank: + modules.append(MLA) + else: + modules.append(Attn) if model.model_config.inter_size: modules.append(Ffn) if model.model_config.expert_num: diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index de16bdc0a0..b9394b1244 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 +from .deepseek2 import DeepSeek2Model # noqa: F401 from .deepseek_vl import DeepSeekVLModel # noqa: F401 from .glm4 import Glm4Model # noqa: F401 from .internlm2 import InternLM2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py new file mode 100644 index 0000000000..0023f650ff --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
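+# NOTE: DeepSeek-V2 combines Multi-head Latent Attention (MLA) with a grouped
+# MoE router. The reader below maps the HF checkpoint layout (q_a_proj/q_b_proj
+# or q_proj, kv_a_proj_with_mqa/kv_b_proj, mlp.experts.*, mlp.shared_experts.*)
+# onto TurboMind's export schema, and model_info() translates config.json
+# fields (kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, n_routed_experts,
+# n_group, topk_group, routed_scaling_factor, ...) into ModelConfig. The
+# effective attention head size is qk_nope_head_dim + qk_rope_head_dim
+# (128 + 64 = 192 for the released DeepSeek-V2 checkpoints), and the first
+# layer keeps a dense FFN while the remaining layers use routed plus shared
+# experts.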
+import math + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class DeepSeek2Reader(LlamaReader): + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(r'mlp' if i == 0 else r'shared_expert\.') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.shared_experts.{key}_proj.{kind}' + if i == 0: + name = name.replace('shared_experts.', '') + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla(self, i: int, kind: str): + if not kind: + return self.filter(r'self_attn.*proj') + result = [] + for key in [ + 'q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', + 'kv_b_proj', 'o_proj' + ]: + tensor = self.params.get( + f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}') + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla_norm(self, i: int): + result = [] + for k in ['q', 'kv']: + name = f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight' # noqa: E501 + result.append(self.params.get(name)) + return (*result, ) + + +def get_yarn_params(rope_scaling: dict): + + scaling_factor = float(rope_scaling['factor']) + mscale = rope_scaling['mscale'] + mscale_all_dim = rope_scaling['mscale_all_dim'] + + def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + _mscale = float( + yarn_get_mscale(scaling_factor, mscale) / + yarn_get_mscale(scaling_factor, mscale_all_dim)) + + softmax_scale = 0 + if mscale_all_dim: + scale = yarn_get_mscale(scaling_factor, mscale_all_dim) + softmax_scale = scale * scale + + return _mscale, softmax_scale + + +@INPUT_MODELS.register_module(name='deepseek2') +class DeepSeek2Model(LlamaModel): + + Reader = DeepSeek2Reader + + def tokenizer_info(self): + n_words = self.model_config['vocab_size'] + bos_id = self.model_config['bos_token_id'] + eos_id = self.model_config['eos_token_id'] + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + qk_nope_dim = cfg['qk_nope_head_dim'] + qk_rope_dim = cfg['qk_rope_head_dim'] + num_layer = cfg['num_hidden_layers'] + expert_num = cfg['n_routed_experts'] + expert_num = [expert_num] * num_layer + expert_num[0] = 0 + n_shared_experts = cfg['n_shared_experts'] + expert_inter_size = cfg['moe_intermediate_size'] + experts_per_token = cfg['num_experts_per_tok'] + inter_size = [n_shared_experts * expert_inter_size] * num_layer + inter_size[0] = cfg['intermediate_size'] + norm_topk_prob = cfg['norm_topk_prob'] + size_per_head = qk_rope_dim + qk_nope_dim + info.update(kv_lora_rank=cfg['kv_lora_rank'], + q_lora_rank=cfg['q_lora_rank'] or 0, + qk_rope_dim=qk_rope_dim, + v_head_dim=cfg['v_head_dim'], + size_per_head=size_per_head, + rotary_embedding=qk_rope_dim, + expert_num=expert_num, + expert_inter_size=expert_inter_size, + experts_per_token=experts_per_token, + inter_size=inter_size, + norm_topk_prob=norm_topk_prob, + 
routed_scale=cfg['routed_scaling_factor'], + topk_method=cfg['topk_method'], + topk_group=cfg['topk_group'], + moe_group_num=cfg['n_group'], + tune_layer_num=2) + rope_scaling = cfg.get('rope_scaling') + if rope_scaling and rope_scaling['type'] == 'yarn': + attention_factor, softmax_scale = get_yarn_params(rope_scaling) + softmax_scale *= size_per_head**(-0.5) + info.update(max_position_embeddings=rope_scaling[ + 'original_max_position_embeddings'], + attention_factor=attention_factor, + softmax_scale=softmax_scale) + return info diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index ff9df2d409..6ac22a658e 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,6 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] - info['moe_norm_topk'] = True + info['norm_topk_prob'] = True info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 772bd03037..637983e8ce 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -178,6 +178,6 @@ def model_info(self): info['experts_per_token'] = cfg['num_experts_per_tok'] info['inter_size'] = cfg['shared_expert_intermediate_size'] info['moe_shared_gate'] = True - info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['norm_topk_prob'] = cfg['norm_topk_prob'] info['attn_bias'] = 1 return info diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 09699ade09..f2c981bb24 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp from abc import ABC +from collections.abc import Sequence import torch import tqdm @@ -65,13 +66,14 @@ def __init__(self, # get `model_info` and `tokenizer_info` at first, which # will be updated to `self.model_config` and `self.attention_config` self.input_model_info = self.input_model.model_info() + self.input_model_info = self.single_to_list( + self.input_model_info, keys=['inter_size', 'expert_num']) self.input_model_tokenizer_info = self.input_model.tokenizer_info() self.permute_qk = self.input_model_info.get('permute_qk', True) - self.update_model_config() - self.model_config.inter_size = _pad_inter_size( - self.model_config.inter_size, self.model_config.group_size, - self.tensor_para_size) + for i, v in enumerate(self.model_config.inter_size): + self.model_config.inter_size[i] = _pad_inter_size( + v, self.model_config.group_size, self.tensor_para_size) if self.model_config.expert_num: self.model_config.expert_inter_size = _pad_inter_size( self.model_config.expert_inter_size, @@ -79,11 +81,21 @@ def __init__(self, self.model_config.verify() assert self.model_config.kv_head_num % self.tensor_para_size == 0 + # print(self.model_config) + self.update_attention_config() self.update_lora_config() # ! 
Dependency on `self` self.model = model_cls(self) + def single_to_list(self, config: dict, keys): + num_layer = int(config['num_layer']) + for k in keys: + v = config.get(k, None) + if v is not None and not isinstance(v, Sequence): + config[k] = [v] * num_layer + return config + def update_model_config(self): """Update `self.model_config` according to the input_model's `tokenizer_info` and `model_info`""" diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index e66da22df0..11e99edfa0 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -33,6 +33,7 @@ InternVLChatModel='internvl', # deepseek-vl MultiModalityCausalLM='deepseekvl', + DeepseekV2ForCausalLM='deepseek2', # MiniCPMV MiniCPMV='minicpmv', # mini gemini diff --git a/src/turbomind/kernels/CMakeLists.txt b/src/turbomind/kernels/CMakeLists.txt index febb8692dd..40a48402af 100644 --- a/src/turbomind/kernels/CMakeLists.txt +++ b/src/turbomind/kernels/CMakeLists.txt @@ -68,3 +68,4 @@ endif () add_subdirectory(attention) add_subdirectory(gemm) +add_subdirectory(norm) diff --git a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index af9d47e0e6..32de38981a 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -38,6 +38,8 @@ add_library(attention STATIC codegen/decoding_sm80_64_f16_f16.cu codegen/decoding_sm80_64_f16_u4.cu codegen/decoding_sm80_64_f16_u8.cu + codegen/attention_sm80_192.cu + codegen/decoding_sm80_192.cu ) set_property(TARGET attention PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index 3f557234bc..e7642584c2 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ b/src/turbomind/kernels/attention/attention.cu @@ -46,6 +46,12 @@ void dispatchAttention(const AttentionParams& params) else if (params.size_per_head == 128) { return dispatch(std::integral_constant{}); } + + if (params.size_per_head == 192) { + using Config = AttentionConfig; + return invokeAttention(params); + } + FT_CHECK(0); } diff --git a/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu new file mode 100644 index 0000000000..ceeafa7a6d --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu new file mode 100644 index 0000000000..214e6748d9 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu @@ -0,0 +1,20 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
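+// NOTE: this translation unit provides the HeadDim == 192 decoding
+// instantiations needed for MLA-style attention, where the QK head size is
+// qk_nope_head_dim (128) + qk_rope_head_dim (64). decoding.cu dispatches here
+// for size_per_head == 192 with either the native or the int8 KV cache; the
+// int4 KV cache path is rejected there with FT_CHECK_WITH_INFO(...,
+// "not implemented").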
+ +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 1b04b7d4eb..67bd81e45b 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -2,8 +2,8 @@ #include "decoding.h" #include "decoding_config.h" +#include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" -// #include "src/turbomind/utils/dispatch.h" #include #include @@ -113,6 +113,21 @@ void dispatchDecoding(const AttentionParams& params) return false; }; + if (params.size_per_head == 192) { + + if (is_kv_int8) { + invokeDecoding>(params); + } + else if (is_kv_int4) { + FT_CHECK_WITH_INFO(!is_kv_int4, "not implemented"); + // invokeDecoding>(params); + } + else { + invokeDecoding>(params); + } + return; + } + auto success = dispatch(); FT_CHECK(success); diff --git a/src/turbomind/kernels/attention/decoding_config.h b/src/turbomind/kernels/attention/decoding_config.h index 7dcb119cfd..dfd5e07835 100644 --- a/src/turbomind/kernels/attention/decoding_config.h +++ b/src/turbomind/kernels/attention/decoding_config.h @@ -40,7 +40,7 @@ struct DecodingConfig 2) }; template -struct DecodingConfig { +struct DecodingConfig> { static constexpr int Qh = (Qh_ + 7) / 8 * 8; using Attention = Impl; using CacheIter = GetBlockIterFactory; @@ -76,4 +76,14 @@ struct DecodingConfig { using Kernel = AttentionUniversal, CacheIter, DecodingCtaMap>; }; +template +struct DecodingConfig { + static constexpr int Qh = 1; + static constexpr int HeadDim = 192; + + using Attention = Impl; + using CacheIter = GetBlockIterFactory; + using Kernel = AttentionUniversal, Attention>, CacheIter, DecodingCtaMap>; +}; + } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/impl_16816.h b/src/turbomind/kernels/attention/impl_16816.h index 6e8f37f4d4..07c7dcb12b 100644 --- a/src/turbomind/kernels/attention/impl_16816.h +++ b/src/turbomind/kernels/attention/impl_16816.h @@ -63,26 +63,28 @@ struct Impl>, SmemLayoutV2>>; - using SmemLayoutK = std::conditional_t>, SmemLayoutV2>>; - using SmemLayoutV = std::conditional_t>, SmemLayoutV2>>; using SmemLayoutKVp = void; + static constexpr bool kUseSmemQ = false; + static constexpr bool kUseSmemP = false; + + static_assert(!kUseSmemQ, "current smemQ impl yields inconsistent outputs"); + union SharedStorage { __align__(16) T KV[Stages * (SmemLayoutK::kSize + SmemLayoutV::kSize) / 2]; __align__(16) T Q[SmemLayoutQ::kSize]; }; - static constexpr bool kUseSmemQ = false; - static constexpr bool kUseSmemP = false; - using ThreadMapQ = RakedThreadMap; using ThreadMapKV = RakedThreadMap; @@ -109,22 +111,24 @@ struct Impl sQ{smem_Q}; + SmemAccessor sQ{smem_Q}; - // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout - PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { + // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { - const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; - const int di = lane_id / 16 * 8 + k * 16; - ldsm_x4((Array&)frag_Q[k][m], 
cast_smem_ptr_to_uint(&sQ(qi, di))); + for (int m = 0; m < K_M; ++m) { + PRAGMA_UNROLL + for (int k = 0; k < K_K; ++k) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } } } - if constexpr (kUseSmemQ) { + if constexpr (0) { __syncthreads(); // Rearrange Q in smem so that swizzling is not needed for later LDSMs @@ -142,20 +146,25 @@ struct Impl smem_K; + T* smem_Q; FragQ frag_Q; FragK frag_K; __device__ StateQK(SharedStorage& storage, FragQ frag_Q_): smem_K{storage.KV} { - static_assert(!kUseSmemQ, "not implemented"); - PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { + if constexpr (!kUseSmemQ) { PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { - frag_Q[k][m] = frag_Q_[k][m]; + for (int k = 0; k < K_K; ++k) { + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + frag_Q[k][m] = frag_Q_[k][m]; + } } } + else { + smem_Q = storage.Q; + } } __device__ void Load(int k, int pipe_iter) @@ -166,6 +175,16 @@ struct Impl sQ{smem_Q}; + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } + } PRAGMA_UNROLL for (int n = 0; n < K_N; n += 2) { // Load (s16,d16) tiles const int s = n * 8 + offset_s; diff --git a/src/turbomind/kernels/attention/impl_81616.h b/src/turbomind/kernels/attention/impl_81616.h index 3b90bcdf57..f865f1bc3a 100644 --- a/src/turbomind/kernels/attention/impl_81616.h +++ b/src/turbomind/kernels/attention/impl_81616.h @@ -104,7 +104,7 @@ struct Impl) { - return std::conditional_t>, SmemLayoutV2>>{}; } diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index a886185a44..444b67e2c8 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -2,12 +2,16 @@ #pragma once -#include "src/turbomind/kernels/attention/impl.h" +#include +#include +#include + #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/layout.h" #include "src/turbomind/kernels/core/thread_map.h" -#include -#include + +#include "src/turbomind/kernels/attention/impl.h" +#include "src/turbomind/kernels/attention/quantization.h" namespace turbomind::attention { @@ -51,7 +55,7 @@ struct Impl), K_K); }; struct LinearD { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 20bb00fde8..f2e2faef91 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -277,11 +277,14 @@ void invokeProcessKV_v2(char** blocks, }; auto dispatch = [&](auto tkv) { - if (head_dim == 128) { + if (head_dim == 64) { + return invoke(tkv, std::integral_constant{}); + } + else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } - else if (head_dim == 64) { - return invoke(tkv, std::integral_constant{}); + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); } FT_CHECK(0); }; @@ -545,6 +548,9 @@ void invokeFlattenKV_v2(T* k, else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); + } FT_CHECK(0); }; diff --git a/src/turbomind/kernels/attention/mainloop_sm80.h b/src/turbomind/kernels/attention/mainloop_sm80.h index bf0fc1d32a..4435400b70 100644 --- 
a/src/turbomind/kernels/attention/mainloop_sm80.h +++ b/src/turbomind/kernels/attention/mainloop_sm80.h @@ -52,7 +52,7 @@ struct Mainloop, Impl_> { template __device__ void operator()(Args&&... args) { - Run(Sm80_CpAsync{}, ((Args &&) args)...); + Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args &&) args)...); } template @@ -81,8 +81,9 @@ struct Mainloop, Impl_> { } } - template + template __device__ void Run(Sm80_CpAsync, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -199,9 +200,10 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#if 0 + // #if 1 template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -234,7 +236,7 @@ struct Mainloop, Impl_> { Wait(); state_QK.Load(0, 0); - constexpr auto _ = [](int){}; + constexpr auto _ = [](int) {}; auto loop = [&](auto is_residue, auto is_mask) { const int offset_K = tile_iter * CTA_S; @@ -292,14 +294,15 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#elif 1 + // #elif 1 // Load : K0,K1 | V0,K2,V1,K3 ... // Compute : K0 | K1,V0,K2,V1 ... // - more register consumption // - more interleaved HMMA and FMA // - slight performance gain - template + template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter_, FragO& frag_O, @@ -407,7 +410,7 @@ struct Mainloop, Impl_> { __pipeline_commit(); __pipeline_wait_prior(0); } -#endif + // #endif __device__ void Wait() { diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index 12f6aff38b..c654f40d05 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -66,12 +66,14 @@ void invokeReduce(T* out, float exp_scale, \ cudaStream_t stream); -INSTANTIATE_invokeReduce(128, half); INSTANTIATE_invokeReduce(64, half); +INSTANTIATE_invokeReduce(128, half); +INSTANTIATE_invokeReduce(192, half); #if ENABLE_BF16 +INSTANTIATE_invokeReduce(64, nv_bfloat16); INSTANTIATE_invokeReduce(128, nv_bfloat16); -INSTANTIATE_invokeReduce(64, nv_bfloat16) +INSTANTIATE_invokeReduce(192, nv_bfloat16); #endif } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/reduce_kernel.h b/src/turbomind/kernels/attention/reduce_kernel.h index 88a3ab3af8..b4c9064cfe 100644 --- a/src/turbomind/kernels/attention/reduce_kernel.h +++ b/src/turbomind/kernels/attention/reduce_kernel.h @@ -128,9 +128,12 @@ struct Reduce { __syncthreads(); - constexpr int kVecSize = HeadDim / WARP_SIZE; + // HeadDim / WARP_SIZE + // 128 -> 4 + // 64, 192 -> 2 + constexpr int kVecSize = HeadDim % 128 == 0 ? 
4 : 2; - using Map = RakedThreadMap; + using Map = RakedThreadMap; static_assert(Map::kIterS == CTA_H); diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h index 8e09da22cd..db836ed184 100644 --- a/src/turbomind/kernels/attention/rotary_embedding.h +++ b/src/turbomind/kernels/attention/rotary_embedding.h @@ -131,6 +131,7 @@ struct FastRoPE { template __device__ void apply(Array& x, float timestep) { +#if 0 PRAGMA_UNROLL for (int i = 0; i < N; i += 2) { float c, s; @@ -144,6 +145,22 @@ struct FastRoPE { x[i + 1] = (T)tmp1; } } +#else + // Most models apply rotary embedding in half precision + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + float c, s; + sincosf(timestep * inv_freq_[i / 2], &s, &c); + s *= attention_scaling_; + c *= attention_scaling_; + T tmp0 = (T)c * x[i] - (T)s * x[i + 1]; + T tmp1 = (T)c * x[i + 1] + (T)s * x[i]; + if (is_valid_) { + x[i] = tmp0; + x[i + 1] = tmp1; + } + } +#endif } }; diff --git a/src/turbomind/kernels/attention/test_attention.cu b/src/turbomind/kernels/attention/test_attention.cu index c6d7b40637..804d4815dc 100644 --- a/src/turbomind/kernels/attention/test_attention.cu +++ b/src/turbomind/kernels/attention/test_attention.cu @@ -218,14 +218,14 @@ void TestBlocks(const thrust::universal_vector& k_cache, // [B, H, S, #define KV_INT4 0 -#define DECODING 1 +#define DECODING 0 template int test_attention() { AttentionParams params{}; - constexpr size_t kHeadDim = 128; + constexpr size_t kHeadDim = 192; #if DECODING // constexpr size_t kHeadNum = 32; @@ -239,11 +239,11 @@ int test_attention() // constexpr size_t kSequenceLen = 511; // constexpr size_t kSequenceLen = 2047; // constexpr size_t kSequenceLen = 4095; - // constexpr size_t kSequenceLen = 8191; + constexpr size_t kSequenceLen = 8191; // constexpr size_t kSequenceLen = 32767; // constexpr size_t kSequenceLen = 65535; // constexpr size_t kSequenceLen = 131071; - constexpr size_t kSequenceLen = 200000; + // constexpr size_t kSequenceLen = 200000; // constexpr size_t kSequenceLen = 262143; // constexpr size_t kSequenceLen = (1 << 20) - 1; // 1M // constexpr size_t kSequenceLen = (1 << 22) - 1; // 4M @@ -451,6 +451,10 @@ int test_attention() params.qk = qk_buf.data().get(); params.pr = pr_buf.data().get(); + params.attention_scaling = 1.f; + params.llama3_inv_scaling_factor = 0; + params.yarn_ramp_inv_factor_div_2 = 0; + Reference reference(kDump ? 
Reference::kUNFUSED : Reference::kFLASH_ATTENTION, {}); // Reference reference(Reference::kUNFUSED, {}); reference.Reshape(kInputLen, kContextLen, kHeadNum, kHeadDim, KvHeadNum, kBatchSize); diff --git a/src/turbomind/kernels/core/array_ops.h b/src/turbomind/kernels/core/array_ops.h index 6b639abc83..ec6e7fb4ed 100644 --- a/src/turbomind/kernels/core/array_ops.h +++ b/src/turbomind/kernels/core/array_ops.h @@ -172,7 +172,7 @@ inline __device__ void copy(const Array (&src)[M], Array (&dst)[M]) } template -inline __device__ void Store(T* __restrict__ dst, const Array& src) +inline __device__ void Store(T* dst, const Array& src) { if constexpr (sizeof(Array) == sizeof(uint4)) { *(uint4*)dst = (const uint4&)src; diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h index a708a34985..054269c27f 100644 --- a/src/turbomind/kernels/core/math.h +++ b/src/turbomind/kernels/core/math.h @@ -5,6 +5,7 @@ #include "src/turbomind/kernels/core/common.h" #include #include +#include namespace turbomind { @@ -41,6 +42,13 @@ TM_HOST_DEVICE constexpr T log2(T x) // static_assert(log2(32) == 5); // static_assert(log2(1) == 0); +template +TM_HOST_DEVICE constexpr T lowbit(T x) +{ + const std::make_signed_t s = x; + return static_cast(s & -s); +} + // https://arxiv.org/abs/1902.01961 template struct FastDivMod { diff --git a/src/turbomind/kernels/core/thread_map.h b/src/turbomind/kernels/core/thread_map.h index 66b691832f..1271aefcc0 100644 --- a/src/turbomind/kernels/core/thread_map.h +++ b/src/turbomind/kernels/core/thread_map.h @@ -3,6 +3,7 @@ #pragma once #include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" #include @@ -51,7 +52,7 @@ struct ThreadMapQ { } }; -template +template struct RakedThreadMap { static constexpr int kDimC = DimC; static constexpr int kDimS = DimS; diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt index d41c391e9d..81c9750584 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt +++ b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt @@ -8,9 +8,11 @@ add_library(${PROJECT_NAME} STATIC # flash_fwd_hdim64_fp16_sm80.cu flash_fwd_hdim128_fp16_sm80.cu flash_fwd_hdim128_bf16_sm80.cu - # flash_fwd_hdim256_fp16_sm80.cu + flash_fwd_hdim256_bf16_sm80.cu + flash_fwd_hdim256_fp16_sm80.cu ) target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR} / include) target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass) + set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h index e108a55f28..2456496367 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h @@ -147,7 +147,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream) }); } -#if 0 +#if 1 template void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream) { diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h index fd19a0ea61..b1df29cb7b 100644 --- 
a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h @@ -58,6 +58,18 @@ return __VA_ARGS__(); \ } \ }() +#elif 1 +#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ + [&] { \ + if (HEADDIM <= 128) { \ + constexpr static int kHeadDim = 128; \ + return __VA_ARGS__(); \ + } \ + else if (HEADDIM <= 256) { \ + constexpr static int kHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() #else #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ [&] { \ diff --git a/src/turbomind/kernels/gemm/context.h b/src/turbomind/kernels/gemm/context.h index 4fec5b732f..bd03917b89 100644 --- a/src/turbomind/kernels/gemm/context.h +++ b/src/turbomind/kernels/gemm/context.h @@ -113,12 +113,7 @@ class DynamicGemmContext: public StaticGemmContext { class MoeGemmContext: public Context { public: - MoeGemmContext(int experts, - int experts_per_token, - // int output_dims, - // int input_dims, - const cudaDeviceProp& prop, - cudaStream_t stream); + MoeGemmContext(int experts, int experts_per_token, const cudaDeviceProp& prop, cudaStream_t stream); ~MoeGemmContext() override; @@ -156,9 +151,11 @@ class MoeGemmContext: public Context { Tape Schedule(const LaunchSpec&) override; - void set_offsets(const int* offsets) + void update(int expert_num, int experts_per_token, const int* offsets) { - offsets_ = offsets; + expert_num_ = expert_num; + experts_per_token_ = experts_per_token; + offsets_ = offsets; } protected: diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index ed8b2ee2ff..e58bfc9b95 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -279,17 +279,44 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool for return {}; } -void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +namespace { + +template +struct Param { + StridedPtr data[N]; + StridedPtr* ptr; + int n; +}; + +template +__global__ void fill_strided_ptrs(Param param) { - std::vector tmp; - for (const auto& [p, s] : ptrs) { - tmp.push_back({p, s}); + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < param.n) { + param.ptr[idx] = param.data[idx]; } +} + +} // namespace + +void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +{ + constexpr int N = 64; + Param param{}; + static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 StridedPtr* ptr{}; cudaMallocAsync(&ptr, sizeof(StridedPtr) * ptrs.size(), stream); - cudaMemcpyAsync(ptr, tmp.data(), sizeof(StridedPtr) * ptrs.size(), cudaMemcpyDefault, stream); - // Sync before tmp can be destructed - cudaStreamSynchronize(stream); + param.ptr = ptr; + for (int i = 0; i < (int)ptrs.size(); i += N) { + const int n = std::min(ptrs.size() - i, N); + for (int j = 0; j < n; ++j) { + auto& [p, s] = ptrs[i + j]; + param.data[j] = StridedPtr{p, s}; + } + param.n = n; + fill_strided_ptrs<<<1, N, 0, stream>>>(param); + param.ptr += N; + } return ptr; } diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 5912c60a8a..a9e4f7da51 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -264,7 +264,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] int token_num_padded, int expert_num, int top_k, - bool norm_topk) + bool norm_topk, + float routed_scale) { constexpr int max_tiles = kMoeGateMaxTiles; constexpr int threads_per_token = max_expert_num 
/ items_per_thread; // 8 @@ -286,8 +287,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; - const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + // const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + // const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; float data[items_per_thread]; int idxs[items_per_thread]; @@ -413,7 +414,13 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] #endif - constexpr float kLog2e = 1.4426950408889634074; + // constexpr float kLog2e = 1.4426950408889634074; + // if (k == 0) { + // PRAGMA_UNROLL + // for (int i = 0; i < items_per_thread; ++i) { + // data[i] *= kLog2e; + // } + // } unsigned mask = (unsigned)-1; float max_logit; @@ -437,13 +444,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } - if (k == 0) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] *= kLog2e; - } - } - int g_max_ei = ei; float g_max_val = max_val; if constexpr (threads_per_token > 1) { @@ -486,7 +486,7 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < items_per_thread; ++i) { if (!norm_topk || used[i]) { - data[i] = exp2f(data[i] - max_logit); + data[i] = expf(data[i] - max_logit); sum_prob += data[i]; } } @@ -515,9 +515,11 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { - int e = (i + threadIdx.x) % max_expert_num; - int t = (i + threadIdx.x) / max_expert_num; - smem.shared_accum[t][e] = 0; + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + if (t < max_tiles) { + smem.shared_accum[t][e] = 0; + } } __syncthreads(); @@ -536,10 +538,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] if (ti2 < token_num && idx < top_k) { masks[expert_id * token_num_padded + ti2] = idx; - scales[idx * token_num + ti2] = scale; + scales[idx * token_num + ti2] = scale * routed_scale; atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); - - // printf("%d %d %f\n", idx, expert_id, scale); } } @@ -569,6 +569,7 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n int experts, // E int experts_per_token, bool norm_topk, + float routed_scale, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -581,14 +582,14 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread, auto vec_size) { constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; constexpr int threads = 256; const int blocks = ceil_div(tokens, threads / thrs_per_tok); cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - MoeGateKernel_v8 + MoeGateKernel_v8 <<>>( // scales, (int8_t*)masks, @@ -600,28 +601,49 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n tokens_padded, experts, experts_per_token, - norm_topk); + norm_topk, + routed_scale); }; auto fail = [&] { - std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << experts + << ", top_k=" << 
experts_per_token << "\n"; std::abort(); }; if (experts <= 8) { if (experts_per_token <= 2) { - invoke(_Int<8>, _Int<2>, _Int<8>); + // MoeGateKernel_V2<2, 128><<>>(scales, + // (int8_t*)masks, + // accum, + // logits, + // log_tile, + // tiles, + // tokens, + // tokens_padded, + // experts); + + // std::cout << tokens << " " << experts << " " << experts_per_token << " " << tokens_padded << "\n"; + invoke(_Int<8>, _Int<2>, _Int<8>, _Int<4>); } else { - invoke(_Int<8>, _Int<8>, _Int<8>); + invoke(_Int<8>, _Int<8>, _Int<8>, _Int<4>); } } else if (experts <= 64) { if (experts_per_token <= 4) { - invoke(_Int<64>, _Int<4>, _Int<16>); + invoke(_Int<64>, _Int<4>, _Int<16>, _Int<4>); } else if (experts_per_token <= 8) { - invoke(_Int<64>, _Int<8>, _Int<16>); + invoke(_Int<64>, _Int<8>, _Int<16>, _Int<4>); + } + else { + fail(); + } + } + else if (experts <= 160) { + if (experts_per_token <= 8) { + invoke(_Int<160>, _Int<8>, _Int<10>, _Int<2>); } else { fail(); @@ -687,7 +709,8 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] const int* en2f, // [ e, n] :: (e,n) -> e*n const float* dst_scales, // [n] int dims, - int tokens) + int tokens, + float dst_scale) { using Vec = Array; @@ -695,7 +718,6 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; - float dst_scale = 0; if (dst_scales) { dst_scale = dst_scales[ti]; dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); @@ -711,8 +733,9 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] } for (int i = threadIdx.x; i < dims; i += block_dim) { +#if 1 Array accum{}; - if (dst_scales) { + if (dst_scale) { Vec v; Ldg(v, dst_ptr[i].data()); using namespace ops; @@ -727,6 +750,24 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] accum = accum + x; } Store(dst_ptr[i].data(), cast(accum)); +#else + Array accum{}; + if (dst_scale) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = v * (T)dst_scale; + } + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + Vec v; + Ldg(v, src_ptr[e][i].data()); + using namespace ops; + const auto x = v * (T)scale[e]; + accum = accum + x; + } + Store(dst_ptr[i].data(), accum); +#endif } } @@ -739,6 +780,7 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st) { // std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -754,7 +796,8 @@ void invokeMoeReduce(T* dst, en2f, dst_scales, dims / vec_size, - tokens); + tokens, + dst_scale); }; switch (experts_per_token) { @@ -774,10 +817,11 @@ void invokeMoeReduce(T* dst, } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); -#ifdef ENABLE_BF16 template void -invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); +invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); +#ifdef ENABLE_BF16 +template void invokeMoeReduce( + nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) @@ -833,4 +877,89 @@ std::vector SampleBalanced(int token_num, int expert_num, int exp_per_tok, return ret; } +template +__global__ void MoeMaskTopKGroups(float* logits, int token_num, int expert_num, int top_k) +{ + constexpr int threads_per_token = max_expert_num / items_per_thread; + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + 
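+    // NOTE: group-limited routing (DeepSeek-V2 "group_limited_greedy").
+    // Each token is handled by `threads_per_token` threads; with the only
+    // supported configuration (160 experts, group size 20) each thread owns
+    // the logits of exactly one expert group (`items_per_thread` experts).
+    // `top_k` here is the number of groups to keep (topk_group), not
+    // experts_per_token: per-group maxima are reduced with warp shuffles,
+    // the best `top_k` groups survive, and every other thread overwrites its
+    // logits with -inf so the subsequent invokeMoeGate_V2 top-k can only
+    // pick experts from the selected groups.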
static_assert(items_per_thread % access_size == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + float max_val = -std::numeric_limits::infinity(); + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + PRAGMA_UNROLL + for (int c = 0; c < access_size; ++c) { + max_val = fmaxf(max_val, data[i + c]); + } + } + } + } + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + bool alive = false; + + for (int k = 0; k < top_k; ++k) { + int g_max_ei = ei; + float g_max_val = max_val; + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + if (ei == g_max_ei) { + alive = true; + max_val = -std::numeric_limits::infinity(); + } + } + + if (!alive && ti < token_num) { + Array vec; + fill(vec, -std::numeric_limits::infinity()); + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Store(&logits[ti * expert_num + e], vec); + } + } + } +} + +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st) +{ + auto invoke = [&](auto max_expert_num, auto items_per_thread, auto vec_size) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(token_num, threads / thrs_per_tok); + MoeMaskTopKGroups + <<>>(logits, token_num, expert_num, top_k); + }; + if (expert_num == 160 && group_size == 20) { + return invoke(_Int<160>, _Int<20>, _Int<4>); + } + + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << expert_num + << ", group_size=" << group_size << "\n"; + std::abort(); +} + } // namespace turbomind diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 0e4c36af09..d53de1354e 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -22,6 +22,7 @@ void invokeMoeGate_V2(int* f2n, int experts, int exp_per_tok, bool norm_topk, + float routed_scale, cudaStream_t st); template @@ -54,8 +55,11 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st); +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st); + // Sample `e` from `E` experts uniformly for every token std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g); diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index 47e3bfdb16..4b2ea6a83a 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -45,72 +45,6 @@ void diff_vecs(const T* data, const T* refs, int m, int k, 
std::string msg) } } -#if 0 -void func() -{ - using thrust::universal_vector; - - // clang-format off - std::vector h_logits{ - 8, 5, 1, 4, 3, 6, 2, 7, - 50, 60, 90, 20, 70, 71, 72, 73, - 0, 1, 0, 0, 0, 1, 0, 1, - 0, 0, 0, 1, 0, 0, 0, 2}; - // clang-format on - - h_logits.resize(8); - - // auto tmp = h_logits; - // for (int i = 0; i < 127; ++i) { - // h_logits.insert(h_logits.end(), tmp.begin(), tmp.end()); - // } - - universal_vector logits(h_logits.begin(), h_logits.end()); - - const int E = 8; - const int n = h_logits.size() / E; - const int e = 2; - - const int n_padded = (n + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; - - universal_vector f2n(e * n); - universal_vector en2f(e * n); - universal_vector offsets(E + 1); - universal_vector accum(E * kMoeGateMaxTiles); - universal_vector scales(n * e); - universal_vector masks(E * n_padded); - - for (int i = 0; i < 10; ++i) { - gemm::CacheFlushing::flush(0); - cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); - invokeMoeGate_V2(f2n.data().get(), - en2f.data().get(), - offsets.data().get(), - scales.data().get(), - masks.data().get(), - accum.data().get(), - logits.data().get(), - n, - n_padded, - E, - e, - 0); - } - - auto err = cudaDeviceSynchronize(); - if (err) { - std::cerr << cudaGetErrorString(err) << "\n"; - } - - print_vecs(scales.data().get(), e, n, "scales", 12); - print_vecs(masks.data().get(), E, n_padded, "tmp"); - print_vecs(accum.data().get(), E, 1, "accum"); - print_vecs(offsets.data().get(), 1, E + 1, "offsets"); - print_vecs(f2n.data().get(), n * e, 1, "f2n"); - print_vecs(en2f.data().get(), e, n, "en2f"); -} -#endif - RNG& gRNG() { static RNG inst{}; @@ -271,6 +205,8 @@ bool test_moe_gate(int tokens, // cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + // invokeMaskMoeTopKGroups(logits.data().get(), tokens, expert_num, expert_num / 8, 3, nullptr); + for (int i = 0; i < 1; ++i) { gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); @@ -286,8 +222,9 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, - true, - 0); + false, + 1.f, + nullptr); } // invokeMoeTiling(coords.data().get(), offsets.data().get(), expert_num, coords.size(), &tiling, 1, 0); @@ -334,6 +271,8 @@ bool test_moe_gate(int tokens, // success = false; } + // print_vecs(logits.data().get(), tokens, expert_num, "logits", 12); + if (!success && 1) { diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); @@ -353,6 +292,15 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); + for (int i = 0; i < tokens; ++i) { + float sum = 0; + for (int j = 0; j < experts_per_token; ++j) { + sum += scales[j * tokens + i]; + } + std::cout << sum << " "; + } + std::cout << "\n"; + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); @@ -393,7 +341,7 @@ int main() // test_moe_gate(32768, 64, 8, tape, tiling); // test_moe_gate(8, 60, 4, tape, tiling); - test_moe_gate(65536, 8, 2, tape, tiling); + test_moe_gate(16, 160, 6, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 7a089fbdf2..4747644f9a 100644 --- 
a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -357,7 +357,7 @@ class Testbed { } } - ((MoeGemmContext*)ctx_.get())->set_offsets(moe_m_offsets_.data().get()); + ((MoeGemmContext*)ctx_.get())->update(experts_, exp_per_tok_, moe_m_offsets_.data().get()); CHECK(batch_dim == 0); CHECK(a_desc_.order == kRowMajor); @@ -518,6 +518,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); invokeMoeReduce(c_ref_.data().get(), @@ -528,6 +529,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); cudaDeviceSynchronize(); diff --git a/src/turbomind/kernels/gemm/unpack.cu b/src/turbomind/kernels/gemm/unpack.cu index 92f468d82b..39e6a2e1aa 100644 --- a/src/turbomind/kernels/gemm/unpack.cu +++ b/src/turbomind/kernels/gemm/unpack.cu @@ -71,14 +71,44 @@ void unpack_awq_gemm(uint4_t* dst, const uint4_t* src, int rows, int cols, cudaS permute_u4<0, 1, 3, 2><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); } +__global__ void transpose_u4_kernel(uint4_t* dst, const uint4_t* src, int s, int c) +{ + const int idx_c = 8 * (threadIdx.x + blockIdx.x * blockDim.x); + const int idx_s = 8 * (threadIdx.y + blockIdx.y * blockDim.y); + if (idx_c >= c || idx_s >= s) { + return; + } + uint32_t ivec[8]; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ivec[i] = ((const uint32_t*)src)[((idx_s + i) * c + idx_c) / 8]; + } + uint32_t ovec[8]{}; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + PRAGMA_UNROLL + for (int j = 0; j < 8; ++j) { + ovec[i] |= (((ivec[j] >> (i * 4)) & 0xfu) << (j * 4)); + } + } + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ((uint32_t*)dst)[((idx_c + i) * s + idx_s) / 8] = ovec[i]; + } +} + void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st) { if (s % 8 || c % 8) { std::cerr << "transpose_u4: invalid shape (" << s << "," << c << "), must be multiple of 8" << std::endl; return; } - Array shape{s, c}; - permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + // Array shape{s, c}; + // permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + + const dim3 block(16, 16); + const dim3 grid((c + 15) / 16, (s + 15) / 16); + transpose_u4_kernel<<>>(dst, src, s, c); } // load -> unpack -> extend_to_u8 -> manipulation -> compat_to_u4 -> store diff --git a/src/turbomind/kernels/norm/CMakeLists.txt b/src/turbomind/kernels/norm/CMakeLists.txt new file mode 100644 index 0000000000..bc1569c405 --- /dev/null +++ b/src/turbomind/kernels/norm/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +add_library(rms_norm rms_norm.cu) +set_property(TARGET rms_norm PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET rms_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu new file mode 100644 index 0000000000..22fd69f52a --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -0,0 +1,235 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
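+// NOTE: invokeRMSNorm takes separate leading dimensions (dst_ld / src_ld) so
+// that a slice of a larger packed buffer can be normalized in place;
+// presumably this is what the MLA q_a_layernorm / kv_a_layernorm use on the
+// low-rank projections. invokeBiasResidualRMSNorm fuses the residual update
+// r' = r + (h + b) with the normalization h' = norm(r') * w (see the comment
+// on that kernel below).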
+ +#include "cub/block/block_reduce.cuh" + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" + +namespace turbomind { + +template +__global__ void RMSNormKernel(T* dst, + int dst_ld, + const T* src, + int src_ld, + const T* __restrict__ weights, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + src += src_ld * ti; + + Array accum{}; + Array vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Array tmp = cast(vec); + using namespace ops; + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + dst += dst_ld * ti; + + Array sv; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Ldg(sv, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + vec[c] = (T)((float)vec[c] * sum) * sv[c]; + // vec[c] = (T)((float)vec[c] * sum * (float)sv[c]); + } + Store(&dst[i], vec); + } +} + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + + constexpr int threads = 512; + const int blocks = num; + + RMSNormKernel<<>>(dst, // + dst_ld, + src, + src_ld, + weights, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeRMSNorm(half* dst, + int dst_ld, + const half* src, + int src_ld, + const half* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#if ENABLE_BF16 +template void invokeRMSNorm(nv_bfloat16* dst, + int dst_ld, + const nv_bfloat16* src, + int src_ld, + const nv_bfloat16* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +// r' <- r + (h + b) +// h' <- norm(r') * w +template +__global__ void BiasResidualRMSNormKernel(T* __restrict__ residual, + T* __restrict__ hidden_states, + const T* __restrict__ weights, + const T* __restrict__ bias, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + residual += dims * ti; + hidden_states += dims * ti; + + Array accum{}; + + Array r_vec; + Array h_vec; + Array b_vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Load(h_vec, &hidden_states[i]); + + using namespace ops; + r_vec = r_vec + h_vec; + + if (bias) { + Ldg(b_vec, &bias[i]); + r_vec = r_vec + b_vec; + } + + Store(&residual[i], r_vec); + + Array tmp = cast(r_vec); + + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + Array w_vec; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Ldg(w_vec, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < 
vec_size; ++c) { + r_vec[c] = (T)((float)r_vec[c] * sum) * w_vec[c]; + } + Store(&hidden_states[i], r_vec); + } +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + constexpr int threads = 512; + const int blocks = num; + + BiasResidualRMSNormKernel<<>>(residual, // + hidden_states, + weights, + bias, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeBiasResidualRMSNorm(half* residual, + half* hidden_states, + const half* weights, + const half* bias, + int dims, + int num, + float eps, + cudaStream_t st); + +#if ENABLE_BF16 +template void invokeBiasResidualRMSNorm(nv_bfloat16* residual, + nv_bfloat16* hidden_states, + const nv_bfloat16* weights, + const nv_bfloat16* bias, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +} // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h new file mode 100644 index 0000000000..83fa0f8263 --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -0,0 +1,21 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include + +namespace turbomind { + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); + +template +void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st); +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 285fcea31f..3c714bd234 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -20,11 +20,13 @@ add_library(Llama STATIC unified_attention_layer.cc llama_kernels.cu llama_decoder_kernels.cu - llama_utils.cu) + llama_utils.cu + mla_utils.cu) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart gemm2 + rms_norm cublasMMWrapper DynamicDecodeLayer activation_kernels diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 4138174e5d..ea321d06a0 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -20,6 +20,7 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" +#include "src/turbomind/utils/nccl_utils.h" #include #include #include @@ -1041,6 +1042,9 @@ LlamaBatch::LlamaBatch(const EngineParam& param, AllocateBuffer(max_batch_size_, session_len_, cache_block_seq_len); AllocatePersistantBuffer(max_batch_size_, cache_block_seq_len); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template @@ -1990,7 +1994,7 @@ void LlamaBatch::tune() nullptr, nullptr); // implicit barrier for TP - check_cuda_error(cudaStreamSynchronize(stream_)); + ftNcclStreamSynchronize(model_->tensor_para_, {}, stream_); } auto tock = std::chrono::steady_clock::now(); diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 
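// Not part of the patch: a minimal host-side sketch of the semantics assumed for the
// RMSNorm kernels added above, i.e. y[r, d] = x[r, d] * rsqrt(mean_d(x[r, :]^2) + eps) * w[d]
// on contiguous row-major data (the CUDA kernel additionally supports src_ld/dst_ld leading
// dimensions and does the final multiply in the storage type). Useful only for spot checks.
#include <cmath>
#include <cstddef>
#include <vector>

template<class T>
void rms_norm_reference(std::vector<T>&       dst,
                        const std::vector<T>& src,
                        const std::vector<T>& weights,
                        int                   dims,
                        int                   num,
                        float                 eps)
{
    dst.resize(static_cast<size_t>(dims) * num);
    for (int r = 0; r < num; ++r) {
        float sum = 0.f;
        for (int d = 0; d < dims; ++d) {
            const float x = static_cast<float>(src[static_cast<size_t>(r) * dims + d]);
            sum += x * x;
        }
        const float scale = 1.f / std::sqrt(sum / dims + eps);
        for (int d = 0; d < dims; ++d) {
            const float x = static_cast<float>(src[static_cast<size_t>(r) * dims + d]);
            dst[static_cast<size_t>(r) * dims + d] =
                static_cast<T>(x * scale * static_cast<float>(weights[d]));
        }
    }
}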
f6f9ab0efa..0a2a3be175 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -52,28 +52,21 @@ static bool is_fuse_silu_act() } template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - head_num_(head_num), - kv_head_num_(kv_head_num), - size_per_head_(size_per_head), - hidden_units_(hidden_units), - inter_size_(inter_size), - weight_type_(weight_type), - attn_bias_(attn_bias), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank): + head_num_(model.head_num), + kv_head_num_(model.kv_head_num), + size_per_head_(model.head_dim), + hidden_units_(model.hidden_units), + inter_size_(model.inter_size.at(layer_id)), + weight_type_(model.weight_type), + attn_bias_(model.attn_bias), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) { if (lora_param.policy == LoraPolicy::kPlora) { std::vector keys = { @@ -88,7 +81,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, auto& weight = *weights[i]; int rank = lora_param.r; float scale = lora_param.scale; - std::string full_name = "layers." + std::to_string(layer_idx) + "." + name; + std::string full_name = "layers." + std::to_string(layer_id) + "." + name; for (const auto& [re, pr] : lora_param.rank_pattern) { if (std::regex_search(full_name, pr.first)) { @@ -114,36 +107,44 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - self_attn_weights.qkv.input_dims = hidden_units_; - self_attn_weights.qkv.output_dims = (head_num + 2 * kv_head_num) * size_per_head / tensor_para_size_; - self_attn_weights.qkv.type = weight_type; - self_attn_weights.qkv.group_size = group_size; - - self_attn_weights.output.input_dims = (head_num * size_per_head) / tensor_para_size_; - self_attn_weights.output.output_dims = hidden_units_; - self_attn_weights.output.type = weight_type; - self_attn_weights.output.group_size = group_size; + self_attn_weights = LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + model.mla, + attn_bias_, + tensor_para_size_, + weight_type_, + model.group_size}; ffn_weights = LlamaFfnWeight{ hidden_units_, inter_size_, tensor_para_size_, weight_type_, - group_size, + model.group_size, weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), }; - moe_weights = MoeFfnWeight{hidden_units_, - moe_param.inter_size, - moe_param.expert_num, - moe_param.method, - moe_param.shared_gate, - tensor_para_size_, - weight_type, - group_size, - is_fuse_silu_act()}; - - mallocWeights(); + moe_weights = MoeFfnWeight{ + layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; +} + +template +void LlamaDecoderLayerWeight::malloc(cudaStream_t st) +{ + deviceMalloc((T**)&self_attn_norm_weights, hidden_units_, st); + deviceMalloc((T**)&ffn_norm_weights, hidden_units_, st); + + self_attn_weights.malloc(st); + + if (inter_size_) { + ffn_weights.malloc(st); + } + + if (!moe_weights.experts.empty()) { + 
moe_weights.malloc(st); + } } template @@ -168,52 +169,6 @@ size_t LlamaDecoderLayerWeight::workspace_size() const noexcept return size * sizeof(uint16_t); } -template -void freeWeights(LlamaDenseWeight& weights) -{ - cudaFree(weights.kernel); - cudaFree(weights.bias); - cudaFree(weights.scales); - cudaFree(weights.zeros); - - weights.kernel = nullptr; - weights.bias = nullptr; - weights.scales = nullptr; - weights.zeros = nullptr; - - { - cudaFree(weights.lora.a); - cudaFree(weights.lora.b); - weights.lora.a = nullptr; - weights.lora.b = nullptr; - } -} - -template -void LlamaDecoderLayerWeight::mallocWeights(LlamaDenseWeight& weights, bool bias) -{ - if (bias) { - deviceMalloc((T**)&weights.bias, weights.output_dims); - } - const size_t bit_size = getBitSize(weights.type); - if (bit_size >= 16) { // fp16, fp32 - deviceMalloc((T**)&weights.kernel, weights.input_dims * weights.output_dims); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - FT_CHECK(weights.input_dims % factor == 0); - deviceMalloc((int**)&weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMemSetZero((int*)weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMalloc((T**)&weights.scales, weights.input_dims / weights.group_size * weights.output_dims); - deviceMalloc((T**)&weights.zeros, weights.input_dims / weights.group_size * weights.output_dims); - } - - if (weights.lora.r > 0) { - deviceMalloc((T**)&weights.lora.a, weights.input_dims * weights.lora.r); - deviceMalloc((T**)&weights.lora.b, weights.lora.r * weights.output_dims); - } -} - template std::string concat(FirstArg&& first, Args&&... args) { @@ -342,64 +297,24 @@ void loadWeights(LlamaDenseWeight& w, std::string prefix, FtCudaDataType mode } template -void LlamaDecoderLayerWeight::mallocWeights() +void LlamaDecoderLayerWeight::free(cudaStream_t st) { - deviceMalloc((T**)&self_attn_norm_weights, hidden_units_); - deviceMalloc((T**)&ffn_norm_weights, hidden_units_); + deviceFree(self_attn_norm_weights, st); + deviceFree(ffn_norm_weights, st); - mallocWeights(self_attn_weights.qkv, attn_bias_); - mallocWeights(self_attn_weights.output, attn_bias_); + self_attn_weights.free(st); if (inter_size_) { - mallocWeights(ffn_weights.gating, false); - mallocWeights(ffn_weights.intermediate, false); - mallocWeights(ffn_weights.output, false); + ffn_weights.free(st); } if (!moe_weights.experts.empty()) { - mallocWeights(moe_weights.gate, false); - for (auto& e : moe_weights.experts) { - mallocWeights(e.gating, false); - mallocWeights(e.intermediate, false); - mallocWeights(e.output, false); - } - if (moe_weights.shared_gate.output_dims) { - mallocWeights(moe_weights.shared_gate, false); - } + moe_weights.free(st); } } template -LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() -{ - cudaFree((void*)self_attn_norm_weights); - cudaFree((void*)ffn_norm_weights); - self_attn_norm_weights = nullptr; - ffn_norm_weights = nullptr; - - freeWeights(self_attn_weights.qkv); - freeWeights(self_attn_weights.output); - - if (inter_size_) { - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); - } - - if (!moe_weights.experts.empty()) { - freeWeights(moe_weights.gate); - for (auto& e : moe_weights.experts) { - freeWeights(e.fused_gating_intermediate); - freeWeights(e.gating); - freeWeights(e.intermediate); - freeWeights(e.output); - } - if (moe_weights.shared_gate.kernel) { - 
freeWeights(moe_weights.shared_gate); - } - } -} +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; template void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) @@ -432,6 +347,24 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType } } +template +void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m, int tp_rank) +{ + if (w.q_proj.output_dims) { + getWeightTensor(w.q_proj, false, concat(p, "attention.q_proj", tp_rank), m); + } + else { + getWeightTensor(w.q_a_proj, false, concat(p, "attention.q_a_proj"), m); + getWeightTensor(w.q_b_proj, false, concat(p, "attention.q_b_proj", tp_rank), m); + m.insert(concat(p, "attention.q_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.q_b_proj.input_dims}, w.q_a_layernorm}); + } + getWeightTensor(w.kv_a_proj, false, concat(p, "attention.kv_a_proj"), m); + getWeightTensor(w.kv_b_proj, false, concat(p, "attention.kv_b_proj", tp_rank), m); + m.insert(concat(p, "attention.kv_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); +} + template TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) { @@ -445,7 +378,12 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) auto get_prefix = [=](std::string_view name) { return concat(prefix, name, tensor_para_rank_); }; - getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + if (self_attn_weights.qkv.output_dims) { + getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + } + else { + getMLATensor(self_attn_weights, prefix, output, tensor_para_rank_); + } getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); if (inter_size_) { @@ -478,7 +416,8 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) } // template -static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void convert_u4( + LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { FT_CHECK(weight.type == WeightType::kINT4); @@ -488,11 +427,11 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* get_weight_and_scales_layout(gemm::DataType::U4, is_fused_moe, getSMVersion(), use_simt); if (order_b == kColMajor) { - transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims); - cudaMemcpy(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault); + transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims, st); + cudaMemcpyAsync(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault, st); } - extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims); + extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims, st); sync_check_cuda_error(); MatrixLayout w_desc{ @@ -507,25 +446,22 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* k_desc.type = gemm::DataType::U4; k_desc.pack = pack_b; - cudaMemset(weight.kernel, 0, weight.input_dims * weight.output_dims / 2); + cudaMemsetAsync(weight.kernel, 0, weight.input_dims * weight.output_dims / 2, st); - FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, 0) == 0); + 
FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, st) == 0); sync_check_cuda_error(); const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims; // std::cout << "fuse_scales_and_zeros\n"; - fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count); + fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count, st); // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2); sync_check_cuda_error(); - cudaDeviceSynchronize(); - - cudaFree(weight.scales); - cudaFree(weight.zeros); - weight.scales = weight.zeros = nullptr; + deviceFree(weight.scales, st); + deviceFree(weight.zeros, st); - deviceMalloc((half**)&weight.scales_zeros, scale_count * 2); + deviceMalloc((half**)&weight.scales_zeros, scale_count * 2, st); MatrixLayout s_desc{ gemm::DataType::U32, @@ -538,7 +474,7 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* MatrixLayout q_desc = s_desc; q_desc.pack = pack_v; - FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, 0) == 0); + FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, st) == 0); sync_check_cuda_error(); weight.k_desc = k_desc; @@ -548,7 +484,8 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* } template -static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void +convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { using namespace gemm; @@ -563,12 +500,13 @@ static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* wor const int output_dim = weight.output_dims; if (order_b == kColMajor) { - invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, nullptr); + invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, st); sync_check_cuda_error(); // FT_CHECK(0); } else { - check_cuda_error(cudaMemcpy(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + check_cuda_error( + cudaMemcpyAsync(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); } MatrixLayout src{ @@ -583,35 +521,42 @@ static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* wor dst.pack = pack_b; if (pack_b) { - FT_CHECK(Convert(workspace, src, weight.kernel, dst, nullptr) == 0); + FT_CHECK(Convert(workspace, src, weight.kernel, dst, st) == 0); sync_check_cuda_error(); // FT_CHECK(0); } else { - check_cuda_error(cudaMemcpy(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + check_cuda_error( + cudaMemcpyAsync(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); } weight.k_desc = dst; } template -static void convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void +convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { if (weight.type == WeightType::kINT4) { if constexpr (std::is_same_v) { - convert_u4(weight, is_fused_moe, workspace, size, use_simt); + convert_u4(weight, is_fused_moe, workspace, size, use_simt, st); } else { FT_CHECK(0); } } else { - convert_fp(weight, is_fused_moe, workspace, size, use_simt); + convert_fp(weight, is_fused_moe, workspace, size, use_simt, st); } } template -void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, 
void* workspace, size_t size) +void interleave(LlamaDenseWeight& c, + LlamaDenseWeight& a, + LlamaDenseWeight& b, + void* workspace, + size_t size, + cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -628,18 +573,18 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight const auto sentinel = tmp_c + c.output_dims * c.input_dims; FT_CHECK(sentinel <= (uint8_t*)workspace + size); - extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims); - extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims); + extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims, st); + extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims, st); - interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, 0); + interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, st); - compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims); + compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims, st); - interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, 0); - interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, 0); + interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, st); + interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, st); } else { - interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, 0); + interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, st); } // Check at function level @@ -647,7 +592,7 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight } template -void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t) +void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t, cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -656,9 +601,11 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& FT_CHECK(c.group_size == a.group_size); FT_CHECK(c.group_size == b.group_size); - auto _chunks = [](auto c, auto a, auto b, int height, int width) { - check_cuda_error(cudaMemcpy2D((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault)); - check_cuda_error(cudaMemcpy2D((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault)); + auto _chunks = [&](auto c, auto a, auto b, int height, int width) { + check_cuda_error( + cudaMemcpy2DAsync((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault, st)); + check_cuda_error( + cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); }; if (c.type == WeightType::kINT4) { @@ -675,37 +622,37 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& } template -void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop) +void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st) { const bool is_16xx = is_16xx_series(prop.name); - convert(self_attn_weights.qkv, false, workspace, size, is_16xx); - convert(self_attn_weights.output, false, workspace, size, is_16xx); + convert(self_attn_weights.qkv, false, workspace, size, is_16xx, st); + 
convert(self_attn_weights.output, false, workspace, size, is_16xx, st); auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) { if (fused_up_and_gate_) { auto& fused_up_and_gate = ffn.fused_gating_intermediate; - mallocWeights(fused_up_and_gate, false); + fused_up_and_gate.malloc(st); if (ffn.is_fused_silu) { - interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } else { - chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } - convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx, st); - freeWeights(ffn.gating); - freeWeights(ffn.intermediate); + ffn.gating.free(st); + ffn.intermediate.free(st); } else { - convert(ffn.gating, is_fused_moe, workspace, size, is_16xx); - convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.gating, is_fused_moe, workspace, size, is_16xx, st); + convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx, st); } - convert(ffn.output, is_fused_moe, workspace, size, is_16xx); + convert(ffn.output, is_fused_moe, workspace, size, is_16xx, st); }; if (inter_size_) { @@ -722,7 +669,7 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud for (auto& e : moe_weights.experts) { - process_ffn(e, moe_weights.method); + process_ffn(e, moe_weights.method == MoeParam::kFused); const auto& fused = e.fused_gating_intermediate; const auto& output = e.output; @@ -743,12 +690,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud auto& output = moe_weights.block.output; // TODO: free these ptrs - fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, nullptr); - output.kernel = gemm::make_blocked_ptrs(output_ptrs, nullptr); + fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, st); + output.kernel = gemm::make_blocked_ptrs(output_ptrs, st); if (!fused_param_ptrs.empty()) { - fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, nullptr); - output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, nullptr); + fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, st); + output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, st); } fused.k_desc.ld = output.k_desc.ld = 0; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index f68a103dd5..9b204ed0dc 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -30,19 +30,14 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = delete; - LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + + LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); + ~LlamaDecoderLayerWeight(); LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete; @@ -51,17 +46,21 @@ struct 
LlamaDecoderLayerWeight { TensorMap getParams(std::string prefix); - void prepare(void* workspace, size_t size, const cudaDeviceProp& prop); + void prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st); size_t workspace_size() const noexcept; - void mallocWeights(LlamaDenseWeight& weights, bool bias); + void malloc(cudaStream_t st); + + void free(cudaStream_t st); + + T* self_attn_norm_weights{}; + T* ffn_norm_weights{}; - T* self_attn_norm_weights{}; - T* ffn_norm_weights{}; LlamaAttentionWeight self_attn_weights{}; - LlamaFfnWeight ffn_weights{}; - MoeFfnWeight moe_weights{}; + + LlamaFfnWeight ffn_weights{}; + MoeFfnWeight moe_weights{}; private: size_t head_num_; @@ -76,8 +75,6 @@ struct LlamaDecoderLayerWeight { size_t tensor_para_rank_; bool is_maintain_buffer_ = false; bool fused_up_and_gate_; - - void mallocWeights(); }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 169fb53bcf..944781bf5d 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -20,64 +20,14 @@ #pragma once #include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/llama/weight_type.h" #include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { -enum class WeightType : int -{ - kFP32, - kFP16, - kFP8, // not supported yet - kBF16, - kINT8, - kINT4 -}; - -template -constexpr WeightType get_default_weight_type() -{ - if constexpr (std::is_same_v) { - return WeightType::kFP16; - } - else if constexpr (std::is_same_v) { - return WeightType::kBF16; - } - else if constexpr (std::is_same_v) { - return WeightType::kFP32; - } - else { - static_assert(sizeof(T) != sizeof(T), "not implemented"); - return {}; - } -} - -inline size_t getBitSize(WeightType type) -{ - switch (type) { - case WeightType::kFP32: - return 32; - case WeightType::kFP16: - return 16; - case WeightType::kFP8: - return 8; - case WeightType::kBF16: - return 16; - case WeightType::kINT8: - return 8; - case WeightType::kINT4: - return 4; - } - return 0; -} - -enum class LoraPolicy : int -{ - kNull, - kPlora, -}; - inline LoraPolicy getLoraPolicy(const std::string& policy) { if (policy == "plora") { @@ -96,20 +46,31 @@ struct LoraWeight { template struct LlamaDenseWeight { - size_t input_dims; - size_t output_dims; - void* kernel; + size_t input_dims = 0; + size_t output_dims = 0; + WeightType type; // uninitialized + void* kernel = nullptr; + T* bias = nullptr; + T* scales = nullptr; + T* zeros = nullptr; + T* scales_zeros = nullptr; + int group_size = 1; + LoraWeight lora; - WeightType type; - T* bias; - T* scales; - T* zeros; - T* scales_zeros; - int group_size; gemm::MatrixLayout k_desc; gemm::MatrixLayout q_desc; + LlamaDenseWeight(): type{}, lora{}, k_desc{}, q_desc{} {} + + LlamaDenseWeight(size_t input_dim, size_t output_dim, WeightType type, int group_size): LlamaDenseWeight{} + { + this->input_dims = input_dim; + this->output_dims = output_dim; + this->type = type; + this->group_size = group_size; + } + size_t kernel_size() const noexcept { return getBitSize(type) * input_dims * output_dims / 8; @@ -129,12 +90,121 @@ struct LlamaDenseWeight { { return {sizeof(T) * input_dims * lora.r, sizeof(T) * lora.r * output_dims}; } + + void malloc(cudaStream_t st, bool with_bias = false) + { + if (with_bias) { + deviceMalloc((T**)&bias, output_dims, 
st); + } + const size_t bit_size = getBitSize(type); + if (bit_size >= 16) { // fp16, fp32 + deviceMalloc((T**)&kernel, input_dims * output_dims, st); + } + else { // int8, int4 + const int factor = sizeof(float) * 8 / bit_size; + FT_CHECK(input_dims % factor == 0); + deviceMalloc((int**)&kernel, input_dims * output_dims / factor, st); + deviceMalloc((T**)&scales, input_dims / group_size * output_dims, st); + deviceMalloc((T**)&zeros, input_dims / group_size * output_dims, st); + } + + if (lora.r > 0) { + deviceMalloc((T**)&lora.a, input_dims * lora.r, st); + deviceMalloc((T**)&lora.b, lora.r * output_dims, st); + } + } + + void free(cudaStream_t st) + { + deviceFree(kernel, st); + deviceFree(bias, st); + deviceFree(scales, st); + deviceFree(zeros, st); + deviceFree(lora.a, st); + deviceFree(lora.b, st); + } }; template struct LlamaAttentionWeight { + + LlamaAttentionWeight() = default; + + LlamaAttentionWeight(size_t hidden_dim, + size_t head_dim, + size_t head_num, + size_t kv_head_num, + MLAParam mla, + bool bias, + size_t tp, + WeightType weight_type, + int group_size) + { + this->bias = bias; + if (mla.kv_lora_rank == 0) { + qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size}; + } + else { + const int qk_nope_dim = head_dim - mla.qk_rope_dim; + if (mla.q_lora_rank) { + q_a_proj = {hidden_dim, mla.q_lora_rank, weight_type, group_size}; + q_b_proj = {mla.q_lora_rank, head_num * head_dim / tp, weight_type, group_size}; + } + else { + q_proj = {hidden_dim, head_num * head_dim / tp, weight_type, group_size}; + } + kv_a_proj = {hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, weight_type, group_size}; + kv_b_proj = {mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp, weight_type, group_size}; + } + output = {(head_num * head_dim) / tp, hidden_dim, weight_type, group_size}; + } + + void malloc(cudaStream_t st) + { + if (qkv.output_dims) { + qkv.malloc(st, bias); + } + else { + if (q_proj.output_dims) { + q_proj.malloc(st); + } + else { + q_a_proj.malloc(st); + q_b_proj.malloc(st); + deviceMalloc((T**)&q_a_layernorm, q_b_proj.input_dims, st); + } + kv_a_proj.malloc(st); + kv_b_proj.malloc(st); + deviceMalloc((T**)&kv_a_layernorm, kv_b_proj.input_dims, st); + } + output.malloc(st, bias); + } + + void free(cudaStream_t st) + { + qkv.free(st); + q_proj.free(st); + q_a_proj.free(st); + q_b_proj.free(st); + kv_a_proj.free(st); + kv_b_proj.free(st); + output.free(st); + deviceFree(q_a_layernorm, st); + deviceFree(kv_a_layernorm, st); + } + LlamaDenseWeight qkv; LlamaDenseWeight output; + bool bias{}; + + LlamaDenseWeight q_proj; + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + + T* q_a_layernorm{}; + T* kv_a_layernorm{}; }; template @@ -172,6 +242,21 @@ struct LlamaFfnWeight { output.group_size = group_size; } + void malloc(cudaStream_t st) + { + gating.malloc(st); + intermediate.malloc(st); + output.malloc(st); + } + + void free(cudaStream_t st) + { + gating.free(st); + intermediate.free(st); + output.free(st); + fused_gating_intermediate.free(st); + } + LlamaDenseWeight gating; LlamaDenseWeight intermediate; LlamaDenseWeight output; @@ -186,23 +271,27 @@ struct MoeFfnWeight { MoeFfnWeight() = default; - MoeFfnWeight(size_t hidden_dim, - int inter_size, - int expert_num, - int method, - bool has_shared_gate, - size_t tp, - WeightType weight_type, - int group_size, - bool fuse_silu_act) + MoeFfnWeight(int layer_id, + const MoeParam& param, + size_t hidden_dim, + WeightType 
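// Not part of the patch: a standalone restatement of the shape arithmetic used by the MLA
// branch of LlamaAttentionWeight above (five projections replace the fused w_qkv). The
// numbers in the usage comment are illustrative placeholders, not taken from any model config.
struct MlaShapes {
    // each pair is {input_dims, output_dims} of the corresponding LlamaDenseWeight
    int q_a[2], q_b[2], q[2], kv_a[2], kv_b[2], out[2];
};

inline MlaShapes mla_shapes(int hidden_dim, int head_dim, int head_num, int tp,
                            int q_lora_rank, int kv_lora_rank, int qk_rope_dim, int v_head_dim)
{
    const int qk_nope_dim = head_dim - qk_rope_dim;
    MlaShapes s{};
    if (q_lora_rank) {  // low-rank Q path: q_a_proj -> q_a_layernorm -> q_b_proj
        s.q_a[0] = hidden_dim;   s.q_a[1] = q_lora_rank;
        s.q_b[0] = q_lora_rank;  s.q_b[1] = head_num * head_dim / tp;
    }
    else {              // plain q_proj
        s.q[0] = hidden_dim;     s.q[1] = head_num * head_dim / tp;
    }
    s.kv_a[0] = hidden_dim;    s.kv_a[1] = kv_lora_rank + qk_rope_dim;                    // not TP-split
    s.kv_b[0] = kv_lora_rank;  s.kv_b[1] = head_num * (qk_nope_dim + v_head_dim) / tp;
    s.out[0]  = head_num * head_dim / tp;  s.out[1] = hidden_dim;
    return s;
}

// hypothetical call, placeholder dimensions only:
//   mla_shapes(/*hidden*/2048, /*head_dim*/192, /*heads*/16, /*tp*/1,
//              /*q_lora*/0, /*kv_lora*/512, /*rope*/64, /*v_head*/128);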
weight_type, + int group_size, + size_t tp, + bool fuse_silu_act) { - // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (param.expert_num.size() <= layer_id) { + return; + } + + const int expert_num = param.expert_num[layer_id]; if (expert_num == 0) { return; } + // printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); + gate.input_dims = hidden_dim; gate.output_dims = expert_num; gate.type = get_default_weight_type(); @@ -210,15 +299,15 @@ struct MoeFfnWeight { experts.resize(expert_num); - this->method = method; - fuse_silu_act = fuse_silu_act && method; + method = param.method; + fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; for (auto& e : experts) { // inter size is divided by tp in `FfnWeight` - e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; + e = LlamaFfnWeight{hidden_dim, (size_t)param.inter_size, tp, weight_type, group_size, fuse_silu_act}; } - if (has_shared_gate) { + if (param.shared_gate) { shared_gate.input_dims = hidden_dim; shared_gate.output_dims = 1; shared_gate.type = get_default_weight_type(); @@ -229,14 +318,36 @@ struct MoeFfnWeight { } } + void malloc(cudaStream_t st) + { + gate.malloc(st); + if (shared_gate.output_dims) { + shared_gate.malloc(st); + } + for (auto& e : experts) { + e.malloc(st); + } + } + + void free(cudaStream_t st) + { + gate.free(st); + shared_gate.free(st); + for (auto& e : experts) { + e.free(st); + } + block.free(st); + } + LlamaDenseWeight gate; std::vector> experts; LlamaDenseWeight shared_gate; + // reference into `experts` LlamaFfnWeight block; - int method{}; + MoeParam::Method method{}; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 8cce207203..907467341a 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -27,21 +27,20 @@ namespace turbomind { template -void LlamaFfnLayer::allocateBuffer(size_t token_num, - int inter_size, - const LlamaDenseWeight* gating, - const LlamaDenseWeight* inter) +void LlamaFfnLayer::allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) { const size_t sz = token_num * inter_size; - const size_t sz_gate = token_num * gating->lora.r; - const size_t sz_inter = token_num * inter->lora.r; + const size_t sz_gate = token_num * gating_lora_r; + const size_t sz_inter = token_num * inter_lora_r; - gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * 2 + sz_gate + sz_inter), false); - inter_buf_ = gating_buf_ + sz; + gating_buf_ = + (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * inter_buf_factor + sz_gate + sz_inter), false); + inter_buf_ = gating_buf_ + sz; // gate & inter is not fused when lora is enabled - if (gating->lora.r) { + if (gating_lora_r) { inter_buf_ += sz_gate; } @@ -93,12 +92,16 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const int layer_id = input_tensors->getVal("layer_id"); const int inter_size = weights->inter_size; - allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); + const bool is_fused_silu = weights->fused_gating_intermediate.kernel && weights->is_fused_silu; + + allocateBuffer(token_num, inter_size, is_fused_silu ? 
1 : 2, weights->gating.lora.r, weights->intermediate.lora.r); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); int* lora_mask = input_tensors->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + const bool all_reduce = input_tensors->getVal("all_reduce", false); + if (weights->fused_gating_intermediate.kernel) { NvtxScope scope("fused_silu_ffn"); @@ -145,7 +148,8 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); - if (all_reduce_ && tensor_para_.world_size_ > 1) { + if (all_reduce && tensor_para_.world_size_ > 1) { + // std::cout << "ffn all reduce " << layer_id << "\n"; NcclGuard nccl_guard(tensor_para_, stream_); ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 2daca2cc95..a72a24701e 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -30,13 +30,12 @@ namespace turbomind { template class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): + LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx): hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()), - all_reduce_(all_reduce) + allocator_(ctx.allocator.get()) { } @@ -48,7 +47,8 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r); void freeBuffer(); @@ -59,7 +59,6 @@ class LlamaFfnLayer { cudaStream_t const stream_; LlamaLinear* const linear_; IAllocator* const allocator_; - const bool all_reduce_; bool is_free_buffer_after_forward_{}; T* gating_buf_{}; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 3d50910ad4..05b22deed5 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -72,7 +72,6 @@ LlamaV2::LlamaV2(const ModelParam& model, lora_param_(lora), head_num_(model.head_num), size_per_head_(model.head_dim), - inter_size_(model.inter_size), hidden_units_(model.hidden_units), layer_num_(model.layer_num), vocab_size_(model.vocab_size), diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 6321d09d7c..658282f5e5 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -113,7 +113,6 @@ class LlamaV2 { const size_t head_num_; const size_t size_per_head_; const size_t hidden_units_; - const size_t inter_size_; const size_t layer_num_; const size_t vocab_size_; const size_t vocab_size_padded_; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 9d62042d62..bcee150977 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -20,36 +20,24 @@ #include "src/turbomind/models/llama/LlamaWeight.h" #include 
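// Not part of the patch: the buffer-size arithmetic implied by the new `inter_buf_factor`
// argument of LlamaFfnLayer::allocateBuffer above. With a fused gate/up GEMM plus fused SiLU
// only one inter_size-wide slab per token is needed; otherwise gating and intermediate each
// need one. LoRA scratch for the two projections is appended as before.
#include <cstddef>

inline size_t ffn_workspace_elems(size_t token_num, size_t inter_size,
                                  bool is_fused_silu, size_t gating_lora_r, size_t inter_lora_r)
{
    const size_t inter_buf_factor = is_fused_silu ? 1 : 2;
    return token_num * inter_size * inter_buf_factor  // gating (+ intermediate) activations
         + token_num * gating_lora_r                  // LoRA scratch for the gating projection
         + token_num * inter_lora_r;                  // LoRA scratch for the intermediate projection
}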
"src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { template -LlamaWeight::LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t embedding_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - hidden_units_(hidden_units), - inter_size_(inter_size), - vocab_size_(vocab_size), - vocab_size_padded_(vocab_size), - embedding_size_(embedding_size), - num_layer_(num_layer), - weight_type_(weight_type), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LlamaWeight::LlamaWeight( + const ModelParam& model, const LoraParam& lora_param, const MoeParam& moe_param, size_t tp_size, size_t tp_rank): + hidden_units_(model.hidden_units), + inter_size_(model.inter_size), + vocab_size_(model.vocab_size), + vocab_size_padded_(model.vocab_size), + embedding_size_(model.embedding_size), + num_layer_(model.layer_num), + weight_type_(model.weight_type), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) { if (vocab_size_padded_ % tensor_para_size_ != 0) { vocab_size_padded_ = (vocab_size_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; @@ -61,49 +49,42 @@ LlamaWeight::LlamaWeight(size_t head_num, } FT_CHECK(hidden_units_ % tensor_para_size_ == 0); + check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + decoder_layer_weights.reserve(num_layer_); for (unsigned l = 0; l < num_layer_; ++l) { - decoder_layer_weights.push_back(new LlamaDecoderLayerWeight(l, - head_num, - kv_head_num, - size_per_head, - hidden_units_, - inter_size_, - weight_type_, - group_size, - lora_param, - attn_bias, - moe_param, - tensor_para_size_, - tensor_para_rank_)); + decoder_layer_weights.emplace_back( + new LlamaDecoderLayerWeight(l, model, lora_param, moe_param, tp_size, tp_rank)); + decoder_layer_weights.back()->malloc(stream_); } - mallocWeights(); + FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); + deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_, stream_); + deviceMalloc((T**)&output_norm_weight, hidden_units_, stream_); + deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_, stream_); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template LlamaWeight::~LlamaWeight() { - cudaFree((void*)pre_decoder_embedding_table); - cudaFree((void*)output_norm_weight); - cudaFree((void*)post_decoder_embedding_kernel); - - pre_decoder_embedding_table = nullptr; - output_norm_weight = nullptr; - post_decoder_embedding_kernel = nullptr; + deviceFree(pre_decoder_embedding_table, stream_); + deviceFree(output_norm_weight, stream_); + deviceFree(post_decoder_embedding_kernel, stream_); for (auto& p : decoder_layer_weights) { + p->free(stream_); delete p; } -} -template -void LlamaWeight::mallocWeights() -{ - FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_); - deviceMalloc((T**)&output_norm_weight, hidden_units_); - deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_); + decoder_layer_weights.clear(); + + // Wait for deallocations + 
check_cuda_error(cudaStreamSynchronize(stream_)); + check_cuda_error(cudaStreamDestroy(stream_)); + stream_ = {}; } template @@ -179,13 +160,19 @@ void LlamaWeight::prepare(const cudaDeviceProp& prop) TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d\n", workspace_size); + // Wait for the weights to be filled externally + check_cuda_error(cudaDeviceSynchronize()); + if (workspace_size) { - deviceMalloc((char**)&workspace, workspace_size); + deviceMalloc((char**)&workspace, workspace_size, stream_); } for (auto& layer : decoder_layer_weights) { - layer->prepare(workspace, workspace_size, prop); + layer->prepare(workspace, workspace_size, prop, stream_); } - deviceFree(workspace); + + deviceFree(workspace, stream_); + + check_cuda_error(cudaStreamSynchronize(stream_)); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index c30e753565..629cd56120 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -22,28 +22,18 @@ #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/memory_utils.h" namespace turbomind { template struct LlamaWeight { LlamaWeight() = default; - LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t embedding_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + + LlamaWeight(const ModelParam& model_param, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); ~LlamaWeight(); @@ -57,15 +47,13 @@ struct LlamaWeight { void prepare(const cudaDeviceProp& prop); std::vector*> decoder_layer_weights; - const T* pre_decoder_embedding_table{}; - const T* output_norm_weight{}; - const T* post_decoder_embedding_kernel{}; -private: - void mallocWeights(); + T* pre_decoder_embedding_table{}; + T* output_norm_weight{}; + T* post_decoder_embedding_kernel{}; +private: size_t hidden_units_; - size_t inter_size_; size_t vocab_size_; size_t vocab_size_padded_; size_t embedding_size_; @@ -73,6 +61,10 @@ struct LlamaWeight { WeightType weight_type_; size_t tensor_para_size_; size_t tensor_para_rank_; + + std::vector inter_size_; + + cudaStream_t stream_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_gemm.cc b/src/turbomind/models/llama/llama_gemm.cc index 62952cd715..f9a0191e4b 100644 --- a/src/turbomind/models/llama/llama_gemm.cc +++ b/src/turbomind/models/llama/llama_gemm.cc @@ -84,7 +84,7 @@ int main(int argc, char* argv[]) return -1; } else { - ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, false); + ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, nullptr, false); } if (0) {} diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h index 3b01dee60d..aaade1a513 100644 --- a/src/turbomind/models/llama/llama_kernels.h +++ b/src/turbomind/models/llama/llama_kernels.h @@ -154,7 +154,7 @@ template struct TempBuffer { TempBuffer(size_t size) { - deviceMalloc(&data, size, false); + cudaMalloc(&data, size); } T* data; }; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index e6b9d690ae..0a505b11a9 100644 --- a/src/turbomind/models/llama/llama_params.h +++ 
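// Not part of the patch: the weight classes above now allocate and free on a dedicated
// non-blocking stream and synchronize once, instead of issuing blocking per-tensor calls.
// A minimal illustration of that pattern using plain CUDA stream-ordered allocation
// (requires CUDA 11.2+; the patch itself routes through TurboMind's deviceMalloc/deviceFree
// helpers, and error checking is omitted here for brevity).
#include <cuda_runtime.h>
#include <vector>

inline void alloc_weights_async(std::vector<void*>& bufs, const std::vector<size_t>& sizes)
{
    cudaStream_t stream{};
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    bufs.resize(sizes.size());
    for (size_t i = 0; i < sizes.size(); ++i) {
        cudaMallocAsync(&bufs[i], sizes[i], stream);  // stream-ordered, no device-wide sync
    }
    cudaStreamSynchronize(stream);  // single wait after everything has been queued
    cudaStreamDestroy(stream);
}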
b/src/turbomind/models/llama/llama_params.h @@ -2,28 +2,41 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include #include #include #include +#include "src/turbomind/models/llama/weight_type.h" + namespace turbomind { +struct MLAParam { + size_t q_lora_rank; + size_t kv_lora_rank; + size_t qk_rope_dim; + size_t v_head_dim; +}; + struct ModelParam { - size_t head_num; - size_t head_dim; - size_t kv_head_num; - size_t hidden_units; - size_t layer_num; - size_t inter_size; - size_t vocab_size; - size_t embedding_size; - float norm_eps; - int quant_policy; - // - int start_id; - int end_id; + size_t head_num; + size_t head_dim; + size_t kv_head_num; + size_t hidden_units; + size_t layer_num; + size_t vocab_size; + size_t embedding_size; + float norm_eps; + int quant_policy; + bool attn_bias; + WeightType weight_type; + int group_size; + int start_id; + int end_id; + MLAParam mla; + int tune_layer_num; + + std::vector inter_size; }; struct MoeParam { @@ -32,17 +45,25 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; - bool norm_topk; - bool shared_gate; + + int experts_per_token; + int inter_size; + bool norm_topk_prob; + bool shared_gate; + float routed_scale; + + int topk_group; + std::string topk_method; + int n_group; + + std::vector expert_num; }; struct AttentionParam { int rotary_embedding_dim; float rotary_embedding_base; int max_position_embeddings; + float softmax_scale; std::string rope_scaling_type; int original_max_position_embeddings; float rope_scaling_factor; @@ -74,6 +95,12 @@ struct EngineParam { int max_prefill_iters; }; +enum class LoraPolicy : int +{ + kNull, + kPlora, +}; + struct LoraParam { int r; float scale; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 925c6b8831..eaa450ae20 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -1,47 +1,25 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/cuda_utils.h" #include #include #include #include +#include +#include + #include #include #include #include #include -#include + +#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { CmpMode compare_mode = kCmpRead; - -template -struct abs_diff_t { - using type = T; -}; - -template<> -struct abs_diff_t { - using type = float; -}; - -template<> -struct abs_diff_t<__nv_bfloat16> { - using type = float; -}; - -template -struct abs_diff: public thrust::unary_function, typename abs_diff_t::type> { - __host__ __device__ float operator()(thrust::tuple x) const - { - using R = typename abs_diff_t::type; - auto r = R(thrust::get<0>(x)) - R(thrust::get<1>(x)); - return r < R(0) ? 
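// Not part of the patch: with ModelParam::inter_size and MoeParam::expert_num now per-layer
// vectors (see llama_params.h above), layers may differ, e.g. a dense first block followed by
// MoE blocks. A hypothetical helper sketching how such a spec could be populated; how the
// converter actually fills these fields is defined elsewhere in the patch.
#include <vector>

struct LayerFfnSpec {
    std::vector<int> inter_size;  // dense (or shared-expert) FFN width per layer, 0 if absent
    std::vector<int> expert_num;  // routed experts per layer, 0 for a dense layer
};

inline LayerFfnSpec make_hybrid_spec(int layer_num, int dense_inter, int moe_expert_num)
{
    LayerFfnSpec spec;
    spec.inter_size.assign(layer_num, 0);
    spec.expert_num.assign(layer_num, moe_expert_num);
    spec.inter_size[0] = dense_inter;  // layer 0: ordinary FFN
    spec.expert_num[0] = 0;            // ...and no routed experts
    return spec;
}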
-r : r; - } -}; +// CmpMode compare_mode = kCmpWrite; template void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) @@ -63,10 +41,8 @@ void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) template void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) { - // wait for b - check_cuda_error(cudaStreamSynchronize(stream)); // read a from file - thrust::host_vector h_a(size); + std::vector h_a(size); { const auto filename = "tmp/" + key + ".cmp"; std::ifstream ifs(filename, std::ios::binary); @@ -85,15 +61,30 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) } ifs.read((char*)h_a.data(), sizeof(T) * h_a.size()); } - // copy a to device - thrust::device_vector a = h_a; - // create abs(a - b) iterator - thrust::device_ptr dev_ptr(ptr); - auto zip_iter = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), dev_ptr)); - auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff{}); - // sum(abs(a - b)) - auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size); - std::cerr << key << ": " << asum << " " << asum / size << "\n"; + std::vector h_b(size); + check_cuda_error(cudaMemcpyAsync(h_b.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); + + using Tacc = std::conditional_t, int64_t, float>; + constexpr Tacc eps = std::is_integral_v ? 1 : 1e-8f; + + Tacc asum{}; + Tacc rsum{}; + Tacc amean{}; + for (size_t i = 0; i < size; ++i) { + Tacc x = (Tacc)h_b[i]; + Tacc r = (Tacc)h_a[i]; + Tacc abs_diff = std::abs(x - r); + Tacc rel_diff = abs_diff / std::max(std::max(std::abs(r), std::abs(x)), eps); + asum += abs_diff; + rsum += rel_diff; + amean += std::abs(r); + } + + std::cerr << key << ": " << amean / size << " " << asum << " " << asum / size << " " << rsum / size << "\n"; + + check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); } template diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu new file mode 100644 index 0000000000..2f9e786f2a --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.cu @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
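// Not part of the patch: the rewritten CmpRead above replaces the thrust reduction with a
// plain host loop. The same comparison as a standalone helper for host buffers, accumulating
// absolute and relative error in float with an eps guard against division by near-zero.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

template<class T>
void compare_buffers(const T* test, const T* ref, size_t size, const char* key)
{
    const float eps = 1e-8f;
    float abs_sum = 0.f, rel_sum = 0.f, ref_abs_sum = 0.f;
    for (size_t i = 0; i < size; ++i) {
        const float x        = static_cast<float>(test[i]);
        const float r        = static_cast<float>(ref[i]);
        const float abs_diff = std::abs(x - r);
        abs_sum     += abs_diff;
        rel_sum     += abs_diff / std::max({std::abs(r), std::abs(x), eps});
        ref_abs_sum += std::abs(r);
    }
    std::printf("%s: ref_mean=%g abs_sum=%g abs_mean=%g rel_mean=%g\n",
                key, ref_abs_sum / size, abs_sum, abs_sum / size, rel_sum / size);
}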
+#include "src/turbomind/kernels/core/array_ops.h" + +namespace turbomind { + +template +__global__ void mla_copy_qkv_kernel(T* qkv, + const T* q, // [h, head_dim] + const T* kv_a, // [kv_lora_rank, rope_dim] + const T* kv_b, // [h, nope_dim + v_head_dim] + int head_num, + int head_dim, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim) +{ + const int type = blockIdx.y; + + const int64_t ti = blockIdx.x; + const int di = threadIdx.x; + + const int kv_b_dim = nope_dim + v_head_dim; + + // for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { + const int hi = threadIdx.y; + Array data{}; + if (type == 0) { // Q + if (di * vec_size < rope_dim) { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + nope_dim + di * vec_size]); + } + else { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size - rope_dim]); + } + } + else if (type == 1) { // K + if (di * vec_size < rope_dim) { + Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); + } + else { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); + } + } + else { // V + if (di * vec_size < v_head_dim) { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); + } + } + const int stride = 3 * head_num * head_dim; + Store(&qkv[ti * stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); + // } +} + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + constexpr int vec_size = 16 / sizeof(T); + const int head_dim = nope_dim + rope_dim; + + dim3 block(head_dim / vec_size, head_num); + // make sure block size <= 1024 + while (block.x * block.y > 1024) { + block.y /= 2; + } + const dim3 grid(token_num, 3); + + mla_copy_qkv_kernel<<>>( + qkv, q, kv_a, kv_b, head_num, head_dim, nope_dim, rope_dim, kv_lora_rank, v_head_dim); +} + +template void invokeMLACopyQKV(uint16_t* qkv, + const uint16_t* q, + const uint16_t* kv_a, + const uint16_t* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h new file mode 100644 index 0000000000..bc06a352f9 --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.h @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+#pragma once + +#include +#include + +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +template +void dispatchMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + auto invoke = [&](auto x) { + using type = decltype(x); + invokeMLACopyQKV((type*)qkv, + (const type*)q, + (const type*)kv_a, + (const type*)kv_b, + token_num, + head_num, + nope_dim, + rope_dim, + kv_lora_rank, + v_head_dim, + stream); + }; + if constexpr (sizeof(T) == 2) { + return invoke(uint16_t{}); + } + FT_CHECK(0); +} + +} // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index 1ad76839d1..390d147540 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -11,22 +11,21 @@ #include "src/turbomind/utils/nvtx_utils.h" #include "src/turbomind/utils/string_utils.h" #include -#include #include namespace turbomind { template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) +void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor) { char* base = 0; auto allocate = [&](void* base) { Monotonic alloc{base}; alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); - alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * 2); - alloc(&logits_, tokens * param_.expert_num); - alloc(&masks_, param_.expert_num * padded); + alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * inter_buf_factor); + alloc(&logits_, tokens * expert_num); + alloc(&masks_, expert_num * padded); alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); @@ -80,18 +79,42 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama template void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { - const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const int expert_num = moe.experts.size(); - AllocateBuffer(tokens, padded); + FT_CHECK(expert_num); + + const size_t inter_buf_factor = [&] { + if (param_.method == MoeParam::kNaive) { + return 0; // managed by ffn + } + else if (moe.block.is_fused_silu) { + return 1; + } + else { + return 2; + } + }(); + + AllocateBuffer(tokens, padded, expert_num, inter_buf_factor); gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); - check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); - sync_check_cuda_error(); + // if (tensor_para_.rank_ == 0) { + // Compare(logits_, tokens * expert_num, Concat("logit", layer_id), compare_mode, stream_); + // } + + check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); + check_cuda_error(cudaMemsetAsync(masks_, -1, sizeof(int8_t) * expert_num * padded, stream_)); // dump_logits(tokens, layer_id); + if (param_.topk_method == "group_limited_greedy") { + invokeMaskMoeTopKGroups(logits_, 
tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); + sync_check_cuda_error(); + } + /// TODO: fix illegal memory access even if NaN are present in logits invokeMoeGate_V2(f2n_, en2f_, @@ -102,25 +125,26 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id logits_, tokens, padded, - param_.expert_num, + expert_num, param_.experts_per_token, - param_.norm_topk, + param_.norm_topk_prob, + param_.routed_scale, stream_); sync_check_cuda_error(); if (isTuning()) { std::mt19937 g; - const auto expert_ids = SampleUniform(tokens, param_.expert_num, param_.experts_per_token, g); - std::vector cnt(param_.expert_num); + const auto expert_ids = SampleUniform(tokens, expert_num, param_.experts_per_token, g); + std::vector cnt(expert_num); for (const auto& x : expert_ids) { ++cnt[x]; } h_offsets_[0] = 0; - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { h_offsets_[i + 1] = h_offsets_[i] + cnt[i]; } check_cuda_error( - cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); } if (param_.method == MoeParam::kNaive) { @@ -129,15 +153,15 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id sync_check_cuda_error(); check_cuda_error( - cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); - if (h_offsets_[param_.expert_num] != tokens * param_.experts_per_token) { - FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[param_.expert_num], tokens * param_.experts_per_token)); + if (h_offsets_[expert_num] != tokens * param_.experts_per_token) { + FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[expert_num], tokens * param_.experts_per_token)); } - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { FT_CHECK(moe.experts[i].is_fused_silu == false); @@ -153,7 +177,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } } else { - context_->set_offsets(offsets_); + context_->update(expert_num, param_.experts_per_token, offsets_); auto& block = moe.block; @@ -217,7 +241,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } template -void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe) { invokeMoeReduce(output, inout_buf_, @@ -227,19 +251,21 @@ void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) tokens, param_.experts_per_token, hidden_dim_, + output_scale, stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { + // std::cout << "moe all reduce " << layer_id << "\n"; ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } } template -void MoeFfnLayer::dump_logits(int token_num, int layer_id) +void MoeFfnLayer::dump_logits(int token_num, int layer_id, int expert_num) { - std::vector logits(token_num * param_.expert_num); + std::vector logits(token_num * expert_num); check_cuda_error( cudaMemcpyAsync(logits.data(), logits_, sizeof(float) * logits.size(), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); @@ -247,7 
+273,7 @@ void MoeFfnLayer::dump_logits(int token_num, int layer_id) auto ptr = logits.data(); std::cout << "layer_id: " << layer_id << std::endl; for (int i = 0; i < token_num; ++i) { - for (int e = 0; e < param_.expert_num; ++e) { + for (int e = 0; e < expert_num; ++e) { std::cout << *ptr++ << " "; } std::cout << std::endl; diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 0f1713f7b5..74c62d004b 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -9,6 +9,7 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/nccl_utils.h" +#include namespace turbomind { @@ -26,23 +27,24 @@ class MoeFfnLayer { linear_(ctx.linear.get()), allocator_(ctx.allocator.get()) { - model.inter_size = param.inter_size; + FT_CHECK(!param.expert_num.empty()); + const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); if (param_.method == MoeParam::kFused) { context_ = std::make_unique( - param.expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); + max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); } else { - expert_ffn_ = std::make_unique>(model, tp, ctx, false); + expert_ffn_ = std::make_unique>(model, tp, ctx); } - h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1), false, true); + h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); - offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1)); - accum_ = (int*)allocator_->malloc(sizeof(int) * param_.expert_num * kMoeGateMaxTiles); + offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1)); + accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); } - void AllocateBuffer(size_t tokens, size_t padded); + void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor); void FreeBuffer(); @@ -53,11 +55,11 @@ class MoeFfnLayer { void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - void reduce(T* output, int tokens, const MoeFfnWeight& moe); + void reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); - void dump_logits(int token_num, int layer_id); + void dump_logits(int token_num, int layer_id, int expert_num); private: const size_t inter_size_; diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 2f99b0c2ce..7a6eddc4ba 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -19,21 +19,24 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc -#include "src/turbomind/models/llama/unified_attention_layer.h" +#include +#include + #include "src/turbomind/kernels/attention/attention.h" #include "src/turbomind/kernels/attention/decoding.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" +#include 
"src/turbomind/models/llama/mla_utils.h" +#include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" -#include -#include +#include "src/turbomind/utils/memory_utils.h" namespace turbomind { @@ -72,17 +75,14 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, } template -void UnifiedAttentionLayer::allocateBuffer(size_t q_count, - size_t k_count, - size_t batch_size, - const WeightType* weights) +void UnifiedAttentionLayer::allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - if (weights->qkv.lora.r) { - size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + weights->qkv.lora.r); + if (qkv_lora_rank) { + size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + qkv_lora_rank); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sz, false); } else { @@ -198,28 +198,38 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa allocateBuffer(token_num, // shared h_cu_k_len[batch_size] - h_cu_k_len[dc_batch_size], // prefill batch_size, - weights); + weights->qkv.lora.r); // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; - static int count = 0; + // static int count = 0; - // if (layer_id == 0 && count == 0) { - // Compare(attention_input, token_num * weights->qkv.input_dims, "qkv_input", compare_mode, stream_); + // if (tensor_para_.rank_ == 0) { + // Compare(attention_input, token_num * hidden_units_, Concat("qkv_input", layer_id), compare_mode, stream_); // } int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - ////////////////////////////////////////////// - /// qkv gemm - // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] - linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); - sync_check_cuda_error(); + + if (weights->qkv.output_dims) { + ////////////////////////////////////////////// + /// qkv gemm + // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] + linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); + } + else { + forward_mla(attention_input, token_num, *weights); + } + + // std::cerr << layer_id << " " << count << " " << tensor_para_.rank_ << "\n"; count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); - // if (layer_id == 0 && count == 0) { - // Compare(qkv_buf_, token_num * weights->qkv.output_dims, "qkv_buf", compare_mode, stream_); + // std::cerr << "token num: " << token_num << "\n"; + + // if (layer_id == 0 && count == 0 && tensor_para_.rank_ == 0) { + // Compare(qkv_buf_, token_num * (3 * local_head_num_ * size_per_head_), "qkv_buf", CMP_MODE, stream_); // } if constexpr (0) { @@ -290,8 +300,15 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.num_heads = local_head_num_; params.num_kv_heads = local_kv_head_num_; params.size_per_head = size_per_head_; + // MSVC does not have M_LOG2E - params.inv_sqrt_dh = (float)std::log2(expf(1.)) / std::sqrt((float)params.size_per_head); + params.inv_sqrt_dh = 
(float)std::log2(expf(1.)); + if (param_.softmax_scale) { // model predefined softmax scale + params.inv_sqrt_dh *= param_.softmax_scale; + } + else { // default value + params.inv_sqrt_dh /= std::sqrt((float)params.size_per_head); + } params.rotary_embedding_dim = param_.rotary_embedding_dim; params.rotary_embedding_base = param_.rotary_embedding_base; @@ -324,8 +341,9 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa }; float low, high; find_correction_range(param_.beta_fast, param_.beta_slow, low, high); + // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216 if (low == high) { - high += 0.01f; + high += 0.001f; } params.yarn_ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0; params.yarn_ramp_inv_factor_mul_min = 1.0 / (high - low) * low; @@ -415,8 +433,6 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); - // ++count; - count_and_fix(attention_out, token_num * weights->output.output_dims, Concat("wo", layer_id), 3); if (tensor_para_.world_size_ > 1) { @@ -425,10 +441,94 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa sync_check_cuda_error(); } + // if (tensor_para_.rank_ == 0) { + // Compare(attention_out, token_num * hidden_units_, Concat("attn_out", layer_id), compare_mode, stream_); + // // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); + // } + if (is_free_buffer_after_forward_ == true) { freeBuffer(); } sync_check_cuda_error(); + + // ++count; +} + +template +void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const WeightType& w) +{ + const int q_lora_rank = w.q_a_proj.output_dims; + const int kv_lora_rank = w.kv_b_proj.input_dims; + const int qk_rope_dim = w.kv_a_proj.output_dims - kv_lora_rank; + const int qk_nope_dim = std::max(w.q_b_proj.output_dims, w.q_proj.output_dims) / local_head_num_ - qk_rope_dim; + const int v_head_dim = w.kv_b_proj.output_dims / local_head_num_ - qk_nope_dim; + + T* q{}; + + if (w.q_proj.kernel) { + deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); + linear_->forward(q, inputs, token_num, w.q_proj); + sync_check_cuda_error(); + } + else { + T* q_a{}; + deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); + + linear_->forward(q_a, inputs, token_num, w.q_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm(q_a, + q_lora_rank, + q_a, + q_lora_rank, + w.q_a_layernorm, + q_lora_rank, + token_num, + model_param_.norm_eps, + stream_); + sync_check_cuda_error(); + + deviceMalloc((T**)&q, (size_t)token_num * w.q_b_proj.output_dims, stream_); + linear_->forward(q, q_a, token_num, w.q_b_proj); + sync_check_cuda_error(); + + deviceFree(q_a, stream_); + } + + T* kv_a{}; + const int kv_a_dim = w.kv_a_proj.output_dims; + deviceMalloc((T**)&kv_a, (size_t)token_num * kv_a_dim, stream_); + + linear_->forward(kv_a, inputs, token_num, w.kv_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm( + kv_a, kv_a_dim, kv_a, kv_a_dim, w.kv_a_layernorm, kv_lora_rank, token_num, model_param_.norm_eps, stream_); + sync_check_cuda_error(); + + T* kv_b{}; + deviceMalloc((T**)&kv_b, (size_t)token_num * w.kv_b_proj.output_dims, stream_); + sync_check_cuda_error(); + + linear_->forward(kv_b, {kv_a, kv_a_dim}, token_num, w.kv_b_proj); + sync_check_cuda_error(); + + dispatchMLACopyQKV(qkv_buf_, + 
q, + kv_a, + kv_b, + token_num, + local_head_num_, + qk_nope_dim, + qk_rope_dim, + kv_lora_rank, + v_head_dim, + stream_); + sync_check_cuda_error(); + + deviceFree(q, stream_); + deviceFree(kv_a, stream_); + deviceFree(kv_b, stream_); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index da0c0e6fc8..7d331b0e41 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -42,7 +42,7 @@ class UnifiedAttentionLayer { static constexpr int kMaxWorkspaceTokens = 4096; void freeBuffer(); - void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, const WeightType* weights); + void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank); void allocateWorkspace(); void freeWorkspace(); @@ -70,7 +70,7 @@ class UnifiedAttentionLayer { const NcclParam& tp, const Context& context); - void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight* weights); + void forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights); void prefill(T* output, T* tmp_kv_buffer, @@ -107,6 +107,9 @@ class UnifiedAttentionLayer { int max_split_k, const WeightType* weights); +private: + void forward_mla(const T* inputs, int token_num, const WeightType& weights); + private: const size_t head_num_; const size_t kv_head_num_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 28e8b5f649..ec0e75b7e5 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -1,13 +1,17 @@ -#include "src/turbomind/models/llama/unified_decoder.h" + +#include + +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" +#include "src/turbomind/models/llama/unified_decoder.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include namespace turbomind { @@ -23,17 +27,19 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), allocator_(ctx.allocator.get()), - dtype_(getTensorType()) + tp_(tp), + dtype_(getTensorType()), + tune_layer_num_(model.tune_layer_num) { attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - if (moe.expert_num) { + if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) { moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); } - if (model.inter_size) { - ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { + ffn_layer_ = std::make_unique>(model, tp, ctx); } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); @@ -65,13 +71,13 @@ void UnifiedDecoder::freeBuffer() } template -void UnifiedDecoder::forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight) +void UnifiedDecoder::forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + 
int layer_id, + const WeightType* weight) { TensorMap inputs(*_inputs); inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); @@ -84,7 +90,7 @@ void UnifiedDecoder::forwardSelfAttn(T* attn_io, TensorMap outputs(*_outputs); outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - attn_layer_->forward(&outputs, &inputs, weight); + attn_layer_->forward(&outputs, &inputs, &weight->self_attn_weights); } template @@ -141,19 +147,15 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con const int pf_offset = dc_batch_size; - // Compare(decoder_input_output, token_num * hidden_units_, "decoder_input", kCmpRead, stream_); - - // printf("%d %f\n", (int)token_num, rmsnorm_eps_); - ///////////////////////////////////////////// /// RMSNorm - invokeRootMeanSquareNorm(decoder_output, - decoder_input_output, - weights->at(0)->self_attn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeRMSNorm(decoder_output, + decoder_input_output, + weights->at(0)->self_attn_norm_weights, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", 0), 2); @@ -161,12 +163,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con for (size_t layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous - if (isTuning() && layer != 0) { + if (isTuning() && layer >= tune_layer_num_) { continue; } - // Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); - ///////////////////////////////////////////// /// self-attention forwardSelfAttn(decoder_output, // @@ -175,18 +175,18 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con token_num, batch_size, layer, - &weights->at(layer)->self_attn_weights); + weights->at(layer)); count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); - invokeFusedAddBiasResidualRMSNorm(decoder_input_output, - decoder_output, - weights->at(layer)->self_attn_weights.output.bias, - weights->at(layer)->ffn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeBiasResidualRMSNorm(decoder_input_output, + decoder_output, + weights->at(layer)->ffn_norm_weights, + weights->at(layer)->self_attn_weights.output.bias, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual0", layer), 2); @@ -195,14 +195,17 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con //////////////////////////////////////////// /// feed-forward network - if (!weights->at(layer)->moe_weights.experts.empty()) { + const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); + if (is_moe) { moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - if (ffn_layer_) { - int layer_id = layer; // int is needed + if (weights->at(layer)->ffn_weights.output.kernel) { + int layer_id = layer; // int is needed + bool all_reduce = !is_moe; TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; + {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}, + {"all_reduce", {MEMORY_CPU, TYPE_BOOL, {1}, &all_reduce}}}; TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, 
dtype_, {token_num, hidden_units_}, decoder_output}}}; if (inputs->isExist("lora_mask")) { ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); @@ -210,8 +213,8 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } - if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + if (is_moe) { + moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); } count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index f13b4ba842..e08567136d 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -22,7 +22,9 @@ class UnifiedDecoder { const float rmsnorm_eps_; cudaStream_t const stream_; IAllocator* const allocator_; + const NcclParam tp_; const DataType dtype_; + const int tune_layer_num_; bool is_free_buffer_after_forward_{}; int* cu_q_len_{}; @@ -39,13 +41,13 @@ class UnifiedDecoder { using WeightType = LlamaDecoderLayerWeight; - void forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight); + void forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + int layer_id, + const WeightType* weight); public: UnifiedDecoder(const ModelParam& model, diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h new file mode 100644 index 0000000000..bc2f49a08e --- /dev/null +++ b/src/turbomind/models/llama/weight_type.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include + +namespace turbomind { + +enum class WeightType : int +{ + kFP32, + kFP16, + kFP8, // not supported yet + kBF16, + kINT8, + kINT4 +}; + +template +constexpr WeightType get_default_weight_type() +{ + if constexpr (std::is_same_v) { + return WeightType::kFP16; + } + else if constexpr (std::is_same_v) { + return WeightType::kBF16; + } + else if constexpr (std::is_same_v) { + return WeightType::kFP32; + } + else { + static_assert(sizeof(T) != sizeof(T), "not implemented"); + return {}; + } +} + +inline size_t getBitSize(WeightType type) +{ + switch (type) { + case WeightType::kFP32: + return 32; + case WeightType::kFP16: + return 16; + case WeightType::kFP8: + return 8; + case WeightType::kBF16: + return 16; + case WeightType::kINT8: + return 8; + case WeightType::kINT4: + return 4; + } + return 0; +} + +} // namespace turbomind diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 4eb34249ff..5a344d9545 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -215,6 +215,51 @@ DLTensor GetDLTensor(py::object obj) return dlmt->dl_tensor; } +static void safe_memcpy(void* dst, const void* src, size_t size) +{ + cudaPointerAttributes dat{}; + cudaPointerAttributes sat{}; + ft::check_cuda_error(cudaPointerGetAttributes(&dat, dst)); + ft::check_cuda_error(cudaPointerGetAttributes(&sat, src)); + try { + if (dat.devicePointer && sat.devicePointer) { + // Both can be accessed from current context + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + else if (dat.type == cudaMemoryTypeDevice && sat.type == 
cudaMemoryTypeDevice) { + if (dat.device != sat.device) { + // On different devices, try peer memcpy + ft::check_cuda_error(cudaMemcpyPeer(dst, dat.device, src, sat.device, size)); + } + else { + // Same device, switch to the device first (this is unlikely) + ft::CudaDeviceGuard guard(dat.device); + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + else { + // Unknown case, give it a try anyway + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + catch (...) { + int device_id{-1}; + cudaGetDevice(&device_id); + TM_LOG_ERROR("cudaMemcpy failed: dst=(%d, %d, %p, %p), src=(%d, %d, %p, %p), size=%s, device=%d", + (int)dat.type, + dat.device, + dat.devicePointer, + dat.hostPointer, + (int)sat.type, + sat.device, + sat.devicePointer, + sat.hostPointer, + std::to_string(size).c_str(), + device_id); + throw; + } +} + PYBIND11_MODULE(_turbomind, m) { // nccl param @@ -293,8 +338,7 @@ PYBIND11_MODULE(_turbomind, m) std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - cudaMemcpy( - const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); + safe_memcpy(const_cast(self->data), src->data, num_bytes); break; } default: diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 2deca46380..1c7c5eb468 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -256,22 +256,30 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.kv_head_num = model_reader["kv_head_num"].as(0); model_param_.hidden_units = model_reader["hidden_units"].as(); model_param_.layer_num = model_reader["num_layer"].as(); - model_param_.inter_size = model_reader["inter_size"].as(); model_param_.vocab_size = model_reader["vocab_size"].as(); model_param_.embedding_size = model_reader["embedding_size"].as(); model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); + model_param_.tune_layer_num = model_reader["tune_layer_num"].as(1); + model_param_.mla.q_lora_rank = model_reader["q_lora_rank"].as(); + model_param_.mla.kv_lora_rank = model_reader["kv_lora_rank"].as(); + model_param_.mla.qk_rope_dim = model_reader["qk_rope_dim"].as(); + model_param_.mla.v_head_dim = model_reader["v_head_dim"].as(); attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); model_param_.quant_policy = engine_reader["quant_policy"].as(0); - + YAML::Node inter_size = model_reader["inter_size"]; + for (auto it = inter_size.begin(); it != inter_size.end(); ++it) { + model_param_.inter_size.push_back(it->as()); + } // Only weight classes need these - attn_bias_ = model_reader["attn_bias"].as(0); - group_size_ = model_reader["group_size"].as(0); + model_param_.attn_bias = model_reader["attn_bias"].as(0); + model_param_.group_size = model_reader["group_size"].as(0); // rotary embedding parameters attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); attn_param_.rotary_embedding_base = attention_reader["rope_theta"].as(10000.0f); + attn_param_.softmax_scale = attention_reader["softmax_scale"].as(0); attn_param_.attention_factor = attention_reader["attention_factor"].as(-1.f); attn_param_.beta_fast = 
attention_reader["beta_fast"].as(32.f); attn_param_.beta_slow = attention_reader["beta_slow"].as(1.f); @@ -297,19 +305,27 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); - lora_param_.r = lora_reader["lora_r"].as(0); - lora_param_.scale = lora_reader["lora_scale"].as(0); - lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); - lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), + lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.r = lora_reader["lora_r"].as(0); + lora_param_.scale = lora_reader["lora_scale"].as(0); + lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); + lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), [](const std::string& s) { return std::stoi(s); }); - lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), + lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), [](const std::string& s) { return std::stof(s); }); - moe_param_.expert_num = model_reader["expert_num"].as(0); + moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); - moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); - moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(); + moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(); + moe_param_.routed_scale = model_reader["routed_scale"].as(1.f); + moe_param_.topk_group = model_reader["topk_group"].as(1); + moe_param_.topk_method = model_reader["topk_method"].as("greedy"); + moe_param_.n_group = model_reader["moe_group_num"].as(1); + YAML::Node expert_num = model_reader["expert_num"]; + for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { + moe_param_.expert_num.push_back(it->as()); + } handleMissingParams(); @@ -321,19 +337,19 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - weight_type_ = ft::WeightType::kFP16; + model_param_.weight_type = ft::WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - weight_type_ = ft::WeightType::kBF16; + model_param_.weight_type = ft::WeightType::kBF16; } else if (weight_type_str == "fp32") { - weight_type_ = ft::WeightType::kFP32; + model_param_.weight_type = ft::WeightType::kFP32; } else if (weight_type_str == "int8") { - weight_type_ = ft::WeightType::kINT8; + model_param_.weight_type = ft::WeightType::kINT8; } else if (weight_type_str == "int4") { - weight_type_ = ft::WeightType::kINT4; + model_param_.weight_type = ft::WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; @@ -418,21 +434,8 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>(model_param_.head_num, - model_param_.kv_head_num, - model_param_.head_dim, - 
model_param_.hidden_units, - model_param_.inter_size, - model_param_.vocab_size, - model_param_.embedding_size, - model_param_.layer_num, - attn_bias_, - weight_type_, - group_size_, - lora_param_, - moe_param_, - tensor_para_size_, - tensor_para_rank); + weights_[device_id] = std::make_shared>( + model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -488,9 +491,11 @@ std::string LlamaTritonModel::toString() std::stringstream ss; ss << "Model: " // << "\nhead_num: " << model_param_.head_num << "\nkv_head_num: " << model_param_.kv_head_num - << "\nsize_per_head: " << model_param_.head_dim << "\ninter_size: " << model_param_.inter_size + << "\nsize_per_head: " + << model_param_.head_dim + // << "\ninter_size: " << model_param_.inter_size << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size - << "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_param_.max_batch_size + << "\nattn_bias: " << model_param_.attn_bias << "\nmax_batch_size: " << engine_param_.max_batch_size << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num << "\nmax_context_token_num: " << engine_param_.max_context_token_num << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter @@ -501,8 +506,9 @@ std::string LlamaTritonModel::toString() << "\nenable_prefix_caching: " << engine_param_.enable_prefix_caching << "\nstart_id: " << model_param_.start_id << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_ - << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy - << "\ngroup_size: " << group_size_ << "\nexpert_num: " << moe_param_.expert_num + << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy << "\ngroup_size: " + << model_param_.group_size + // << "\nexpert_num: " << moe_param_.expert_num << "\nexpert_per_token: " << moe_param_.experts_per_token << "\nmoe_method: " << moe_param_.method << std::endl; return ss.str(); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 19a143e721..a6c1b862ac 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -91,9 +91,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { ft::EngineParam engine_param_; size_t tensor_para_size_; size_t pipeline_para_size_; - ft::WeightType weight_type_; - bool attn_bias_; - int group_size_; std::shared_ptr shared_state_; // Weights & engine instances for the ranks diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index bdcb9bfc46..88c299c3de 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -281,7 +281,8 @@ class Allocator: public IAllocator { pointer_mapping_.erase(address); } else { - TM_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address); + FT_CHECK_WITH_INFO(0, + fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); } } *ptr = nullptr; diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 2148fcc164..8311e6eb9e 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -483,5 +483,24 @@ void compareTwoTensor( bool 
is_16xx_series(const char* name); +class CudaDeviceGuard { +public: + CudaDeviceGuard(int device) + { + cudaGetDevice(&last_device_id_); + if (device != last_device_id_) { + cudaSetDevice(device); + } + } + + ~CudaDeviceGuard() + { + cudaSetDevice(last_device_id_); + } + +private: + int last_device_id_{-1}; +}; + /* ************************** end of common utils ************************** */ } // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index f8bfb8efe0..e9a79ea5a1 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -26,77 +26,71 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize) +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize) { - FT_CHECK_WITH_INFO(size >= ((size_t)0), "Ask deviceMalloc size " + std::to_string(size) + "< 0 is invalid."); - check_cuda_error(cudaMalloc((void**)(ptr), sizeof(T) * size)); + check_cuda_error(cudaMallocAsync((void**)(ptr), sizeof(T) * size, st)); if (is_random_initialize) { - cudaRandomUniform(*ptr, size); + cudaRandomUniform(*ptr, size, st); } } -template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(float** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(half** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_BF16 -template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_bfloat16** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif -template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(uint16_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(bool** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(char** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int8_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_FP8 -template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif template -void deviceMemSetZero(T* ptr, size_t size) -{ - check_cuda_error(cudaMemset(static_cast(ptr), 0, sizeof(T) * size)); -} - -template void deviceMemSetZero(float* ptr, size_t size); -template void deviceMemSetZero(half* ptr, size_t size); -template void deviceMemSetZero(int* ptr, size_t size); -template void deviceMemSetZero(uint32_t* ptr, size_t size); -template void deviceMemSetZero(bool* ptr, size_t size); -#ifdef ENABLE_FP8 -template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size); -#endif -#ifdef ENABLE_BF16 -template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size); -#endif - -template -void deviceFree(T*& ptr) +void deviceFree(T*& ptr, cudaStream_t 
st) { if (ptr != NULL) { - check_cuda_error(cudaFree(ptr)); + check_cuda_error(cudaFreeAsync(ptr, st)); ptr = NULL; } } -template void deviceFree(float*& ptr); -template void deviceFree(half*& ptr); +template void deviceFree(float*& ptr, cudaStream_t); +template void deviceFree(half*& ptr, cudaStream_t); #ifdef ENABLE_BF16 -template void deviceFree(__nv_bfloat16*& ptr); +template void deviceFree(__nv_bfloat16*& ptr, cudaStream_t); #endif -template void deviceFree(unsigned short*& ptr); -template void deviceFree(int*& ptr); -template void deviceFree(bool*& ptr); -template void deviceFree(char*& ptr); -template void deviceFree(int8_t*& ptr); +template void deviceFree(unsigned short*& ptr, cudaStream_t); +template void deviceFree(int*& ptr, cudaStream_t); +template void deviceFree(bool*& ptr, cudaStream_t); +template void deviceFree(char*& ptr, cudaStream_t); +template void deviceFree(int8_t*& ptr, cudaStream_t); +template void deviceFree(void*& ptr, cudaStream_t); #ifdef ENABLE_FP8 -template void deviceFree(__nv_fp8_e4m3*& ptr); +template void deviceFree(__nv_fp8_e4m3*& ptr, cudaStream_t); #endif +namespace { + +template +__global__ void fill_kernel(T* devptr, size_t size, T value) +{ + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + for (size_t i = idx; i < size; i += blockDim.x * gridDim.x) { + devptr[i] = value; + } +} + +} // namespace + template void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream) { - T* arr = new T[size]; - std::fill(arr, arr + size, value); - check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream)); - delete[] arr; + constexpr int threads = 512; + const int blocks = (size + threads - 1) / threads; + fill_kernel<<>>(devptr, size, value); } template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream); @@ -280,23 +274,23 @@ __global__ void cuda_random_uniform_kernel(char* buffer, const size_t size } template -void cudaRandomUniform(T* buffer, const size_t size) +void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t st) { static int seq_offset = 0; - cuda_random_uniform_kernel<<<256, 256>>>(buffer, size, seq_offset); + cuda_random_uniform_kernel<<<256, 256, 0, st>>>(buffer, size, seq_offset); seq_offset += 256 * 256; } -template void cudaRandomUniform(float* buffer, const size_t size); -template void cudaRandomUniform(half* buffer, const size_t size); +template void cudaRandomUniform(float* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(half* buffer, const size_t size, cudaStream_t); #ifdef ENABLE_BF16 -template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size); +template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size, cudaStream_t); #endif -template void cudaRandomUniform(int* buffer, const size_t size); -template void cudaRandomUniform(bool* buffer, const size_t size); -template void cudaRandomUniform(char* buffer, const size_t size); +template void cudaRandomUniform(int* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(bool* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(char* buffer, const size_t size, cudaStream_t); #ifdef ENABLE_FP8 -template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size); +template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size, cudaStream_t); #endif // loads data from binary file. If it succeeds, returns a non-empty vector. 
If loading fails or @@ -366,10 +360,10 @@ int loadWeightFromBinFunc(T* ptr, std::vector shape, std::string filenam } else { T_IN* ptr_2 = nullptr; - deviceMalloc(&ptr_2, host_array.size(), false); + deviceMalloc(&ptr_2, host_array.size(), nullptr, false); cudaH2Dcpy(ptr_2, host_array.data(), host_array.size()); invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size()); - deviceFree(ptr_2); + deviceFree(ptr_2, nullptr); } return 0; } diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h index bb7a4f9c03..03a0ef7b33 100644 --- a/src/turbomind/utils/memory_utils.h +++ b/src/turbomind/utils/memory_utils.h @@ -23,16 +23,13 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true); +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize = false); template -void deviceMemSetZero(T* ptr, size_t size); +void deviceFree(T*& ptr, cudaStream_t st); template -void deviceFree(T*& ptr); - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0); +void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = {}); template void cudaD2Hcpy(T* tgt, const T* src, const size_t size); @@ -44,10 +41,10 @@ template void cudaD2Dcpy(T* tgt, const T* src, const size_t size); template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = NULL); +void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = {}); template -void cudaRandomUniform(T* buffer, const size_t size); +void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t stream = {}); template int loadWeightFromBin(T* ptr,