From e0b221e2a06717a7d3451009aff14609b765a36e Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Thu, 7 Nov 2024 04:58:48 +0000
Subject: [PATCH 01/21] add qwen2-moe

---
 lmdeploy/turbomind/deploy/config.py           |    2 +
 lmdeploy/turbomind/deploy/module.py           |   19 +-
 .../turbomind/deploy/source_model/mixtral.py  |    2 +
 .../turbomind/deploy/source_model/qwen.py     |   64 +
 lmdeploy/turbomind/supported_models.py        |    1 +
 src/turbomind/kernels/gemm/moe_utils_v2.cu    | 1623 +++++++++++++++--
 src/turbomind/kernels/gemm/moe_utils_v2.h     |    4 +-
 .../kernels/gemm/test/test_moe_utils.cu       |  103 +-
 src/turbomind/kernels/gemm/test/testbed.h     |    2 +
 .../models/llama/LlamaDecoderLayerWeight.cc   |   52 +-
 src/turbomind/models/llama/LlamaDenseWeight.h |   16 +
 src/turbomind/models/llama/llama_params.h     |    8 +-
 src/turbomind/models/llama/moe_ffn_layer.cc   |   78 +-
 src/turbomind/models/llama/moe_ffn_layer.h    |    6 +-
 src/turbomind/models/llama/unified_decoder.cc |   21 +-
 .../triton_backend/llama/LlamaTritonModel.cc  |    2 +
 16 files changed, 1776 insertions(+), 227 deletions(-)

diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 7e8ebf7b4..a535b0d4c 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -50,6 +50,8 @@ class ModelConfig:
     expert_num: int = 0
     expert_inter_size: int = 0
     experts_per_token: int = 0
+    moe_shared_gate: bool = False
+    moe_norm_topk: bool = False
 
     def verify(self):
         invalid = {}
diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py
index a9f738537..8e20946b1 100644
--- a/lmdeploy/turbomind/deploy/module.py
+++ b/lmdeploy/turbomind/deploy/module.py
@@ -140,14 +140,18 @@ class MoeFfn(Ffn):
     requires:
         r.moe_ffn_expert(e, i, kind)
         r.moe_ffn_gate(i)
+        r.moe_ffn_shared_gate(i)
     """
 
     _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}'
-    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}'
+    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight'
+    _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight'
 
     def __init__(self, model: BaseOutputModel):
         super().__init__(model)
         self.expert_num = model.model_config.expert_num
+        self.inter_size = model.model_config.expert_inter_size
+        self.shared_gate = model.model_config.moe_shared_gate
 
     def apply(self, i: int, r: BaseReader):
         for p in get_params(r.moe_ffn_expert()):
@@ -157,7 +161,11 @@ def apply(self, i: int, r: BaseReader):
                                       i)
 
         gate = transpose(r.moe_ffn_gate(i))
-        self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight'))
+        self.model.save_split(gate, self._moe_ffn_gate.format(i))
+
+        if self.shared_gate:
+            shared_gate = transpose(r.moe_ffn_shared_gate(i))
+            self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i))
 
 
 class Attn(Module):
@@ -248,8 +256,11 @@ class Transformer:
 
     def __init__(self, model: BaseOutputModel):
         self.model = model
-        ffn = MoeFfn if model.model_config.expert_num else Ffn
-        modules = [Attn, LayerNorm, ffn]
+        modules = [Attn, LayerNorm]
+        if model.model_config.inter_size:
+            modules.append(Ffn)
+        if model.model_config.expert_num:
+            modules.append(MoeFfn)
         self.modules = [c(model) for c in modules]
         self.misc = Misc(model)
 
diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py
index 102ede29f..ff9df2d40 100644
--- a/lmdeploy/turbomind/deploy/source_model/mixtral.py
+++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py
@@ -33,4 +33,6 @@ def model_info(self):
         info['expert_num'] = cfg['num_local_experts']
         info['expert_inter_size'] = cfg['intermediate_size']
info['experts_per_token'] = cfg['num_experts_per_tok'] + info['moe_norm_topk'] = True + info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 0ec0586a3..071ade122 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -120,3 +120,67 @@ def model_info(self): cfg = super().model_info() cfg['attn_bias'] = 1 return cfg + + + +class Qwen2MoeReader(LlamaReader): + + ffn_pattern = r'shared_expert\.' + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_gate(self, i): + return self.params.get( + f'model.layers.{i}.mlp.gate.weight') + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(self.ffn_pattern) + result = [] + for key in ['gate', 'down', 'up']: + tensor = self.params[ + f'model.layers.{i}.mlp.shared_expert.{key}_proj.{kind}'] + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_shared_gate(self, i): + return self.params.get( + f'model.layers.{i}.mlp.shared_expert_gate.weight' + ) + +@INPUT_MODELS.register_module(name='qwen2-moe') +class Qwen2MoeModel(LlamaModel): + + Reader = Qwen2MoeReader + + def tokenizer_info(self): + """ + https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_config.json + """ # noqa: E501 + n_words = 152064 + bos_id = 151643 + eos_id = 151645 + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + info['expert_num'] = cfg['num_experts'] + info['expert_inter_size'] = cfg['moe_intermediate_size'] + info['experts_per_token'] = cfg['num_experts_per_tok'] + info['inter_size'] = cfg['shared_expert_intermediate_size'] + info['moe_shared_gate'] = True + info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['attn_bias'] = 1 + return info diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 8a1f5e731..18c15481e 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -20,6 +20,7 @@ QWenLMHeadModel='qwen', # Qwen2 Qwen2ForCausalLM='qwen2', + Qwen2MoeForCausalLM='qwen2-moe', # mistral MistralForCausalLM='llama', # llava diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index acf635585..42ff84a3a 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -6,8 +6,13 @@ #include #include +#include +#include #include #include +#include +#include +#include #include #include "src/turbomind/kernels/core/array_ops.h" @@ -19,7 +24,7 @@ namespace turbomind { template __global__ void MoeGateKernel_V2(float* scales, // [e,n] - int* masks, // [E,n], padded + int8_t* masks, // [E,n], padded int* accum, // [E,tiles] const float* logits, // [E,n] int log_tile, @@ -88,6 +93,8 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] const int lowbit = (mask & -mask); const int e = 31 - __clz(lowbit); + // printf("e = %d, ti = %d, idx = %d\n", e, ti, i); + masks[e * tokens_padded + ti] = i; atomicAdd(&shared_accum[e][ti >> log_tile], 1); top_val[i] = logits[ti * experts + e]; @@ -120,11 
+127,11 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] } } -template -__global__ void MoeScanKernel_V2(int* f2n, // [e*n] +template +__global__ void MoeScanKernel_v2(int* f2n, // [e*n] int* en2f, // [e,n] int* offsets, // [E+1] - int* masks, // [E,n], padded + Mask* masks, // [E,n], padded const int* accum, // [E,tiles] int log_tile, int tiles, @@ -142,13 +149,15 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] constexpr int vec_size = kMoeGateVecSize; - using Vec = Array; + using Vec = Array; const int tile_id = blockIdx.x; const int ei = blockIdx.y; - const int global_tile_id = ei * tiles + tile_id; + const int global_tile_id = ei * tiles + tile_id; + const bool is_valid = global_tile_id <= experts * tiles; +#if 0 int vacc[4]{}; { int idx = threadIdx.x; @@ -162,6 +171,18 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); +#else + + int vacc = 0; + for (int i = threadIdx.x; i < global_tile_id; i += block_dim) { + if (is_valid && i < global_tile_id) { + vacc += accum[i]; + } + } + + int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); + +#endif __shared__ int shared_offset; @@ -200,7 +221,7 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] const bool pred = vi < tile_vec_end; Vec data; - fill(data, -1); + fill(data, Mask{-1}); if (pred) { Ldg(data, mask_ptr[vi].data()); } @@ -231,137 +252,1504 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } } -void invokeMoeGate_V2(int* f2n, // [e*n] -> n - int* en2f, // [e,n] -> n*e - int* offsets, // [E+1] - float* scales, // [e,n] - int* masks, // [E,n] - int* accum, // [E] - const float* logits, // [e,n] - int tokens, // n - int tokens_padded, // round_up(n, 4) - int experts, // E - int experts_per_token, - cudaStream_t st) +template +__global__ void MoeGateKernel_v3(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) { - constexpr int base_log_tile = 9; + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - int log_tile = base_log_tile; - while (((tokens_padded + (1 << log_tile) - 1) >> log_tile) > kMoeGateMaxTiles) { - ++log_tile; + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + __shared__ int shared_accum[max_expert_num][max_tiles]; + + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + shared_accum[e][t] = 0; + } } - const int tiles = ceil_div(tokens_padded, 1 << log_tile); - // std::cout << log_tile << " " << tiles << "\n"; + __syncthreads(); - { - constexpr int threads = 128; - const int blocks = ceil_div(tokens, threads); + float data[items_per_thread]; - auto invoke = [&](auto e) { - static constexpr int top_k = decltype(e)::value; - MoeGateKernel_V2<<>>( // - scales, - masks, - accum, - logits, - log_tile, - 
tiles, - tokens, - tokens_padded, - experts); - }; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + const int e = threads_per_token * i + ei; + if (e < expert_num && ti < token_num) { + data[i] = __ldg(logits + ti * expert_num + e); + + if (ti == 39505) { + printf("%f %d\n", data[i], e); + } + } + } + + unsigned mask = (unsigned)-1; + float max_logit; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + } + }; - switch (experts_per_token) { - case 2: - invoke(std::integral_constant{}); - break; - // case 4: - // invoke(std::integral_constant{}); - // break; - default: - std::cerr << __FILE__ << ":" << __LINE__ << " Not implemented. " << std::endl; - std::abort(); + run(0); + for (int k = 1; k < max_top_k; ++k) { + run(k); + } + + mask = ~mask; + + float sum_prob{}; + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (norm_top_k == false || (mask & bit)) { + data[i] = expf(data[i] - max_logit); + sum_prob += data[i]; } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } - // return; + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } - { - // Check: tiles * experts <= threads + sum_prob = fdividef(1.f, sum_prob); - constexpr int threads = (1 << base_log_tile) / kMoeGateVecSize; - const dim3 blocks(tiles, experts + 1); - MoeScanKernel_V2<<>>(f2n, // - en2f, - offsets, - masks, - accum, - log_tile, - tiles, - tokens, - tokens_padded, - experts); + const unsigned group_mask = ((1U << (unsigned)threads_per_token) - 1U) << (unsigned)warp_ti_offset; + + // 1111 1111 << 24 + + const unsigned lanemask_lt = ((1U << (unsigned)ei) - 1U) << (unsigned)warp_ti_offset; + + // 1000 0000 + // 0111 1111 0000 0000 0000 0000 0000 0000 + + int offset = 0; + bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int valid = mask & bit; + const unsigned active = __ballot_sync((uint32_t)-1, valid); + if (valid) { + const int e = threads_per_token * i + ei; + const int idx = offset + __popc(active & lanemask_lt); + if (ti == 39505) { + printf("%d %d %f\n", e, idx, data[i] * sum_prob); + } + masks[e * token_num_padded + ti] = idx; + scales[idx * token_num + ti] = data[i] * sum_prob; + // atomic add in Smem + atomicAdd(&shared_accum[e][ti >> log_tile], 1); + } + offset += __popc(active & group_mask); + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } -} -template -__global__ void MoeGatherKernel(T* dst, // [e*n, d] - const T* src, // [ n, d] - const int* f2n, // [e*n] :: e*n -> n - int dims) -{ - using Vec = Array; - const int64_t bi = blockIdx.x; + __syncthreads(); - auto src_ptr = (const Vec*)src + dims * f2n[bi]; - 
auto dst_ptr = (/* */ Vec*)dst + dims * bi; - for (int i = threadIdx.x; i < dims; i += block_dim) { - Vec v; - Ldg(v, src_ptr[i].data()); - Store(dst_ptr[i].data(), v); + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[e][t]); + } } } -template -void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) +template +__global__ void MoeGateKernel_v4(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) { - constexpr int threads = 256; - constexpr int vec_size = 16 / sizeof(T); - MoeGatherKernel<<>>( // - dst, - src, - f2n, - dims / vec_size); + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + __shared__ int shared_accum[max_expert_num][max_tiles]; + + for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + shared_accum[e][t] = 0; + } + +#if 0 + logits += blockIdx.x * block_dim / threads_per_token * expert_num; + logits += threadIdx.x / WARP_SIZE * (WARP_SIZE / threads_per_token) * expert_num; + constexpr int tokens_per_warp = WARP_SIZE / threads_per_token; + __shared__ float smem_data[block_dim / WARP_SIZE][tokens_per_warp][max_expert_num + 1]; + PRAGMA_UNROLL + for (int i = 0; i < tokens_per_warp * max_expert_num; i += WARP_SIZE) { + smem_data[warp_id][(lane_id + i) / max_expert_num][(lane_id + i) % max_expert_num] = logits[lane_id + i]; + } + __syncthreads(); + float data[items_per_thread]; + for (int i = 0; i < items_per_thread; ++i) { + const int e = items_per_thread * ei + i; + data[i] = smem_data[warp_id][lane_id / threads_per_token][e]; + } +#else + __syncthreads(); + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = items_per_thread * ei + i; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } +#endif + + unsigned mask = (unsigned)-1; + float max_logit; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr 
(threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + } + }; + + run(0); + for (int k = 1; k < max_top_k; ++k) { + run(k); + } + + mask = ~mask; + + float sum_prob{}; + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (norm_top_k == false || (mask & bit)) { + data[i] = expf(data[i] - max_logit); + sum_prob += data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + const int count = __popc(mask); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; + + int idx{}; + WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); + + bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int valid = mask & bit; + const int e = items_per_thread * ei + i; + if (valid) { + masks[e * token_num_padded + ti] = idx; + scales[idx * token_num + ti] = data[i] * sum_prob; + // atomic add in Smem + atomicAdd(&shared_accum[e][ti >> log_tile], 1); + ++idx; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + __syncthreads(); + + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[e][t]); + } + } } -template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// -template -__global__ void MoeReduceKernel(T* dst, // [ n, d] - const T* src, // [e*n, d] - const float* scales, // [ e, n] - const int* en2f, // [ e, n] :: (e,n) -> e*n - int dims, - int tokens) +template +__global__ void MoeGateKernel_v5(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) { - using Vec = Array; + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - const int64_t ti = blockIdx.x; + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); - auto dst_ptr = (Vec*)dst + dims * ti; + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + __shared__ int shared_accum[max_expert_num][max_tiles]; + + for (int i = 
threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + shared_accum[e][t] = 0; + } + + __syncthreads(); + + constexpr int sort_k = std::min(items_per_thread, max_top_k); + + float data[items_per_thread]; - // Should be warp uniforms - const Vec* src_ptr[exp_k]; - float scale[exp_k]; PRAGMA_UNROLL - for (int e = 0; e < exp_k; ++e) { - src_ptr[e] = (const Vec*)src + dims * en2f[e * tokens + ti]; - scale[e] = scales ? scales[e * tokens + ti] : 1.f; + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = items_per_thread * ei + i; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } } - for (int i = threadIdx.x; i < dims; i += block_dim) { - Array accum{}; + float value[items_per_thread]; + int index[items_per_thread]; + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + value[i] = data[i]; + index[i] = items_per_thread * ei + i; + } + + // PRAGMA_UNROLL + // for (int i = 0; i < sort_k; ++i) { + // for (int j = items_per_thread - 1; j > i; --j) { + // if (value[j] > value[j - 1]) { + // auto tmp0 = value[j]; + // value[j] = value[j - 1]; + // value[j - 1] = tmp0; + // auto tmp1 = index[j]; + // index[j] = index[j - 1]; + // index[j - 1] = tmp1; + // } + // } + // } + + PRAGMA_UNROLL + for (int j = items_per_thread - 1; j > 0; --j) { + if (value[j] > value[j - 1]) { + auto tmp0 = value[j]; + value[j] = value[j - 1]; + value[j - 1] = tmp0; + auto tmp1 = index[j]; + index[j] = index[j - 1]; + index[j - 1] = tmp1; + } + } + + float max_logit; + + int count = 0; + + auto run = [&](int k) { + float max_val = -std::numeric_limits::infinity(); + + PRAGMA_UNROLL + for (int i = 0; i <= k; ++i) { + if (i == count) { + max_val = value[i]; + } + } + + if (k + 1 < sort_k) { + PRAGMA_UNROLL + for (int j = items_per_thread - 1; j > k + 1; --j) { + if (value[j] > value[j - 1]) { + auto tmp0 = value[j]; + value[j] = value[j - 1]; + value[j - 1] = tmp0; + auto tmp1 = index[j]; + index[j] = index[j - 1]; + index[j - 1] = tmp1; + } + } + } + + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + + if (k == 0) { + max_logit = g_max_val; + } + + if (ei == g_max_ei) { + count += 1; + } + }; + + PRAGMA_UNROLL + for (int k = 0; k < max_top_k; ++k) { + run(k); + } + + float sum_prob{}; + + PRAGMA_UNROLL + for (int i = 0; i < sort_k; ++i) { + if (i < count) { + value[i] = expf(value[i] - max_logit); + sum_prob += value[i]; + } + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + sum_prob = fdividef(1.f, sum_prob); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; + + int idx{}; + WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); + + PRAGMA_UNROLL + for (int i = 0; i < sort_k; ++i) { + if (ti < token_num && i < count) { + const int e = index[i]; + masks[e * token_num_padded + ti] = idx; + scales[idx * token_num + 
ti] = value[i] * sum_prob; + atomicAdd(&shared_accum[e][ti >> log_tile], 1); + ++idx; + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[e][t]); + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////// + +template +__global__ void MoeGateKernel_v6(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + __shared__ int shared_accum[max_expert_num][max_tiles]; + + for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + shared_accum[e][t] = 0; + } + + __syncthreads(); + + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = items_per_thread * ei + i; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + float max_logit; + + unsigned mask = (unsigned)-1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + { + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + } + + auto run = [&](int k) { + unsigned next_mask = mask - max_bit; + unsigned next_max_bit = 0; + float next_max_val = -std::numeric_limits::infinity(); + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((next_mask & bit) && data[i] > next_max_val) { + next_max_bit = bit; + next_max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + int g_max_ei = ei; + float g_max_val = max_val; + + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + + if (k == 0) { + max_logit = g_max_val; + } + + if (ei == g_max_ei) { + mask = next_mask; + max_bit = next_max_bit; + max_val = next_max_val; + } + }; + + run(0); + for (int k = 1; k < max_top_k; ++k) { + run(k); + } + + mask = ~mask; + + float sum_prob{}; + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; 
++i) { + if (norm_top_k == false || (mask & bit)) { + data[i] = expf(data[i] - max_logit); + sum_prob += data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + const int count = __popc(mask); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; + + int idx{}; + WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); + + bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int valid = mask & bit; + const int e = items_per_thread * ei + i; + if (valid) { + masks[e * token_num_padded + ti] = idx; + scales[idx * token_num + ti] = data[i] * sum_prob; + // atomic add in Smem + atomicAdd(&shared_accum[e][ti >> log_tile], 1); + ++idx; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + __syncthreads(); + + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[e][t]); + } + } +} + +template +__global__ void MoeGateKernel_v7(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + // +1 padding greatly reduced (-80%) bank conflicts + __shared__ int shared_accum[max_tiles][max_expert_num + 1]; + + for (int i = threadIdx.x; i < max_tiles * max_expert_num; i += block_dim) { + int e = i % max_expert_num; + int t = i / max_expert_num; + shared_accum[t][e] = 0; + } + +#if 0 + logits += blockIdx.x * block_dim / threads_per_token * expert_num; + logits += threadIdx.x / WARP_SIZE * (WARP_SIZE / threads_per_token) * expert_num; + constexpr int tokens_per_warp = WARP_SIZE / threads_per_token; + __shared__ float smem_data[block_dim / WARP_SIZE][tokens_per_warp][max_expert_num]; + PRAGMA_UNROLL + for (int i = 0; i < tokens_per_warp * max_expert_num; i += WARP_SIZE) { + smem_data[warp_id][(lane_id + i) / max_expert_num][(lane_id + i) % max_expert_num] = logits[lane_id + i]; + } + __syncthreads(); + float data[items_per_thread]; + for (int i = 0; i < items_per_thread; ++i) { + const int e = items_per_thread * ei + i; + data[i] = smem_data[warp_id][lane_id / threads_per_token][e]; + } +#else + __syncthreads(); + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + if (ti < token_num) { + 
PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = items_per_thread * ei + i; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } +#endif + + unsigned mask = (unsigned)-1; + float max_logit; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + } + }; + + run(0); + for (int k = 1; k < max_top_k; ++k) { + run(k); + } + + mask = ~mask; + + float sum_prob{}; + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (norm_top_k == false || (mask & bit)) { + data[i] = expf(data[i] - max_logit); + sum_prob += data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + const int count = __popc(mask); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; + + int idx{}; + WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); + + bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int valid = mask & bit; + const int e = items_per_thread * ei + i; + if (valid) { + masks[e * token_num_padded + ti] = idx; + // scales[idx * token_num + ti] = data[i] * sum_prob; + scales[ti * top_k + idx] = data[i] * sum_prob; + // atomic add in Smem + atomicAdd(&shared_accum[ti >> log_tile][e], 1); + ++idx; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + __syncthreads(); + + for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int t = i % max_tiles; + int e = i / max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[t][e]); + } + } +} + +template +__global__ void MoeGateKernel_v8(float* scales, // [e,n] + Mask* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k, + bool norm_topk) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + constexpr int tokens_per_cta = block_dim / threads_per_token; + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % 
threads_per_token; + + const int bti = threadIdx.x / threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + + const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + + float data[items_per_thread]; + int idxs[items_per_thread]; + +#if 0 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = threads_per_token * i + ei * access_size; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#elif 1 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + // idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + idxs[i] = ei * items_per_thread + i; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + // const int e = threads_per_token * i + ei * access_size; + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#else + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + constexpr int vecs_per_thread = items_per_thread / access_size; + + using Vec = Array; + constexpr int banks = 128 / sizeof(Vec); + constexpr int chunks = 4; // block_dim / WARP_SIZE; + + __shared__ union { + Vec shared_data[chunks][vecs_per_thread * WARP_SIZE / banks][banks + 1]; + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; + + __align__(16) Vec vecs[vecs_per_thread]; + + { + const int warp_end = min(warp_offset + WARP_SIZE / threads_per_token, token_num) * expert_num; + int p = warp_offset * expert_num + access_size * lane_id; + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + fill(vecs[i], -std::numeric_limits::infinity()); + // const int p = warp_offset * expert_num + access_size * (lane_id + i * WARP_SIZE); + if (p < warp_end) { + Ldg(vecs[i], &logits[p]); + } + p += access_size * WARP_SIZE; + } + } + + PRAGMA_UNROLL + for (int c = 0; c < block_dim / WARP_SIZE; c += chunks) { + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = i * WARP_SIZE + lane_id; + if (c <= warp_id && warp_id < c + chunks) { + Store(smem.shared_data[warp_id - c][p / banks][p % banks].data(), vecs[i]); + } + } + + __syncwarp(); + + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = lane_id * vecs_per_thread + i; + if (c <= warp_id && warp_id < c + chunks) { + Load(vecs[i], smem.shared_data[warp_id - c][p / 
banks][p % banks].data()); + } + } + + __syncthreads(); + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + idxs[i] = ei * items_per_thread + i; + } + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + (Array&)data[i * access_size] = vecs[i]; + } + +#endif + + constexpr float kLog2e = 1.4426950408889634074; + + unsigned mask = (unsigned)-1; + float max_logit; + + int count{}; + float sum_prob{}; + + const int warp_ti_offset = warp_ti * threads_per_token; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + if (k == 0) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] *= kLog2e; + } + } + + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + ++count; + } + }; + + run(0); + + for (int k = 1; k < top_k; ++k) { + run(k); + } + + mask = ~mask; + + int used[items_per_thread]; + { + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + used[i] = (mask & bit) > 0; + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + } + + // unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (!norm_topk || used[i]) { + data[i] = exp2f(data[i] - max_logit); + sum_prob += data[i]; + } + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + // float sum_v = 0; + // PRAGMA_UNROLL + // for (int i = 0; i < items_per_thread; ++i) { + // // data[i] = exp2f(data[i] - max_logit); + // // sum_prob += data[i]; + // float v = data[i] * sum_prob; + // sum_v += v; + // if (ti == 0) { + // printf("%f\n", v); + // } + // } + // PRAGMA_UNROLL + // for (int m = threads_per_token / 2; m >= 1; m /= 2) { + // sum_v += __shfl_xor_sync((uint32_t)-1, sum_v, m); + // } + // if (ti == 0) { + // printf("sum=%f\n", sum_v); + // } + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[tokens_per_cta]; + + int idx{}; + WarpScan{temp_storage[bti]}.ExclusiveSum(count, idx); + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (used[i]) { + smem.shared_exp_id[idx][bti] = idxs[i]; + smem.shared_scales[idx][bti] = data[i] * sum_prob; + ++idx; + } + } + + PRAGMA_UNROLL + for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + smem.shared_accum[t][e] = 0; + } + + __syncthreads(); + + constexpr int k_per_thread = cdiv(max_top_k, threads_per_token); + + const int bti2 = threadIdx.x % tokens_per_cta; + const int ei2 = threadIdx.x / tokens_per_cta; + const int ti2 = blockIdx.x * tokens_per_cta + bti2; + + PRAGMA_UNROLL + for (int i = 0; i < k_per_thread; ++i) { + const int idx = ei2 * k_per_thread + i; + const 
int expert_id = smem.shared_exp_id[idx][bti2]; + const float scale = smem.shared_scales[idx][bti2]; + + if (ti2 < token_num && idx < top_k) { + masks[expert_id * token_num_padded + ti2] = idx; + scales[idx * token_num + ti2] = scale; + atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); + + // printf("%d %d %f\n", idx, expert_id, scale); + } + } + + __syncthreads(); + + for (int i = 0; i < max_expert_num * max_tiles; i += block_dim) { + int t = (threadIdx.x + i) % max_tiles; + int e = (threadIdx.x + i) / max_tiles; + if (e < expert_num && t < tiles) { + atomicAdd(accum + e * tiles + t, smem.shared_accum[t][e]); + } + } +} + +#ifdef USE_WARPSORT + +struct Greater { + template + __device__ bool operator()(T a, T b) const noexcept + { + return a > b; + } +}; + +template +__global__ void MoeGateKernel_v9(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + // +1 padding greatly reduced (-80%) bank conflicts + __shared__ int shared_accum[max_tiles][max_expert_num + 1]; + + PRAGMA_UNROLL + for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + shared_accum[t][e] = 0; + } + + __syncthreads(); + + float data[items_per_thread]; + int idxs[items_per_thread]; + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + idxs[i] = threads_per_token * (i / alignment * alignment) + i % alignment + ei * alignment; + } + + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = threads_per_token * i + ei * alignment; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + constexpr float kLog2e = 1.4426950408889634074; + + using MergeSort = cub::WarpMergeSort; + + union Smem { + typename MergeSort::TempStorage temp[block_dim / threads_per_token]; + // int2 sorted[max_top_k][block_dim / threads_per_token + 1]; + int2 sorted[block_dim / threads_per_token][max_top_k + 1]; + }; + + __shared__ Smem smem; + + MergeSort{smem.temp[threadIdx.x / threads_per_token]}.Sort(data, idxs, Greater{}); + + __syncthreads(); + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int k = threads_per_token * ei + i; // blocked + if (k < max_top_k) { + // smem.sorted[k][threadIdx.x / threads_per_token] = int2{float_as_int(data[i]), idxs[i]}; + smem.sorted[threadIdx.x / threads_per_token][k] = int2{float_as_int(data[i]), idxs[i]}; + } + } + + __syncthreads(); + + constexpr int top_k_per_thread = cdiv(max_top_k, threads_per_token); + + int es[top_k_per_thread]; + + PRAGMA_UNROLL + for (int i = 0; i < top_k_per_thread; ++i) 
{ + const int k = ei * top_k_per_thread + i; + if (k < max_top_k) { + // const int2 tmp = smem.sorted[k][threadIdx.x / threads_per_token]; + const int2 tmp = smem.sorted[threadIdx.x / threads_per_token][k]; + const int e = tmp.y; + masks[e * token_num_padded + ti] = k; + scales[ti * top_k + k] = int_as_float(tmp.x); + es[i] = e; + // atomicAdd(&shared_accum[ti >> log_tile][e], 1); + } + } + + PRAGMA_UNROLL + for (int i = 0; i < top_k_per_thread; ++i) { + const int k = ei * top_k_per_thread + i; + if (k < max_top_k) { + atomicAdd(&shared_accum[ti >> log_tile][es[i]], 1); + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int t = i % max_tiles; + int e = i / max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[t][e]); + } + } +} + +#endif + +template +inline constexpr std::integral_constant _Int{}; + +void invokeMoeGate_V2(int* f2n, // [e*n] -> n + int* en2f, // [e,n] -> n*e + int* offsets, // [E+1] + float* scales, // [e,n] + void* masks, // [E,n] + int* accum, // [E] + const float* logits, // [e,n] + int tokens, // n + int tokens_padded, // round_up(n, 4) + int experts, // E + int experts_per_token, + bool norm_topk, + cudaStream_t st) +{ + constexpr int base_log_tile = 9; + + int log_tile = base_log_tile; + while (((tokens_padded + (1 << log_tile) - 1) >> log_tile) > kMoeGateMaxTiles) { + ++log_tile; + } + const int tiles = ceil_div(tokens_padded, 1 << log_tile); + + // std::cout << log_tile << " " << tiles << "\n"; + + { + + auto invoke = [&](auto e) { + constexpr int threads = 128; + const int blocks = ceil_div(tokens * 2, threads); + static constexpr int top_k = decltype(e)::value; + MoeGateKernel_V2<<>>( // + scales, + (int8_t*)masks, + accum, + logits, + log_tile, + tiles, + tokens, + tokens_padded, + experts); + }; + + auto invoke2 = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 128; + const int blocks = ceil_div(tokens, threads / thrs_per_tok); + + cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); + + MoeGateKernel_v8 + <<>>( // + scales, + (int8_t*)masks, + accum, + logits, + log_tile, + tiles, + tokens, + tokens_padded, + experts, + experts_per_token, + norm_topk); + }; + + invoke2(_Int<64>, _Int<8>, _Int<16>); + + // invoke(_Int<2>); + } + + { + constexpr int threads = (1 << base_log_tile) / kMoeGateVecSize; + const dim3 blocks(tiles, experts + 1); + + MoeScanKernel_v2<<>>(f2n, // + en2f, + offsets, + (int8_t*)masks, + accum, + log_tile, + tiles, + tokens, + tokens_padded, + experts); + } +} + +template +__global__ void MoeGatherKernel(T* dst, // [e*n, d] + const T* src, // [ n, d] + const int* f2n, // [e*n] :: e*n -> n + int dims) +{ + using Vec = Array; + const int64_t bi = blockIdx.x; + + auto src_ptr = (const Vec*)src + dims * f2n[bi]; + auto dst_ptr = (/* */ Vec*)dst + dims * bi; + for (int i = threadIdx.x; i < dims; i += block_dim) { + Vec v; + Ldg(v, src_ptr[i].data()); + Store(dst_ptr[i].data(), v); + } +} + +template +void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) +{ + constexpr int threads = 256; + constexpr int vec_size = 16 / sizeof(T); + MoeGatherKernel<<>>( // + dst, + src, + f2n, + dims / vec_size); +} + +template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); + +template 
+__global__ void MoeReduceKernel(T* dst, // [ n, d] + const T* src, // [e*n, d] + const float* scales, // [ e, n] + const int* en2f, // [ e, n] :: (e,n) -> e*n + const float* dst_scales, // [n] + int dims, + int tokens) +{ + using Vec = Array; + + const int64_t ti = blockIdx.x; + + auto dst_ptr = (Vec*)dst + dims * ti; + + float dst_scale = 0; + if (dst_scales) { + dst_scale = dst_scales[ti]; + dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); + } + + // Should be warp uniforms + const Vec* src_ptr[exp_k]; + float scale[exp_k]; + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + src_ptr[e] = (const Vec*)src + dims * en2f[e * tokens + ti]; + scale[e] = scales ? scales[e * tokens + ti] : 1.f; + } + + for (int i = threadIdx.x; i < dims; i += block_dim) { + Array accum{}; + if (dst_scales) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = cast(v) * dst_scale; + } PRAGMA_UNROLL for (int e = 0; e < exp_k; ++e) { Vec v; @@ -379,6 +1767,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, @@ -395,6 +1784,7 @@ void invokeMoeReduce(T* dst, src, scales, en2f, + dst_scales, dims / vec_size, tokens); }; @@ -404,19 +1794,22 @@ void invokeMoeReduce(T* dst, return invoke(std::integral_constant{}); case 2: return invoke(std::integral_constant{}); - // case 4: - // return invoke(std::integral_constant{}); - // case 6: - // return invoke(std::integral_constant{}); + case 4: + return invoke(std::integral_constant{}); + case 6: + return invoke(std::integral_constant{}); + case 8: + return invoke(std::integral_constant{}); default: fprintf(stderr, "Unsupported experts_per_token %d\n", experts_per_token); std::abort(); } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, int, int, int, cudaStream_t); +template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); #ifdef ENABLE_BF16 -template void invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, int, int, int, cudaStream_t); +template void +invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 334e2de27..0e4c36af0 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -14,13 +14,14 @@ void invokeMoeGate_V2(int* f2n, int* en2f, int* offsets, float* scales, - int* masks, + void* masks, int* accum, const float* logits, int tokens, int tokens_padded, int experts, int exp_per_tok, + bool norm_topk, cudaStream_t st); template @@ -49,6 +50,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index a31116219..47e3bfdb1 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -26,6 +26,25 @@ void print_vecs(const T* data, int m, int k, std::string msg, int width = 4) } } +template +void diff_vecs(const T* data, const T* refs, int m, int k, std::string msg) +{ + if (!msg.empty()) { + std::cout << msg << ": [" << m << ", " << k << "]\n"; + } 
+ for (int mm = 0; mm < m; ++mm) { + std::cout << "m=" << mm << ": "; + for (int kk = 0; kk < k; ++kk) { + const auto& x = data[mm * k + kk]; + const auto& y = refs[mm * k + kk]; + if (x != y) { + std::cout << kk << "(" << x << ", " << y << ") "; + } + } + std::cout << "\n"; + } +} + #if 0 void func() { @@ -190,7 +209,7 @@ void moe_gate_ref(int tokens, } } -void mask2eids(const universal_vector& masks, universal_vector& eids, int tokens, int expert_num) +void mask2eids(universal_vector& masks, universal_vector& eids, int tokens, int expert_num) { const int tokens_padded = masks.size() / expert_num; // std::cout << eids.size() << std::endl; @@ -228,13 +247,13 @@ bool test_moe_gate(int tokens, // const int tokens_padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; // const int max_coords = get_max_coords(tokens, expert_num, experts_per_token, tiling); - universal_vector offsets(expert_num + 1); - universal_vector accum(expert_num * kMoeGateMaxTiles); - universal_vector masks(expert_num * tokens_padded); - universal_vector eids(experts_per_token * tokens); - universal_vector f2n(experts_per_token * tokens); - universal_vector en2f(experts_per_token * tokens); - universal_vector scales(experts_per_token * tokens); + universal_vector offsets(expert_num + 1); + universal_vector accum(expert_num * kMoeGateMaxTiles); + universal_vector masks(expert_num * tokens_padded); + universal_vector eids(experts_per_token * tokens); + universal_vector f2n(experts_per_token * tokens); + universal_vector en2f(experts_per_token * tokens); + universal_vector scales(experts_per_token * tokens); // universal_vector coords(max_coords); // thrust::fill(coords.begin(), coords.end(), int2{-1, 0}); @@ -246,8 +265,16 @@ bool test_moe_gate(int tokens, // moe_gate_ref(tokens, expert_num, experts_per_token, logits, offsets_ref, eids_ref, f2n_ref, en2f_ref, scales_ref); - for (int i = 0; i < 10; ++i) { + cudaMemPrefetchAsync(f2n.data().get(), sizeof(int) * f2n.size(), 0); + cudaMemPrefetchAsync(en2f.data().get(), sizeof(int) * en2f.size(), 0); + cudaMemPrefetchAsync(offsets.data().get(), sizeof(int) * offsets.size(), 0); + cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); + cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + + for (int i = 0; i < 1; ++i) { + gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); + cudaMemset(masks.data().get(), -1, sizeof(int8_t) * masks.size()); invokeMoeGate_V2(f2n.data().get(), en2f.data().get(), offsets.data().get(), @@ -259,6 +286,7 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, + true, 0); } @@ -306,7 +334,10 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success || false) { + if (!success && 1) { + + diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); + print_vecs(offsets_ref.data().get(), 1, expert_num + 1, "offsets_ref"); print_vecs(offsets.data().get(), 1, expert_num + 1, "offsets"); @@ -322,32 +353,32 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); - print_vecs(accum.data().get(), expert_num, 1, "accum"); + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); - thrust::host_vector tile_offsets(tape.max_ctas); - std::cout << tape.max_ctas << std::endl; - 
cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); - cudaDeviceSynchronize(); - - std::cout << "coords:\n"; - int last = -1; - for (int i = 0; i < tape.max_ctas; ++i) { - auto& c = tile_offsets[i]; - if (last >= 0 && c.w != last) { - std::cout << "\n"; - } - if (c.w == -1) { - std::cout << i << "\n"; - break; - } - last = c.w; - std::stringstream ss; - ss << c.x << "," << c.y; - std::cout << std::setw(6) << ss.str(); - } - std::cout << "\n"; + // thrust::host_vector tile_offsets(tape.max_ctas); + // std::cout << tape.max_ctas << std::endl; + // cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); + // cudaDeviceSynchronize(); + + // std::cout << "coords:\n"; + // int last = -1; + // for (int i = 0; i < tape.max_ctas; ++i) { + // auto& c = tile_offsets[i]; + // if (last >= 0 && c.w != last) { + // std::cout << "\n"; + // } + // if (c.w == -1) { + // std::cout << i << "\n"; + // break; + // } + // last = c.w; + // std::stringstream ss; + // ss << c.x << "," << c.y; + // std::cout << std::setw(6) << ss.str(); + // } + // std::cout << "\n"; } return success; @@ -358,7 +389,11 @@ int main() gemm::Tape tape{}; constexpr Tiling tiling{14336, 128, {128, 128, 32}}; - test_moe_gate(8192, 8, 2, tape, tiling); + // test_moe_gate(32768 * 4, 60, 4, tape, tiling); + // test_moe_gate(32768, 64, 8, tape, tiling); + // test_moe_gate(8, 60, 4, tape, tiling); + + test_moe_gate(65536, 8, 2, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 6b1ec88f5..7a089fbdf 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -514,6 +514,7 @@ class Testbed { c_e_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, @@ -523,6 +524,7 @@ class Testbed { c_e_ref_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 2d68ef353..3b9bb1c11 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -137,6 +137,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, moe_param.inter_size, moe_param.expert_num, moe_param.method, + moe_param.shared_gate, tensor_para_size_, weight_type, group_size, @@ -349,18 +350,22 @@ void LlamaDecoderLayerWeight::mallocWeights() mallocWeights(self_attn_weights.qkv, attn_bias_); mallocWeights(self_attn_weights.output, attn_bias_); - if (moe_weights.experts.empty()) { + if (inter_size_) { mallocWeights(ffn_weights.gating, false); mallocWeights(ffn_weights.intermediate, false); mallocWeights(ffn_weights.output, false); } - else { + + if (!moe_weights.experts.empty()) { mallocWeights(moe_weights.gate, false); for (auto& e : moe_weights.experts) { mallocWeights(e.gating, false); mallocWeights(e.intermediate, false); mallocWeights(e.output, false); } + if (moe_weights.shared_gate.output_dims) { + mallocWeights(moe_weights.shared_gate, false); + } } } @@ -375,10 +380,25 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() freeWeights(self_attn_weights.qkv); freeWeights(self_attn_weights.output); - freeWeights(ffn_weights.fused_gating_intermediate); - 
freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); + if (inter_size_) { + freeWeights(ffn_weights.fused_gating_intermediate); + freeWeights(ffn_weights.gating); + freeWeights(ffn_weights.intermediate); + freeWeights(ffn_weights.output); + } + + if (!moe_weights.experts.empty()) { + freeWeights(moe_weights.gate); + for (auto& e : moe_weights.experts) { + freeWeights(e.fused_gating_intermediate); + freeWeights(e.gating); + freeWeights(e.intermediate); + freeWeights(e.output); + } + if (moe_weights.shared_gate.kernel) { + freeWeights(moe_weights.shared_gate); + } + } } template @@ -428,23 +448,30 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); - if (moe_weights.experts.empty()) { + if (inter_size_) { getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output); getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); } - else { + + if (!moe_weights.experts.empty()) { output.insert( concat(prefix, "moe_ffn.gate.weight"), Tensor{MEMORY_GPU, getTensorType(), {moe_weights.gate.kernel_size()}, moe_weights.gate.kernel}); auto& experts = moe_weights.experts; for (size_t i = 0; i < experts.size(); ++i) { const std::string name = "moe_ffn.experts." + std::to_string(i); - // std::cerr << "FUCK " << get_prefix(concat(name, "w1")) << "\n"; getWeightTensor(experts[i].gating, false, get_prefix(concat(name, "w1")), output); getWeightTensor(experts[i].intermediate, false, get_prefix(concat(name, "w3")), output); getWeightTensor(experts[i].output, false, get_prefix(concat(name, "w2")), output); } + if (moe_weights.shared_gate.kernel) { + output.insert(concat(prefix, "moe_ffn.shared_gate.weight"), + Tensor{MEMORY_GPU, + getTensorType(), + {moe_weights.shared_gate.kernel_size()}, + moe_weights.shared_gate.kernel}); + } } return output; @@ -681,10 +708,13 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud convert(ffn.output, is_fused_moe, workspace, size, is_16xx); }; - if (moe_weights.experts.empty()) { + if (inter_size_) { + std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } - else { + + if (!moe_weights.experts.empty()) { + std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 9a895243b..382d0dfc6 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -185,11 +185,15 @@ struct MoeFfnWeight { int inter_size, int expert_num, int method, + bool has_shared_gate, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + + printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (expert_num == 0) { return; } @@ -208,11 +212,23 @@ struct MoeFfnWeight { // inter size is divided by tp in `FfnWeight` e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; } + + if (has_shared_gate) { + shared_gate.input_dims = hidden_dim; + shared_gate.output_dims = 1; + shared_gate.type = get_default_weight_type(); + gate.group_size = group_size; + } + else { + shared_gate = {}; + } } LlamaDenseWeight 
gate; std::vector> experts; + LlamaDenseWeight shared_gate; + LlamaFfnWeight block; int method{}; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 1c039ca66..2ea63f041 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -31,9 +31,11 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; + int expert_num; + int experts_per_token; + int inter_size; + bool norm_topk; + bool shared_gate; }; struct AttentionParam { diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index def6b04ab..1ad76839d 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -30,6 +30,7 @@ void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); + alloc(&shared_scales_, tokens); return (char*)alloc.ptr() - (char*)base; }; @@ -69,7 +70,7 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama getCudaDataType(), hidden_dim_, &beta, - logits_, + logits, CUDA_R_32F, weight.output_dims, CUDA_R_32F, @@ -77,13 +78,13 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama } template -void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; AllocateBuffer(tokens, padded); - gate(logits_, inout, tokens, moe.gate); + gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); @@ -103,6 +104,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei padded, param_.expert_num, param_.experts_per_token, + param_.norm_topk, stream_); sync_check_cuda_error(); @@ -123,7 +125,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei if (param_.method == MoeParam::kNaive) { - dispatchMoeGather(inout_buf_, inout, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); + dispatchMoeGather(inout_buf_, input, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); sync_check_cuda_error(); check_cuda_error( @@ -155,28 +157,8 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei auto& block = moe.block; -#if 0 - FT_CHECK(!block.is_fused_silu); - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - inter_size_ * 2, - count, - hidden_dim_, - moe.experts[i].fused_gating_intermediate.kernel, - hidden_dim_, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2); - sync_check_cuda_error(); - } - } - auto mode = kCmpWrite; -#else linear_->forward_moe(inter_buf_, - {inout, (int)hidden_dim_}, + {input, (int)hidden_dim_}, f2n_, offsets_, tokens * param_.experts_per_token, @@ -185,7 +167,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // 
Compare(inter_buf_, // @@ -205,25 +186,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei sync_check_cuda_error(); } -#if 0 - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - hidden_dim_, - count, - inter_size_, - moe.experts[i].output.kernel, - inter_size_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_); - sync_check_cuda_error(); - } - } - auto mode1 = kCmpWrite; -#else linear_->forward_moe(inout_buf_, {inter_buf_, block.is_fused_silu ? (int)inter_size_ : (int)inter_size_ * 2}, nullptr, @@ -234,7 +196,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode1 = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_2_, // @@ -250,18 +211,29 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei // } } - invokeMoeReduce(inout, inout_buf_, scales_, en2f_, tokens, param_.experts_per_token, hidden_dim_, stream_); + if (moe.shared_gate.kernel) { + gate(shared_scales_, input, tokens, moe.shared_gate); + } +} + +template +void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +{ + invokeMoeReduce(output, + inout_buf_, + scales_, + en2f_, + moe.shared_gate.kernel ? shared_scales_ : nullptr, + tokens, + param_.experts_per_token, + hidden_dim_, + stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum(inout, inout, tokens * hidden_dim_, tensor_para_, stream_); + ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } - - // if (tensor_para_.rank_ == 0) { - // check_cuda_error(cudaStreamSynchronize(stream_)); - // std::abort(); - // } } template diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index ef65aaa46..0f1713f7b 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -51,7 +51,9 @@ class MoeFfnLayer { FreeBuffer(); } - void forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe); + void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); + + void reduce(T* output, int tokens, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); @@ -85,6 +87,8 @@ class MoeFfnLayer { int* en2f_{}; float* scales_{}; + float* shared_scales_{}; + int* accum_{}; int* offsets_{}; }; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 68392215f..28e8b5f64 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -26,9 +26,15 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, dtype_(getTensorType()) { - attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - ffn_layer_ = std::make_unique>(model, tp, ctx, true); - moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); + + if (moe.expert_num) { + moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + } + + if (model.inter_size) { + ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); } @@ -190,9 +196,10 @@ void 
UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con /// feed-forward network if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->forward(decoder_output, token_num, layer, weights->at(layer)->moe_weights); + moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - else { + + if (ffn_layer_) { int layer_id = layer; // int is needed TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; @@ -203,6 +210,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } + if (!weights->at(layer)->moe_weights.experts.empty()) { + moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + } + count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); const bool is_last_layer = layer == layer_num_ - 1; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 8db13652f..38552be0c 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -301,6 +301,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.expert_num = model_reader["expert_num"].as(0); moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); + moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); handleMissingParams(); From cea0524c74a678797bd61c26a5e367174dffa167 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 7 Nov 2024 07:32:53 +0000 Subject: [PATCH 02/21] eliminate `inter_size_` from ffn layer --- src/turbomind/models/llama/LlamaDenseWeight.h | 13 +++-- src/turbomind/models/llama/LlamaFfnLayer.cc | 47 ++++++++++--------- src/turbomind/models/llama/LlamaFfnLayer.h | 11 ++--- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 382d0dfc6..21c1c666c 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -145,24 +145,28 @@ struct LlamaFfnWeight { LlamaFfnWeight( size_t hidden_dim, size_t inter_size, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + inter_size /= tp; + + this->inter_size = inter_size; + gating.input_dims = hidden_dim; - gating.output_dims = inter_size / tp; + gating.output_dims = inter_size; gating.type = weight_type; gating.group_size = group_size; intermediate.input_dims = hidden_dim; - intermediate.output_dims = inter_size / tp; + intermediate.output_dims = inter_size; intermediate.type = weight_type; intermediate.group_size = group_size; fused_gating_intermediate.input_dims = hidden_dim; - fused_gating_intermediate.output_dims = inter_size / tp * 2; + fused_gating_intermediate.output_dims = inter_size * 2; fused_gating_intermediate.type = weight_type; fused_gating_intermediate.group_size = group_size; is_fused_silu = fuse_silu_act; - output.input_dims = inter_size / tp; + output.input_dims = inter_size; output.output_dims = hidden_dim; output.type = weight_type; output.group_size = group_size; @@ -173,6 +177,7 @@ struct LlamaFfnWeight { LlamaDenseWeight 
output; LlamaDenseWeight fused_gating_intermediate; + int inter_size{}; bool is_fused_silu{}; }; diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index f9ee0c4ad..8cce20720 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -28,10 +28,11 @@ namespace turbomind { template void LlamaFfnLayer::allocateBuffer(size_t token_num, + int inter_size, const LlamaDenseWeight* gating, const LlamaDenseWeight* inter) { - const size_t sz = token_num * inter_size_; + const size_t sz = token_num * inter_size; const size_t sz_gate = token_num * gating->lora.r; const size_t sz_inter = token_num * inter->lora.r; @@ -51,24 +52,24 @@ template void LlamaFfnLayer::freeBuffer() { if (is_allocate_buffer_) { - // allocator_->free((void**)&inter_buf_); allocator_->free((void**)&gating_buf_); is_allocate_buffer_ = false; } } template -void LlamaFfnLayer::activation(int token_num, bool is_chunked) +void LlamaFfnLayer::activation(int token_num, int inter_size, bool is_chunked) { NvtxScope scope("activation"); if (is_chunked) { + // gate & up are in the SAME buffer invokeGenericActivation_v2( - gating_buf_, gating_buf_ + inter_size_, inter_size_ * 2, token_num, inter_size_, stream_); + gating_buf_, gating_buf_ + inter_size, inter_size * 2, token_num, inter_size, stream_); sync_check_cuda_error(); } else { - invokeGenericActivation_v2( - gating_buf_, inter_buf_, inter_size_, token_num, inter_size_, stream_); + // gate & up are in separate buffers + invokeGenericActivation_v2(gating_buf_, inter_buf_, inter_size, token_num, inter_size, stream_); sync_check_cuda_error(); } } @@ -88,11 +89,11 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, NvtxScope scope("ffn"); - const size_t num_token = input_tensors->at("ffn_input").shape[0]; - const int layer_id = input_tensors->getVal("layer_id"); - // LOG(WARNING); + const size_t token_num = input_tensors->at("ffn_input").shape[0]; + const int layer_id = input_tensors->getVal("layer_id"); + const int inter_size = weights->inter_size; - allocateBuffer(num_token, &weights->gating, &weights->intermediate); + allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); @@ -103,50 +104,50 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const auto type = weights->is_fused_silu ? 
LlamaLinear<T>::kFusedSiluFfn : LlamaLinear<T>::kGemm;
 
-        linear_->forward(gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, type);
+        linear_->forward(gating_buf_, ffn_input_data, token_num, weights->fused_gating_intermediate, type);
         sync_check_cuda_error();
 
         if (!weights->is_fused_silu) {
-            activation(num_token, true);
+            activation(token_num, inter_size, true);
         }
 
-        count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3);
+        count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3);
     }
     else {
         {  // w1(x)
             NvtxScope scope("w1");
-            linear_->forward(gating_buf_, ffn_input_data, num_token, weights->gating, LlamaLinear<T>::kGemm, lora_mask);
+            linear_->forward(gating_buf_, ffn_input_data, token_num, weights->gating, LlamaLinear<T>::kGemm, lora_mask);
             sync_check_cuda_error();
         }
 
-        count_and_fix(gating_buf_, num_token * weights->gating.output_dims, Concat("w1", layer_id), 3);
+        count_and_fix(gating_buf_, token_num * weights->gating.output_dims, Concat("w1", layer_id), 3);
 
         {  // w3(x)
             NvtxScope scope("w3");
             linear_->forward(
-                inter_buf_, ffn_input_data, num_token, weights->intermediate, LlamaLinear<T>::kGemm, lora_mask);
+                inter_buf_, ffn_input_data, token_num, weights->intermediate, LlamaLinear<T>::kGemm, lora_mask);
             sync_check_cuda_error();
         }
 
-        count_and_fix(inter_buf_, num_token * weights->intermediate.output_dims, Concat("w3", layer_id), 3);
+        count_and_fix(inter_buf_, token_num * weights->intermediate.output_dims, Concat("w3", layer_id), 3);
 
         // silu(w1(x)) * w3(x)
-        activation(num_token, false);
+        activation(token_num, inter_size, false);
 
-        count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("act", layer_id), 3);
+        count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("act", layer_id), 3);
     }
 
     {  // w2(x)
         NvtxScope scope("w2");
-        const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? inter_size_ * 2 : 0;
+        const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? 
inter_size * 2 : 0; linear_->forward( - ffn_output_data, {gating_buf_, pitch}, num_token, weights->output, LlamaLinear::kGemm, lora_mask); + ffn_output_data, {gating_buf_, pitch}, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(ffn_output_data, num_token * weights->output.output_dims, Concat("w2", layer_id), 3); + count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); if (all_reduce_ && tensor_para_.world_size_ > 1) { NcclGuard nccl_guard(tensor_para_, stream_); - ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_); + ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 75ced5f9a..2daca2cc9 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -19,12 +19,11 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" -#include "src/turbomind/utils/custom_ar_comm.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/nccl_utils.h" -#include namespace turbomind { @@ -32,7 +31,6 @@ template class LlamaFfnLayer { public: LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): - inter_size_(model.inter_size / tp.world_size_), hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), @@ -50,13 +48,12 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); void freeBuffer(); - void activation(int token_num, bool is_chunked); + void activation(int token_num, int inter_size, bool is_chunked); - const size_t inter_size_; const size_t hidden_units_; const NcclParam tensor_para_; cudaStream_t const stream_; From b7d050a044be2ff4eded41e440e1f54494c9a816 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 7 Nov 2024 08:02:29 +0000 Subject: [PATCH 03/21] clean up --- lmdeploy/turbomind/deploy/module.py | 2 +- src/turbomind/kernels/gemm/moe_utils_v2.cu | 1101 +---------------- .../models/llama/LlamaDecoderLayerWeight.cc | 4 +- src/turbomind/models/llama/LlamaDenseWeight.h | 2 +- 4 files changed, 41 insertions(+), 1068 deletions(-) diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 8e20946b1..2d3575c37 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -165,7 +165,7 @@ def apply(self, i: int, r: BaseReader): if self.shared_gate: shared_gate = transpose(r.moe_ffn_shared_gate(i)) - print(shared_gate) + # print(shared_gate) self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i)) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 42ff84a3a..dc2c21d7c 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -252,880 +252,6 @@ __global__ void MoeScanKernel_v2(int* f2n, // [e*n] } } -template -__global__ 
void MoeGateKernel_v3(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - __shared__ int shared_accum[max_expert_num][max_tiles]; - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - shared_accum[e][t] = 0; - } - } - - __syncthreads(); - - float data[items_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - const int e = threads_per_token * i + ei; - if (e < expert_num && ti < token_num) { - data[i] = __ldg(logits + ti * expert_num + e); - - if (ti == 39505) { - printf("%f %d\n", data[i], e); - } - } - } - - unsigned mask = (unsigned)-1; - float max_logit; - - auto run = [&](int k) { - unsigned bit = 1; - unsigned max_bit = 0; - float max_val = -std::numeric_limits::infinity(); - // local maximum - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((mask & bit) && data[i] > max_val) { - max_bit = bit; - max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - int g_max_ei = ei; - float g_max_val = max_val; - if constexpr (threads_per_token > 1) { - // global maximum - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - if (k == 0) { - max_logit = g_max_val; - } - if (ei == g_max_ei) { - mask -= max_bit; - } - }; - - run(0); - for (int k = 1; k < max_top_k; ++k) { - run(k); - } - - mask = ~mask; - - float sum_prob{}; - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if (norm_top_k == false || (mask & bit)) { - data[i] = expf(data[i] - max_logit); - sum_prob += data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - - sum_prob = fdividef(1.f, sum_prob); - - const unsigned group_mask = ((1U << (unsigned)threads_per_token) - 1U) << (unsigned)warp_ti_offset; - - // 1111 1111 << 24 - - const unsigned lanemask_lt = ((1U << (unsigned)ei) - 1U) << (unsigned)warp_ti_offset; - - // 1000 0000 - // 0111 1111 0000 0000 0000 0000 0000 0000 - - int offset = 0; - bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int valid = mask & bit; - const unsigned active = __ballot_sync((uint32_t)-1, valid); - if (valid) { - const int e = threads_per_token * i + ei; - const int idx = offset + 
__popc(active & lanemask_lt); - if (ti == 39505) { - printf("%d %d %f\n", e, idx, data[i] * sum_prob); - } - masks[e * token_num_padded + ti] = idx; - scales[idx * token_num + ti] = data[i] * sum_prob; - // atomic add in Smem - atomicAdd(&shared_accum[e][ti >> log_tile], 1); - } - offset += __popc(active & group_mask); - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - __syncthreads(); - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[e][t]); - } - } -} - -template -__global__ void MoeGateKernel_v4(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - __shared__ int shared_accum[max_expert_num][max_tiles]; - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - shared_accum[e][t] = 0; - } - -#if 0 - logits += blockIdx.x * block_dim / threads_per_token * expert_num; - logits += threadIdx.x / WARP_SIZE * (WARP_SIZE / threads_per_token) * expert_num; - constexpr int tokens_per_warp = WARP_SIZE / threads_per_token; - __shared__ float smem_data[block_dim / WARP_SIZE][tokens_per_warp][max_expert_num + 1]; - PRAGMA_UNROLL - for (int i = 0; i < tokens_per_warp * max_expert_num; i += WARP_SIZE) { - smem_data[warp_id][(lane_id + i) / max_expert_num][(lane_id + i) % max_expert_num] = logits[lane_id + i]; - } - __syncthreads(); - float data[items_per_thread]; - for (int i = 0; i < items_per_thread; ++i) { - const int e = items_per_thread * ei + i; - data[i] = smem_data[warp_id][lane_id / threads_per_token][e]; - } -#else - __syncthreads(); - float data[items_per_thread]; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - } - if (ti < token_num) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; i += alignment) { - const int e = items_per_thread * ei + i; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } -#endif - - unsigned mask = (unsigned)-1; - float max_logit; - - auto run = [&](int k) { - unsigned bit = 1; - unsigned max_bit = 0; - float max_val = -std::numeric_limits::infinity(); - // local maximum - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((mask & bit) && data[i] > max_val) { - max_bit = bit; - max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - int g_max_ei = ei; - float g_max_val = max_val; - if constexpr (threads_per_token > 1) { - // 
global maximum - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - if (k == 0) { - max_logit = g_max_val; - } - if (ei == g_max_ei) { - mask -= max_bit; - } - }; - - run(0); - for (int k = 1; k < max_top_k; ++k) { - run(k); - } - - mask = ~mask; - - float sum_prob{}; - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if (norm_top_k == false || (mask & bit)) { - data[i] = expf(data[i] - max_logit); - sum_prob += data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - - sum_prob = fdividef(1.f, sum_prob); - - const int count = __popc(mask); - - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; - - int idx{}; - WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); - - bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int valid = mask & bit; - const int e = items_per_thread * ei + i; - if (valid) { - masks[e * token_num_padded + ti] = idx; - scales[idx * token_num + ti] = data[i] * sum_prob; - // atomic add in Smem - atomicAdd(&shared_accum[e][ti >> log_tile], 1); - ++idx; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - __syncthreads(); - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[e][t]); - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// - -template -__global__ void MoeGateKernel_v5(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - __shared__ int shared_accum[max_expert_num][max_tiles]; - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - shared_accum[e][t] = 0; - } - - __syncthreads(); - - constexpr int sort_k = std::min(items_per_thread, max_top_k); - - float data[items_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - } - if (ti < token_num) { - PRAGMA_UNROLL 
- for (int i = 0; i < items_per_thread; i += alignment) { - const int e = items_per_thread * ei + i; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } - - float value[items_per_thread]; - int index[items_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - value[i] = data[i]; - index[i] = items_per_thread * ei + i; - } - - // PRAGMA_UNROLL - // for (int i = 0; i < sort_k; ++i) { - // for (int j = items_per_thread - 1; j > i; --j) { - // if (value[j] > value[j - 1]) { - // auto tmp0 = value[j]; - // value[j] = value[j - 1]; - // value[j - 1] = tmp0; - // auto tmp1 = index[j]; - // index[j] = index[j - 1]; - // index[j - 1] = tmp1; - // } - // } - // } - - PRAGMA_UNROLL - for (int j = items_per_thread - 1; j > 0; --j) { - if (value[j] > value[j - 1]) { - auto tmp0 = value[j]; - value[j] = value[j - 1]; - value[j - 1] = tmp0; - auto tmp1 = index[j]; - index[j] = index[j - 1]; - index[j - 1] = tmp1; - } - } - - float max_logit; - - int count = 0; - - auto run = [&](int k) { - float max_val = -std::numeric_limits::infinity(); - - PRAGMA_UNROLL - for (int i = 0; i <= k; ++i) { - if (i == count) { - max_val = value[i]; - } - } - - if (k + 1 < sort_k) { - PRAGMA_UNROLL - for (int j = items_per_thread - 1; j > k + 1; --j) { - if (value[j] > value[j - 1]) { - auto tmp0 = value[j]; - value[j] = value[j - 1]; - value[j - 1] = tmp0; - auto tmp1 = index[j]; - index[j] = index[j - 1]; - index[j - 1] = tmp1; - } - } - } - - int g_max_ei = ei; - float g_max_val = max_val; - if constexpr (threads_per_token > 1) { - // global maximum - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - - if (k == 0) { - max_logit = g_max_val; - } - - if (ei == g_max_ei) { - count += 1; - } - }; - - PRAGMA_UNROLL - for (int k = 0; k < max_top_k; ++k) { - run(k); - } - - float sum_prob{}; - - PRAGMA_UNROLL - for (int i = 0; i < sort_k; ++i) { - if (i < count) { - value[i] = expf(value[i] - max_logit); - sum_prob += value[i]; - } - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - sum_prob = fdividef(1.f, sum_prob); - - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; - - int idx{}; - WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); - - PRAGMA_UNROLL - for (int i = 0; i < sort_k; ++i) { - if (ti < token_num && i < count) { - const int e = index[i]; - masks[e * token_num_padded + ti] = idx; - scales[idx * token_num + ti] = value[i] * sum_prob; - atomicAdd(&shared_accum[e][ti >> log_tile], 1); - ++idx; - } - } - - __syncthreads(); - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[e][t]); - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////// - -template -__global__ void MoeGateKernel_v6(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - 
int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - __shared__ int shared_accum[max_expert_num][max_tiles]; - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - shared_accum[e][t] = 0; - } - - __syncthreads(); - - float data[items_per_thread]; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - } - if (ti < token_num) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; i += alignment) { - const int e = items_per_thread * ei + i; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } - - float max_logit; - - unsigned mask = (unsigned)-1; - unsigned max_bit = 0; - float max_val = -std::numeric_limits::infinity(); - { - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((mask & bit) && data[i] > max_val) { - max_bit = bit; - max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - } - - auto run = [&](int k) { - unsigned next_mask = mask - max_bit; - unsigned next_max_bit = 0; - float next_max_val = -std::numeric_limits::infinity(); - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((next_mask & bit) && data[i] > next_max_val) { - next_max_bit = bit; - next_max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - int g_max_ei = ei; - float g_max_val = max_val; - - if constexpr (threads_per_token > 1) { - // global maximum - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - - if (k == 0) { - max_logit = g_max_val; - } - - if (ei == g_max_ei) { - mask = next_mask; - max_bit = next_max_bit; - max_val = next_max_val; - } - }; - - run(0); - for (int k = 1; k < max_top_k; ++k) { - run(k); - } - - mask = ~mask; - - float sum_prob{}; - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if (norm_top_k == false || (mask & bit)) { - data[i] = expf(data[i] - max_logit); - sum_prob += data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - - sum_prob = fdividef(1.f, sum_prob); - - const int count = __popc(mask); - - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; - - int idx{}; - WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); - - bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int valid = mask & 
bit; - const int e = items_per_thread * ei + i; - if (valid) { - masks[e * token_num_padded + ti] = idx; - scales[idx * token_num + ti] = data[i] * sum_prob; - // atomic add in Smem - atomicAdd(&shared_accum[e][ti >> log_tile], 1); - ++idx; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - __syncthreads(); - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[e][t]); - } - } -} - -template -__global__ void MoeGateKernel_v7(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - // +1 padding greatly reduced (-80%) bank conflicts - __shared__ int shared_accum[max_tiles][max_expert_num + 1]; - - for (int i = threadIdx.x; i < max_tiles * max_expert_num; i += block_dim) { - int e = i % max_expert_num; - int t = i / max_expert_num; - shared_accum[t][e] = 0; - } - -#if 0 - logits += blockIdx.x * block_dim / threads_per_token * expert_num; - logits += threadIdx.x / WARP_SIZE * (WARP_SIZE / threads_per_token) * expert_num; - constexpr int tokens_per_warp = WARP_SIZE / threads_per_token; - __shared__ float smem_data[block_dim / WARP_SIZE][tokens_per_warp][max_expert_num]; - PRAGMA_UNROLL - for (int i = 0; i < tokens_per_warp * max_expert_num; i += WARP_SIZE) { - smem_data[warp_id][(lane_id + i) / max_expert_num][(lane_id + i) % max_expert_num] = logits[lane_id + i]; - } - __syncthreads(); - float data[items_per_thread]; - for (int i = 0; i < items_per_thread; ++i) { - const int e = items_per_thread * ei + i; - data[i] = smem_data[warp_id][lane_id / threads_per_token][e]; - } -#else - __syncthreads(); - float data[items_per_thread]; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - } - if (ti < token_num) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; i += alignment) { - const int e = items_per_thread * ei + i; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } -#endif - - unsigned mask = (unsigned)-1; - float max_logit; - - auto run = [&](int k) { - unsigned bit = 1; - unsigned max_bit = 0; - float max_val = -std::numeric_limits::infinity(); - // local maximum - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((mask & bit) && data[i] > max_val) { - max_bit = bit; - max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - int g_max_ei = ei; - float g_max_val = max_val; - if constexpr (threads_per_token > 1) { - // global maximum 
- PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - if (k == 0) { - max_logit = g_max_val; - } - if (ei == g_max_ei) { - mask -= max_bit; - } - }; - - run(0); - for (int k = 1; k < max_top_k; ++k) { - run(k); - } - - mask = ~mask; - - float sum_prob{}; - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if (norm_top_k == false || (mask & bit)) { - data[i] = expf(data[i] - max_logit); - sum_prob += data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - - sum_prob = fdividef(1.f, sum_prob); - - const int count = __popc(mask); - - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; - - int idx{}; - WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); - - bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int valid = mask & bit; - const int e = items_per_thread * ei + i; - if (valid) { - masks[e * token_num_padded + ti] = idx; - // scales[idx * token_num + ti] = data[i] * sum_prob; - scales[ti * top_k + idx] = data[i] * sum_prob; - // atomic add in Smem - atomicAdd(&shared_accum[ti >> log_tile][e], 1); - ++idx; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - __syncthreads(); - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int t = i % max_tiles; - int e = i / max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[t][e]); - } - } -} - template= 1; m /= 2) { - // sum_v += __shfl_xor_sync((uint32_t)-1, sum_v, m); - // } - // if (ti == 0) { - // printf("sum=%f\n", sum_v); - // } - using WarpScan = cub::WarpScan; __shared__ typename WarpScan::TempStorage temp_storage[tokens_per_cta]; @@ -1453,144 +559,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] } } -#ifdef USE_WARPSORT - -struct Greater { - template - __device__ bool operator()(T a, T b) const noexcept - { - return a > b; - } -}; - -template -__global__ void MoeGateKernel_v9(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - // +1 padding greatly reduced (-80%) bank conflicts - __shared__ int shared_accum[max_tiles][max_expert_num + 1]; - - PRAGMA_UNROLL - for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { 
- int e = (i + threadIdx.x) % max_expert_num; - int t = (i + threadIdx.x) / max_expert_num; - shared_accum[t][e] = 0; - } - - __syncthreads(); - - float data[items_per_thread]; - int idxs[items_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - idxs[i] = threads_per_token * (i / alignment * alignment) + i % alignment + ei * alignment; - } - - if (ti < token_num) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; i += alignment) { - const int e = threads_per_token * i + ei * alignment; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } - - constexpr float kLog2e = 1.4426950408889634074; - - using MergeSort = cub::WarpMergeSort; - - union Smem { - typename MergeSort::TempStorage temp[block_dim / threads_per_token]; - // int2 sorted[max_top_k][block_dim / threads_per_token + 1]; - int2 sorted[block_dim / threads_per_token][max_top_k + 1]; - }; - - __shared__ Smem smem; - - MergeSort{smem.temp[threadIdx.x / threads_per_token]}.Sort(data, idxs, Greater{}); - - __syncthreads(); - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int k = threads_per_token * ei + i; // blocked - if (k < max_top_k) { - // smem.sorted[k][threadIdx.x / threads_per_token] = int2{float_as_int(data[i]), idxs[i]}; - smem.sorted[threadIdx.x / threads_per_token][k] = int2{float_as_int(data[i]), idxs[i]}; - } - } - - __syncthreads(); - - constexpr int top_k_per_thread = cdiv(max_top_k, threads_per_token); - - int es[top_k_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < top_k_per_thread; ++i) { - const int k = ei * top_k_per_thread + i; - if (k < max_top_k) { - // const int2 tmp = smem.sorted[k][threadIdx.x / threads_per_token]; - const int2 tmp = smem.sorted[threadIdx.x / threads_per_token][k]; - const int e = tmp.y; - masks[e * token_num_padded + ti] = k; - scales[ti * top_k + k] = int_as_float(tmp.x); - es[i] = e; - // atomicAdd(&shared_accum[ti >> log_tile][e], 1); - } - } - - PRAGMA_UNROLL - for (int i = 0; i < top_k_per_thread; ++i) { - const int k = ei * top_k_per_thread + i; - if (k < max_top_k) { - atomicAdd(&shared_accum[ti >> log_tile][es[i]], 1); - } - } - - __syncthreads(); - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int t = i % max_tiles; - int e = i / max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[t][e]); - } - } -} - -#endif - template inline constexpr std::integral_constant _Int{}; @@ -1618,13 +586,15 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - { + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(tokens, threads / thrs_per_tok); + + cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - auto invoke = [&](auto e) { - constexpr int threads = 128; - const int blocks = ceil_div(tokens * 2, threads); - static constexpr int top_k = decltype(e)::value; - MoeGateKernel_V2<<>>( // + MoeGateKernel_v8 + <<>>( // scales, (int8_t*)masks, accum, @@ -1633,34 +603,37 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n tiles, tokens, tokens_padded, - experts); - }; - - auto invoke2 = [&](auto max_expert_num, auto top_k, auto items_per_thread) { - constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; - constexpr 
int threads = 128; - const int blocks = ceil_div(tokens, threads / thrs_per_tok); - - cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - - MoeGateKernel_v8 - <<>>( // - scales, - (int8_t*)masks, - accum, - logits, - log_tile, - tiles, - tokens, - tokens_padded, - experts, - experts_per_token, - norm_topk); - }; + experts, + experts_per_token, + norm_topk); + }; - invoke2(_Int<64>, _Int<8>, _Int<16>); + auto fail = [&] { + std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::abort(); + }; - // invoke(_Int<2>); + if (experts <= 8) { + if (experts_per_token <= 2) { + invoke(_Int<8>, _Int<2>, _Int<8>); + } + else { + invoke(_Int<8>, _Int<8>, _Int<8>); + } + } + else if (experts <= 64) { + if (experts_per_token <= 4) { + invoke(_Int<64>, _Int<4>, _Int<16>); + } + else if (experts_per_token <= 8) { + invoke(_Int<64>, _Int<8>, _Int<16>); + } + else { + fail(); + } + } + else { + fail(); } { diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 3b9bb1c11..f6f9ab0ef 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -709,12 +709,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud }; if (inter_size_) { - std::cerr << "process FFN\n"; + // std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } if (!moe_weights.experts.empty()) { - std::cerr << "process MoE\n"; + // std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 21c1c666c..169fb53bc 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -197,7 +197,7 @@ struct MoeFfnWeight { bool fuse_silu_act) { - printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); if (expert_num == 0) { return; From 8fade570d22fa34b167653524d400451072c09b6 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 7 Nov 2024 08:22:26 +0000 Subject: [PATCH 04/21] fix lint --- lmdeploy/turbomind/deploy/module.py | 3 ++- .../turbomind/deploy/source_model/qwen.py | 19 ++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 2d3575c37..8d998abe2 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -166,7 +166,8 @@ def apply(self, i: int, r: BaseReader): if self.shared_gate: shared_gate = transpose(r.moe_ffn_shared_gate(i)) # print(shared_gate) - self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i)) + self.model.save_split(shared_gate, + self._moe_ffn_shared_gate.format(i)) class Attn(Module): diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 071ade122..772bd0303 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -120,7 +120,6 @@ def model_info(self): cfg = super().model_info() cfg['attn_bias'] = 1 return cfg - class Qwen2MoeReader(LlamaReader): @@ -137,11 +136,10 @@ def moe_ffn_expert(self, e=None, i=None, kind=None): tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + 
def moe_ffn_gate(self, i): - return self.params.get( - f'model.layers.{i}.mlp.gate.weight') - + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + def _ffn(self, i: int, kind: str): """Get ffn kind for layer i.""" if not kind: @@ -153,11 +151,11 @@ def _ffn(self, i: int, kind: str): tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + def moe_ffn_shared_gate(self, i): return self.params.get( - f'model.layers.{i}.mlp.shared_expert_gate.weight' - ) + f'model.layers.{i}.mlp.shared_expert_gate.weight') + @INPUT_MODELS.register_module(name='qwen2-moe') class Qwen2MoeModel(LlamaModel): @@ -165,9 +163,8 @@ class Qwen2MoeModel(LlamaModel): Reader = Qwen2MoeReader def tokenizer_info(self): - """ - https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_config.json - """ # noqa: E501 + """https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_con + fig.json.""" # noqa: E501 n_words = 152064 bos_id = 151643 eos_id = 151645 From 4c0f9026dabcda0eec54fa3043cb21ef4c62a134 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 7 Nov 2024 08:29:27 +0000 Subject: [PATCH 05/21] clean up --- src/turbomind/kernels/gemm/moe_utils_v2.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index dc2c21d7c..5912c60a8 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -6,14 +6,9 @@ #include #include -#include -#include #include #include -#include -#include #include -#include #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" From da6aeff2d05e1bf6c0bfe26f7419a2ae1367e6c9 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 8 Nov 2024 07:47:48 +0000 Subject: [PATCH 06/21] layer-wise `inter_size` & `expert_num` --- lmdeploy/turbomind/deploy/config.py | 7 +- lmdeploy/turbomind/deploy/module.py | 14 +-- .../turbomind/deploy/target_model/base.py | 20 +++- src/turbomind/kernels/gemm/context.h | 13 +-- src/turbomind/kernels/gemm/test/testbed.h | 2 +- .../models/llama/LlamaDecoderLayerWeight.cc | 91 ++++++++++++++++++- .../models/llama/LlamaDecoderLayerWeight.h | 36 +++++--- src/turbomind/models/llama/LlamaDenseWeight.h | 89 +++++------------- src/turbomind/models/llama/LlamaV2.cc | 1 - src/turbomind/models/llama/LlamaV2.h | 1 - src/turbomind/models/llama/LlamaWeight.cc | 33 ++++++- src/turbomind/models/llama/LlamaWeight.h | 39 ++++---- src/turbomind/models/llama/llama_params.h | 42 +++++---- src/turbomind/models/llama/moe_ffn_layer.cc | 41 +++++---- src/turbomind/models/llama/moe_ffn_layer.h | 16 ++-- src/turbomind/models/llama/unified_decoder.cc | 4 +- src/turbomind/models/llama/weight_type.h | 55 +++++++++++ .../triton_backend/llama/LlamaTritonModel.cc | 65 +++++++------ .../triton_backend/llama/LlamaTritonModel.h | 3 - 19 files changed, 365 insertions(+), 207 deletions(-) create mode 100644 src/turbomind/models/llama/weight_type.h diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index a535b0d4c..bfa46f325 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -2,6 +2,7 @@ import inspect import json from dataclasses import asdict, fields +from typing import List # use pydantic.dataclasses.dataclass to check data type from pydantic.dataclasses import dataclass @@ -36,7 +37,8 @@ class ModelConfig: hidden_units: int = None vocab_size: int = None num_layer: int = None - inter_size: int = None + # inter_size: 
int = None + inter_size: List[int] = None norm_eps: float = None attn_bias: int = 0 start_id: int = None @@ -47,7 +49,8 @@ class ModelConfig: session_len: int = None tp: int = 1 model_format: str = 'hf' - expert_num: int = 0 + # expert_num: int = 0 + expert_num: List[int] = None expert_inter_size: int = 0 experts_per_token: int = 0 moe_shared_gate: int = False diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 8d998abe2..51e842aee 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -96,10 +96,12 @@ class Ffn(Module): def __init__(self, model: BaseOutputModel): self.model = model self.tp = model.tensor_para_size + # inter_sizes in config are padded and my differ from what's in the weights self.inter_size = model.model_config.inter_size self.group_size = max(1, model.model_config.group_size) def _export(self, + inter_size: int, fmt: str, idx: int, w123, @@ -110,11 +112,11 @@ def _export(self, w1, w2, w3 = map(transpose, w123) if not is_lora_a: - w1 = pad_out_dims(w1, self.inter_size) - w3 = pad_out_dims(w3, self.inter_size) + w1 = pad_out_dims(w1, inter_size) + w3 = pad_out_dims(w3, inter_size) if not is_lora_b: group_size = self.group_size if apply_gs else 1 - w2 = pad_in_dims(w2, self.inter_size // group_size) + w2 = pad_in_dims(w2, inter_size // group_size) w1, w2, w3 = map(pack_fn, (w1, w2, w3)) self.model.save_split(w1, @@ -132,7 +134,7 @@ def _export(self, def apply(self, i: int, r: BaseReader): for e in get_params(r.ffn(i, None)): - e(partial(self._export, self._ffn), partial(r.ffn, i), i) + e(partial(self._export, self.inter_size[i], self._ffn), partial(r.ffn, i), i) class MoeFfn(Ffn): @@ -155,9 +157,9 @@ def __init__(self, model: BaseOutputModel): def apply(self, i: int, r: BaseReader): for p in get_params(r.moe_ffn_expert()): - for e in range(self.expert_num): + for e in range(self.expert_num[i]): fmt = self._moe_ffn_expert.replace('E', str(e)) - p(partial(self._export, fmt), partial(r.moe_ffn_expert, e, i), + p(partial(self._export, self.inter_size, fmt), partial(r.moe_ffn_expert, e, i), i) gate = transpose(r.moe_ffn_gate(i)) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 4750cde85..c989d630d 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import collections import os.path as osp from abc import ABC @@ -64,13 +65,14 @@ def __init__(self, # get `model_info` and `tokenizer_info` at first, which # will be updated to `self.model_config` and `self.attention_config` self.input_model_info = self.input_model.model_info() + self.input_model_info = self.single_to_list( + self.input_model_info, keys=['inter_size', 'expert_num']) self.input_model_tokenizer_info = self.input_model.tokenizer_info() self.permute_qk = self.input_model_info.get('permute_qk', True) - self.update_model_config() - self.model_config.inter_size = _pad_inter_size( - self.model_config.inter_size, self.model_config.group_size, - self.tensor_para_size) + for i, v in enumerate(self.model_config.inter_size): + self.model_config.inter_size[i] = _pad_inter_size( + v, self.model_config.group_size, self.tensor_para_size) if self.model_config.expert_num: self.model_config.expert_inter_size = _pad_inter_size( self.model_config.expert_inter_size, @@ -78,11 +80,21 @@ def __init__(self, self.model_config.verify() assert self.model_config.kv_head_num % self.tensor_para_size == 0 + # print(self.model_config) + self.update_attention_config() self.update_lora_config() # ! Dependency on `self` self.model = model_cls(self) + def single_to_list(self, config: dict, keys): + num_layer = int(config['num_layer']) + for k in keys: + v = config.get(k, None) + if v is not None and not isinstance(v, collections.Sequence): + config[k] = [v] * num_layer + return config + def update_model_config(self): """Update `self.model_config` according to the input_model's `tokenizer_info` and `model_info`""" diff --git a/src/turbomind/kernels/gemm/context.h b/src/turbomind/kernels/gemm/context.h index 4fec5b732..bd03917b8 100644 --- a/src/turbomind/kernels/gemm/context.h +++ b/src/turbomind/kernels/gemm/context.h @@ -113,12 +113,7 @@ class DynamicGemmContext: public StaticGemmContext { class MoeGemmContext: public Context { public: - MoeGemmContext(int experts, - int experts_per_token, - // int output_dims, - // int input_dims, - const cudaDeviceProp& prop, - cudaStream_t stream); + MoeGemmContext(int experts, int experts_per_token, const cudaDeviceProp& prop, cudaStream_t stream); ~MoeGemmContext() override; @@ -156,9 +151,11 @@ class MoeGemmContext: public Context { Tape Schedule(const LaunchSpec&) override; - void set_offsets(const int* offsets) + void update(int expert_num, int experts_per_token, const int* offsets) { - offsets_ = offsets; + expert_num_ = expert_num; + experts_per_token_ = experts_per_token; + offsets_ = offsets; } protected: diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 7a089fbdf..2678470bb 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -357,7 +357,7 @@ class Testbed { } } - ((MoeGemmContext*)ctx_.get())->set_offsets(moe_m_offsets_.data().get()); + ((MoeGemmContext*)ctx_.get())->update(experts_, exp_per_tok_, moe_m_offsets_.data().get()); CHECK(batch_dim == 0); CHECK(a_desc_.order == kRowMajor); diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index f6f9ab0ef..fadebda78 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -50,7 +50,7 @@ static bool is_fuse_silu_act() }(); return value; } - +#if 0 template LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, size_t head_num, @@ -145,7 +145,90 @@ 
LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, mallocWeights(); } +#else + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank): + head_num_(model.head_num), + kv_head_num_(model.kv_head_num), + size_per_head_(model.head_dim), + hidden_units_(model.hidden_units), + inter_size_(model.inter_size.at(layer_id)), + weight_type_(model.weight_type), + attn_bias_(model.attn_bias), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) +{ + if (lora_param.policy == LoraPolicy::kPlora) { + std::vector keys = { + "attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"}; + std::vector*> weights = {&self_attn_weights.qkv, + &self_attn_weights.output, + &ffn_weights.gating, + &ffn_weights.output, + &ffn_weights.intermediate}; + for (int i = 0; i < keys.size(); i++) { + const auto& name = keys[i]; + auto& weight = *weights[i]; + int rank = lora_param.r; + float scale = lora_param.scale; + std::string full_name = "layers." + std::to_string(layer_id) + "." + name; + + for (const auto& [re, pr] : lora_param.rank_pattern) { + if (std::regex_search(full_name, pr.first)) { + rank = pr.second; + TM_LOG_DEBUG("find rank, pattern=%s, name=%s, value=%d", re.c_str(), full_name.c_str(), rank); + break; + } + } + for (const auto& [re, pr] : lora_param.scale_pattern) { + if (std::regex_search(full_name, pr.first)) { + scale = pr.second; + TM_LOG_DEBUG("find scale pattern=%s, name=%s, value=%f", re.c_str(), full_name.c_str(), scale); + break; + } + } + if (rank) { + weight.lora.r = rank; + weight.lora.scale = scale; + weight.lora.policy = lora_param.policy; + } + } + } + + fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; + + self_attn_weights.qkv.input_dims = hidden_units_; + self_attn_weights.qkv.output_dims = (head_num_ + 2 * kv_head_num_) * size_per_head_ / tensor_para_size_; + self_attn_weights.qkv.type = weight_type_; + self_attn_weights.qkv.group_size = model.group_size; + + self_attn_weights.output.input_dims = (head_num_ * size_per_head_) / tensor_para_size_; + self_attn_weights.output.output_dims = hidden_units_; + self_attn_weights.output.type = weight_type_; + self_attn_weights.output.group_size = model.group_size; + + ffn_weights = LlamaFfnWeight{ + hidden_units_, + inter_size_, + tensor_para_size_, + weight_type_, + model.group_size, + weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), + }; + + moe_weights = MoeFfnWeight{ + layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; + + mallocWeights(); +} +#endif template size_t LlamaDecoderLayerWeight::workspace_size() const noexcept { @@ -709,12 +792,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud }; if (inter_size_) { - // std::cerr << "process FFN\n"; + std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } if (!moe_weights.experts.empty()) { - // std::cerr << "process MoE\n"; + std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; @@ -722,7 +805,7 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud for (auto& e : moe_weights.experts) { - process_ffn(e, moe_weights.method); + process_ffn(e, moe_weights.method == MoeParam::kFused); const auto& fused = e.fused_gating_intermediate; const auto& output = e.output; diff --git 
a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index f68a103dd..342775b70 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -30,21 +30,29 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = delete; - LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + // LlamaDecoderLayerWeight(int layer_idx, + // size_t head_num, + // size_t kv_head_num, + // size_t size_per_head, + // size_t hidden_units, + // size_t inter_size, + // WeightType weight_type, + // int group_size, + // LoraParam lora_param, + // bool attn_bias, + // MoeParam moe_param, + // size_t tensor_para_size, + // size_t tensor_para_rank); + + LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); + ~LlamaDecoderLayerWeight(); - LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; + LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete; void loadModel(std::string dir_path, FtCudaDataType model_file_type); diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 169fb53bc..eed1a4119 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -20,64 +20,13 @@ #pragma once #include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/llama/weight_type.h" #include "src/turbomind/utils/cuda_utils.h" #include namespace turbomind { -enum class WeightType : int -{ - kFP32, - kFP16, - kFP8, // not supported yet - kBF16, - kINT8, - kINT4 -}; - -template -constexpr WeightType get_default_weight_type() -{ - if constexpr (std::is_same_v) { - return WeightType::kFP16; - } - else if constexpr (std::is_same_v) { - return WeightType::kBF16; - } - else if constexpr (std::is_same_v) { - return WeightType::kFP32; - } - else { - static_assert(sizeof(T) != sizeof(T), "not implemented"); - return {}; - } -} - -inline size_t getBitSize(WeightType type) -{ - switch (type) { - case WeightType::kFP32: - return 32; - case WeightType::kFP16: - return 16; - case WeightType::kFP8: - return 8; - case WeightType::kBF16: - return 16; - case WeightType::kINT8: - return 8; - case WeightType::kINT4: - return 4; - } - return 0; -} - -enum class LoraPolicy : int -{ - kNull, - kPlora, -}; - inline LoraPolicy getLoraPolicy(const std::string& policy) { if (policy == "plora") { @@ -186,23 +135,27 @@ struct MoeFfnWeight { MoeFfnWeight() = default; - MoeFfnWeight(size_t hidden_dim, - int inter_size, - int expert_num, - int method, - bool has_shared_gate, - size_t tp, - WeightType weight_type, - int group_size, - bool fuse_silu_act) + MoeFfnWeight(int layer_id, + const MoeParam& param, + size_t hidden_dim, + WeightType weight_type, + int group_size, + size_t tp, + bool fuse_silu_act) { - // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (param.expert_num.size() < layer_id) { + return; + } + + const int expert_num = 
param.expert_num[layer_id]; if (expert_num == 0) { return; } + printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); + gate.input_dims = hidden_dim; gate.output_dims = expert_num; gate.type = get_default_weight_type(); @@ -210,15 +163,15 @@ struct MoeFfnWeight { experts.resize(expert_num); - this->method = method; - fuse_silu_act = fuse_silu_act && method; + method = param.method; + fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; for (auto& e : experts) { // inter size is divided by tp in `FfnWeight` - e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; + e = LlamaFfnWeight{hidden_dim, (size_t)param.inter_size, tp, weight_type, group_size, fuse_silu_act}; } - if (has_shared_gate) { + if (param.shared_gate) { shared_gate.input_dims = hidden_dim; shared_gate.output_dims = 1; shared_gate.type = get_default_weight_type(); @@ -236,7 +189,7 @@ struct MoeFfnWeight { LlamaFfnWeight block; - int method{}; + MoeParam::Method method{}; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 3d50910ad..05b22deed 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -72,7 +72,6 @@ LlamaV2::LlamaV2(const ModelParam& model, lora_param_(lora), head_num_(model.head_num), size_per_head_(model.head_dim), - inter_size_(model.inter_size), hidden_units_(model.hidden_units), layer_num_(model.layer_num), vocab_size_(model.vocab_size), diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 6321d09d7..658282f5e 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -113,7 +113,6 @@ class LlamaV2 { const size_t head_num_; const size_t size_per_head_; const size_t hidden_units_; - const size_t inter_size_; const size_t layer_num_; const size_t vocab_size_; const size_t vocab_size_padded_; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 1ac2d82dd..325789f29 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -24,7 +24,7 @@ #include namespace turbomind { - +#if 0 template LlamaWeight::LlamaWeight(size_t head_num, size_t kv_head_num, @@ -75,6 +75,37 @@ LlamaWeight::LlamaWeight(size_t head_num, mallocWeights(); } +#else + +template +LlamaWeight::LlamaWeight( + const ModelParam& model, const LoraParam& lora_param, const MoeParam& moe_param, size_t tp_size, size_t tp_rank): + hidden_units_(model.hidden_units), + inter_size_(model.inter_size), + vocab_size_(model.vocab_size), + vocab_size_padded_(model.vocab_size), + num_layer_(model.layer_num), + weight_type_(model.weight_type), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) +{ + if (vocab_size_padded_ % tensor_para_size_ != 0) { + vocab_size_padded_ = (vocab_size_padded_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; + TM_LOG_WARNING("pad vocab size from %d to %d", vocab_size_, vocab_size_padded_); + } + + FT_CHECK(hidden_units_ % tensor_para_size_ == 0); + + decoder_layer_weights.reserve(num_layer_); + for (unsigned l = 0; l < num_layer_; ++l) { + decoder_layer_weights.push_back( + new LlamaDecoderLayerWeight(l, model, lora_param, moe_param, tp_size, tp_rank)); + } + + mallocWeights(); +} + +#endif template LlamaWeight::~LlamaWeight() diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index c04bf6c5a..f4ca7e455 
100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -29,24 +29,30 @@ namespace turbomind { template struct LlamaWeight { LlamaWeight() = default; - LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + // LlamaWeight(size_t head_num, + // size_t kv_head_num, + // size_t size_per_head, + // size_t hidden_units, + // size_t inter_size, + // size_t vocab_size, + // size_t num_layer, + // bool attn_bias, + // WeightType weight_type, + // int group_size, + // LoraParam lora_param, + // MoeParam moe_param, + // size_t tensor_para_size, + // size_t tensor_para_rank); + + LlamaWeight(const ModelParam& model_param, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); ~LlamaWeight(); - LlamaWeight(const LlamaWeight& other) = delete; + LlamaWeight(const LlamaWeight& other) = delete; LlamaWeight& operator=(const LlamaWeight& other) = delete; void loadModel(std::string dir_path); @@ -64,13 +70,14 @@ struct LlamaWeight { void mallocWeights(); size_t hidden_units_; - size_t inter_size_; size_t vocab_size_; size_t vocab_size_padded_; size_t num_layer_; WeightType weight_type_; size_t tensor_para_size_; size_t tensor_para_rank_; + + std::vector inter_size_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 2ea63f041..baac216f9 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -2,40 +2,45 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include #include #include #include +#include "src/turbomind/models/llama/weight_type.h" + namespace turbomind { struct ModelParam { - size_t head_num; - size_t head_dim; - size_t kv_head_num; - size_t hidden_units; - size_t layer_num; - size_t inter_size; - size_t vocab_size; - float norm_eps; - int quant_policy; - // - int start_id; - int end_id; + size_t head_num; + size_t head_dim; + size_t kv_head_num; + size_t hidden_units; + size_t layer_num; + size_t vocab_size; + float norm_eps; + int quant_policy; + bool attn_bias; + WeightType weight_type; + int group_size; + int start_id; + int end_id; + + std::vector inter_size; }; struct MoeParam { - enum Method - { + enum Method { kNaive, kFused } method; - int expert_num; + int experts_per_token; int inter_size; bool norm_topk; bool shared_gate; + + std::vector expert_num; }; struct AttentionParam { @@ -73,6 +78,11 @@ struct EngineParam { int max_prefill_iters; }; +enum class LoraPolicy : int { + kNull, + kPlora, +}; + struct LoraParam { int r; float scale; diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index 1ad76839d..dcb3f3706 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -17,7 +17,7 @@ namespace turbomind { template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) +void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num) { char* base = 0; @@ -25,8 +25,8 @@ void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) Monotonic alloc{base}; alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); alloc(&inter_buf_, tokens * 
param_.experts_per_token * inter_size_ * 2); - alloc(&logits_, tokens * param_.expert_num); - alloc(&masks_, param_.expert_num * padded); + alloc(&logits_, tokens * expert_num); + alloc(&masks_, expert_num * padded); alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); @@ -80,14 +80,17 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama template void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { - const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const int expert_num = moe.experts.size(); - AllocateBuffer(tokens, padded); + FT_CHECK(expert_num); + + AllocateBuffer(tokens, padded, expert_num); gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); - check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); + check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); sync_check_cuda_error(); // dump_logits(tokens, layer_id); @@ -102,7 +105,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id logits_, tokens, padded, - param_.expert_num, + expert_num, param_.experts_per_token, param_.norm_topk, stream_); @@ -110,17 +113,17 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id if (isTuning()) { std::mt19937 g; - const auto expert_ids = SampleUniform(tokens, param_.expert_num, param_.experts_per_token, g); - std::vector cnt(param_.expert_num); + const auto expert_ids = SampleUniform(tokens, expert_num, param_.experts_per_token, g); + std::vector cnt(expert_num); for (const auto& x : expert_ids) { ++cnt[x]; } h_offsets_[0] = 0; - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { h_offsets_[i + 1] = h_offsets_[i] + cnt[i]; } check_cuda_error( - cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); } if (param_.method == MoeParam::kNaive) { @@ -129,15 +132,15 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id sync_check_cuda_error(); check_cuda_error( - cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); - if (h_offsets_[param_.expert_num] != tokens * param_.experts_per_token) { - FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[param_.expert_num], tokens * param_.experts_per_token)); + if (h_offsets_[expert_num] != tokens * param_.experts_per_token) { + FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[expert_num], tokens * param_.experts_per_token)); } - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { FT_CHECK(moe.experts[i].is_fused_silu == false); @@ -153,7 +156,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } } else { - context_->set_offsets(offsets_); + context_->update(expert_num, param_.experts_per_token, offsets_); auto& block = moe.block; @@ -237,9 +240,9 @@ void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& 
moe) } template -void MoeFfnLayer::dump_logits(int token_num, int layer_id) +void MoeFfnLayer::dump_logits(int token_num, int layer_id, int expert_num) { - std::vector logits(token_num * param_.expert_num); + std::vector logits(token_num * expert_num); check_cuda_error( cudaMemcpyAsync(logits.data(), logits_, sizeof(float) * logits.size(), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); @@ -247,7 +250,7 @@ void MoeFfnLayer::dump_logits(int token_num, int layer_id) auto ptr = logits.data(); std::cout << "layer_id: " << layer_id << std::endl; for (int i = 0; i < token_num; ++i) { - for (int e = 0; e < param_.expert_num; ++e) { + for (int e = 0; e < expert_num; ++e) { std::cout << *ptr++ << " "; } std::cout << std::endl; diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 0f1713f7b..e7edb7a67 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -9,6 +9,7 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/nccl_utils.h" +#include namespace turbomind { @@ -26,23 +27,24 @@ class MoeFfnLayer { linear_(ctx.linear.get()), allocator_(ctx.allocator.get()) { - model.inter_size = param.inter_size; + FT_CHECK(!param.expert_num.empty()); + const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); if (param_.method == MoeParam::kFused) { context_ = std::make_unique( - param.expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); + max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); } else { expert_ffn_ = std::make_unique>(model, tp, ctx, false); } - h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1), false, true); + h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); - offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1)); - accum_ = (int*)allocator_->malloc(sizeof(int) * param_.expert_num * kMoeGateMaxTiles); + offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1)); + accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); } - void AllocateBuffer(size_t tokens, size_t padded); + void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num); void FreeBuffer(); @@ -57,7 +59,7 @@ class MoeFfnLayer { void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); - void dump_logits(int token_num, int layer_id); + void dump_logits(int token_num, int layer_id, int expert_num); private: const size_t inter_size_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 28e8b5f64..bc8a6a147 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -28,11 +28,11 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - if (moe.expert_num) { + if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) { moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); } - if (model.inter_size) { + if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); } diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h new file mode 100644 index 000000000..ade02e10e --- 
/dev/null +++ b/src/turbomind/models/llama/weight_type.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include + +namespace turbomind { + +enum class WeightType : int { + kFP32, + kFP16, + kFP8, // not supported yet + kBF16, + kINT8, + kINT4 +}; + +template +constexpr WeightType get_default_weight_type() +{ + if constexpr (std::is_same_v) { + return WeightType::kFP16; + } + else if constexpr (std::is_same_v) { + return WeightType::kBF16; + } + else if constexpr (std::is_same_v) { + return WeightType::kFP32; + } + else { + static_assert(sizeof(T) != sizeof(T), "not implemented"); + return {}; + } +} + +inline size_t getBitSize(WeightType type) +{ + switch (type) { + case WeightType::kFP32: + return 32; + case WeightType::kFP16: + return 16; + case WeightType::kFP8: + return 8; + case WeightType::kBF16: + return 16; + case WeightType::kINT8: + return 8; + case WeightType::kINT4: + return 4; + } + return 0; +} + +} // namespace turbomind \ No newline at end of file diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 38552be0c..deb28bb35 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -250,17 +250,19 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.kv_head_num = model_reader["kv_head_num"].as(0); model_param_.hidden_units = model_reader["hidden_units"].as(); model_param_.layer_num = model_reader["num_layer"].as(); - model_param_.inter_size = model_reader["inter_size"].as(); model_param_.vocab_size = model_reader["vocab_size"].as(); model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); model_param_.quant_policy = engine_reader["quant_policy"].as(0); - + YAML::Node inter_size = model_reader["inter_size"]; + for (auto it = inter_size.begin(); it != inter_size.end(); ++it) { + model_param_.inter_size.push_back(it->as()); + } // Only weight classes need these - attn_bias_ = model_reader["attn_bias"].as(0); - group_size_ = model_reader["group_size"].as(0); + model_param_.attn_bias = model_reader["attn_bias"].as(0); + model_param_.group_size = model_reader["group_size"].as(0); // rotary embedding parameters attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); @@ -290,19 +292,23 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); - lora_param_.r = lora_reader["lora_r"].as(0); - lora_param_.scale = lora_reader["lora_scale"].as(0); - lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); - lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), + lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.r = lora_reader["lora_r"].as(0); + lora_param_.scale = lora_reader["lora_scale"].as(0); + lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); + lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), [](const std::string& s) { return std::stoi(s); }); - lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), + 
lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), [](const std::string& s) { return std::stof(s); }); - moe_param_.expert_num = model_reader["expert_num"].as(0); + moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); + YAML::Node expert_num = model_reader["expert_num"]; + for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { + moe_param_.expert_num.push_back(it->as()); + } handleMissingParams(); @@ -314,19 +320,19 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - weight_type_ = ft::WeightType::kFP16; + model_param_.weight_type = ft::WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - weight_type_ = ft::WeightType::kBF16; + model_param_.weight_type = ft::WeightType::kBF16; } else if (weight_type_str == "fp32") { - weight_type_ = ft::WeightType::kFP32; + model_param_.weight_type = ft::WeightType::kFP32; } else if (weight_type_str == "int8") { - weight_type_ = ft::WeightType::kINT8; + model_param_.weight_type = ft::WeightType::kINT8; } else if (weight_type_str == "int4") { - weight_type_ = ft::WeightType::kINT4; + model_param_.weight_type = ft::WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; @@ -411,20 +417,8 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>(model_param_.head_num, - model_param_.kv_head_num, - model_param_.head_dim, - model_param_.hidden_units, - model_param_.inter_size, - model_param_.vocab_size, - model_param_.layer_num, - attn_bias_, - weight_type_, - group_size_, - lora_param_, - moe_param_, - tensor_para_size_, - tensor_para_rank); + weights_[device_id] = std::make_shared>( + model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -480,9 +474,11 @@ std::string LlamaTritonModel::toString() std::stringstream ss; ss << "Model: " // << "\nhead_num: " << model_param_.head_num << "\nkv_head_num: " << model_param_.kv_head_num - << "\nsize_per_head: " << model_param_.head_dim << "\ninter_size: " << model_param_.inter_size + << "\nsize_per_head: " + << model_param_.head_dim + // << "\ninter_size: " << model_param_.inter_size << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size - << "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_param_.max_batch_size + << "\nattn_bias: " << model_param_.attn_bias << "\nmax_batch_size: " << engine_param_.max_batch_size << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num << "\nmax_context_token_num: " << engine_param_.max_context_token_num << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter @@ -493,8 +489,9 @@ std::string LlamaTritonModel::toString() << "\nenable_prefix_caching: " << engine_param_.enable_prefix_caching << "\nstart_id: " << model_param_.start_id << "\ntensor_para_size: " << 
tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_ - << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy - << "\ngroup_size: " << group_size_ << "\nexpert_num: " << moe_param_.expert_num + << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy << "\ngroup_size: " + << model_param_.group_size + // << "\nexpert_num: " << moe_param_.expert_num << "\nexpert_per_token: " << moe_param_.experts_per_token << "\nmoe_method: " << moe_param_.method << std::endl; return ss.str(); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 19a143e72..a6c1b862a 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -91,9 +91,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { ft::EngineParam engine_param_; size_t tensor_para_size_; size_t pipeline_para_size_; - ft::WeightType weight_type_; - bool attn_bias_; - int group_size_; std::shared_ptr shared_state_; // Weights & engine instances for the ranks From 1ecf2911a82102520712370302d4c016330cc1b3 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Mon, 11 Nov 2024 08:53:53 +0000 Subject: [PATCH 07/21] add head dim 192 --- .../kernels/attention/CMakeLists.txt | 2 + src/turbomind/kernels/attention/attention.cu | 6 ++ .../attention/codegen/attention_sm80_192.cu | 16 +++++ .../attention/codegen/decoding_sm80_192.cu | 15 +++++ src/turbomind/kernels/attention/decoding.cu | 6 ++ src/turbomind/kernels/attention/impl_16816.h | 61 ++++++++++++------- src/turbomind/kernels/attention/impl_81616.h | 2 +- src/turbomind/kernels/attention/impl_simt.h | 5 +- .../kernels/attention/kv_cache_utils_v2.cu | 12 +++- .../kernels/attention/mainloop_sm80.h | 15 +++-- src/turbomind/kernels/attention/reduce.cu | 6 +- .../kernels/attention/reduce_kernel.h | 9 ++- .../kernels/attention/test_attention.cu | 12 ++-- .../flash_attention2/CMakeLists.txt | 4 +- .../flash_fwd_launch_template.h | 2 +- .../flash_attention2/static_switch.h | 12 ++++ 16 files changed, 142 insertions(+), 43 deletions(-) create mode 100644 src/turbomind/kernels/attention/codegen/attention_sm80_192.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu diff --git a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index af9d47e0e..32de38981 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -38,6 +38,8 @@ add_library(attention STATIC codegen/decoding_sm80_64_f16_f16.cu codegen/decoding_sm80_64_f16_u4.cu codegen/decoding_sm80_64_f16_u8.cu + codegen/attention_sm80_192.cu + codegen/decoding_sm80_192.cu ) set_property(TARGET attention PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index 3f557234b..e7642584c 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ b/src/turbomind/kernels/attention/attention.cu @@ -46,6 +46,12 @@ void dispatchAttention(const AttentionParams& params) else if (params.size_per_head == 128) { return dispatch(std::integral_constant{}); } + + if (params.size_per_head == 192) { + using Config = AttentionConfig; + return invokeAttention(params); + } + FT_CHECK(0); } 
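The new `size_per_head == 192` branch above follows the existing pattern in this file: a run-time head size is mapped onto a fixed set of compile-time kernel instantiations, and the newly created codegen/attention_sm80_192.cu and codegen/decoding_sm80_192.cu units supply those explicit instantiations. A minimal, self-contained sketch of that dispatch pattern (illustrative names only, not the TurboMind API):

    // dispatch_sketch.cc -- hypothetical example using only the standard
    // library; run_kernel<N> stands in for a kernel instantiated for a
    // fixed head dimension, as the codegen/*_192.cu files do for 192.
    #include <cstdio>

    template<int HeadDim>
    void run_kernel()
    {
        std::printf("launching kernel specialized for head_dim=%d\n", HeadDim);
    }

    inline bool dispatch_by_head_dim(int head_dim)
    {
        switch (head_dim) {
            case 64:  run_kernel<64>();  return true;
            case 128: run_kernel<128>(); return true;
            case 192: run_kernel<192>(); return true;  // newly supported size
            default:  return false;  // caller asserts, mirroring FT_CHECK(0)
        }
    }
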
diff --git a/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu new file mode 100644 index 000000000..ceeafa7a6 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu new file mode 100644 index 000000000..9294fc396 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu @@ -0,0 +1,15 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 1b04b7d4e..b50b67f04 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -2,6 +2,7 @@ #include "decoding.h" #include "decoding_config.h" +#include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" // #include "src/turbomind/utils/dispatch.h" #include @@ -113,6 +114,11 @@ void dispatchDecoding(const AttentionParams& params) return false; }; + if (params.size_per_head == 192) { + invokeDecoding>(params); + return; + } + auto success = dispatch(); FT_CHECK(success); diff --git a/src/turbomind/kernels/attention/impl_16816.h b/src/turbomind/kernels/attention/impl_16816.h index 6e8f37f4d..07c7dcb12 100644 --- a/src/turbomind/kernels/attention/impl_16816.h +++ b/src/turbomind/kernels/attention/impl_16816.h @@ -63,26 +63,28 @@ struct Impl>, SmemLayoutV2>>; - using SmemLayoutK = std::conditional_t>, SmemLayoutV2>>; - using SmemLayoutV = std::conditional_t>, SmemLayoutV2>>; using SmemLayoutKVp = void; + static constexpr bool kUseSmemQ = false; + static constexpr bool kUseSmemP = false; + + static_assert(!kUseSmemQ, "current smemQ impl yields inconsistent outputs"); + union SharedStorage { __align__(16) T KV[Stages * (SmemLayoutK::kSize + SmemLayoutV::kSize) / 2]; __align__(16) T Q[SmemLayoutQ::kSize]; }; - static constexpr bool kUseSmemQ = false; - static constexpr bool kUseSmemP = false; - using ThreadMapQ = RakedThreadMap; using ThreadMapKV = RakedThreadMap; @@ -109,22 +111,24 @@ struct Impl sQ{smem_Q}; + SmemAccessor sQ{smem_Q}; - // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout - PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { + // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { - const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; - const int di = lane_id / 16 * 8 + k * 16; - ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + for (int m = 0; m < K_M; ++m) { + PRAGMA_UNROLL + for (int k = 0; k < K_K; ++k) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + 
ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } } } - if constexpr (kUseSmemQ) { + if constexpr (0) { __syncthreads(); // Rearrange Q in smem so that swizzling is not needed for later LDSMs @@ -142,20 +146,25 @@ struct Impl smem_K; + T* smem_Q; FragQ frag_Q; FragK frag_K; __device__ StateQK(SharedStorage& storage, FragQ frag_Q_): smem_K{storage.KV} { - static_assert(!kUseSmemQ, "not implemented"); - PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { + if constexpr (!kUseSmemQ) { PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { - frag_Q[k][m] = frag_Q_[k][m]; + for (int k = 0; k < K_K; ++k) { + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + frag_Q[k][m] = frag_Q_[k][m]; + } } } + else { + smem_Q = storage.Q; + } } __device__ void Load(int k, int pipe_iter) @@ -166,6 +175,16 @@ struct Impl sQ{smem_Q}; + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } + } PRAGMA_UNROLL for (int n = 0; n < K_N; n += 2) { // Load (s16,d16) tiles const int s = n * 8 + offset_s; diff --git a/src/turbomind/kernels/attention/impl_81616.h b/src/turbomind/kernels/attention/impl_81616.h index 3b90bcdf5..f865f1bc3 100644 --- a/src/turbomind/kernels/attention/impl_81616.h +++ b/src/turbomind/kernels/attention/impl_81616.h @@ -104,7 +104,7 @@ struct Impl) { - return std::conditional_t>, SmemLayoutV2>>{}; } diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index a886185a4..667a0ce43 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -165,8 +165,11 @@ struct Impl; + static constexpr int kWarpThreadC_KV = HeadDim != 192 ? HeadDim / kAccessC_KV : 8; + using ThreadMapQ = RakedThreadMap; - using ThreadMapKV = RakedThreadMap, kWarpCount>; + using ThreadMapKV = RakedThreadMap; // `WARP_SIZE / WARP_S` is chosen to achieve minimum kIterS w/o introducing partial S iter using ThreadMapKVp = RakedThreadMap<2, CTA_S, 2, kWarpCount, WARP_SIZE / WARP_S>; diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 20bb00fde..f2e2faef9 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -277,11 +277,14 @@ void invokeProcessKV_v2(char** blocks, }; auto dispatch = [&](auto tkv) { - if (head_dim == 128) { + if (head_dim == 64) { + return invoke(tkv, std::integral_constant{}); + } + else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } - else if (head_dim == 64) { - return invoke(tkv, std::integral_constant{}); + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); } FT_CHECK(0); }; @@ -545,6 +548,9 @@ void invokeFlattenKV_v2(T* k, else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); + } FT_CHECK(0); }; diff --git a/src/turbomind/kernels/attention/mainloop_sm80.h b/src/turbomind/kernels/attention/mainloop_sm80.h index bf0fc1d32..0a65515a4 100644 --- a/src/turbomind/kernels/attention/mainloop_sm80.h +++ b/src/turbomind/kernels/attention/mainloop_sm80.h @@ -52,7 +52,7 @@ struct Mainloop, Impl_> { template __device__ void operator()(Args&&... 
args) { - Run(Sm80_CpAsync{}, ((Args &&) args)...); + Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args&&)args)...); } template @@ -81,8 +81,9 @@ struct Mainloop, Impl_> { } } - template + template __device__ void Run(Sm80_CpAsync, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -199,9 +200,10 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#if 0 + // #if 1 template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -292,14 +294,15 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#elif 1 + // #elif 1 // Load : K0,K1 | V0,K2,V1,K3 ... // Compute : K0 | K1,V0,K2,V1 ... // - more register consumption // - more interleaved HMMA and FMA // - slight performance gain - template + template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter_, FragO& frag_O, @@ -407,7 +410,7 @@ struct Mainloop, Impl_> { __pipeline_commit(); __pipeline_wait_prior(0); } -#endif + // #endif __device__ void Wait() { diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index 12f6aff38..051b2baa7 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -66,12 +66,14 @@ void invokeReduce(T* out, float exp_scale, \ cudaStream_t stream); -INSTANTIATE_invokeReduce(128, half); INSTANTIATE_invokeReduce(64, half); +INSTANTIATE_invokeReduce(128, half); +INSTANTIATE_invokeReduce(192, half); #if ENABLE_BF16 -INSTANTIATE_invokeReduce(128, nv_bfloat16); INSTANTIATE_invokeReduce(64, nv_bfloat16) +INSTANTIATE_invokeReduce(128, nv_bfloat16); +INSTANTIATE_invokeReduce(192, nv_bfloat16); #endif } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/reduce_kernel.h b/src/turbomind/kernels/attention/reduce_kernel.h index 88a3ab3af..14c5005cd 100644 --- a/src/turbomind/kernels/attention/reduce_kernel.h +++ b/src/turbomind/kernels/attention/reduce_kernel.h @@ -127,10 +127,13 @@ struct Reduce { } __syncthreads(); + + // HeadDim / WARP_SIZE + // 128 -> 4 + // 64, 192 -> 2 + constexpr int kVecSize = HeadDim % 128 == 0 ? 
4 : 2; - constexpr int kVecSize = HeadDim / WARP_SIZE; - - using Map = RakedThreadMap; + using Map = RakedThreadMap; static_assert(Map::kIterS == CTA_H); diff --git a/src/turbomind/kernels/attention/test_attention.cu b/src/turbomind/kernels/attention/test_attention.cu index c6d7b4063..804d4815d 100644 --- a/src/turbomind/kernels/attention/test_attention.cu +++ b/src/turbomind/kernels/attention/test_attention.cu @@ -218,14 +218,14 @@ void TestBlocks(const thrust::universal_vector& k_cache, // [B, H, S, #define KV_INT4 0 -#define DECODING 1 +#define DECODING 0 template int test_attention() { AttentionParams params{}; - constexpr size_t kHeadDim = 128; + constexpr size_t kHeadDim = 192; #if DECODING // constexpr size_t kHeadNum = 32; @@ -239,11 +239,11 @@ int test_attention() // constexpr size_t kSequenceLen = 511; // constexpr size_t kSequenceLen = 2047; // constexpr size_t kSequenceLen = 4095; - // constexpr size_t kSequenceLen = 8191; + constexpr size_t kSequenceLen = 8191; // constexpr size_t kSequenceLen = 32767; // constexpr size_t kSequenceLen = 65535; // constexpr size_t kSequenceLen = 131071; - constexpr size_t kSequenceLen = 200000; + // constexpr size_t kSequenceLen = 200000; // constexpr size_t kSequenceLen = 262143; // constexpr size_t kSequenceLen = (1 << 20) - 1; // 1M // constexpr size_t kSequenceLen = (1 << 22) - 1; // 4M @@ -451,6 +451,10 @@ int test_attention() params.qk = qk_buf.data().get(); params.pr = pr_buf.data().get(); + params.attention_scaling = 1.f; + params.llama3_inv_scaling_factor = 0; + params.yarn_ramp_inv_factor_div_2 = 0; + Reference reference(kDump ? Reference::kUNFUSED : Reference::kFLASH_ATTENTION, {}); // Reference reference(Reference::kUNFUSED, {}); reference.Reshape(kInputLen, kContextLen, kHeadNum, kHeadDim, KvHeadNum, kBatchSize); diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt index d41c391e9..81c975058 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt +++ b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt @@ -8,9 +8,11 @@ add_library(${PROJECT_NAME} STATIC # flash_fwd_hdim64_fp16_sm80.cu flash_fwd_hdim128_fp16_sm80.cu flash_fwd_hdim128_bf16_sm80.cu - # flash_fwd_hdim256_fp16_sm80.cu + flash_fwd_hdim256_bf16_sm80.cu + flash_fwd_hdim256_fp16_sm80.cu ) target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR} / include) target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass) + set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h index e108a55f2..245649636 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h @@ -147,7 +147,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream) }); } -#if 0 +#if 1 template void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream) { diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h index fd19a0ea6..ca141ee0b 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h +++ 
b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h @@ -58,6 +58,18 @@ return __VA_ARGS__(); \ } \ }() +#elif 1 +#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ + [&] { \ + if (HEADDIM <= 128) { \ + constexpr static int kHeadDim = 128; \ + return __VA_ARGS__(); \ + } \ + else if (HEADDIM <= 256) { \ + constexpr static int kHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() #else #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ [&] { \ From 0caa11317b37e81f8e5af81ded3589bd8d8d2817 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 13 Nov 2024 15:53:58 +0800 Subject: [PATCH 08/21] refactor weight processing --- examples/cpp/llama/llama_triton_example.cc | 4 +- lmdeploy/turbomind/deploy/config.py | 2 +- src/turbomind/kernels/core/array_ops.h | 2 +- src/turbomind/kernels/gemm/convert_v2.cu | 41 ++- src/turbomind/kernels/gemm/unpack.cu | 34 +- .../models/llama/LlamaDecoderLayerWeight.cc | 340 +++++------------- .../models/llama/LlamaDecoderLayerWeight.h | 21 +- src/turbomind/models/llama/LlamaDenseWeight.h | 185 +++++++++- src/turbomind/models/llama/LlamaWeight.cc | 51 +-- src/turbomind/models/llama/LlamaWeight.h | 26 +- src/turbomind/models/llama/llama_gemm.cc | 2 +- src/turbomind/models/llama/llama_kernels.h | 2 +- src/turbomind/utils/memory_utils.cu | 108 +++--- src/turbomind/utils/memory_utils.h | 13 +- 14 files changed, 443 insertions(+), 388 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index b0e513410..1fb5fa096 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -114,14 +114,14 @@ broadCastRequest(const std::vector& v_start_ids, } else { // conditional case. - ft::deviceMalloc(&d_input_ids, size_1, false); + ft::deviceMalloc(&d_input_ids, size_1, nullptr, false); // ft::deviceMalloc(&d_input_lengths, size_2, false); ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1); // ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2); } if (!v_input_bad_words.empty()) { - ft::deviceMalloc(&d_input_bad_words, size_bad_words, false); + ft::deviceMalloc(&d_input_bad_words, size_bad_words, nullptr, false); ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words); } else { diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index bfa46f325..8bd7e6c51 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -50,7 +50,7 @@ class ModelConfig: tp: int = 1 model_format: str = 'hf' # expert_num: int = 0 - expert_num: List[int] = None + expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 moe_shared_gate: int = False diff --git a/src/turbomind/kernels/core/array_ops.h b/src/turbomind/kernels/core/array_ops.h index 6b639abc8..ec6e7fb4e 100644 --- a/src/turbomind/kernels/core/array_ops.h +++ b/src/turbomind/kernels/core/array_ops.h @@ -172,7 +172,7 @@ inline __device__ void copy(const Array (&src)[M], Array (&dst)[M]) } template -inline __device__ void Store(T* __restrict__ dst, const Array& src) +inline __device__ void Store(T* dst, const Array& src) { if constexpr (sizeof(Array) == sizeof(uint4)) { *(uint4*)dst = (const uint4&)src; diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index ed8b2ee2f..90e4b97dd 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -279,17 +279,44 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int 
sm, bool for return {}; } -void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +namespace { + +template +struct Param { + StridedPtr data[N]; + StridedPtr* ptr; + int n; +}; + +template +__global__ void fill_strided_ptrs(Param param) { - std::vector tmp; - for (const auto& [p, s] : ptrs) { - tmp.push_back({p, s}); + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < param.n) { + param.ptr[idx] = param.data[idx]; } +} + +} // namespace + +void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +{ + constexpr int N = 64; + Param param{}; + static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 StridedPtr* ptr{}; cudaMallocAsync(&ptr, sizeof(StridedPtr) * ptrs.size(), stream); - cudaMemcpyAsync(ptr, tmp.data(), sizeof(StridedPtr) * ptrs.size(), cudaMemcpyDefault, stream); - // Sync before tmp can be destructed - cudaStreamSynchronize(stream); + param.ptr = ptr; + for (int i = 0; i < (int)ptrs.size(); i += N) { + const int n = std::min(ptrs.size() - i, N); + for (int j = 0; j < n; ++j) { + auto& [p, s] = ptrs[i + j]; + param.data[j] = StridedPtr{p, s}; + } + param.n = n; + fill_strided_ptrs<<<1, N, 0, stream>>>(param); + param.ptr += N; + } return ptr; } diff --git a/src/turbomind/kernels/gemm/unpack.cu b/src/turbomind/kernels/gemm/unpack.cu index 92f468d82..39e6a2e1a 100644 --- a/src/turbomind/kernels/gemm/unpack.cu +++ b/src/turbomind/kernels/gemm/unpack.cu @@ -71,14 +71,44 @@ void unpack_awq_gemm(uint4_t* dst, const uint4_t* src, int rows, int cols, cudaS permute_u4<0, 1, 3, 2><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); } +__global__ void transpose_u4_kernel(uint4_t* dst, const uint4_t* src, int s, int c) +{ + const int idx_c = 8 * (threadIdx.x + blockIdx.x * blockDim.x); + const int idx_s = 8 * (threadIdx.y + blockIdx.y * blockDim.y); + if (idx_c >= c || idx_s >= s) { + return; + } + uint32_t ivec[8]; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ivec[i] = ((const uint32_t*)src)[((idx_s + i) * c + idx_c) / 8]; + } + uint32_t ovec[8]{}; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + PRAGMA_UNROLL + for (int j = 0; j < 8; ++j) { + ovec[i] |= (((ivec[j] >> (i * 4)) & 0xfu) << (j * 4)); + } + } + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ((uint32_t*)dst)[((idx_c + i) * s + idx_s) / 8] = ovec[i]; + } +} + void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st) { if (s % 8 || c % 8) { std::cerr << "transpose_u4: invalid shape (" << s << "," << c << "), must be multiple of 8" << std::endl; return; } - Array shape{s, c}; - permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + // Array shape{s, c}; + // permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + + const dim3 block(16, 16); + const dim3 grid((c + 15) / 16, (s + 15) / 16); + transpose_u4_kernel<<>>(dst, src, s, c); } // load -> unpack -> extend_to_u8 -> manipulation -> compat_to_u4 -> store diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index fadebda78..bb0794988 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -50,102 +50,6 @@ static bool is_fuse_silu_act() }(); return value; } -#if 0 -template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - 
LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - head_num_(head_num), - kv_head_num_(kv_head_num), - size_per_head_(size_per_head), - hidden_units_(hidden_units), - inter_size_(inter_size), - weight_type_(weight_type), - attn_bias_(attn_bias), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) -{ - if (lora_param.policy == LoraPolicy::kPlora) { - std::vector keys = { - "attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"}; - std::vector*> weights = {&self_attn_weights.qkv, - &self_attn_weights.output, - &ffn_weights.gating, - &ffn_weights.output, - &ffn_weights.intermediate}; - for (int i = 0; i < keys.size(); i++) { - const auto& name = keys[i]; - auto& weight = *weights[i]; - int rank = lora_param.r; - float scale = lora_param.scale; - std::string full_name = "layers." + std::to_string(layer_idx) + "." + name; - - for (const auto& [re, pr] : lora_param.rank_pattern) { - if (std::regex_search(full_name, pr.first)) { - rank = pr.second; - TM_LOG_DEBUG("find rank, pattern=%s, name=%s, value=%d", re.c_str(), full_name.c_str(), rank); - break; - } - } - for (const auto& [re, pr] : lora_param.scale_pattern) { - if (std::regex_search(full_name, pr.first)) { - scale = pr.second; - TM_LOG_DEBUG("find scale pattern=%s, name=%s, value=%f", re.c_str(), full_name.c_str(), scale); - break; - } - } - if (rank) { - weight.lora.r = rank; - weight.lora.scale = scale; - weight.lora.policy = lora_param.policy; - } - } - } - - fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - - self_attn_weights.qkv.input_dims = hidden_units_; - self_attn_weights.qkv.output_dims = (head_num + 2 * kv_head_num) * size_per_head / tensor_para_size_; - self_attn_weights.qkv.type = weight_type; - self_attn_weights.qkv.group_size = group_size; - - self_attn_weights.output.input_dims = (head_num * size_per_head) / tensor_para_size_; - self_attn_weights.output.output_dims = hidden_units_; - self_attn_weights.output.type = weight_type; - self_attn_weights.output.group_size = group_size; - - ffn_weights = LlamaFfnWeight{ - hidden_units_, - inter_size_, - tensor_para_size_, - weight_type_, - group_size, - weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), - }; - - moe_weights = MoeFfnWeight{hidden_units_, - moe_param.inter_size, - moe_param.expert_num, - moe_param.method, - moe_param.shared_gate, - tensor_para_size_, - weight_type, - group_size, - is_fuse_silu_act()}; - - mallocWeights(); -} -#else template LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, @@ -203,15 +107,14 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - self_attn_weights.qkv.input_dims = hidden_units_; - self_attn_weights.qkv.output_dims = (head_num_ + 2 * kv_head_num_) * size_per_head_ / tensor_para_size_; - self_attn_weights.qkv.type = weight_type_; - self_attn_weights.qkv.group_size = model.group_size; - - self_attn_weights.output.input_dims = (head_num_ * size_per_head_) / tensor_para_size_; - self_attn_weights.output.output_dims = hidden_units_; - self_attn_weights.output.type = weight_type_; - self_attn_weights.output.group_size = model.group_size; + self_attn_weights = LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + attn_bias_, + tensor_para_size_, + weight_type_, + model.group_size}; ffn_weights = LlamaFfnWeight{ hidden_units_, @@ -224,11 +127,25 @@ 
LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, moe_weights = MoeFfnWeight{ layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; +} + +template +void LlamaDecoderLayerWeight::malloc(cudaStream_t st) +{ + deviceMalloc((T**)&self_attn_norm_weights, hidden_units_, st); + deviceMalloc((T**)&ffn_norm_weights, hidden_units_, st); + + self_attn_weights.malloc(st); - mallocWeights(); + if (inter_size_) { + ffn_weights.malloc(st); + } + + if (!moe_weights.experts.empty()) { + moe_weights.malloc(st); + } } -#endif template size_t LlamaDecoderLayerWeight::workspace_size() const noexcept { @@ -251,52 +168,6 @@ size_t LlamaDecoderLayerWeight::workspace_size() const noexcept return size * sizeof(uint16_t); } -template -void freeWeights(LlamaDenseWeight& weights) -{ - cudaFree(weights.kernel); - cudaFree(weights.bias); - cudaFree(weights.scales); - cudaFree(weights.zeros); - - weights.kernel = nullptr; - weights.bias = nullptr; - weights.scales = nullptr; - weights.zeros = nullptr; - - { - cudaFree(weights.lora.a); - cudaFree(weights.lora.b); - weights.lora.a = nullptr; - weights.lora.b = nullptr; - } -} - -template -void LlamaDecoderLayerWeight::mallocWeights(LlamaDenseWeight& weights, bool bias) -{ - if (bias) { - deviceMalloc((T**)&weights.bias, weights.output_dims); - } - const size_t bit_size = getBitSize(weights.type); - if (bit_size >= 16) { // fp16, fp32 - deviceMalloc((T**)&weights.kernel, weights.input_dims * weights.output_dims); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - FT_CHECK(weights.input_dims % factor == 0); - deviceMalloc((int**)&weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMemSetZero((int*)weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMalloc((T**)&weights.scales, weights.input_dims / weights.group_size * weights.output_dims); - deviceMalloc((T**)&weights.zeros, weights.input_dims / weights.group_size * weights.output_dims); - } - - if (weights.lora.r > 0) { - deviceMalloc((T**)&weights.lora.a, weights.input_dims * weights.lora.r); - deviceMalloc((T**)&weights.lora.b, weights.lora.r * weights.output_dims); - } -} - template std::string concat(FirstArg&& first, Args&&... 
args) { @@ -425,64 +296,24 @@ void loadWeights(LlamaDenseWeight& w, std::string prefix, FtCudaDataType mode } template -void LlamaDecoderLayerWeight::mallocWeights() +void LlamaDecoderLayerWeight::free(cudaStream_t st) { - deviceMalloc((T**)&self_attn_norm_weights, hidden_units_); - deviceMalloc((T**)&ffn_norm_weights, hidden_units_); + deviceFree(self_attn_norm_weights, st); + deviceFree(ffn_norm_weights, st); - mallocWeights(self_attn_weights.qkv, attn_bias_); - mallocWeights(self_attn_weights.output, attn_bias_); + self_attn_weights.free(st); if (inter_size_) { - mallocWeights(ffn_weights.gating, false); - mallocWeights(ffn_weights.intermediate, false); - mallocWeights(ffn_weights.output, false); + ffn_weights.free(st); } if (!moe_weights.experts.empty()) { - mallocWeights(moe_weights.gate, false); - for (auto& e : moe_weights.experts) { - mallocWeights(e.gating, false); - mallocWeights(e.intermediate, false); - mallocWeights(e.output, false); - } - if (moe_weights.shared_gate.output_dims) { - mallocWeights(moe_weights.shared_gate, false); - } + moe_weights.free(st); } } template -LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() -{ - cudaFree((void*)self_attn_norm_weights); - cudaFree((void*)ffn_norm_weights); - self_attn_norm_weights = nullptr; - ffn_norm_weights = nullptr; - - freeWeights(self_attn_weights.qkv); - freeWeights(self_attn_weights.output); - - if (inter_size_) { - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); - } - - if (!moe_weights.experts.empty()) { - freeWeights(moe_weights.gate); - for (auto& e : moe_weights.experts) { - freeWeights(e.fused_gating_intermediate); - freeWeights(e.gating); - freeWeights(e.intermediate); - freeWeights(e.output); - } - if (moe_weights.shared_gate.kernel) { - freeWeights(moe_weights.shared_gate); - } - } -} +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; template void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) @@ -561,7 +392,8 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) } // template -static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void convert_u4( + LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { FT_CHECK(weight.type == WeightType::kINT4); @@ -571,11 +403,11 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* get_weight_and_scales_layout(gemm::DataType::U4, is_fused_moe, getSMVersion(), use_simt); if (order_b == kColMajor) { - transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims); - cudaMemcpy(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault); + transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims, st); + cudaMemcpyAsync(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault, st); } - extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims); + extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims, st); sync_check_cuda_error(); MatrixLayout w_desc{ @@ -590,25 +422,22 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* k_desc.type = gemm::DataType::U4; k_desc.pack = pack_b; - 
cudaMemset(weight.kernel, 0, weight.input_dims * weight.output_dims / 2);
+    cudaMemsetAsync(weight.kernel, 0, weight.input_dims * weight.output_dims / 2, st);
 
-    FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, 0) == 0);
+    FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, st) == 0);
     sync_check_cuda_error();
 
     const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims;
 
     // std::cout << "fuse_scales_and_zeros\n";
-    fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count);
+    fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count, st);
     // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2);
     sync_check_cuda_error();
 
-    cudaDeviceSynchronize();
-
-    cudaFree(weight.scales);
-    cudaFree(weight.zeros);
-    weight.scales = weight.zeros = nullptr;
+    deviceFree(weight.scales, st);
+    deviceFree(weight.zeros, st);
 
-    deviceMalloc((half**)&weight.scales_zeros, scale_count * 2);
+    deviceMalloc((half**)&weight.scales_zeros, scale_count * 2, st);
 
     MatrixLayout s_desc{
         gemm::DataType::U32,
@@ -621,7 +450,7 @@ static void convert_u4(LlamaDenseWeight<T>& weight, bool is_fused_moe, void*
     MatrixLayout q_desc = s_desc;
     q_desc.pack = pack_v;
 
-    FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, 0) == 0);
+    FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, st) == 0);
     sync_check_cuda_error();
 
     weight.k_desc = k_desc;
@@ -631,7 +460,8 @@ static void convert_u4(LlamaDenseWeight<T>& weight, bool is_fused_moe, void*
 }
 
 template<class T>
-static void convert_fp(LlamaDenseWeight<T>& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt)
+static void
+convert_fp(LlamaDenseWeight<T>& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st)
 {
     using namespace gemm;
 
@@ -646,12 +476,13 @@ static void convert_fp(LlamaDenseWeight<T>& weight, bool is_fused_moe, void* wor
     const int output_dim = weight.output_dims;
 
     if (order_b == kColMajor) {
-        invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, nullptr);
+        invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, st);
         sync_check_cuda_error();
         // FT_CHECK(0);
     }
     else {
-        check_cuda_error(cudaMemcpy(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault));
+        check_cuda_error(
+            cudaMemcpyAsync(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st));
     }
 
     MatrixLayout src{
@@ -666,35 +497,42 @@ static void convert_fp(LlamaDenseWeight<T>& weight, bool is_fused_moe, void* wor
     dst.pack = pack_b;
 
     if (pack_b) {
-        FT_CHECK(Convert(workspace, src, weight.kernel, dst, nullptr) == 0);
+        FT_CHECK(Convert(workspace, src, weight.kernel, dst, st) == 0);
         sync_check_cuda_error();
         // FT_CHECK(0);
    }
else { - convert_fp(weight, is_fused_moe, workspace, size, use_simt); + convert_fp(weight, is_fused_moe, workspace, size, use_simt, st); } } template -void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void* workspace, size_t size) +void interleave(LlamaDenseWeight& c, + LlamaDenseWeight& a, + LlamaDenseWeight& b, + void* workspace, + size_t size, + cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -711,18 +549,18 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight const auto sentinel = tmp_c + c.output_dims * c.input_dims; FT_CHECK(sentinel <= (uint8_t*)workspace + size); - extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims); - extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims); + extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims, st); + extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims, st); - interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, 0); + interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, st); - compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims); + compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims, st); - interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, 0); - interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, 0); + interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, st); + interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, st); } else { - interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, 0); + interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, st); } // Check at function level @@ -730,7 +568,7 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight } template -void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t) +void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t, cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -739,9 +577,11 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& FT_CHECK(c.group_size == a.group_size); FT_CHECK(c.group_size == b.group_size); - auto _chunks = [](auto c, auto a, auto b, int height, int width) { - check_cuda_error(cudaMemcpy2D((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault)); - check_cuda_error(cudaMemcpy2D((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault)); + auto _chunks = [&](auto c, auto a, auto b, int height, int width) { + check_cuda_error( + cudaMemcpy2DAsync((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault, st)); + check_cuda_error( + cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); }; if (c.type == WeightType::kINT4) { @@ -758,37 +598,37 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& } template -void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop) +void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st) { const bool is_16xx = is_16xx_series(prop.name); - 
convert(self_attn_weights.qkv, false, workspace, size, is_16xx); - convert(self_attn_weights.output, false, workspace, size, is_16xx); + convert(self_attn_weights.qkv, false, workspace, size, is_16xx, st); + convert(self_attn_weights.output, false, workspace, size, is_16xx, st); auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) { if (fused_up_and_gate_) { auto& fused_up_and_gate = ffn.fused_gating_intermediate; - mallocWeights(fused_up_and_gate, false); + fused_up_and_gate.malloc(st); if (ffn.is_fused_silu) { - interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } else { - chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } - convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx, st); - freeWeights(ffn.gating); - freeWeights(ffn.intermediate); + ffn.gating.free(st); + ffn.intermediate.free(st); } else { - convert(ffn.gating, is_fused_moe, workspace, size, is_16xx); - convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.gating, is_fused_moe, workspace, size, is_16xx, st); + convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx, st); } - convert(ffn.output, is_fused_moe, workspace, size, is_16xx); + convert(ffn.output, is_fused_moe, workspace, size, is_16xx, st); }; if (inter_size_) { @@ -826,12 +666,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud auto& output = moe_weights.block.output; // TODO: free these ptrs - fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, nullptr); - output.kernel = gemm::make_blocked_ptrs(output_ptrs, nullptr); + fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, st); + output.kernel = gemm::make_blocked_ptrs(output_ptrs, st); if (!fused_param_ptrs.empty()) { - fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, nullptr); - output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, nullptr); + fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, st); + output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, st); } fused.k_desc.ld = output.k_desc.ld = 0; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 342775b70..4c871f9f5 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -30,19 +30,6 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = delete; - // LlamaDecoderLayerWeight(int layer_idx, - // size_t head_num, - // size_t kv_head_num, - // size_t size_per_head, - // size_t hidden_units, - // size_t inter_size, - // WeightType weight_type, - // int group_size, - // LoraParam lora_param, - // bool attn_bias, - // MoeParam moe_param, - // size_t tensor_para_size, - // size_t tensor_para_rank); LlamaDecoderLayerWeight(int layer_id, const ModelParam& model, @@ -59,11 +46,13 @@ struct LlamaDecoderLayerWeight { TensorMap getParams(std::string prefix); - void prepare(void* workspace, size_t size, const cudaDeviceProp& prop); + void prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st); size_t workspace_size() const noexcept; - void mallocWeights(LlamaDenseWeight& weights, bool bias); + void malloc(cudaStream_t st); + 
+ void free(cudaStream_t st); T* self_attn_norm_weights{}; T* ffn_norm_weights{}; @@ -84,8 +73,6 @@ struct LlamaDecoderLayerWeight { size_t tensor_para_rank_; bool is_maintain_buffer_ = false; bool fused_up_and_gate_; - - void mallocWeights(); }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index eed1a4119..766f8066c 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -23,6 +23,7 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/weight_type.h" #include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { @@ -78,12 +79,157 @@ struct LlamaDenseWeight { { return {sizeof(T) * input_dims * lora.r, sizeof(T) * lora.r * output_dims}; } + + void malloc(cudaStream_t st, bool with_bias = false) + { + if (with_bias) { + deviceMalloc((T**)&bias, output_dims, st); + } + const size_t bit_size = getBitSize(type); + if (bit_size >= 16) { // fp16, fp32 + deviceMalloc((T**)&kernel, input_dims * output_dims, st); + } + else { // int8, int4 + const int factor = sizeof(float) * 8 / bit_size; + FT_CHECK(input_dims % factor == 0); + deviceMalloc((int**)&kernel, input_dims * output_dims / factor, st); + deviceMalloc((T**)&scales, input_dims / group_size * output_dims, st); + deviceMalloc((T**)&zeros, input_dims / group_size * output_dims, st); + } + + if (lora.r > 0) { + deviceMalloc((T**)&lora.a, input_dims * lora.r, st); + deviceMalloc((T**)&lora.b, lora.r * output_dims, st); + } + } + + void free(cudaStream_t st) + { + deviceFree(kernel, st); + deviceFree(bias, st); + deviceFree(scales, st); + deviceFree(zeros, st); + deviceFree(lora.a, st); + deviceFree(lora.b, st); + } +}; + +template +struct LatentAttentionWeight { + + LatentAttentionWeight() = default; + + LatentAttentionWeight(size_t hidden_dim, + size_t q_lora_rank, + size_t kv_lora_rank, + int head_dim, + int head_num, + WeightType weight_type, + int group_size): + LatentAttentionWeight{} + { + if (q_lora_rank) { + q_a_proj.input_dims = hidden_dim; + q_a_proj.output_dims = q_lora_rank; + q_b_proj.input_dims = q_lora_rank; + q_b_proj.output_dims = head_num * head_dim; + q_b_proj.type = q_a_proj.type = weight_type; + q_b_proj.group_size = q_a_proj.group_size = group_size; + } + else { + q_proj.input_dims = hidden_dim; + q_proj.output_dims = head_num * head_dim; + q_proj.type = weight_type; + q_proj.group_size = group_size; + } + + kv_a_proj.input_dims = hidden_dim; + kv_a_proj.output_dims = kv_lora_rank; + kv_b_proj.input_dims = kv_lora_rank; + kv_b_proj.output_dims = head_num * head_dim; + kv_b_proj.type = kv_a_proj.type = weight_type; + kv_b_proj.group_size = kv_a_proj.group_size = group_size; + } + + void malloc(cudaStream_t st) + { + if (q_proj.output_dims) { + q_proj.malloc(st); + } + else { + q_a_proj.malloc(st); + q_b_proj.malloc(st); + deviceMalloc((T**)q_a_layernorm, q_a_proj.output_dims, st); + } + kv_a_proj.malloc(st); + kv_b_proj.malloc(st); + deviceMalloc((T**)kv_a_layernorm, kv_a_proj.output_dims, st); + } + + void free(cudaStream_t st) + { + q_proj.free(st); + q_a_proj.free(st); + q_b_proj.free(st); + kv_a_proj.free(st); + kv_b_proj.free(st); + deviceFree(q_a_layernorm, st); + deviceFree(kv_a_layernorm, st); + } + + LlamaDenseWeight q_proj; + + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + T* q_a_layernorm; + + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + T* 
kv_a_layernorm; }; template struct LlamaAttentionWeight { + + LlamaAttentionWeight() = default; + + LlamaAttentionWeight(size_t hidden_dim, + size_t head_dim, + size_t head_num, + size_t kv_head_num, + bool bias, + size_t tp, + WeightType weight_type, + int group_size) + { + qkv.input_dims = hidden_dim; + qkv.output_dims = (head_num + 2 * kv_head_num) * head_dim / tp; + qkv.type = weight_type; + qkv.group_size = group_size; + + output.input_dims = (head_num * head_dim) / tp; + output.output_dims = hidden_dim; + output.type = weight_type; + output.group_size = group_size; + + this->bias = bias; + } + + void malloc(cudaStream_t st) + { + qkv.malloc(st, bias); + output.malloc(st, bias); + } + + void free(cudaStream_t st) + { + qkv.free(st); + output.free(st); + } + LlamaDenseWeight qkv; LlamaDenseWeight output; + bool bias{}; }; template @@ -121,6 +267,21 @@ struct LlamaFfnWeight { output.group_size = group_size; } + void malloc(cudaStream_t st) + { + gating.malloc(st); + intermediate.malloc(st); + output.malloc(st); + } + + void free(cudaStream_t st) + { + gating.free(st); + intermediate.free(st); + output.free(st); + fused_gating_intermediate.free(st); + } + LlamaDenseWeight gating; LlamaDenseWeight intermediate; LlamaDenseWeight output; @@ -144,7 +305,7 @@ struct MoeFfnWeight { bool fuse_silu_act) { - if (param.expert_num.size() < layer_id) { + if (param.expert_num.size() <= layer_id) { return; } @@ -182,11 +343,33 @@ struct MoeFfnWeight { } } + void malloc(cudaStream_t st) + { + gate.malloc(st); + if (shared_gate.output_dims) { + shared_gate.malloc(st); + } + for (auto& e : experts) { + e.malloc(st); + } + } + + void free(cudaStream_t st) + { + gate.free(st); + shared_gate.free(st); + for (auto& e : experts) { + e.free(st); + } + block.free(st); + } + LlamaDenseWeight gate; std::vector> experts; LlamaDenseWeight shared_gate; + // reference into `experts` LlamaFfnWeight block; MoeParam::Method method{}; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 325789f29..2db8fc9c7 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -20,6 +20,7 @@ #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/memory_utils.h" #include @@ -96,13 +97,22 @@ LlamaWeight::LlamaWeight( FT_CHECK(hidden_units_ % tensor_para_size_ == 0); + check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + decoder_layer_weights.reserve(num_layer_); for (unsigned l = 0; l < num_layer_; ++l) { - decoder_layer_weights.push_back( + decoder_layer_weights.emplace_back( new LlamaDecoderLayerWeight(l, model, lora_param, moe_param, tp_size, tp_rank)); + decoder_layer_weights.back()->malloc(stream_); } - mallocWeights(); + FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); + deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_padded_ * hidden_units_ / tensor_para_size_, stream_); + deviceMalloc((T**)&output_norm_weight, hidden_units_, stream_); + deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_, stream_); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } #endif @@ -110,26 +120,21 @@ LlamaWeight::LlamaWeight( template LlamaWeight::~LlamaWeight() { - cudaFree((void*)pre_decoder_embedding_table); - cudaFree((void*)output_norm_weight); - cudaFree((void*)post_decoder_embedding_kernel); - - 
pre_decoder_embedding_table = nullptr; - output_norm_weight = nullptr; - post_decoder_embedding_kernel = nullptr; + deviceFree(pre_decoder_embedding_table, stream_); + deviceFree(output_norm_weight, stream_); + deviceFree(post_decoder_embedding_kernel, stream_); for (auto& p : decoder_layer_weights) { + p->free(stream_); delete p; } -} -template -void LlamaWeight::mallocWeights() -{ - FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_padded_ * hidden_units_ / tensor_para_size_); - deviceMalloc((T**)&output_norm_weight, hidden_units_); - deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_); + decoder_layer_weights.clear(); + + // Wait for deallocations + check_cuda_error(cudaStreamSynchronize(stream_)); + check_cuda_error(cudaStreamDestroy(stream_)); + stream_ = {}; } template @@ -205,13 +210,19 @@ void LlamaWeight::prepare(const cudaDeviceProp& prop) TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d\n", workspace_size); + // Wait for the weights to be filled externally + check_cuda_error(cudaDeviceSynchronize()); + if (workspace_size) { - deviceMalloc((char**)&workspace, workspace_size); + deviceMalloc((char**)&workspace, workspace_size, stream_); } for (auto& layer : decoder_layer_weights) { - layer->prepare(workspace, workspace_size, prop); + layer->prepare(workspace, workspace_size, prop, stream_); } - deviceFree(workspace); + + deviceFree(workspace, stream_); + + check_cuda_error(cudaStreamSynchronize(stream_)); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index f4ca7e455..51d67069d 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -22,27 +22,12 @@ #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/memory_utils.h" namespace turbomind { template struct LlamaWeight { LlamaWeight() = default; - // LlamaWeight(size_t head_num, - // size_t kv_head_num, - // size_t size_per_head, - // size_t hidden_units, - // size_t inter_size, - // size_t vocab_size, - // size_t num_layer, - // bool attn_bias, - // WeightType weight_type, - // int group_size, - // LoraParam lora_param, - // MoeParam moe_param, - // size_t tensor_para_size, - // size_t tensor_para_rank); LlamaWeight(const ModelParam& model_param, const LoraParam& lora_param, @@ -62,13 +47,12 @@ struct LlamaWeight { void prepare(const cudaDeviceProp& prop); std::vector*> decoder_layer_weights; - const T* pre_decoder_embedding_table{}; - const T* output_norm_weight{}; - const T* post_decoder_embedding_kernel{}; -private: - void mallocWeights(); + T* pre_decoder_embedding_table{}; + T* output_norm_weight{}; + T* post_decoder_embedding_kernel{}; +private: size_t hidden_units_; size_t vocab_size_; size_t vocab_size_padded_; @@ -78,6 +62,8 @@ struct LlamaWeight { size_t tensor_para_rank_; std::vector inter_size_; + + cudaStream_t stream_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_gemm.cc b/src/turbomind/models/llama/llama_gemm.cc index 62952cd71..f9a0191e4 100644 --- a/src/turbomind/models/llama/llama_gemm.cc +++ b/src/turbomind/models/llama/llama_gemm.cc @@ -84,7 +84,7 @@ int main(int argc, char* argv[]) return -1; } else { - ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, false); + ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, 
nullptr, false); } if (0) {} diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h index 3b01dee60..aaade1a51 100644 --- a/src/turbomind/models/llama/llama_kernels.h +++ b/src/turbomind/models/llama/llama_kernels.h @@ -154,7 +154,7 @@ template struct TempBuffer { TempBuffer(size_t size) { - deviceMalloc(&data, size, false); + cudaMalloc(&data, size); } T* data; }; diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index f8bfb8efe..e9a79ea5a 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -26,77 +26,71 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize) +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize) { - FT_CHECK_WITH_INFO(size >= ((size_t)0), "Ask deviceMalloc size " + std::to_string(size) + "< 0 is invalid."); - check_cuda_error(cudaMalloc((void**)(ptr), sizeof(T) * size)); + check_cuda_error(cudaMallocAsync((void**)(ptr), sizeof(T) * size, st)); if (is_random_initialize) { - cudaRandomUniform(*ptr, size); + cudaRandomUniform(*ptr, size, st); } } -template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(float** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(half** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_BF16 -template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_bfloat16** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif -template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(uint16_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(bool** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(char** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int8_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_FP8 -template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif template -void deviceMemSetZero(T* ptr, size_t size) -{ - check_cuda_error(cudaMemset(static_cast(ptr), 0, sizeof(T) * size)); -} - -template void deviceMemSetZero(float* ptr, size_t size); -template void deviceMemSetZero(half* ptr, size_t size); -template void deviceMemSetZero(int* ptr, size_t size); -template void deviceMemSetZero(uint32_t* ptr, size_t size); -template void deviceMemSetZero(bool* ptr, size_t size); -#ifdef ENABLE_FP8 -template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size); -#endif -#ifdef ENABLE_BF16 -template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size); -#endif - -template -void deviceFree(T*& ptr) +void deviceFree(T*& ptr, cudaStream_t st) { if 
(ptr != NULL) {
-        check_cuda_error(cudaFree(ptr));
+        check_cuda_error(cudaFreeAsync(ptr, st));
         ptr = NULL;
     }
 }
 
-template void deviceFree(float*& ptr);
-template void deviceFree(half*& ptr);
+template void deviceFree(float*& ptr, cudaStream_t);
+template void deviceFree(half*& ptr, cudaStream_t);
 #ifdef ENABLE_BF16
-template void deviceFree(__nv_bfloat16*& ptr);
+template void deviceFree(__nv_bfloat16*& ptr, cudaStream_t);
 #endif
-template void deviceFree(unsigned short*& ptr);
-template void deviceFree(int*& ptr);
-template void deviceFree(bool*& ptr);
-template void deviceFree(char*& ptr);
-template void deviceFree(int8_t*& ptr);
+template void deviceFree(unsigned short*& ptr, cudaStream_t);
+template void deviceFree(int*& ptr, cudaStream_t);
+template void deviceFree(bool*& ptr, cudaStream_t);
+template void deviceFree(char*& ptr, cudaStream_t);
+template void deviceFree(int8_t*& ptr, cudaStream_t);
+template void deviceFree(void*& ptr, cudaStream_t);
 #ifdef ENABLE_FP8
-template void deviceFree(__nv_fp8_e4m3*& ptr);
+template void deviceFree(__nv_fp8_e4m3*& ptr, cudaStream_t);
 #endif
 
+namespace {
+
+template<typename T>
+__global__ void fill_kernel(T* devptr, size_t size, T value)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i = idx; i < size; i += blockDim.x * gridDim.x) {
+        devptr[i] = value;
+    }
+}
+
+}  // namespace
+
 template<typename T>
 void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream)
 {
-    T* arr = new T[size];
-    std::fill(arr, arr + size, value);
-    check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream));
-    delete[] arr;
+    constexpr int threads = 512;
+    const int blocks = (size + threads - 1) / threads;
+    fill_kernel<<<blocks, threads, 0, stream>>>(devptr, size, value);
 }
 
 template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream);
@@ -280,23 +274,23 @@ __global__ void cuda_random_uniform_kernel(char* buffer, const size_t size
 }
 
 template<typename T>
-void cudaRandomUniform(T* buffer, const size_t size)
+void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t st)
 {
     static int seq_offset = 0;
-    cuda_random_uniform_kernel<<<256, 256>>>(buffer, size, seq_offset);
+    cuda_random_uniform_kernel<<<256, 256, 0, st>>>(buffer, size, seq_offset);
     seq_offset += 256 * 256;
 }
 
-template void cudaRandomUniform(float* buffer, const size_t size);
-template void cudaRandomUniform(half* buffer, const size_t size);
+template void cudaRandomUniform(float* buffer, const size_t size, cudaStream_t);
+template void cudaRandomUniform(half* buffer, const size_t size, cudaStream_t);
 #ifdef ENABLE_BF16
-template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size);
+template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size, cudaStream_t);
 #endif
-template void cudaRandomUniform(int* buffer, const size_t size);
-template void cudaRandomUniform(bool* buffer, const size_t size);
-template void cudaRandomUniform(char* buffer, const size_t size);
+template void cudaRandomUniform(int* buffer, const size_t size, cudaStream_t);
+template void cudaRandomUniform(bool* buffer, const size_t size, cudaStream_t);
+template void cudaRandomUniform(char* buffer, const size_t size, cudaStream_t);
 #ifdef ENABLE_FP8
-template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size);
+template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size, cudaStream_t);
 #endif
 
 // loads data from binary file. If it succeeds, returns a non-empty vector. 
If loading fails or @@ -366,10 +360,10 @@ int loadWeightFromBinFunc(T* ptr, std::vector shape, std::string filenam } else { T_IN* ptr_2 = nullptr; - deviceMalloc(&ptr_2, host_array.size(), false); + deviceMalloc(&ptr_2, host_array.size(), nullptr, false); cudaH2Dcpy(ptr_2, host_array.data(), host_array.size()); invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size()); - deviceFree(ptr_2); + deviceFree(ptr_2, nullptr); } return 0; } diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h index bb7a4f9c0..03a0ef7b3 100644 --- a/src/turbomind/utils/memory_utils.h +++ b/src/turbomind/utils/memory_utils.h @@ -23,16 +23,13 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true); +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize = false); template -void deviceMemSetZero(T* ptr, size_t size); +void deviceFree(T*& ptr, cudaStream_t st); template -void deviceFree(T*& ptr); - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0); +void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = {}); template void cudaD2Hcpy(T* tgt, const T* src, const size_t size); @@ -44,10 +41,10 @@ template void cudaD2Dcpy(T* tgt, const T* src, const size_t size); template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = NULL); +void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = {}); template -void cudaRandomUniform(T* buffer, const size_t size); +void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t stream = {}); template int loadWeightFromBin(T* ptr, From 260c9f04e83d9ecb849fc64c7392a42a44dd3f1c Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Sun, 17 Nov 2024 23:02:32 +0800 Subject: [PATCH 09/21] deepseek-v2-lite --- lmdeploy/turbomind/deploy/config.py | 11 +- lmdeploy/turbomind/deploy/module.py | 79 ++++++++- .../turbomind/deploy/source_model/__init__.py | 1 + .../deploy/source_model/deepseek2.py | 96 ++++++++++ .../turbomind/deploy/source_model/mixtral.py | 2 +- .../turbomind/deploy/target_model/base.py | 3 +- lmdeploy/turbomind/supported_models.py | 1 + src/turbomind/kernels/CMakeLists.txt | 1 + .../kernels/attention/reduce_kernel.h | 2 +- src/turbomind/kernels/gemm/moe_utils_v2.cu | 17 +- src/turbomind/kernels/gemm/moe_utils_v2.h | 1 + .../kernels/gemm/test/test_moe_utils.cu | 81 ++------- src/turbomind/kernels/gemm/test/testbed.h | 2 + src/turbomind/kernels/norm/CMakeLists.txt | 5 + src/turbomind/kernels/norm/rms_norm.cu | 108 ++++++++++++ src/turbomind/kernels/norm/rms_norm.h | 11 ++ src/turbomind/models/llama/CMakeLists.txt | 4 +- .../models/llama/LlamaDecoderLayerWeight.cc | 27 ++- .../models/llama/LlamaDecoderLayerWeight.h | 10 +- src/turbomind/models/llama/LlamaDenseWeight.h | 165 ++++++++---------- src/turbomind/models/llama/llama_params.h | 11 +- src/turbomind/models/llama/mla_utils.cu | 87 +++++++++ src/turbomind/models/llama/mla_utils.h | 57 ++++++ src/turbomind/models/llama/moe_ffn_layer.cc | 5 +- src/turbomind/models/llama/moe_ffn_layer.h | 2 +- .../models/llama/unified_attention_layer.cc | 116 ++++++++++-- .../models/llama/unified_attention_layer.h | 7 +- src/turbomind/models/llama/unified_decoder.cc | 25 +-- src/turbomind/models/llama/unified_decoder.h | 15 +- src/turbomind/models/llama/weight_type.h | 2 +- src/turbomind/python/bind.cpp | 13 +- .../triton_backend/llama/LlamaTritonModel.cc | 7 +- src/turbomind/utils/cuda_utils.h | 19 ++ 33 files changed, 760 insertions(+), 233 
deletions(-) create mode 100644 lmdeploy/turbomind/deploy/source_model/deepseek2.py create mode 100644 src/turbomind/kernels/norm/CMakeLists.txt create mode 100644 src/turbomind/kernels/norm/rms_norm.cu create mode 100644 src/turbomind/kernels/norm/rms_norm.h create mode 100644 src/turbomind/models/llama/mla_utils.cu create mode 100644 src/turbomind/models/llama/mla_utils.h diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 8bd7e6c51..b6f2ee765 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -37,7 +37,6 @@ class ModelConfig: hidden_units: int = None vocab_size: int = None num_layer: int = None - # inter_size: int = None inter_size: List[int] = None norm_eps: float = None attn_bias: int = 0 @@ -49,12 +48,18 @@ class ModelConfig: session_len: int = None tp: int = 1 model_format: str = 'hf' - # expert_num: int = 0 expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 moe_shared_gate: int = False - moe_norm_topk: int = False + norm_topk_prob: int = False + # MLA + q_lora_rank: int = 0 + kv_lora_rank: int = 0 + qk_rope_dim: int = 0 + v_head_dim: int = 0 + # tuning + tune_layer_num: int = 1 def verify(self): invalid = {} diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 51e842aee..6fa6fcb42 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -96,7 +96,8 @@ class Ffn(Module): def __init__(self, model: BaseOutputModel): self.model = model self.tp = model.tensor_para_size - # inter_sizes in config are padded and my differ from what's in the weights + # inter_sizes in config are padded and may be different from what's + # in the weights self.inter_size = model.model_config.inter_size self.group_size = max(1, model.model_config.group_size) @@ -134,7 +135,8 @@ def _export(self, def apply(self, i: int, r: BaseReader): for e in get_params(r.ffn(i, None)): - e(partial(self._export, self.inter_size[i], self._ffn), partial(r.ffn, i), i) + e(partial(self._export, self.inter_size[i], self._ffn), + partial(r.ffn, i), i) class MoeFfn(Ffn): @@ -156,11 +158,13 @@ def __init__(self, model: BaseOutputModel): self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): + if self.expert_num[i] == 0: + return for p in get_params(r.moe_ffn_expert()): for e in range(self.expert_num[i]): fmt = self._moe_ffn_expert.replace('E', str(e)) - p(partial(self._export, self.inter_size, fmt), partial(r.moe_ffn_expert, e, i), - i) + p(partial(self._export, self.inter_size, fmt), + partial(r.moe_ffn_expert, e, i), i) gate = transpose(r.moe_ffn_gate(i)) self.model.save_split(gate, self._moe_ffn_gate.format(i)) @@ -220,6 +224,67 @@ def apply(self, i: int, r: BaseReader): e(self._export, partial(r.attn, i), i) +class MLA(Module): + """ + requires: + r.mla(i, kind) + r.mla_norm(i) + """ + + _mla = 'layers.{0}.attention.{1}.{2}' + + def __init__(self, model: BaseOutputModel): + self.model = model + + def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): + if all(x is None for x in xs): + return + q_a, q_b, kv_a, kv_b, o = map(transpose, xs) + + cfg = self.model.model_config + qk_nope_dim = cfg.size_per_head - cfg.qk_rope_dim + + q_b = q_b.reshape(-1, cfg.size_per_head) + + # [nope_dim | rope_dim] -> [rope_dim | nope_dim] + q_nope, q_pe = torch.split(q_b, (qk_nope_dim, cfg.qk_rope_dim), dim=-1) + q_b = torch.cat((q_pe, q_nope), + dim=-1).view(-1, cfg.head_num * cfg.size_per_head) + + o = 
o.reshape(cfg.head_num, cfg.v_head_dim, -1) + o = torch.nn.functional.pad( + o, (0, 0, 0, cfg.size_per_head - cfg.v_head_dim, 0, 0)) + o = o.view(cfg.head_num * cfg.size_per_head, cfg.hidden_units) + + if q_a is not None: + self.model.save_split(pack_fn(q_a), + self._mla.format(idx, 'q_a_proj', kind)) + q_b_name = 'q_proj' if q_a is None else 'q_b_proj' + self.model.save_split(pack_fn(q_b), + self._mla.format(idx, q_b_name, kind), + split_dim=-1) + self.model.save_split(pack_fn(kv_a), + self._mla.format(idx, 'kv_a_proj', kind)) + self.model.save_split(pack_fn(kv_b), + self._mla.format(idx, 'kv_b_proj', kind), + split_dim=-1) + self.model.save_split(pack_fn(o), + self._mla.format(idx, 'wo', kind), + split_dim=0) + + _layernorm = 'layers.{0}.attention.{1}_a_layernorm' + + def apply(self, i: int, r: BaseReader): + + for f in get_params(r.attn(i, None), bias=False): + f(self._export, partial(r.mla, i), i) + + q, k = r.mla_norm(i) + if q is not None: + self.model.save_split(q, self._layernorm.format(i, 'q')) + self.model.save_split(k, self._layernorm.format(i, 'kv')) + + class Misc(Module): """ requires: @@ -260,7 +325,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - modules = [Attn, LayerNorm] + modules = [LayerNorm] + if model.model_config.kv_lora_rank: + modules.append(MLA) + else: + modules.append(Attn) if model.model_config.inter_size: modules.append(Ffn) if model.model_config.expert_num: diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index a36102e1c..011d7b555 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 +from .deepseek2 import DeepSeek2Model # noqa: F401 from .deepseek_vl import DeepSeekVLModel # noqa: F401 from .glm4 import Glm4Model # noqa: F401 from .internlm2 import InternLM2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py new file mode 100644 index 000000000..ccdfafb18 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class DeepSeek2Reader(LlamaReader): + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(r'mlp' if i == 0 else r'shared_expert\.') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.shared_experts.{key}_proj.{kind}' + if i == 0: + name = name.replace('shared_experts.', '') + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla(self, i: int, kind: str): + if not kind: + return self.filter(r'self_attn.*proj') + result = [] + for key in ['q_a_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']: + tensor = self.params.get( + f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}' + ) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla_norm(self, i: int): + result = [] + for k in ['q', 'kv']: + result.append(self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight')) + return (*result, ) + + +@INPUT_MODELS.register_module(name='deepseek2') +class DeepSeek2Model(LlamaModel): + + Reader = DeepSeek2Reader + + def tokenizer_info(self): + n_words = self.model_config['vocab_size'] + bos_id = self.model_config['bos_token_id'] + eos_id = self.model_config['eos_token_id'] + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + qk_nope_dim = cfg['qk_nope_head_dim'] + qk_rope_dim = cfg['qk_rope_head_dim'] + num_layer = cfg['num_hidden_layers'] + expert_num = cfg['n_routed_experts'] + expert_num = [expert_num] * num_layer + expert_num[0] = 0 + n_shared_experts = cfg['n_shared_experts'] + expert_inter_size = cfg['moe_intermediate_size'] + experts_per_token = cfg['num_experts_per_tok'] + inter_size = [n_shared_experts * expert_inter_size] * num_layer + inter_size[0] = cfg['intermediate_size'] + norm_topk_prob = cfg['norm_topk_prob'] + info.update( + kv_lora_rank=cfg['kv_lora_rank'], + q_lora_rank=cfg['q_lora_rank'] or 0, + qk_rope_dim=qk_rope_dim, + v_head_dim=cfg['v_head_dim'], + size_per_head=qk_rope_dim + qk_nope_dim, + rotary_embedding=qk_rope_dim, + expert_num=expert_num, + expert_inter_size=expert_inter_size, + experts_per_token=experts_per_token, + inter_size=inter_size, + norm_topk_prob=norm_topk_prob, + tune_layer_num=2 + ) + return info + diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index ff9df2d40..6ac22a658 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,6 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] - info['moe_norm_topk'] = True + info['norm_topk_prob'] = True info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 
c989d630d..2f33d030e 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -39,7 +39,8 @@ def _weight_dtype_map(weight_type: str, default=None): def _pad_inter_size(inter_size: int, group_size: int, tp: int): group_size = max(1, group_size) - groups_per_rank = (inter_size // group_size + tp - 1) // tp + group_num = (inter_size + group_size - 1) // group_size + groups_per_rank = (group_num + tp - 1) // tp inter_size_padded = groups_per_rank * group_size * tp return inter_size_padded diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 26be3a5cd..5411f72f0 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -32,6 +32,7 @@ InternVLChatModel='internvl', # deepseek-vl MultiModalityCausalLM='deepseekvl', + DeepseekV2ForCausalLM='deepseek2', # MiniCPMV MiniCPMV='minicpmv', # mini gemini diff --git a/src/turbomind/kernels/CMakeLists.txt b/src/turbomind/kernels/CMakeLists.txt index febb8692d..40a48402a 100644 --- a/src/turbomind/kernels/CMakeLists.txt +++ b/src/turbomind/kernels/CMakeLists.txt @@ -68,3 +68,4 @@ endif () add_subdirectory(attention) add_subdirectory(gemm) +add_subdirectory(norm) diff --git a/src/turbomind/kernels/attention/reduce_kernel.h b/src/turbomind/kernels/attention/reduce_kernel.h index 14c5005cd..b4c9064cf 100644 --- a/src/turbomind/kernels/attention/reduce_kernel.h +++ b/src/turbomind/kernels/attention/reduce_kernel.h @@ -127,7 +127,7 @@ struct Reduce { } __syncthreads(); - + // HeadDim / WARP_SIZE // 128 -> 4 // 64, 192 -> 2 diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 5912c60a8..c8c8db197 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -687,7 +687,8 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] const int* en2f, // [ e, n] :: (e,n) -> e*n const float* dst_scales, // [n] int dims, - int tokens) + int tokens, + float dst_scale) { using Vec = Array; @@ -695,7 +696,6 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; - float dst_scale = 0; if (dst_scales) { dst_scale = dst_scales[ti]; dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); @@ -712,7 +712,7 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] for (int i = threadIdx.x; i < dims; i += block_dim) { Array accum{}; - if (dst_scales) { + if (dst_scale) { Vec v; Ldg(v, dst_ptr[i].data()); using namespace ops; @@ -739,6 +739,7 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st) { // std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -754,7 +755,8 @@ void invokeMoeReduce(T* dst, en2f, dst_scales, dims / vec_size, - tokens); + tokens, + dst_scale); }; switch (experts_per_token) { @@ -774,10 +776,11 @@ void invokeMoeReduce(T* dst, } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); -#ifdef ENABLE_BF16 template void -invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); +invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); +#ifdef ENABLE_BF16 +template void invokeMoeReduce( + nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int 
exp_per_tok, std::mt19937& g) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 0e4c36af0..f2aa9870f 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -54,6 +54,7 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st); // Sample `e` from `E` experts uniformly for every token diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index 47e3bfdb1..58ad9d26d 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -45,72 +45,6 @@ void diff_vecs(const T* data, const T* refs, int m, int k, std::string msg) } } -#if 0 -void func() -{ - using thrust::universal_vector; - - // clang-format off - std::vector h_logits{ - 8, 5, 1, 4, 3, 6, 2, 7, - 50, 60, 90, 20, 70, 71, 72, 73, - 0, 1, 0, 0, 0, 1, 0, 1, - 0, 0, 0, 1, 0, 0, 0, 2}; - // clang-format on - - h_logits.resize(8); - - // auto tmp = h_logits; - // for (int i = 0; i < 127; ++i) { - // h_logits.insert(h_logits.end(), tmp.begin(), tmp.end()); - // } - - universal_vector logits(h_logits.begin(), h_logits.end()); - - const int E = 8; - const int n = h_logits.size() / E; - const int e = 2; - - const int n_padded = (n + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; - - universal_vector f2n(e * n); - universal_vector en2f(e * n); - universal_vector offsets(E + 1); - universal_vector accum(E * kMoeGateMaxTiles); - universal_vector scales(n * e); - universal_vector masks(E * n_padded); - - for (int i = 0; i < 10; ++i) { - gemm::CacheFlushing::flush(0); - cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); - invokeMoeGate_V2(f2n.data().get(), - en2f.data().get(), - offsets.data().get(), - scales.data().get(), - masks.data().get(), - accum.data().get(), - logits.data().get(), - n, - n_padded, - E, - e, - 0); - } - - auto err = cudaDeviceSynchronize(); - if (err) { - std::cerr << cudaGetErrorString(err) << "\n"; - } - - print_vecs(scales.data().get(), e, n, "scales", 12); - print_vecs(masks.data().get(), E, n_padded, "tmp"); - print_vecs(accum.data().get(), E, 1, "accum"); - print_vecs(offsets.data().get(), 1, E + 1, "offsets"); - print_vecs(f2n.data().get(), n * e, 1, "f2n"); - print_vecs(en2f.data().get(), e, n, "en2f"); -} -#endif - RNG& gRNG() { static RNG inst{}; @@ -286,7 +220,7 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, - true, + false, 0); } @@ -334,7 +268,7 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success && 1) { + if (!success || 1) { diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); @@ -353,6 +287,15 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); + for (int i = 0; i < tokens; ++i) { + float sum = 0; + for (int j = 0; j < experts_per_token; ++j) { + sum += scales[j * tokens + i]; + } + std::cout << sum << " "; + } + std::cout << "\n"; + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); @@ -393,7 +336,7 @@ int main() // test_moe_gate(32768, 64, 8, tape, tiling); // test_moe_gate(8, 60, 4, tape, tiling); - test_moe_gate(65536, 8, 2, tape, tiling); + test_moe_gate(16, 64, 6, tape, tiling); return 0; for (int i = 1; i < 16384; 
++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 2678470bb..4747644f9 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -518,6 +518,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); invokeMoeReduce(c_ref_.data().get(), @@ -528,6 +529,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); cudaDeviceSynchronize(); diff --git a/src/turbomind/kernels/norm/CMakeLists.txt b/src/turbomind/kernels/norm/CMakeLists.txt new file mode 100644 index 000000000..bc1569c40 --- /dev/null +++ b/src/turbomind/kernels/norm/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +add_library(rms_norm rms_norm.cu) +set_property(TARGET rms_norm PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET rms_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu new file mode 100644 index 000000000..ea5026a09 --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -0,0 +1,108 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "cub/block/block_reduce.cuh" + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" + +namespace turbomind { + +template +__global__ void RMSNormKernel( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + src += src_ld * ti; + + Array accum{}; + Array vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Array tmp = cast(vec); + using namespace ops; + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + dst += dst_ld * ti; + + Array sv; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Array tmp = cast(vec); + Load(sv, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + tmp[c] *= (float)sv[c] * sum; + } + Store(&dst[i], cast(tmp)); + } +} + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int threads = 256; + const int blocks = num; + + RMSNormKernel<<>>(dst, // + dst_ld, + src, + src_ld, + weights, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeRMSNorm(half* dst, + int dst_ld, + const half* src, + int src_ld, + const half* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#if ENABLE_BF16 +template void invokeRMSNorm(nv_bfloat16* dst, + int dst_ld, + const nv_bfloat16* src, + int src_ld, + const nv_bfloat16* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +} // namespace turbomind \ No newline at end of file diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h new file mode 100644 index 000000000..ebf49a53e --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -0,0 +1,11 @@ +// 
Copyright (c) OpenMMLab. All rights reserved. + +#include + +namespace turbomind { + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); + +} \ No newline at end of file diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 285fcea31..3c714bd23 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -20,11 +20,13 @@ add_library(Llama STATIC unified_attention_layer.cc llama_kernels.cu llama_decoder_kernels.cu - llama_utils.cu) + llama_utils.cu + mla_utils.cu) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart gemm2 + rms_norm cublasMMWrapper DynamicDecodeLayer activation_kernels diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index bb0794988..c05bf0219 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -111,6 +111,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, size_per_head_, head_num_, kv_head_num_, + model.mla, attn_bias_, tensor_para_size_, weight_type_, @@ -346,6 +347,25 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType } } +template +void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m, int tp_rank) +{ + if (w.q_proj.output_dims) { + getWeightTensor(w.q_proj, false, concat(p, "attention.q_proj", tp_rank), m); + } + else { + getWeightTensor(w.q_a_proj, false, concat(p, "attention.q_a_proj"), m); + getWeightTensor(w.q_b_proj, false, concat(p, "attention.q_b_proj", tp_rank), m); + m.insert(concat(p, "attention.q_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.q_b_proj.input_dims}, w.q_a_layernorm}); + } + getWeightTensor(w.kv_a_proj, false, concat(p, "attention.kv_a_proj"), m); + getWeightTensor(w.kv_b_proj, false, concat(p, "attention.kv_b_proj", tp_rank), m); + m.insert(concat(p, "attention.kv_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); +} + + template TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) { @@ -359,7 +379,12 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) auto get_prefix = [=](std::string_view name) { return concat(prefix, name, tensor_para_rank_); }; - getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + if (self_attn_weights.qkv.output_dims) { + getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + } + else { + getMLATensor(self_attn_weights, prefix, output, tensor_para_rank_); + } getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); if (inter_size_) { diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 4c871f9f5..59ee0ea5f 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -54,11 +54,13 @@ struct LlamaDecoderLayerWeight { void free(cudaStream_t st); - T* self_attn_norm_weights{}; - T* ffn_norm_weights{}; + T* self_attn_norm_weights{}; + T* ffn_norm_weights{}; + LlamaAttentionWeight self_attn_weights{}; - LlamaFfnWeight ffn_weights{}; - 
MoeFfnWeight moe_weights{}; + + LlamaFfnWeight ffn_weights{}; + MoeFfnWeight moe_weights{}; private: size_t head_num_; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 766f8066c..944781bf5 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -46,20 +46,31 @@ struct LoraWeight { template struct LlamaDenseWeight { - size_t input_dims; - size_t output_dims; - void* kernel; + size_t input_dims = 0; + size_t output_dims = 0; + WeightType type; // uninitialized + void* kernel = nullptr; + T* bias = nullptr; + T* scales = nullptr; + T* zeros = nullptr; + T* scales_zeros = nullptr; + int group_size = 1; + LoraWeight lora; - WeightType type; - T* bias; - T* scales; - T* zeros; - T* scales_zeros; - int group_size; gemm::MatrixLayout k_desc; gemm::MatrixLayout q_desc; + LlamaDenseWeight(): type{}, lora{}, k_desc{}, q_desc{} {} + + LlamaDenseWeight(size_t input_dim, size_t output_dim, WeightType type, int group_size): LlamaDenseWeight{} + { + this->input_dims = input_dim; + this->output_dims = output_dim; + this->type = type; + this->group_size = group_size; + } + size_t kernel_size() const noexcept { return getBitSize(type) * input_dims * output_dims / 8; @@ -114,80 +125,6 @@ struct LlamaDenseWeight { } }; -template -struct LatentAttentionWeight { - - LatentAttentionWeight() = default; - - LatentAttentionWeight(size_t hidden_dim, - size_t q_lora_rank, - size_t kv_lora_rank, - int head_dim, - int head_num, - WeightType weight_type, - int group_size): - LatentAttentionWeight{} - { - if (q_lora_rank) { - q_a_proj.input_dims = hidden_dim; - q_a_proj.output_dims = q_lora_rank; - q_b_proj.input_dims = q_lora_rank; - q_b_proj.output_dims = head_num * head_dim; - q_b_proj.type = q_a_proj.type = weight_type; - q_b_proj.group_size = q_a_proj.group_size = group_size; - } - else { - q_proj.input_dims = hidden_dim; - q_proj.output_dims = head_num * head_dim; - q_proj.type = weight_type; - q_proj.group_size = group_size; - } - - kv_a_proj.input_dims = hidden_dim; - kv_a_proj.output_dims = kv_lora_rank; - kv_b_proj.input_dims = kv_lora_rank; - kv_b_proj.output_dims = head_num * head_dim; - kv_b_proj.type = kv_a_proj.type = weight_type; - kv_b_proj.group_size = kv_a_proj.group_size = group_size; - } - - void malloc(cudaStream_t st) - { - if (q_proj.output_dims) { - q_proj.malloc(st); - } - else { - q_a_proj.malloc(st); - q_b_proj.malloc(st); - deviceMalloc((T**)q_a_layernorm, q_a_proj.output_dims, st); - } - kv_a_proj.malloc(st); - kv_b_proj.malloc(st); - deviceMalloc((T**)kv_a_layernorm, kv_a_proj.output_dims, st); - } - - void free(cudaStream_t st) - { - q_proj.free(st); - q_a_proj.free(st); - q_b_proj.free(st); - kv_a_proj.free(st); - kv_b_proj.free(st); - deviceFree(q_a_layernorm, st); - deviceFree(kv_a_layernorm, st); - } - - LlamaDenseWeight q_proj; - - LlamaDenseWeight q_a_proj; - LlamaDenseWeight q_b_proj; - T* q_a_layernorm; - - LlamaDenseWeight kv_a_proj; - LlamaDenseWeight kv_b_proj; - T* kv_a_layernorm; -}; - template struct LlamaAttentionWeight { @@ -197,39 +134,77 @@ struct LlamaAttentionWeight { size_t head_dim, size_t head_num, size_t kv_head_num, + MLAParam mla, bool bias, size_t tp, WeightType weight_type, int group_size) { - qkv.input_dims = hidden_dim; - qkv.output_dims = (head_num + 2 * kv_head_num) * head_dim / tp; - qkv.type = weight_type; - qkv.group_size = group_size; - - output.input_dims = (head_num * head_dim) / tp; - output.output_dims = hidden_dim; - output.type = 
weight_type; - output.group_size = group_size; - this->bias = bias; + if (mla.kv_lora_rank == 0) { + qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size}; + } + else { + const int qk_nope_dim = head_dim - mla.qk_rope_dim; + if (mla.q_lora_rank) { + q_a_proj = {hidden_dim, mla.q_lora_rank, weight_type, group_size}; + q_b_proj = {mla.q_lora_rank, head_num * head_dim / tp, weight_type, group_size}; + } + else { + q_proj = {hidden_dim, head_num * head_dim / tp, weight_type, group_size}; + } + kv_a_proj = {hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, weight_type, group_size}; + kv_b_proj = {mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp, weight_type, group_size}; + } + output = {(head_num * head_dim) / tp, hidden_dim, weight_type, group_size}; } void malloc(cudaStream_t st) { - qkv.malloc(st, bias); + if (qkv.output_dims) { + qkv.malloc(st, bias); + } + else { + if (q_proj.output_dims) { + q_proj.malloc(st); + } + else { + q_a_proj.malloc(st); + q_b_proj.malloc(st); + deviceMalloc((T**)&q_a_layernorm, q_b_proj.input_dims, st); + } + kv_a_proj.malloc(st); + kv_b_proj.malloc(st); + deviceMalloc((T**)&kv_a_layernorm, kv_b_proj.input_dims, st); + } output.malloc(st, bias); } void free(cudaStream_t st) { qkv.free(st); + q_proj.free(st); + q_a_proj.free(st); + q_b_proj.free(st); + kv_a_proj.free(st); + kv_b_proj.free(st); output.free(st); + deviceFree(q_a_layernorm, st); + deviceFree(kv_a_layernorm, st); } LlamaDenseWeight qkv; LlamaDenseWeight output; bool bias{}; + + LlamaDenseWeight q_proj; + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + + T* q_a_layernorm{}; + T* kv_a_layernorm{}; }; template @@ -315,7 +290,7 @@ struct MoeFfnWeight { return; } - printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); + // printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); gate.input_dims = hidden_dim; gate.output_dims = expert_num; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index baac216f9..ba915caaf 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -11,6 +11,13 @@ namespace turbomind { +struct MLAParam { + size_t q_lora_rank; + size_t kv_lora_rank; + size_t qk_rope_dim; + size_t v_head_dim; +}; + struct ModelParam { size_t head_num; size_t head_dim; @@ -25,6 +32,8 @@ struct ModelParam { int group_size; int start_id; int end_id; + MLAParam mla; + int tune_layer_num; std::vector inter_size; }; @@ -37,7 +46,7 @@ struct MoeParam { int experts_per_token; int inter_size; - bool norm_topk; + bool norm_topk_prob; bool shared_gate; std::vector expert_num; diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu new file mode 100644 index 000000000..16999b812 --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.cu @@ -0,0 +1,87 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
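Note on the MLA weight shapes introduced above: the projection dimensions all follow from MLAParam. The standalone sketch below re-derives them on the host; every concrete number in it (hidden_dim, head_num, head_dim, the lora ranks, rope/value dims, tp) is an illustrative DeepSeek-V2-style assumption, not a value read from any config file.

// Host-side sketch of the MLA weight-shape math in LlamaAttentionWeight.
// All concrete numbers below are illustrative assumptions.
#include <cstdio>

struct MLAParamSketch {
    size_t q_lora_rank, kv_lora_rank, qk_rope_dim, v_head_dim;
};

int main() {
    const size_t hidden_dim = 5120, head_dim = 192, head_num = 128, tp = 1;
    const MLAParamSketch mla{1536, 512, 64, 128};  // assumed DeepSeek-V2-like values

    const size_t qk_nope_dim = head_dim - mla.qk_rope_dim;

    // q path: low-rank q_a_proj / q_b_proj pair (a single q_proj is used when q_lora_rank == 0)
    std::printf("q_a_proj : %zu -> %zu\n", hidden_dim, mla.q_lora_rank);
    std::printf("q_b_proj : %zu -> %zu\n", mla.q_lora_rank, head_num * head_dim / tp);

    // kv_a_proj packs the compressed KV latent plus the rotary part of K
    std::printf("kv_a_proj: %zu -> %zu\n", hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim);
    // kv_b_proj expands the latent into per-head nope-K and V
    std::printf("kv_b_proj: %zu -> %zu\n",
                mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp);
    return 0;
}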
+#include "src/turbomind/kernels/core/array_ops.h" + +namespace turbomind { + +template +__global__ void mla_copy_qkv_kernel(T* qkv, + const T* q, // [h, head_dim] + const T* kv_a, // [kv_lora_rank, rope_dim] + const T* kv_b, // [h, nope_dim + v_head_dim] + int head_num, + int head_dim, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim) +{ + const int type = blockIdx.y; + + const int ti = blockIdx.x; + const int di = threadIdx.x; + + const int kv_b_dim = nope_dim + v_head_dim; + + for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { + Array data{}; + if (type == 0) { // Q + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size]); + } + else if (type == 1) { // K + if (di * vec_size < rope_dim) { + Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); + } + else { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); + } + } + else { // V + if (di * vec_size < v_head_dim) { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); + } + } + const int ti_stride = 3 * head_num * head_dim; + Store(&qkv[ti * ti_stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); + } +} + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + constexpr int vec_size = 16 / sizeof(T); + const int head_dim = nope_dim + rope_dim; + + dim3 block(head_dim / vec_size, head_num); + // make sure block size <= 1024 + while (block.x * block.y > 1024) { + block.y /= 2; + } + const dim3 grid(token_num, 3); + + mla_copy_qkv_kernel<<>>( + qkv, q, kv_a, kv_b, head_num, head_dim, nope_dim, rope_dim, kv_lora_rank, v_head_dim); +} + +template void invokeMLACopyQKV(uint16_t* qkv, + const uint16_t* q, + const uint16_t* kv_a, + const uint16_t* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +} // namespace turbomind \ No newline at end of file diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h new file mode 100644 index 000000000..8e5cad117 --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.h @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+#pragma once + +#include +#include + +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +template +void dispatchMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + auto invoke = [&](auto x) { + using type = decltype(x); + invokeMLACopyQKV((type*)qkv, + (const type*)q, + (const type*)kv_a, + (const type*)kv_b, + token_num, + head_num, + nope_dim, + rope_dim, + kv_lora_rank, + v_head_dim, + stream); + }; + if constexpr (sizeof(T) == 2) { + return invoke(uint16_t{}); + } + FT_CHECK(0); +} + +} // namespace turbomind \ No newline at end of file diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index dcb3f3706..99a66c223 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -107,7 +107,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id padded, expert_num, param_.experts_per_token, - param_.norm_topk, + param_.norm_topk_prob, stream_); sync_check_cuda_error(); @@ -220,7 +220,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } template -void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, const MoeFfnWeight& moe) { invokeMoeReduce(output, inout_buf_, @@ -230,6 +230,7 @@ void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) tokens, param_.experts_per_token, hidden_dim_, + output_scale, stream_); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index e7edb7a67..8911d931f 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -55,7 +55,7 @@ class MoeFfnLayer { void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - void reduce(T* output, int tokens, const MoeFfnWeight& moe); + void reduce(T* output, int tokens, float output_scale, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 2f99b0c2c..ba002a746 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -19,21 +19,24 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc -#include "src/turbomind/models/llama/unified_attention_layer.h" +#include +#include + #include "src/turbomind/kernels/attention/attention.h" #include "src/turbomind/kernels/attention/decoding.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/models/llama/mla_utils.h" +#include 
"src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" -#include -#include +#include "src/turbomind/utils/memory_utils.h" namespace turbomind { @@ -72,17 +75,14 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, } template -void UnifiedAttentionLayer::allocateBuffer(size_t q_count, - size_t k_count, - size_t batch_size, - const WeightType* weights) +void UnifiedAttentionLayer::allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - if (weights->qkv.lora.r) { - size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + weights->qkv.lora.r); + if (qkv_lora_rank) { + size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + qkv_lora_rank); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sz, false); } else { @@ -198,7 +198,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa allocateBuffer(token_num, // shared h_cu_k_len[batch_size] - h_cu_k_len[dc_batch_size], // prefill batch_size, - weights); + weights->qkv.lora.r); // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; @@ -210,11 +210,17 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa // } int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - ////////////////////////////////////////////// - /// qkv gemm - // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] - linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); - sync_check_cuda_error(); + + if (weights->qkv.output_dims) { + ////////////////////////////////////////////// + /// qkv gemm + // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] + linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); + } + else { + forward_mla(attention_input, token_num, *weights); + } count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); @@ -431,6 +437,84 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa sync_check_cuda_error(); } +template +void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const WeightType& w) +{ + const int q_lora_rank = w.q_a_proj.output_dims; + const int kv_lora_rank = w.kv_b_proj.input_dims; + const int qk_rope_dim = w.kv_a_proj.output_dims - kv_lora_rank; + const int qk_nope_dim = std::max(w.q_b_proj.output_dims, w.q_proj.output_dims) / local_head_num_ - qk_rope_dim; + const int v_head_dim = w.kv_b_proj.output_dims / local_head_num_ - qk_nope_dim; + + T* q{}; + + if (w.q_proj.output_dims) { + deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); + linear_->forward(q, inputs, token_num, w.q_proj); + sync_check_cuda_error(); + } + else { + FT_CHECK(0); + T* q_a{}; + deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); + + linear_->forward(q_a, inputs, token_num, w.q_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm(q_a, + q_lora_rank, + q_a, + q_lora_rank, + w.q_a_layernorm, + q_lora_rank, + token_num, + model_param_.norm_eps, + stream_); + 
sync_check_cuda_error(); + + deviceMalloc((T**)&q, (size_t)token_num * w.q_b_proj.output_dims, stream_); + linear_->forward(q, q_a, token_num, w.q_b_proj); + sync_check_cuda_error(); + + deviceFree(q_a, stream_); + } + + T* kv_a{}; + const int kv_a_dim = w.kv_a_proj.output_dims; + deviceMalloc((T**)&kv_a, (size_t)token_num * kv_a_dim, stream_); + + linear_->forward(kv_a, inputs, token_num, w.kv_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm( + kv_a, kv_a_dim, kv_a, kv_a_dim, w.kv_a_layernorm, kv_lora_rank, token_num, model_param_.norm_eps, stream_); + sync_check_cuda_error(); + + T* kv_b{}; + deviceMalloc((T**)&kv_b, (size_t)token_num * w.kv_b_proj.output_dims, stream_); + sync_check_cuda_error(); + + linear_->forward(kv_b, {kv_a, kv_a_dim}, token_num, w.kv_b_proj); + sync_check_cuda_error(); + + dispatchMLACopyQKV(qkv_buf_, + q, + kv_a, + kv_b, + token_num, + local_head_num_, + qk_nope_dim, + qk_rope_dim, + kv_lora_rank, + v_head_dim, + stream_); + sync_check_cuda_error(); + + deviceFree(q, stream_); + deviceFree(kv_a, stream_); + deviceFree(kv_b, stream_); +} + #ifdef ENABLE_FP32 template class UnifiedAttentionLayer; #endif diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index da0c0e6fc..7d331b0e4 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -42,7 +42,7 @@ class UnifiedAttentionLayer { static constexpr int kMaxWorkspaceTokens = 4096; void freeBuffer(); - void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, const WeightType* weights); + void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank); void allocateWorkspace(); void freeWorkspace(); @@ -70,7 +70,7 @@ class UnifiedAttentionLayer { const NcclParam& tp, const Context& context); - void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight* weights); + void forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights); void prefill(T* output, T* tmp_kv_buffer, @@ -107,6 +107,9 @@ class UnifiedAttentionLayer { int max_split_k, const WeightType* weights); +private: + void forward_mla(const T* inputs, int token_num, const WeightType& weights); + private: const size_t head_num_; const size_t kv_head_num_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index bc8a6a147..115e730b4 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -23,7 +23,8 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), allocator_(ctx.allocator.get()), - dtype_(getTensorType()) + dtype_(getTensorType()), + tune_layer_num_(model.tune_layer_num) { attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); @@ -65,13 +66,13 @@ void UnifiedDecoder::freeBuffer() } template -void UnifiedDecoder::forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight) +void UnifiedDecoder::forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + int layer_id, + const WeightType* weight) { TensorMap inputs(*_inputs); inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); @@ -84,7 +85,7 @@ void UnifiedDecoder::forwardSelfAttn(T* attn_io, TensorMap 
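The q_a / kv_a normalization in forward_mla uses the new invokeRMSNorm kernel. A minimal CPU reference of its row-wise math (note the epsilon is added to the mean of squares, exactly as in the kernel) can be handy for spot-checking small shapes:

// CPU reference for the RMS norm computed by the new rms_norm.cu kernel:
// y[i] = x[i] * w[i] / sqrt(mean(x^2) + eps)
#include <cmath>
#include <vector>

std::vector<float> rms_norm_ref(const std::vector<float>& x, const std::vector<float>& w, float eps)
{
    float sum_sq = 0.f;
    for (float v : x) {
        sum_sq += v * v;
    }
    const float scale = 1.f / std::sqrt(sum_sq / x.size() + eps);
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * w[i] * scale;
    }
    return y;
}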
outputs(*_outputs); outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - attn_layer_->forward(&outputs, &inputs, weight); + attn_layer_->forward(&outputs, &inputs, &weight->self_attn_weights); } template @@ -161,7 +162,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con for (size_t layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous - if (isTuning() && layer != 0) { + if (isTuning() && layer < tune_layer_num_) { continue; } @@ -175,7 +176,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con token_num, batch_size, layer, - &weights->at(layer)->self_attn_weights); + weights->at(layer)); count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); @@ -211,7 +212,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con } if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, weights->at(layer)->moe_weights); } count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index f13b4ba84..7e1bd8866 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -23,6 +23,7 @@ class UnifiedDecoder { cudaStream_t const stream_; IAllocator* const allocator_; const DataType dtype_; + const int tune_layer_num_; bool is_free_buffer_after_forward_{}; int* cu_q_len_{}; @@ -39,13 +40,13 @@ class UnifiedDecoder { using WeightType = LlamaDecoderLayerWeight; - void forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight); + void forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + int layer_id, + const WeightType* weight); public: UnifiedDecoder(const ModelParam& model, diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h index ade02e10e..27d7affe5 100644 --- a/src/turbomind/models/llama/weight_type.h +++ b/src/turbomind/models/llama/weight_type.h @@ -52,4 +52,4 @@ inline size_t getBitSize(WeightType type) return 0; } -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 4eb34249f..1e74f4bae 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -293,8 +293,17 @@ PYBIND11_MODULE(_turbomind, m) std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - cudaMemcpy( - const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); + cudaPointerAttributes at{}; + ft::check_cuda_error(cudaPointerGetAttributes(&at, self->data)); + { + // Switch to the same device where TM's tenosr memory resides because it's allocated + // from a pool with no peer access enabled (can't be accessed from a context of other devices) + ft::CudaDeviceGuard guard{at.device}; + ft::check_cuda_error(cudaMemcpy(const_cast(self->data), + const_cast(src->data), + num_bytes, + 
cudaMemcpyDefault)); + } break; } default: diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index deb28bb35..0dba71575 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -254,6 +254,11 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); + model_param_.tune_layer_num = model_reader["tune_layer_num"].as(1); + model_param_.mla.q_lora_rank = model_reader["q_lora_rank"].as(); + model_param_.mla.kv_lora_rank = model_reader["kv_lora_rank"].as(); + model_param_.mla.qk_rope_dim = model_reader["qk_rope_dim"].as(); + model_param_.mla.v_head_dim = model_reader["v_head_dim"].as(); attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); model_param_.quant_policy = engine_reader["quant_policy"].as(0); YAML::Node inter_size = model_reader["inter_size"]; @@ -304,7 +309,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); - moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); + moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(false); YAML::Node expert_num = model_reader["expert_num"]; for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { moe_param_.expert_num.push_back(it->as()); diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 2148fcc16..8311e6eb9 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -483,5 +483,24 @@ void compareTwoTensor( bool is_16xx_series(const char* name); +class CudaDeviceGuard { +public: + CudaDeviceGuard(int device) + { + cudaGetDevice(&last_device_id_); + if (device != last_device_id_) { + cudaSetDevice(device); + } + } + + ~CudaDeviceGuard() + { + cudaSetDevice(last_device_id_); + } + +private: + int last_device_id_{-1}; +}; + /* ************************** end of common utils ************************** */ } // namespace turbomind From 375afb43b5e6e167d3b2b1bb80d9f1fe7af7108e Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 19 Nov 2024 22:15:08 +0800 Subject: [PATCH 10/21] deepseek-v2 --- lmdeploy/turbomind/deploy/config.py | 6 +- lmdeploy/turbomind/deploy/converter.py | 7 +- lmdeploy/turbomind/deploy/loader.py | 23 ++++ lmdeploy/turbomind/deploy/module.py | 5 +- .../deploy/source_model/deepseek2.py | 6 +- .../attention/codegen/decoding_sm80_192.cu | 5 + src/turbomind/kernels/attention/decoding.cu | 13 +- .../kernels/attention/decoding_config.h | 12 +- src/turbomind/kernels/attention/impl_simt.h | 20 +-- src/turbomind/kernels/core/math.h | 11 +- src/turbomind/kernels/core/thread_map.h | 3 +- src/turbomind/kernels/gemm/moe_utils_v2.cu | 121 ++++++++++++++++-- src/turbomind/kernels/gemm/moe_utils_v2.h | 3 + .../kernels/gemm/test/test_moe_utils.cu | 11 +- src/turbomind/models/llama/LlamaBatch.cc | 6 +- src/turbomind/models/llama/LlamaFfnLayer.cc | 28 ++-- src/turbomind/models/llama/LlamaFfnLayer.h | 9 +- src/turbomind/models/llama/LlamaWeight.cc | 54 -------- src/turbomind/models/llama/llama_params.h | 13 +- src/turbomind/models/llama/llama_utils.cu | 3 + src/turbomind/models/llama/moe_ffn_layer.cc | 34 ++++- 
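As context for the bind.cpp change above: CudaDeviceGuard is an RAII helper that switches to a given CUDA device and restores the previous one on scope exit. A small usage sketch follows; error checking is omitted and it assumes src/turbomind/utils/cuda_utils.h (which defines the guard) is on the include path.

// Sketch: copy into memory that lives in another device's private pool by
// temporarily switching to the device that owns the destination pointer.
#include <cstddef>
#include <cuda_runtime.h>
#include "src/turbomind/utils/cuda_utils.h"

void copy_to_device_tensor(void* dst, const void* src, size_t bytes)
{
    cudaPointerAttributes attr{};
    cudaPointerGetAttributes(&attr, dst);
    turbomind::CudaDeviceGuard guard{attr.device};  // switch to the owning device
    cudaMemcpy(dst, src, bytes, cudaMemcpyDefault);
    // the guard's destructor restores the previous device here
}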
src/turbomind/models/llama/moe_ffn_layer.h | 6 +- .../models/llama/unified_attention_layer.cc | 24 ++-- src/turbomind/models/llama/unified_decoder.cc | 42 ++++-- src/turbomind/models/llama/unified_decoder.h | 1 + .../triton_backend/llama/LlamaTritonModel.cc | 4 + src/turbomind/utils/allocator.h | 2 +- 27 files changed, 333 insertions(+), 139 deletions(-) diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index b6f2ee765..1d1aa2755 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -43,7 +43,7 @@ class ModelConfig: start_id: int = None end_id: int = None size_per_head: int = 128 - group_size: int = 0 + group_size: int = 64 weight_type: str = None session_len: int = None tp: int = 1 @@ -53,6 +53,10 @@ class ModelConfig: experts_per_token: int = 0 moe_shared_gate: int = False norm_topk_prob: int = False + routed_scale: float = 1.0 + topk_group: int = 1 + topk_method: str = 'greedy' + moe_group_num: int = 1 # MLA q_lora_rank: int = 0 kv_lora_rank: int = 0 diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 1c847ede0..77f0bc8dc 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -241,11 +241,10 @@ def get_tm_model(model_path, engine_config.model_format = quant_method group_size = _group_size - # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) - if not group_size: - group_size = 128 - if engine_config.model_format in ['awq', 'gptq']: + # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) + if not group_size: + group_size = 128 assert group_size == 128, \ f'model format is "{engine_config.model_format}" ' \ f'but group_size is {group_size}. Currently, only 128 ' \ diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py index e3d79b164..dc319db9f 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/deploy/loader.py @@ -10,6 +10,7 @@ import torch from safetensors import safe_open +import safetensors # https://github.com/huggingface/transformers/blob/53fad641cfdb5105e2470bcf3ef17ea8e25cc300/src/transformers/modeling_utils.py#L372 WEIGHT_INDEX_NAME = 'pytorch_model.bin.index.json' @@ -89,6 +90,28 @@ def items(self): assert not params + # def items(self): + # params = defaultdict(dict) + # for shard in self.shards: + # # with safe_open(shard, 'pt') as f: + # with open(shard, 'rb') as f: + # w = safetensors.torch.load(f.read()) + # misc = [] + # for k in w.keys(): + # match = re.findall(self.pattern, k) + # if not match: + # misc.append(k) + # else: + # idx = int(match[0]) + # param = params[idx] + # param[k] = w[k] + # if len(param) == self.item_count[idx]: + # yield (idx, params.pop(idx)) + # if misc: + # yield (-1, {k: w[k] for k in misc}) + # assert not params + + class PytorchLoader(BaseLoader): def __init__(self, diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 6fa6fcb42..aa98ed646 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -239,7 +239,10 @@ def __init__(self, model: BaseOutputModel): def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): if all(x is None for x in xs): return - q_a, q_b, kv_a, kv_b, o = map(transpose, xs) + q_a, q_b, q, kv_a, kv_b, o = map(transpose, xs) + + if q is not None: + q_b = q cfg = self.model.model_config qk_nope_dim = cfg.size_per_head - cfg.qk_rope_dim diff --git 
a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py index ccdfafb18..f780ceeb7 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -37,7 +37,7 @@ def mla(self, i: int, kind: str): if not kind: return self.filter(r'self_attn.*proj') result = [] - for key in ['q_a_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']: + for key in ['q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']: tensor = self.params.get( f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}' ) @@ -90,6 +90,10 @@ def model_info(self): experts_per_token=experts_per_token, inter_size=inter_size, norm_topk_prob=norm_topk_prob, + routed_scale=cfg['routed_scaling_factor'], + topk_method=cfg['topk_method'], + topk_group=cfg['topk_group'], + moe_group_num=cfg['n_group'], tune_layer_num=2 ) return info diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu index 9294fc396..214e6748d 100644 --- a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu @@ -12,4 +12,9 @@ invokeDecoding>(const Att template bool invokeDecoding>(const AttentionParams& params); +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + } // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index b50b67f04..101b4170e 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -4,7 +4,6 @@ #include "decoding_config.h" #include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" -// #include "src/turbomind/utils/dispatch.h" #include #include @@ -115,7 +114,17 @@ void dispatchDecoding(const AttentionParams& params) }; if (params.size_per_head == 192) { - invokeDecoding>(params); + + if (is_kv_int8) { + invokeDecoding>(params); + } + else if (is_kv_int4) { + FT_CHECK_WITH_INFO(!is_kv_int4, "not implemented"); + // invokeDecoding>(params); + } + else { + invokeDecoding>(params); + } return; } diff --git a/src/turbomind/kernels/attention/decoding_config.h b/src/turbomind/kernels/attention/decoding_config.h index 7dcb119cf..dfd5e0783 100644 --- a/src/turbomind/kernels/attention/decoding_config.h +++ b/src/turbomind/kernels/attention/decoding_config.h @@ -40,7 +40,7 @@ struct DecodingConfig 2) }; template -struct DecodingConfig { +struct DecodingConfig> { static constexpr int Qh = (Qh_ + 7) / 8 * 8; using Attention = Impl; using CacheIter = GetBlockIterFactory; @@ -76,4 +76,14 @@ struct DecodingConfig { using Kernel = AttentionUniversal, CacheIter, DecodingCtaMap>; }; +template +struct DecodingConfig { + static constexpr int Qh = 1; + static constexpr int HeadDim = 192; + + using Attention = Impl; + using CacheIter = GetBlockIterFactory; + using Kernel = AttentionUniversal, Attention>, CacheIter, DecodingCtaMap>; +}; + } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index 667a0ce43..790fc1b4f 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -2,12 +2,17 @@ #pragma once -#include "src/turbomind/kernels/attention/impl.h" +#include +#include +#include + #include 
"src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/layout.h" #include "src/turbomind/kernels/core/thread_map.h" -#include -#include + +#include "src/turbomind/kernels/attention/impl.h" +#include "src/turbomind/kernels/attention/quantization.h" + namespace turbomind::attention { @@ -51,7 +56,7 @@ struct Impl), K_K); }; struct LinearD { @@ -165,11 +170,8 @@ struct Impl; - static constexpr int kWarpThreadC_KV = HeadDim != 192 ? HeadDim / kAccessC_KV : 8; - using ThreadMapQ = RakedThreadMap; - using ThreadMapKV = RakedThreadMap; + using ThreadMapKV = RakedThreadMap, kWarpCount>; // `WARP_SIZE / WARP_S` is chosen to achieve minimum kIterS w/o introducing partial S iter using ThreadMapKVp = RakedThreadMap<2, CTA_S, 2, kWarpCount, WARP_SIZE / WARP_S>; diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h index a708a3498..c78ab95ab 100644 --- a/src/turbomind/kernels/core/math.h +++ b/src/turbomind/kernels/core/math.h @@ -5,6 +5,7 @@ #include "src/turbomind/kernels/core/common.h" #include #include +#include namespace turbomind { @@ -41,10 +42,16 @@ TM_HOST_DEVICE constexpr T log2(T x) // static_assert(log2(32) == 5); // static_assert(log2(1) == 0); +template +TM_HOST_DEVICE constexpr T lowbit(T x) +{ + const std::make_signed_t s = x; + return static_cast(s & -s); +} + // https://arxiv.org/abs/1902.01961 template -struct FastDivMod { -}; +struct FastDivMod {}; template<> struct FastDivMod { diff --git a/src/turbomind/kernels/core/thread_map.h b/src/turbomind/kernels/core/thread_map.h index 66b691832..1271aefcc 100644 --- a/src/turbomind/kernels/core/thread_map.h +++ b/src/turbomind/kernels/core/thread_map.h @@ -3,6 +3,7 @@ #pragma once #include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" #include @@ -51,7 +52,7 @@ struct ThreadMapQ { } }; -template +template struct RakedThreadMap { static constexpr int kDimC = DimC; static constexpr int kDimS = DimS; diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index c8c8db197..66bf634ea 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -264,7 +264,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] int token_num_padded, int expert_num, int top_k, - bool norm_topk) + bool norm_topk, + float routed_scale) { constexpr int max_tiles = kMoeGateMaxTiles; constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 @@ -286,8 +287,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; - const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + // const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + // const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; float data[items_per_thread]; int idxs[items_per_thread]; @@ -536,7 +537,7 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] if (ti2 < token_num && idx < top_k) { masks[expert_id * token_num_padded + ti2] = idx; - scales[idx * token_num + ti2] = scale; + scales[idx * token_num + ti2] = scale * routed_scale; atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); // printf("%d %d %f\n", idx, expert_id, scale); @@ -569,6 +570,7 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n int experts, // E int experts_per_token, bool norm_topk, + float 
routed_scale, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -581,14 +583,14 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread, auto vec_size) { constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; constexpr int threads = 256; const int blocks = ceil_div(tokens, threads / thrs_per_tok); cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - MoeGateKernel_v8 + MoeGateKernel_v8 <<>>( // scales, (int8_t*)masks, @@ -600,28 +602,38 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n tokens_padded, experts, experts_per_token, - norm_topk); + norm_topk, + routed_scale); }; auto fail = [&] { - std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << experts + << ", top_k=" << experts_per_token << "\n"; std::abort(); }; if (experts <= 8) { if (experts_per_token <= 2) { - invoke(_Int<8>, _Int<2>, _Int<8>); + invoke(_Int<8>, _Int<2>, _Int<8>, _Int<4>); } else { - invoke(_Int<8>, _Int<8>, _Int<8>); + invoke(_Int<8>, _Int<8>, _Int<8>, _Int<4>); } } else if (experts <= 64) { if (experts_per_token <= 4) { - invoke(_Int<64>, _Int<4>, _Int<16>); + invoke(_Int<64>, _Int<4>, _Int<16>, _Int<4>); } else if (experts_per_token <= 8) { - invoke(_Int<64>, _Int<8>, _Int<16>); + invoke(_Int<64>, _Int<8>, _Int<16>, _Int<4>); + } + else { + fail(); + } + } + else if (experts <= 160) { + if (experts_per_token <= 8) { + invoke(_Int<160>, _Int<8>, _Int<10>, _Int<2>); } else { fail(); @@ -836,4 +848,89 @@ std::vector SampleBalanced(int token_num, int expert_num, int exp_per_tok, return ret; } +template +__global__ void MoeMaskTopKGroups(float* logits, int token_num, int expert_num, int top_k) +{ + constexpr int threads_per_token = max_expert_num / items_per_thread; + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + static_assert(items_per_thread % access_size == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + float max_val = -std::numeric_limits::infinity(); + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + PRAGMA_UNROLL + for (int c = 0; c < access_size; ++c) { + max_val = fmaxf(max_val, data[i + c]); + } + } + } + } + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + bool alive = false; + + for (int k = 0; k < top_k; ++k) { + int g_max_ei = ei; + float g_max_val = max_val; + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + if (ei == g_max_ei) { + alive = true; + max_val = -std::numeric_limits::infinity(); + } + } + + if (!alive && ti < token_num) { + 
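// Threads that never won a selection round above belong to groups outside the
// top `top_k` groups for this token; they overwrite their slice of the logits
// with -inf below, so the subsequent MoE gate can only pick experts from the
// surviving groups (group-limited greedy routing).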
Array vec; + fill(vec, -std::numeric_limits::infinity()); + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Store(&logits[ti * expert_num + e], vec); + } + } + } +} + +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st) +{ + auto invoke = [&](auto max_expert_num, auto items_per_thread, auto vec_size) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(token_num, threads / thrs_per_tok); + MoeMaskTopKGroups + <<>>(logits, token_num, expert_num, top_k); + }; + if (expert_num == 160 && group_size == 20) { + return invoke(_Int<160>, _Int<20>, _Int<4>); + } + + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << expert_num + << ", group_size=" << group_size << "\n"; + std::abort(); +} + } // namespace turbomind diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index f2aa9870f..d53de1354 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -22,6 +22,7 @@ void invokeMoeGate_V2(int* f2n, int experts, int exp_per_tok, bool norm_topk, + float routed_scale, cudaStream_t st); template @@ -57,6 +58,8 @@ void invokeMoeReduce(T* dst, float dst_scale, cudaStream_t st); +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st); + // Sample `e` from `E` experts uniformly for every token std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g); diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index 58ad9d26d..4b2ea6a83 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -205,6 +205,8 @@ bool test_moe_gate(int tokens, // cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + // invokeMaskMoeTopKGroups(logits.data().get(), tokens, expert_num, expert_num / 8, 3, nullptr); + for (int i = 0; i < 1; ++i) { gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); @@ -221,7 +223,8 @@ bool test_moe_gate(int tokens, // expert_num, experts_per_token, false, - 0); + 1.f, + nullptr); } // invokeMoeTiling(coords.data().get(), offsets.data().get(), expert_num, coords.size(), &tiling, 1, 0); @@ -268,7 +271,9 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success || 1) { + // print_vecs(logits.data().get(), tokens, expert_num, "logits", 12); + + if (!success && 1) { diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); @@ -336,7 +341,7 @@ int main() // test_moe_gate(32768, 64, 8, tape, tiling); // test_moe_gate(8, 60, 4, tape, tiling); - test_moe_gate(16, 64, 6, tape, tiling); + test_moe_gate(16, 160, 6, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 4138174e5..ea321d06a 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -20,6 +20,7 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" 
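For clarity, here is a CPU reference of the per-token masking that MoeMaskTopKGroups performs: experts are split into contiguous groups of `group_size`, each group is scored by its maximum logit, and every expert outside the best `topk_group` groups is masked to -inf before the regular top-k gate runs. This is a functional sketch of the intended math, not the kernel itself.

// Per-token group-limited masking: keep experts from the best `topk_group`
// groups (contiguous slices of `group_size` experts), mask the rest to -inf.
#include <algorithm>
#include <functional>
#include <limits>
#include <utility>
#include <vector>

void mask_topk_groups_ref(std::vector<float>& logits, int group_size, int topk_group)
{
    const int n_group = static_cast<int>(logits.size()) / group_size;
    std::vector<std::pair<float, int>> group_max(n_group);
    for (int g = 0; g < n_group; ++g) {
        float m = -std::numeric_limits<float>::infinity();
        for (int i = 0; i < group_size; ++i) {
            m = std::max(m, logits[g * group_size + i]);
        }
        group_max[g] = {m, g};
    }
    std::sort(group_max.begin(), group_max.end(), std::greater<>());  // best groups first
    std::vector<bool> keep(n_group, false);
    for (int k = 0; k < topk_group && k < n_group; ++k) {
        keep[group_max[k].second] = true;
    }
    for (int g = 0; g < n_group; ++g) {
        if (!keep[g]) {
            std::fill_n(logits.begin() + g * group_size, group_size,
                        -std::numeric_limits<float>::infinity());
        }
    }
}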
+#include "src/turbomind/utils/nccl_utils.h" #include #include #include @@ -1041,6 +1042,9 @@ LlamaBatch::LlamaBatch(const EngineParam& param, AllocateBuffer(max_batch_size_, session_len_, cache_block_seq_len); AllocatePersistantBuffer(max_batch_size_, cache_block_seq_len); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template @@ -1990,7 +1994,7 @@ void LlamaBatch::tune() nullptr, nullptr); // implicit barrier for TP - check_cuda_error(cudaStreamSynchronize(stream_)); + ftNcclStreamSynchronize(model_->tensor_para_, {}, stream_); } auto tock = std::chrono::steady_clock::now(); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 8cce20720..5afc75869 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -27,21 +27,20 @@ namespace turbomind { template -void LlamaFfnLayer::allocateBuffer(size_t token_num, - int inter_size, - const LlamaDenseWeight* gating, - const LlamaDenseWeight* inter) +void LlamaFfnLayer::allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) { - const size_t sz = token_num * inter_size; + const size_t sz = token_num * inter_size; - const size_t sz_gate = token_num * gating->lora.r; - const size_t sz_inter = token_num * inter->lora.r; + const size_t sz_gate = token_num * gating_lora_r; + const size_t sz_inter = token_num * inter_lora_r; - gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * 2 + sz_gate + sz_inter), false); - inter_buf_ = gating_buf_ + sz; + gating_buf_ = + (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * inter_buf_factor + sz_gate + sz_inter), false); + inter_buf_ = gating_buf_ + sz; // gate & inter is not fused when lora is enabled - if (gating->lora.r) { + if (gating_lora_r) { inter_buf_ += sz_gate; } @@ -93,12 +92,16 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const int layer_id = input_tensors->getVal("layer_id"); const int inter_size = weights->inter_size; - allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); + const bool is_fused_silu = weights->fused_gating_intermediate.kernel && weights->is_fused_silu; + + allocateBuffer(token_num, inter_size, is_fused_silu ? 
1 : 2, weights->gating.lora.r, weights->intermediate.lora.r); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); int* lora_mask = input_tensors->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + const bool all_reduce = input_tensors->getVal("all_reduce", false); + if (weights->fused_gating_intermediate.kernel) { NvtxScope scope("fused_silu_ffn"); @@ -145,7 +148,8 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); - if (all_reduce_ && tensor_para_.world_size_ > 1) { + if (all_reduce && tensor_para_.world_size_ > 1) { + // std::cout << "ffn all reduce " << layer_id << "\n"; NcclGuard nccl_guard(tensor_para_, stream_); ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 2daca2cc9..a72a24701 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -30,13 +30,12 @@ namespace turbomind { template class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): + LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx): hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()), - all_reduce_(all_reduce) + allocator_(ctx.allocator.get()) { } @@ -48,7 +47,8 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r); void freeBuffer(); @@ -59,7 +59,6 @@ class LlamaFfnLayer { cudaStream_t const stream_; LlamaLinear* const linear_; IAllocator* const allocator_; - const bool all_reduce_; bool is_free_buffer_after_forward_{}; T* gating_buf_{}; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 2db8fc9c7..596207298 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -25,58 +25,6 @@ #include namespace turbomind { -#if 0 -template -LlamaWeight::LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - hidden_units_(hidden_units), - inter_size_(inter_size), - vocab_size_(vocab_size), - vocab_size_padded_(vocab_size), - num_layer_(num_layer), - weight_type_(weight_type), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) -{ - if (vocab_size_padded_ % tensor_para_size_ != 0) { - vocab_size_padded_ = (vocab_size_padded_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; - TM_LOG_WARNING("pad vocab size from %d to %d", vocab_size_, vocab_size_padded_); - } - - FT_CHECK(hidden_units_ % tensor_para_size_ == 0); - - decoder_layer_weights.reserve(num_layer_); - for (unsigned l = 0; l < num_layer_; 
++l) { - decoder_layer_weights.push_back(new LlamaDecoderLayerWeight(l, - head_num, - kv_head_num, - size_per_head, - hidden_units_, - inter_size_, - weight_type_, - group_size, - lora_param, - attn_bias, - moe_param, - tensor_para_size_, - tensor_para_rank_)); - } - - mallocWeights(); -} -#else template LlamaWeight::LlamaWeight( @@ -115,8 +63,6 @@ LlamaWeight::LlamaWeight( check_cuda_error(cudaStreamSynchronize(stream_)); } -#endif - template LlamaWeight::~LlamaWeight() { diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index ba915caaf..ea00fde03 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -44,10 +44,15 @@ struct MoeParam { kFused } method; - int experts_per_token; - int inter_size; - bool norm_topk_prob; - bool shared_gate; + int experts_per_token; + int inter_size; + bool norm_topk_prob; + bool shared_gate; + float routed_scale; + + int topk_group; + std::string topk_method; + int n_group; std::vector expert_num; }; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 925c6b883..570e33b77 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -17,6 +17,7 @@ namespace turbomind { CmpMode compare_mode = kCmpRead; +// CmpMode compare_mode = kCmpWrite; template struct abs_diff_t { @@ -94,6 +95,8 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) // sum(abs(a - b)) auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size); std::cerr << key << ": " << asum << " " << asum / size << "\n"; + + check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); } template diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index 99a66c223..390d14754 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -11,20 +11,19 @@ #include "src/turbomind/utils/nvtx_utils.h" #include "src/turbomind/utils/string_utils.h" #include -#include #include namespace turbomind { template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num) +void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor) { char* base = 0; auto allocate = [&](void* base) { Monotonic alloc{base}; alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); - alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * 2); + alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * inter_buf_factor); alloc(&logits_, tokens * expert_num); alloc(&masks_, expert_num * padded); alloc(&f2n_, param_.experts_per_token * tokens); @@ -85,16 +84,37 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id FT_CHECK(expert_num); - AllocateBuffer(tokens, padded, expert_num); + const size_t inter_buf_factor = [&] { + if (param_.method == MoeParam::kNaive) { + return 0; // managed by ffn + } + else if (moe.block.is_fused_silu) { + return 1; + } + else { + return 2; + } + }(); + + AllocateBuffer(tokens, padded, expert_num, inter_buf_factor); gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); + // if (tensor_para_.rank_ == 0) { + // Compare(logits_, tokens * expert_num, Concat("logit", layer_id), compare_mode, stream_); + // } + check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, 
stream_)); - sync_check_cuda_error(); + check_cuda_error(cudaMemsetAsync(masks_, -1, sizeof(int8_t) * expert_num * padded, stream_)); // dump_logits(tokens, layer_id); + if (param_.topk_method == "group_limited_greedy") { + invokeMaskMoeTopKGroups(logits_, tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); + sync_check_cuda_error(); + } + /// TODO: fix illegal memory access even if NaN are present in logits invokeMoeGate_V2(f2n_, en2f_, @@ -108,6 +128,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id expert_num, param_.experts_per_token, param_.norm_topk_prob, + param_.routed_scale, stream_); sync_check_cuda_error(); @@ -220,7 +241,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } template -void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, const MoeFfnWeight& moe) +void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe) { invokeMoeReduce(output, inout_buf_, @@ -235,6 +256,7 @@ void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, const Moe sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { + // std::cout << "moe all reduce " << layer_id << "\n"; ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 8911d931f..74c62d004 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -35,7 +35,7 @@ class MoeFfnLayer { max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); } else { - expert_ffn_ = std::make_unique>(model, tp, ctx, false); + expert_ffn_ = std::make_unique>(model, tp, ctx); } h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); @@ -44,7 +44,7 @@ class MoeFfnLayer { accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); } - void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num); + void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor); void FreeBuffer(); @@ -55,7 +55,7 @@ class MoeFfnLayer { void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - void reduce(T* output, int tokens, float output_scale, const MoeFfnWeight& moe); + void reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index ba002a746..bf9b97378 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -203,10 +203,10 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; - static int count = 0; + // static int count = 0; - // if (layer_id == 0 && count == 0) { - // Compare(attention_input, token_num * weights->qkv.input_dims, "qkv_input", compare_mode, stream_); + // if (tensor_para_.rank_ == 0) { + // Compare(attention_input, token_num * hidden_units_, Concat("qkv_input", layer_id), compare_mode, stream_); // } int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, 
nullptr}).getPtr(); @@ -222,10 +222,14 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa forward_mla(attention_input, token_num, *weights); } + // std::cerr << layer_id << " " << count << " " << tensor_para_.rank_ << "\n"; + count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); - // if (layer_id == 0 && count == 0) { - // Compare(qkv_buf_, token_num * weights->qkv.output_dims, "qkv_buf", compare_mode, stream_); + // std::cerr << "token num: " << token_num << "\n"; + + // if (layer_id == 0 && count == 0 && tensor_para_.rank_ == 0) { + // Compare(qkv_buf_, token_num * (3 * local_head_num_ * size_per_head_), "qkv_buf", CMP_MODE, stream_); // } if constexpr (0) { @@ -421,8 +425,6 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); - // ++count; - count_and_fix(attention_out, token_num * weights->output.output_dims, Concat("wo", layer_id), 3); if (tensor_para_.world_size_ > 1) { @@ -431,10 +433,17 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa sync_check_cuda_error(); } + // if (tensor_para_.rank_ == 0) { + // Compare(attention_out, token_num * hidden_units_, Concat("attn_out", layer_id), compare_mode, stream_); + // // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); + // } + if (is_free_buffer_after_forward_ == true) { freeBuffer(); } sync_check_cuda_error(); + + // ++count; } template @@ -454,7 +463,6 @@ void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const sync_check_cuda_error(); } else { - FT_CHECK(0); T* q_a{}; deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 115e730b4..83015e558 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -5,6 +5,7 @@ #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include @@ -23,6 +24,7 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), allocator_(ctx.allocator.get()), + tp_(tp), dtype_(getTensorType()), tune_layer_num_(model.tune_layer_num) { @@ -34,7 +36,7 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, } if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { - ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + ffn_layer_ = std::make_unique>(model, tp, ctx); } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); @@ -162,7 +164,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con for (size_t layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous - if (isTuning() && layer < tune_layer_num_) { + if (isTuning() && layer >= tune_layer_num_) { continue; } @@ -196,14 +198,21 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con //////////////////////////////////////////// /// feed-forward network - if (!weights->at(layer)->moe_weights.experts.empty()) { + // if (tp_.rank_ == 0) 
{ + // Compare(decoder_output, token_num * hidden_units_, Concat("ffn_input", layer), compare_mode, stream_); + // } + + const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); + if (is_moe) { moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - if (ffn_layer_) { - int layer_id = layer; // int is needed + if (weights->at(layer)->ffn_weights.output.output_dims) { + int layer_id = layer; // int is needed + bool all_reduce = !is_moe; TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; + {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}, + {"all_reduce", {MEMORY_CPU, TYPE_BOOL, {1}, &all_reduce}}}; TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}}; if (inputs->isExist("lora_mask")) { ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); @@ -211,10 +220,18 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } - if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, weights->at(layer)->moe_weights); + // if (tp_.rank_ == 0) { + // Compare(decoder_output, token_num * hidden_units_, Concat("ffn_out", layer), compare_mode, stream_); + // } + + if (is_moe) { + moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); } + // if (tp_.rank_ == 0) { + // Compare(decoder_output, token_num * hidden_units_, Concat("moe_ffn_out", layer), compare_mode, stream_); + // } + count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); const bool is_last_layer = layer == layer_num_ - 1; @@ -261,6 +278,15 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con // Wait for `h_cu_q/k_len_` to be consumed check_cuda_error(cudaEventSynchronize(ev_h_cu_x_)); + + // check_cuda_error(cudaStreamSynchronize(stream_)); + // if (tp_.rank_ == 0) { + // std::abort(); + // } + // else { + // while (1) + // ; + // } } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index 7e1bd8866..e08567136 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -22,6 +22,7 @@ class UnifiedDecoder { const float rmsnorm_eps_; cudaStream_t const stream_; IAllocator* const allocator_; + const NcclParam tp_; const DataType dtype_; const int tune_layer_num_; bool is_free_buffer_after_forward_{}; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 0dba71575..909077ba3 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -310,6 +310,10 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.inter_size = model_reader["expert_inter_size"].as(0); moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(false); + moe_param_.routed_scale = model_reader["routed_scale"].as(1.f); + moe_param_.topk_group = model_reader["topk_group"].as(1); + moe_param_.topk_method = model_reader["topk_method"].as("greedy"); + moe_param_.n_group = model_reader["moe_group_num"].as(1); 
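// [Editorial note, not part of the patch] The routed_scale / topk_method / topk_group / n_group
// options parsed above configure DeepSeek-V2 style "group_limited_greedy" routing; the device
// path is invokeMaskMoeTopKGroups followed by invokeMoeGate_V2 in moe_ffn_layer.cc. The
// host-side sketch below only illustrates the assumed semantics (names are illustrative and the
// weighting assumes norm_topk_prob == false); it is not the kernel implementation.
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <utility>
#include <vector>

// Keep the `topk_group` expert groups with the largest per-group max logit, mask the other
// groups to -inf, softmax the surviving logits, then return the `top_k` (expert, weight)
// pairs with weights scaled by `routed_scale`.
static std::vector<std::pair<int, float>>
group_limited_greedy(std::vector<float> logits, int n_group, int topk_group, int top_k, float routed_scale)
{
    const int group_size = static_cast<int>(logits.size()) / n_group;
    auto group_max = [&](int g) {
        return *std::max_element(logits.begin() + g * group_size, logits.begin() + (g + 1) * group_size);
    };
    std::vector<int> groups(n_group);
    std::iota(groups.begin(), groups.end(), 0);
    std::partial_sort(groups.begin(), groups.begin() + topk_group, groups.end(), [&](int a, int b) {
        return group_max(a) > group_max(b);
    });
    std::vector<char> keep(n_group, 0);
    for (int i = 0; i < topk_group; ++i) {
        keep[groups[i]] = 1;
    }
    for (size_t e = 0; e < logits.size(); ++e) {
        if (!keep[e / group_size]) {
            logits[e] = -std::numeric_limits<float>::infinity();  // drop experts outside the kept groups
        }
    }
    const float m = *std::max_element(logits.begin(), logits.end());
    std::vector<float> prob(logits.size());
    float sum = 0.f;
    for (size_t e = 0; e < logits.size(); ++e) {
        sum += prob[e] = std::exp(logits[e] - m);  // masked entries contribute exp(-inf) == 0
    }
    std::vector<int> order(logits.size());
    std::iota(order.begin(), order.end(), 0);
    std::partial_sort(order.begin(), order.begin() + top_k, order.end(), [&](int a, int b) {
        return prob[a] > prob[b];
    });
    std::vector<std::pair<int, float>> out;
    for (int i = 0; i < top_k; ++i) {
        out.emplace_back(order[i], prob[order[i]] / sum * routed_scale);
    }
    return out;
}
// Masking whole groups before the top-k keeps the selection cheap: only topk_group * group_size
// logits stay finite, which is what the mask kernel achieves in place on `logits_`.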
YAML::Node expert_num = model_reader["expert_num"]; for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { moe_param_.expert_num.push_back(it->as()); diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index bdcb9bfc4..d18652ef9 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -281,7 +281,7 @@ class Allocator: public IAllocator { pointer_mapping_.erase(address); } else { - TM_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address); + FT_CHECK_WITH_INFO(0, fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); } } *ptr = nullptr; From bc296e51dd5d8948adab1c3a1dbb1bce5c11b714 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 20 Nov 2024 16:22:40 +0800 Subject: [PATCH 11/21] fix lint --- lmdeploy/turbomind/deploy/loader.py | 2 - .../deploy/source_model/deepseek2.py | 60 +++++++++---------- src/turbomind/kernels/attention/decoding.cu | 2 +- src/turbomind/kernels/norm/rms_norm.cu | 2 +- src/turbomind/kernels/norm/rms_norm.h | 2 +- src/turbomind/models/llama/mla_utils.cu | 2 +- src/turbomind/models/llama/mla_utils.h | 2 +- 7 files changed, 35 insertions(+), 37 deletions(-) diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py index dc319db9f..94e779b6b 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/deploy/loader.py @@ -10,7 +10,6 @@ import torch from safetensors import safe_open -import safetensors # https://github.com/huggingface/transformers/blob/53fad641cfdb5105e2470bcf3ef17ea8e25cc300/src/transformers/modeling_utils.py#L372 WEIGHT_INDEX_NAME = 'pytorch_model.bin.index.json' @@ -89,7 +88,6 @@ def items(self): yield (-1, {k: f.get_tensor(k) for k in misc}) assert not params - # def items(self): # params = defaultdict(dict) # for shard in self.shards: diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py index f780ceeb7..deb69f2a7 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -4,10 +4,10 @@ class DeepSeek2Reader(LlamaReader): - + def moe_ffn_gate(self, i): return self.params.get(f'model.layers.{i}.mlp.gate.weight') - + def moe_ffn_expert(self, e=None, i=None, kind=None): if not kind: return self.filter(r'experts') @@ -18,7 +18,7 @@ def moe_ffn_expert(self, e=None, i=None, kind=None): tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + def _ffn(self, i: int, kind: str): """Get ffn kind for layer i.""" if not kind: @@ -32,29 +32,32 @@ def _ffn(self, i: int, kind: str): tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + def mla(self, i: int, kind: str): if not kind: return self.filter(r'self_attn.*proj') result = [] - for key in ['q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']: + for key in [ + 'q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', + 'kv_b_proj', 'o_proj' + ]: tensor = self.params.get( - f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}' - ) + f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}') tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + def mla_norm(self, i: int): result = [] for k in ['q', 'kv']: - result.append(self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight')) + name = f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight' # noqa: E501 + 
result.append(self.params.get(name)) return (*result, ) - + @INPUT_MODELS.register_module(name='deepseek2') class DeepSeek2Model(LlamaModel): - + Reader = DeepSeek2Reader def tokenizer_info(self): @@ -78,23 +81,20 @@ def model_info(self): inter_size = [n_shared_experts * expert_inter_size] * num_layer inter_size[0] = cfg['intermediate_size'] norm_topk_prob = cfg['norm_topk_prob'] - info.update( - kv_lora_rank=cfg['kv_lora_rank'], - q_lora_rank=cfg['q_lora_rank'] or 0, - qk_rope_dim=qk_rope_dim, - v_head_dim=cfg['v_head_dim'], - size_per_head=qk_rope_dim + qk_nope_dim, - rotary_embedding=qk_rope_dim, - expert_num=expert_num, - expert_inter_size=expert_inter_size, - experts_per_token=experts_per_token, - inter_size=inter_size, - norm_topk_prob=norm_topk_prob, - routed_scale=cfg['routed_scaling_factor'], - topk_method=cfg['topk_method'], - topk_group=cfg['topk_group'], - moe_group_num=cfg['n_group'], - tune_layer_num=2 - ) + info.update(kv_lora_rank=cfg['kv_lora_rank'], + q_lora_rank=cfg['q_lora_rank'] or 0, + qk_rope_dim=qk_rope_dim, + v_head_dim=cfg['v_head_dim'], + size_per_head=qk_rope_dim + qk_nope_dim, + rotary_embedding=qk_rope_dim, + expert_num=expert_num, + expert_inter_size=expert_inter_size, + experts_per_token=experts_per_token, + inter_size=inter_size, + norm_topk_prob=norm_topk_prob, + routed_scale=cfg['routed_scaling_factor'], + topk_method=cfg['topk_method'], + topk_group=cfg['topk_group'], + moe_group_num=cfg['n_group'], + tune_layer_num=2) return info - diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 101b4170e..67bd81e45 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -114,7 +114,7 @@ void dispatchDecoding(const AttentionParams& params) }; if (params.size_per_head == 192) { - + if (is_kv_int8) { invokeDecoding>(params); } diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu index ea5026a09..cba0130b8 100644 --- a/src/turbomind/kernels/norm/rms_norm.cu +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -105,4 +105,4 @@ template void invokeRMSNorm(nv_bfloat16* dst, cudaStream_t st); #endif -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h index ebf49a53e..417daed14 100644 --- a/src/turbomind/kernels/norm/rms_norm.h +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -8,4 +8,4 @@ template void invokeRMSNorm( T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); -} \ No newline at end of file +} diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu index 16999b812..ad38c7550 100644 --- a/src/turbomind/models/llama/mla_utils.cu +++ b/src/turbomind/models/llama/mla_utils.cu @@ -84,4 +84,4 @@ template void invokeMLACopyQKV(uint16_t* qkv, int v_head_dim, cudaStream_t stream); -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h index 8e5cad117..bc06a352f 100644 --- a/src/turbomind/models/llama/mla_utils.h +++ b/src/turbomind/models/llama/mla_utils.h @@ -54,4 +54,4 @@ void dispatchMLACopyQKV(T* qkv, FT_CHECK(0); } -} // namespace turbomind \ No newline at end of file +} // namespace turbomind From 8fb10ba6d82c289453e753b98d0c2cab5caea58e Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 20 Nov 
2024 16:32:39 +0800 Subject: [PATCH 12/21] fix lint --- src/turbomind/kernels/attention/impl_simt.h | 1 - src/turbomind/kernels/attention/mainloop_sm80.h | 4 ++-- src/turbomind/kernels/attention/reduce.cu | 2 +- src/turbomind/kernels/core/math.h | 3 ++- .../flash_attention/flash_attention2/static_switch.h | 4 ++-- src/turbomind/kernels/gemm/convert_v2.cu | 2 +- src/turbomind/models/llama/LlamaDecoderLayerWeight.cc | 1 - src/turbomind/models/llama/LlamaDecoderLayerWeight.h | 2 +- src/turbomind/models/llama/LlamaFfnLayer.cc | 2 +- src/turbomind/models/llama/LlamaWeight.h | 2 +- src/turbomind/models/llama/llama_params.h | 6 ++++-- src/turbomind/models/llama/weight_type.h | 3 ++- src/turbomind/python/bind.cpp | 3 ++- src/turbomind/utils/allocator.h | 3 ++- 14 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index 790fc1b4f..444b67e2c 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -13,7 +13,6 @@ #include "src/turbomind/kernels/attention/impl.h" #include "src/turbomind/kernels/attention/quantization.h" - namespace turbomind::attention { template, Impl_> { template __device__ void operator()(Args&&... args) { - Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args&&)args)...); + Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args &&) args)...); } template @@ -236,7 +236,7 @@ struct Mainloop, Impl_> { Wait(); state_QK.Load(0, 0); - constexpr auto _ = [](int){}; + constexpr auto _ = [](int) {}; auto loop = [&](auto is_residue, auto is_mask) { const int offset_K = tile_iter * CTA_S; diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index 051b2baa7..c654f40d0 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -71,7 +71,7 @@ INSTANTIATE_invokeReduce(128, half); INSTANTIATE_invokeReduce(192, half); #if ENABLE_BF16 -INSTANTIATE_invokeReduce(64, nv_bfloat16) +INSTANTIATE_invokeReduce(64, nv_bfloat16); INSTANTIATE_invokeReduce(128, nv_bfloat16); INSTANTIATE_invokeReduce(192, nv_bfloat16); #endif diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h index c78ab95ab..054269c27 100644 --- a/src/turbomind/kernels/core/math.h +++ b/src/turbomind/kernels/core/math.h @@ -51,7 +51,8 @@ TM_HOST_DEVICE constexpr T lowbit(T x) // https://arxiv.org/abs/1902.01961 template -struct FastDivMod {}; +struct FastDivMod { +}; template<> struct FastDivMod { diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h index ca141ee0b..b1df29cb7 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h @@ -60,8 +60,8 @@ }() #elif 1 #define FWD_HEADDIM_SWITCH(HEADDIM, ...) 
\ - [&] { \ - if (HEADDIM <= 128) { \ + [&] { \ + if (HEADDIM <= 128) { \ constexpr static int kHeadDim = 128; \ return __VA_ARGS__(); \ } \ diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index 90e4b97dd..e58bfc9b9 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -303,7 +303,7 @@ void* make_blocked_ptrs(const std::vector>& ptrs, cudaStre { constexpr int N = 64; Param param{}; - static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 + static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 StridedPtr* ptr{}; cudaMallocAsync(&ptr, sizeof(StridedPtr) * ptrs.size(), stream); param.ptr = ptr; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 7f2c83315..0a2a3be17 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -365,7 +365,6 @@ void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); } - template TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) { diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 59ee0ea5f..9b204ed0d 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -39,7 +39,7 @@ struct LlamaDecoderLayerWeight { size_t tp_rank); ~LlamaDecoderLayerWeight(); - LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; + LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete; void loadModel(std::string dir_path, FtCudaDataType model_file_type); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 5afc75869..907467341 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -30,7 +30,7 @@ template void LlamaFfnLayer::allocateBuffer( size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) { - const size_t sz = token_num * inter_size; + const size_t sz = token_num * inter_size; const size_t sz_gate = token_num * gating_lora_r; const size_t sz_inter = token_num * inter_lora_r; diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index 8e9bfd4eb..629cd5612 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -37,7 +37,7 @@ struct LlamaWeight { ~LlamaWeight(); - LlamaWeight(const LlamaWeight& other) = delete; + LlamaWeight(const LlamaWeight& other) = delete; LlamaWeight& operator=(const LlamaWeight& other) = delete; void loadModel(std::string dir_path); diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 4a13740d7..ea6e21a08 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -40,7 +40,8 @@ struct ModelParam { }; struct MoeParam { - enum Method { + enum Method + { kNaive, kFused } method; @@ -93,7 +94,8 @@ struct EngineParam { int max_prefill_iters; }; -enum class LoraPolicy : int { +enum class LoraPolicy : int +{ kNull, kPlora, }; diff --git 
a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h index 27d7affe5..bc2f49a08 100644 --- a/src/turbomind/models/llama/weight_type.h +++ b/src/turbomind/models/llama/weight_type.h @@ -6,7 +6,8 @@ namespace turbomind { -enum class WeightType : int { +enum class WeightType : int +{ kFP32, kFP16, kFP8, // not supported yet diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 1e74f4bae..b1df396b1 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -297,7 +297,8 @@ PYBIND11_MODULE(_turbomind, m) ft::check_cuda_error(cudaPointerGetAttributes(&at, self->data)); { // Switch to the same device where TM's tenosr memory resides because it's allocated - // from a pool with no peer access enabled (can't be accessed from a context of other devices) + // from a pool with no peer access enabled (can't be accessed from a context of other + // devices) ft::CudaDeviceGuard guard{at.device}; ft::check_cuda_error(cudaMemcpy(const_cast(self->data), const_cast(src->data), diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index d18652ef9..88c299c3d 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -281,7 +281,8 @@ class Allocator: public IAllocator { pointer_mapping_.erase(address); } else { - FT_CHECK_WITH_INFO(0, fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); + FT_CHECK_WITH_INFO(0, + fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); } } *ptr = nullptr; From 103ba46888077be58294db66ccefcc2e7a8a60f9 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 20 Nov 2024 18:41:48 +0800 Subject: [PATCH 13/21] fix ut --- lmdeploy/turbomind/deploy/target_model/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index d84944793..f2c981bb2 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import collections import os.path as osp from abc import ABC +from collections.abc import Sequence import torch import tqdm @@ -92,7 +92,7 @@ def single_to_list(self, config: dict, keys): num_layer = int(config['num_layer']) for k in keys: v = config.get(k, None) - if v is not None and not isinstance(v, collections.Sequence): + if v is not None and not isinstance(v, Sequence): config[k] = [v] * num_layer return config From 9c7dbbd8e1882b7d567024cad23946ddff492df8 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:08:58 +0800 Subject: [PATCH 14/21] Update config.yaml --- autotest/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/autotest/config.yaml b/autotest/config.yaml index 6c92d2cf0..00b65c816 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -62,6 +62,7 @@ turbomind_chat_model: - liuhaotian/llava-v1.6-vicuna-7b - deepseek-ai/deepseek-vl-1.3b-chat - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf - THUDM/glm-4-9b-chat - openbmb/MiniCPM-Llama3-V-2_5 From 8169e975be5c84ff77a0be7eefedcf835da1a2bc Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:09:55 +0800 Subject: [PATCH 15/21] Update config.yaml --- autotest/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/autotest/config.yaml b/autotest/config.yaml index 00b65c816..30d54fa95 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -168,6 +168,7 @@ turbomind_quatization: - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf gptq: - internlm/internlm2_5-7b-chat From 4d710074419774a815d6e706360fe1bfe7660728 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 26 Nov 2024 18:24:52 +0800 Subject: [PATCH 16/21] fix mixtral --- src/turbomind/kernels/gemm/moe_utils_v2.cu | 17 ++++++++--------- src/turbomind/models/llama/unified_decoder.cc | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 66bf634ea..e97482dd3 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -414,7 +414,13 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] #endif - constexpr float kLog2e = 1.4426950408889634074; + // constexpr float kLog2e = 1.4426950408889634074; + // if (k == 0) { + // PRAGMA_UNROLL + // for (int i = 0; i < items_per_thread; ++i) { + // data[i] *= kLog2e; + // } + // } unsigned mask = (unsigned)-1; float max_logit; @@ -438,13 +444,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } - if (k == 0) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] *= kLog2e; - } - } - int g_max_ei = ei; float g_max_val = max_val; if constexpr (threads_per_token > 1) { @@ -487,7 +486,7 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < items_per_thread; ++i) { if (!norm_topk || used[i]) { - data[i] = exp2f(data[i] - max_logit); + data[i] = expf(data[i] - max_logit); sum_prob += data[i]; } } diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 83015e558..d9f556430 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ 
b/src/turbomind/models/llama/unified_decoder.cc @@ -207,7 +207,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - if (weights->at(layer)->ffn_weights.output.output_dims) { + if (weights->at(layer)->ffn_weights.output.kernel) { int layer_id = layer; // int is needed bool all_reduce = !is_moe; TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, From c6bd5fe14be9486ba19ce44a26612d5ce68fefc7 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 27 Nov 2024 15:19:53 +0800 Subject: [PATCH 17/21] fix moe gating & config parsing --- lmdeploy/turbomind/deploy/config.py | 4 +- .../turbomind/deploy/source_model/qwen.py | 2 +- src/turbomind/kernels/gemm/moe_utils_v2.cu | 21 +++++-- src/turbomind/models/llama/llama_utils.cu | 59 ++++++------------- .../triton_backend/llama/LlamaTritonModel.cc | 4 +- 5 files changed, 40 insertions(+), 50 deletions(-) diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index ac86a900d..1474792e2 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -58,8 +58,8 @@ class ModelConfig: expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 - moe_shared_gate: int = False - norm_topk_prob: int = False + moe_shared_gate: bool = False + norm_topk_prob: bool = False routed_scale: float = 1.0 topk_group: int = 1 topk_method: str = 'greedy' diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 772bd0303..637983e8c 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -178,6 +178,6 @@ def model_info(self): info['experts_per_token'] = cfg['num_experts_per_tok'] info['inter_size'] = cfg['shared_expert_intermediate_size'] info['moe_shared_gate'] = True - info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['norm_topk_prob'] = cfg['norm_topk_prob'] info['attn_bias'] = 1 return info diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index e97482dd3..548dc3dda 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -515,9 +515,11 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { - int e = (i + threadIdx.x) % max_expert_num; - int t = (i + threadIdx.x) / max_expert_num; - smem.shared_accum[t][e] = 0; + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + if (t < max_tiles) { + smem.shared_accum[t][e] = 0; + } } __syncthreads(); @@ -538,8 +540,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] masks[expert_id * token_num_padded + ti2] = idx; scales[idx * token_num + ti2] = scale * routed_scale; atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); - - // printf("%d %d %f\n", idx, expert_id, scale); } } @@ -613,6 +613,17 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n if (experts <= 8) { if (experts_per_token <= 2) { + // MoeGateKernel_V2<2, 128><<>>(scales, + // (int8_t*)masks, + // accum, + // logits, + // log_tile, + // tiles, + // tokens, + // tokens_padded, + // experts); + + // std::cout << tokens << " " << experts << " " << experts_per_token << " " << tokens_padded << "\n"; invoke(_Int<8>, _Int<2>, _Int<8>, _Int<4>); } else { diff 
--git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 570e33b77..d0ded16cb 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -1,49 +1,26 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/cuda_utils.h" #include #include #include #include +#include +#include + #include #include #include #include #include -#include + +#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { CmpMode compare_mode = kCmpRead; // CmpMode compare_mode = kCmpWrite; -template -struct abs_diff_t { - using type = T; -}; - -template<> -struct abs_diff_t { - using type = float; -}; - -template<> -struct abs_diff_t<__nv_bfloat16> { - using type = float; -}; - -template -struct abs_diff: public thrust::unary_function, typename abs_diff_t::type> { - __host__ __device__ float operator()(thrust::tuple x) const - { - using R = typename abs_diff_t::type; - auto r = R(thrust::get<0>(x)) - R(thrust::get<1>(x)); - return r < R(0) ? -r : r; - } -}; - template void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) { @@ -64,10 +41,8 @@ void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) template void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) { - // wait for b - check_cuda_error(cudaStreamSynchronize(stream)); // read a from file - thrust::host_vector h_a(size); + std::vector h_a(size); { const auto filename = "tmp/" + key + ".cmp"; std::ifstream ifs(filename, std::ios::binary); @@ -86,17 +61,21 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) } ifs.read((char*)h_a.data(), sizeof(T) * h_a.size()); } - // copy a to device - thrust::device_vector a = h_a; - // create abs(a - b) iterator - thrust::device_ptr dev_ptr(ptr); - auto zip_iter = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), dev_ptr)); - auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff{}); - // sum(abs(a - b)) - auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size); + std::vector h_b(size); + check_cuda_error(cudaMemcpyAsync(h_b.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); + + using Tacc = std::conditional_t, int64_t, float>; + + Tacc asum{}; + for (size_t i = 0; i < size; ++i) { + asum += std::abs((Tacc)h_a[i] - (Tacc)h_b[i]); + } + std::cerr << key << ": " << asum << " " << asum / size << "\n"; check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); } template diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 2939b4c71..ba964ddf0 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -315,8 +315,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); - moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); - moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(false); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(); + 
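// [Editorial note, not part of the patch] The `moe_shared_gate` flag read above enables the
// Qwen2-MoE shared expert, whose FFN output is blended into the routed-expert sum through a
// scalar sigmoid gate. The sketch below shows the assumed combination for one token, mirroring
// the upstream Qwen2-MoE formulation; the actual fusion lives in moe_ffn_layer.cc and the
// shared-expert FFN path, which this patch only shows in part.
#include <cmath>
#include <vector>

// `shared_gate_logit` is the dot product of the token activation with the exported
// `shared_gate.weight`; `routed_out` is the scale-weighted sum of the top-k expert outputs.
static std::vector<float> combine_shared_expert(const std::vector<float>& routed_out,
                                                const std::vector<float>& shared_out,
                                                float shared_gate_logit)
{
    const float g = 1.f / (1.f + std::exp(-shared_gate_logit));  // sigmoid gate in [0, 1]
    std::vector<float> out(routed_out.size());
    for (size_t i = 0; i < out.size(); ++i) {
        out[i] = routed_out[i] + g * shared_out[i];
    }
    return out;
}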
moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(); moe_param_.routed_scale = model_reader["routed_scale"].as(1.f); moe_param_.topk_group = model_reader["topk_group"].as(1); moe_param_.topk_method = model_reader["topk_method"].as("greedy"); From 3d4f22f76e3199934a33047161e3ef47ba233de3 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 27 Nov 2024 18:15:37 +0800 Subject: [PATCH 18/21] fix yarn for deepseek-v2 --- .../deploy/source_model/deepseek2.py | 25 +++++++++++++++++++ .../models/llama/unified_attention_layer.cc | 3 ++- src/turbomind/models/llama/unified_decoder.cc | 9 ------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py index deb69f2a7..0a4a9e189 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +import math + from .base import INPUT_MODELS from .llama import LlamaModel, LlamaReader @@ -55,6 +57,23 @@ def mla_norm(self, i: int): return (*result, ) +def get_yarn_attention_factor(rope_scaling: dict): + + scaling_factor = float(rope_scaling['factor']) + mscale = rope_scaling['mscale'] + mscale_all_dim = rope_scaling['mscale_all_dim'] + + def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + _mscale = float( + yarn_get_mscale(scaling_factor, mscale) / + yarn_get_mscale(scaling_factor, mscale_all_dim)) + return _mscale + + @INPUT_MODELS.register_module(name='deepseek2') class DeepSeek2Model(LlamaModel): @@ -97,4 +116,10 @@ def model_info(self): topk_group=cfg['topk_group'], moe_group_num=cfg['n_group'], tune_layer_num=2) + rope_scaling = cfg.get('rope_scaling') + if rope_scaling and rope_scaling['type'] == 'yarn': + info.update( + max_position_embeddings=rope_scaling[ + 'original_max_position_embeddings'], + attention_factor=get_yarn_attention_factor(rope_scaling)) return info diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index bf9b97378..e7d57db91 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -334,8 +334,9 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa }; float low, high; find_correction_range(param_.beta_fast, param_.beta_slow, low, high); + // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216 if (low == high) { - high += 0.01f; + high += 0.001f; } params.yarn_ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0; params.yarn_ramp_inv_factor_mul_min = 1.0 / (high - low) * low; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index d9f556430..d5fdf96e5 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -278,15 +278,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con // Wait for `h_cu_q/k_len_` to be consumed check_cuda_error(cudaEventSynchronize(ev_h_cu_x_)); - - // check_cuda_error(cudaStreamSynchronize(stream_)); - // if (tp_.rank_ == 0) { - // std::abort(); - // } - // else { - // while (1) - // ; - // } } #ifdef ENABLE_FP32 From 329e4416f72b912c1362fe2df657b3a66b1e4d94 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 27 Nov 2024 
20:22:09 +0800 Subject: [PATCH 19/21] fix `copy_from` --- src/turbomind/python/bind.cpp | 58 +++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index b1df396b1..5a344d954 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -215,6 +215,51 @@ DLTensor GetDLTensor(py::object obj) return dlmt->dl_tensor; } +static void safe_memcpy(void* dst, const void* src, size_t size) +{ + cudaPointerAttributes dat{}; + cudaPointerAttributes sat{}; + ft::check_cuda_error(cudaPointerGetAttributes(&dat, dst)); + ft::check_cuda_error(cudaPointerGetAttributes(&sat, src)); + try { + if (dat.devicePointer && sat.devicePointer) { + // Both can be accessed from current context + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + else if (dat.type == cudaMemoryTypeDevice && sat.type == cudaMemoryTypeDevice) { + if (dat.device != sat.device) { + // On different devices, try peer memcpy + ft::check_cuda_error(cudaMemcpyPeer(dst, dat.device, src, sat.device, size)); + } + else { + // Same device, switch to the device first (this is unlikely) + ft::CudaDeviceGuard guard(dat.device); + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + else { + // Unknown case, give it a try anyway + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + catch (...) { + int device_id{-1}; + cudaGetDevice(&device_id); + TM_LOG_ERROR("cudaMemcpy failed: dst=(%d, %d, %p, %p), src=(%d, %d, %p, %p), size=%s, device=%d", + (int)dat.type, + dat.device, + dat.devicePointer, + dat.hostPointer, + (int)sat.type, + sat.device, + sat.devicePointer, + sat.hostPointer, + std::to_string(size).c_str(), + device_id); + throw; + } +} + PYBIND11_MODULE(_turbomind, m) { // nccl param @@ -293,18 +338,7 @@ PYBIND11_MODULE(_turbomind, m) std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - cudaPointerAttributes at{}; - ft::check_cuda_error(cudaPointerGetAttributes(&at, self->data)); - { - // Switch to the same device where TM's tenosr memory resides because it's allocated - // from a pool with no peer access enabled (can't be accessed from a context of other - // devices) - ft::CudaDeviceGuard guard{at.device}; - ft::check_cuda_error(cudaMemcpy(const_cast(self->data), - const_cast(src->data), - num_bytes, - cudaMemcpyDefault)); - } + safe_memcpy(const_cast(self->data), src->data, num_bytes); break; } default: From 90d2529791ec6b8342d419fabc01f393dcec2ef2 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 28 Nov 2024 21:14:45 +0800 Subject: [PATCH 20/21] fix rms norm, rotary embedding & deepseek v2 attention --- lmdeploy/turbomind/deploy/config.py | 1 + lmdeploy/turbomind/deploy/module.py | 8 - .../deploy/source_model/deepseek2.py | 23 ++- .../kernels/attention/rotary_embedding.h | 17 ++ src/turbomind/kernels/gemm/moe_utils_v2.cu | 19 +++ src/turbomind/kernels/norm/rms_norm.cu | 159 ++++++++++++++++-- src/turbomind/kernels/norm/rms_norm.h | 10 ++ src/turbomind/models/llama/llama_params.h | 1 + src/turbomind/models/llama/llama_utils.cu | 15 +- src/turbomind/models/llama/mla_utils.cu | 44 ++--- .../models/llama/unified_attention_layer.cc | 29 +++- src/turbomind/models/llama/unified_decoder.cc | 66 ++++++-- .../triton_backend/llama/LlamaTritonModel.cc | 1 + 13 files changed, 320 insertions(+), 
73 deletions(-) diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 1474792e2..e483500e9 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -84,6 +84,7 @@ def verify(self): class AttentionConfig: rotary_embedding: int = 128 rope_theta: float = 10000.0 + softmax_scale: float = 0 attention_factor: float = None max_position_embeddings: int = 0 original_max_position_embeddings: int = 0 diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index aa98ed646..52497175e 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -245,14 +245,6 @@ def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): q_b = q cfg = self.model.model_config - qk_nope_dim = cfg.size_per_head - cfg.qk_rope_dim - - q_b = q_b.reshape(-1, cfg.size_per_head) - - # [nope_dim | rope_dim] -> [rope_dim | nope_dim] - q_nope, q_pe = torch.split(q_b, (qk_nope_dim, cfg.qk_rope_dim), dim=-1) - q_b = torch.cat((q_pe, q_nope), - dim=-1).view(-1, cfg.head_num * cfg.size_per_head) o = o.reshape(cfg.head_num, cfg.v_head_dim, -1) o = torch.nn.functional.pad( diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py index 0a4a9e189..0023f650f 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -57,7 +57,7 @@ def mla_norm(self, i: int): return (*result, ) -def get_yarn_attention_factor(rope_scaling: dict): +def get_yarn_params(rope_scaling: dict): scaling_factor = float(rope_scaling['factor']) mscale = rope_scaling['mscale'] @@ -71,7 +71,13 @@ def yarn_get_mscale(scale=1, mscale=1): _mscale = float( yarn_get_mscale(scaling_factor, mscale) / yarn_get_mscale(scaling_factor, mscale_all_dim)) - return _mscale + + softmax_scale = 0 + if mscale_all_dim: + scale = yarn_get_mscale(scaling_factor, mscale_all_dim) + softmax_scale = scale * scale + + return _mscale, softmax_scale @INPUT_MODELS.register_module(name='deepseek2') @@ -100,11 +106,12 @@ def model_info(self): inter_size = [n_shared_experts * expert_inter_size] * num_layer inter_size[0] = cfg['intermediate_size'] norm_topk_prob = cfg['norm_topk_prob'] + size_per_head = qk_rope_dim + qk_nope_dim info.update(kv_lora_rank=cfg['kv_lora_rank'], q_lora_rank=cfg['q_lora_rank'] or 0, qk_rope_dim=qk_rope_dim, v_head_dim=cfg['v_head_dim'], - size_per_head=qk_rope_dim + qk_nope_dim, + size_per_head=size_per_head, rotary_embedding=qk_rope_dim, expert_num=expert_num, expert_inter_size=expert_inter_size, @@ -118,8 +125,10 @@ def model_info(self): tune_layer_num=2) rope_scaling = cfg.get('rope_scaling') if rope_scaling and rope_scaling['type'] == 'yarn': - info.update( - max_position_embeddings=rope_scaling[ - 'original_max_position_embeddings'], - attention_factor=get_yarn_attention_factor(rope_scaling)) + attention_factor, softmax_scale = get_yarn_params(rope_scaling) + softmax_scale *= size_per_head**(-0.5) + info.update(max_position_embeddings=rope_scaling[ + 'original_max_position_embeddings'], + attention_factor=attention_factor, + softmax_scale=softmax_scale) return info diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h index 8e09da22c..db836ed18 100644 --- a/src/turbomind/kernels/attention/rotary_embedding.h +++ b/src/turbomind/kernels/attention/rotary_embedding.h @@ -131,6 +131,7 @@ struct FastRoPE { template __device__ void apply(Array& x, 
float timestep) { +#if 0 PRAGMA_UNROLL for (int i = 0; i < N; i += 2) { float c, s; @@ -144,6 +145,22 @@ struct FastRoPE { x[i + 1] = (T)tmp1; } } +#else + // Most models apply rotary embedding in half precision + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + float c, s; + sincosf(timestep * inv_freq_[i / 2], &s, &c); + s *= attention_scaling_; + c *= attention_scaling_; + T tmp0 = (T)c * x[i] - (T)s * x[i + 1]; + T tmp1 = (T)c * x[i + 1] + (T)s * x[i]; + if (is_valid_) { + x[i] = tmp0; + x[i + 1] = tmp1; + } + } +#endif } }; diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 548dc3dda..a9e4f7da5 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -733,6 +733,7 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] } for (int i = threadIdx.x; i < dims; i += block_dim) { +#if 1 Array accum{}; if (dst_scale) { Vec v; @@ -749,6 +750,24 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] accum = accum + x; } Store(dst_ptr[i].data(), cast(accum)); +#else + Array accum{}; + if (dst_scale) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = v * (T)dst_scale; + } + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + Vec v; + Ldg(v, src_ptr[e][i].data()); + using namespace ops; + const auto x = v * (T)scale[e]; + accum = accum + x; + } + Store(dst_ptr[i].data(), accum); +#endif } } diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu index cba0130b8..22fd69f52 100644 --- a/src/turbomind/kernels/norm/rms_norm.cu +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -8,8 +8,15 @@ namespace turbomind { template -__global__ void RMSNormKernel( - T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, float inv_dims) +__global__ void RMSNormKernel(T* dst, + int dst_ld, + const T* src, + int src_ld, + const T* __restrict__ weights, + int dims, + int num, + float eps, + float inv_dims) { const int ti = blockIdx.x; const int di = threadIdx.x * vec_size; @@ -56,13 +63,13 @@ __global__ void RMSNormKernel( Array sv; for (int i = di; i < dims; i += block_dim * vec_size) { Load(vec, &src[i]); - Array tmp = cast(vec); - Load(sv, &weights[i]); + Ldg(sv, &weights[i]); PRAGMA_UNROLL for (int c = 0; c < vec_size; ++c) { - tmp[c] *= (float)sv[c] * sum; + vec[c] = (T)((float)vec[c] * sum) * sv[c]; + // vec[c] = (T)((float)vec[c] * sum * (float)sv[c]); } - Store(&dst[i], cast(tmp)); + Store(&dst[i], vec); } } @@ -70,18 +77,20 @@ template void invokeRMSNorm( T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) { - constexpr int threads = 256; + constexpr int vec_size = 16 / sizeof(T); + + constexpr int threads = 512; const int blocks = num; - RMSNormKernel<<>>(dst, // - dst_ld, - src, - src_ld, - weights, - dims, - num, - eps, - 1.f / dims); + RMSNormKernel<<>>(dst, // + dst_ld, + src, + src_ld, + weights, + dims, + num, + eps, + 1.f / dims); } template void invokeRMSNorm(half* dst, @@ -105,4 +114,122 @@ template void invokeRMSNorm(nv_bfloat16* dst, cudaStream_t st); #endif +// r' <- r + (h + b) +// h' <- norm(r') * w +template +__global__ void BiasResidualRMSNormKernel(T* __restrict__ residual, + T* __restrict__ hidden_states, + const T* __restrict__ weights, + const T* __restrict__ bias, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + 
residual += dims * ti; + hidden_states += dims * ti; + + Array accum{}; + + Array r_vec; + Array h_vec; + Array b_vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Load(h_vec, &hidden_states[i]); + + using namespace ops; + r_vec = r_vec + h_vec; + + if (bias) { + Ldg(b_vec, &bias[i]); + r_vec = r_vec + b_vec; + } + + Store(&residual[i], r_vec); + + Array tmp = cast(r_vec); + + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + Array w_vec; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Ldg(w_vec, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + r_vec[c] = (T)((float)r_vec[c] * sum) * w_vec[c]; + } + Store(&hidden_states[i], r_vec); + } +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + constexpr int threads = 512; + const int blocks = num; + + BiasResidualRMSNormKernel<<>>(residual, // + hidden_states, + weights, + bias, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeBiasResidualRMSNorm(half* residual, + half* hidden_states, + const half* weights, + const half* bias, + int dims, + int num, + float eps, + cudaStream_t st); + +#if ENABLE_BF16 +template void invokeBiasResidualRMSNorm(nv_bfloat16* residual, + nv_bfloat16* hidden_states, + const nv_bfloat16* weights, + const nv_bfloat16* bias, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + } // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h index 417daed14..83fa0f826 100644 --- a/src/turbomind/kernels/norm/rms_norm.h +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -8,4 +8,14 @@ template void invokeRMSNorm( T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); +template +void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st); } + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index ea6e21a08..0a505b11a 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -63,6 +63,7 @@ struct AttentionParam { int rotary_embedding_dim; float rotary_embedding_base; int max_position_embeddings; + float softmax_scale; std::string rope_scaling_type; int original_max_position_embeddings; float rope_scaling_factor; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index d0ded16cb..eaa450ae2 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -65,14 +65,23 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) check_cuda_error(cudaMemcpyAsync(h_b.data(), 
ptr, sizeof(T) * size, cudaMemcpyDefault, stream)); check_cuda_error(cudaStreamSynchronize(stream)); - using Tacc = std::conditional_t, int64_t, float>; + using Tacc = std::conditional_t, int64_t, float>; + constexpr Tacc eps = std::is_integral_v ? 1 : 1e-8f; Tacc asum{}; + Tacc rsum{}; + Tacc amean{}; for (size_t i = 0; i < size; ++i) { - asum += std::abs((Tacc)h_a[i] - (Tacc)h_b[i]); + Tacc x = (Tacc)h_b[i]; + Tacc r = (Tacc)h_a[i]; + Tacc abs_diff = std::abs(x - r); + Tacc rel_diff = abs_diff / std::max(std::max(std::abs(r), std::abs(x)), eps); + asum += abs_diff; + rsum += rel_diff; + amean += std::abs(r); } - std::cerr << key << ": " << asum << " " << asum / size << "\n"; + std::cerr << key << ": " << amean / size << " " << asum << " " << asum / size << " " << rsum / size << "\n"; check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); check_cuda_error(cudaStreamSynchronize(stream)); diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu index ad38c7550..2f9e786f2 100644 --- a/src/turbomind/models/llama/mla_utils.cu +++ b/src/turbomind/models/llama/mla_utils.cu @@ -17,32 +17,38 @@ __global__ void mla_copy_qkv_kernel(T* qkv, { const int type = blockIdx.y; - const int ti = blockIdx.x; - const int di = threadIdx.x; + const int64_t ti = blockIdx.x; + const int di = threadIdx.x; const int kv_b_dim = nope_dim + v_head_dim; - for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { - Array data{}; - if (type == 0) { // Q - Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size]); + // for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { + const int hi = threadIdx.y; + Array data{}; + if (type == 0) { // Q + if (di * vec_size < rope_dim) { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + nope_dim + di * vec_size]); } - else if (type == 1) { // K - if (di * vec_size < rope_dim) { - Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); - } - else { - Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); - } + else { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size - rope_dim]); } - else { // V - if (di * vec_size < v_head_dim) { - Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); - } + } + else if (type == 1) { // K + if (di * vec_size < rope_dim) { + Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); + } + else { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); + } + } + else { // V + if (di * vec_size < v_head_dim) { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); } - const int ti_stride = 3 * head_num * head_dim; - Store(&qkv[ti * ti_stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); } + const int stride = 3 * head_num * head_dim; + Store(&qkv[ti * stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); + // } } template diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index e7d57db91..7462176f9 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -300,8 +300,15 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.num_heads = local_head_num_; params.num_kv_heads = local_kv_head_num_; params.size_per_head = 
size_per_head_; + // MSVC does not have M_LOG2E - params.inv_sqrt_dh = (float)std::log2(expf(1.)) / std::sqrt((float)params.size_per_head); + params.inv_sqrt_dh = (float)std::log2(expf(1.)); + if (param_.softmax_scale) { // model predefined softmax scale + params.inv_sqrt_dh *= param_.softmax_scale; + } + else { // default value + params.inv_sqrt_dh /= std::sqrt((float)params.size_per_head); + } params.rotary_embedding_dim = param_.rotary_embedding_dim; params.rotary_embedding_base = param_.rotary_embedding_base; @@ -421,6 +428,24 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa count_and_fix(qkv_buf_3_, token_num * weights->output.input_dims, Concat("attn", layer_id), 3); +#if 0 + if (!isTuning()) { + T* o{}; + cudaMallocAsync(&o, sizeof(T) * token_num * head_num_ * size_per_head_, stream_); + cudaMemsetAsync(o, 0, sizeof(T) * token_num * head_num_ * size_per_head_, stream_); + auto dst = o; + auto src = qkv_buf_3_; + for (int i = 0; i < token_num; ++i) { + for (int j = 0; j < head_num_; ++j) { + cudaMemcpyAsync(dst, src, sizeof(T) * 128, cudaMemcpyDefault, stream_); + src += 192; + dst += 128; + } + } + Compare(o, token_num * head_num_ * 128, "attn", kCmpRead, stream_); + } +#endif + ////////////////////////////////////////////// /// output gemm -> linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); @@ -458,7 +483,7 @@ void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const T* q{}; - if (w.q_proj.output_dims) { + if (w.q_proj.kernel) { deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); linear_->forward(q, inputs, token_num, w.q_proj); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index d5fdf96e5..1352669a4 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -1,14 +1,17 @@ -#include "src/turbomind/models/llama/unified_decoder.h" + +#include + +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" +#include "src/turbomind/models/llama/unified_decoder.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include namespace turbomind { @@ -144,19 +147,25 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con const int pf_offset = dc_batch_size; + const bool flag = false && !isTuning(); + // Compare(decoder_input_output, token_num * hidden_units_, "decoder_input", kCmpRead, stream_); // printf("%d %f\n", (int)token_num, rmsnorm_eps_); + if (flag) { + Compare(decoder_input_output, token_num * hidden_units_, "norm0", kCmpRead, stream_); + } + ///////////////////////////////////////////// /// RMSNorm - invokeRootMeanSquareNorm(decoder_output, - decoder_input_output, - weights->at(0)->self_attn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeRMSNorm(decoder_output, + decoder_input_output, + weights->at(0)->self_attn_norm_weights, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", 0), 2); @@ -168,7 +177,9 @@ 
void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con continue; } - // Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); + if (flag) { + Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); + } ///////////////////////////////////////////// /// self-attention @@ -182,16 +193,26 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); - invokeFusedAddBiasResidualRMSNorm(decoder_input_output, - decoder_output, - weights->at(layer)->self_attn_weights.output.bias, - weights->at(layer)->ffn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + if (flag) { + Compare(decoder_input_output, token_num * hidden_units_, "res0", kCmpRead, stream_); + Compare(decoder_output, token_num * hidden_units_, "attn_out", kCmpRead, stream_); + } + + invokeBiasResidualRMSNorm(decoder_input_output, + decoder_output, + weights->at(layer)->ffn_norm_weights, + weights->at(layer)->self_attn_weights.output.bias, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); + if (flag) { + Compare(decoder_input_output, token_num * hidden_units_, "res1", kCmpRead, stream_); + Compare(decoder_output, token_num * hidden_units_, "ffn_in", kCmpRead, stream_); + } + count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual0", layer), 2); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm1", layer), 2); @@ -228,6 +249,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); } + if (flag) { + Compare(decoder_output, token_num * hidden_units_, "ffn_out", kCmpRead, stream_); + } + // if (tp_.rank_ == 0) { // Compare(decoder_output, token_num * hidden_units_, Concat("moe_ffn_out", layer), compare_mode, stream_); // } @@ -250,6 +275,11 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual1", layer), 2); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", layer + 1), 2); + + if (flag) { + cudaStreamSynchronize(stream_); + std::abort(); + } } if (dc_batch_size) { diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index ba964ddf0..1c7c5eb46 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -279,6 +279,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, // rotary embedding parameters attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); attn_param_.rotary_embedding_base = attention_reader["rope_theta"].as(10000.0f); + attn_param_.softmax_scale = attention_reader["softmax_scale"].as(0); attn_param_.attention_factor = attention_reader["attention_factor"].as(-1.f); attn_param_.beta_fast = attention_reader["beta_fast"].as(32.f); attn_param_.beta_slow = attention_reader["beta_slow"].as(1.f); From f565ef71522a876f10accf0483610238030dc36b Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 28 Nov 2024 21:42:01 +0800 Subject: [PATCH 21/21] remove debug code --- .../models/llama/unified_attention_layer.cc | 18 -------- src/turbomind/models/llama/unified_decoder.cc | 45 ------------------- 2 files changed, 63 
deletions(-) diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 7462176f9..7a6eddc4b 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -428,24 +428,6 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa count_and_fix(qkv_buf_3_, token_num * weights->output.input_dims, Concat("attn", layer_id), 3); -#if 0 - if (!isTuning()) { - T* o{}; - cudaMallocAsync(&o, sizeof(T) * token_num * head_num_ * size_per_head_, stream_); - cudaMemsetAsync(o, 0, sizeof(T) * token_num * head_num_ * size_per_head_, stream_); - auto dst = o; - auto src = qkv_buf_3_; - for (int i = 0; i < token_num; ++i) { - for (int j = 0; j < head_num_; ++j) { - cudaMemcpyAsync(dst, src, sizeof(T) * 128, cudaMemcpyDefault, stream_); - src += 192; - dst += 128; - } - } - Compare(o, token_num * head_num_ * 128, "attn", kCmpRead, stream_); - } -#endif - ////////////////////////////////////////////// /// output gemm -> linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 1352669a4..ec0e75b7e 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -147,16 +147,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con const int pf_offset = dc_batch_size; - const bool flag = false && !isTuning(); - - // Compare(decoder_input_output, token_num * hidden_units_, "decoder_input", kCmpRead, stream_); - - // printf("%d %f\n", (int)token_num, rmsnorm_eps_); - - if (flag) { - Compare(decoder_input_output, token_num * hidden_units_, "norm0", kCmpRead, stream_); - } - ///////////////////////////////////////////// /// RMSNorm invokeRMSNorm(decoder_output, @@ -177,10 +167,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con continue; } - if (flag) { - Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); - } - ///////////////////////////////////////////// /// self-attention forwardSelfAttn(decoder_output, // @@ -193,11 +179,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); - if (flag) { - Compare(decoder_input_output, token_num * hidden_units_, "res0", kCmpRead, stream_); - Compare(decoder_output, token_num * hidden_units_, "attn_out", kCmpRead, stream_); - } - invokeBiasResidualRMSNorm(decoder_input_output, decoder_output, weights->at(layer)->ffn_norm_weights, @@ -208,21 +189,12 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con stream_); sync_check_cuda_error(); - if (flag) { - Compare(decoder_input_output, token_num * hidden_units_, "res1", kCmpRead, stream_); - Compare(decoder_output, token_num * hidden_units_, "ffn_in", kCmpRead, stream_); - } - count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual0", layer), 2); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm1", layer), 2); //////////////////////////////////////////// /// feed-forward network - // if (tp_.rank_ == 0) { - // Compare(decoder_output, token_num * hidden_units_, Concat("ffn_input", layer), compare_mode, stream_); - // } - const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); if 
(is_moe) {
             moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights);
@@ -241,22 +213,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con
             ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights);
         }
 
-        // if (tp_.rank_ == 0) {
-        //     Compare(decoder_output, token_num * hidden_units_, Concat("ffn_out", layer), compare_mode, stream_);
-        // }
-
         if (is_moe) {
             moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights);
         }
 
-        if (flag) {
-            Compare(decoder_output, token_num * hidden_units_, "ffn_out", kCmpRead, stream_);
-        }
-
-        // if (tp_.rank_ == 0) {
-        //     Compare(decoder_output, token_num * hidden_units_, Concat("moe_ffn_out", layer), compare_mode, stream_);
-        // }
-
         count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2);
 
         const bool is_last_layer = layer == layer_num_ - 1;
@@ -275,11 +235,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con
 
         count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual1", layer), 2);
         count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", layer + 1), 2);
-
-        if (flag) {
-            cudaStreamSynchronize(stream_);
-            std::abort();
-        }
     }
 
     if (dc_batch_size) {
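
Reviewer note: the fused path added in rms_norm.cu computes r' <- r + (h + b) and then h' <- RMSNorm(r') * w, as the kernel comment states. A minimal host-side reference of that contract may help when validating the CUDA kernel against a golden output; this is a float-only sketch with illustrative names (bias_residual_rms_norm_ref is not part of the patch), and bit-exact agreement with the half/bf16 kernel is not expected since the kernel applies the weight in T precision.

// bias_residual_rms_norm_ref.cc -- host-side reference for the fused kernel's math.
// Assumptions: row-major [num, dims] buffers, optional bias of length dims.
#include <cmath>
#include <vector>

void bias_residual_rms_norm_ref(std::vector<float>&       residual,       // [num, dims], updated in place (r')
                                std::vector<float>&       hidden_states,  // [num, dims], receives normed output (h')
                                const std::vector<float>& weights,        // [dims]
                                const std::vector<float>* bias,           // optional [dims], may be nullptr
                                int                       dims,
                                int                       num,
                                float                     eps)
{
    for (int t = 0; t < num; ++t) {
        float* r      = residual.data() + (size_t)t * dims;
        float* h      = hidden_states.data() + (size_t)t * dims;
        float  sum_sq = 0.f;
        // r' <- r + (h + b), accumulating the squared norm of r'
        for (int d = 0; d < dims; ++d) {
            r[d] += h[d] + (bias ? (*bias)[d] : 0.f);
            sum_sq += r[d] * r[d];
        }
        const float inv_rms = 1.f / std::sqrt(sum_sq / dims + eps);
        // h' <- norm(r') * w
        for (int d = 0; d < dims; ++d) {
            h[d] = r[d] * inv_rms * weights[d];
        }
    }
}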
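
The extended CmpRead in llama_utils.cu now prints four statistics per key: the mean magnitude of the reference values, the summed absolute error, the mean absolute error, and the mean relative error (with a small eps guarding near-zero denominators). The standalone sketch below recomputes the same bookkeeping on the host; the function name and the ref/x argument names are illustrative only, with ref playing the role of h_a (values read from disk) and x the role of h_b (the live tensor).

// cmp_metrics_ref.cc -- illustrative recomputation of the statistics printed by CmpRead.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void print_cmp_metrics(const char* key, const std::vector<float>& ref, const std::vector<float>& x)
{
    const float eps  = 1e-8f;  // guards the relative error when both values are ~0
    double      asum = 0, rsum = 0, amean = 0;
    for (size_t i = 0; i < ref.size(); ++i) {
        const float abs_diff = std::abs(x[i] - ref[i]);
        const float denom    = std::max({std::abs(ref[i]), std::abs(x[i]), eps});
        asum += abs_diff;
        rsum += abs_diff / denom;
        amean += std::abs(ref[i]);
    }
    const double n = (double)ref.size();
    // Same order as the patched CmpRead: mean |ref|, total abs diff, mean abs diff, mean rel diff.
    std::printf("%s: %g %g %g %g\n", key, amean / n, asum, asum / n, rsum / n);
}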
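
The softmax-scale handling added to UnifiedAttentionLayer folds an optional model-defined scale into the log2-domain factor used by the attention kernels (log2(e) is computed at runtime because MSVC lacks M_LOG2E). A small sketch of the effective logic, under the assumption that a softmax_scale of 0 means "unset" (matching the default wired up in LlamaTritonModel.cc); the helper name is hypothetical:

#include <cmath>

// Returns the factor applied to Q*K^T before the exp2-based softmax.
inline float attention_scale(float softmax_scale, int size_per_head)
{
    float inv_sqrt_dh = std::log2(std::exp(1.f));  // log2(e); avoids M_LOG2E, which MSVC does not define
    if (softmax_scale != 0.f) {
        inv_sqrt_dh *= softmax_scale;              // model-predefined softmax scale
    }
    else {
        inv_sqrt_dh /= std::sqrt((float)size_per_head);  // default 1/sqrt(d)
    }
    return inv_sqrt_dh;
}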