Commit c6bd5fe

fix moe gating & config parsing
1 parent 9b1a82b commit c6bd5fe

5 files changed (+40, -50 lines)


lmdeploy/turbomind/deploy/config.py (+2, -2)

@@ -58,8 +58,8 @@ class ModelConfig:
     expert_num: List[int] = ()
     expert_inter_size: int = 0
     experts_per_token: int = 0
-    moe_shared_gate: int = False
-    norm_topk_prob: int = False
+    moe_shared_gate: bool = False
+    norm_topk_prob: bool = False
     routed_scale: float = 1.0
     topk_group: int = 1
     topk_method: str = 'greedy'
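
Note on the fix: both fields carried bool defaults but were annotated as int, so the generated config advertised the wrong type to readers. For orientation, a sketch of the C++-side parameter block these fields map onto (field names and types taken from the LlamaTritonModel.cc hunk below; the actual struct definition is not part of this diff and may differ):

    #include <string>

    // Sketch of the MoE parameter block the config fields feed into.
    // Hypothetical layout; only the names/types are taken from this diff.
    struct MoeParam {
        int         experts_per_token = 0;
        int         inter_size        = 0;       // expert_inter_size
        bool        shared_gate       = false;   // moe_shared_gate (was parsed as int)
        bool        norm_topk_prob    = false;   // was exported under a mismatched key
        float       routed_scale      = 1.f;
        int         topk_group        = 1;
        std::string topk_method       = "greedy";
    };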

lmdeploy/turbomind/deploy/source_model/qwen.py (+1, -1)

@@ -178,6 +178,6 @@ def model_info(self):
         info['experts_per_token'] = cfg['num_experts_per_tok']
         info['inter_size'] = cfg['shared_expert_intermediate_size']
         info['moe_shared_gate'] = True
-        info['moe_norm_topk_prob'] = cfg['norm_topk_prob']
+        info['norm_topk_prob'] = cfg['norm_topk_prob']
         info['attn_bias'] = 1
         return info
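
The gating flag was being exported as moe_norm_topk_prob while the TurboMind reader looks up norm_topk_prob (see LlamaTritonModel.cc below), so the value never reached the engine; renaming the key fixes the round trip. For context, norm_topk_prob in Qwen2-MoE style gating controls whether the selected top-k routing weights are renormalized to sum to 1 before use. A minimal host-side sketch of that step, illustrative only (the real implementation is MoeGateKernel_v8 in the next file):

    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <utility>
    #include <vector>

    // Top-k softmax gating for one token. Hypothetical reference code, not
    // the TurboMind kernel: it only shows what norm_topk_prob/routed_scale do.
    std::vector<std::pair<int, float>>
    gate(const std::vector<float>& logits, int k, bool norm_topk_prob, float routed_scale)
    {
        // softmax over all expert logits
        float m = *std::max_element(logits.begin(), logits.end());
        std::vector<float> p(logits.size());
        float sum = 0.f;
        for (size_t e = 0; e < logits.size(); ++e) {
            p[e] = std::exp(logits[e] - m);
            sum += p[e];
        }
        for (auto& x : p) {
            x /= sum;
        }
        // pick the k largest probabilities
        std::vector<int> idx(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [&](int a, int b) { return p[a] > p[b]; });
        // optionally renormalize the selected weights to sum to 1
        float topk_sum = 0.f;
        for (int i = 0; i < k; ++i) {
            topk_sum += p[idx[i]];
        }
        std::vector<std::pair<int, float>> out;
        for (int i = 0; i < k; ++i) {
            float w = norm_topk_prob ? p[idx[i]] / topk_sum : p[idx[i]];
            out.emplace_back(idx[i], w * routed_scale);
        }
        return out;
    }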

src/turbomind/kernels/gemm/moe_utils_v2.cu (+16, -5)

@@ -515,9 +515,11 @@ __global__ void MoeGateKernel_v8(float* scales,  // [e,n]

     PRAGMA_UNROLL
     for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) {
-        int e = (i + threadIdx.x) % max_expert_num;
-        int t = (i + threadIdx.x) / max_expert_num;
-        smem.shared_accum[t][e] = 0;
+        int e = (i + threadIdx.x) % max_expert_num;
+        int t = (i + threadIdx.x) / max_expert_num;
+        if (t < max_tiles) {
+            smem.shared_accum[t][e] = 0;
+        }
     }

     __syncthreads();
@@ -538,8 +540,6 @@ __global__ void MoeGateKernel_v8(float* scales,  // [e,n]
             masks[expert_id * token_num_padded + ti2] = idx;
             scales[idx * token_num + ti2] = scale * routed_scale;
             atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1);
-
-            // printf("%d %d %f\n", idx, expert_id, scale);
         }
     }

@@ -613,6 +613,17 @@ void invokeMoeGate_V2(int* f2n,  // [e*n] -> n

     if (experts <= 8) {
         if (experts_per_token <= 2) {
+            // MoeGateKernel_V2<2, 128><<<cdiv(tokens, 128), 128, 0, st>>>(scales,
+            //                                                            (int8_t*)masks,
+            //                                                            accum,
+            //                                                            logits,
+            //                                                            log_tile,
+            //                                                            tiles,
+            //                                                            tokens,
+            //                                                            tokens_padded,
+            //                                                            experts);
+
+            // std::cout << tokens << " " << experts << " " << experts_per_token << " " << tokens_padded << "\n";
             invoke(_Int<8>, _Int<2>, _Int<8>, _Int<4>);
         }
         else {
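
The added t < max_tiles guard matters whenever max_tiles * max_expert_num is not a multiple of block_dim: on the last loop iteration, i + threadIdx.x runs past the end of the flattened array, t lands one row past shared_accum, and the zeroing writes out of bounds in shared memory. A stripped-down repro of the pattern with small hypothetical sizes:

    // Minimal repro of the zeroing pattern, not the real kernel.
    // With TILES * EXPERTS = 24 and block_dim = 16, the second iteration
    // (i = 16) covers indices 16..31; indices 24..31 give t == 3 == TILES,
    // so the unguarded version writes one row past accum. Launch as
    // zero_accum<<<1, 16>>>().
    __global__ void zero_accum()
    {
        constexpr int TILES = 3, EXPERTS = 8;
        constexpr int block_dim = 16;
        __shared__ int accum[TILES][EXPERTS];

        for (int i = 0; i < TILES * EXPERTS; i += block_dim) {
            int e = (i + threadIdx.x) % EXPERTS;
            int t = (i + threadIdx.x) / EXPERTS;
            if (t < TILES) {  // the guard added in this commit
                accum[t][e] = 0;
            }
        }
        __syncthreads();
    }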

src/turbomind/models/llama/llama_utils.cu (+19, -40)

@@ -1,49 +1,26 @@
 // Copyright (c) OpenMMLab. All rights reserved.

-#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
-#include "src/turbomind/models/llama/llama_utils.h"
-#include "src/turbomind/utils/cuda_utils.h"
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <type_traits>
+#include <vector>
+
 #include <cuda_fp16.h>
 #include <curand_kernel.h>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/host_vector.h>
-#include <vector>
+
+#include "src/turbomind/models/llama/llama_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"

 namespace turbomind {

 CmpMode compare_mode = kCmpRead;
 // CmpMode compare_mode = kCmpWrite;

-template<typename T>
-struct abs_diff_t {
-    using type = T;
-};
-
-template<>
-struct abs_diff_t<half> {
-    using type = float;
-};
-
-template<>
-struct abs_diff_t<__nv_bfloat16> {
-    using type = float;
-};
-
-template<typename T>
-struct abs_diff: public thrust::unary_function<thrust::tuple<T, T>, typename abs_diff_t<T>::type> {
-    __host__ __device__ float operator()(thrust::tuple<T, T> x) const
-    {
-        using R = typename abs_diff_t<T>::type;
-        auto r = R(thrust::get<0>(x)) - R(thrust::get<1>(x));
-        return r < R(0) ? -r : r;
-    }
-};
-
 template<typename T>
 void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream)
 {
@@ -64,10 +41,8 @@ void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream)
 template<typename T>
 void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream)
 {
-    // wait for b
-    check_cuda_error(cudaStreamSynchronize(stream));
     // read a from file
-    thrust::host_vector<T> h_a(size);
+    std::vector<T> h_a(size);
     {
         const auto filename = "tmp/" + key + ".cmp";
         std::ifstream ifs(filename, std::ios::binary);
@@ -86,17 +61,21 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream)
         }
         ifs.read((char*)h_a.data(), sizeof(T) * h_a.size());
     }
-    // copy a to device
-    thrust::device_vector<T> a = h_a;
-    // create abs(a - b) iterator
-    thrust::device_ptr<T> dev_ptr(ptr);
-    auto zip_iter = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), dev_ptr));
-    auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff<T>{});
-    // sum(abs(a - b))
-    auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size);
+    std::vector<T> h_b(size);
+    check_cuda_error(cudaMemcpyAsync(h_b.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream));
+    check_cuda_error(cudaStreamSynchronize(stream));
+
+    using Tacc = std::conditional_t<std::is_integral_v<T>, int64_t, float>;
+
+    Tacc asum{};
+    for (size_t i = 0; i < size; ++i) {
+        asum += std::abs((Tacc)h_a[i] - (Tacc)h_b[i]);
+    }
+
     std::cerr << key << ": " << asum << " " << asum / size << "\n";

     check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream));
+    check_cuda_error(cudaStreamSynchronize(stream));
 }

 template<typename T>
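
The rewrite drops the thrust zip/transform/reduce pipeline (and its abs_diff helper templates) in favor of a plain host loop: copy the device buffer down, then accumulate |a - b| in a widened type, int64_t for integral element types and float otherwise, so half/bfloat16 inputs are compared in float and integer sums cannot overflow. The synchronize added after the final copy-back also ensures h_a is not freed while the async copy may still be reading from it. The widening pattern in isolation (standalone sketch, not the project's code):

    #include <cmath>
    #include <cstdint>
    #include <cstdlib>
    #include <iostream>
    #include <type_traits>
    #include <vector>

    // Sum of absolute differences with a widened accumulator: integral
    // element types sum in int64_t, floating types in float.
    template<typename T>
    void l1_report(const std::vector<T>& a, const std::vector<T>& b, const char* key)
    {
        using Tacc = std::conditional_t<std::is_integral_v<T>, int64_t, float>;
        Tacc asum{};
        for (size_t i = 0; i < a.size(); ++i) {
            asum += std::abs((Tacc)a[i] - (Tacc)b[i]);
        }
        std::cerr << key << ": " << asum << " " << asum / a.size() << "\n";
    }

    int main()
    {
        l1_report<int8_t>({1, 2, 3}, {1, 2, 7}, "int8");    // sums in int64_t: 4
        l1_report<float>({1.f, 2.f}, {1.5f, 2.f}, "fp32");  // sums in float: 0.5
    }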

src/turbomind/triton_backend/llama/LlamaTritonModel.cc (+2, -2)

@@ -315,8 +315,8 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,

     moe_param_.experts_per_token = model_reader["experts_per_token"].as<int>(0);
     moe_param_.inter_size = model_reader["expert_inter_size"].as<int>(0);
-    moe_param_.shared_gate = model_reader["moe_shared_gate"].as<int>(0);
-    moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as<bool>(false);
+    moe_param_.shared_gate = model_reader["moe_shared_gate"].as<bool>();
+    moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as<bool>();
     moe_param_.routed_scale = model_reader["routed_scale"].as<float>(1.f);
     moe_param_.topk_group = model_reader["topk_group"].as<int>(1);
     moe_param_.topk_method = model_reader["topk_method"].as<std::string>("greedy");
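
Dropping the fallback from as<bool>() turns a missing or misspelled key into a hard parse error instead of a silent false, which is plausibly how the moe_norm_topk_prob/norm_topk_prob mismatch above went unnoticed. A small sketch of the difference, assuming stock yaml-cpp behavior (an undefined node's as<T>() throws, while as<T>(fallback) returns the fallback):

    #include <iostream>
    #include <yaml-cpp/yaml.h>

    int main()
    {
        YAML::Node cfg = YAML::Load("moe_shared_gate: true");

        // With a fallback: a missing key silently becomes the default.
        bool norm = cfg["norm_topk_prob"].as<bool>(false);  // false, no error

        // Without a fallback: a missing key throws, surfacing the typo'd
        // key immediately instead of mis-gating the model.
        try {
            bool strict = cfg["norm_topk_prob"].as<bool>();
            (void)strict;
        } catch (const YAML::Exception& e) {
            std::cerr << "config error: " << e.what() << "\n";
        }

        std::cout << cfg["moe_shared_gate"].as<bool>() << " " << norm << "\n";
    }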
