From e0b221e2a06717a7d3451009aff14609b765a36e Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Thu, 7 Nov 2024 04:58:48 +0000
Subject: [PATCH 01/21] add qwen2-moe

---
 lmdeploy/turbomind/deploy/config.py           |    2 +
 lmdeploy/turbomind/deploy/module.py           |   19 +-
 .../turbomind/deploy/source_model/mixtral.py  |    2 +
 .../turbomind/deploy/source_model/qwen.py     |   64 +
 lmdeploy/turbomind/supported_models.py        |    1 +
 src/turbomind/kernels/gemm/moe_utils_v2.cu    | 1623 +++++++++++++++--
 src/turbomind/kernels/gemm/moe_utils_v2.h     |    4 +-
 .../kernels/gemm/test/test_moe_utils.cu       |  103 +-
 src/turbomind/kernels/gemm/test/testbed.h     |    2 +
 .../models/llama/LlamaDecoderLayerWeight.cc   |   52 +-
 src/turbomind/models/llama/LlamaDenseWeight.h |   16 +
 src/turbomind/models/llama/llama_params.h     |    8 +-
 src/turbomind/models/llama/moe_ffn_layer.cc   |   78 +-
 src/turbomind/models/llama/moe_ffn_layer.h    |    6 +-
 src/turbomind/models/llama/unified_decoder.cc |   21 +-
 .../triton_backend/llama/LlamaTritonModel.cc  |    2 +
 16 files changed, 1776 insertions(+), 227 deletions(-)

diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index 7e8ebf7b4..a535b0d4c 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -50,6 +50,8 @@ class ModelConfig:
     expert_num: int = 0
     expert_inter_size: int = 0
     experts_per_token: int = 0
+    moe_shared_gate: bool = False
+    moe_norm_topk: bool = False
 
     def verify(self):
         invalid = {}
diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py
index a9f738537..8e20946b1 100644
--- a/lmdeploy/turbomind/deploy/module.py
+++ b/lmdeploy/turbomind/deploy/module.py
@@ -140,14 +140,18 @@ class MoeFfn(Ffn):
     requires:
         r.moe_ffn_expert(e, i, kind)
         r.moe_ffn_gate(i)
+        r.moe_ffn_shared_gate(i)
     """
 
     _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}'
-    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}'
+    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight'
+    _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight'
 
     def __init__(self, model: BaseOutputModel):
         super().__init__(model)
         self.expert_num = model.model_config.expert_num
+        self.inter_size = model.model_config.expert_inter_size
+        self.shared_gate = model.model_config.moe_shared_gate
 
     def apply(self, i: int, r: BaseReader):
         for p in get_params(r.moe_ffn_expert()):
@@ -157,7 +161,11 @@ def apply(self, i: int, r: BaseReader):
                                       i)
 
         gate = transpose(r.moe_ffn_gate(i))
-        self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight'))
+        self.model.save_split(gate, self._moe_ffn_gate.format(i))
+
+        if self.shared_gate:
+            shared_gate = transpose(r.moe_ffn_shared_gate(i))
+            self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i))
 
 
 class Attn(Module):
@@ -248,8 +256,11 @@ class Transformer:
 
     def __init__(self, model: BaseOutputModel):
         self.model = model
-        ffn = MoeFfn if model.model_config.expert_num else Ffn
-        modules = [Attn, LayerNorm, ffn]
+        modules = [Attn, LayerNorm]
+        if model.model_config.inter_size:
+            modules.append(Ffn)
+        if model.model_config.expert_num:
+            modules.append(MoeFfn)
         self.modules = [c(model) for c in modules]
         self.misc = Misc(model)
 
diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py
index 102ede29f..ff9df2d40 100644
--- a/lmdeploy/turbomind/deploy/source_model/mixtral.py
+++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py
@@ -33,4 +33,6 @@ def model_info(self):
         info['expert_num'] = cfg['num_local_experts']
         info['expert_inter_size'] = cfg['intermediate_size']
info['experts_per_token'] = cfg['num_experts_per_tok'] + info['moe_norm_topk'] = True + info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 0ec0586a3..071ade122 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -120,3 +120,67 @@ def model_info(self): cfg = super().model_info() cfg['attn_bias'] = 1 return cfg + + + +class Qwen2MoeReader(LlamaReader): + + ffn_pattern = r'shared_expert\.' + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_gate(self, i): + return self.params.get( + f'model.layers.{i}.mlp.gate.weight') + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(self.ffn_pattern) + result = [] + for key in ['gate', 'down', 'up']: + tensor = self.params[ + f'model.layers.{i}.mlp.shared_expert.{key}_proj.{kind}'] + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_shared_gate(self, i): + return self.params.get( + f'model.layers.{i}.mlp.shared_expert_gate.weight' + ) + +@INPUT_MODELS.register_module(name='qwen2-moe') +class Qwen2MoeModel(LlamaModel): + + Reader = Qwen2MoeReader + + def tokenizer_info(self): + """ + https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_config.json + """ # noqa: E501 + n_words = 152064 + bos_id = 151643 + eos_id = 151645 + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + info['expert_num'] = cfg['num_experts'] + info['expert_inter_size'] = cfg['moe_intermediate_size'] + info['experts_per_token'] = cfg['num_experts_per_tok'] + info['inter_size'] = cfg['shared_expert_intermediate_size'] + info['moe_shared_gate'] = True + info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['attn_bias'] = 1 + return info diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 8a1f5e731..18c15481e 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -20,6 +20,7 @@ QWenLMHeadModel='qwen', # Qwen2 Qwen2ForCausalLM='qwen2', + Qwen2MoeForCausalLM='qwen2-moe', # mistral MistralForCausalLM='llama', # llava diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index acf635585..42ff84a3a 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -6,8 +6,13 @@ #include #include +#include +#include #include #include +#include +#include +#include #include #include "src/turbomind/kernels/core/array_ops.h" @@ -19,7 +24,7 @@ namespace turbomind { template __global__ void MoeGateKernel_V2(float* scales, // [e,n] - int* masks, // [E,n], padded + int8_t* masks, // [E,n], padded int* accum, // [E,tiles] const float* logits, // [E,n] int log_tile, @@ -88,6 +93,8 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] const int lowbit = (mask & -mask); const int e = 31 - __clz(lowbit); + // printf("e = %d, ti = %d, idx = %d\n", e, ti, i); + masks[e * tokens_padded + ti] = i; atomicAdd(&shared_accum[e][ti >> log_tile], 1); top_val[i] = logits[ti * experts + e]; @@ -120,11 
+127,11 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] } } -template -__global__ void MoeScanKernel_V2(int* f2n, // [e*n] +template +__global__ void MoeScanKernel_v2(int* f2n, // [e*n] int* en2f, // [e,n] int* offsets, // [E+1] - int* masks, // [E,n], padded + Mask* masks, // [E,n], padded const int* accum, // [E,tiles] int log_tile, int tiles, @@ -142,13 +149,15 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] constexpr int vec_size = kMoeGateVecSize; - using Vec = Array; + using Vec = Array; const int tile_id = blockIdx.x; const int ei = blockIdx.y; - const int global_tile_id = ei * tiles + tile_id; + const int global_tile_id = ei * tiles + tile_id; + const bool is_valid = global_tile_id <= experts * tiles; +#if 0 int vacc[4]{}; { int idx = threadIdx.x; @@ -162,6 +171,18 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); +#else + + int vacc = 0; + for (int i = threadIdx.x; i < global_tile_id; i += block_dim) { + if (is_valid && i < global_tile_id) { + vacc += accum[i]; + } + } + + int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); + +#endif __shared__ int shared_offset; @@ -200,7 +221,7 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] const bool pred = vi < tile_vec_end; Vec data; - fill(data, -1); + fill(data, Mask{-1}); if (pred) { Ldg(data, mask_ptr[vi].data()); } @@ -231,137 +252,1504 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } } -void invokeMoeGate_V2(int* f2n, // [e*n] -> n - int* en2f, // [e,n] -> n*e - int* offsets, // [E+1] - float* scales, // [e,n] - int* masks, // [E,n] - int* accum, // [E] - const float* logits, // [e,n] - int tokens, // n - int tokens_padded, // round_up(n, 4) - int experts, // E - int experts_per_token, - cudaStream_t st) +template +__global__ void MoeGateKernel_v3(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) { - constexpr int base_log_tile = 9; + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - int log_tile = base_log_tile; - while (((tokens_padded + (1 << log_tile) - 1) >> log_tile) > kMoeGateMaxTiles) { - ++log_tile; + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + __shared__ int shared_accum[max_expert_num][max_tiles]; + + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + shared_accum[e][t] = 0; + } } - const int tiles = ceil_div(tokens_padded, 1 << log_tile); - // std::cout << log_tile << " " << tiles << "\n"; + __syncthreads(); - { - constexpr int threads = 128; - const int blocks = ceil_div(tokens, threads); + float data[items_per_thread]; - auto invoke = [&](auto e) { - static constexpr int top_k = decltype(e)::value; - MoeGateKernel_V2<<>>( // - scales, - masks, - accum, - logits, - log_tile, - 
tiles, - tokens, - tokens_padded, - experts); - }; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + const int e = threads_per_token * i + ei; + if (e < expert_num && ti < token_num) { + data[i] = __ldg(logits + ti * expert_num + e); + + if (ti == 39505) { + printf("%f %d\n", data[i], e); + } + } + } + + unsigned mask = (unsigned)-1; + float max_logit; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + } + }; - switch (experts_per_token) { - case 2: - invoke(std::integral_constant{}); - break; - // case 4: - // invoke(std::integral_constant{}); - // break; - default: - std::cerr << __FILE__ << ":" << __LINE__ << " Not implemented. " << std::endl; - std::abort(); + run(0); + for (int k = 1; k < max_top_k; ++k) { + run(k); + } + + mask = ~mask; + + float sum_prob{}; + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (norm_top_k == false || (mask & bit)) { + data[i] = expf(data[i] - max_logit); + sum_prob += data[i]; } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } - // return; + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } - { - // Check: tiles * experts <= threads + sum_prob = fdividef(1.f, sum_prob); - constexpr int threads = (1 << base_log_tile) / kMoeGateVecSize; - const dim3 blocks(tiles, experts + 1); - MoeScanKernel_V2<<>>(f2n, // - en2f, - offsets, - masks, - accum, - log_tile, - tiles, - tokens, - tokens_padded, - experts); + const unsigned group_mask = ((1U << (unsigned)threads_per_token) - 1U) << (unsigned)warp_ti_offset; + + // 1111 1111 << 24 + + const unsigned lanemask_lt = ((1U << (unsigned)ei) - 1U) << (unsigned)warp_ti_offset; + + // 1000 0000 + // 0111 1111 0000 0000 0000 0000 0000 0000 + + int offset = 0; + bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int valid = mask & bit; + const unsigned active = __ballot_sync((uint32_t)-1, valid); + if (valid) { + const int e = threads_per_token * i + ei; + const int idx = offset + __popc(active & lanemask_lt); + if (ti == 39505) { + printf("%d %d %f\n", e, idx, data[i] * sum_prob); + } + masks[e * token_num_padded + ti] = idx; + scales[idx * token_num + ti] = data[i] * sum_prob; + // atomic add in Smem + atomicAdd(&shared_accum[e][ti >> log_tile], 1); + } + offset += __popc(active & group_mask); + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } -} -template -__global__ void MoeGatherKernel(T* dst, // [e*n, d] - const T* src, // [ n, d] - const int* f2n, // [e*n] :: e*n -> n - int dims) -{ - using Vec = Array; - const int64_t bi = blockIdx.x; + __syncthreads(); - auto src_ptr = (const Vec*)src + dims * f2n[bi]; - 
auto dst_ptr = (/* */ Vec*)dst + dims * bi; - for (int i = threadIdx.x; i < dims; i += block_dim) { - Vec v; - Ldg(v, src_ptr[i].data()); - Store(dst_ptr[i].data(), v); + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[e][t]); + } } } -template -void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) +template +__global__ void MoeGateKernel_v4(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) { - constexpr int threads = 256; - constexpr int vec_size = 16 / sizeof(T); - MoeGatherKernel<<>>( // - dst, - src, - f2n, - dims / vec_size); + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + __shared__ int shared_accum[max_expert_num][max_tiles]; + + for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + shared_accum[e][t] = 0; + } + +#if 0 + logits += blockIdx.x * block_dim / threads_per_token * expert_num; + logits += threadIdx.x / WARP_SIZE * (WARP_SIZE / threads_per_token) * expert_num; + constexpr int tokens_per_warp = WARP_SIZE / threads_per_token; + __shared__ float smem_data[block_dim / WARP_SIZE][tokens_per_warp][max_expert_num + 1]; + PRAGMA_UNROLL + for (int i = 0; i < tokens_per_warp * max_expert_num; i += WARP_SIZE) { + smem_data[warp_id][(lane_id + i) / max_expert_num][(lane_id + i) % max_expert_num] = logits[lane_id + i]; + } + __syncthreads(); + float data[items_per_thread]; + for (int i = 0; i < items_per_thread; ++i) { + const int e = items_per_thread * ei + i; + data[i] = smem_data[warp_id][lane_id / threads_per_token][e]; + } +#else + __syncthreads(); + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = items_per_thread * ei + i; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } +#endif + + unsigned mask = (unsigned)-1; + float max_logit; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr 
(threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + } + }; + + run(0); + for (int k = 1; k < max_top_k; ++k) { + run(k); + } + + mask = ~mask; + + float sum_prob{}; + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (norm_top_k == false || (mask & bit)) { + data[i] = expf(data[i] - max_logit); + sum_prob += data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + const int count = __popc(mask); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; + + int idx{}; + WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); + + bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int valid = mask & bit; + const int e = items_per_thread * ei + i; + if (valid) { + masks[e * token_num_padded + ti] = idx; + scales[idx * token_num + ti] = data[i] * sum_prob; + // atomic add in Smem + atomicAdd(&shared_accum[e][ti >> log_tile], 1); + ++idx; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + __syncthreads(); + + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[e][t]); + } + } } -template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// -template -__global__ void MoeReduceKernel(T* dst, // [ n, d] - const T* src, // [e*n, d] - const float* scales, // [ e, n] - const int* en2f, // [ e, n] :: (e,n) -> e*n - int dims, - int tokens) +template +__global__ void MoeGateKernel_v5(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) { - using Vec = Array; + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - const int64_t ti = blockIdx.x; + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); - auto dst_ptr = (Vec*)dst + dims * ti; + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + __shared__ int shared_accum[max_expert_num][max_tiles]; + + for (int i = 
threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + shared_accum[e][t] = 0; + } + + __syncthreads(); + + constexpr int sort_k = std::min(items_per_thread, max_top_k); + + float data[items_per_thread]; - // Should be warp uniforms - const Vec* src_ptr[exp_k]; - float scale[exp_k]; PRAGMA_UNROLL - for (int e = 0; e < exp_k; ++e) { - src_ptr[e] = (const Vec*)src + dims * en2f[e * tokens + ti]; - scale[e] = scales ? scales[e * tokens + ti] : 1.f; + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = items_per_thread * ei + i; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } } - for (int i = threadIdx.x; i < dims; i += block_dim) { - Array accum{}; + float value[items_per_thread]; + int index[items_per_thread]; + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + value[i] = data[i]; + index[i] = items_per_thread * ei + i; + } + + // PRAGMA_UNROLL + // for (int i = 0; i < sort_k; ++i) { + // for (int j = items_per_thread - 1; j > i; --j) { + // if (value[j] > value[j - 1]) { + // auto tmp0 = value[j]; + // value[j] = value[j - 1]; + // value[j - 1] = tmp0; + // auto tmp1 = index[j]; + // index[j] = index[j - 1]; + // index[j - 1] = tmp1; + // } + // } + // } + + PRAGMA_UNROLL + for (int j = items_per_thread - 1; j > 0; --j) { + if (value[j] > value[j - 1]) { + auto tmp0 = value[j]; + value[j] = value[j - 1]; + value[j - 1] = tmp0; + auto tmp1 = index[j]; + index[j] = index[j - 1]; + index[j - 1] = tmp1; + } + } + + float max_logit; + + int count = 0; + + auto run = [&](int k) { + float max_val = -std::numeric_limits::infinity(); + + PRAGMA_UNROLL + for (int i = 0; i <= k; ++i) { + if (i == count) { + max_val = value[i]; + } + } + + if (k + 1 < sort_k) { + PRAGMA_UNROLL + for (int j = items_per_thread - 1; j > k + 1; --j) { + if (value[j] > value[j - 1]) { + auto tmp0 = value[j]; + value[j] = value[j - 1]; + value[j - 1] = tmp0; + auto tmp1 = index[j]; + index[j] = index[j - 1]; + index[j - 1] = tmp1; + } + } + } + + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + + if (k == 0) { + max_logit = g_max_val; + } + + if (ei == g_max_ei) { + count += 1; + } + }; + + PRAGMA_UNROLL + for (int k = 0; k < max_top_k; ++k) { + run(k); + } + + float sum_prob{}; + + PRAGMA_UNROLL + for (int i = 0; i < sort_k; ++i) { + if (i < count) { + value[i] = expf(value[i] - max_logit); + sum_prob += value[i]; + } + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + sum_prob = fdividef(1.f, sum_prob); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; + + int idx{}; + WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); + + PRAGMA_UNROLL + for (int i = 0; i < sort_k; ++i) { + if (ti < token_num && i < count) { + const int e = index[i]; + masks[e * token_num_padded + ti] = idx; + scales[idx * token_num + 
ti] = value[i] * sum_prob; + atomicAdd(&shared_accum[e][ti >> log_tile], 1); + ++idx; + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[e][t]); + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////// + +template +__global__ void MoeGateKernel_v6(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + __shared__ int shared_accum[max_expert_num][max_tiles]; + + for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + shared_accum[e][t] = 0; + } + + __syncthreads(); + + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = items_per_thread * ei + i; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + float max_logit; + + unsigned mask = (unsigned)-1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + { + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + } + + auto run = [&](int k) { + unsigned next_mask = mask - max_bit; + unsigned next_max_bit = 0; + float next_max_val = -std::numeric_limits::infinity(); + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((next_mask & bit) && data[i] > next_max_val) { + next_max_bit = bit; + next_max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + int g_max_ei = ei; + float g_max_val = max_val; + + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + + if (k == 0) { + max_logit = g_max_val; + } + + if (ei == g_max_ei) { + mask = next_mask; + max_bit = next_max_bit; + max_val = next_max_val; + } + }; + + run(0); + for (int k = 1; k < max_top_k; ++k) { + run(k); + } + + mask = ~mask; + + float sum_prob{}; + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; 
++i) { + if (norm_top_k == false || (mask & bit)) { + data[i] = expf(data[i] - max_logit); + sum_prob += data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + const int count = __popc(mask); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; + + int idx{}; + WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); + + bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int valid = mask & bit; + const int e = items_per_thread * ei + i; + if (valid) { + masks[e * token_num_padded + ti] = idx; + scales[idx * token_num + ti] = data[i] * sum_prob; + // atomic add in Smem + atomicAdd(&shared_accum[e][ti >> log_tile], 1); + ++idx; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + __syncthreads(); + + for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { + int e = i / max_tiles; + int t = i % max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[e][t]); + } + } +} + +template +__global__ void MoeGateKernel_v7(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + // +1 padding greatly reduced (-80%) bank conflicts + __shared__ int shared_accum[max_tiles][max_expert_num + 1]; + + for (int i = threadIdx.x; i < max_tiles * max_expert_num; i += block_dim) { + int e = i % max_expert_num; + int t = i / max_expert_num; + shared_accum[t][e] = 0; + } + +#if 0 + logits += blockIdx.x * block_dim / threads_per_token * expert_num; + logits += threadIdx.x / WARP_SIZE * (WARP_SIZE / threads_per_token) * expert_num; + constexpr int tokens_per_warp = WARP_SIZE / threads_per_token; + __shared__ float smem_data[block_dim / WARP_SIZE][tokens_per_warp][max_expert_num]; + PRAGMA_UNROLL + for (int i = 0; i < tokens_per_warp * max_expert_num; i += WARP_SIZE) { + smem_data[warp_id][(lane_id + i) / max_expert_num][(lane_id + i) % max_expert_num] = logits[lane_id + i]; + } + __syncthreads(); + float data[items_per_thread]; + for (int i = 0; i < items_per_thread; ++i) { + const int e = items_per_thread * ei + i; + data[i] = smem_data[warp_id][lane_id / threads_per_token][e]; + } +#else + __syncthreads(); + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + if (ti < token_num) { + 
PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = items_per_thread * ei + i; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } +#endif + + unsigned mask = (unsigned)-1; + float max_logit; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + } + }; + + run(0); + for (int k = 1; k < max_top_k; ++k) { + run(k); + } + + mask = ~mask; + + float sum_prob{}; + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (norm_top_k == false || (mask & bit)) { + data[i] = expf(data[i] - max_logit); + sum_prob += data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + const int count = __popc(mask); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; + + int idx{}; + WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); + + bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int valid = mask & bit; + const int e = items_per_thread * ei + i; + if (valid) { + masks[e * token_num_padded + ti] = idx; + // scales[idx * token_num + ti] = data[i] * sum_prob; + scales[ti * top_k + idx] = data[i] * sum_prob; + // atomic add in Smem + atomicAdd(&shared_accum[ti >> log_tile][e], 1); + ++idx; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + __syncthreads(); + + for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int t = i % max_tiles; + int e = i / max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[t][e]); + } + } +} + +template +__global__ void MoeGateKernel_v8(float* scales, // [e,n] + Mask* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k, + bool norm_topk) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + constexpr int tokens_per_cta = block_dim / threads_per_token; + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % 
threads_per_token; + + const int bti = threadIdx.x / threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + + const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + + float data[items_per_thread]; + int idxs[items_per_thread]; + +#if 0 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = threads_per_token * i + ei * access_size; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#elif 1 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + // idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + idxs[i] = ei * items_per_thread + i; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + // const int e = threads_per_token * i + ei * access_size; + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#else + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + constexpr int vecs_per_thread = items_per_thread / access_size; + + using Vec = Array; + constexpr int banks = 128 / sizeof(Vec); + constexpr int chunks = 4; // block_dim / WARP_SIZE; + + __shared__ union { + Vec shared_data[chunks][vecs_per_thread * WARP_SIZE / banks][banks + 1]; + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; + + __align__(16) Vec vecs[vecs_per_thread]; + + { + const int warp_end = min(warp_offset + WARP_SIZE / threads_per_token, token_num) * expert_num; + int p = warp_offset * expert_num + access_size * lane_id; + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + fill(vecs[i], -std::numeric_limits::infinity()); + // const int p = warp_offset * expert_num + access_size * (lane_id + i * WARP_SIZE); + if (p < warp_end) { + Ldg(vecs[i], &logits[p]); + } + p += access_size * WARP_SIZE; + } + } + + PRAGMA_UNROLL + for (int c = 0; c < block_dim / WARP_SIZE; c += chunks) { + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = i * WARP_SIZE + lane_id; + if (c <= warp_id && warp_id < c + chunks) { + Store(smem.shared_data[warp_id - c][p / banks][p % banks].data(), vecs[i]); + } + } + + __syncwarp(); + + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = lane_id * vecs_per_thread + i; + if (c <= warp_id && warp_id < c + chunks) { + Load(vecs[i], smem.shared_data[warp_id - c][p / 
banks][p % banks].data()); + } + } + + __syncthreads(); + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + idxs[i] = ei * items_per_thread + i; + } + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + (Array&)data[i * access_size] = vecs[i]; + } + +#endif + + constexpr float kLog2e = 1.4426950408889634074; + + unsigned mask = (unsigned)-1; + float max_logit; + + int count{}; + float sum_prob{}; + + const int warp_ti_offset = warp_ti * threads_per_token; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + if (k == 0) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] *= kLog2e; + } + } + + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + ++count; + } + }; + + run(0); + + for (int k = 1; k < top_k; ++k) { + run(k); + } + + mask = ~mask; + + int used[items_per_thread]; + { + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + used[i] = (mask & bit) > 0; + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + } + + // unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (!norm_topk || used[i]) { + data[i] = exp2f(data[i] - max_logit); + sum_prob += data[i]; + } + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + // float sum_v = 0; + // PRAGMA_UNROLL + // for (int i = 0; i < items_per_thread; ++i) { + // // data[i] = exp2f(data[i] - max_logit); + // // sum_prob += data[i]; + // float v = data[i] * sum_prob; + // sum_v += v; + // if (ti == 0) { + // printf("%f\n", v); + // } + // } + // PRAGMA_UNROLL + // for (int m = threads_per_token / 2; m >= 1; m /= 2) { + // sum_v += __shfl_xor_sync((uint32_t)-1, sum_v, m); + // } + // if (ti == 0) { + // printf("sum=%f\n", sum_v); + // } + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[tokens_per_cta]; + + int idx{}; + WarpScan{temp_storage[bti]}.ExclusiveSum(count, idx); + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (used[i]) { + smem.shared_exp_id[idx][bti] = idxs[i]; + smem.shared_scales[idx][bti] = data[i] * sum_prob; + ++idx; + } + } + + PRAGMA_UNROLL + for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + smem.shared_accum[t][e] = 0; + } + + __syncthreads(); + + constexpr int k_per_thread = cdiv(max_top_k, threads_per_token); + + const int bti2 = threadIdx.x % tokens_per_cta; + const int ei2 = threadIdx.x / tokens_per_cta; + const int ti2 = blockIdx.x * tokens_per_cta + bti2; + + PRAGMA_UNROLL + for (int i = 0; i < k_per_thread; ++i) { + const int idx = ei2 * k_per_thread + i; + const 
int expert_id = smem.shared_exp_id[idx][bti2]; + const float scale = smem.shared_scales[idx][bti2]; + + if (ti2 < token_num && idx < top_k) { + masks[expert_id * token_num_padded + ti2] = idx; + scales[idx * token_num + ti2] = scale; + atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); + + // printf("%d %d %f\n", idx, expert_id, scale); + } + } + + __syncthreads(); + + for (int i = 0; i < max_expert_num * max_tiles; i += block_dim) { + int t = (threadIdx.x + i) % max_tiles; + int e = (threadIdx.x + i) / max_tiles; + if (e < expert_num && t < tiles) { + atomicAdd(accum + e * tiles + t, smem.shared_accum[t][e]); + } + } +} + +#ifdef USE_WARPSORT + +struct Greater { + template + __device__ bool operator()(T a, T b) const noexcept + { + return a > b; + } +}; + +template +__global__ void MoeGateKernel_v9(float* scales, // [e,n] + int* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + // +1 padding greatly reduced (-80%) bank conflicts + __shared__ int shared_accum[max_tiles][max_expert_num + 1]; + + PRAGMA_UNROLL + for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + shared_accum[t][e] = 0; + } + + __syncthreads(); + + float data[items_per_thread]; + int idxs[items_per_thread]; + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + idxs[i] = threads_per_token * (i / alignment * alignment) + i % alignment + ei * alignment; + } + + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += alignment) { + const int e = threads_per_token * i + ei * alignment; + if (e < expert_num) { + Load((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + constexpr float kLog2e = 1.4426950408889634074; + + using MergeSort = cub::WarpMergeSort; + + union Smem { + typename MergeSort::TempStorage temp[block_dim / threads_per_token]; + // int2 sorted[max_top_k][block_dim / threads_per_token + 1]; + int2 sorted[block_dim / threads_per_token][max_top_k + 1]; + }; + + __shared__ Smem smem; + + MergeSort{smem.temp[threadIdx.x / threads_per_token]}.Sort(data, idxs, Greater{}); + + __syncthreads(); + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + const int k = threads_per_token * ei + i; // blocked + if (k < max_top_k) { + // smem.sorted[k][threadIdx.x / threads_per_token] = int2{float_as_int(data[i]), idxs[i]}; + smem.sorted[threadIdx.x / threads_per_token][k] = int2{float_as_int(data[i]), idxs[i]}; + } + } + + __syncthreads(); + + constexpr int top_k_per_thread = cdiv(max_top_k, threads_per_token); + + int es[top_k_per_thread]; + + PRAGMA_UNROLL + for (int i = 0; i < top_k_per_thread; ++i) 
{ + const int k = ei * top_k_per_thread + i; + if (k < max_top_k) { + // const int2 tmp = smem.sorted[k][threadIdx.x / threads_per_token]; + const int2 tmp = smem.sorted[threadIdx.x / threads_per_token][k]; + const int e = tmp.y; + masks[e * token_num_padded + ti] = k; + scales[ti * top_k + k] = int_as_float(tmp.x); + es[i] = e; + // atomicAdd(&shared_accum[ti >> log_tile][e], 1); + } + } + + PRAGMA_UNROLL + for (int i = 0; i < top_k_per_thread; ++i) { + const int k = ei * top_k_per_thread + i; + if (k < max_top_k) { + atomicAdd(&shared_accum[ti >> log_tile][es[i]], 1); + } + } + + __syncthreads(); + + for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { + int t = i % max_tiles; + int e = i / max_tiles; + if (e < expert_num && t < tiles) { + // atomic add in Gmem + atomicAdd(accum + e * tiles + t, shared_accum[t][e]); + } + } +} + +#endif + +template +inline constexpr std::integral_constant _Int{}; + +void invokeMoeGate_V2(int* f2n, // [e*n] -> n + int* en2f, // [e,n] -> n*e + int* offsets, // [E+1] + float* scales, // [e,n] + void* masks, // [E,n] + int* accum, // [E] + const float* logits, // [e,n] + int tokens, // n + int tokens_padded, // round_up(n, 4) + int experts, // E + int experts_per_token, + bool norm_topk, + cudaStream_t st) +{ + constexpr int base_log_tile = 9; + + int log_tile = base_log_tile; + while (((tokens_padded + (1 << log_tile) - 1) >> log_tile) > kMoeGateMaxTiles) { + ++log_tile; + } + const int tiles = ceil_div(tokens_padded, 1 << log_tile); + + // std::cout << log_tile << " " << tiles << "\n"; + + { + + auto invoke = [&](auto e) { + constexpr int threads = 128; + const int blocks = ceil_div(tokens * 2, threads); + static constexpr int top_k = decltype(e)::value; + MoeGateKernel_V2<<>>( // + scales, + (int8_t*)masks, + accum, + logits, + log_tile, + tiles, + tokens, + tokens_padded, + experts); + }; + + auto invoke2 = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 128; + const int blocks = ceil_div(tokens, threads / thrs_per_tok); + + cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); + + MoeGateKernel_v8 + <<>>( // + scales, + (int8_t*)masks, + accum, + logits, + log_tile, + tiles, + tokens, + tokens_padded, + experts, + experts_per_token, + norm_topk); + }; + + invoke2(_Int<64>, _Int<8>, _Int<16>); + + // invoke(_Int<2>); + } + + { + constexpr int threads = (1 << base_log_tile) / kMoeGateVecSize; + const dim3 blocks(tiles, experts + 1); + + MoeScanKernel_v2<<>>(f2n, // + en2f, + offsets, + (int8_t*)masks, + accum, + log_tile, + tiles, + tokens, + tokens_padded, + experts); + } +} + +template +__global__ void MoeGatherKernel(T* dst, // [e*n, d] + const T* src, // [ n, d] + const int* f2n, // [e*n] :: e*n -> n + int dims) +{ + using Vec = Array; + const int64_t bi = blockIdx.x; + + auto src_ptr = (const Vec*)src + dims * f2n[bi]; + auto dst_ptr = (/* */ Vec*)dst + dims * bi; + for (int i = threadIdx.x; i < dims; i += block_dim) { + Vec v; + Ldg(v, src_ptr[i].data()); + Store(dst_ptr[i].data(), v); + } +} + +template +void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) +{ + constexpr int threads = 256; + constexpr int vec_size = 16 / sizeof(T); + MoeGatherKernel<<>>( // + dst, + src, + f2n, + dims / vec_size); +} + +template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); + +template 
+__global__ void MoeReduceKernel(T* dst, // [ n, d] + const T* src, // [e*n, d] + const float* scales, // [ e, n] + const int* en2f, // [ e, n] :: (e,n) -> e*n + const float* dst_scales, // [n] + int dims, + int tokens) +{ + using Vec = Array; + + const int64_t ti = blockIdx.x; + + auto dst_ptr = (Vec*)dst + dims * ti; + + float dst_scale = 0; + if (dst_scales) { + dst_scale = dst_scales[ti]; + dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); + } + + // Should be warp uniforms + const Vec* src_ptr[exp_k]; + float scale[exp_k]; + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + src_ptr[e] = (const Vec*)src + dims * en2f[e * tokens + ti]; + scale[e] = scales ? scales[e * tokens + ti] : 1.f; + } + + for (int i = threadIdx.x; i < dims; i += block_dim) { + Array accum{}; + if (dst_scales) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = cast(v) * dst_scale; + } PRAGMA_UNROLL for (int e = 0; e < exp_k; ++e) { Vec v; @@ -379,6 +1767,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, @@ -395,6 +1784,7 @@ void invokeMoeReduce(T* dst, src, scales, en2f, + dst_scales, dims / vec_size, tokens); }; @@ -404,19 +1794,22 @@ void invokeMoeReduce(T* dst, return invoke(std::integral_constant{}); case 2: return invoke(std::integral_constant{}); - // case 4: - // return invoke(std::integral_constant{}); - // case 6: - // return invoke(std::integral_constant{}); + case 4: + return invoke(std::integral_constant{}); + case 6: + return invoke(std::integral_constant{}); + case 8: + return invoke(std::integral_constant{}); default: fprintf(stderr, "Unsupported experts_per_token %d\n", experts_per_token); std::abort(); } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, int, int, int, cudaStream_t); +template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); #ifdef ENABLE_BF16 -template void invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, int, int, int, cudaStream_t); +template void +invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 334e2de27..0e4c36af0 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -14,13 +14,14 @@ void invokeMoeGate_V2(int* f2n, int* en2f, int* offsets, float* scales, - int* masks, + void* masks, int* accum, const float* logits, int tokens, int tokens_padded, int experts, int exp_per_tok, + bool norm_topk, cudaStream_t st); template @@ -49,6 +50,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index a31116219..47e3bfdb1 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -26,6 +26,25 @@ void print_vecs(const T* data, int m, int k, std::string msg, int width = 4) } } +template +void diff_vecs(const T* data, const T* refs, int m, int k, std::string msg) +{ + if (!msg.empty()) { + std::cout << msg << ": [" << m << ", " << k << "]\n"; + } 
+ for (int mm = 0; mm < m; ++mm) { + std::cout << "m=" << mm << ": "; + for (int kk = 0; kk < k; ++kk) { + const auto& x = data[mm * k + kk]; + const auto& y = refs[mm * k + kk]; + if (x != y) { + std::cout << kk << "(" << x << ", " << y << ") "; + } + } + std::cout << "\n"; + } +} + #if 0 void func() { @@ -190,7 +209,7 @@ void moe_gate_ref(int tokens, } } -void mask2eids(const universal_vector& masks, universal_vector& eids, int tokens, int expert_num) +void mask2eids(universal_vector& masks, universal_vector& eids, int tokens, int expert_num) { const int tokens_padded = masks.size() / expert_num; // std::cout << eids.size() << std::endl; @@ -228,13 +247,13 @@ bool test_moe_gate(int tokens, // const int tokens_padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; // const int max_coords = get_max_coords(tokens, expert_num, experts_per_token, tiling); - universal_vector offsets(expert_num + 1); - universal_vector accum(expert_num * kMoeGateMaxTiles); - universal_vector masks(expert_num * tokens_padded); - universal_vector eids(experts_per_token * tokens); - universal_vector f2n(experts_per_token * tokens); - universal_vector en2f(experts_per_token * tokens); - universal_vector scales(experts_per_token * tokens); + universal_vector offsets(expert_num + 1); + universal_vector accum(expert_num * kMoeGateMaxTiles); + universal_vector masks(expert_num * tokens_padded); + universal_vector eids(experts_per_token * tokens); + universal_vector f2n(experts_per_token * tokens); + universal_vector en2f(experts_per_token * tokens); + universal_vector scales(experts_per_token * tokens); // universal_vector coords(max_coords); // thrust::fill(coords.begin(), coords.end(), int2{-1, 0}); @@ -246,8 +265,16 @@ bool test_moe_gate(int tokens, // moe_gate_ref(tokens, expert_num, experts_per_token, logits, offsets_ref, eids_ref, f2n_ref, en2f_ref, scales_ref); - for (int i = 0; i < 10; ++i) { + cudaMemPrefetchAsync(f2n.data().get(), sizeof(int) * f2n.size(), 0); + cudaMemPrefetchAsync(en2f.data().get(), sizeof(int) * en2f.size(), 0); + cudaMemPrefetchAsync(offsets.data().get(), sizeof(int) * offsets.size(), 0); + cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); + cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + + for (int i = 0; i < 1; ++i) { + gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); + cudaMemset(masks.data().get(), -1, sizeof(int8_t) * masks.size()); invokeMoeGate_V2(f2n.data().get(), en2f.data().get(), offsets.data().get(), @@ -259,6 +286,7 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, + true, 0); } @@ -306,7 +334,10 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success || false) { + if (!success && 1) { + + diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); + print_vecs(offsets_ref.data().get(), 1, expert_num + 1, "offsets_ref"); print_vecs(offsets.data().get(), 1, expert_num + 1, "offsets"); @@ -322,32 +353,32 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); - print_vecs(accum.data().get(), expert_num, 1, "accum"); + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); - thrust::host_vector tile_offsets(tape.max_ctas); - std::cout << tape.max_ctas << std::endl; - 
cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); - cudaDeviceSynchronize(); - - std::cout << "coords:\n"; - int last = -1; - for (int i = 0; i < tape.max_ctas; ++i) { - auto& c = tile_offsets[i]; - if (last >= 0 && c.w != last) { - std::cout << "\n"; - } - if (c.w == -1) { - std::cout << i << "\n"; - break; - } - last = c.w; - std::stringstream ss; - ss << c.x << "," << c.y; - std::cout << std::setw(6) << ss.str(); - } - std::cout << "\n"; + // thrust::host_vector tile_offsets(tape.max_ctas); + // std::cout << tape.max_ctas << std::endl; + // cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); + // cudaDeviceSynchronize(); + + // std::cout << "coords:\n"; + // int last = -1; + // for (int i = 0; i < tape.max_ctas; ++i) { + // auto& c = tile_offsets[i]; + // if (last >= 0 && c.w != last) { + // std::cout << "\n"; + // } + // if (c.w == -1) { + // std::cout << i << "\n"; + // break; + // } + // last = c.w; + // std::stringstream ss; + // ss << c.x << "," << c.y; + // std::cout << std::setw(6) << ss.str(); + // } + // std::cout << "\n"; } return success; @@ -358,7 +389,11 @@ int main() gemm::Tape tape{}; constexpr Tiling tiling{14336, 128, {128, 128, 32}}; - test_moe_gate(8192, 8, 2, tape, tiling); + // test_moe_gate(32768 * 4, 60, 4, tape, tiling); + // test_moe_gate(32768, 64, 8, tape, tiling); + // test_moe_gate(8, 60, 4, tape, tiling); + + test_moe_gate(65536, 8, 2, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 6b1ec88f5..7a089fbdf 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -514,6 +514,7 @@ class Testbed { c_e_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, @@ -523,6 +524,7 @@ class Testbed { c_e_ref_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 2d68ef353..3b9bb1c11 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -137,6 +137,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, moe_param.inter_size, moe_param.expert_num, moe_param.method, + moe_param.shared_gate, tensor_para_size_, weight_type, group_size, @@ -349,18 +350,22 @@ void LlamaDecoderLayerWeight::mallocWeights() mallocWeights(self_attn_weights.qkv, attn_bias_); mallocWeights(self_attn_weights.output, attn_bias_); - if (moe_weights.experts.empty()) { + if (inter_size_) { mallocWeights(ffn_weights.gating, false); mallocWeights(ffn_weights.intermediate, false); mallocWeights(ffn_weights.output, false); } - else { + + if (!moe_weights.experts.empty()) { mallocWeights(moe_weights.gate, false); for (auto& e : moe_weights.experts) { mallocWeights(e.gating, false); mallocWeights(e.intermediate, false); mallocWeights(e.output, false); } + if (moe_weights.shared_gate.output_dims) { + mallocWeights(moe_weights.shared_gate, false); + } } } @@ -375,10 +380,25 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() freeWeights(self_attn_weights.qkv); freeWeights(self_attn_weights.output); - freeWeights(ffn_weights.fused_gating_intermediate); - 
freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); + if (inter_size_) { + freeWeights(ffn_weights.fused_gating_intermediate); + freeWeights(ffn_weights.gating); + freeWeights(ffn_weights.intermediate); + freeWeights(ffn_weights.output); + } + + if (!moe_weights.experts.empty()) { + freeWeights(moe_weights.gate); + for (auto& e : moe_weights.experts) { + freeWeights(e.fused_gating_intermediate); + freeWeights(e.gating); + freeWeights(e.intermediate); + freeWeights(e.output); + } + if (moe_weights.shared_gate.kernel) { + freeWeights(moe_weights.shared_gate); + } + } } template @@ -428,23 +448,30 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); - if (moe_weights.experts.empty()) { + if (inter_size_) { getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output); getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); } - else { + + if (!moe_weights.experts.empty()) { output.insert( concat(prefix, "moe_ffn.gate.weight"), Tensor{MEMORY_GPU, getTensorType(), {moe_weights.gate.kernel_size()}, moe_weights.gate.kernel}); auto& experts = moe_weights.experts; for (size_t i = 0; i < experts.size(); ++i) { const std::string name = "moe_ffn.experts." + std::to_string(i); - // std::cerr << "FUCK " << get_prefix(concat(name, "w1")) << "\n"; getWeightTensor(experts[i].gating, false, get_prefix(concat(name, "w1")), output); getWeightTensor(experts[i].intermediate, false, get_prefix(concat(name, "w3")), output); getWeightTensor(experts[i].output, false, get_prefix(concat(name, "w2")), output); } + if (moe_weights.shared_gate.kernel) { + output.insert(concat(prefix, "moe_ffn.shared_gate.weight"), + Tensor{MEMORY_GPU, + getTensorType(), + {moe_weights.shared_gate.kernel_size()}, + moe_weights.shared_gate.kernel}); + } } return output; @@ -681,10 +708,13 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud convert(ffn.output, is_fused_moe, workspace, size, is_16xx); }; - if (moe_weights.experts.empty()) { + if (inter_size_) { + std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } - else { + + if (!moe_weights.experts.empty()) { + std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 9a895243b..382d0dfc6 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -185,11 +185,15 @@ struct MoeFfnWeight { int inter_size, int expert_num, int method, + bool has_shared_gate, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + + printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (expert_num == 0) { return; } @@ -208,11 +212,23 @@ struct MoeFfnWeight { // inter size is divided by tp in `FfnWeight` e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; } + + if (has_shared_gate) { + shared_gate.input_dims = hidden_dim; + shared_gate.output_dims = 1; + shared_gate.type = get_default_weight_type(); + gate.group_size = group_size; + } + else { + shared_gate = {}; + } } LlamaDenseWeight 
gate; std::vector> experts; + LlamaDenseWeight shared_gate; + LlamaFfnWeight block; int method{}; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 1c039ca66..2ea63f041 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -31,9 +31,11 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; + int expert_num; + int experts_per_token; + int inter_size; + bool norm_topk; + bool shared_gate; }; struct AttentionParam { diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index def6b04ab..1ad76839d 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -30,6 +30,7 @@ void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); + alloc(&shared_scales_, tokens); return (char*)alloc.ptr() - (char*)base; }; @@ -69,7 +70,7 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama getCudaDataType(), hidden_dim_, &beta, - logits_, + logits, CUDA_R_32F, weight.output_dims, CUDA_R_32F, @@ -77,13 +78,13 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama } template -void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; AllocateBuffer(tokens, padded); - gate(logits_, inout, tokens, moe.gate); + gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); @@ -103,6 +104,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei padded, param_.expert_num, param_.experts_per_token, + param_.norm_topk, stream_); sync_check_cuda_error(); @@ -123,7 +125,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei if (param_.method == MoeParam::kNaive) { - dispatchMoeGather(inout_buf_, inout, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); + dispatchMoeGather(inout_buf_, input, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); sync_check_cuda_error(); check_cuda_error( @@ -155,28 +157,8 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei auto& block = moe.block; -#if 0 - FT_CHECK(!block.is_fused_silu); - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - inter_size_ * 2, - count, - hidden_dim_, - moe.experts[i].fused_gating_intermediate.kernel, - hidden_dim_, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2); - sync_check_cuda_error(); - } - } - auto mode = kCmpWrite; -#else linear_->forward_moe(inter_buf_, - {inout, (int)hidden_dim_}, + {input, (int)hidden_dim_}, f2n_, offsets_, tokens * param_.experts_per_token, @@ -185,7 +167,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // 
Compare(inter_buf_, // @@ -205,25 +186,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei sync_check_cuda_error(); } -#if 0 - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - hidden_dim_, - count, - inter_size_, - moe.experts[i].output.kernel, - inter_size_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_); - sync_check_cuda_error(); - } - } - auto mode1 = kCmpWrite; -#else linear_->forward_moe(inout_buf_, {inter_buf_, block.is_fused_silu ? (int)inter_size_ : (int)inter_size_ * 2}, nullptr, @@ -234,7 +196,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode1 = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_2_, // @@ -250,18 +211,29 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei // } } - invokeMoeReduce(inout, inout_buf_, scales_, en2f_, tokens, param_.experts_per_token, hidden_dim_, stream_); + if (moe.shared_gate.kernel) { + gate(shared_scales_, input, tokens, moe.shared_gate); + } +} + +template +void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +{ + invokeMoeReduce(output, + inout_buf_, + scales_, + en2f_, + moe.shared_gate.kernel ? shared_scales_ : nullptr, + tokens, + param_.experts_per_token, + hidden_dim_, + stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum(inout, inout, tokens * hidden_dim_, tensor_para_, stream_); + ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } - - // if (tensor_para_.rank_ == 0) { - // check_cuda_error(cudaStreamSynchronize(stream_)); - // std::abort(); - // } } template diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index ef65aaa46..0f1713f7b 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -51,7 +51,9 @@ class MoeFfnLayer { FreeBuffer(); } - void forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe); + void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); + + void reduce(T* output, int tokens, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); @@ -85,6 +87,8 @@ class MoeFfnLayer { int* en2f_{}; float* scales_{}; + float* shared_scales_{}; + int* accum_{}; int* offsets_{}; }; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 68392215f..28e8b5f64 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -26,9 +26,15 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, dtype_(getTensorType()) { - attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - ffn_layer_ = std::make_unique>(model, tp, ctx, true); - moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); + + if (moe.expert_num) { + moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + } + + if (model.inter_size) { + ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); } @@ -190,9 +196,10 @@ void 
UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con /// feed-forward network if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->forward(decoder_output, token_num, layer, weights->at(layer)->moe_weights); + moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - else { + + if (ffn_layer_) { int layer_id = layer; // int is needed TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; @@ -203,6 +210,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } + if (!weights->at(layer)->moe_weights.experts.empty()) { + moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + } + count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); const bool is_last_layer = layer == layer_num_ - 1; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 8db13652f..38552be0c 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -301,6 +301,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.expert_num = model_reader["expert_num"].as(0); moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); + moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); handleMissingParams(); From cea0524c74a678797bd61c26a5e367174dffa167 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 7 Nov 2024 07:32:53 +0000 Subject: [PATCH 02/21] eliminate `inter_size_` from ffn layer --- src/turbomind/models/llama/LlamaDenseWeight.h | 13 +++-- src/turbomind/models/llama/LlamaFfnLayer.cc | 47 ++++++++++--------- src/turbomind/models/llama/LlamaFfnLayer.h | 11 ++--- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 382d0dfc6..21c1c666c 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -145,24 +145,28 @@ struct LlamaFfnWeight { LlamaFfnWeight( size_t hidden_dim, size_t inter_size, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + inter_size /= tp; + + this->inter_size = inter_size; + gating.input_dims = hidden_dim; - gating.output_dims = inter_size / tp; + gating.output_dims = inter_size; gating.type = weight_type; gating.group_size = group_size; intermediate.input_dims = hidden_dim; - intermediate.output_dims = inter_size / tp; + intermediate.output_dims = inter_size; intermediate.type = weight_type; intermediate.group_size = group_size; fused_gating_intermediate.input_dims = hidden_dim; - fused_gating_intermediate.output_dims = inter_size / tp * 2; + fused_gating_intermediate.output_dims = inter_size * 2; fused_gating_intermediate.type = weight_type; fused_gating_intermediate.group_size = group_size; is_fused_silu = fuse_silu_act; - output.input_dims = inter_size / tp; + output.input_dims = inter_size; output.output_dims = hidden_dim; output.type = weight_type; output.group_size = group_size; @@ -173,6 +177,7 @@ struct LlamaFfnWeight { LlamaDenseWeight 
output; LlamaDenseWeight fused_gating_intermediate; + int inter_size{}; bool is_fused_silu{}; }; diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index f9ee0c4ad..8cce20720 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -28,10 +28,11 @@ namespace turbomind { template void LlamaFfnLayer::allocateBuffer(size_t token_num, + int inter_size, const LlamaDenseWeight* gating, const LlamaDenseWeight* inter) { - const size_t sz = token_num * inter_size_; + const size_t sz = token_num * inter_size; const size_t sz_gate = token_num * gating->lora.r; const size_t sz_inter = token_num * inter->lora.r; @@ -51,24 +52,24 @@ template void LlamaFfnLayer::freeBuffer() { if (is_allocate_buffer_) { - // allocator_->free((void**)&inter_buf_); allocator_->free((void**)&gating_buf_); is_allocate_buffer_ = false; } } template -void LlamaFfnLayer::activation(int token_num, bool is_chunked) +void LlamaFfnLayer::activation(int token_num, int inter_size, bool is_chunked) { NvtxScope scope("activation"); if (is_chunked) { + // gate & up are in the SAME buffer invokeGenericActivation_v2( - gating_buf_, gating_buf_ + inter_size_, inter_size_ * 2, token_num, inter_size_, stream_); + gating_buf_, gating_buf_ + inter_size, inter_size * 2, token_num, inter_size, stream_); sync_check_cuda_error(); } else { - invokeGenericActivation_v2( - gating_buf_, inter_buf_, inter_size_, token_num, inter_size_, stream_); + // gate & up are in separate buffers + invokeGenericActivation_v2(gating_buf_, inter_buf_, inter_size, token_num, inter_size, stream_); sync_check_cuda_error(); } } @@ -88,11 +89,11 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, NvtxScope scope("ffn"); - const size_t num_token = input_tensors->at("ffn_input").shape[0]; - const int layer_id = input_tensors->getVal("layer_id"); - // LOG(WARNING); + const size_t token_num = input_tensors->at("ffn_input").shape[0]; + const int layer_id = input_tensors->getVal("layer_id"); + const int inter_size = weights->inter_size; - allocateBuffer(num_token, &weights->gating, &weights->intermediate); + allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); @@ -103,50 +104,50 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const auto type = weights->is_fused_silu ? 
LlamaLinear<T>::kFusedSiluFfn : LlamaLinear<T>::kGemm;
 
-        linear_->forward(gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, type);
+        linear_->forward(gating_buf_, ffn_input_data, token_num, weights->fused_gating_intermediate, type);
         sync_check_cuda_error();
 
         if (!weights->is_fused_silu) {
-            activation(num_token, true);
+            activation(token_num, inter_size, true);
         }
 
-        count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3);
+        count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3);
     }
     else {
         {  // w1(x)
             NvtxScope scope("w1");
-            linear_->forward(gating_buf_, ffn_input_data, num_token, weights->gating, LlamaLinear<T>::kGemm, lora_mask);
+            linear_->forward(gating_buf_, ffn_input_data, token_num, weights->gating, LlamaLinear<T>::kGemm, lora_mask);
             sync_check_cuda_error();
         }
 
-        count_and_fix(gating_buf_, num_token * weights->gating.output_dims, Concat("w1", layer_id), 3);
+        count_and_fix(gating_buf_, token_num * weights->gating.output_dims, Concat("w1", layer_id), 3);
 
         {  // w3(x)
             NvtxScope scope("w3");
             linear_->forward(
-                inter_buf_, ffn_input_data, num_token, weights->intermediate, LlamaLinear<T>::kGemm, lora_mask);
+                inter_buf_, ffn_input_data, token_num, weights->intermediate, LlamaLinear<T>::kGemm, lora_mask);
             sync_check_cuda_error();
         }
 
-        count_and_fix(inter_buf_, num_token * weights->intermediate.output_dims, Concat("w3", layer_id), 3);
+        count_and_fix(inter_buf_, token_num * weights->intermediate.output_dims, Concat("w3", layer_id), 3);
 
         // silu(w1(x)) * w3(x)
-        activation(num_token, false);
+        activation(token_num, inter_size, false);
 
-        count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("act", layer_id), 3);
+        count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("act", layer_id), 3);
     }
 
     {  // w2(x)
         NvtxScope scope("w2");
-        const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? inter_size_ * 2 : 0;
+        const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? 
inter_size * 2 : 0; linear_->forward( - ffn_output_data, {gating_buf_, pitch}, num_token, weights->output, LlamaLinear::kGemm, lora_mask); + ffn_output_data, {gating_buf_, pitch}, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(ffn_output_data, num_token * weights->output.output_dims, Concat("w2", layer_id), 3); + count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); if (all_reduce_ && tensor_para_.world_size_ > 1) { NcclGuard nccl_guard(tensor_para_, stream_); - ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_); + ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 75ced5f9a..2daca2cc9 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -19,12 +19,11 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" -#include "src/turbomind/utils/custom_ar_comm.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/nccl_utils.h" -#include namespace turbomind { @@ -32,7 +31,6 @@ template class LlamaFfnLayer { public: LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): - inter_size_(model.inter_size / tp.world_size_), hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), @@ -50,13 +48,12 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); void freeBuffer(); - void activation(int token_num, bool is_chunked); + void activation(int token_num, int inter_size, bool is_chunked); - const size_t inter_size_; const size_t hidden_units_; const NcclParam tensor_para_; cudaStream_t const stream_; From b7d050a044be2ff4eded41e440e1f54494c9a816 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 7 Nov 2024 08:02:29 +0000 Subject: [PATCH 03/21] clean up --- lmdeploy/turbomind/deploy/module.py | 2 +- src/turbomind/kernels/gemm/moe_utils_v2.cu | 1101 +---------------- .../models/llama/LlamaDecoderLayerWeight.cc | 4 +- src/turbomind/models/llama/LlamaDenseWeight.h | 2 +- 4 files changed, 41 insertions(+), 1068 deletions(-) diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 8e20946b1..2d3575c37 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -165,7 +165,7 @@ def apply(self, i: int, r: BaseReader): if self.shared_gate: shared_gate = transpose(r.moe_ffn_shared_gate(i)) - print(shared_gate) + # print(shared_gate) self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i)) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 42ff84a3a..dc2c21d7c 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -252,880 +252,6 @@ __global__ void MoeScanKernel_v2(int* f2n, // [e*n] } } -template -__global__ 
void MoeGateKernel_v3(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - __shared__ int shared_accum[max_expert_num][max_tiles]; - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - shared_accum[e][t] = 0; - } - } - - __syncthreads(); - - float data[items_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - const int e = threads_per_token * i + ei; - if (e < expert_num && ti < token_num) { - data[i] = __ldg(logits + ti * expert_num + e); - - if (ti == 39505) { - printf("%f %d\n", data[i], e); - } - } - } - - unsigned mask = (unsigned)-1; - float max_logit; - - auto run = [&](int k) { - unsigned bit = 1; - unsigned max_bit = 0; - float max_val = -std::numeric_limits::infinity(); - // local maximum - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((mask & bit) && data[i] > max_val) { - max_bit = bit; - max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - int g_max_ei = ei; - float g_max_val = max_val; - if constexpr (threads_per_token > 1) { - // global maximum - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - if (k == 0) { - max_logit = g_max_val; - } - if (ei == g_max_ei) { - mask -= max_bit; - } - }; - - run(0); - for (int k = 1; k < max_top_k; ++k) { - run(k); - } - - mask = ~mask; - - float sum_prob{}; - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if (norm_top_k == false || (mask & bit)) { - data[i] = expf(data[i] - max_logit); - sum_prob += data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - - sum_prob = fdividef(1.f, sum_prob); - - const unsigned group_mask = ((1U << (unsigned)threads_per_token) - 1U) << (unsigned)warp_ti_offset; - - // 1111 1111 << 24 - - const unsigned lanemask_lt = ((1U << (unsigned)ei) - 1U) << (unsigned)warp_ti_offset; - - // 1000 0000 - // 0111 1111 0000 0000 0000 0000 0000 0000 - - int offset = 0; - bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int valid = mask & bit; - const unsigned active = __ballot_sync((uint32_t)-1, valid); - if (valid) { - const int e = threads_per_token * i + ei; - const int idx = offset + 
__popc(active & lanemask_lt); - if (ti == 39505) { - printf("%d %d %f\n", e, idx, data[i] * sum_prob); - } - masks[e * token_num_padded + ti] = idx; - scales[idx * token_num + ti] = data[i] * sum_prob; - // atomic add in Smem - atomicAdd(&shared_accum[e][ti >> log_tile], 1); - } - offset += __popc(active & group_mask); - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - __syncthreads(); - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[e][t]); - } - } -} - -template -__global__ void MoeGateKernel_v4(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - __shared__ int shared_accum[max_expert_num][max_tiles]; - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - shared_accum[e][t] = 0; - } - -#if 0 - logits += blockIdx.x * block_dim / threads_per_token * expert_num; - logits += threadIdx.x / WARP_SIZE * (WARP_SIZE / threads_per_token) * expert_num; - constexpr int tokens_per_warp = WARP_SIZE / threads_per_token; - __shared__ float smem_data[block_dim / WARP_SIZE][tokens_per_warp][max_expert_num + 1]; - PRAGMA_UNROLL - for (int i = 0; i < tokens_per_warp * max_expert_num; i += WARP_SIZE) { - smem_data[warp_id][(lane_id + i) / max_expert_num][(lane_id + i) % max_expert_num] = logits[lane_id + i]; - } - __syncthreads(); - float data[items_per_thread]; - for (int i = 0; i < items_per_thread; ++i) { - const int e = items_per_thread * ei + i; - data[i] = smem_data[warp_id][lane_id / threads_per_token][e]; - } -#else - __syncthreads(); - float data[items_per_thread]; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - } - if (ti < token_num) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; i += alignment) { - const int e = items_per_thread * ei + i; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } -#endif - - unsigned mask = (unsigned)-1; - float max_logit; - - auto run = [&](int k) { - unsigned bit = 1; - unsigned max_bit = 0; - float max_val = -std::numeric_limits::infinity(); - // local maximum - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((mask & bit) && data[i] > max_val) { - max_bit = bit; - max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - int g_max_ei = ei; - float g_max_val = max_val; - if constexpr (threads_per_token > 1) { - // 
global maximum - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - if (k == 0) { - max_logit = g_max_val; - } - if (ei == g_max_ei) { - mask -= max_bit; - } - }; - - run(0); - for (int k = 1; k < max_top_k; ++k) { - run(k); - } - - mask = ~mask; - - float sum_prob{}; - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if (norm_top_k == false || (mask & bit)) { - data[i] = expf(data[i] - max_logit); - sum_prob += data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - - sum_prob = fdividef(1.f, sum_prob); - - const int count = __popc(mask); - - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; - - int idx{}; - WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); - - bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int valid = mask & bit; - const int e = items_per_thread * ei + i; - if (valid) { - masks[e * token_num_padded + ti] = idx; - scales[idx * token_num + ti] = data[i] * sum_prob; - // atomic add in Smem - atomicAdd(&shared_accum[e][ti >> log_tile], 1); - ++idx; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - __syncthreads(); - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[e][t]); - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// - -template -__global__ void MoeGateKernel_v5(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - __shared__ int shared_accum[max_expert_num][max_tiles]; - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - shared_accum[e][t] = 0; - } - - __syncthreads(); - - constexpr int sort_k = std::min(items_per_thread, max_top_k); - - float data[items_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - } - if (ti < token_num) { - PRAGMA_UNROLL 
- for (int i = 0; i < items_per_thread; i += alignment) { - const int e = items_per_thread * ei + i; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } - - float value[items_per_thread]; - int index[items_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - value[i] = data[i]; - index[i] = items_per_thread * ei + i; - } - - // PRAGMA_UNROLL - // for (int i = 0; i < sort_k; ++i) { - // for (int j = items_per_thread - 1; j > i; --j) { - // if (value[j] > value[j - 1]) { - // auto tmp0 = value[j]; - // value[j] = value[j - 1]; - // value[j - 1] = tmp0; - // auto tmp1 = index[j]; - // index[j] = index[j - 1]; - // index[j - 1] = tmp1; - // } - // } - // } - - PRAGMA_UNROLL - for (int j = items_per_thread - 1; j > 0; --j) { - if (value[j] > value[j - 1]) { - auto tmp0 = value[j]; - value[j] = value[j - 1]; - value[j - 1] = tmp0; - auto tmp1 = index[j]; - index[j] = index[j - 1]; - index[j - 1] = tmp1; - } - } - - float max_logit; - - int count = 0; - - auto run = [&](int k) { - float max_val = -std::numeric_limits::infinity(); - - PRAGMA_UNROLL - for (int i = 0; i <= k; ++i) { - if (i == count) { - max_val = value[i]; - } - } - - if (k + 1 < sort_k) { - PRAGMA_UNROLL - for (int j = items_per_thread - 1; j > k + 1; --j) { - if (value[j] > value[j - 1]) { - auto tmp0 = value[j]; - value[j] = value[j - 1]; - value[j - 1] = tmp0; - auto tmp1 = index[j]; - index[j] = index[j - 1]; - index[j - 1] = tmp1; - } - } - } - - int g_max_ei = ei; - float g_max_val = max_val; - if constexpr (threads_per_token > 1) { - // global maximum - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - - if (k == 0) { - max_logit = g_max_val; - } - - if (ei == g_max_ei) { - count += 1; - } - }; - - PRAGMA_UNROLL - for (int k = 0; k < max_top_k; ++k) { - run(k); - } - - float sum_prob{}; - - PRAGMA_UNROLL - for (int i = 0; i < sort_k; ++i) { - if (i < count) { - value[i] = expf(value[i] - max_logit); - sum_prob += value[i]; - } - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - sum_prob = fdividef(1.f, sum_prob); - - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; - - int idx{}; - WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); - - PRAGMA_UNROLL - for (int i = 0; i < sort_k; ++i) { - if (ti < token_num && i < count) { - const int e = index[i]; - masks[e * token_num_padded + ti] = idx; - scales[idx * token_num + ti] = value[i] * sum_prob; - atomicAdd(&shared_accum[e][ti >> log_tile], 1); - ++idx; - } - } - - __syncthreads(); - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[e][t]); - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////// - -template -__global__ void MoeGateKernel_v6(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - 
int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - __shared__ int shared_accum[max_expert_num][max_tiles]; - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - shared_accum[e][t] = 0; - } - - __syncthreads(); - - float data[items_per_thread]; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - } - if (ti < token_num) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; i += alignment) { - const int e = items_per_thread * ei + i; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } - - float max_logit; - - unsigned mask = (unsigned)-1; - unsigned max_bit = 0; - float max_val = -std::numeric_limits::infinity(); - { - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((mask & bit) && data[i] > max_val) { - max_bit = bit; - max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - } - - auto run = [&](int k) { - unsigned next_mask = mask - max_bit; - unsigned next_max_bit = 0; - float next_max_val = -std::numeric_limits::infinity(); - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((next_mask & bit) && data[i] > next_max_val) { - next_max_bit = bit; - next_max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - int g_max_ei = ei; - float g_max_val = max_val; - - if constexpr (threads_per_token > 1) { - // global maximum - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - - if (k == 0) { - max_logit = g_max_val; - } - - if (ei == g_max_ei) { - mask = next_mask; - max_bit = next_max_bit; - max_val = next_max_val; - } - }; - - run(0); - for (int k = 1; k < max_top_k; ++k) { - run(k); - } - - mask = ~mask; - - float sum_prob{}; - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if (norm_top_k == false || (mask & bit)) { - data[i] = expf(data[i] - max_logit); - sum_prob += data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - - sum_prob = fdividef(1.f, sum_prob); - - const int count = __popc(mask); - - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; - - int idx{}; - WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); - - bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int valid = mask & 
bit; - const int e = items_per_thread * ei + i; - if (valid) { - masks[e * token_num_padded + ti] = idx; - scales[idx * token_num + ti] = data[i] * sum_prob; - // atomic add in Smem - atomicAdd(&shared_accum[e][ti >> log_tile], 1); - ++idx; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - __syncthreads(); - - for (int i = threadIdx.x; i < expert_num * max_tiles; i += block_dim) { - int e = i / max_tiles; - int t = i % max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[e][t]); - } - } -} - -template -__global__ void MoeGateKernel_v7(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - // +1 padding greatly reduced (-80%) bank conflicts - __shared__ int shared_accum[max_tiles][max_expert_num + 1]; - - for (int i = threadIdx.x; i < max_tiles * max_expert_num; i += block_dim) { - int e = i % max_expert_num; - int t = i / max_expert_num; - shared_accum[t][e] = 0; - } - -#if 0 - logits += blockIdx.x * block_dim / threads_per_token * expert_num; - logits += threadIdx.x / WARP_SIZE * (WARP_SIZE / threads_per_token) * expert_num; - constexpr int tokens_per_warp = WARP_SIZE / threads_per_token; - __shared__ float smem_data[block_dim / WARP_SIZE][tokens_per_warp][max_expert_num]; - PRAGMA_UNROLL - for (int i = 0; i < tokens_per_warp * max_expert_num; i += WARP_SIZE) { - smem_data[warp_id][(lane_id + i) / max_expert_num][(lane_id + i) % max_expert_num] = logits[lane_id + i]; - } - __syncthreads(); - float data[items_per_thread]; - for (int i = 0; i < items_per_thread; ++i) { - const int e = items_per_thread * ei + i; - data[i] = smem_data[warp_id][lane_id / threads_per_token][e]; - } -#else - __syncthreads(); - float data[items_per_thread]; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - } - if (ti < token_num) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; i += alignment) { - const int e = items_per_thread * ei + i; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } -#endif - - unsigned mask = (unsigned)-1; - float max_logit; - - auto run = [&](int k) { - unsigned bit = 1; - unsigned max_bit = 0; - float max_val = -std::numeric_limits::infinity(); - // local maximum - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if ((mask & bit) && data[i] > max_val) { - max_bit = bit; - max_val = data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - int g_max_ei = ei; - float g_max_val = max_val; - if constexpr (threads_per_token > 1) { - // global maximum 
- PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); - } - // tie breaking - const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); - g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; - } - if (k == 0) { - max_logit = g_max_val; - } - if (ei == g_max_ei) { - mask -= max_bit; - } - }; - - run(0); - for (int k = 1; k < max_top_k; ++k) { - run(k); - } - - mask = ~mask; - - float sum_prob{}; - unsigned bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - if (norm_top_k == false || (mask & bit)) { - data[i] = expf(data[i] - max_logit); - sum_prob += data[i]; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - PRAGMA_UNROLL - for (int m = threads_per_token / 2; m >= 1; m /= 2) { - sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); - } - - sum_prob = fdividef(1.f, sum_prob); - - const int count = __popc(mask); - - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage[block_dim / threads_per_token]; - - int idx{}; - WarpScan{temp_storage[threadIdx.x / threads_per_token]}.ExclusiveSum(count, idx); - - bit = 1; - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int valid = mask & bit; - const int e = items_per_thread * ei + i; - if (valid) { - masks[e * token_num_padded + ti] = idx; - // scales[idx * token_num + ti] = data[i] * sum_prob; - scales[ti * top_k + idx] = data[i] * sum_prob; - // atomic add in Smem - atomicAdd(&shared_accum[ti >> log_tile][e], 1); - ++idx; - } - asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); - } - - __syncthreads(); - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int t = i % max_tiles; - int e = i / max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[t][e]); - } - } -} - template= 1; m /= 2) { - // sum_v += __shfl_xor_sync((uint32_t)-1, sum_v, m); - // } - // if (ti == 0) { - // printf("sum=%f\n", sum_v); - // } - using WarpScan = cub::WarpScan; __shared__ typename WarpScan::TempStorage temp_storage[tokens_per_cta]; @@ -1453,144 +559,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] } } -#ifdef USE_WARPSORT - -struct Greater { - template - __device__ bool operator()(T a, T b) const noexcept - { - return a > b; - } -}; - -template -__global__ void MoeGateKernel_v9(float* scales, // [e,n] - int* masks, // [E,n], padded - int* accum, // [E,tiles] - const float* logits, // [n,E] - int log_tile, - int tiles, - int token_num, - int token_num_padded, - int expert_num, - int top_k) -{ - constexpr int max_tiles = kMoeGateMaxTiles; - constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 - - // We use bits in a uint32_t to represent selected experts - static_assert(items_per_thread <= 32); - // We use warp-level primitives for reduction - static_assert(threads_per_token <= 32); - - static_assert((threads_per_token & (threads_per_token - 1)) == 0); - - const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; - - const int ti = thread_idx / threads_per_token; - const int ei = thread_idx % threads_per_token; - - const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_ti_offset = warp_ti * threads_per_token; - - // +1 padding greatly reduced (-80%) bank conflicts - __shared__ int shared_accum[max_tiles][max_expert_num + 1]; - - PRAGMA_UNROLL - for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { 
- int e = (i + threadIdx.x) % max_expert_num; - int t = (i + threadIdx.x) / max_expert_num; - shared_accum[t][e] = 0; - } - - __syncthreads(); - - float data[items_per_thread]; - int idxs[items_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] = -std::numeric_limits::infinity(); - idxs[i] = threads_per_token * (i / alignment * alignment) + i % alignment + ei * alignment; - } - - if (ti < token_num) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; i += alignment) { - const int e = threads_per_token * i + ei * alignment; - if (e < expert_num) { - Load((Array&)data[i], &logits[ti * expert_num + e]); - } - } - } - - constexpr float kLog2e = 1.4426950408889634074; - - using MergeSort = cub::WarpMergeSort; - - union Smem { - typename MergeSort::TempStorage temp[block_dim / threads_per_token]; - // int2 sorted[max_top_k][block_dim / threads_per_token + 1]; - int2 sorted[block_dim / threads_per_token][max_top_k + 1]; - }; - - __shared__ Smem smem; - - MergeSort{smem.temp[threadIdx.x / threads_per_token]}.Sort(data, idxs, Greater{}); - - __syncthreads(); - - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - const int k = threads_per_token * ei + i; // blocked - if (k < max_top_k) { - // smem.sorted[k][threadIdx.x / threads_per_token] = int2{float_as_int(data[i]), idxs[i]}; - smem.sorted[threadIdx.x / threads_per_token][k] = int2{float_as_int(data[i]), idxs[i]}; - } - } - - __syncthreads(); - - constexpr int top_k_per_thread = cdiv(max_top_k, threads_per_token); - - int es[top_k_per_thread]; - - PRAGMA_UNROLL - for (int i = 0; i < top_k_per_thread; ++i) { - const int k = ei * top_k_per_thread + i; - if (k < max_top_k) { - // const int2 tmp = smem.sorted[k][threadIdx.x / threads_per_token]; - const int2 tmp = smem.sorted[threadIdx.x / threads_per_token][k]; - const int e = tmp.y; - masks[e * token_num_padded + ti] = k; - scales[ti * top_k + k] = int_as_float(tmp.x); - es[i] = e; - // atomicAdd(&shared_accum[ti >> log_tile][e], 1); - } - } - - PRAGMA_UNROLL - for (int i = 0; i < top_k_per_thread; ++i) { - const int k = ei * top_k_per_thread + i; - if (k < max_top_k) { - atomicAdd(&shared_accum[ti >> log_tile][es[i]], 1); - } - } - - __syncthreads(); - - for (int i = threadIdx.x; i < max_expert_num * max_tiles; i += block_dim) { - int t = i % max_tiles; - int e = i / max_tiles; - if (e < expert_num && t < tiles) { - // atomic add in Gmem - atomicAdd(accum + e * tiles + t, shared_accum[t][e]); - } - } -} - -#endif - template inline constexpr std::integral_constant _Int{}; @@ -1618,13 +586,15 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - { + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(tokens, threads / thrs_per_tok); + + cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - auto invoke = [&](auto e) { - constexpr int threads = 128; - const int blocks = ceil_div(tokens * 2, threads); - static constexpr int top_k = decltype(e)::value; - MoeGateKernel_V2<<>>( // + MoeGateKernel_v8 + <<>>( // scales, (int8_t*)masks, accum, @@ -1633,34 +603,37 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n tiles, tokens, tokens_padded, - experts); - }; - - auto invoke2 = [&](auto max_expert_num, auto top_k, auto items_per_thread) { - constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; - constexpr 
int threads = 128; - const int blocks = ceil_div(tokens, threads / thrs_per_tok); - - cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - - MoeGateKernel_v8 - <<>>( // - scales, - (int8_t*)masks, - accum, - logits, - log_tile, - tiles, - tokens, - tokens_padded, - experts, - experts_per_token, - norm_topk); - }; + experts, + experts_per_token, + norm_topk); + }; - invoke2(_Int<64>, _Int<8>, _Int<16>); + auto fail = [&] { + std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::abort(); + }; - // invoke(_Int<2>); + if (experts <= 8) { + if (experts_per_token <= 2) { + invoke(_Int<8>, _Int<2>, _Int<8>); + } + else { + invoke(_Int<8>, _Int<8>, _Int<8>); + } + } + else if (experts <= 64) { + if (experts_per_token <= 4) { + invoke(_Int<64>, _Int<4>, _Int<16>); + } + else if (experts_per_token <= 8) { + invoke(_Int<64>, _Int<8>, _Int<16>); + } + else { + fail(); + } + } + else { + fail(); } { diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 3b9bb1c11..f6f9ab0ef 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -709,12 +709,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud }; if (inter_size_) { - std::cerr << "process FFN\n"; + // std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } if (!moe_weights.experts.empty()) { - std::cerr << "process MoE\n"; + // std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 21c1c666c..169fb53bc 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -197,7 +197,7 @@ struct MoeFfnWeight { bool fuse_silu_act) { - printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); if (expert_num == 0) { return; From 8fade570d22fa34b167653524d400451072c09b6 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 7 Nov 2024 08:22:26 +0000 Subject: [PATCH 04/21] fix lint --- lmdeploy/turbomind/deploy/module.py | 3 ++- .../turbomind/deploy/source_model/qwen.py | 19 ++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 2d3575c37..8d998abe2 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -166,7 +166,8 @@ def apply(self, i: int, r: BaseReader): if self.shared_gate: shared_gate = transpose(r.moe_ffn_shared_gate(i)) # print(shared_gate) - self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i)) + self.model.save_split(shared_gate, + self._moe_ffn_shared_gate.format(i)) class Attn(Module): diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 071ade122..772bd0303 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -120,7 +120,6 @@ def model_info(self): cfg = super().model_info() cfg['attn_bias'] = 1 return cfg - class Qwen2MoeReader(LlamaReader): @@ -137,11 +136,10 @@ def moe_ffn_expert(self, e=None, i=None, kind=None): tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + 
def moe_ffn_gate(self, i): - return self.params.get( - f'model.layers.{i}.mlp.gate.weight') - + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + def _ffn(self, i: int, kind: str): """Get ffn kind for layer i.""" if not kind: @@ -153,11 +151,11 @@ def _ffn(self, i: int, kind: str): tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + def moe_ffn_shared_gate(self, i): return self.params.get( - f'model.layers.{i}.mlp.shared_expert_gate.weight' - ) + f'model.layers.{i}.mlp.shared_expert_gate.weight') + @INPUT_MODELS.register_module(name='qwen2-moe') class Qwen2MoeModel(LlamaModel): @@ -165,9 +163,8 @@ class Qwen2MoeModel(LlamaModel): Reader = Qwen2MoeReader def tokenizer_info(self): - """ - https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_config.json - """ # noqa: E501 + """https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_con + fig.json.""" # noqa: E501 n_words = 152064 bos_id = 151643 eos_id = 151645 From 4c0f9026dabcda0eec54fa3043cb21ef4c62a134 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 7 Nov 2024 08:29:27 +0000 Subject: [PATCH 05/21] clean up --- src/turbomind/kernels/gemm/moe_utils_v2.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index dc2c21d7c..5912c60a8 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -6,14 +6,9 @@ #include #include -#include -#include #include #include -#include -#include #include -#include #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" From da6aeff2d05e1bf6c0bfe26f7419a2ae1367e6c9 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Fri, 8 Nov 2024 07:47:48 +0000 Subject: [PATCH 06/21] layer-wise `inter_size` & `expert_num` --- lmdeploy/turbomind/deploy/config.py | 7 +- lmdeploy/turbomind/deploy/module.py | 14 +-- .../turbomind/deploy/target_model/base.py | 20 +++- src/turbomind/kernels/gemm/context.h | 13 +-- src/turbomind/kernels/gemm/test/testbed.h | 2 +- .../models/llama/LlamaDecoderLayerWeight.cc | 91 ++++++++++++++++++- .../models/llama/LlamaDecoderLayerWeight.h | 36 +++++--- src/turbomind/models/llama/LlamaDenseWeight.h | 89 +++++------------- src/turbomind/models/llama/LlamaV2.cc | 1 - src/turbomind/models/llama/LlamaV2.h | 1 - src/turbomind/models/llama/LlamaWeight.cc | 33 ++++++- src/turbomind/models/llama/LlamaWeight.h | 39 ++++---- src/turbomind/models/llama/llama_params.h | 42 +++++---- src/turbomind/models/llama/moe_ffn_layer.cc | 41 +++++---- src/turbomind/models/llama/moe_ffn_layer.h | 16 ++-- src/turbomind/models/llama/unified_decoder.cc | 4 +- src/turbomind/models/llama/weight_type.h | 55 +++++++++++ .../triton_backend/llama/LlamaTritonModel.cc | 65 +++++++------ .../triton_backend/llama/LlamaTritonModel.h | 3 - 19 files changed, 365 insertions(+), 207 deletions(-) create mode 100644 src/turbomind/models/llama/weight_type.h diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index a535b0d4c..bfa46f325 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -2,6 +2,7 @@ import inspect import json from dataclasses import asdict, fields +from typing import List # use pydantic.dataclasses.dataclass to check data type from pydantic.dataclasses import dataclass @@ -36,7 +37,8 @@ class ModelConfig: hidden_units: int = None vocab_size: int = None num_layer: int = None - inter_size: int = None + # inter_size: 
int = None + inter_size: List[int] = None norm_eps: float = None attn_bias: int = 0 start_id: int = None @@ -47,7 +49,8 @@ class ModelConfig: session_len: int = None tp: int = 1 model_format: str = 'hf' - expert_num: int = 0 + # expert_num: int = 0 + expert_num: List[int] = None expert_inter_size: int = 0 experts_per_token: int = 0 moe_shared_gate: int = False diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 8d998abe2..51e842aee 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -96,10 +96,12 @@ class Ffn(Module): def __init__(self, model: BaseOutputModel): self.model = model self.tp = model.tensor_para_size + # inter_sizes in config are padded and my differ from what's in the weights self.inter_size = model.model_config.inter_size self.group_size = max(1, model.model_config.group_size) def _export(self, + inter_size: int, fmt: str, idx: int, w123, @@ -110,11 +112,11 @@ def _export(self, w1, w2, w3 = map(transpose, w123) if not is_lora_a: - w1 = pad_out_dims(w1, self.inter_size) - w3 = pad_out_dims(w3, self.inter_size) + w1 = pad_out_dims(w1, inter_size) + w3 = pad_out_dims(w3, inter_size) if not is_lora_b: group_size = self.group_size if apply_gs else 1 - w2 = pad_in_dims(w2, self.inter_size // group_size) + w2 = pad_in_dims(w2, inter_size // group_size) w1, w2, w3 = map(pack_fn, (w1, w2, w3)) self.model.save_split(w1, @@ -132,7 +134,7 @@ def _export(self, def apply(self, i: int, r: BaseReader): for e in get_params(r.ffn(i, None)): - e(partial(self._export, self._ffn), partial(r.ffn, i), i) + e(partial(self._export, self.inter_size[i], self._ffn), partial(r.ffn, i), i) class MoeFfn(Ffn): @@ -155,9 +157,9 @@ def __init__(self, model: BaseOutputModel): def apply(self, i: int, r: BaseReader): for p in get_params(r.moe_ffn_expert()): - for e in range(self.expert_num): + for e in range(self.expert_num[i]): fmt = self._moe_ffn_expert.replace('E', str(e)) - p(partial(self._export, fmt), partial(r.moe_ffn_expert, e, i), + p(partial(self._export, self.inter_size, fmt), partial(r.moe_ffn_expert, e, i), i) gate = transpose(r.moe_ffn_gate(i)) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 4750cde85..c989d630d 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import collections import os.path as osp from abc import ABC @@ -64,13 +65,14 @@ def __init__(self, # get `model_info` and `tokenizer_info` at first, which # will be updated to `self.model_config` and `self.attention_config` self.input_model_info = self.input_model.model_info() + self.input_model_info = self.single_to_list( + self.input_model_info, keys=['inter_size', 'expert_num']) self.input_model_tokenizer_info = self.input_model.tokenizer_info() self.permute_qk = self.input_model_info.get('permute_qk', True) - self.update_model_config() - self.model_config.inter_size = _pad_inter_size( - self.model_config.inter_size, self.model_config.group_size, - self.tensor_para_size) + for i, v in enumerate(self.model_config.inter_size): + self.model_config.inter_size[i] = _pad_inter_size( + v, self.model_config.group_size, self.tensor_para_size) if self.model_config.expert_num: self.model_config.expert_inter_size = _pad_inter_size( self.model_config.expert_inter_size, @@ -78,11 +80,21 @@ def __init__(self, self.model_config.verify() assert self.model_config.kv_head_num % self.tensor_para_size == 0 + # print(self.model_config) + self.update_attention_config() self.update_lora_config() # ! Dependency on `self` self.model = model_cls(self) + def single_to_list(self, config: dict, keys): + num_layer = int(config['num_layer']) + for k in keys: + v = config.get(k, None) + if v is not None and not isinstance(v, collections.Sequence): + config[k] = [v] * num_layer + return config + def update_model_config(self): """Update `self.model_config` according to the input_model's `tokenizer_info` and `model_info`""" diff --git a/src/turbomind/kernels/gemm/context.h b/src/turbomind/kernels/gemm/context.h index 4fec5b732..bd03917b8 100644 --- a/src/turbomind/kernels/gemm/context.h +++ b/src/turbomind/kernels/gemm/context.h @@ -113,12 +113,7 @@ class DynamicGemmContext: public StaticGemmContext { class MoeGemmContext: public Context { public: - MoeGemmContext(int experts, - int experts_per_token, - // int output_dims, - // int input_dims, - const cudaDeviceProp& prop, - cudaStream_t stream); + MoeGemmContext(int experts, int experts_per_token, const cudaDeviceProp& prop, cudaStream_t stream); ~MoeGemmContext() override; @@ -156,9 +151,11 @@ class MoeGemmContext: public Context { Tape Schedule(const LaunchSpec&) override; - void set_offsets(const int* offsets) + void update(int expert_num, int experts_per_token, const int* offsets) { - offsets_ = offsets; + expert_num_ = expert_num; + experts_per_token_ = experts_per_token; + offsets_ = offsets; } protected: diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 7a089fbdf..2678470bb 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -357,7 +357,7 @@ class Testbed { } } - ((MoeGemmContext*)ctx_.get())->set_offsets(moe_m_offsets_.data().get()); + ((MoeGemmContext*)ctx_.get())->update(experts_, exp_per_tok_, moe_m_offsets_.data().get()); CHECK(batch_dim == 0); CHECK(a_desc_.order == kRowMajor); diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index f6f9ab0ef..fadebda78 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -50,7 +50,7 @@ static bool is_fuse_silu_act() }(); return value; } - +#if 0 template LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, size_t head_num, @@ -145,7 +145,90 @@ 
LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, mallocWeights(); } +#else + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank): + head_num_(model.head_num), + kv_head_num_(model.kv_head_num), + size_per_head_(model.head_dim), + hidden_units_(model.hidden_units), + inter_size_(model.inter_size.at(layer_id)), + weight_type_(model.weight_type), + attn_bias_(model.attn_bias), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) +{ + if (lora_param.policy == LoraPolicy::kPlora) { + std::vector keys = { + "attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"}; + std::vector*> weights = {&self_attn_weights.qkv, + &self_attn_weights.output, + &ffn_weights.gating, + &ffn_weights.output, + &ffn_weights.intermediate}; + for (int i = 0; i < keys.size(); i++) { + const auto& name = keys[i]; + auto& weight = *weights[i]; + int rank = lora_param.r; + float scale = lora_param.scale; + std::string full_name = "layers." + std::to_string(layer_id) + "." + name; + + for (const auto& [re, pr] : lora_param.rank_pattern) { + if (std::regex_search(full_name, pr.first)) { + rank = pr.second; + TM_LOG_DEBUG("find rank, pattern=%s, name=%s, value=%d", re.c_str(), full_name.c_str(), rank); + break; + } + } + for (const auto& [re, pr] : lora_param.scale_pattern) { + if (std::regex_search(full_name, pr.first)) { + scale = pr.second; + TM_LOG_DEBUG("find scale pattern=%s, name=%s, value=%f", re.c_str(), full_name.c_str(), scale); + break; + } + } + if (rank) { + weight.lora.r = rank; + weight.lora.scale = scale; + weight.lora.policy = lora_param.policy; + } + } + } + + fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; + + self_attn_weights.qkv.input_dims = hidden_units_; + self_attn_weights.qkv.output_dims = (head_num_ + 2 * kv_head_num_) * size_per_head_ / tensor_para_size_; + self_attn_weights.qkv.type = weight_type_; + self_attn_weights.qkv.group_size = model.group_size; + + self_attn_weights.output.input_dims = (head_num_ * size_per_head_) / tensor_para_size_; + self_attn_weights.output.output_dims = hidden_units_; + self_attn_weights.output.type = weight_type_; + self_attn_weights.output.group_size = model.group_size; + + ffn_weights = LlamaFfnWeight{ + hidden_units_, + inter_size_, + tensor_para_size_, + weight_type_, + model.group_size, + weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), + }; + + moe_weights = MoeFfnWeight{ + layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; + + mallocWeights(); +} +#endif template size_t LlamaDecoderLayerWeight::workspace_size() const noexcept { @@ -709,12 +792,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud }; if (inter_size_) { - // std::cerr << "process FFN\n"; + std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } if (!moe_weights.experts.empty()) { - // std::cerr << "process MoE\n"; + std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; @@ -722,7 +805,7 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud for (auto& e : moe_weights.experts) { - process_ffn(e, moe_weights.method); + process_ffn(e, moe_weights.method == MoeParam::kFused); const auto& fused = e.fused_gating_intermediate; const auto& output = e.output; diff --git 
a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index f68a103dd..342775b70 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -30,21 +30,29 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = delete; - LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + // LlamaDecoderLayerWeight(int layer_idx, + // size_t head_num, + // size_t kv_head_num, + // size_t size_per_head, + // size_t hidden_units, + // size_t inter_size, + // WeightType weight_type, + // int group_size, + // LoraParam lora_param, + // bool attn_bias, + // MoeParam moe_param, + // size_t tensor_para_size, + // size_t tensor_para_rank); + + LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); + ~LlamaDecoderLayerWeight(); - LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; + LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete; void loadModel(std::string dir_path, FtCudaDataType model_file_type); diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 169fb53bc..eed1a4119 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -20,64 +20,13 @@ #pragma once #include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/llama/weight_type.h" #include "src/turbomind/utils/cuda_utils.h" #include namespace turbomind { -enum class WeightType : int -{ - kFP32, - kFP16, - kFP8, // not supported yet - kBF16, - kINT8, - kINT4 -}; - -template -constexpr WeightType get_default_weight_type() -{ - if constexpr (std::is_same_v) { - return WeightType::kFP16; - } - else if constexpr (std::is_same_v) { - return WeightType::kBF16; - } - else if constexpr (std::is_same_v) { - return WeightType::kFP32; - } - else { - static_assert(sizeof(T) != sizeof(T), "not implemented"); - return {}; - } -} - -inline size_t getBitSize(WeightType type) -{ - switch (type) { - case WeightType::kFP32: - return 32; - case WeightType::kFP16: - return 16; - case WeightType::kFP8: - return 8; - case WeightType::kBF16: - return 16; - case WeightType::kINT8: - return 8; - case WeightType::kINT4: - return 4; - } - return 0; -} - -enum class LoraPolicy : int -{ - kNull, - kPlora, -}; - inline LoraPolicy getLoraPolicy(const std::string& policy) { if (policy == "plora") { @@ -186,23 +135,27 @@ struct MoeFfnWeight { MoeFfnWeight() = default; - MoeFfnWeight(size_t hidden_dim, - int inter_size, - int expert_num, - int method, - bool has_shared_gate, - size_t tp, - WeightType weight_type, - int group_size, - bool fuse_silu_act) + MoeFfnWeight(int layer_id, + const MoeParam& param, + size_t hidden_dim, + WeightType weight_type, + int group_size, + size_t tp, + bool fuse_silu_act) { - // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (param.expert_num.size() < layer_id) { + return; + } + + const int expert_num = 
param.expert_num[layer_id]; if (expert_num == 0) { return; } + printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); + gate.input_dims = hidden_dim; gate.output_dims = expert_num; gate.type = get_default_weight_type(); @@ -210,15 +163,15 @@ struct MoeFfnWeight { experts.resize(expert_num); - this->method = method; - fuse_silu_act = fuse_silu_act && method; + method = param.method; + fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; for (auto& e : experts) { // inter size is divided by tp in `FfnWeight` - e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; + e = LlamaFfnWeight{hidden_dim, (size_t)param.inter_size, tp, weight_type, group_size, fuse_silu_act}; } - if (has_shared_gate) { + if (param.shared_gate) { shared_gate.input_dims = hidden_dim; shared_gate.output_dims = 1; shared_gate.type = get_default_weight_type(); @@ -236,7 +189,7 @@ struct MoeFfnWeight { LlamaFfnWeight block; - int method{}; + MoeParam::Method method{}; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 3d50910ad..05b22deed 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -72,7 +72,6 @@ LlamaV2::LlamaV2(const ModelParam& model, lora_param_(lora), head_num_(model.head_num), size_per_head_(model.head_dim), - inter_size_(model.inter_size), hidden_units_(model.hidden_units), layer_num_(model.layer_num), vocab_size_(model.vocab_size), diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 6321d09d7..658282f5e 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -113,7 +113,6 @@ class LlamaV2 { const size_t head_num_; const size_t size_per_head_; const size_t hidden_units_; - const size_t inter_size_; const size_t layer_num_; const size_t vocab_size_; const size_t vocab_size_padded_; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 1ac2d82dd..325789f29 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -24,7 +24,7 @@ #include namespace turbomind { - +#if 0 template LlamaWeight::LlamaWeight(size_t head_num, size_t kv_head_num, @@ -75,6 +75,37 @@ LlamaWeight::LlamaWeight(size_t head_num, mallocWeights(); } +#else + +template +LlamaWeight::LlamaWeight( + const ModelParam& model, const LoraParam& lora_param, const MoeParam& moe_param, size_t tp_size, size_t tp_rank): + hidden_units_(model.hidden_units), + inter_size_(model.inter_size), + vocab_size_(model.vocab_size), + vocab_size_padded_(model.vocab_size), + num_layer_(model.layer_num), + weight_type_(model.weight_type), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) +{ + if (vocab_size_padded_ % tensor_para_size_ != 0) { + vocab_size_padded_ = (vocab_size_padded_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; + TM_LOG_WARNING("pad vocab size from %d to %d", vocab_size_, vocab_size_padded_); + } + + FT_CHECK(hidden_units_ % tensor_para_size_ == 0); + + decoder_layer_weights.reserve(num_layer_); + for (unsigned l = 0; l < num_layer_; ++l) { + decoder_layer_weights.push_back( + new LlamaDecoderLayerWeight(l, model, lora_param, moe_param, tp_size, tp_rank)); + } + + mallocWeights(); +} + +#endif template LlamaWeight::~LlamaWeight() diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index c04bf6c5a..f4ca7e455 
100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -29,24 +29,30 @@ namespace turbomind { template struct LlamaWeight { LlamaWeight() = default; - LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + // LlamaWeight(size_t head_num, + // size_t kv_head_num, + // size_t size_per_head, + // size_t hidden_units, + // size_t inter_size, + // size_t vocab_size, + // size_t num_layer, + // bool attn_bias, + // WeightType weight_type, + // int group_size, + // LoraParam lora_param, + // MoeParam moe_param, + // size_t tensor_para_size, + // size_t tensor_para_rank); + + LlamaWeight(const ModelParam& model_param, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); ~LlamaWeight(); - LlamaWeight(const LlamaWeight& other) = delete; + LlamaWeight(const LlamaWeight& other) = delete; LlamaWeight& operator=(const LlamaWeight& other) = delete; void loadModel(std::string dir_path); @@ -64,13 +70,14 @@ struct LlamaWeight { void mallocWeights(); size_t hidden_units_; - size_t inter_size_; size_t vocab_size_; size_t vocab_size_padded_; size_t num_layer_; WeightType weight_type_; size_t tensor_para_size_; size_t tensor_para_rank_; + + std::vector inter_size_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 2ea63f041..baac216f9 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -2,40 +2,45 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include #include #include #include +#include "src/turbomind/models/llama/weight_type.h" + namespace turbomind { struct ModelParam { - size_t head_num; - size_t head_dim; - size_t kv_head_num; - size_t hidden_units; - size_t layer_num; - size_t inter_size; - size_t vocab_size; - float norm_eps; - int quant_policy; - // - int start_id; - int end_id; + size_t head_num; + size_t head_dim; + size_t kv_head_num; + size_t hidden_units; + size_t layer_num; + size_t vocab_size; + float norm_eps; + int quant_policy; + bool attn_bias; + WeightType weight_type; + int group_size; + int start_id; + int end_id; + + std::vector inter_size; }; struct MoeParam { - enum Method - { + enum Method { kNaive, kFused } method; - int expert_num; + int experts_per_token; int inter_size; bool norm_topk; bool shared_gate; + + std::vector expert_num; }; struct AttentionParam { @@ -73,6 +78,11 @@ struct EngineParam { int max_prefill_iters; }; +enum class LoraPolicy : int { + kNull, + kPlora, +}; + struct LoraParam { int r; float scale; diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index 1ad76839d..dcb3f3706 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -17,7 +17,7 @@ namespace turbomind { template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) +void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num) { char* base = 0; @@ -25,8 +25,8 @@ void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) Monotonic alloc{base}; alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); alloc(&inter_buf_, tokens * 
param_.experts_per_token * inter_size_ * 2); - alloc(&logits_, tokens * param_.expert_num); - alloc(&masks_, param_.expert_num * padded); + alloc(&logits_, tokens * expert_num); + alloc(&masks_, expert_num * padded); alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); @@ -80,14 +80,17 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama template void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { - const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const int expert_num = moe.experts.size(); - AllocateBuffer(tokens, padded); + FT_CHECK(expert_num); + + AllocateBuffer(tokens, padded, expert_num); gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); - check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); + check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); sync_check_cuda_error(); // dump_logits(tokens, layer_id); @@ -102,7 +105,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id logits_, tokens, padded, - param_.expert_num, + expert_num, param_.experts_per_token, param_.norm_topk, stream_); @@ -110,17 +113,17 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id if (isTuning()) { std::mt19937 g; - const auto expert_ids = SampleUniform(tokens, param_.expert_num, param_.experts_per_token, g); - std::vector cnt(param_.expert_num); + const auto expert_ids = SampleUniform(tokens, expert_num, param_.experts_per_token, g); + std::vector cnt(expert_num); for (const auto& x : expert_ids) { ++cnt[x]; } h_offsets_[0] = 0; - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { h_offsets_[i + 1] = h_offsets_[i] + cnt[i]; } check_cuda_error( - cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); } if (param_.method == MoeParam::kNaive) { @@ -129,15 +132,15 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id sync_check_cuda_error(); check_cuda_error( - cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); - if (h_offsets_[param_.expert_num] != tokens * param_.experts_per_token) { - FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[param_.expert_num], tokens * param_.experts_per_token)); + if (h_offsets_[expert_num] != tokens * param_.experts_per_token) { + FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[expert_num], tokens * param_.experts_per_token)); } - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { FT_CHECK(moe.experts[i].is_fused_silu == false); @@ -153,7 +156,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } } else { - context_->set_offsets(offsets_); + context_->update(expert_num, param_.experts_per_token, offsets_); auto& block = moe.block; @@ -237,9 +240,9 @@ void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& 
moe) } template -void MoeFfnLayer::dump_logits(int token_num, int layer_id) +void MoeFfnLayer::dump_logits(int token_num, int layer_id, int expert_num) { - std::vector logits(token_num * param_.expert_num); + std::vector logits(token_num * expert_num); check_cuda_error( cudaMemcpyAsync(logits.data(), logits_, sizeof(float) * logits.size(), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); @@ -247,7 +250,7 @@ void MoeFfnLayer::dump_logits(int token_num, int layer_id) auto ptr = logits.data(); std::cout << "layer_id: " << layer_id << std::endl; for (int i = 0; i < token_num; ++i) { - for (int e = 0; e < param_.expert_num; ++e) { + for (int e = 0; e < expert_num; ++e) { std::cout << *ptr++ << " "; } std::cout << std::endl; diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 0f1713f7b..e7edb7a67 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -9,6 +9,7 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/nccl_utils.h" +#include namespace turbomind { @@ -26,23 +27,24 @@ class MoeFfnLayer { linear_(ctx.linear.get()), allocator_(ctx.allocator.get()) { - model.inter_size = param.inter_size; + FT_CHECK(!param.expert_num.empty()); + const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); if (param_.method == MoeParam::kFused) { context_ = std::make_unique( - param.expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); + max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); } else { expert_ffn_ = std::make_unique>(model, tp, ctx, false); } - h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1), false, true); + h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); - offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1)); - accum_ = (int*)allocator_->malloc(sizeof(int) * param_.expert_num * kMoeGateMaxTiles); + offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1)); + accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); } - void AllocateBuffer(size_t tokens, size_t padded); + void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num); void FreeBuffer(); @@ -57,7 +59,7 @@ class MoeFfnLayer { void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); - void dump_logits(int token_num, int layer_id); + void dump_logits(int token_num, int layer_id, int expert_num); private: const size_t inter_size_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 28e8b5f64..bc8a6a147 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -28,11 +28,11 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - if (moe.expert_num) { + if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) { moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); } - if (model.inter_size) { + if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); } diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h new file mode 100644 index 000000000..ade02e10e --- 
/dev/null +++ b/src/turbomind/models/llama/weight_type.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include + +namespace turbomind { + +enum class WeightType : int { + kFP32, + kFP16, + kFP8, // not supported yet + kBF16, + kINT8, + kINT4 +}; + +template +constexpr WeightType get_default_weight_type() +{ + if constexpr (std::is_same_v) { + return WeightType::kFP16; + } + else if constexpr (std::is_same_v) { + return WeightType::kBF16; + } + else if constexpr (std::is_same_v) { + return WeightType::kFP32; + } + else { + static_assert(sizeof(T) != sizeof(T), "not implemented"); + return {}; + } +} + +inline size_t getBitSize(WeightType type) +{ + switch (type) { + case WeightType::kFP32: + return 32; + case WeightType::kFP16: + return 16; + case WeightType::kFP8: + return 8; + case WeightType::kBF16: + return 16; + case WeightType::kINT8: + return 8; + case WeightType::kINT4: + return 4; + } + return 0; +} + +} // namespace turbomind \ No newline at end of file diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 38552be0c..deb28bb35 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -250,17 +250,19 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.kv_head_num = model_reader["kv_head_num"].as(0); model_param_.hidden_units = model_reader["hidden_units"].as(); model_param_.layer_num = model_reader["num_layer"].as(); - model_param_.inter_size = model_reader["inter_size"].as(); model_param_.vocab_size = model_reader["vocab_size"].as(); model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); model_param_.quant_policy = engine_reader["quant_policy"].as(0); - + YAML::Node inter_size = model_reader["inter_size"]; + for (auto it = inter_size.begin(); it != inter_size.end(); ++it) { + model_param_.inter_size.push_back(it->as()); + } // Only weight classes need these - attn_bias_ = model_reader["attn_bias"].as(0); - group_size_ = model_reader["group_size"].as(0); + model_param_.attn_bias = model_reader["attn_bias"].as(0); + model_param_.group_size = model_reader["group_size"].as(0); // rotary embedding parameters attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); @@ -290,19 +292,23 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); - lora_param_.r = lora_reader["lora_r"].as(0); - lora_param_.scale = lora_reader["lora_scale"].as(0); - lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); - lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), + lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.r = lora_reader["lora_r"].as(0); + lora_param_.scale = lora_reader["lora_scale"].as(0); + lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); + lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), [](const std::string& s) { return std::stoi(s); }); - lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), + 
lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), [](const std::string& s) { return std::stof(s); }); - moe_param_.expert_num = model_reader["expert_num"].as(0); + moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); + YAML::Node expert_num = model_reader["expert_num"]; + for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { + moe_param_.expert_num.push_back(it->as()); + } handleMissingParams(); @@ -314,19 +320,19 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - weight_type_ = ft::WeightType::kFP16; + model_param_.weight_type = ft::WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - weight_type_ = ft::WeightType::kBF16; + model_param_.weight_type = ft::WeightType::kBF16; } else if (weight_type_str == "fp32") { - weight_type_ = ft::WeightType::kFP32; + model_param_.weight_type = ft::WeightType::kFP32; } else if (weight_type_str == "int8") { - weight_type_ = ft::WeightType::kINT8; + model_param_.weight_type = ft::WeightType::kINT8; } else if (weight_type_str == "int4") { - weight_type_ = ft::WeightType::kINT4; + model_param_.weight_type = ft::WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; @@ -411,20 +417,8 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>(model_param_.head_num, - model_param_.kv_head_num, - model_param_.head_dim, - model_param_.hidden_units, - model_param_.inter_size, - model_param_.vocab_size, - model_param_.layer_num, - attn_bias_, - weight_type_, - group_size_, - lora_param_, - moe_param_, - tensor_para_size_, - tensor_para_rank); + weights_[device_id] = std::make_shared>( + model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -480,9 +474,11 @@ std::string LlamaTritonModel::toString() std::stringstream ss; ss << "Model: " // << "\nhead_num: " << model_param_.head_num << "\nkv_head_num: " << model_param_.kv_head_num - << "\nsize_per_head: " << model_param_.head_dim << "\ninter_size: " << model_param_.inter_size + << "\nsize_per_head: " + << model_param_.head_dim + // << "\ninter_size: " << model_param_.inter_size << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size - << "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_param_.max_batch_size + << "\nattn_bias: " << model_param_.attn_bias << "\nmax_batch_size: " << engine_param_.max_batch_size << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num << "\nmax_context_token_num: " << engine_param_.max_context_token_num << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter @@ -493,8 +489,9 @@ std::string LlamaTritonModel::toString() << "\nenable_prefix_caching: " << engine_param_.enable_prefix_caching << "\nstart_id: " << model_param_.start_id << "\ntensor_para_size: " << 
tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_ - << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy - << "\ngroup_size: " << group_size_ << "\nexpert_num: " << moe_param_.expert_num + << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy << "\ngroup_size: " + << model_param_.group_size + // << "\nexpert_num: " << moe_param_.expert_num << "\nexpert_per_token: " << moe_param_.experts_per_token << "\nmoe_method: " << moe_param_.method << std::endl; return ss.str(); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 19a143e72..a6c1b862a 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -91,9 +91,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { ft::EngineParam engine_param_; size_t tensor_para_size_; size_t pipeline_para_size_; - ft::WeightType weight_type_; - bool attn_bias_; - int group_size_; std::shared_ptr shared_state_; // Weights & engine instances for the ranks From 1ecf2911a82102520712370302d4c016330cc1b3 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Mon, 11 Nov 2024 08:53:53 +0000 Subject: [PATCH 07/21] add head dim 192 --- .../kernels/attention/CMakeLists.txt | 2 + src/turbomind/kernels/attention/attention.cu | 6 ++ .../attention/codegen/attention_sm80_192.cu | 16 +++++ .../attention/codegen/decoding_sm80_192.cu | 15 +++++ src/turbomind/kernels/attention/decoding.cu | 6 ++ src/turbomind/kernels/attention/impl_16816.h | 61 ++++++++++++------- src/turbomind/kernels/attention/impl_81616.h | 2 +- src/turbomind/kernels/attention/impl_simt.h | 5 +- .../kernels/attention/kv_cache_utils_v2.cu | 12 +++- .../kernels/attention/mainloop_sm80.h | 15 +++-- src/turbomind/kernels/attention/reduce.cu | 6 +- .../kernels/attention/reduce_kernel.h | 9 ++- .../kernels/attention/test_attention.cu | 12 ++-- .../flash_attention2/CMakeLists.txt | 4 +- .../flash_fwd_launch_template.h | 2 +- .../flash_attention2/static_switch.h | 12 ++++ 16 files changed, 142 insertions(+), 43 deletions(-) create mode 100644 src/turbomind/kernels/attention/codegen/attention_sm80_192.cu create mode 100644 src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu diff --git a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index af9d47e0e..32de38981 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -38,6 +38,8 @@ add_library(attention STATIC codegen/decoding_sm80_64_f16_f16.cu codegen/decoding_sm80_64_f16_u4.cu codegen/decoding_sm80_64_f16_u8.cu + codegen/attention_sm80_192.cu + codegen/decoding_sm80_192.cu ) set_property(TARGET attention PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index 3f557234b..e7642584c 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ b/src/turbomind/kernels/attention/attention.cu @@ -46,6 +46,12 @@ void dispatchAttention(const AttentionParams& params) else if (params.size_per_head == 128) { return dispatch(std::integral_constant{}); } + + if (params.size_per_head == 192) { + using Config = AttentionConfig; + return invokeAttention(params); + } + FT_CHECK(0); } 
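The new `size_per_head == 192` branch above follows the existing pattern in this file: a run-time head size is mapped onto a fixed set of compile-time kernel instantiations, and the newly created codegen/attention_sm80_192.cu and codegen/decoding_sm80_192.cu units supply those explicit instantiations. A minimal, self-contained sketch of that dispatch pattern (illustrative names only, not the TurboMind API):

    // dispatch_sketch.cc -- hypothetical example using only the standard
    // library; run_kernel<N> stands in for a kernel instantiated for a
    // fixed head dimension, as the codegen/*_192.cu files do for 192.
    #include <cstdio>

    template<int HeadDim>
    void run_kernel()
    {
        std::printf("launching kernel specialized for head_dim=%d\n", HeadDim);
    }

    inline bool dispatch_by_head_dim(int head_dim)
    {
        switch (head_dim) {
            case 64:  run_kernel<64>();  return true;
            case 128: run_kernel<128>(); return true;
            case 192: run_kernel<192>(); return true;  // newly supported size
            default:  return false;  // caller asserts, mirroring FT_CHECK(0)
        }
    }
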
diff --git a/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu new file mode 100644 index 000000000..ceeafa7a6 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu new file mode 100644 index 000000000..9294fc396 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu @@ -0,0 +1,15 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 1b04b7d4e..b50b67f04 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -2,6 +2,7 @@ #include "decoding.h" #include "decoding_config.h" +#include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" // #include "src/turbomind/utils/dispatch.h" #include @@ -113,6 +114,11 @@ void dispatchDecoding(const AttentionParams& params) return false; }; + if (params.size_per_head == 192) { + invokeDecoding>(params); + return; + } + auto success = dispatch(); FT_CHECK(success); diff --git a/src/turbomind/kernels/attention/impl_16816.h b/src/turbomind/kernels/attention/impl_16816.h index 6e8f37f4d..07c7dcb12 100644 --- a/src/turbomind/kernels/attention/impl_16816.h +++ b/src/turbomind/kernels/attention/impl_16816.h @@ -63,26 +63,28 @@ struct Impl>, SmemLayoutV2>>; - using SmemLayoutK = std::conditional_t>, SmemLayoutV2>>; - using SmemLayoutV = std::conditional_t>, SmemLayoutV2>>; using SmemLayoutKVp = void; + static constexpr bool kUseSmemQ = false; + static constexpr bool kUseSmemP = false; + + static_assert(!kUseSmemQ, "current smemQ impl yields inconsistent outputs"); + union SharedStorage { __align__(16) T KV[Stages * (SmemLayoutK::kSize + SmemLayoutV::kSize) / 2]; __align__(16) T Q[SmemLayoutQ::kSize]; }; - static constexpr bool kUseSmemQ = false; - static constexpr bool kUseSmemP = false; - using ThreadMapQ = RakedThreadMap; using ThreadMapKV = RakedThreadMap; @@ -109,22 +111,24 @@ struct Impl sQ{smem_Q}; + SmemAccessor sQ{smem_Q}; - // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout - PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { + // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { - const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; - const int di = lane_id / 16 * 8 + k * 16; - ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + for (int m = 0; m < K_M; ++m) { + PRAGMA_UNROLL + for (int k = 0; k < K_K; ++k) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + 
ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } } } - if constexpr (kUseSmemQ) { + if constexpr (0) { __syncthreads(); // Rearrange Q in smem so that swizzling is not needed for later LDSMs @@ -142,20 +146,25 @@ struct Impl smem_K; + T* smem_Q; FragQ frag_Q; FragK frag_K; __device__ StateQK(SharedStorage& storage, FragQ frag_Q_): smem_K{storage.KV} { - static_assert(!kUseSmemQ, "not implemented"); - PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { + if constexpr (!kUseSmemQ) { PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { - frag_Q[k][m] = frag_Q_[k][m]; + for (int k = 0; k < K_K; ++k) { + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + frag_Q[k][m] = frag_Q_[k][m]; + } } } + else { + smem_Q = storage.Q; + } } __device__ void Load(int k, int pipe_iter) @@ -166,6 +175,16 @@ struct Impl sQ{smem_Q}; + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } + } PRAGMA_UNROLL for (int n = 0; n < K_N; n += 2) { // Load (s16,d16) tiles const int s = n * 8 + offset_s; diff --git a/src/turbomind/kernels/attention/impl_81616.h b/src/turbomind/kernels/attention/impl_81616.h index 3b90bcdf5..f865f1bc3 100644 --- a/src/turbomind/kernels/attention/impl_81616.h +++ b/src/turbomind/kernels/attention/impl_81616.h @@ -104,7 +104,7 @@ struct Impl) { - return std::conditional_t>, SmemLayoutV2>>{}; } diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index a886185a4..667a0ce43 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -165,8 +165,11 @@ struct Impl; + static constexpr int kWarpThreadC_KV = HeadDim != 192 ? HeadDim / kAccessC_KV : 8; + using ThreadMapQ = RakedThreadMap; - using ThreadMapKV = RakedThreadMap, kWarpCount>; + using ThreadMapKV = RakedThreadMap; // `WARP_SIZE / WARP_S` is chosen to achieve minimum kIterS w/o introducing partial S iter using ThreadMapKVp = RakedThreadMap<2, CTA_S, 2, kWarpCount, WARP_SIZE / WARP_S>; diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 20bb00fde..f2e2faef9 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -277,11 +277,14 @@ void invokeProcessKV_v2(char** blocks, }; auto dispatch = [&](auto tkv) { - if (head_dim == 128) { + if (head_dim == 64) { + return invoke(tkv, std::integral_constant{}); + } + else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } - else if (head_dim == 64) { - return invoke(tkv, std::integral_constant{}); + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); } FT_CHECK(0); }; @@ -545,6 +548,9 @@ void invokeFlattenKV_v2(T* k, else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); + } FT_CHECK(0); }; diff --git a/src/turbomind/kernels/attention/mainloop_sm80.h b/src/turbomind/kernels/attention/mainloop_sm80.h index bf0fc1d32..0a65515a4 100644 --- a/src/turbomind/kernels/attention/mainloop_sm80.h +++ b/src/turbomind/kernels/attention/mainloop_sm80.h @@ -52,7 +52,7 @@ struct Mainloop, Impl_> { template __device__ void operator()(Args&&... 
args) { - Run(Sm80_CpAsync{}, ((Args &&) args)...); + Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args&&)args)...); } template @@ -81,8 +81,9 @@ struct Mainloop, Impl_> { } } - template + template __device__ void Run(Sm80_CpAsync, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -199,9 +200,10 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#if 0 + // #if 1 template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -292,14 +294,15 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#elif 1 + // #elif 1 // Load : K0,K1 | V0,K2,V1,K3 ... // Compute : K0 | K1,V0,K2,V1 ... // - more register consumption // - more interleaved HMMA and FMA // - slight performance gain - template + template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter_, FragO& frag_O, @@ -407,7 +410,7 @@ struct Mainloop, Impl_> { __pipeline_commit(); __pipeline_wait_prior(0); } -#endif + // #endif __device__ void Wait() { diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index 12f6aff38..051b2baa7 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -66,12 +66,14 @@ void invokeReduce(T* out, float exp_scale, \ cudaStream_t stream); -INSTANTIATE_invokeReduce(128, half); INSTANTIATE_invokeReduce(64, half); +INSTANTIATE_invokeReduce(128, half); +INSTANTIATE_invokeReduce(192, half); #if ENABLE_BF16 -INSTANTIATE_invokeReduce(128, nv_bfloat16); INSTANTIATE_invokeReduce(64, nv_bfloat16) +INSTANTIATE_invokeReduce(128, nv_bfloat16); +INSTANTIATE_invokeReduce(192, nv_bfloat16); #endif } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/reduce_kernel.h b/src/turbomind/kernels/attention/reduce_kernel.h index 88a3ab3af..14c5005cd 100644 --- a/src/turbomind/kernels/attention/reduce_kernel.h +++ b/src/turbomind/kernels/attention/reduce_kernel.h @@ -127,10 +127,13 @@ struct Reduce { } __syncthreads(); + + // HeadDim / WARP_SIZE + // 128 -> 4 + // 64, 192 -> 2 + constexpr int kVecSize = HeadDim % 128 == 0 ? 
4 : 2; - constexpr int kVecSize = HeadDim / WARP_SIZE; - - using Map = RakedThreadMap; + using Map = RakedThreadMap; static_assert(Map::kIterS == CTA_H); diff --git a/src/turbomind/kernels/attention/test_attention.cu b/src/turbomind/kernels/attention/test_attention.cu index c6d7b4063..804d4815d 100644 --- a/src/turbomind/kernels/attention/test_attention.cu +++ b/src/turbomind/kernels/attention/test_attention.cu @@ -218,14 +218,14 @@ void TestBlocks(const thrust::universal_vector& k_cache, // [B, H, S, #define KV_INT4 0 -#define DECODING 1 +#define DECODING 0 template int test_attention() { AttentionParams params{}; - constexpr size_t kHeadDim = 128; + constexpr size_t kHeadDim = 192; #if DECODING // constexpr size_t kHeadNum = 32; @@ -239,11 +239,11 @@ int test_attention() // constexpr size_t kSequenceLen = 511; // constexpr size_t kSequenceLen = 2047; // constexpr size_t kSequenceLen = 4095; - // constexpr size_t kSequenceLen = 8191; + constexpr size_t kSequenceLen = 8191; // constexpr size_t kSequenceLen = 32767; // constexpr size_t kSequenceLen = 65535; // constexpr size_t kSequenceLen = 131071; - constexpr size_t kSequenceLen = 200000; + // constexpr size_t kSequenceLen = 200000; // constexpr size_t kSequenceLen = 262143; // constexpr size_t kSequenceLen = (1 << 20) - 1; // 1M // constexpr size_t kSequenceLen = (1 << 22) - 1; // 4M @@ -451,6 +451,10 @@ int test_attention() params.qk = qk_buf.data().get(); params.pr = pr_buf.data().get(); + params.attention_scaling = 1.f; + params.llama3_inv_scaling_factor = 0; + params.yarn_ramp_inv_factor_div_2 = 0; + Reference reference(kDump ? Reference::kUNFUSED : Reference::kFLASH_ATTENTION, {}); // Reference reference(Reference::kUNFUSED, {}); reference.Reshape(kInputLen, kContextLen, kHeadNum, kHeadDim, KvHeadNum, kBatchSize); diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt index d41c391e9..81c975058 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt +++ b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt @@ -8,9 +8,11 @@ add_library(${PROJECT_NAME} STATIC # flash_fwd_hdim64_fp16_sm80.cu flash_fwd_hdim128_fp16_sm80.cu flash_fwd_hdim128_bf16_sm80.cu - # flash_fwd_hdim256_fp16_sm80.cu + flash_fwd_hdim256_bf16_sm80.cu + flash_fwd_hdim256_fp16_sm80.cu ) target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR} / include) target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass) + set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h index e108a55f2..245649636 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h @@ -147,7 +147,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream) }); } -#if 0 +#if 1 template void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream) { diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h index fd19a0ea6..ca141ee0b 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h +++ 
b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h @@ -58,6 +58,18 @@ return __VA_ARGS__(); \ } \ }() +#elif 1 +#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ + [&] { \ + if (HEADDIM <= 128) { \ + constexpr static int kHeadDim = 128; \ + return __VA_ARGS__(); \ + } \ + else if (HEADDIM <= 256) { \ + constexpr static int kHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() #else #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ [&] { \ From 0caa11317b37e81f8e5af81ded3589bd8d8d2817 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 13 Nov 2024 15:53:58 +0800 Subject: [PATCH 08/21] refactor weight processing --- examples/cpp/llama/llama_triton_example.cc | 4 +- lmdeploy/turbomind/deploy/config.py | 2 +- src/turbomind/kernels/core/array_ops.h | 2 +- src/turbomind/kernels/gemm/convert_v2.cu | 41 ++- src/turbomind/kernels/gemm/unpack.cu | 34 +- .../models/llama/LlamaDecoderLayerWeight.cc | 340 +++++------------- .../models/llama/LlamaDecoderLayerWeight.h | 21 +- src/turbomind/models/llama/LlamaDenseWeight.h | 185 +++++++++- src/turbomind/models/llama/LlamaWeight.cc | 51 +-- src/turbomind/models/llama/LlamaWeight.h | 26 +- src/turbomind/models/llama/llama_gemm.cc | 2 +- src/turbomind/models/llama/llama_kernels.h | 2 +- src/turbomind/utils/memory_utils.cu | 108 +++--- src/turbomind/utils/memory_utils.h | 13 +- 14 files changed, 443 insertions(+), 388 deletions(-) diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index b0e513410..1fb5fa096 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -114,14 +114,14 @@ broadCastRequest(const std::vector& v_start_ids, } else { // conditional case. - ft::deviceMalloc(&d_input_ids, size_1, false); + ft::deviceMalloc(&d_input_ids, size_1, nullptr, false); // ft::deviceMalloc(&d_input_lengths, size_2, false); ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1); // ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2); } if (!v_input_bad_words.empty()) { - ft::deviceMalloc(&d_input_bad_words, size_bad_words, false); + ft::deviceMalloc(&d_input_bad_words, size_bad_words, nullptr, false); ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words); } else { diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index bfa46f325..8bd7e6c51 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -50,7 +50,7 @@ class ModelConfig: tp: int = 1 model_format: str = 'hf' # expert_num: int = 0 - expert_num: List[int] = None + expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 moe_shared_gate: int = False diff --git a/src/turbomind/kernels/core/array_ops.h b/src/turbomind/kernels/core/array_ops.h index 6b639abc8..ec6e7fb4e 100644 --- a/src/turbomind/kernels/core/array_ops.h +++ b/src/turbomind/kernels/core/array_ops.h @@ -172,7 +172,7 @@ inline __device__ void copy(const Array (&src)[M], Array (&dst)[M]) } template -inline __device__ void Store(T* __restrict__ dst, const Array& src) +inline __device__ void Store(T* dst, const Array& src) { if constexpr (sizeof(Array) == sizeof(uint4)) { *(uint4*)dst = (const uint4&)src; diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index ed8b2ee2f..90e4b97dd 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -279,17 +279,44 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int 
sm, bool for return {}; } -void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +namespace { + +template +struct Param { + StridedPtr data[N]; + StridedPtr* ptr; + int n; +}; + +template +__global__ void fill_strided_ptrs(Param param) { - std::vector tmp; - for (const auto& [p, s] : ptrs) { - tmp.push_back({p, s}); + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < param.n) { + param.ptr[idx] = param.data[idx]; } +} + +} // namespace + +void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +{ + constexpr int N = 64; + Param param{}; + static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 StridedPtr* ptr{}; cudaMallocAsync(&ptr, sizeof(StridedPtr) * ptrs.size(), stream); - cudaMemcpyAsync(ptr, tmp.data(), sizeof(StridedPtr) * ptrs.size(), cudaMemcpyDefault, stream); - // Sync before tmp can be destructed - cudaStreamSynchronize(stream); + param.ptr = ptr; + for (int i = 0; i < (int)ptrs.size(); i += N) { + const int n = std::min(ptrs.size() - i, N); + for (int j = 0; j < n; ++j) { + auto& [p, s] = ptrs[i + j]; + param.data[j] = StridedPtr{p, s}; + } + param.n = n; + fill_strided_ptrs<<<1, N, 0, stream>>>(param); + param.ptr += N; + } return ptr; } diff --git a/src/turbomind/kernels/gemm/unpack.cu b/src/turbomind/kernels/gemm/unpack.cu index 92f468d82..39e6a2e1a 100644 --- a/src/turbomind/kernels/gemm/unpack.cu +++ b/src/turbomind/kernels/gemm/unpack.cu @@ -71,14 +71,44 @@ void unpack_awq_gemm(uint4_t* dst, const uint4_t* src, int rows, int cols, cudaS permute_u4<0, 1, 3, 2><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); } +__global__ void transpose_u4_kernel(uint4_t* dst, const uint4_t* src, int s, int c) +{ + const int idx_c = 8 * (threadIdx.x + blockIdx.x * blockDim.x); + const int idx_s = 8 * (threadIdx.y + blockIdx.y * blockDim.y); + if (idx_c >= c || idx_s >= s) { + return; + } + uint32_t ivec[8]; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ivec[i] = ((const uint32_t*)src)[((idx_s + i) * c + idx_c) / 8]; + } + uint32_t ovec[8]{}; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + PRAGMA_UNROLL + for (int j = 0; j < 8; ++j) { + ovec[i] |= (((ivec[j] >> (i * 4)) & 0xfu) << (j * 4)); + } + } + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ((uint32_t*)dst)[((idx_c + i) * s + idx_s) / 8] = ovec[i]; + } +} + void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st) { if (s % 8 || c % 8) { std::cerr << "transpose_u4: invalid shape (" << s << "," << c << "), must be multiple of 8" << std::endl; return; } - Array shape{s, c}; - permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + // Array shape{s, c}; + // permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + + const dim3 block(16, 16); + const dim3 grid((c + 15) / 16, (s + 15) / 16); + transpose_u4_kernel<<>>(dst, src, s, c); } // load -> unpack -> extend_to_u8 -> manipulation -> compat_to_u4 -> store diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index fadebda78..bb0794988 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -50,102 +50,6 @@ static bool is_fuse_silu_act() }(); return value; } -#if 0 -template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - 
LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - head_num_(head_num), - kv_head_num_(kv_head_num), - size_per_head_(size_per_head), - hidden_units_(hidden_units), - inter_size_(inter_size), - weight_type_(weight_type), - attn_bias_(attn_bias), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) -{ - if (lora_param.policy == LoraPolicy::kPlora) { - std::vector keys = { - "attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"}; - std::vector*> weights = {&self_attn_weights.qkv, - &self_attn_weights.output, - &ffn_weights.gating, - &ffn_weights.output, - &ffn_weights.intermediate}; - for (int i = 0; i < keys.size(); i++) { - const auto& name = keys[i]; - auto& weight = *weights[i]; - int rank = lora_param.r; - float scale = lora_param.scale; - std::string full_name = "layers." + std::to_string(layer_idx) + "." + name; - - for (const auto& [re, pr] : lora_param.rank_pattern) { - if (std::regex_search(full_name, pr.first)) { - rank = pr.second; - TM_LOG_DEBUG("find rank, pattern=%s, name=%s, value=%d", re.c_str(), full_name.c_str(), rank); - break; - } - } - for (const auto& [re, pr] : lora_param.scale_pattern) { - if (std::regex_search(full_name, pr.first)) { - scale = pr.second; - TM_LOG_DEBUG("find scale pattern=%s, name=%s, value=%f", re.c_str(), full_name.c_str(), scale); - break; - } - } - if (rank) { - weight.lora.r = rank; - weight.lora.scale = scale; - weight.lora.policy = lora_param.policy; - } - } - } - - fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - - self_attn_weights.qkv.input_dims = hidden_units_; - self_attn_weights.qkv.output_dims = (head_num + 2 * kv_head_num) * size_per_head / tensor_para_size_; - self_attn_weights.qkv.type = weight_type; - self_attn_weights.qkv.group_size = group_size; - - self_attn_weights.output.input_dims = (head_num * size_per_head) / tensor_para_size_; - self_attn_weights.output.output_dims = hidden_units_; - self_attn_weights.output.type = weight_type; - self_attn_weights.output.group_size = group_size; - - ffn_weights = LlamaFfnWeight{ - hidden_units_, - inter_size_, - tensor_para_size_, - weight_type_, - group_size, - weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), - }; - - moe_weights = MoeFfnWeight{hidden_units_, - moe_param.inter_size, - moe_param.expert_num, - moe_param.method, - moe_param.shared_gate, - tensor_para_size_, - weight_type, - group_size, - is_fuse_silu_act()}; - - mallocWeights(); -} -#else template LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, @@ -203,15 +107,14 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - self_attn_weights.qkv.input_dims = hidden_units_; - self_attn_weights.qkv.output_dims = (head_num_ + 2 * kv_head_num_) * size_per_head_ / tensor_para_size_; - self_attn_weights.qkv.type = weight_type_; - self_attn_weights.qkv.group_size = model.group_size; - - self_attn_weights.output.input_dims = (head_num_ * size_per_head_) / tensor_para_size_; - self_attn_weights.output.output_dims = hidden_units_; - self_attn_weights.output.type = weight_type_; - self_attn_weights.output.group_size = model.group_size; + self_attn_weights = LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + attn_bias_, + tensor_para_size_, + weight_type_, + model.group_size}; ffn_weights = LlamaFfnWeight{ hidden_units_, @@ -224,11 +127,25 @@ 
LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, moe_weights = MoeFfnWeight{ layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; +} + +template +void LlamaDecoderLayerWeight::malloc(cudaStream_t st) +{ + deviceMalloc((T**)&self_attn_norm_weights, hidden_units_, st); + deviceMalloc((T**)&ffn_norm_weights, hidden_units_, st); + + self_attn_weights.malloc(st); - mallocWeights(); + if (inter_size_) { + ffn_weights.malloc(st); + } + + if (!moe_weights.experts.empty()) { + moe_weights.malloc(st); + } } -#endif template size_t LlamaDecoderLayerWeight::workspace_size() const noexcept { @@ -251,52 +168,6 @@ size_t LlamaDecoderLayerWeight::workspace_size() const noexcept return size * sizeof(uint16_t); } -template -void freeWeights(LlamaDenseWeight& weights) -{ - cudaFree(weights.kernel); - cudaFree(weights.bias); - cudaFree(weights.scales); - cudaFree(weights.zeros); - - weights.kernel = nullptr; - weights.bias = nullptr; - weights.scales = nullptr; - weights.zeros = nullptr; - - { - cudaFree(weights.lora.a); - cudaFree(weights.lora.b); - weights.lora.a = nullptr; - weights.lora.b = nullptr; - } -} - -template -void LlamaDecoderLayerWeight::mallocWeights(LlamaDenseWeight& weights, bool bias) -{ - if (bias) { - deviceMalloc((T**)&weights.bias, weights.output_dims); - } - const size_t bit_size = getBitSize(weights.type); - if (bit_size >= 16) { // fp16, fp32 - deviceMalloc((T**)&weights.kernel, weights.input_dims * weights.output_dims); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - FT_CHECK(weights.input_dims % factor == 0); - deviceMalloc((int**)&weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMemSetZero((int*)weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMalloc((T**)&weights.scales, weights.input_dims / weights.group_size * weights.output_dims); - deviceMalloc((T**)&weights.zeros, weights.input_dims / weights.group_size * weights.output_dims); - } - - if (weights.lora.r > 0) { - deviceMalloc((T**)&weights.lora.a, weights.input_dims * weights.lora.r); - deviceMalloc((T**)&weights.lora.b, weights.lora.r * weights.output_dims); - } -} - template std::string concat(FirstArg&& first, Args&&... 
args) { @@ -425,64 +296,24 @@ void loadWeights(LlamaDenseWeight& w, std::string prefix, FtCudaDataType mode } template -void LlamaDecoderLayerWeight::mallocWeights() +void LlamaDecoderLayerWeight::free(cudaStream_t st) { - deviceMalloc((T**)&self_attn_norm_weights, hidden_units_); - deviceMalloc((T**)&ffn_norm_weights, hidden_units_); + deviceFree(self_attn_norm_weights, st); + deviceFree(ffn_norm_weights, st); - mallocWeights(self_attn_weights.qkv, attn_bias_); - mallocWeights(self_attn_weights.output, attn_bias_); + self_attn_weights.free(st); if (inter_size_) { - mallocWeights(ffn_weights.gating, false); - mallocWeights(ffn_weights.intermediate, false); - mallocWeights(ffn_weights.output, false); + ffn_weights.free(st); } if (!moe_weights.experts.empty()) { - mallocWeights(moe_weights.gate, false); - for (auto& e : moe_weights.experts) { - mallocWeights(e.gating, false); - mallocWeights(e.intermediate, false); - mallocWeights(e.output, false); - } - if (moe_weights.shared_gate.output_dims) { - mallocWeights(moe_weights.shared_gate, false); - } + moe_weights.free(st); } } template -LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() -{ - cudaFree((void*)self_attn_norm_weights); - cudaFree((void*)ffn_norm_weights); - self_attn_norm_weights = nullptr; - ffn_norm_weights = nullptr; - - freeWeights(self_attn_weights.qkv); - freeWeights(self_attn_weights.output); - - if (inter_size_) { - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); - } - - if (!moe_weights.experts.empty()) { - freeWeights(moe_weights.gate); - for (auto& e : moe_weights.experts) { - freeWeights(e.fused_gating_intermediate); - freeWeights(e.gating); - freeWeights(e.intermediate); - freeWeights(e.output); - } - if (moe_weights.shared_gate.kernel) { - freeWeights(moe_weights.shared_gate); - } - } -} +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; template void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) @@ -561,7 +392,8 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) } // template -static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void convert_u4( + LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { FT_CHECK(weight.type == WeightType::kINT4); @@ -571,11 +403,11 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* get_weight_and_scales_layout(gemm::DataType::U4, is_fused_moe, getSMVersion(), use_simt); if (order_b == kColMajor) { - transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims); - cudaMemcpy(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault); + transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims, st); + cudaMemcpyAsync(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault, st); } - extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims); + extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims, st); sync_check_cuda_error(); MatrixLayout w_desc{ @@ -590,25 +422,22 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* k_desc.type = gemm::DataType::U4; k_desc.pack = pack_b; - 
cudaMemset(weight.kernel, 0, weight.input_dims * weight.output_dims / 2);
+    cudaMemsetAsync(weight.kernel, 0, weight.input_dims * weight.output_dims / 2, st);
 
-    FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, 0) == 0);
+    FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, st) == 0);
     sync_check_cuda_error();
 
     const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims;
 
     // std::cout << "fuse_scales_and_zeros\n";
-    fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count);
+    fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count, st);
     // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2);
     sync_check_cuda_error();
 
-    cudaDeviceSynchronize();
-
-    cudaFree(weight.scales);
-    cudaFree(weight.zeros);
-    weight.scales = weight.zeros = nullptr;
+    deviceFree(weight.scales, st);
+    deviceFree(weight.zeros, st);
 
-    deviceMalloc((half**)&weight.scales_zeros, scale_count * 2);
+    deviceMalloc((half**)&weight.scales_zeros, scale_count * 2, st);
 
     MatrixLayout s_desc{
         gemm::DataType::U32,
@@ -621,7 +450,7 @@ static void convert_u4(LlamaDenseWeight<T>& weight, bool is_fused_moe, void*
     MatrixLayout q_desc = s_desc;
     q_desc.pack = pack_v;
 
-    FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, 0) == 0);
+    FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, st) == 0);
     sync_check_cuda_error();
 
     weight.k_desc = k_desc;
@@ -631,7 +460,8 @@ static void convert_u4(LlamaDenseWeight<T>& weight, bool is_fused_moe, void*
 }
 
 template<class T>
-static void convert_fp(LlamaDenseWeight<T>& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt)
+static void
+convert_fp(LlamaDenseWeight<T>& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st)
 {
     using namespace gemm;
 
@@ -646,12 +476,13 @@ static void convert_fp(LlamaDenseWeight<T>& weight, bool is_fused_moe, void* wor
     const int output_dim = weight.output_dims;
 
     if (order_b == kColMajor) {
-        invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, nullptr);
+        invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, st);
         sync_check_cuda_error();
         // FT_CHECK(0);
     }
     else {
-        check_cuda_error(cudaMemcpy(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault));
+        check_cuda_error(
+            cudaMemcpyAsync(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st));
     }
 
     MatrixLayout src{
@@ -666,35 +497,42 @@ static void convert_fp(LlamaDenseWeight<T>& weight, bool is_fused_moe, void* wor
     dst.pack = pack_b;
 
     if (pack_b) {
-        FT_CHECK(Convert(workspace, src, weight.kernel, dst, nullptr) == 0);
+        FT_CHECK(Convert(workspace, src, weight.kernel, dst, st) == 0);
         sync_check_cuda_error();
         // FT_CHECK(0);
    }
else { - convert_fp(weight, is_fused_moe, workspace, size, use_simt); + convert_fp(weight, is_fused_moe, workspace, size, use_simt, st); } } template -void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void* workspace, size_t size) +void interleave(LlamaDenseWeight& c, + LlamaDenseWeight& a, + LlamaDenseWeight& b, + void* workspace, + size_t size, + cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -711,18 +549,18 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight const auto sentinel = tmp_c + c.output_dims * c.input_dims; FT_CHECK(sentinel <= (uint8_t*)workspace + size); - extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims); - extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims); + extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims, st); + extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims, st); - interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, 0); + interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, st); - compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims); + compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims, st); - interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, 0); - interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, 0); + interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, st); + interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, st); } else { - interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, 0); + interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, st); } // Check at function level @@ -730,7 +568,7 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight } template -void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t) +void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t, cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -739,9 +577,11 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& FT_CHECK(c.group_size == a.group_size); FT_CHECK(c.group_size == b.group_size); - auto _chunks = [](auto c, auto a, auto b, int height, int width) { - check_cuda_error(cudaMemcpy2D((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault)); - check_cuda_error(cudaMemcpy2D((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault)); + auto _chunks = [&](auto c, auto a, auto b, int height, int width) { + check_cuda_error( + cudaMemcpy2DAsync((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault, st)); + check_cuda_error( + cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); }; if (c.type == WeightType::kINT4) { @@ -758,37 +598,37 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& } template -void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop) +void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st) { const bool is_16xx = is_16xx_series(prop.name); - 
convert(self_attn_weights.qkv, false, workspace, size, is_16xx); - convert(self_attn_weights.output, false, workspace, size, is_16xx); + convert(self_attn_weights.qkv, false, workspace, size, is_16xx, st); + convert(self_attn_weights.output, false, workspace, size, is_16xx, st); auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) { if (fused_up_and_gate_) { auto& fused_up_and_gate = ffn.fused_gating_intermediate; - mallocWeights(fused_up_and_gate, false); + fused_up_and_gate.malloc(st); if (ffn.is_fused_silu) { - interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } else { - chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } - convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx, st); - freeWeights(ffn.gating); - freeWeights(ffn.intermediate); + ffn.gating.free(st); + ffn.intermediate.free(st); } else { - convert(ffn.gating, is_fused_moe, workspace, size, is_16xx); - convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.gating, is_fused_moe, workspace, size, is_16xx, st); + convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx, st); } - convert(ffn.output, is_fused_moe, workspace, size, is_16xx); + convert(ffn.output, is_fused_moe, workspace, size, is_16xx, st); }; if (inter_size_) { @@ -826,12 +666,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud auto& output = moe_weights.block.output; // TODO: free these ptrs - fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, nullptr); - output.kernel = gemm::make_blocked_ptrs(output_ptrs, nullptr); + fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, st); + output.kernel = gemm::make_blocked_ptrs(output_ptrs, st); if (!fused_param_ptrs.empty()) { - fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, nullptr); - output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, nullptr); + fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, st); + output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, st); } fused.k_desc.ld = output.k_desc.ld = 0; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 342775b70..4c871f9f5 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -30,19 +30,6 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = delete; - // LlamaDecoderLayerWeight(int layer_idx, - // size_t head_num, - // size_t kv_head_num, - // size_t size_per_head, - // size_t hidden_units, - // size_t inter_size, - // WeightType weight_type, - // int group_size, - // LoraParam lora_param, - // bool attn_bias, - // MoeParam moe_param, - // size_t tensor_para_size, - // size_t tensor_para_rank); LlamaDecoderLayerWeight(int layer_id, const ModelParam& model, @@ -59,11 +46,13 @@ struct LlamaDecoderLayerWeight { TensorMap getParams(std::string prefix); - void prepare(void* workspace, size_t size, const cudaDeviceProp& prop); + void prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st); size_t workspace_size() const noexcept; - void mallocWeights(LlamaDenseWeight& weights, bool bias); + void malloc(cudaStream_t st); + 
+ void free(cudaStream_t st); T* self_attn_norm_weights{}; T* ffn_norm_weights{}; @@ -84,8 +73,6 @@ struct LlamaDecoderLayerWeight { size_t tensor_para_rank_; bool is_maintain_buffer_ = false; bool fused_up_and_gate_; - - void mallocWeights(); }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index eed1a4119..766f8066c 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -23,6 +23,7 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/weight_type.h" #include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { @@ -78,12 +79,157 @@ struct LlamaDenseWeight { { return {sizeof(T) * input_dims * lora.r, sizeof(T) * lora.r * output_dims}; } + + void malloc(cudaStream_t st, bool with_bias = false) + { + if (with_bias) { + deviceMalloc((T**)&bias, output_dims, st); + } + const size_t bit_size = getBitSize(type); + if (bit_size >= 16) { // fp16, fp32 + deviceMalloc((T**)&kernel, input_dims * output_dims, st); + } + else { // int8, int4 + const int factor = sizeof(float) * 8 / bit_size; + FT_CHECK(input_dims % factor == 0); + deviceMalloc((int**)&kernel, input_dims * output_dims / factor, st); + deviceMalloc((T**)&scales, input_dims / group_size * output_dims, st); + deviceMalloc((T**)&zeros, input_dims / group_size * output_dims, st); + } + + if (lora.r > 0) { + deviceMalloc((T**)&lora.a, input_dims * lora.r, st); + deviceMalloc((T**)&lora.b, lora.r * output_dims, st); + } + } + + void free(cudaStream_t st) + { + deviceFree(kernel, st); + deviceFree(bias, st); + deviceFree(scales, st); + deviceFree(zeros, st); + deviceFree(lora.a, st); + deviceFree(lora.b, st); + } +}; + +template +struct LatentAttentionWeight { + + LatentAttentionWeight() = default; + + LatentAttentionWeight(size_t hidden_dim, + size_t q_lora_rank, + size_t kv_lora_rank, + int head_dim, + int head_num, + WeightType weight_type, + int group_size): + LatentAttentionWeight{} + { + if (q_lora_rank) { + q_a_proj.input_dims = hidden_dim; + q_a_proj.output_dims = q_lora_rank; + q_b_proj.input_dims = q_lora_rank; + q_b_proj.output_dims = head_num * head_dim; + q_b_proj.type = q_a_proj.type = weight_type; + q_b_proj.group_size = q_a_proj.group_size = group_size; + } + else { + q_proj.input_dims = hidden_dim; + q_proj.output_dims = head_num * head_dim; + q_proj.type = weight_type; + q_proj.group_size = group_size; + } + + kv_a_proj.input_dims = hidden_dim; + kv_a_proj.output_dims = kv_lora_rank; + kv_b_proj.input_dims = kv_lora_rank; + kv_b_proj.output_dims = head_num * head_dim; + kv_b_proj.type = kv_a_proj.type = weight_type; + kv_b_proj.group_size = kv_a_proj.group_size = group_size; + } + + void malloc(cudaStream_t st) + { + if (q_proj.output_dims) { + q_proj.malloc(st); + } + else { + q_a_proj.malloc(st); + q_b_proj.malloc(st); + deviceMalloc((T**)q_a_layernorm, q_a_proj.output_dims, st); + } + kv_a_proj.malloc(st); + kv_b_proj.malloc(st); + deviceMalloc((T**)kv_a_layernorm, kv_a_proj.output_dims, st); + } + + void free(cudaStream_t st) + { + q_proj.free(st); + q_a_proj.free(st); + q_b_proj.free(st); + kv_a_proj.free(st); + kv_b_proj.free(st); + deviceFree(q_a_layernorm, st); + deviceFree(kv_a_layernorm, st); + } + + LlamaDenseWeight q_proj; + + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + T* q_a_layernorm; + + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + T* 
kv_a_layernorm; }; template struct LlamaAttentionWeight { + + LlamaAttentionWeight() = default; + + LlamaAttentionWeight(size_t hidden_dim, + size_t head_dim, + size_t head_num, + size_t kv_head_num, + bool bias, + size_t tp, + WeightType weight_type, + int group_size) + { + qkv.input_dims = hidden_dim; + qkv.output_dims = (head_num + 2 * kv_head_num) * head_dim / tp; + qkv.type = weight_type; + qkv.group_size = group_size; + + output.input_dims = (head_num * head_dim) / tp; + output.output_dims = hidden_dim; + output.type = weight_type; + output.group_size = group_size; + + this->bias = bias; + } + + void malloc(cudaStream_t st) + { + qkv.malloc(st, bias); + output.malloc(st, bias); + } + + void free(cudaStream_t st) + { + qkv.free(st); + output.free(st); + } + LlamaDenseWeight qkv; LlamaDenseWeight output; + bool bias{}; }; template @@ -121,6 +267,21 @@ struct LlamaFfnWeight { output.group_size = group_size; } + void malloc(cudaStream_t st) + { + gating.malloc(st); + intermediate.malloc(st); + output.malloc(st); + } + + void free(cudaStream_t st) + { + gating.free(st); + intermediate.free(st); + output.free(st); + fused_gating_intermediate.free(st); + } + LlamaDenseWeight gating; LlamaDenseWeight intermediate; LlamaDenseWeight output; @@ -144,7 +305,7 @@ struct MoeFfnWeight { bool fuse_silu_act) { - if (param.expert_num.size() < layer_id) { + if (param.expert_num.size() <= layer_id) { return; } @@ -182,11 +343,33 @@ struct MoeFfnWeight { } } + void malloc(cudaStream_t st) + { + gate.malloc(st); + if (shared_gate.output_dims) { + shared_gate.malloc(st); + } + for (auto& e : experts) { + e.malloc(st); + } + } + + void free(cudaStream_t st) + { + gate.free(st); + shared_gate.free(st); + for (auto& e : experts) { + e.free(st); + } + block.free(st); + } + LlamaDenseWeight gate; std::vector> experts; LlamaDenseWeight shared_gate; + // reference into `experts` LlamaFfnWeight block; MoeParam::Method method{}; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 325789f29..2db8fc9c7 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -20,6 +20,7 @@ #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/memory_utils.h" #include @@ -96,13 +97,22 @@ LlamaWeight::LlamaWeight( FT_CHECK(hidden_units_ % tensor_para_size_ == 0); + check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + decoder_layer_weights.reserve(num_layer_); for (unsigned l = 0; l < num_layer_; ++l) { - decoder_layer_weights.push_back( + decoder_layer_weights.emplace_back( new LlamaDecoderLayerWeight(l, model, lora_param, moe_param, tp_size, tp_rank)); + decoder_layer_weights.back()->malloc(stream_); } - mallocWeights(); + FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); + deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_padded_ * hidden_units_ / tensor_para_size_, stream_); + deviceMalloc((T**)&output_norm_weight, hidden_units_, stream_); + deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_, stream_); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } #endif @@ -110,26 +120,21 @@ LlamaWeight::LlamaWeight( template LlamaWeight::~LlamaWeight() { - cudaFree((void*)pre_decoder_embedding_table); - cudaFree((void*)output_norm_weight); - cudaFree((void*)post_decoder_embedding_kernel); - - 
pre_decoder_embedding_table = nullptr; - output_norm_weight = nullptr; - post_decoder_embedding_kernel = nullptr; + deviceFree(pre_decoder_embedding_table, stream_); + deviceFree(output_norm_weight, stream_); + deviceFree(post_decoder_embedding_kernel, stream_); for (auto& p : decoder_layer_weights) { + p->free(stream_); delete p; } -} -template -void LlamaWeight::mallocWeights() -{ - FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_padded_ * hidden_units_ / tensor_para_size_); - deviceMalloc((T**)&output_norm_weight, hidden_units_); - deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_); + decoder_layer_weights.clear(); + + // Wait for deallocations + check_cuda_error(cudaStreamSynchronize(stream_)); + check_cuda_error(cudaStreamDestroy(stream_)); + stream_ = {}; } template @@ -205,13 +210,19 @@ void LlamaWeight::prepare(const cudaDeviceProp& prop) TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d\n", workspace_size); + // Wait for the weights to be filled externally + check_cuda_error(cudaDeviceSynchronize()); + if (workspace_size) { - deviceMalloc((char**)&workspace, workspace_size); + deviceMalloc((char**)&workspace, workspace_size, stream_); } for (auto& layer : decoder_layer_weights) { - layer->prepare(workspace, workspace_size, prop); + layer->prepare(workspace, workspace_size, prop, stream_); } - deviceFree(workspace); + + deviceFree(workspace, stream_); + + check_cuda_error(cudaStreamSynchronize(stream_)); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index f4ca7e455..51d67069d 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -22,27 +22,12 @@ #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/memory_utils.h" namespace turbomind { template struct LlamaWeight { LlamaWeight() = default; - // LlamaWeight(size_t head_num, - // size_t kv_head_num, - // size_t size_per_head, - // size_t hidden_units, - // size_t inter_size, - // size_t vocab_size, - // size_t num_layer, - // bool attn_bias, - // WeightType weight_type, - // int group_size, - // LoraParam lora_param, - // MoeParam moe_param, - // size_t tensor_para_size, - // size_t tensor_para_rank); LlamaWeight(const ModelParam& model_param, const LoraParam& lora_param, @@ -62,13 +47,12 @@ struct LlamaWeight { void prepare(const cudaDeviceProp& prop); std::vector*> decoder_layer_weights; - const T* pre_decoder_embedding_table{}; - const T* output_norm_weight{}; - const T* post_decoder_embedding_kernel{}; -private: - void mallocWeights(); + T* pre_decoder_embedding_table{}; + T* output_norm_weight{}; + T* post_decoder_embedding_kernel{}; +private: size_t hidden_units_; size_t vocab_size_; size_t vocab_size_padded_; @@ -78,6 +62,8 @@ struct LlamaWeight { size_t tensor_para_rank_; std::vector inter_size_; + + cudaStream_t stream_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_gemm.cc b/src/turbomind/models/llama/llama_gemm.cc index 62952cd71..f9a0191e4 100644 --- a/src/turbomind/models/llama/llama_gemm.cc +++ b/src/turbomind/models/llama/llama_gemm.cc @@ -84,7 +84,7 @@ int main(int argc, char* argv[]) return -1; } else { - ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, false); + ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, 
nullptr, false); } if (0) {} diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h index 3b01dee60..aaade1a51 100644 --- a/src/turbomind/models/llama/llama_kernels.h +++ b/src/turbomind/models/llama/llama_kernels.h @@ -154,7 +154,7 @@ template struct TempBuffer { TempBuffer(size_t size) { - deviceMalloc(&data, size, false); + cudaMalloc(&data, size); } T* data; }; diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index f8bfb8efe..e9a79ea5a 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -26,77 +26,71 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize) +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize) { - FT_CHECK_WITH_INFO(size >= ((size_t)0), "Ask deviceMalloc size " + std::to_string(size) + "< 0 is invalid."); - check_cuda_error(cudaMalloc((void**)(ptr), sizeof(T) * size)); + check_cuda_error(cudaMallocAsync((void**)(ptr), sizeof(T) * size, st)); if (is_random_initialize) { - cudaRandomUniform(*ptr, size); + cudaRandomUniform(*ptr, size, st); } } -template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(float** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(half** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_BF16 -template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_bfloat16** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif -template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(uint16_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(bool** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(char** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int8_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_FP8 -template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif template -void deviceMemSetZero(T* ptr, size_t size) -{ - check_cuda_error(cudaMemset(static_cast(ptr), 0, sizeof(T) * size)); -} - -template void deviceMemSetZero(float* ptr, size_t size); -template void deviceMemSetZero(half* ptr, size_t size); -template void deviceMemSetZero(int* ptr, size_t size); -template void deviceMemSetZero(uint32_t* ptr, size_t size); -template void deviceMemSetZero(bool* ptr, size_t size); -#ifdef ENABLE_FP8 -template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size); -#endif -#ifdef ENABLE_BF16 -template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size); -#endif - -template -void deviceFree(T*& ptr) +void deviceFree(T*& ptr, cudaStream_t st) { if 
(ptr != NULL) {
-        check_cuda_error(cudaFree(ptr));
+        check_cuda_error(cudaFreeAsync(ptr, st));
         ptr = NULL;
     }
 }
 
-template void deviceFree(float*& ptr);
-template void deviceFree(half*& ptr);
+template void deviceFree(float*& ptr, cudaStream_t);
+template void deviceFree(half*& ptr, cudaStream_t);
 #ifdef ENABLE_BF16
-template void deviceFree(__nv_bfloat16*& ptr);
+template void deviceFree(__nv_bfloat16*& ptr, cudaStream_t);
 #endif
-template void deviceFree(unsigned short*& ptr);
-template void deviceFree(int*& ptr);
-template void deviceFree(bool*& ptr);
-template void deviceFree(char*& ptr);
-template void deviceFree(int8_t*& ptr);
+template void deviceFree(unsigned short*& ptr, cudaStream_t);
+template void deviceFree(int*& ptr, cudaStream_t);
+template void deviceFree(bool*& ptr, cudaStream_t);
+template void deviceFree(char*& ptr, cudaStream_t);
+template void deviceFree(int8_t*& ptr, cudaStream_t);
+template void deviceFree(void*& ptr, cudaStream_t);
 #ifdef ENABLE_FP8
-template void deviceFree(__nv_fp8_e4m3*& ptr);
+template void deviceFree(__nv_fp8_e4m3*& ptr, cudaStream_t);
 #endif
 
+namespace {
+
+template<typename T>
+__global__ void fill_kernel(T* devptr, size_t size, T value)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    for (size_t i = idx; i < size; i += blockDim.x * gridDim.x) {
+        devptr[i] = value;
+    }
+}
+
+}  // namespace
+
 template<typename T>
 void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream)
 {
-    T* arr = new T[size];
-    std::fill(arr, arr + size, value);
-    check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream));
-    delete[] arr;
+    constexpr int threads = 512;
+    const int blocks = (size + threads - 1) / threads;
+    fill_kernel<<<blocks, threads, 0, stream>>>(devptr, size, value);
 }
 
 template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream);
@@ -280,23 +274,23 @@ __global__ void cuda_random_uniform_kernel(char* buffer, const size_t size
 }
 
 template<typename T>
-void cudaRandomUniform(T* buffer, const size_t size)
+void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t st)
 {
     static int seq_offset = 0;
-    cuda_random_uniform_kernel<<<256, 256>>>(buffer, size, seq_offset);
+    cuda_random_uniform_kernel<<<256, 256, 0, st>>>(buffer, size, seq_offset);
     seq_offset += 256 * 256;
 }
 
-template void cudaRandomUniform(float* buffer, const size_t size);
-template void cudaRandomUniform(half* buffer, const size_t size);
+template void cudaRandomUniform(float* buffer, const size_t size, cudaStream_t);
+template void cudaRandomUniform(half* buffer, const size_t size, cudaStream_t);
 #ifdef ENABLE_BF16
-template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size);
+template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size, cudaStream_t);
 #endif
-template void cudaRandomUniform(int* buffer, const size_t size);
-template void cudaRandomUniform(bool* buffer, const size_t size);
-template void cudaRandomUniform(char* buffer, const size_t size);
+template void cudaRandomUniform(int* buffer, const size_t size, cudaStream_t);
+template void cudaRandomUniform(bool* buffer, const size_t size, cudaStream_t);
+template void cudaRandomUniform(char* buffer, const size_t size, cudaStream_t);
 #ifdef ENABLE_FP8
-template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size);
+template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size, cudaStream_t);
 #endif
 
 // loads data from binary file. If it succeeds, returns a non-empty vector. 
If loading fails or @@ -366,10 +360,10 @@ int loadWeightFromBinFunc(T* ptr, std::vector shape, std::string filenam } else { T_IN* ptr_2 = nullptr; - deviceMalloc(&ptr_2, host_array.size(), false); + deviceMalloc(&ptr_2, host_array.size(), nullptr, false); cudaH2Dcpy(ptr_2, host_array.data(), host_array.size()); invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size()); - deviceFree(ptr_2); + deviceFree(ptr_2, nullptr); } return 0; } diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h index bb7a4f9c0..03a0ef7b3 100644 --- a/src/turbomind/utils/memory_utils.h +++ b/src/turbomind/utils/memory_utils.h @@ -23,16 +23,13 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true); +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize = false); template -void deviceMemSetZero(T* ptr, size_t size); +void deviceFree(T*& ptr, cudaStream_t st); template -void deviceFree(T*& ptr); - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0); +void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = {}); template void cudaD2Hcpy(T* tgt, const T* src, const size_t size); @@ -44,10 +41,10 @@ template void cudaD2Dcpy(T* tgt, const T* src, const size_t size); template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = NULL); +void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = {}); template -void cudaRandomUniform(T* buffer, const size_t size); +void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t stream = {}); template int loadWeightFromBin(T* ptr, From 260c9f04e83d9ecb849fc64c7392a42a44dd3f1c Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Sun, 17 Nov 2024 23:02:32 +0800 Subject: [PATCH 09/21] deepseek-v2-lite --- lmdeploy/turbomind/deploy/config.py | 11 +- lmdeploy/turbomind/deploy/module.py | 79 ++++++++- .../turbomind/deploy/source_model/__init__.py | 1 + .../deploy/source_model/deepseek2.py | 96 ++++++++++ .../turbomind/deploy/source_model/mixtral.py | 2 +- .../turbomind/deploy/target_model/base.py | 3 +- lmdeploy/turbomind/supported_models.py | 1 + src/turbomind/kernels/CMakeLists.txt | 1 + .../kernels/attention/reduce_kernel.h | 2 +- src/turbomind/kernels/gemm/moe_utils_v2.cu | 17 +- src/turbomind/kernels/gemm/moe_utils_v2.h | 1 + .../kernels/gemm/test/test_moe_utils.cu | 81 ++------- src/turbomind/kernels/gemm/test/testbed.h | 2 + src/turbomind/kernels/norm/CMakeLists.txt | 5 + src/turbomind/kernels/norm/rms_norm.cu | 108 ++++++++++++ src/turbomind/kernels/norm/rms_norm.h | 11 ++ src/turbomind/models/llama/CMakeLists.txt | 4 +- .../models/llama/LlamaDecoderLayerWeight.cc | 27 ++- .../models/llama/LlamaDecoderLayerWeight.h | 10 +- src/turbomind/models/llama/LlamaDenseWeight.h | 165 ++++++++---------- src/turbomind/models/llama/llama_params.h | 11 +- src/turbomind/models/llama/mla_utils.cu | 87 +++++++++ src/turbomind/models/llama/mla_utils.h | 57 ++++++ src/turbomind/models/llama/moe_ffn_layer.cc | 5 +- src/turbomind/models/llama/moe_ffn_layer.h | 2 +- .../models/llama/unified_attention_layer.cc | 116 ++++++++++-- .../models/llama/unified_attention_layer.h | 7 +- src/turbomind/models/llama/unified_decoder.cc | 25 +-- src/turbomind/models/llama/unified_decoder.h | 15 +- src/turbomind/models/llama/weight_type.h | 2 +- src/turbomind/python/bind.cpp | 13 +- .../triton_backend/llama/LlamaTritonModel.cc | 7 +- src/turbomind/utils/cuda_utils.h | 19 ++ 33 files changed, 760 insertions(+), 233 
deletions(-) create mode 100644 lmdeploy/turbomind/deploy/source_model/deepseek2.py create mode 100644 src/turbomind/kernels/norm/CMakeLists.txt create mode 100644 src/turbomind/kernels/norm/rms_norm.cu create mode 100644 src/turbomind/kernels/norm/rms_norm.h create mode 100644 src/turbomind/models/llama/mla_utils.cu create mode 100644 src/turbomind/models/llama/mla_utils.h diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 8bd7e6c51..b6f2ee765 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -37,7 +37,6 @@ class ModelConfig: hidden_units: int = None vocab_size: int = None num_layer: int = None - # inter_size: int = None inter_size: List[int] = None norm_eps: float = None attn_bias: int = 0 @@ -49,12 +48,18 @@ class ModelConfig: session_len: int = None tp: int = 1 model_format: str = 'hf' - # expert_num: int = 0 expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 moe_shared_gate: int = False - moe_norm_topk: int = False + norm_topk_prob: int = False + # MLA + q_lora_rank: int = 0 + kv_lora_rank: int = 0 + qk_rope_dim: int = 0 + v_head_dim: int = 0 + # tuning + tune_layer_num: int = 1 def verify(self): invalid = {} diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 51e842aee..6fa6fcb42 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -96,7 +96,8 @@ class Ffn(Module): def __init__(self, model: BaseOutputModel): self.model = model self.tp = model.tensor_para_size - # inter_sizes in config are padded and my differ from what's in the weights + # inter_sizes in config are padded and may be different from what's + # in the weights self.inter_size = model.model_config.inter_size self.group_size = max(1, model.model_config.group_size) @@ -134,7 +135,8 @@ def _export(self, def apply(self, i: int, r: BaseReader): for e in get_params(r.ffn(i, None)): - e(partial(self._export, self.inter_size[i], self._ffn), partial(r.ffn, i), i) + e(partial(self._export, self.inter_size[i], self._ffn), + partial(r.ffn, i), i) class MoeFfn(Ffn): @@ -156,11 +158,13 @@ def __init__(self, model: BaseOutputModel): self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): + if self.expert_num[i] == 0: + return for p in get_params(r.moe_ffn_expert()): for e in range(self.expert_num[i]): fmt = self._moe_ffn_expert.replace('E', str(e)) - p(partial(self._export, self.inter_size, fmt), partial(r.moe_ffn_expert, e, i), - i) + p(partial(self._export, self.inter_size, fmt), + partial(r.moe_ffn_expert, e, i), i) gate = transpose(r.moe_ffn_gate(i)) self.model.save_split(gate, self._moe_ffn_gate.format(i)) @@ -220,6 +224,67 @@ def apply(self, i: int, r: BaseReader): e(self._export, partial(r.attn, i), i) +class MLA(Module): + """ + requires: + r.mla(i, kind) + r.mla_norm(i) + """ + + _mla = 'layers.{0}.attention.{1}.{2}' + + def __init__(self, model: BaseOutputModel): + self.model = model + + def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): + if all(x is None for x in xs): + return + q_a, q_b, kv_a, kv_b, o = map(transpose, xs) + + cfg = self.model.model_config + qk_nope_dim = cfg.size_per_head - cfg.qk_rope_dim + + q_b = q_b.reshape(-1, cfg.size_per_head) + + # [nope_dim | rope_dim] -> [rope_dim | nope_dim] + q_nope, q_pe = torch.split(q_b, (qk_nope_dim, cfg.qk_rope_dim), dim=-1) + q_b = torch.cat((q_pe, q_nope), + dim=-1).view(-1, cfg.head_num * cfg.size_per_head) + + o = 
o.reshape(cfg.head_num, cfg.v_head_dim, -1) + o = torch.nn.functional.pad( + o, (0, 0, 0, cfg.size_per_head - cfg.v_head_dim, 0, 0)) + o = o.view(cfg.head_num * cfg.size_per_head, cfg.hidden_units) + + if q_a is not None: + self.model.save_split(pack_fn(q_a), + self._mla.format(idx, 'q_a_proj', kind)) + q_b_name = 'q_proj' if q_a is None else 'q_b_proj' + self.model.save_split(pack_fn(q_b), + self._mla.format(idx, q_b_name, kind), + split_dim=-1) + self.model.save_split(pack_fn(kv_a), + self._mla.format(idx, 'kv_a_proj', kind)) + self.model.save_split(pack_fn(kv_b), + self._mla.format(idx, 'kv_b_proj', kind), + split_dim=-1) + self.model.save_split(pack_fn(o), + self._mla.format(idx, 'wo', kind), + split_dim=0) + + _layernorm = 'layers.{0}.attention.{1}_a_layernorm' + + def apply(self, i: int, r: BaseReader): + + for f in get_params(r.attn(i, None), bias=False): + f(self._export, partial(r.mla, i), i) + + q, k = r.mla_norm(i) + if q is not None: + self.model.save_split(q, self._layernorm.format(i, 'q')) + self.model.save_split(k, self._layernorm.format(i, 'kv')) + + class Misc(Module): """ requires: @@ -260,7 +325,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - modules = [Attn, LayerNorm] + modules = [LayerNorm] + if model.model_config.kv_lora_rank: + modules.append(MLA) + else: + modules.append(Attn) if model.model_config.inter_size: modules.append(Ffn) if model.model_config.expert_num: diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index a36102e1c..011d7b555 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 +from .deepseek2 import DeepSeek2Model # noqa: F401 from .deepseek_vl import DeepSeekVLModel # noqa: F401 from .glm4 import Glm4Model # noqa: F401 from .internlm2 import InternLM2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py new file mode 100644 index 000000000..ccdfafb18 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class DeepSeek2Reader(LlamaReader): + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(r'mlp' if i == 0 else r'shared_expert\.') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.shared_experts.{key}_proj.{kind}' + if i == 0: + name = name.replace('shared_experts.', '') + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla(self, i: int, kind: str): + if not kind: + return self.filter(r'self_attn.*proj') + result = [] + for key in ['q_a_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']: + tensor = self.params.get( + f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}' + ) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla_norm(self, i: int): + result = [] + for k in ['q', 'kv']: + result.append(self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight')) + return (*result, ) + + +@INPUT_MODELS.register_module(name='deepseek2') +class DeepSeek2Model(LlamaModel): + + Reader = DeepSeek2Reader + + def tokenizer_info(self): + n_words = self.model_config['vocab_size'] + bos_id = self.model_config['bos_token_id'] + eos_id = self.model_config['eos_token_id'] + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + qk_nope_dim = cfg['qk_nope_head_dim'] + qk_rope_dim = cfg['qk_rope_head_dim'] + num_layer = cfg['num_hidden_layers'] + expert_num = cfg['n_routed_experts'] + expert_num = [expert_num] * num_layer + expert_num[0] = 0 + n_shared_experts = cfg['n_shared_experts'] + expert_inter_size = cfg['moe_intermediate_size'] + experts_per_token = cfg['num_experts_per_tok'] + inter_size = [n_shared_experts * expert_inter_size] * num_layer + inter_size[0] = cfg['intermediate_size'] + norm_topk_prob = cfg['norm_topk_prob'] + info.update( + kv_lora_rank=cfg['kv_lora_rank'], + q_lora_rank=cfg['q_lora_rank'] or 0, + qk_rope_dim=qk_rope_dim, + v_head_dim=cfg['v_head_dim'], + size_per_head=qk_rope_dim + qk_nope_dim, + rotary_embedding=qk_rope_dim, + expert_num=expert_num, + expert_inter_size=expert_inter_size, + experts_per_token=experts_per_token, + inter_size=inter_size, + norm_topk_prob=norm_topk_prob, + tune_layer_num=2 + ) + return info + diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index ff9df2d40..6ac22a658 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,6 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] - info['moe_norm_topk'] = True + info['norm_topk_prob'] = True info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 
c989d630d..2f33d030e 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -39,7 +39,8 @@ def _weight_dtype_map(weight_type: str, default=None): def _pad_inter_size(inter_size: int, group_size: int, tp: int): group_size = max(1, group_size) - groups_per_rank = (inter_size // group_size + tp - 1) // tp + group_num = (inter_size + group_size - 1) // group_size + groups_per_rank = (group_num + tp - 1) // tp inter_size_padded = groups_per_rank * group_size * tp return inter_size_padded diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 26be3a5cd..5411f72f0 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -32,6 +32,7 @@ InternVLChatModel='internvl', # deepseek-vl MultiModalityCausalLM='deepseekvl', + DeepseekV2ForCausalLM='deepseek2', # MiniCPMV MiniCPMV='minicpmv', # mini gemini diff --git a/src/turbomind/kernels/CMakeLists.txt b/src/turbomind/kernels/CMakeLists.txt index febb8692d..40a48402a 100644 --- a/src/turbomind/kernels/CMakeLists.txt +++ b/src/turbomind/kernels/CMakeLists.txt @@ -68,3 +68,4 @@ endif () add_subdirectory(attention) add_subdirectory(gemm) +add_subdirectory(norm) diff --git a/src/turbomind/kernels/attention/reduce_kernel.h b/src/turbomind/kernels/attention/reduce_kernel.h index 14c5005cd..b4c9064cf 100644 --- a/src/turbomind/kernels/attention/reduce_kernel.h +++ b/src/turbomind/kernels/attention/reduce_kernel.h @@ -127,7 +127,7 @@ struct Reduce { } __syncthreads(); - + // HeadDim / WARP_SIZE // 128 -> 4 // 64, 192 -> 2 diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 5912c60a8..c8c8db197 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -687,7 +687,8 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] const int* en2f, // [ e, n] :: (e,n) -> e*n const float* dst_scales, // [n] int dims, - int tokens) + int tokens, + float dst_scale) { using Vec = Array; @@ -695,7 +696,6 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; - float dst_scale = 0; if (dst_scales) { dst_scale = dst_scales[ti]; dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); @@ -712,7 +712,7 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] for (int i = threadIdx.x; i < dims; i += block_dim) { Array accum{}; - if (dst_scales) { + if (dst_scale) { Vec v; Ldg(v, dst_ptr[i].data()); using namespace ops; @@ -739,6 +739,7 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st) { // std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -754,7 +755,8 @@ void invokeMoeReduce(T* dst, en2f, dst_scales, dims / vec_size, - tokens); + tokens, + dst_scale); }; switch (experts_per_token) { @@ -774,10 +776,11 @@ void invokeMoeReduce(T* dst, } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); -#ifdef ENABLE_BF16 template void -invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); +invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); +#ifdef ENABLE_BF16 +template void invokeMoeReduce( + nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int 
exp_per_tok, std::mt19937& g) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 0e4c36af0..f2aa9870f 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -54,6 +54,7 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st); // Sample `e` from `E` experts uniformly for every token diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index 47e3bfdb1..58ad9d26d 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -45,72 +45,6 @@ void diff_vecs(const T* data, const T* refs, int m, int k, std::string msg) } } -#if 0 -void func() -{ - using thrust::universal_vector; - - // clang-format off - std::vector h_logits{ - 8, 5, 1, 4, 3, 6, 2, 7, - 50, 60, 90, 20, 70, 71, 72, 73, - 0, 1, 0, 0, 0, 1, 0, 1, - 0, 0, 0, 1, 0, 0, 0, 2}; - // clang-format on - - h_logits.resize(8); - - // auto tmp = h_logits; - // for (int i = 0; i < 127; ++i) { - // h_logits.insert(h_logits.end(), tmp.begin(), tmp.end()); - // } - - universal_vector logits(h_logits.begin(), h_logits.end()); - - const int E = 8; - const int n = h_logits.size() / E; - const int e = 2; - - const int n_padded = (n + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; - - universal_vector f2n(e * n); - universal_vector en2f(e * n); - universal_vector offsets(E + 1); - universal_vector accum(E * kMoeGateMaxTiles); - universal_vector scales(n * e); - universal_vector masks(E * n_padded); - - for (int i = 0; i < 10; ++i) { - gemm::CacheFlushing::flush(0); - cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); - invokeMoeGate_V2(f2n.data().get(), - en2f.data().get(), - offsets.data().get(), - scales.data().get(), - masks.data().get(), - accum.data().get(), - logits.data().get(), - n, - n_padded, - E, - e, - 0); - } - - auto err = cudaDeviceSynchronize(); - if (err) { - std::cerr << cudaGetErrorString(err) << "\n"; - } - - print_vecs(scales.data().get(), e, n, "scales", 12); - print_vecs(masks.data().get(), E, n_padded, "tmp"); - print_vecs(accum.data().get(), E, 1, "accum"); - print_vecs(offsets.data().get(), 1, E + 1, "offsets"); - print_vecs(f2n.data().get(), n * e, 1, "f2n"); - print_vecs(en2f.data().get(), e, n, "en2f"); -} -#endif - RNG& gRNG() { static RNG inst{}; @@ -286,7 +220,7 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, - true, + false, 0); } @@ -334,7 +268,7 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success && 1) { + if (!success || 1) { diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); @@ -353,6 +287,15 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); + for (int i = 0; i < tokens; ++i) { + float sum = 0; + for (int j = 0; j < experts_per_token; ++j) { + sum += scales[j * tokens + i]; + } + std::cout << sum << " "; + } + std::cout << "\n"; + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); @@ -393,7 +336,7 @@ int main() // test_moe_gate(32768, 64, 8, tape, tiling); // test_moe_gate(8, 60, 4, tape, tiling); - test_moe_gate(65536, 8, 2, tape, tiling); + test_moe_gate(16, 64, 6, tape, tiling); return 0; for (int i = 1; i < 16384; 
++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 2678470bb..4747644f9 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -518,6 +518,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); invokeMoeReduce(c_ref_.data().get(), @@ -528,6 +529,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); cudaDeviceSynchronize(); diff --git a/src/turbomind/kernels/norm/CMakeLists.txt b/src/turbomind/kernels/norm/CMakeLists.txt new file mode 100644 index 000000000..bc1569c40 --- /dev/null +++ b/src/turbomind/kernels/norm/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +add_library(rms_norm rms_norm.cu) +set_property(TARGET rms_norm PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET rms_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu new file mode 100644 index 000000000..ea5026a09 --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -0,0 +1,108 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "cub/block/block_reduce.cuh" + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" + +namespace turbomind { + +template +__global__ void RMSNormKernel( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + src += src_ld * ti; + + Array accum{}; + Array vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Array tmp = cast(vec); + using namespace ops; + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + dst += dst_ld * ti; + + Array sv; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Array tmp = cast(vec); + Load(sv, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + tmp[c] *= (float)sv[c] * sum; + } + Store(&dst[i], cast(tmp)); + } +} + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int threads = 256; + const int blocks = num; + + RMSNormKernel<<>>(dst, // + dst_ld, + src, + src_ld, + weights, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeRMSNorm(half* dst, + int dst_ld, + const half* src, + int src_ld, + const half* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#if ENABLE_BF16 +template void invokeRMSNorm(nv_bfloat16* dst, + int dst_ld, + const nv_bfloat16* src, + int src_ld, + const nv_bfloat16* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +} // namespace turbomind \ No newline at end of file diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h new file mode 100644 index 000000000..ebf49a53e --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -0,0 +1,11 @@ +// 
Copyright (c) OpenMMLab. All rights reserved. + +#include + +namespace turbomind { + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); + +} \ No newline at end of file diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 285fcea31..3c714bd23 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -20,11 +20,13 @@ add_library(Llama STATIC unified_attention_layer.cc llama_kernels.cu llama_decoder_kernels.cu - llama_utils.cu) + llama_utils.cu + mla_utils.cu) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart gemm2 + rms_norm cublasMMWrapper DynamicDecodeLayer activation_kernels diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index bb0794988..c05bf0219 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -111,6 +111,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, size_per_head_, head_num_, kv_head_num_, + model.mla, attn_bias_, tensor_para_size_, weight_type_, @@ -346,6 +347,25 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType } } +template +void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m, int tp_rank) +{ + if (w.q_proj.output_dims) { + getWeightTensor(w.q_proj, false, concat(p, "attention.q_proj", tp_rank), m); + } + else { + getWeightTensor(w.q_a_proj, false, concat(p, "attention.q_a_proj"), m); + getWeightTensor(w.q_b_proj, false, concat(p, "attention.q_b_proj", tp_rank), m); + m.insert(concat(p, "attention.q_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.q_b_proj.input_dims}, w.q_a_layernorm}); + } + getWeightTensor(w.kv_a_proj, false, concat(p, "attention.kv_a_proj"), m); + getWeightTensor(w.kv_b_proj, false, concat(p, "attention.kv_b_proj", tp_rank), m); + m.insert(concat(p, "attention.kv_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); +} + + template TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) { @@ -359,7 +379,12 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) auto get_prefix = [=](std::string_view name) { return concat(prefix, name, tensor_para_rank_); }; - getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + if (self_attn_weights.qkv.output_dims) { + getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + } + else { + getMLATensor(self_attn_weights, prefix, output, tensor_para_rank_); + } getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); if (inter_size_) { diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 4c871f9f5..59ee0ea5f 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -54,11 +54,13 @@ struct LlamaDecoderLayerWeight { void free(cudaStream_t st); - T* self_attn_norm_weights{}; - T* ffn_norm_weights{}; + T* self_attn_norm_weights{}; + T* ffn_norm_weights{}; + LlamaAttentionWeight self_attn_weights{}; - LlamaFfnWeight ffn_weights{}; - 
MoeFfnWeight moe_weights{}; + + LlamaFfnWeight ffn_weights{}; + MoeFfnWeight moe_weights{}; private: size_t head_num_; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 766f8066c..944781bf5 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -46,20 +46,31 @@ struct LoraWeight { template struct LlamaDenseWeight { - size_t input_dims; - size_t output_dims; - void* kernel; + size_t input_dims = 0; + size_t output_dims = 0; + WeightType type; // uninitialized + void* kernel = nullptr; + T* bias = nullptr; + T* scales = nullptr; + T* zeros = nullptr; + T* scales_zeros = nullptr; + int group_size = 1; + LoraWeight lora; - WeightType type; - T* bias; - T* scales; - T* zeros; - T* scales_zeros; - int group_size; gemm::MatrixLayout k_desc; gemm::MatrixLayout q_desc; + LlamaDenseWeight(): type{}, lora{}, k_desc{}, q_desc{} {} + + LlamaDenseWeight(size_t input_dim, size_t output_dim, WeightType type, int group_size): LlamaDenseWeight{} + { + this->input_dims = input_dim; + this->output_dims = output_dim; + this->type = type; + this->group_size = group_size; + } + size_t kernel_size() const noexcept { return getBitSize(type) * input_dims * output_dims / 8; @@ -114,80 +125,6 @@ struct LlamaDenseWeight { } }; -template -struct LatentAttentionWeight { - - LatentAttentionWeight() = default; - - LatentAttentionWeight(size_t hidden_dim, - size_t q_lora_rank, - size_t kv_lora_rank, - int head_dim, - int head_num, - WeightType weight_type, - int group_size): - LatentAttentionWeight{} - { - if (q_lora_rank) { - q_a_proj.input_dims = hidden_dim; - q_a_proj.output_dims = q_lora_rank; - q_b_proj.input_dims = q_lora_rank; - q_b_proj.output_dims = head_num * head_dim; - q_b_proj.type = q_a_proj.type = weight_type; - q_b_proj.group_size = q_a_proj.group_size = group_size; - } - else { - q_proj.input_dims = hidden_dim; - q_proj.output_dims = head_num * head_dim; - q_proj.type = weight_type; - q_proj.group_size = group_size; - } - - kv_a_proj.input_dims = hidden_dim; - kv_a_proj.output_dims = kv_lora_rank; - kv_b_proj.input_dims = kv_lora_rank; - kv_b_proj.output_dims = head_num * head_dim; - kv_b_proj.type = kv_a_proj.type = weight_type; - kv_b_proj.group_size = kv_a_proj.group_size = group_size; - } - - void malloc(cudaStream_t st) - { - if (q_proj.output_dims) { - q_proj.malloc(st); - } - else { - q_a_proj.malloc(st); - q_b_proj.malloc(st); - deviceMalloc((T**)q_a_layernorm, q_a_proj.output_dims, st); - } - kv_a_proj.malloc(st); - kv_b_proj.malloc(st); - deviceMalloc((T**)kv_a_layernorm, kv_a_proj.output_dims, st); - } - - void free(cudaStream_t st) - { - q_proj.free(st); - q_a_proj.free(st); - q_b_proj.free(st); - kv_a_proj.free(st); - kv_b_proj.free(st); - deviceFree(q_a_layernorm, st); - deviceFree(kv_a_layernorm, st); - } - - LlamaDenseWeight q_proj; - - LlamaDenseWeight q_a_proj; - LlamaDenseWeight q_b_proj; - T* q_a_layernorm; - - LlamaDenseWeight kv_a_proj; - LlamaDenseWeight kv_b_proj; - T* kv_a_layernorm; -}; - template struct LlamaAttentionWeight { @@ -197,39 +134,77 @@ struct LlamaAttentionWeight { size_t head_dim, size_t head_num, size_t kv_head_num, + MLAParam mla, bool bias, size_t tp, WeightType weight_type, int group_size) { - qkv.input_dims = hidden_dim; - qkv.output_dims = (head_num + 2 * kv_head_num) * head_dim / tp; - qkv.type = weight_type; - qkv.group_size = group_size; - - output.input_dims = (head_num * head_dim) / tp; - output.output_dims = hidden_dim; - output.type = 
weight_type; - output.group_size = group_size; - this->bias = bias; + if (mla.kv_lora_rank == 0) { + qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size}; + } + else { + const int qk_nope_dim = head_dim - mla.qk_rope_dim; + if (mla.q_lora_rank) { + q_a_proj = {hidden_dim, mla.q_lora_rank, weight_type, group_size}; + q_b_proj = {mla.q_lora_rank, head_num * head_dim / tp, weight_type, group_size}; + } + else { + q_proj = {hidden_dim, head_num * head_dim / tp, weight_type, group_size}; + } + kv_a_proj = {hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, weight_type, group_size}; + kv_b_proj = {mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp, weight_type, group_size}; + } + output = {(head_num * head_dim) / tp, hidden_dim, weight_type, group_size}; } void malloc(cudaStream_t st) { - qkv.malloc(st, bias); + if (qkv.output_dims) { + qkv.malloc(st, bias); + } + else { + if (q_proj.output_dims) { + q_proj.malloc(st); + } + else { + q_a_proj.malloc(st); + q_b_proj.malloc(st); + deviceMalloc((T**)&q_a_layernorm, q_b_proj.input_dims, st); + } + kv_a_proj.malloc(st); + kv_b_proj.malloc(st); + deviceMalloc((T**)&kv_a_layernorm, kv_b_proj.input_dims, st); + } output.malloc(st, bias); } void free(cudaStream_t st) { qkv.free(st); + q_proj.free(st); + q_a_proj.free(st); + q_b_proj.free(st); + kv_a_proj.free(st); + kv_b_proj.free(st); output.free(st); + deviceFree(q_a_layernorm, st); + deviceFree(kv_a_layernorm, st); } LlamaDenseWeight qkv; LlamaDenseWeight output; bool bias{}; + + LlamaDenseWeight q_proj; + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + + T* q_a_layernorm{}; + T* kv_a_layernorm{}; }; template @@ -315,7 +290,7 @@ struct MoeFfnWeight { return; } - printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); + // printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); gate.input_dims = hidden_dim; gate.output_dims = expert_num; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index baac216f9..ba915caaf 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -11,6 +11,13 @@ namespace turbomind { +struct MLAParam { + size_t q_lora_rank; + size_t kv_lora_rank; + size_t qk_rope_dim; + size_t v_head_dim; +}; + struct ModelParam { size_t head_num; size_t head_dim; @@ -25,6 +32,8 @@ struct ModelParam { int group_size; int start_id; int end_id; + MLAParam mla; + int tune_layer_num; std::vector inter_size; }; @@ -37,7 +46,7 @@ struct MoeParam { int experts_per_token; int inter_size; - bool norm_topk; + bool norm_topk_prob; bool shared_gate; std::vector expert_num; diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu new file mode 100644 index 000000000..16999b812 --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.cu @@ -0,0 +1,87 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
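Note on the MLA weight shapes introduced above: the projection dimensions all follow from MLAParam. The standalone sketch below re-derives them on the host; every concrete number in it (hidden_dim, head_num, head_dim, the lora ranks, rope/value dims, tp) is an illustrative DeepSeek-V2-style assumption, not a value read from any config file.

// Host-side sketch of the MLA weight-shape math in LlamaAttentionWeight.
// All concrete numbers below are illustrative assumptions.
#include <cstdio>

struct MLAParamSketch {
    size_t q_lora_rank, kv_lora_rank, qk_rope_dim, v_head_dim;
};

int main() {
    const size_t hidden_dim = 5120, head_dim = 192, head_num = 128, tp = 1;
    const MLAParamSketch mla{1536, 512, 64, 128};  // assumed DeepSeek-V2-like values

    const size_t qk_nope_dim = head_dim - mla.qk_rope_dim;

    // q path: low-rank q_a_proj / q_b_proj pair (a single q_proj is used when q_lora_rank == 0)
    std::printf("q_a_proj : %zu -> %zu\n", hidden_dim, mla.q_lora_rank);
    std::printf("q_b_proj : %zu -> %zu\n", mla.q_lora_rank, head_num * head_dim / tp);

    // kv_a_proj packs the compressed KV latent plus the rotary part of K
    std::printf("kv_a_proj: %zu -> %zu\n", hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim);
    // kv_b_proj expands the latent into per-head nope-K and V
    std::printf("kv_b_proj: %zu -> %zu\n",
                mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp);
    return 0;
}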
+#include "src/turbomind/kernels/core/array_ops.h" + +namespace turbomind { + +template +__global__ void mla_copy_qkv_kernel(T* qkv, + const T* q, // [h, head_dim] + const T* kv_a, // [kv_lora_rank, rope_dim] + const T* kv_b, // [h, nope_dim + v_head_dim] + int head_num, + int head_dim, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim) +{ + const int type = blockIdx.y; + + const int ti = blockIdx.x; + const int di = threadIdx.x; + + const int kv_b_dim = nope_dim + v_head_dim; + + for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { + Array data{}; + if (type == 0) { // Q + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size]); + } + else if (type == 1) { // K + if (di * vec_size < rope_dim) { + Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); + } + else { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); + } + } + else { // V + if (di * vec_size < v_head_dim) { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); + } + } + const int ti_stride = 3 * head_num * head_dim; + Store(&qkv[ti * ti_stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); + } +} + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + constexpr int vec_size = 16 / sizeof(T); + const int head_dim = nope_dim + rope_dim; + + dim3 block(head_dim / vec_size, head_num); + // make sure block size <= 1024 + while (block.x * block.y > 1024) { + block.y /= 2; + } + const dim3 grid(token_num, 3); + + mla_copy_qkv_kernel<<>>( + qkv, q, kv_a, kv_b, head_num, head_dim, nope_dim, rope_dim, kv_lora_rank, v_head_dim); +} + +template void invokeMLACopyQKV(uint16_t* qkv, + const uint16_t* q, + const uint16_t* kv_a, + const uint16_t* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +} // namespace turbomind \ No newline at end of file diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h new file mode 100644 index 000000000..8e5cad117 --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.h @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+#pragma once + +#include +#include + +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +template +void dispatchMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + auto invoke = [&](auto x) { + using type = decltype(x); + invokeMLACopyQKV((type*)qkv, + (const type*)q, + (const type*)kv_a, + (const type*)kv_b, + token_num, + head_num, + nope_dim, + rope_dim, + kv_lora_rank, + v_head_dim, + stream); + }; + if constexpr (sizeof(T) == 2) { + return invoke(uint16_t{}); + } + FT_CHECK(0); +} + +} // namespace turbomind \ No newline at end of file diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index dcb3f3706..99a66c223 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -107,7 +107,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id padded, expert_num, param_.experts_per_token, - param_.norm_topk, + param_.norm_topk_prob, stream_); sync_check_cuda_error(); @@ -220,7 +220,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } template -void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, const MoeFfnWeight& moe) { invokeMoeReduce(output, inout_buf_, @@ -230,6 +230,7 @@ void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) tokens, param_.experts_per_token, hidden_dim_, + output_scale, stream_); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index e7edb7a67..8911d931f 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -55,7 +55,7 @@ class MoeFfnLayer { void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - void reduce(T* output, int tokens, const MoeFfnWeight& moe); + void reduce(T* output, int tokens, float output_scale, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 2f99b0c2c..ba002a746 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -19,21 +19,24 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc -#include "src/turbomind/models/llama/unified_attention_layer.h" +#include +#include + #include "src/turbomind/kernels/attention/attention.h" #include "src/turbomind/kernels/attention/decoding.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/models/llama/mla_utils.h" +#include 
"src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" -#include -#include +#include "src/turbomind/utils/memory_utils.h" namespace turbomind { @@ -72,17 +75,14 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, } template -void UnifiedAttentionLayer::allocateBuffer(size_t q_count, - size_t k_count, - size_t batch_size, - const WeightType* weights) +void UnifiedAttentionLayer::allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - if (weights->qkv.lora.r) { - size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + weights->qkv.lora.r); + if (qkv_lora_rank) { + size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + qkv_lora_rank); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sz, false); } else { @@ -198,7 +198,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa allocateBuffer(token_num, // shared h_cu_k_len[batch_size] - h_cu_k_len[dc_batch_size], // prefill batch_size, - weights); + weights->qkv.lora.r); // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; @@ -210,11 +210,17 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa // } int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - ////////////////////////////////////////////// - /// qkv gemm - // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] - linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); - sync_check_cuda_error(); + + if (weights->qkv.output_dims) { + ////////////////////////////////////////////// + /// qkv gemm + // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] + linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); + } + else { + forward_mla(attention_input, token_num, *weights); + } count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); @@ -431,6 +437,84 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa sync_check_cuda_error(); } +template +void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const WeightType& w) +{ + const int q_lora_rank = w.q_a_proj.output_dims; + const int kv_lora_rank = w.kv_b_proj.input_dims; + const int qk_rope_dim = w.kv_a_proj.output_dims - kv_lora_rank; + const int qk_nope_dim = std::max(w.q_b_proj.output_dims, w.q_proj.output_dims) / local_head_num_ - qk_rope_dim; + const int v_head_dim = w.kv_b_proj.output_dims / local_head_num_ - qk_nope_dim; + + T* q{}; + + if (w.q_proj.output_dims) { + deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); + linear_->forward(q, inputs, token_num, w.q_proj); + sync_check_cuda_error(); + } + else { + FT_CHECK(0); + T* q_a{}; + deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); + + linear_->forward(q_a, inputs, token_num, w.q_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm(q_a, + q_lora_rank, + q_a, + q_lora_rank, + w.q_a_layernorm, + q_lora_rank, + token_num, + model_param_.norm_eps, + stream_); + 
sync_check_cuda_error(); + + deviceMalloc((T**)&q, (size_t)token_num * w.q_b_proj.output_dims, stream_); + linear_->forward(q, q_a, token_num, w.q_b_proj); + sync_check_cuda_error(); + + deviceFree(q_a, stream_); + } + + T* kv_a{}; + const int kv_a_dim = w.kv_a_proj.output_dims; + deviceMalloc((T**)&kv_a, (size_t)token_num * kv_a_dim, stream_); + + linear_->forward(kv_a, inputs, token_num, w.kv_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm( + kv_a, kv_a_dim, kv_a, kv_a_dim, w.kv_a_layernorm, kv_lora_rank, token_num, model_param_.norm_eps, stream_); + sync_check_cuda_error(); + + T* kv_b{}; + deviceMalloc((T**)&kv_b, (size_t)token_num * w.kv_b_proj.output_dims, stream_); + sync_check_cuda_error(); + + linear_->forward(kv_b, {kv_a, kv_a_dim}, token_num, w.kv_b_proj); + sync_check_cuda_error(); + + dispatchMLACopyQKV(qkv_buf_, + q, + kv_a, + kv_b, + token_num, + local_head_num_, + qk_nope_dim, + qk_rope_dim, + kv_lora_rank, + v_head_dim, + stream_); + sync_check_cuda_error(); + + deviceFree(q, stream_); + deviceFree(kv_a, stream_); + deviceFree(kv_b, stream_); +} + #ifdef ENABLE_FP32 template class UnifiedAttentionLayer; #endif diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index da0c0e6fc..7d331b0e4 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -42,7 +42,7 @@ class UnifiedAttentionLayer { static constexpr int kMaxWorkspaceTokens = 4096; void freeBuffer(); - void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, const WeightType* weights); + void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank); void allocateWorkspace(); void freeWorkspace(); @@ -70,7 +70,7 @@ class UnifiedAttentionLayer { const NcclParam& tp, const Context& context); - void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight* weights); + void forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights); void prefill(T* output, T* tmp_kv_buffer, @@ -107,6 +107,9 @@ class UnifiedAttentionLayer { int max_split_k, const WeightType* weights); +private: + void forward_mla(const T* inputs, int token_num, const WeightType& weights); + private: const size_t head_num_; const size_t kv_head_num_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index bc8a6a147..115e730b4 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -23,7 +23,8 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), allocator_(ctx.allocator.get()), - dtype_(getTensorType()) + dtype_(getTensorType()), + tune_layer_num_(model.tune_layer_num) { attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); @@ -65,13 +66,13 @@ void UnifiedDecoder::freeBuffer() } template -void UnifiedDecoder::forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight) +void UnifiedDecoder::forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + int layer_id, + const WeightType* weight) { TensorMap inputs(*_inputs); inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); @@ -84,7 +85,7 @@ void UnifiedDecoder::forwardSelfAttn(T* attn_io, TensorMap 
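The q_a / kv_a normalization in forward_mla uses the new invokeRMSNorm kernel. A minimal CPU reference of its row-wise math (note the epsilon is added to the mean of squares, exactly as in the kernel) can be handy for spot-checking small shapes:

// CPU reference for the RMS norm computed by the new rms_norm.cu kernel:
// y[i] = x[i] * w[i] / sqrt(mean(x^2) + eps)
#include <cmath>
#include <vector>

std::vector<float> rms_norm_ref(const std::vector<float>& x, const std::vector<float>& w, float eps)
{
    float sum_sq = 0.f;
    for (float v : x) {
        sum_sq += v * v;
    }
    const float scale = 1.f / std::sqrt(sum_sq / x.size() + eps);
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * w[i] * scale;
    }
    return y;
}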
outputs(*_outputs); outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - attn_layer_->forward(&outputs, &inputs, weight); + attn_layer_->forward(&outputs, &inputs, &weight->self_attn_weights); } template @@ -161,7 +162,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con for (size_t layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous - if (isTuning() && layer != 0) { + if (isTuning() && layer < tune_layer_num_) { continue; } @@ -175,7 +176,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con token_num, batch_size, layer, - &weights->at(layer)->self_attn_weights); + weights->at(layer)); count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); @@ -211,7 +212,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con } if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, weights->at(layer)->moe_weights); } count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index f13b4ba84..7e1bd8866 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -23,6 +23,7 @@ class UnifiedDecoder { cudaStream_t const stream_; IAllocator* const allocator_; const DataType dtype_; + const int tune_layer_num_; bool is_free_buffer_after_forward_{}; int* cu_q_len_{}; @@ -39,13 +40,13 @@ class UnifiedDecoder { using WeightType = LlamaDecoderLayerWeight; - void forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight); + void forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + int layer_id, + const WeightType* weight); public: UnifiedDecoder(const ModelParam& model, diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h index ade02e10e..27d7affe5 100644 --- a/src/turbomind/models/llama/weight_type.h +++ b/src/turbomind/models/llama/weight_type.h @@ -52,4 +52,4 @@ inline size_t getBitSize(WeightType type) return 0; } -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 4eb34249f..1e74f4bae 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -293,8 +293,17 @@ PYBIND11_MODULE(_turbomind, m) std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - cudaMemcpy( - const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); + cudaPointerAttributes at{}; + ft::check_cuda_error(cudaPointerGetAttributes(&at, self->data)); + { + // Switch to the same device where TM's tenosr memory resides because it's allocated + // from a pool with no peer access enabled (can't be accessed from a context of other devices) + ft::CudaDeviceGuard guard{at.device}; + ft::check_cuda_error(cudaMemcpy(const_cast(self->data), + const_cast(src->data), + num_bytes, + 
cudaMemcpyDefault)); + } break; } default: diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index deb28bb35..0dba71575 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -254,6 +254,11 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); + model_param_.tune_layer_num = model_reader["tune_layer_num"].as(1); + model_param_.mla.q_lora_rank = model_reader["q_lora_rank"].as(); + model_param_.mla.kv_lora_rank = model_reader["kv_lora_rank"].as(); + model_param_.mla.qk_rope_dim = model_reader["qk_rope_dim"].as(); + model_param_.mla.v_head_dim = model_reader["v_head_dim"].as(); attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); model_param_.quant_policy = engine_reader["quant_policy"].as(0); YAML::Node inter_size = model_reader["inter_size"]; @@ -304,7 +309,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); - moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); + moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(false); YAML::Node expert_num = model_reader["expert_num"]; for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { moe_param_.expert_num.push_back(it->as()); diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 2148fcc16..8311e6eb9 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -483,5 +483,24 @@ void compareTwoTensor( bool is_16xx_series(const char* name); +class CudaDeviceGuard { +public: + CudaDeviceGuard(int device) + { + cudaGetDevice(&last_device_id_); + if (device != last_device_id_) { + cudaSetDevice(device); + } + } + + ~CudaDeviceGuard() + { + cudaSetDevice(last_device_id_); + } + +private: + int last_device_id_{-1}; +}; + /* ************************** end of common utils ************************** */ } // namespace turbomind From 375afb43b5e6e167d3b2b1bb80d9f1fe7af7108e Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 19 Nov 2024 22:15:08 +0800 Subject: [PATCH 10/21] deepseek-v2 --- lmdeploy/turbomind/deploy/config.py | 6 +- lmdeploy/turbomind/deploy/converter.py | 7 +- lmdeploy/turbomind/deploy/loader.py | 23 ++++ lmdeploy/turbomind/deploy/module.py | 5 +- .../deploy/source_model/deepseek2.py | 6 +- .../attention/codegen/decoding_sm80_192.cu | 5 + src/turbomind/kernels/attention/decoding.cu | 13 +- .../kernels/attention/decoding_config.h | 12 +- src/turbomind/kernels/attention/impl_simt.h | 20 +-- src/turbomind/kernels/core/math.h | 11 +- src/turbomind/kernels/core/thread_map.h | 3 +- src/turbomind/kernels/gemm/moe_utils_v2.cu | 121 ++++++++++++++++-- src/turbomind/kernels/gemm/moe_utils_v2.h | 3 + .../kernels/gemm/test/test_moe_utils.cu | 11 +- src/turbomind/models/llama/LlamaBatch.cc | 6 +- src/turbomind/models/llama/LlamaFfnLayer.cc | 28 ++-- src/turbomind/models/llama/LlamaFfnLayer.h | 9 +- src/turbomind/models/llama/LlamaWeight.cc | 54 -------- src/turbomind/models/llama/llama_params.h | 13 +- src/turbomind/models/llama/llama_utils.cu | 3 + src/turbomind/models/llama/moe_ffn_layer.cc | 34 ++++- 
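As context for the bind.cpp change above: CudaDeviceGuard is an RAII helper that switches to a given CUDA device and restores the previous one on scope exit. A small usage sketch follows; error checking is omitted and it assumes src/turbomind/utils/cuda_utils.h (which defines the guard) is on the include path.

// Sketch: copy into memory that lives in another device's private pool by
// temporarily switching to the device that owns the destination pointer.
#include <cstddef>
#include <cuda_runtime.h>
#include "src/turbomind/utils/cuda_utils.h"

void copy_to_device_tensor(void* dst, const void* src, size_t bytes)
{
    cudaPointerAttributes attr{};
    cudaPointerGetAttributes(&attr, dst);
    turbomind::CudaDeviceGuard guard{attr.device};  // switch to the owning device
    cudaMemcpy(dst, src, bytes, cudaMemcpyDefault);
    // the guard's destructor restores the previous device here
}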
src/turbomind/models/llama/moe_ffn_layer.h | 6 +- .../models/llama/unified_attention_layer.cc | 24 ++-- src/turbomind/models/llama/unified_decoder.cc | 42 ++++-- src/turbomind/models/llama/unified_decoder.h | 1 + .../triton_backend/llama/LlamaTritonModel.cc | 4 + src/turbomind/utils/allocator.h | 2 +- 27 files changed, 333 insertions(+), 139 deletions(-) diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index b6f2ee765..1d1aa2755 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -43,7 +43,7 @@ class ModelConfig: start_id: int = None end_id: int = None size_per_head: int = 128 - group_size: int = 0 + group_size: int = 64 weight_type: str = None session_len: int = None tp: int = 1 @@ -53,6 +53,10 @@ class ModelConfig: experts_per_token: int = 0 moe_shared_gate: int = False norm_topk_prob: int = False + routed_scale: float = 1.0 + topk_group: int = 1 + topk_method: str = 'greedy' + moe_group_num: int = 1 # MLA q_lora_rank: int = 0 kv_lora_rank: int = 0 diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 1c847ede0..77f0bc8dc 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -241,11 +241,10 @@ def get_tm_model(model_path, engine_config.model_format = quant_method group_size = _group_size - # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) - if not group_size: - group_size = 128 - if engine_config.model_format in ['awq', 'gptq']: + # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) + if not group_size: + group_size = 128 assert group_size == 128, \ f'model format is "{engine_config.model_format}" ' \ f'but group_size is {group_size}. Currently, only 128 ' \ diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py index e3d79b164..dc319db9f 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/deploy/loader.py @@ -10,6 +10,7 @@ import torch from safetensors import safe_open +import safetensors # https://github.com/huggingface/transformers/blob/53fad641cfdb5105e2470bcf3ef17ea8e25cc300/src/transformers/modeling_utils.py#L372 WEIGHT_INDEX_NAME = 'pytorch_model.bin.index.json' @@ -89,6 +90,28 @@ def items(self): assert not params + # def items(self): + # params = defaultdict(dict) + # for shard in self.shards: + # # with safe_open(shard, 'pt') as f: + # with open(shard, 'rb') as f: + # w = safetensors.torch.load(f.read()) + # misc = [] + # for k in w.keys(): + # match = re.findall(self.pattern, k) + # if not match: + # misc.append(k) + # else: + # idx = int(match[0]) + # param = params[idx] + # param[k] = w[k] + # if len(param) == self.item_count[idx]: + # yield (idx, params.pop(idx)) + # if misc: + # yield (-1, {k: w[k] for k in misc}) + # assert not params + + class PytorchLoader(BaseLoader): def __init__(self, diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 6fa6fcb42..aa98ed646 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -239,7 +239,10 @@ def __init__(self, model: BaseOutputModel): def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): if all(x is None for x in xs): return - q_a, q_b, kv_a, kv_b, o = map(transpose, xs) + q_a, q_b, q, kv_a, kv_b, o = map(transpose, xs) + + if q is not None: + q_b = q cfg = self.model.model_config qk_nope_dim = cfg.size_per_head - cfg.qk_rope_dim diff --git 
a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py index ccdfafb18..f780ceeb7 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -37,7 +37,7 @@ def mla(self, i: int, kind: str): if not kind: return self.filter(r'self_attn.*proj') result = [] - for key in ['q_a_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']: + for key in ['q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']: tensor = self.params.get( f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}' ) @@ -90,6 +90,10 @@ def model_info(self): experts_per_token=experts_per_token, inter_size=inter_size, norm_topk_prob=norm_topk_prob, + routed_scale=cfg['routed_scaling_factor'], + topk_method=cfg['topk_method'], + topk_group=cfg['topk_group'], + moe_group_num=cfg['n_group'], tune_layer_num=2 ) return info diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu index 9294fc396..214e6748d 100644 --- a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu @@ -12,4 +12,9 @@ invokeDecoding>(const Att template bool invokeDecoding>(const AttentionParams& params); +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + } // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index b50b67f04..101b4170e 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -4,7 +4,6 @@ #include "decoding_config.h" #include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" -// #include "src/turbomind/utils/dispatch.h" #include #include @@ -115,7 +114,17 @@ void dispatchDecoding(const AttentionParams& params) }; if (params.size_per_head == 192) { - invokeDecoding>(params); + + if (is_kv_int8) { + invokeDecoding>(params); + } + else if (is_kv_int4) { + FT_CHECK_WITH_INFO(!is_kv_int4, "not implemented"); + // invokeDecoding>(params); + } + else { + invokeDecoding>(params); + } return; } diff --git a/src/turbomind/kernels/attention/decoding_config.h b/src/turbomind/kernels/attention/decoding_config.h index 7dcb119cf..dfd5e0783 100644 --- a/src/turbomind/kernels/attention/decoding_config.h +++ b/src/turbomind/kernels/attention/decoding_config.h @@ -40,7 +40,7 @@ struct DecodingConfig 2) }; template -struct DecodingConfig { +struct DecodingConfig> { static constexpr int Qh = (Qh_ + 7) / 8 * 8; using Attention = Impl; using CacheIter = GetBlockIterFactory; @@ -76,4 +76,14 @@ struct DecodingConfig { using Kernel = AttentionUniversal, CacheIter, DecodingCtaMap>; }; +template +struct DecodingConfig { + static constexpr int Qh = 1; + static constexpr int HeadDim = 192; + + using Attention = Impl; + using CacheIter = GetBlockIterFactory; + using Kernel = AttentionUniversal, Attention>, CacheIter, DecodingCtaMap>; +}; + } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index 667a0ce43..790fc1b4f 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -2,12 +2,17 @@ #pragma once -#include "src/turbomind/kernels/attention/impl.h" +#include +#include +#include + #include 
"src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/layout.h" #include "src/turbomind/kernels/core/thread_map.h" -#include -#include + +#include "src/turbomind/kernels/attention/impl.h" +#include "src/turbomind/kernels/attention/quantization.h" + namespace turbomind::attention { @@ -51,7 +56,7 @@ struct Impl), K_K); }; struct LinearD { @@ -165,11 +170,8 @@ struct Impl; - static constexpr int kWarpThreadC_KV = HeadDim != 192 ? HeadDim / kAccessC_KV : 8; - using ThreadMapQ = RakedThreadMap; - using ThreadMapKV = RakedThreadMap; + using ThreadMapKV = RakedThreadMap, kWarpCount>; // `WARP_SIZE / WARP_S` is chosen to achieve minimum kIterS w/o introducing partial S iter using ThreadMapKVp = RakedThreadMap<2, CTA_S, 2, kWarpCount, WARP_SIZE / WARP_S>; diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h index a708a3498..c78ab95ab 100644 --- a/src/turbomind/kernels/core/math.h +++ b/src/turbomind/kernels/core/math.h @@ -5,6 +5,7 @@ #include "src/turbomind/kernels/core/common.h" #include #include +#include namespace turbomind { @@ -41,10 +42,16 @@ TM_HOST_DEVICE constexpr T log2(T x) // static_assert(log2(32) == 5); // static_assert(log2(1) == 0); +template +TM_HOST_DEVICE constexpr T lowbit(T x) +{ + const std::make_signed_t s = x; + return static_cast(s & -s); +} + // https://arxiv.org/abs/1902.01961 template -struct FastDivMod { -}; +struct FastDivMod {}; template<> struct FastDivMod { diff --git a/src/turbomind/kernels/core/thread_map.h b/src/turbomind/kernels/core/thread_map.h index 66b691832..1271aefcc 100644 --- a/src/turbomind/kernels/core/thread_map.h +++ b/src/turbomind/kernels/core/thread_map.h @@ -3,6 +3,7 @@ #pragma once #include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" #include @@ -51,7 +52,7 @@ struct ThreadMapQ { } }; -template +template struct RakedThreadMap { static constexpr int kDimC = DimC; static constexpr int kDimS = DimS; diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index c8c8db197..66bf634ea 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -264,7 +264,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] int token_num_padded, int expert_num, int top_k, - bool norm_topk) + bool norm_topk, + float routed_scale) { constexpr int max_tiles = kMoeGateMaxTiles; constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 @@ -286,8 +287,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; - const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + // const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + // const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; float data[items_per_thread]; int idxs[items_per_thread]; @@ -536,7 +537,7 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] if (ti2 < token_num && idx < top_k) { masks[expert_id * token_num_padded + ti2] = idx; - scales[idx * token_num + ti2] = scale; + scales[idx * token_num + ti2] = scale * routed_scale; atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); // printf("%d %d %f\n", idx, expert_id, scale); @@ -569,6 +570,7 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n int experts, // E int experts_per_token, bool norm_topk, + float 
routed_scale, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -581,14 +583,14 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread, auto vec_size) { constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; constexpr int threads = 256; const int blocks = ceil_div(tokens, threads / thrs_per_tok); cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - MoeGateKernel_v8 + MoeGateKernel_v8 <<>>( // scales, (int8_t*)masks, @@ -600,28 +602,38 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n tokens_padded, experts, experts_per_token, - norm_topk); + norm_topk, + routed_scale); }; auto fail = [&] { - std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << experts + << ", top_k=" << experts_per_token << "\n"; std::abort(); }; if (experts <= 8) { if (experts_per_token <= 2) { - invoke(_Int<8>, _Int<2>, _Int<8>); + invoke(_Int<8>, _Int<2>, _Int<8>, _Int<4>); } else { - invoke(_Int<8>, _Int<8>, _Int<8>); + invoke(_Int<8>, _Int<8>, _Int<8>, _Int<4>); } } else if (experts <= 64) { if (experts_per_token <= 4) { - invoke(_Int<64>, _Int<4>, _Int<16>); + invoke(_Int<64>, _Int<4>, _Int<16>, _Int<4>); } else if (experts_per_token <= 8) { - invoke(_Int<64>, _Int<8>, _Int<16>); + invoke(_Int<64>, _Int<8>, _Int<16>, _Int<4>); + } + else { + fail(); + } + } + else if (experts <= 160) { + if (experts_per_token <= 8) { + invoke(_Int<160>, _Int<8>, _Int<10>, _Int<2>); } else { fail(); @@ -836,4 +848,89 @@ std::vector SampleBalanced(int token_num, int expert_num, int exp_per_tok, return ret; } +template +__global__ void MoeMaskTopKGroups(float* logits, int token_num, int expert_num, int top_k) +{ + constexpr int threads_per_token = max_expert_num / items_per_thread; + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + static_assert(items_per_thread % access_size == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + float max_val = -std::numeric_limits::infinity(); + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + PRAGMA_UNROLL + for (int c = 0; c < access_size; ++c) { + max_val = fmaxf(max_val, data[i + c]); + } + } + } + } + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + bool alive = false; + + for (int k = 0; k < top_k; ++k) { + int g_max_ei = ei; + float g_max_val = max_val; + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + if (ei == g_max_ei) { + alive = true; + max_val = -std::numeric_limits::infinity(); + } + } + + if (!alive && ti < token_num) { + 
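// Threads that never won a selection round above belong to groups outside the
// top `top_k` groups for this token; they overwrite their slice of the logits
// with -inf below, so the subsequent MoE gate can only pick experts from the
// surviving groups (group-limited greedy routing).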
Array vec; + fill(vec, -std::numeric_limits::infinity()); + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Store(&logits[ti * expert_num + e], vec); + } + } + } +} + +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st) +{ + auto invoke = [&](auto max_expert_num, auto items_per_thread, auto vec_size) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(token_num, threads / thrs_per_tok); + MoeMaskTopKGroups + <<>>(logits, token_num, expert_num, top_k); + }; + if (expert_num == 160 && group_size == 20) { + return invoke(_Int<160>, _Int<20>, _Int<4>); + } + + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << expert_num + << ", group_size=" << group_size << "\n"; + std::abort(); +} + } // namespace turbomind diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index f2aa9870f..d53de1354 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -22,6 +22,7 @@ void invokeMoeGate_V2(int* f2n, int experts, int exp_per_tok, bool norm_topk, + float routed_scale, cudaStream_t st); template @@ -57,6 +58,8 @@ void invokeMoeReduce(T* dst, float dst_scale, cudaStream_t st); +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st); + // Sample `e` from `E` experts uniformly for every token std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g); diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index 58ad9d26d..4b2ea6a83 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -205,6 +205,8 @@ bool test_moe_gate(int tokens, // cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + // invokeMaskMoeTopKGroups(logits.data().get(), tokens, expert_num, expert_num / 8, 3, nullptr); + for (int i = 0; i < 1; ++i) { gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); @@ -221,7 +223,8 @@ bool test_moe_gate(int tokens, // expert_num, experts_per_token, false, - 0); + 1.f, + nullptr); } // invokeMoeTiling(coords.data().get(), offsets.data().get(), expert_num, coords.size(), &tiling, 1, 0); @@ -268,7 +271,9 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success || 1) { + // print_vecs(logits.data().get(), tokens, expert_num, "logits", 12); + + if (!success && 1) { diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); @@ -336,7 +341,7 @@ int main() // test_moe_gate(32768, 64, 8, tape, tiling); // test_moe_gate(8, 60, 4, tape, tiling); - test_moe_gate(16, 64, 6, tape, tiling); + test_moe_gate(16, 160, 6, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 4138174e5..ea321d06a 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -20,6 +20,7 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" 
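For clarity, here is a CPU reference of the per-token masking that MoeMaskTopKGroups performs: experts are split into contiguous groups of `group_size`, each group is scored by its maximum logit, and every expert outside the best `topk_group` groups is masked to -inf before the regular top-k gate runs. This is a functional sketch of the intended math, not the kernel itself.

// Per-token group-limited masking: keep experts from the best `topk_group`
// groups (contiguous slices of `group_size` experts), mask the rest to -inf.
#include <algorithm>
#include <functional>
#include <limits>
#include <utility>
#include <vector>

void mask_topk_groups_ref(std::vector<float>& logits, int group_size, int topk_group)
{
    const int n_group = static_cast<int>(logits.size()) / group_size;
    std::vector<std::pair<float, int>> group_max(n_group);
    for (int g = 0; g < n_group; ++g) {
        float m = -std::numeric_limits<float>::infinity();
        for (int i = 0; i < group_size; ++i) {
            m = std::max(m, logits[g * group_size + i]);
        }
        group_max[g] = {m, g};
    }
    std::sort(group_max.begin(), group_max.end(), std::greater<>());  // best groups first
    std::vector<bool> keep(n_group, false);
    for (int k = 0; k < topk_group && k < n_group; ++k) {
        keep[group_max[k].second] = true;
    }
    for (int g = 0; g < n_group; ++g) {
        if (!keep[g]) {
            std::fill_n(logits.begin() + g * group_size, group_size,
                        -std::numeric_limits<float>::infinity());
        }
    }
}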
+#include "src/turbomind/utils/nccl_utils.h" #include #include #include @@ -1041,6 +1042,9 @@ LlamaBatch::LlamaBatch(const EngineParam& param, AllocateBuffer(max_batch_size_, session_len_, cache_block_seq_len); AllocatePersistantBuffer(max_batch_size_, cache_block_seq_len); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template @@ -1990,7 +1994,7 @@ void LlamaBatch::tune() nullptr, nullptr); // implicit barrier for TP - check_cuda_error(cudaStreamSynchronize(stream_)); + ftNcclStreamSynchronize(model_->tensor_para_, {}, stream_); } auto tock = std::chrono::steady_clock::now(); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 8cce20720..5afc75869 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -27,21 +27,20 @@ namespace turbomind { template -void LlamaFfnLayer::allocateBuffer(size_t token_num, - int inter_size, - const LlamaDenseWeight* gating, - const LlamaDenseWeight* inter) +void LlamaFfnLayer::allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) { - const size_t sz = token_num * inter_size; + const size_t sz = token_num * inter_size; - const size_t sz_gate = token_num * gating->lora.r; - const size_t sz_inter = token_num * inter->lora.r; + const size_t sz_gate = token_num * gating_lora_r; + const size_t sz_inter = token_num * inter_lora_r; - gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * 2 + sz_gate + sz_inter), false); - inter_buf_ = gating_buf_ + sz; + gating_buf_ = + (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * inter_buf_factor + sz_gate + sz_inter), false); + inter_buf_ = gating_buf_ + sz; // gate & inter is not fused when lora is enabled - if (gating->lora.r) { + if (gating_lora_r) { inter_buf_ += sz_gate; } @@ -93,12 +92,16 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const int layer_id = input_tensors->getVal("layer_id"); const int inter_size = weights->inter_size; - allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); + const bool is_fused_silu = weights->fused_gating_intermediate.kernel && weights->is_fused_silu; + + allocateBuffer(token_num, inter_size, is_fused_silu ? 
1 : 2, weights->gating.lora.r, weights->intermediate.lora.r); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); int* lora_mask = input_tensors->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + const bool all_reduce = input_tensors->getVal("all_reduce", false); + if (weights->fused_gating_intermediate.kernel) { NvtxScope scope("fused_silu_ffn"); @@ -145,7 +148,8 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); - if (all_reduce_ && tensor_para_.world_size_ > 1) { + if (all_reduce && tensor_para_.world_size_ > 1) { + // std::cout << "ffn all reduce " << layer_id << "\n"; NcclGuard nccl_guard(tensor_para_, stream_); ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 2daca2cc9..a72a24701 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -30,13 +30,12 @@ namespace turbomind { template class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): + LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx): hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()), - all_reduce_(all_reduce) + allocator_(ctx.allocator.get()) { } @@ -48,7 +47,8 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r); void freeBuffer(); @@ -59,7 +59,6 @@ class LlamaFfnLayer { cudaStream_t const stream_; LlamaLinear* const linear_; IAllocator* const allocator_; - const bool all_reduce_; bool is_free_buffer_after_forward_{}; T* gating_buf_{}; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 2db8fc9c7..596207298 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -25,58 +25,6 @@ #include namespace turbomind { -#if 0 -template -LlamaWeight::LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - hidden_units_(hidden_units), - inter_size_(inter_size), - vocab_size_(vocab_size), - vocab_size_padded_(vocab_size), - num_layer_(num_layer), - weight_type_(weight_type), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) -{ - if (vocab_size_padded_ % tensor_para_size_ != 0) { - vocab_size_padded_ = (vocab_size_padded_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; - TM_LOG_WARNING("pad vocab size from %d to %d", vocab_size_, vocab_size_padded_); - } - - FT_CHECK(hidden_units_ % tensor_para_size_ == 0); - - decoder_layer_weights.reserve(num_layer_); - for (unsigned l = 0; l < num_layer_; 
++l) { - decoder_layer_weights.push_back(new LlamaDecoderLayerWeight(l, - head_num, - kv_head_num, - size_per_head, - hidden_units_, - inter_size_, - weight_type_, - group_size, - lora_param, - attn_bias, - moe_param, - tensor_para_size_, - tensor_para_rank_)); - } - - mallocWeights(); -} -#else template LlamaWeight::LlamaWeight( @@ -115,8 +63,6 @@ LlamaWeight::LlamaWeight( check_cuda_error(cudaStreamSynchronize(stream_)); } -#endif - template LlamaWeight::~LlamaWeight() { diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index ba915caaf..ea00fde03 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -44,10 +44,15 @@ struct MoeParam { kFused } method; - int experts_per_token; - int inter_size; - bool norm_topk_prob; - bool shared_gate; + int experts_per_token; + int inter_size; + bool norm_topk_prob; + bool shared_gate; + float routed_scale; + + int topk_group; + std::string topk_method; + int n_group; std::vector expert_num; }; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 925c6b883..570e33b77 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -17,6 +17,7 @@ namespace turbomind { CmpMode compare_mode = kCmpRead; +// CmpMode compare_mode = kCmpWrite; template struct abs_diff_t { @@ -94,6 +95,8 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) // sum(abs(a - b)) auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size); std::cerr << key << ": " << asum << " " << asum / size << "\n"; + + check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); } template diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index 99a66c223..390d14754 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -11,20 +11,19 @@ #include "src/turbomind/utils/nvtx_utils.h" #include "src/turbomind/utils/string_utils.h" #include -#include #include namespace turbomind { template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num) +void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor) { char* base = 0; auto allocate = [&](void* base) { Monotonic alloc{base}; alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); - alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * 2); + alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * inter_buf_factor); alloc(&logits_, tokens * expert_num); alloc(&masks_, expert_num * padded); alloc(&f2n_, param_.experts_per_token * tokens); @@ -85,16 +84,37 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id FT_CHECK(expert_num); - AllocateBuffer(tokens, padded, expert_num); + const size_t inter_buf_factor = [&] { + if (param_.method == MoeParam::kNaive) { + return 0; // managed by ffn + } + else if (moe.block.is_fused_silu) { + return 1; + } + else { + return 2; + } + }(); + + AllocateBuffer(tokens, padded, expert_num, inter_buf_factor); gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); + // if (tensor_para_.rank_ == 0) { + // Compare(logits_, tokens * expert_num, Concat("logit", layer_id), compare_mode, stream_); + // } + check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, 
stream_)); - sync_check_cuda_error(); + check_cuda_error(cudaMemsetAsync(masks_, -1, sizeof(int8_t) * expert_num * padded, stream_)); // dump_logits(tokens, layer_id); + if (param_.topk_method == "group_limited_greedy") { + invokeMaskMoeTopKGroups(logits_, tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); + sync_check_cuda_error(); + } + /// TODO: fix illegal memory access even if NaN are present in logits invokeMoeGate_V2(f2n_, en2f_, @@ -108,6 +128,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id expert_num, param_.experts_per_token, param_.norm_topk_prob, + param_.routed_scale, stream_); sync_check_cuda_error(); @@ -220,7 +241,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } template -void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, const MoeFfnWeight& moe) +void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe) { invokeMoeReduce(output, inout_buf_, @@ -235,6 +256,7 @@ void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, const Moe sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { + // std::cout << "moe all reduce " << layer_id << "\n"; ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 8911d931f..74c62d004 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -35,7 +35,7 @@ class MoeFfnLayer { max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); } else { - expert_ffn_ = std::make_unique>(model, tp, ctx, false); + expert_ffn_ = std::make_unique>(model, tp, ctx); } h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); @@ -44,7 +44,7 @@ class MoeFfnLayer { accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); } - void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num); + void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor); void FreeBuffer(); @@ -55,7 +55,7 @@ class MoeFfnLayer { void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - void reduce(T* output, int tokens, float output_scale, const MoeFfnWeight& moe); + void reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index ba002a746..bf9b97378 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -203,10 +203,10 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; - static int count = 0; + // static int count = 0; - // if (layer_id == 0 && count == 0) { - // Compare(attention_input, token_num * weights->qkv.input_dims, "qkv_input", compare_mode, stream_); + // if (tensor_para_.rank_ == 0) { + // Compare(attention_input, token_num * hidden_units_, Concat("qkv_input", layer_id), compare_mode, stream_); // } int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, 
nullptr}).getPtr(); @@ -222,10 +222,14 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa forward_mla(attention_input, token_num, *weights); } + // std::cerr << layer_id << " " << count << " " << tensor_para_.rank_ << "\n"; + count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); - // if (layer_id == 0 && count == 0) { - // Compare(qkv_buf_, token_num * weights->qkv.output_dims, "qkv_buf", compare_mode, stream_); + // std::cerr << "token num: " << token_num << "\n"; + + // if (layer_id == 0 && count == 0 && tensor_para_.rank_ == 0) { + // Compare(qkv_buf_, token_num * (3 * local_head_num_ * size_per_head_), "qkv_buf", CMP_MODE, stream_); // } if constexpr (0) { @@ -421,8 +425,6 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); - // ++count; - count_and_fix(attention_out, token_num * weights->output.output_dims, Concat("wo", layer_id), 3); if (tensor_para_.world_size_ > 1) { @@ -431,10 +433,17 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa sync_check_cuda_error(); } + // if (tensor_para_.rank_ == 0) { + // Compare(attention_out, token_num * hidden_units_, Concat("attn_out", layer_id), compare_mode, stream_); + // // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); + // } + if (is_free_buffer_after_forward_ == true) { freeBuffer(); } sync_check_cuda_error(); + + // ++count; } template @@ -454,7 +463,6 @@ void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const sync_check_cuda_error(); } else { - FT_CHECK(0); T* q_a{}; deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 115e730b4..83015e558 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -5,6 +5,7 @@ #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include @@ -23,6 +24,7 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), allocator_(ctx.allocator.get()), + tp_(tp), dtype_(getTensorType()), tune_layer_num_(model.tune_layer_num) { @@ -34,7 +36,7 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, } if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { - ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + ffn_layer_ = std::make_unique>(model, tp, ctx); } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); @@ -162,7 +164,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con for (size_t layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous - if (isTuning() && layer < tune_layer_num_) { + if (isTuning() && layer >= tune_layer_num_) { continue; } @@ -196,14 +198,21 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con //////////////////////////////////////////// /// feed-forward network - if (!weights->at(layer)->moe_weights.experts.empty()) { + // if (tp_.rank_ == 0) 
{ + // Compare(decoder_output, token_num * hidden_units_, Concat("ffn_input", layer), compare_mode, stream_); + // } + + const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); + if (is_moe) { moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - if (ffn_layer_) { - int layer_id = layer; // int is needed + if (weights->at(layer)->ffn_weights.output.output_dims) { + int layer_id = layer; // int is needed + bool all_reduce = !is_moe; TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; + {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}, + {"all_reduce", {MEMORY_CPU, TYPE_BOOL, {1}, &all_reduce}}}; TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}}; if (inputs->isExist("lora_mask")) { ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); @@ -211,10 +220,18 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } - if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, weights->at(layer)->moe_weights); + // if (tp_.rank_ == 0) { + // Compare(decoder_output, token_num * hidden_units_, Concat("ffn_out", layer), compare_mode, stream_); + // } + + if (is_moe) { + moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); } + // if (tp_.rank_ == 0) { + // Compare(decoder_output, token_num * hidden_units_, Concat("moe_ffn_out", layer), compare_mode, stream_); + // } + count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); const bool is_last_layer = layer == layer_num_ - 1; @@ -261,6 +278,15 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con // Wait for `h_cu_q/k_len_` to be consumed check_cuda_error(cudaEventSynchronize(ev_h_cu_x_)); + + // check_cuda_error(cudaStreamSynchronize(stream_)); + // if (tp_.rank_ == 0) { + // std::abort(); + // } + // else { + // while (1) + // ; + // } } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index 7e1bd8866..e08567136 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -22,6 +22,7 @@ class UnifiedDecoder { const float rmsnorm_eps_; cudaStream_t const stream_; IAllocator* const allocator_; + const NcclParam tp_; const DataType dtype_; const int tune_layer_num_; bool is_free_buffer_after_forward_{}; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 0dba71575..909077ba3 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -310,6 +310,10 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.inter_size = model_reader["expert_inter_size"].as(0); moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(false); + moe_param_.routed_scale = model_reader["routed_scale"].as(1.f); + moe_param_.topk_group = model_reader["topk_group"].as(1); + moe_param_.topk_method = model_reader["topk_method"].as("greedy"); + moe_param_.n_group = model_reader["moe_group_num"].as(1); 
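// [Editorial note, not part of the patch] The routed_scale / topk_method / topk_group / n_group
// options parsed above configure DeepSeek-V2 style "group_limited_greedy" routing; the device
// path is invokeMaskMoeTopKGroups followed by invokeMoeGate_V2 in moe_ffn_layer.cc. The
// host-side sketch below only illustrates the assumed semantics (names are illustrative and the
// weighting assumes norm_topk_prob == false); it is not the kernel implementation.
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <utility>
#include <vector>

// Keep the `topk_group` expert groups with the largest per-group max logit, mask the other
// groups to -inf, softmax the surviving logits, then return the `top_k` (expert, weight)
// pairs with weights scaled by `routed_scale`.
static std::vector<std::pair<int, float>>
group_limited_greedy(std::vector<float> logits, int n_group, int topk_group, int top_k, float routed_scale)
{
    const int group_size = static_cast<int>(logits.size()) / n_group;
    auto group_max = [&](int g) {
        return *std::max_element(logits.begin() + g * group_size, logits.begin() + (g + 1) * group_size);
    };
    std::vector<int> groups(n_group);
    std::iota(groups.begin(), groups.end(), 0);
    std::partial_sort(groups.begin(), groups.begin() + topk_group, groups.end(), [&](int a, int b) {
        return group_max(a) > group_max(b);
    });
    std::vector<char> keep(n_group, 0);
    for (int i = 0; i < topk_group; ++i) {
        keep[groups[i]] = 1;
    }
    for (size_t e = 0; e < logits.size(); ++e) {
        if (!keep[e / group_size]) {
            logits[e] = -std::numeric_limits<float>::infinity();  // drop experts outside the kept groups
        }
    }
    const float m = *std::max_element(logits.begin(), logits.end());
    std::vector<float> prob(logits.size());
    float sum = 0.f;
    for (size_t e = 0; e < logits.size(); ++e) {
        sum += prob[e] = std::exp(logits[e] - m);  // masked entries contribute exp(-inf) == 0
    }
    std::vector<int> order(logits.size());
    std::iota(order.begin(), order.end(), 0);
    std::partial_sort(order.begin(), order.begin() + top_k, order.end(), [&](int a, int b) {
        return prob[a] > prob[b];
    });
    std::vector<std::pair<int, float>> out;
    for (int i = 0; i < top_k; ++i) {
        out.emplace_back(order[i], prob[order[i]] / sum * routed_scale);
    }
    return out;
}
// Masking whole groups before the top-k keeps the selection cheap: only topk_group * group_size
// logits stay finite, which is what the mask kernel achieves in place on `logits_`.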
YAML::Node expert_num = model_reader["expert_num"]; for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { moe_param_.expert_num.push_back(it->as()); diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index bdcb9bfc4..d18652ef9 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -281,7 +281,7 @@ class Allocator: public IAllocator { pointer_mapping_.erase(address); } else { - TM_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address); + FT_CHECK_WITH_INFO(0, fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); } } *ptr = nullptr; From bc296e51dd5d8948adab1c3a1dbb1bce5c11b714 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 20 Nov 2024 16:22:40 +0800 Subject: [PATCH 11/21] fix lint --- lmdeploy/turbomind/deploy/loader.py | 2 - .../deploy/source_model/deepseek2.py | 60 +++++++++---------- src/turbomind/kernels/attention/decoding.cu | 2 +- src/turbomind/kernels/norm/rms_norm.cu | 2 +- src/turbomind/kernels/norm/rms_norm.h | 2 +- src/turbomind/models/llama/mla_utils.cu | 2 +- src/turbomind/models/llama/mla_utils.h | 2 +- 7 files changed, 35 insertions(+), 37 deletions(-) diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py index dc319db9f..94e779b6b 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/deploy/loader.py @@ -10,7 +10,6 @@ import torch from safetensors import safe_open -import safetensors # https://github.com/huggingface/transformers/blob/53fad641cfdb5105e2470bcf3ef17ea8e25cc300/src/transformers/modeling_utils.py#L372 WEIGHT_INDEX_NAME = 'pytorch_model.bin.index.json' @@ -89,7 +88,6 @@ def items(self): yield (-1, {k: f.get_tensor(k) for k in misc}) assert not params - # def items(self): # params = defaultdict(dict) # for shard in self.shards: diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py index f780ceeb7..deb69f2a7 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -4,10 +4,10 @@ class DeepSeek2Reader(LlamaReader): - + def moe_ffn_gate(self, i): return self.params.get(f'model.layers.{i}.mlp.gate.weight') - + def moe_ffn_expert(self, e=None, i=None, kind=None): if not kind: return self.filter(r'experts') @@ -18,7 +18,7 @@ def moe_ffn_expert(self, e=None, i=None, kind=None): tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + def _ffn(self, i: int, kind: str): """Get ffn kind for layer i.""" if not kind: @@ -32,29 +32,32 @@ def _ffn(self, i: int, kind: str): tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + def mla(self, i: int, kind: str): if not kind: return self.filter(r'self_attn.*proj') result = [] - for key in ['q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', 'kv_b_proj', 'o_proj']: + for key in [ + 'q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', + 'kv_b_proj', 'o_proj' + ]: tensor = self.params.get( - f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}' - ) + f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}') tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - + def mla_norm(self, i: int): result = [] for k in ['q', 'kv']: - result.append(self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight')) + name = f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight' # noqa: E501 + 
result.append(self.params.get(name)) return (*result, ) - + @INPUT_MODELS.register_module(name='deepseek2') class DeepSeek2Model(LlamaModel): - + Reader = DeepSeek2Reader def tokenizer_info(self): @@ -78,23 +81,20 @@ def model_info(self): inter_size = [n_shared_experts * expert_inter_size] * num_layer inter_size[0] = cfg['intermediate_size'] norm_topk_prob = cfg['norm_topk_prob'] - info.update( - kv_lora_rank=cfg['kv_lora_rank'], - q_lora_rank=cfg['q_lora_rank'] or 0, - qk_rope_dim=qk_rope_dim, - v_head_dim=cfg['v_head_dim'], - size_per_head=qk_rope_dim + qk_nope_dim, - rotary_embedding=qk_rope_dim, - expert_num=expert_num, - expert_inter_size=expert_inter_size, - experts_per_token=experts_per_token, - inter_size=inter_size, - norm_topk_prob=norm_topk_prob, - routed_scale=cfg['routed_scaling_factor'], - topk_method=cfg['topk_method'], - topk_group=cfg['topk_group'], - moe_group_num=cfg['n_group'], - tune_layer_num=2 - ) + info.update(kv_lora_rank=cfg['kv_lora_rank'], + q_lora_rank=cfg['q_lora_rank'] or 0, + qk_rope_dim=qk_rope_dim, + v_head_dim=cfg['v_head_dim'], + size_per_head=qk_rope_dim + qk_nope_dim, + rotary_embedding=qk_rope_dim, + expert_num=expert_num, + expert_inter_size=expert_inter_size, + experts_per_token=experts_per_token, + inter_size=inter_size, + norm_topk_prob=norm_topk_prob, + routed_scale=cfg['routed_scaling_factor'], + topk_method=cfg['topk_method'], + topk_group=cfg['topk_group'], + moe_group_num=cfg['n_group'], + tune_layer_num=2) return info - diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 101b4170e..67bd81e45 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -114,7 +114,7 @@ void dispatchDecoding(const AttentionParams& params) }; if (params.size_per_head == 192) { - + if (is_kv_int8) { invokeDecoding>(params); } diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu index ea5026a09..cba0130b8 100644 --- a/src/turbomind/kernels/norm/rms_norm.cu +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -105,4 +105,4 @@ template void invokeRMSNorm(nv_bfloat16* dst, cudaStream_t st); #endif -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h index ebf49a53e..417daed14 100644 --- a/src/turbomind/kernels/norm/rms_norm.h +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -8,4 +8,4 @@ template void invokeRMSNorm( T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); -} \ No newline at end of file +} diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu index 16999b812..ad38c7550 100644 --- a/src/turbomind/models/llama/mla_utils.cu +++ b/src/turbomind/models/llama/mla_utils.cu @@ -84,4 +84,4 @@ template void invokeMLACopyQKV(uint16_t* qkv, int v_head_dim, cudaStream_t stream); -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h index 8e5cad117..bc06a352f 100644 --- a/src/turbomind/models/llama/mla_utils.h +++ b/src/turbomind/models/llama/mla_utils.h @@ -54,4 +54,4 @@ void dispatchMLACopyQKV(T* qkv, FT_CHECK(0); } -} // namespace turbomind \ No newline at end of file +} // namespace turbomind From 8fb10ba6d82c289453e753b98d0c2cab5caea58e Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 20 Nov 
2024 16:32:39 +0800 Subject: [PATCH 12/21] fix lint --- src/turbomind/kernels/attention/impl_simt.h | 1 - src/turbomind/kernels/attention/mainloop_sm80.h | 4 ++-- src/turbomind/kernels/attention/reduce.cu | 2 +- src/turbomind/kernels/core/math.h | 3 ++- .../flash_attention/flash_attention2/static_switch.h | 4 ++-- src/turbomind/kernels/gemm/convert_v2.cu | 2 +- src/turbomind/models/llama/LlamaDecoderLayerWeight.cc | 1 - src/turbomind/models/llama/LlamaDecoderLayerWeight.h | 2 +- src/turbomind/models/llama/LlamaFfnLayer.cc | 2 +- src/turbomind/models/llama/LlamaWeight.h | 2 +- src/turbomind/models/llama/llama_params.h | 6 ++++-- src/turbomind/models/llama/weight_type.h | 3 ++- src/turbomind/python/bind.cpp | 3 ++- src/turbomind/utils/allocator.h | 3 ++- 14 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index 790fc1b4f..444b67e2c 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -13,7 +13,6 @@ #include "src/turbomind/kernels/attention/impl.h" #include "src/turbomind/kernels/attention/quantization.h" - namespace turbomind::attention { template, Impl_> { template __device__ void operator()(Args&&... args) { - Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args&&)args)...); + Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args &&) args)...); } template @@ -236,7 +236,7 @@ struct Mainloop, Impl_> { Wait(); state_QK.Load(0, 0); - constexpr auto _ = [](int){}; + constexpr auto _ = [](int) {}; auto loop = [&](auto is_residue, auto is_mask) { const int offset_K = tile_iter * CTA_S; diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index 051b2baa7..c654f40d0 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -71,7 +71,7 @@ INSTANTIATE_invokeReduce(128, half); INSTANTIATE_invokeReduce(192, half); #if ENABLE_BF16 -INSTANTIATE_invokeReduce(64, nv_bfloat16) +INSTANTIATE_invokeReduce(64, nv_bfloat16); INSTANTIATE_invokeReduce(128, nv_bfloat16); INSTANTIATE_invokeReduce(192, nv_bfloat16); #endif diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h index c78ab95ab..054269c27 100644 --- a/src/turbomind/kernels/core/math.h +++ b/src/turbomind/kernels/core/math.h @@ -51,7 +51,8 @@ TM_HOST_DEVICE constexpr T lowbit(T x) // https://arxiv.org/abs/1902.01961 template -struct FastDivMod {}; +struct FastDivMod { +}; template<> struct FastDivMod { diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h index ca141ee0b..b1df29cb7 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h @@ -60,8 +60,8 @@ }() #elif 1 #define FWD_HEADDIM_SWITCH(HEADDIM, ...) 
\ - [&] { \ - if (HEADDIM <= 128) { \ + [&] { \ + if (HEADDIM <= 128) { \ constexpr static int kHeadDim = 128; \ return __VA_ARGS__(); \ } \ diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index 90e4b97dd..e58bfc9b9 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -303,7 +303,7 @@ void* make_blocked_ptrs(const std::vector>& ptrs, cudaStre { constexpr int N = 64; Param param{}; - static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 + static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 StridedPtr* ptr{}; cudaMallocAsync(&ptr, sizeof(StridedPtr) * ptrs.size(), stream); param.ptr = ptr; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 7f2c83315..0a2a3be17 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -365,7 +365,6 @@ void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); } - template TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) { diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 59ee0ea5f..9b204ed0d 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -39,7 +39,7 @@ struct LlamaDecoderLayerWeight { size_t tp_rank); ~LlamaDecoderLayerWeight(); - LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; + LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete; void loadModel(std::string dir_path, FtCudaDataType model_file_type); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 5afc75869..907467341 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -30,7 +30,7 @@ template void LlamaFfnLayer::allocateBuffer( size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) { - const size_t sz = token_num * inter_size; + const size_t sz = token_num * inter_size; const size_t sz_gate = token_num * gating_lora_r; const size_t sz_inter = token_num * inter_lora_r; diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index 8e9bfd4eb..629cd5612 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -37,7 +37,7 @@ struct LlamaWeight { ~LlamaWeight(); - LlamaWeight(const LlamaWeight& other) = delete; + LlamaWeight(const LlamaWeight& other) = delete; LlamaWeight& operator=(const LlamaWeight& other) = delete; void loadModel(std::string dir_path); diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 4a13740d7..ea6e21a08 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -40,7 +40,8 @@ struct ModelParam { }; struct MoeParam { - enum Method { + enum Method + { kNaive, kFused } method; @@ -93,7 +94,8 @@ struct EngineParam { int max_prefill_iters; }; -enum class LoraPolicy : int { +enum class LoraPolicy : int +{ kNull, kPlora, }; diff --git 
a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h index 27d7affe5..bc2f49a08 100644 --- a/src/turbomind/models/llama/weight_type.h +++ b/src/turbomind/models/llama/weight_type.h @@ -6,7 +6,8 @@ namespace turbomind { -enum class WeightType : int { +enum class WeightType : int +{ kFP32, kFP16, kFP8, // not supported yet diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 1e74f4bae..b1df396b1 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -297,7 +297,8 @@ PYBIND11_MODULE(_turbomind, m) ft::check_cuda_error(cudaPointerGetAttributes(&at, self->data)); { // Switch to the same device where TM's tenosr memory resides because it's allocated - // from a pool with no peer access enabled (can't be accessed from a context of other devices) + // from a pool with no peer access enabled (can't be accessed from a context of other + // devices) ft::CudaDeviceGuard guard{at.device}; ft::check_cuda_error(cudaMemcpy(const_cast(self->data), const_cast(src->data), diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index d18652ef9..88c299c3d 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -281,7 +281,8 @@ class Allocator: public IAllocator { pointer_mapping_.erase(address); } else { - FT_CHECK_WITH_INFO(0, fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); + FT_CHECK_WITH_INFO(0, + fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); } } *ptr = nullptr; From 103ba46888077be58294db66ccefcc2e7a8a60f9 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 20 Nov 2024 18:41:48 +0800 Subject: [PATCH 13/21] fix ut --- lmdeploy/turbomind/deploy/target_model/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index d84944793..f2c981bb2 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import collections import os.path as osp from abc import ABC +from collections.abc import Sequence import torch import tqdm @@ -92,7 +92,7 @@ def single_to_list(self, config: dict, keys): num_layer = int(config['num_layer']) for k in keys: v = config.get(k, None) - if v is not None and not isinstance(v, collections.Sequence): + if v is not None and not isinstance(v, Sequence): config[k] = [v] * num_layer return config From 9c7dbbd8e1882b7d567024cad23946ddff492df8 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:08:58 +0800 Subject: [PATCH 14/21] Update config.yaml --- autotest/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/autotest/config.yaml b/autotest/config.yaml index 6c92d2cf0..00b65c816 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -62,6 +62,7 @@ turbomind_chat_model: - liuhaotian/llava-v1.6-vicuna-7b - deepseek-ai/deepseek-vl-1.3b-chat - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf - THUDM/glm-4-9b-chat - openbmb/MiniCPM-Llama3-V-2_5 From 8169e975be5c84ff77a0be7eefedcf835da1a2bc Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:09:55 +0800 Subject: [PATCH 15/21] Update config.yaml --- autotest/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/autotest/config.yaml b/autotest/config.yaml index 00b65c816..30d54fa95 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -168,6 +168,7 @@ turbomind_quatization: - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf gptq: - internlm/internlm2_5-7b-chat From 4d710074419774a815d6e706360fe1bfe7660728 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 26 Nov 2024 18:24:52 +0800 Subject: [PATCH 16/21] fix mixtral --- src/turbomind/kernels/gemm/moe_utils_v2.cu | 17 ++++++++--------- src/turbomind/models/llama/unified_decoder.cc | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 66bf634ea..e97482dd3 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -414,7 +414,13 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] #endif - constexpr float kLog2e = 1.4426950408889634074; + // constexpr float kLog2e = 1.4426950408889634074; + // if (k == 0) { + // PRAGMA_UNROLL + // for (int i = 0; i < items_per_thread; ++i) { + // data[i] *= kLog2e; + // } + // } unsigned mask = (unsigned)-1; float max_logit; @@ -438,13 +444,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } - if (k == 0) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] *= kLog2e; - } - } - int g_max_ei = ei; float g_max_val = max_val; if constexpr (threads_per_token > 1) { @@ -487,7 +486,7 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < items_per_thread; ++i) { if (!norm_topk || used[i]) { - data[i] = exp2f(data[i] - max_logit); + data[i] = expf(data[i] - max_logit); sum_prob += data[i]; } } diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 83015e558..d9f556430 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ 
b/src/turbomind/models/llama/unified_decoder.cc @@ -207,7 +207,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - if (weights->at(layer)->ffn_weights.output.output_dims) { + if (weights->at(layer)->ffn_weights.output.kernel) { int layer_id = layer; // int is needed bool all_reduce = !is_moe; TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, From c6bd5fe14be9486ba19ce44a26612d5ce68fefc7 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 27 Nov 2024 15:19:53 +0800 Subject: [PATCH 17/21] fix moe gating & config parsing --- lmdeploy/turbomind/deploy/config.py | 4 +- .../turbomind/deploy/source_model/qwen.py | 2 +- src/turbomind/kernels/gemm/moe_utils_v2.cu | 21 +++++-- src/turbomind/models/llama/llama_utils.cu | 59 ++++++------------- .../triton_backend/llama/LlamaTritonModel.cc | 4 +- 5 files changed, 40 insertions(+), 50 deletions(-) diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index ac86a900d..1474792e2 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -58,8 +58,8 @@ class ModelConfig: expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 - moe_shared_gate: int = False - norm_topk_prob: int = False + moe_shared_gate: bool = False + norm_topk_prob: bool = False routed_scale: float = 1.0 topk_group: int = 1 topk_method: str = 'greedy' diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 772bd0303..637983e8c 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -178,6 +178,6 @@ def model_info(self): info['experts_per_token'] = cfg['num_experts_per_tok'] info['inter_size'] = cfg['shared_expert_intermediate_size'] info['moe_shared_gate'] = True - info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['norm_topk_prob'] = cfg['norm_topk_prob'] info['attn_bias'] = 1 return info diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index e97482dd3..548dc3dda 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -515,9 +515,11 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { - int e = (i + threadIdx.x) % max_expert_num; - int t = (i + threadIdx.x) / max_expert_num; - smem.shared_accum[t][e] = 0; + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + if (t < max_tiles) { + smem.shared_accum[t][e] = 0; + } } __syncthreads(); @@ -538,8 +540,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] masks[expert_id * token_num_padded + ti2] = idx; scales[idx * token_num + ti2] = scale * routed_scale; atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); - - // printf("%d %d %f\n", idx, expert_id, scale); } } @@ -613,6 +613,17 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n if (experts <= 8) { if (experts_per_token <= 2) { + // MoeGateKernel_V2<2, 128><<>>(scales, + // (int8_t*)masks, + // accum, + // logits, + // log_tile, + // tiles, + // tokens, + // tokens_padded, + // experts); + + // std::cout << tokens << " " << experts << " " << experts_per_token << " " << tokens_padded << "\n"; invoke(_Int<8>, _Int<2>, _Int<8>, _Int<4>); } else { diff 
--git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 570e33b77..d0ded16cb 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -1,49 +1,26 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/cuda_utils.h" #include #include #include #include +#include +#include + #include #include #include #include #include -#include + +#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { CmpMode compare_mode = kCmpRead; // CmpMode compare_mode = kCmpWrite; -template -struct abs_diff_t { - using type = T; -}; - -template<> -struct abs_diff_t { - using type = float; -}; - -template<> -struct abs_diff_t<__nv_bfloat16> { - using type = float; -}; - -template -struct abs_diff: public thrust::unary_function, typename abs_diff_t::type> { - __host__ __device__ float operator()(thrust::tuple x) const - { - using R = typename abs_diff_t::type; - auto r = R(thrust::get<0>(x)) - R(thrust::get<1>(x)); - return r < R(0) ? -r : r; - } -}; - template void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) { @@ -64,10 +41,8 @@ void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) template void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) { - // wait for b - check_cuda_error(cudaStreamSynchronize(stream)); // read a from file - thrust::host_vector h_a(size); + std::vector h_a(size); { const auto filename = "tmp/" + key + ".cmp"; std::ifstream ifs(filename, std::ios::binary); @@ -86,17 +61,21 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) } ifs.read((char*)h_a.data(), sizeof(T) * h_a.size()); } - // copy a to device - thrust::device_vector a = h_a; - // create abs(a - b) iterator - thrust::device_ptr dev_ptr(ptr); - auto zip_iter = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), dev_ptr)); - auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff{}); - // sum(abs(a - b)) - auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size); + std::vector h_b(size); + check_cuda_error(cudaMemcpyAsync(h_b.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); + + using Tacc = std::conditional_t, int64_t, float>; + + Tacc asum{}; + for (size_t i = 0; i < size; ++i) { + asum += std::abs((Tacc)h_a[i] - (Tacc)h_b[i]); + } + std::cerr << key << ": " << asum << " " << asum / size << "\n"; check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); } template diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 2939b4c71..ba964ddf0 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -315,8 +315,8 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); - moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); - moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(false); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(); + 
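// [Editorial note, not part of the patch] The `moe_shared_gate` flag read above enables the
// Qwen2-MoE shared expert, whose FFN output is blended into the routed-expert sum through a
// scalar sigmoid gate. The sketch below shows the assumed combination for one token, mirroring
// the upstream Qwen2-MoE formulation; the actual fusion lives in moe_ffn_layer.cc and the
// shared-expert FFN path, which this patch only shows in part.
#include <cmath>
#include <vector>

// `shared_gate_logit` is the dot product of the token activation with the exported
// `shared_gate.weight`; `routed_out` is the scale-weighted sum of the top-k expert outputs.
static std::vector<float> combine_shared_expert(const std::vector<float>& routed_out,
                                                const std::vector<float>& shared_out,
                                                float shared_gate_logit)
{
    const float g = 1.f / (1.f + std::exp(-shared_gate_logit));  // sigmoid gate in [0, 1]
    std::vector<float> out(routed_out.size());
    for (size_t i = 0; i < out.size(); ++i) {
        out[i] = routed_out[i] + g * shared_out[i];
    }
    return out;
}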
moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(); moe_param_.routed_scale = model_reader["routed_scale"].as(1.f); moe_param_.topk_group = model_reader["topk_group"].as(1); moe_param_.topk_method = model_reader["topk_method"].as("greedy"); From 3d4f22f76e3199934a33047161e3ef47ba233de3 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 27 Nov 2024 18:15:37 +0800 Subject: [PATCH 18/21] fix yarn for deepseek-v2 --- .../deploy/source_model/deepseek2.py | 25 +++++++++++++++++++ .../models/llama/unified_attention_layer.cc | 3 ++- src/turbomind/models/llama/unified_decoder.cc | 9 ------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py index deb69f2a7..0a4a9e189 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +import math + from .base import INPUT_MODELS from .llama import LlamaModel, LlamaReader @@ -55,6 +57,23 @@ def mla_norm(self, i: int): return (*result, ) +def get_yarn_attention_factor(rope_scaling: dict): + + scaling_factor = float(rope_scaling['factor']) + mscale = rope_scaling['mscale'] + mscale_all_dim = rope_scaling['mscale_all_dim'] + + def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + _mscale = float( + yarn_get_mscale(scaling_factor, mscale) / + yarn_get_mscale(scaling_factor, mscale_all_dim)) + return _mscale + + @INPUT_MODELS.register_module(name='deepseek2') class DeepSeek2Model(LlamaModel): @@ -97,4 +116,10 @@ def model_info(self): topk_group=cfg['topk_group'], moe_group_num=cfg['n_group'], tune_layer_num=2) + rope_scaling = cfg.get('rope_scaling') + if rope_scaling and rope_scaling['type'] == 'yarn': + info.update( + max_position_embeddings=rope_scaling[ + 'original_max_position_embeddings'], + attention_factor=get_yarn_attention_factor(rope_scaling)) return info diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index bf9b97378..e7d57db91 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -334,8 +334,9 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa }; float low, high; find_correction_range(param_.beta_fast, param_.beta_slow, low, high); + // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216 if (low == high) { - high += 0.01f; + high += 0.001f; } params.yarn_ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0; params.yarn_ramp_inv_factor_mul_min = 1.0 / (high - low) * low; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index d9f556430..d5fdf96e5 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -278,15 +278,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con // Wait for `h_cu_q/k_len_` to be consumed check_cuda_error(cudaEventSynchronize(ev_h_cu_x_)); - - // check_cuda_error(cudaStreamSynchronize(stream_)); - // if (tp_.rank_ == 0) { - // std::abort(); - // } - // else { - // while (1) - // ; - // } } #ifdef ENABLE_FP32 From 329e4416f72b912c1362fe2df657b3a66b1e4d94 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 27 Nov 2024 
20:22:09 +0800 Subject: [PATCH 19/21] fix `copy_from` --- src/turbomind/python/bind.cpp | 58 +++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index b1df396b1..5a344d954 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -215,6 +215,51 @@ DLTensor GetDLTensor(py::object obj) return dlmt->dl_tensor; } +static void safe_memcpy(void* dst, const void* src, size_t size) +{ + cudaPointerAttributes dat{}; + cudaPointerAttributes sat{}; + ft::check_cuda_error(cudaPointerGetAttributes(&dat, dst)); + ft::check_cuda_error(cudaPointerGetAttributes(&sat, src)); + try { + if (dat.devicePointer && sat.devicePointer) { + // Both can be accessed from current context + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + else if (dat.type == cudaMemoryTypeDevice && sat.type == cudaMemoryTypeDevice) { + if (dat.device != sat.device) { + // On different devices, try peer memcpy + ft::check_cuda_error(cudaMemcpyPeer(dst, dat.device, src, sat.device, size)); + } + else { + // Same device, switch to the device first (this is unlikely) + ft::CudaDeviceGuard guard(dat.device); + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + else { + // Unknown case, give it a try anyway + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + catch (...) { + int device_id{-1}; + cudaGetDevice(&device_id); + TM_LOG_ERROR("cudaMemcpy failed: dst=(%d, %d, %p, %p), src=(%d, %d, %p, %p), size=%s, device=%d", + (int)dat.type, + dat.device, + dat.devicePointer, + dat.hostPointer, + (int)sat.type, + sat.device, + sat.devicePointer, + sat.hostPointer, + std::to_string(size).c_str(), + device_id); + throw; + } +} + PYBIND11_MODULE(_turbomind, m) { // nccl param @@ -293,18 +338,7 @@ PYBIND11_MODULE(_turbomind, m) std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - cudaPointerAttributes at{}; - ft::check_cuda_error(cudaPointerGetAttributes(&at, self->data)); - { - // Switch to the same device where TM's tenosr memory resides because it's allocated - // from a pool with no peer access enabled (can't be accessed from a context of other - // devices) - ft::CudaDeviceGuard guard{at.device}; - ft::check_cuda_error(cudaMemcpy(const_cast(self->data), - const_cast(src->data), - num_bytes, - cudaMemcpyDefault)); - } + safe_memcpy(const_cast(self->data), src->data, num_bytes); break; } default: From 90d2529791ec6b8342d419fabc01f393dcec2ef2 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 28 Nov 2024 21:14:45 +0800 Subject: [PATCH 20/21] fix rms norm, rotary embedding & deepseek v2 attention --- lmdeploy/turbomind/deploy/config.py | 1 + lmdeploy/turbomind/deploy/module.py | 8 - .../deploy/source_model/deepseek2.py | 23 ++- .../kernels/attention/rotary_embedding.h | 17 ++ src/turbomind/kernels/gemm/moe_utils_v2.cu | 19 +++ src/turbomind/kernels/norm/rms_norm.cu | 159 ++++++++++++++++-- src/turbomind/kernels/norm/rms_norm.h | 10 ++ src/turbomind/models/llama/llama_params.h | 1 + src/turbomind/models/llama/llama_utils.cu | 15 +- src/turbomind/models/llama/mla_utils.cu | 44 ++--- .../models/llama/unified_attention_layer.cc | 29 +++- src/turbomind/models/llama/unified_decoder.cc | 66 ++++++-- .../triton_backend/llama/LlamaTritonModel.cc | 1 + 13 files changed, 320 insertions(+), 
73 deletions(-) diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 1474792e2..e483500e9 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -84,6 +84,7 @@ def verify(self): class AttentionConfig: rotary_embedding: int = 128 rope_theta: float = 10000.0 + softmax_scale: float = 0 attention_factor: float = None max_position_embeddings: int = 0 original_max_position_embeddings: int = 0 diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index aa98ed646..52497175e 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -245,14 +245,6 @@ def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): q_b = q cfg = self.model.model_config - qk_nope_dim = cfg.size_per_head - cfg.qk_rope_dim - - q_b = q_b.reshape(-1, cfg.size_per_head) - - # [nope_dim | rope_dim] -> [rope_dim | nope_dim] - q_nope, q_pe = torch.split(q_b, (qk_nope_dim, cfg.qk_rope_dim), dim=-1) - q_b = torch.cat((q_pe, q_nope), - dim=-1).view(-1, cfg.head_num * cfg.size_per_head) o = o.reshape(cfg.head_num, cfg.v_head_dim, -1) o = torch.nn.functional.pad( diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py index 0a4a9e189..0023f650f 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek2.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -57,7 +57,7 @@ def mla_norm(self, i: int): return (*result, ) -def get_yarn_attention_factor(rope_scaling: dict): +def get_yarn_params(rope_scaling: dict): scaling_factor = float(rope_scaling['factor']) mscale = rope_scaling['mscale'] @@ -71,7 +71,13 @@ def yarn_get_mscale(scale=1, mscale=1): _mscale = float( yarn_get_mscale(scaling_factor, mscale) / yarn_get_mscale(scaling_factor, mscale_all_dim)) - return _mscale + + softmax_scale = 0 + if mscale_all_dim: + scale = yarn_get_mscale(scaling_factor, mscale_all_dim) + softmax_scale = scale * scale + + return _mscale, softmax_scale @INPUT_MODELS.register_module(name='deepseek2') @@ -100,11 +106,12 @@ def model_info(self): inter_size = [n_shared_experts * expert_inter_size] * num_layer inter_size[0] = cfg['intermediate_size'] norm_topk_prob = cfg['norm_topk_prob'] + size_per_head = qk_rope_dim + qk_nope_dim info.update(kv_lora_rank=cfg['kv_lora_rank'], q_lora_rank=cfg['q_lora_rank'] or 0, qk_rope_dim=qk_rope_dim, v_head_dim=cfg['v_head_dim'], - size_per_head=qk_rope_dim + qk_nope_dim, + size_per_head=size_per_head, rotary_embedding=qk_rope_dim, expert_num=expert_num, expert_inter_size=expert_inter_size, @@ -118,8 +125,10 @@ def model_info(self): tune_layer_num=2) rope_scaling = cfg.get('rope_scaling') if rope_scaling and rope_scaling['type'] == 'yarn': - info.update( - max_position_embeddings=rope_scaling[ - 'original_max_position_embeddings'], - attention_factor=get_yarn_attention_factor(rope_scaling)) + attention_factor, softmax_scale = get_yarn_params(rope_scaling) + softmax_scale *= size_per_head**(-0.5) + info.update(max_position_embeddings=rope_scaling[ + 'original_max_position_embeddings'], + attention_factor=attention_factor, + softmax_scale=softmax_scale) return info diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h index 8e09da22c..db836ed18 100644 --- a/src/turbomind/kernels/attention/rotary_embedding.h +++ b/src/turbomind/kernels/attention/rotary_embedding.h @@ -131,6 +131,7 @@ struct FastRoPE { template __device__ void apply(Array& x, 
float timestep) { +#if 0 PRAGMA_UNROLL for (int i = 0; i < N; i += 2) { float c, s; @@ -144,6 +145,22 @@ struct FastRoPE { x[i + 1] = (T)tmp1; } } +#else + // Most models apply rotary embedding in half precision + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + float c, s; + sincosf(timestep * inv_freq_[i / 2], &s, &c); + s *= attention_scaling_; + c *= attention_scaling_; + T tmp0 = (T)c * x[i] - (T)s * x[i + 1]; + T tmp1 = (T)c * x[i + 1] + (T)s * x[i]; + if (is_valid_) { + x[i] = tmp0; + x[i + 1] = tmp1; + } + } +#endif } }; diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 548dc3dda..a9e4f7da5 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -733,6 +733,7 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] } for (int i = threadIdx.x; i < dims; i += block_dim) { +#if 1 Array accum{}; if (dst_scale) { Vec v; @@ -749,6 +750,24 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] accum = accum + x; } Store(dst_ptr[i].data(), cast(accum)); +#else + Array accum{}; + if (dst_scale) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = v * (T)dst_scale; + } + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + Vec v; + Ldg(v, src_ptr[e][i].data()); + using namespace ops; + const auto x = v * (T)scale[e]; + accum = accum + x; + } + Store(dst_ptr[i].data(), accum); +#endif } } diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu index cba0130b8..22fd69f52 100644 --- a/src/turbomind/kernels/norm/rms_norm.cu +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -8,8 +8,15 @@ namespace turbomind { template -__global__ void RMSNormKernel( - T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, float inv_dims) +__global__ void RMSNormKernel(T* dst, + int dst_ld, + const T* src, + int src_ld, + const T* __restrict__ weights, + int dims, + int num, + float eps, + float inv_dims) { const int ti = blockIdx.x; const int di = threadIdx.x * vec_size; @@ -56,13 +63,13 @@ __global__ void RMSNormKernel( Array sv; for (int i = di; i < dims; i += block_dim * vec_size) { Load(vec, &src[i]); - Array tmp = cast(vec); - Load(sv, &weights[i]); + Ldg(sv, &weights[i]); PRAGMA_UNROLL for (int c = 0; c < vec_size; ++c) { - tmp[c] *= (float)sv[c] * sum; + vec[c] = (T)((float)vec[c] * sum) * sv[c]; + // vec[c] = (T)((float)vec[c] * sum * (float)sv[c]); } - Store(&dst[i], cast(tmp)); + Store(&dst[i], vec); } } @@ -70,18 +77,20 @@ template void invokeRMSNorm( T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) { - constexpr int threads = 256; + constexpr int vec_size = 16 / sizeof(T); + + constexpr int threads = 512; const int blocks = num; - RMSNormKernel<<>>(dst, // - dst_ld, - src, - src_ld, - weights, - dims, - num, - eps, - 1.f / dims); + RMSNormKernel<<>>(dst, // + dst_ld, + src, + src_ld, + weights, + dims, + num, + eps, + 1.f / dims); } template void invokeRMSNorm(half* dst, @@ -105,4 +114,122 @@ template void invokeRMSNorm(nv_bfloat16* dst, cudaStream_t st); #endif +// r' <- r + (h + b) +// h' <- norm(r') * w +template +__global__ void BiasResidualRMSNormKernel(T* __restrict__ residual, + T* __restrict__ hidden_states, + const T* __restrict__ weights, + const T* __restrict__ bias, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + 
residual += dims * ti; + hidden_states += dims * ti; + + Array accum{}; + + Array r_vec; + Array h_vec; + Array b_vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Load(h_vec, &hidden_states[i]); + + using namespace ops; + r_vec = r_vec + h_vec; + + if (bias) { + Ldg(b_vec, &bias[i]); + r_vec = r_vec + b_vec; + } + + Store(&residual[i], r_vec); + + Array tmp = cast(r_vec); + + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + Array w_vec; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Ldg(w_vec, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + r_vec[c] = (T)((float)r_vec[c] * sum) * w_vec[c]; + } + Store(&hidden_states[i], r_vec); + } +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + constexpr int threads = 512; + const int blocks = num; + + BiasResidualRMSNormKernel<<>>(residual, // + hidden_states, + weights, + bias, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeBiasResidualRMSNorm(half* residual, + half* hidden_states, + const half* weights, + const half* bias, + int dims, + int num, + float eps, + cudaStream_t st); + +#if ENABLE_BF16 +template void invokeBiasResidualRMSNorm(nv_bfloat16* residual, + nv_bfloat16* hidden_states, + const nv_bfloat16* weights, + const nv_bfloat16* bias, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + } // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h index 417daed14..83fa0f826 100644 --- a/src/turbomind/kernels/norm/rms_norm.h +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -8,4 +8,14 @@ template void invokeRMSNorm( T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); +template +void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st); } + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index ea6e21a08..0a505b11a 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -63,6 +63,7 @@ struct AttentionParam { int rotary_embedding_dim; float rotary_embedding_base; int max_position_embeddings; + float softmax_scale; std::string rope_scaling_type; int original_max_position_embeddings; float rope_scaling_factor; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index d0ded16cb..eaa450ae2 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -65,14 +65,23 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) check_cuda_error(cudaMemcpyAsync(h_b.data(), 
ptr, sizeof(T) * size, cudaMemcpyDefault, stream)); check_cuda_error(cudaStreamSynchronize(stream)); - using Tacc = std::conditional_t, int64_t, float>; + using Tacc = std::conditional_t, int64_t, float>; + constexpr Tacc eps = std::is_integral_v ? 1 : 1e-8f; Tacc asum{}; + Tacc rsum{}; + Tacc amean{}; for (size_t i = 0; i < size; ++i) { - asum += std::abs((Tacc)h_a[i] - (Tacc)h_b[i]); + Tacc x = (Tacc)h_b[i]; + Tacc r = (Tacc)h_a[i]; + Tacc abs_diff = std::abs(x - r); + Tacc rel_diff = abs_diff / std::max(std::max(std::abs(r), std::abs(x)), eps); + asum += abs_diff; + rsum += rel_diff; + amean += std::abs(r); } - std::cerr << key << ": " << asum << " " << asum / size << "\n"; + std::cerr << key << ": " << amean / size << " " << asum << " " << asum / size << " " << rsum / size << "\n"; check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); check_cuda_error(cudaStreamSynchronize(stream)); diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu index ad38c7550..2f9e786f2 100644 --- a/src/turbomind/models/llama/mla_utils.cu +++ b/src/turbomind/models/llama/mla_utils.cu @@ -17,32 +17,38 @@ __global__ void mla_copy_qkv_kernel(T* qkv, { const int type = blockIdx.y; - const int ti = blockIdx.x; - const int di = threadIdx.x; + const int64_t ti = blockIdx.x; + const int di = threadIdx.x; const int kv_b_dim = nope_dim + v_head_dim; - for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { - Array data{}; - if (type == 0) { // Q - Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size]); + // for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { + const int hi = threadIdx.y; + Array data{}; + if (type == 0) { // Q + if (di * vec_size < rope_dim) { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + nope_dim + di * vec_size]); } - else if (type == 1) { // K - if (di * vec_size < rope_dim) { - Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); - } - else { - Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); - } + else { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size - rope_dim]); } - else { // V - if (di * vec_size < v_head_dim) { - Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); - } + } + else if (type == 1) { // K + if (di * vec_size < rope_dim) { + Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); + } + else { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); + } + } + else { // V + if (di * vec_size < v_head_dim) { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); } - const int ti_stride = 3 * head_num * head_dim; - Store(&qkv[ti * ti_stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); } + const int stride = 3 * head_num * head_dim; + Store(&qkv[ti * stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); + // } } template diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index e7d57db91..7462176f9 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -300,8 +300,15 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.num_heads = local_head_num_; params.num_kv_heads = local_kv_head_num_; params.size_per_head = 
size_per_head_; + // MSVC does not have M_LOG2E - params.inv_sqrt_dh = (float)std::log2(expf(1.)) / std::sqrt((float)params.size_per_head); + params.inv_sqrt_dh = (float)std::log2(expf(1.)); + if (param_.softmax_scale) { // model predefined softmax scale + params.inv_sqrt_dh *= param_.softmax_scale; + } + else { // default value + params.inv_sqrt_dh /= std::sqrt((float)params.size_per_head); + } params.rotary_embedding_dim = param_.rotary_embedding_dim; params.rotary_embedding_base = param_.rotary_embedding_base; @@ -421,6 +428,24 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa count_and_fix(qkv_buf_3_, token_num * weights->output.input_dims, Concat("attn", layer_id), 3); +#if 0 + if (!isTuning()) { + T* o{}; + cudaMallocAsync(&o, sizeof(T) * token_num * head_num_ * size_per_head_, stream_); + cudaMemsetAsync(o, 0, sizeof(T) * token_num * head_num_ * size_per_head_, stream_); + auto dst = o; + auto src = qkv_buf_3_; + for (int i = 0; i < token_num; ++i) { + for (int j = 0; j < head_num_; ++j) { + cudaMemcpyAsync(dst, src, sizeof(T) * 128, cudaMemcpyDefault, stream_); + src += 192; + dst += 128; + } + } + Compare(o, token_num * head_num_ * 128, "attn", kCmpRead, stream_); + } +#endif + ////////////////////////////////////////////// /// output gemm -> linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); @@ -458,7 +483,7 @@ void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const T* q{}; - if (w.q_proj.output_dims) { + if (w.q_proj.kernel) { deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); linear_->forward(q, inputs, token_num, w.q_proj); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index d5fdf96e5..1352669a4 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -1,14 +1,17 @@ -#include "src/turbomind/models/llama/unified_decoder.h" + +#include + +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" +#include "src/turbomind/models/llama/unified_decoder.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include namespace turbomind { @@ -144,19 +147,25 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con const int pf_offset = dc_batch_size; + const bool flag = false && !isTuning(); + // Compare(decoder_input_output, token_num * hidden_units_, "decoder_input", kCmpRead, stream_); // printf("%d %f\n", (int)token_num, rmsnorm_eps_); + if (flag) { + Compare(decoder_input_output, token_num * hidden_units_, "norm0", kCmpRead, stream_); + } + ///////////////////////////////////////////// /// RMSNorm - invokeRootMeanSquareNorm(decoder_output, - decoder_input_output, - weights->at(0)->self_attn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeRMSNorm(decoder_output, + decoder_input_output, + weights->at(0)->self_attn_norm_weights, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", 0), 2); @@ -168,7 +177,9 @@ 
void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con continue; } - // Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); + if (flag) { + Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); + } ///////////////////////////////////////////// /// self-attention @@ -182,16 +193,26 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); - invokeFusedAddBiasResidualRMSNorm(decoder_input_output, - decoder_output, - weights->at(layer)->self_attn_weights.output.bias, - weights->at(layer)->ffn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + if (flag) { + Compare(decoder_input_output, token_num * hidden_units_, "res0", kCmpRead, stream_); + Compare(decoder_output, token_num * hidden_units_, "attn_out", kCmpRead, stream_); + } + + invokeBiasResidualRMSNorm(decoder_input_output, + decoder_output, + weights->at(layer)->ffn_norm_weights, + weights->at(layer)->self_attn_weights.output.bias, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); + if (flag) { + Compare(decoder_input_output, token_num * hidden_units_, "res1", kCmpRead, stream_); + Compare(decoder_output, token_num * hidden_units_, "ffn_in", kCmpRead, stream_); + } + count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual0", layer), 2); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm1", layer), 2); @@ -228,6 +249,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); } + if (flag) { + Compare(decoder_output, token_num * hidden_units_, "ffn_out", kCmpRead, stream_); + } + // if (tp_.rank_ == 0) { // Compare(decoder_output, token_num * hidden_units_, Concat("moe_ffn_out", layer), compare_mode, stream_); // } @@ -250,6 +275,11 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual1", layer), 2); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", layer + 1), 2); + + if (flag) { + cudaStreamSynchronize(stream_); + std::abort(); + } } if (dc_batch_size) { diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index ba964ddf0..1c7c5eb46 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -279,6 +279,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, // rotary embedding parameters attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); attn_param_.rotary_embedding_base = attention_reader["rope_theta"].as(10000.0f); + attn_param_.softmax_scale = attention_reader["softmax_scale"].as(0); attn_param_.attention_factor = attention_reader["attention_factor"].as(-1.f); attn_param_.beta_fast = attention_reader["beta_fast"].as(32.f); attn_param_.beta_slow = attention_reader["beta_slow"].as(1.f); From f565ef71522a876f10accf0483610238030dc36b Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 28 Nov 2024 21:42:01 +0800 Subject: [PATCH 21/21] remove debug code --- .../models/llama/unified_attention_layer.cc | 18 -------- src/turbomind/models/llama/unified_decoder.cc | 45 ------------------- 2 files changed, 63 
deletions(-) diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 7462176f9..7a6eddc4b 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -428,24 +428,6 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa count_and_fix(qkv_buf_3_, token_num * weights->output.input_dims, Concat("attn", layer_id), 3); -#if 0 - if (!isTuning()) { - T* o{}; - cudaMallocAsync(&o, sizeof(T) * token_num * head_num_ * size_per_head_, stream_); - cudaMemsetAsync(o, 0, sizeof(T) * token_num * head_num_ * size_per_head_, stream_); - auto dst = o; - auto src = qkv_buf_3_; - for (int i = 0; i < token_num; ++i) { - for (int j = 0; j < head_num_; ++j) { - cudaMemcpyAsync(dst, src, sizeof(T) * 128, cudaMemcpyDefault, stream_); - src += 192; - dst += 128; - } - } - Compare(o, token_num * head_num_ * 128, "attn", kCmpRead, stream_); - } -#endif - ////////////////////////////////////////////// /// output gemm -> linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 1352669a4..ec0e75b7e 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -147,16 +147,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con const int pf_offset = dc_batch_size; - const bool flag = false && !isTuning(); - - // Compare(decoder_input_output, token_num * hidden_units_, "decoder_input", kCmpRead, stream_); - - // printf("%d %f\n", (int)token_num, rmsnorm_eps_); - - if (flag) { - Compare(decoder_input_output, token_num * hidden_units_, "norm0", kCmpRead, stream_); - } - ///////////////////////////////////////////// /// RMSNorm invokeRMSNorm(decoder_output, @@ -177,10 +167,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con continue; } - if (flag) { - Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); - } - ///////////////////////////////////////////// /// self-attention forwardSelfAttn(decoder_output, // @@ -193,11 +179,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); - if (flag) { - Compare(decoder_input_output, token_num * hidden_units_, "res0", kCmpRead, stream_); - Compare(decoder_output, token_num * hidden_units_, "attn_out", kCmpRead, stream_); - } - invokeBiasResidualRMSNorm(decoder_input_output, decoder_output, weights->at(layer)->ffn_norm_weights, @@ -208,21 +189,12 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con stream_); sync_check_cuda_error(); - if (flag) { - Compare(decoder_input_output, token_num * hidden_units_, "res1", kCmpRead, stream_); - Compare(decoder_output, token_num * hidden_units_, "ffn_in", kCmpRead, stream_); - } - count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual0", layer), 2); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm1", layer), 2); //////////////////////////////////////////// /// feed-forward network - // if (tp_.rank_ == 0) { - // Compare(decoder_output, token_num * hidden_units_, Concat("ffn_input", layer), compare_mode, stream_); - // } - const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); if 
(is_moe) {
             moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights);
@@ -241,22 +213,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con
             ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights);
         }
 
-        // if (tp_.rank_ == 0) {
-        //     Compare(decoder_output, token_num * hidden_units_, Concat("ffn_out", layer), compare_mode, stream_);
-        // }
-
         if (is_moe) {
             moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights);
         }
 
-        if (flag) {
-            Compare(decoder_output, token_num * hidden_units_, "ffn_out", kCmpRead, stream_);
-        }
-
-        // if (tp_.rank_ == 0) {
-        //     Compare(decoder_output, token_num * hidden_units_, Concat("moe_ffn_out", layer), compare_mode, stream_);
-        // }
-
         count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2);
 
         const bool is_last_layer = layer == layer_num_ - 1;
@@ -275,11 +235,6 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con
 
         count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual1", layer), 2);
         count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", layer + 1), 2);
-
-        if (flag) {
-            cudaStreamSynchronize(stream_);
-            std::abort();
-        }
     }
 
     if (dc_batch_size) {
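
Reviewer note: the fused path added in rms_norm.cu computes r' <- r + (h + b) and then h' <- RMSNorm(r') * w, as the kernel comment states. A minimal host-side reference of that contract may help when validating the CUDA kernel against a golden output; this is a float-only sketch with illustrative names (bias_residual_rms_norm_ref is not part of the patch), and bit-exact agreement with the half/bf16 kernel is not expected since the kernel applies the weight in T precision.

// bias_residual_rms_norm_ref.cc -- host-side reference for the fused kernel's math.
// Assumptions: row-major [num, dims] buffers, optional bias of length dims.
#include <cmath>
#include <vector>

void bias_residual_rms_norm_ref(std::vector<float>&       residual,       // [num, dims], updated in place (r')
                                std::vector<float>&       hidden_states,  // [num, dims], receives normed output (h')
                                const std::vector<float>& weights,        // [dims]
                                const std::vector<float>* bias,           // optional [dims], may be nullptr
                                int                       dims,
                                int                       num,
                                float                     eps)
{
    for (int t = 0; t < num; ++t) {
        float* r      = residual.data() + (size_t)t * dims;
        float* h      = hidden_states.data() + (size_t)t * dims;
        float  sum_sq = 0.f;
        // r' <- r + (h + b), accumulating the squared norm of r'
        for (int d = 0; d < dims; ++d) {
            r[d] += h[d] + (bias ? (*bias)[d] : 0.f);
            sum_sq += r[d] * r[d];
        }
        const float inv_rms = 1.f / std::sqrt(sum_sq / dims + eps);
        // h' <- norm(r') * w
        for (int d = 0; d < dims; ++d) {
            h[d] = r[d] * inv_rms * weights[d];
        }
    }
}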
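
The extended CmpRead in llama_utils.cu now prints four statistics per key: the mean magnitude of the reference values, the summed absolute error, the mean absolute error, and the mean relative error (with a small eps guarding near-zero denominators). The standalone sketch below recomputes the same bookkeeping on the host; the function name and the ref/x argument names are illustrative only, with ref playing the role of h_a (values read from disk) and x the role of h_b (the live tensor).

// cmp_metrics_ref.cc -- illustrative recomputation of the statistics printed by CmpRead.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void print_cmp_metrics(const char* key, const std::vector<float>& ref, const std::vector<float>& x)
{
    const float eps  = 1e-8f;  // guards the relative error when both values are ~0
    double      asum = 0, rsum = 0, amean = 0;
    for (size_t i = 0; i < ref.size(); ++i) {
        const float abs_diff = std::abs(x[i] - ref[i]);
        const float denom    = std::max({std::abs(ref[i]), std::abs(x[i]), eps});
        asum += abs_diff;
        rsum += abs_diff / denom;
        amean += std::abs(ref[i]);
    }
    const double n = (double)ref.size();
    // Same order as the patched CmpRead: mean |ref|, total abs diff, mean abs diff, mean rel diff.
    std::printf("%s: %g %g %g %g\n", key, amean / n, asum, asum / n, rsum / n);
}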
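
The softmax-scale handling added to UnifiedAttentionLayer folds an optional model-defined scale into the log2-domain factor used by the attention kernels (log2(e) is computed at runtime because MSVC lacks M_LOG2E). A small sketch of the effective logic, under the assumption that a softmax_scale of 0 means "unset" (matching the default wired up in LlamaTritonModel.cc); the helper name is hypothetical:

#include <cmath>

// Returns the factor applied to Q*K^T before the exp2-based softmax.
inline float attention_scale(float softmax_scale, int size_per_head)
{
    float inv_sqrt_dh = std::log2(std::exp(1.f));  // log2(e); avoids M_LOG2E, which MSVC does not define
    if (softmax_scale != 0.f) {
        inv_sqrt_dh *= softmax_scale;              // model-predefined softmax scale
    }
    else {
        inv_sqrt_dh /= std::sqrt((float)size_per_head);  // default 1/sqrt(d)
    }
    return inv_sqrt_dh;
}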