diff --git a/autotest/config.yaml b/autotest/config.yaml index 6c92d2cf0b..30d54fa952 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -62,6 +62,7 @@ turbomind_chat_model: - liuhaotian/llava-v1.6-vicuna-7b - deepseek-ai/deepseek-vl-1.3b-chat - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf - THUDM/glm-4-9b-chat - openbmb/MiniCPM-Llama3-V-2_5 @@ -167,6 +168,7 @@ turbomind_quatization: - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.3 - deepseek-ai/deepseek-coder-1.3b-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat - codellama/CodeLlama-7b-Instruct-hf gptq: - internlm/internlm2_5-7b-chat diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index b0e513410e..1fb5fa0964 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -114,14 +114,14 @@ broadCastRequest(const std::vector& v_start_ids, } else { // conditional case. - ft::deviceMalloc(&d_input_ids, size_1, false); + ft::deviceMalloc(&d_input_ids, size_1, nullptr, false); // ft::deviceMalloc(&d_input_lengths, size_2, false); ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1); // ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2); } if (!v_input_bad_words.empty()) { - ft::deviceMalloc(&d_input_bad_words, size_bad_words, false); + ft::deviceMalloc(&d_input_bad_words, size_bad_words, nullptr, false); ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words); } else { diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index c724b085a0..e483500e96 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -2,6 +2,7 @@ import inspect import json from dataclasses import asdict, fields +from typing import List # use pydantic.dataclasses.dataclass to check data type from pydantic.dataclasses import dataclass @@ -43,22 +44,33 @@ class ModelConfig: # of token_embedding embedding_size: int = 0 num_layer: int = None - inter_size: int = None + inter_size: List[int] = None norm_eps: float = None attn_bias: int = 0 start_id: int = None end_id: int = None size_per_head: int = 128 - group_size: int = 0 + group_size: int = 64 weight_type: str = None session_len: int = None tp: int = 1 model_format: str = 'hf' - expert_num: int = 0 + expert_num: List[int] = () expert_inter_size: int = 0 experts_per_token: int = 0 - moe_shared_gate: int = False - moe_norm_topk: int = False + moe_shared_gate: bool = False + norm_topk_prob: bool = False + routed_scale: float = 1.0 + topk_group: int = 1 + topk_method: str = 'greedy' + moe_group_num: int = 1 + # MLA + q_lora_rank: int = 0 + kv_lora_rank: int = 0 + qk_rope_dim: int = 0 + v_head_dim: int = 0 + # tuning + tune_layer_num: int = 1 def verify(self): invalid = {} @@ -72,6 +84,7 @@ def verify(self): class AttentionConfig: rotary_embedding: int = 128 rope_theta: float = 10000.0 + softmax_scale: float = 0 attention_factor: float = None max_position_embeddings: int = 0 original_max_position_embeddings: int = 0 diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 1c847ede01..77f0bc8dc8 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -241,11 +241,10 @@ def get_tm_model(model_path, engine_config.model_format = quant_method group_size = _group_size - # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) - if not 
group_size: - group_size = 128 - if engine_config.model_format in ['awq', 'gptq']: + # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) + if not group_size: + group_size = 128 assert group_size == 128, \ f'model format is "{engine_config.model_format}" ' \ f'but group_size is {group_size}. Currently, only 128 ' \ diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py index e3d79b164a..94e779b6b7 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/deploy/loader.py @@ -88,6 +88,27 @@ def items(self): yield (-1, {k: f.get_tensor(k) for k in misc}) assert not params + # def items(self): + # params = defaultdict(dict) + # for shard in self.shards: + # # with safe_open(shard, 'pt') as f: + # with open(shard, 'rb') as f: + # w = safetensors.torch.load(f.read()) + # misc = [] + # for k in w.keys(): + # match = re.findall(self.pattern, k) + # if not match: + # misc.append(k) + # else: + # idx = int(match[0]) + # param = params[idx] + # param[k] = w[k] + # if len(param) == self.item_count[idx]: + # yield (idx, params.pop(idx)) + # if misc: + # yield (-1, {k: w[k] for k in misc}) + # assert not params + class PytorchLoader(BaseLoader): diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 8d998abe2b..52497175ef 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -96,10 +96,13 @@ class Ffn(Module): def __init__(self, model: BaseOutputModel): self.model = model self.tp = model.tensor_para_size + # inter_sizes in config are padded and may be different from what's + # in the weights self.inter_size = model.model_config.inter_size self.group_size = max(1, model.model_config.group_size) def _export(self, + inter_size: int, fmt: str, idx: int, w123, @@ -110,11 +113,11 @@ def _export(self, w1, w2, w3 = map(transpose, w123) if not is_lora_a: - w1 = pad_out_dims(w1, self.inter_size) - w3 = pad_out_dims(w3, self.inter_size) + w1 = pad_out_dims(w1, inter_size) + w3 = pad_out_dims(w3, inter_size) if not is_lora_b: group_size = self.group_size if apply_gs else 1 - w2 = pad_in_dims(w2, self.inter_size // group_size) + w2 = pad_in_dims(w2, inter_size // group_size) w1, w2, w3 = map(pack_fn, (w1, w2, w3)) self.model.save_split(w1, @@ -132,7 +135,8 @@ def _export(self, def apply(self, i: int, r: BaseReader): for e in get_params(r.ffn(i, None)): - e(partial(self._export, self._ffn), partial(r.ffn, i), i) + e(partial(self._export, self.inter_size[i], self._ffn), + partial(r.ffn, i), i) class MoeFfn(Ffn): @@ -154,11 +158,13 @@ def __init__(self, model: BaseOutputModel): self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): + if self.expert_num[i] == 0: + return for p in get_params(r.moe_ffn_expert()): - for e in range(self.expert_num): + for e in range(self.expert_num[i]): fmt = self._moe_ffn_expert.replace('E', str(e)) - p(partial(self._export, fmt), partial(r.moe_ffn_expert, e, i), - i) + p(partial(self._export, self.inter_size, fmt), + partial(r.moe_ffn_expert, e, i), i) gate = transpose(r.moe_ffn_gate(i)) self.model.save_split(gate, self._moe_ffn_gate.format(i)) @@ -218,6 +224,62 @@ def apply(self, i: int, r: BaseReader): e(self._export, partial(r.attn, i), i) +class MLA(Module): + """ + requires: + r.mla(i, kind) + r.mla_norm(i) + """ + + _mla = 'layers.{0}.attention.{1}.{2}' + + def __init__(self, model: BaseOutputModel): + self.model = model + + def _export(self, idx: int, xs, kind: str, pack_fn, **kwargs): + if all(x is 
None for x in xs): + return + q_a, q_b, q, kv_a, kv_b, o = map(transpose, xs) + + if q is not None: + q_b = q + + cfg = self.model.model_config + + o = o.reshape(cfg.head_num, cfg.v_head_dim, -1) + o = torch.nn.functional.pad( + o, (0, 0, 0, cfg.size_per_head - cfg.v_head_dim, 0, 0)) + o = o.view(cfg.head_num * cfg.size_per_head, cfg.hidden_units) + + if q_a is not None: + self.model.save_split(pack_fn(q_a), + self._mla.format(idx, 'q_a_proj', kind)) + q_b_name = 'q_proj' if q_a is None else 'q_b_proj' + self.model.save_split(pack_fn(q_b), + self._mla.format(idx, q_b_name, kind), + split_dim=-1) + self.model.save_split(pack_fn(kv_a), + self._mla.format(idx, 'kv_a_proj', kind)) + self.model.save_split(pack_fn(kv_b), + self._mla.format(idx, 'kv_b_proj', kind), + split_dim=-1) + self.model.save_split(pack_fn(o), + self._mla.format(idx, 'wo', kind), + split_dim=0) + + _layernorm = 'layers.{0}.attention.{1}_a_layernorm' + + def apply(self, i: int, r: BaseReader): + + for f in get_params(r.attn(i, None), bias=False): + f(self._export, partial(r.mla, i), i) + + q, k = r.mla_norm(i) + if q is not None: + self.model.save_split(q, self._layernorm.format(i, 'q')) + self.model.save_split(k, self._layernorm.format(i, 'kv')) + + class Misc(Module): """ requires: @@ -258,7 +320,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - modules = [Attn, LayerNorm] + modules = [LayerNorm] + if model.model_config.kv_lora_rank: + modules.append(MLA) + else: + modules.append(Attn) if model.model_config.inter_size: modules.append(Ffn) if model.model_config.expert_num: diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index de16bdc0a0..b9394b1244 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 +from .deepseek2 import DeepSeek2Model # noqa: F401 from .deepseek_vl import DeepSeekVLModel # noqa: F401 from .glm4 import Glm4Model # noqa: F401 from .internlm2 import InternLM2Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py new file mode 100644 index 0000000000..0023f650ff --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/deepseek2.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
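+# NOTE: DeepSeek-V2 combines Multi-head Latent Attention (MLA) with a grouped
+# MoE router. The reader below maps the HF checkpoint layout (q_a_proj/q_b_proj
+# or q_proj, kv_a_proj_with_mqa/kv_b_proj, mlp.experts.*, mlp.shared_experts.*)
+# onto TurboMind's export schema, and model_info() translates config.json
+# fields (kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, n_routed_experts,
+# n_group, topk_group, routed_scaling_factor, ...) into ModelConfig. The
+# effective attention head size is qk_nope_head_dim + qk_rope_head_dim
+# (128 + 64 = 192 for the released DeepSeek-V2 checkpoints), and the first
+# layer keeps a dense FFN while the remaining layers use routed plus shared
+# experts.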
+import math + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class DeepSeek2Reader(LlamaReader): + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(r'mlp' if i == 0 else r'shared_expert\.') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.shared_experts.{key}_proj.{kind}' + if i == 0: + name = name.replace('shared_experts.', '') + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla(self, i: int, kind: str): + if not kind: + return self.filter(r'self_attn.*proj') + result = [] + for key in [ + 'q_a_proj', 'q_b_proj', 'q_proj', 'kv_a_proj_with_mqa', + 'kv_b_proj', 'o_proj' + ]: + tensor = self.params.get( + f'{self.attn_layer_prefix}.{i}.self_attn.{key}.{kind}') + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def mla_norm(self, i: int): + result = [] + for k in ['q', 'kv']: + name = f'{self.attn_layer_prefix}.{i}.self_attn.{k}_a_layernorm.weight' # noqa: E501 + result.append(self.params.get(name)) + return (*result, ) + + +def get_yarn_params(rope_scaling: dict): + + scaling_factor = float(rope_scaling['factor']) + mscale = rope_scaling['mscale'] + mscale_all_dim = rope_scaling['mscale_all_dim'] + + def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + _mscale = float( + yarn_get_mscale(scaling_factor, mscale) / + yarn_get_mscale(scaling_factor, mscale_all_dim)) + + softmax_scale = 0 + if mscale_all_dim: + scale = yarn_get_mscale(scaling_factor, mscale_all_dim) + softmax_scale = scale * scale + + return _mscale, softmax_scale + + +@INPUT_MODELS.register_module(name='deepseek2') +class DeepSeek2Model(LlamaModel): + + Reader = DeepSeek2Reader + + def tokenizer_info(self): + n_words = self.model_config['vocab_size'] + bos_id = self.model_config['bos_token_id'] + eos_id = self.model_config['eos_token_id'] + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + qk_nope_dim = cfg['qk_nope_head_dim'] + qk_rope_dim = cfg['qk_rope_head_dim'] + num_layer = cfg['num_hidden_layers'] + expert_num = cfg['n_routed_experts'] + expert_num = [expert_num] * num_layer + expert_num[0] = 0 + n_shared_experts = cfg['n_shared_experts'] + expert_inter_size = cfg['moe_intermediate_size'] + experts_per_token = cfg['num_experts_per_tok'] + inter_size = [n_shared_experts * expert_inter_size] * num_layer + inter_size[0] = cfg['intermediate_size'] + norm_topk_prob = cfg['norm_topk_prob'] + size_per_head = qk_rope_dim + qk_nope_dim + info.update(kv_lora_rank=cfg['kv_lora_rank'], + q_lora_rank=cfg['q_lora_rank'] or 0, + qk_rope_dim=qk_rope_dim, + v_head_dim=cfg['v_head_dim'], + size_per_head=size_per_head, + rotary_embedding=qk_rope_dim, + expert_num=expert_num, + expert_inter_size=expert_inter_size, + experts_per_token=experts_per_token, + inter_size=inter_size, + norm_topk_prob=norm_topk_prob, + 
routed_scale=cfg['routed_scaling_factor'], + topk_method=cfg['topk_method'], + topk_group=cfg['topk_group'], + moe_group_num=cfg['n_group'], + tune_layer_num=2) + rope_scaling = cfg.get('rope_scaling') + if rope_scaling and rope_scaling['type'] == 'yarn': + attention_factor, softmax_scale = get_yarn_params(rope_scaling) + softmax_scale *= size_per_head**(-0.5) + info.update(max_position_embeddings=rope_scaling[ + 'original_max_position_embeddings'], + attention_factor=attention_factor, + softmax_scale=softmax_scale) + return info diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index ff9df2d409..6ac22a658e 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,6 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] - info['moe_norm_topk'] = True + info['norm_topk_prob'] = True info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 772bd03037..637983e8ce 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -178,6 +178,6 @@ def model_info(self): info['experts_per_token'] = cfg['num_experts_per_tok'] info['inter_size'] = cfg['shared_expert_intermediate_size'] info['moe_shared_gate'] = True - info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['norm_topk_prob'] = cfg['norm_topk_prob'] info['attn_bias'] = 1 return info diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 09699ade09..f2c981bb24 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp from abc import ABC +from collections.abc import Sequence import torch import tqdm @@ -65,13 +66,14 @@ def __init__(self, # get `model_info` and `tokenizer_info` at first, which # will be updated to `self.model_config` and `self.attention_config` self.input_model_info = self.input_model.model_info() + self.input_model_info = self.single_to_list( + self.input_model_info, keys=['inter_size', 'expert_num']) self.input_model_tokenizer_info = self.input_model.tokenizer_info() self.permute_qk = self.input_model_info.get('permute_qk', True) - self.update_model_config() - self.model_config.inter_size = _pad_inter_size( - self.model_config.inter_size, self.model_config.group_size, - self.tensor_para_size) + for i, v in enumerate(self.model_config.inter_size): + self.model_config.inter_size[i] = _pad_inter_size( + v, self.model_config.group_size, self.tensor_para_size) if self.model_config.expert_num: self.model_config.expert_inter_size = _pad_inter_size( self.model_config.expert_inter_size, @@ -79,11 +81,21 @@ def __init__(self, self.model_config.verify() assert self.model_config.kv_head_num % self.tensor_para_size == 0 + # print(self.model_config) + self.update_attention_config() self.update_lora_config() # ! 
Dependency on `self` self.model = model_cls(self) + def single_to_list(self, config: dict, keys): + num_layer = int(config['num_layer']) + for k in keys: + v = config.get(k, None) + if v is not None and not isinstance(v, Sequence): + config[k] = [v] * num_layer + return config + def update_model_config(self): """Update `self.model_config` according to the input_model's `tokenizer_info` and `model_info`""" diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index e66da22df0..11e99edfa0 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -33,6 +33,7 @@ InternVLChatModel='internvl', # deepseek-vl MultiModalityCausalLM='deepseekvl', + DeepseekV2ForCausalLM='deepseek2', # MiniCPMV MiniCPMV='minicpmv', # mini gemini diff --git a/src/turbomind/kernels/CMakeLists.txt b/src/turbomind/kernels/CMakeLists.txt index febb8692dd..40a48402af 100644 --- a/src/turbomind/kernels/CMakeLists.txt +++ b/src/turbomind/kernels/CMakeLists.txt @@ -68,3 +68,4 @@ endif () add_subdirectory(attention) add_subdirectory(gemm) +add_subdirectory(norm) diff --git a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index af9d47e0e6..32de38981a 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -38,6 +38,8 @@ add_library(attention STATIC codegen/decoding_sm80_64_f16_f16.cu codegen/decoding_sm80_64_f16_u4.cu codegen/decoding_sm80_64_f16_u8.cu + codegen/attention_sm80_192.cu + codegen/decoding_sm80_192.cu ) set_property(TARGET attention PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index 3f557234bc..e7642584c2 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ b/src/turbomind/kernels/attention/attention.cu @@ -46,6 +46,12 @@ void dispatchAttention(const AttentionParams& params) else if (params.size_per_head == 128) { return dispatch(std::integral_constant{}); } + + if (params.size_per_head == 192) { + using Config = AttentionConfig; + return invokeAttention(params); + } + FT_CHECK(0); } diff --git a/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu new file mode 100644 index 0000000000..ceeafa7a6d --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "../attention_config.h" +#include "../attention_template.h" + +namespace turbomind { + +using namespace attention; + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +template void invokeAttention::Kernel>( + const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu new file mode 100644 index 0000000000..214e6748d9 --- /dev/null +++ b/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu @@ -0,0 +1,20 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
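+// NOTE: this translation unit provides the HeadDim == 192 decoding
+// instantiations needed for MLA-style attention, where the QK head size is
+// qk_nope_head_dim (128) + qk_rope_head_dim (64). decoding.cu dispatches here
+// for size_per_head == 192 with either the native or the int8 KV cache; the
+// int4 KV cache path is rejected there with FT_CHECK_WITH_INFO(...,
+// "not implemented").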
+ +#include "../decoding_config.h" +#include "../decoding_template.h" + +namespace turbomind { + +using namespace attention; + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +template bool +invokeDecoding>(const AttentionParams& params); + +template bool invokeDecoding>(const AttentionParams& params); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 1b04b7d4eb..67bd81e45b 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -2,8 +2,8 @@ #include "decoding.h" #include "decoding_config.h" +#include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" -// #include "src/turbomind/utils/dispatch.h" #include #include @@ -113,6 +113,21 @@ void dispatchDecoding(const AttentionParams& params) return false; }; + if (params.size_per_head == 192) { + + if (is_kv_int8) { + invokeDecoding>(params); + } + else if (is_kv_int4) { + FT_CHECK_WITH_INFO(!is_kv_int4, "not implemented"); + // invokeDecoding>(params); + } + else { + invokeDecoding>(params); + } + return; + } + auto success = dispatch(); FT_CHECK(success); diff --git a/src/turbomind/kernels/attention/decoding_config.h b/src/turbomind/kernels/attention/decoding_config.h index 7dcb119cfd..dfd5e07835 100644 --- a/src/turbomind/kernels/attention/decoding_config.h +++ b/src/turbomind/kernels/attention/decoding_config.h @@ -40,7 +40,7 @@ struct DecodingConfig 2) }; template -struct DecodingConfig { +struct DecodingConfig> { static constexpr int Qh = (Qh_ + 7) / 8 * 8; using Attention = Impl; using CacheIter = GetBlockIterFactory; @@ -76,4 +76,14 @@ struct DecodingConfig { using Kernel = AttentionUniversal, CacheIter, DecodingCtaMap>; }; +template +struct DecodingConfig { + static constexpr int Qh = 1; + static constexpr int HeadDim = 192; + + using Attention = Impl; + using CacheIter = GetBlockIterFactory; + using Kernel = AttentionUniversal, Attention>, CacheIter, DecodingCtaMap>; +}; + } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/impl_16816.h b/src/turbomind/kernels/attention/impl_16816.h index 6e8f37f4d4..07c7dcb12b 100644 --- a/src/turbomind/kernels/attention/impl_16816.h +++ b/src/turbomind/kernels/attention/impl_16816.h @@ -63,26 +63,28 @@ struct Impl>, SmemLayoutV2>>; - using SmemLayoutK = std::conditional_t>, SmemLayoutV2>>; - using SmemLayoutV = std::conditional_t>, SmemLayoutV2>>; using SmemLayoutKVp = void; + static constexpr bool kUseSmemQ = false; + static constexpr bool kUseSmemP = false; + + static_assert(!kUseSmemQ, "current smemQ impl yields inconsistent outputs"); + union SharedStorage { __align__(16) T KV[Stages * (SmemLayoutK::kSize + SmemLayoutV::kSize) / 2]; __align__(16) T Q[SmemLayoutQ::kSize]; }; - static constexpr bool kUseSmemQ = false; - static constexpr bool kUseSmemP = false; - using ThreadMapQ = RakedThreadMap; using ThreadMapKV = RakedThreadMap; @@ -109,22 +111,24 @@ struct Impl sQ{smem_Q}; + SmemAccessor sQ{smem_Q}; - // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout - PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { + // Load from shared memory using LDSM, rearrange to m16n8k16 atom layout PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { - const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; - const int di = lane_id / 16 * 8 + k * 16; - ldsm_x4((Array&)frag_Q[k][m], 
cast_smem_ptr_to_uint(&sQ(qi, di))); + for (int m = 0; m < K_M; ++m) { + PRAGMA_UNROLL + for (int k = 0; k < K_K; ++k) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } } } - if constexpr (kUseSmemQ) { + if constexpr (0) { __syncthreads(); // Rearrange Q in smem so that swizzling is not needed for later LDSMs @@ -142,20 +146,25 @@ struct Impl smem_K; + T* smem_Q; FragQ frag_Q; FragK frag_K; __device__ StateQK(SharedStorage& storage, FragQ frag_Q_): smem_K{storage.KV} { - static_assert(!kUseSmemQ, "not implemented"); - PRAGMA_UNROLL - for (int k = 0; k < K_K; ++k) { + if constexpr (!kUseSmemQ) { PRAGMA_UNROLL - for (int m = 0; m < K_M; ++m) { - frag_Q[k][m] = frag_Q_[k][m]; + for (int k = 0; k < K_K; ++k) { + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + frag_Q[k][m] = frag_Q_[k][m]; + } } } + else { + smem_Q = storage.Q; + } } __device__ void Load(int k, int pipe_iter) @@ -166,6 +175,16 @@ struct Impl sQ{smem_Q}; + PRAGMA_UNROLL + for (int m = 0; m < K_M; ++m) { + const int qi = lane_id % 16 * 1 + m * 16 + warp_id * WARP_Q; + const int di = lane_id / 16 * 8 + k * 16; + ldsm_x4((Array&)frag_Q[k][m], cast_smem_ptr_to_uint(&sQ(qi, di))); + } + } PRAGMA_UNROLL for (int n = 0; n < K_N; n += 2) { // Load (s16,d16) tiles const int s = n * 8 + offset_s; diff --git a/src/turbomind/kernels/attention/impl_81616.h b/src/turbomind/kernels/attention/impl_81616.h index 3b90bcdf57..f865f1bc3a 100644 --- a/src/turbomind/kernels/attention/impl_81616.h +++ b/src/turbomind/kernels/attention/impl_81616.h @@ -104,7 +104,7 @@ struct Impl) { - return std::conditional_t>, SmemLayoutV2>>{}; } diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index a886185a44..444b67e2c8 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -2,12 +2,16 @@ #pragma once -#include "src/turbomind/kernels/attention/impl.h" +#include +#include +#include + #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/layout.h" #include "src/turbomind/kernels/core/thread_map.h" -#include -#include + +#include "src/turbomind/kernels/attention/impl.h" +#include "src/turbomind/kernels/attention/quantization.h" namespace turbomind::attention { @@ -51,7 +55,7 @@ struct Impl), K_K); }; struct LinearD { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 20bb00fde8..f2e2faef91 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -277,11 +277,14 @@ void invokeProcessKV_v2(char** blocks, }; auto dispatch = [&](auto tkv) { - if (head_dim == 128) { + if (head_dim == 64) { + return invoke(tkv, std::integral_constant{}); + } + else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } - else if (head_dim == 64) { - return invoke(tkv, std::integral_constant{}); + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); } FT_CHECK(0); }; @@ -545,6 +548,9 @@ void invokeFlattenKV_v2(T* k, else if (head_dim == 128) { return invoke(tkv, std::integral_constant{}); } + else if (head_dim == 192) { + return invoke(tkv, std::integral_constant{}); + } FT_CHECK(0); }; diff --git a/src/turbomind/kernels/attention/mainloop_sm80.h b/src/turbomind/kernels/attention/mainloop_sm80.h index bf0fc1d32a..4435400b70 100644 --- 
a/src/turbomind/kernels/attention/mainloop_sm80.h +++ b/src/turbomind/kernels/attention/mainloop_sm80.h @@ -52,7 +52,7 @@ struct Mainloop, Impl_> { template __device__ void operator()(Args&&... args) { - Run(Sm80_CpAsync{}, ((Args &&) args)...); + Run(Sm80_CpAsync{}, std::integral_constant{}, ((Args &&) args)...); } template @@ -81,8 +81,9 @@ struct Mainloop, Impl_> { } } - template + template __device__ void Run(Sm80_CpAsync, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -199,9 +200,10 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#if 0 + // #if 1 template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter, FragO& frag_O, @@ -234,7 +236,7 @@ struct Mainloop, Impl_> { Wait(); state_QK.Load(0, 0); - constexpr auto _ = [](int){}; + constexpr auto _ = [](int) {}; auto loop = [&](auto is_residue, auto is_mask) { const int offset_K = tile_iter * CTA_S; @@ -292,14 +294,15 @@ struct Mainloop, Impl_> { __pipeline_wait_prior(0); } -#elif 1 + // #elif 1 // Load : K0,K1 | V0,K2,V1,K3 ... // Compute : K0 | K1,V0,K2,V1 ... // - more register consumption // - more interleaved HMMA and FMA // - slight performance gain - template + template __device__ void Run(Sm80_CpAsync<2>, + std::integral_constant, FragQ& frag_Q, CacheIter& cache_iter_, FragO& frag_O, @@ -407,7 +410,7 @@ struct Mainloop, Impl_> { __pipeline_commit(); __pipeline_wait_prior(0); } -#endif + // #endif __device__ void Wait() { diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index 12f6aff38b..c654f40d05 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -66,12 +66,14 @@ void invokeReduce(T* out, float exp_scale, \ cudaStream_t stream); -INSTANTIATE_invokeReduce(128, half); INSTANTIATE_invokeReduce(64, half); +INSTANTIATE_invokeReduce(128, half); +INSTANTIATE_invokeReduce(192, half); #if ENABLE_BF16 +INSTANTIATE_invokeReduce(64, nv_bfloat16); INSTANTIATE_invokeReduce(128, nv_bfloat16); -INSTANTIATE_invokeReduce(64, nv_bfloat16) +INSTANTIATE_invokeReduce(192, nv_bfloat16); #endif } // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/reduce_kernel.h b/src/turbomind/kernels/attention/reduce_kernel.h index 88a3ab3af8..b4c9064cfe 100644 --- a/src/turbomind/kernels/attention/reduce_kernel.h +++ b/src/turbomind/kernels/attention/reduce_kernel.h @@ -128,9 +128,12 @@ struct Reduce { __syncthreads(); - constexpr int kVecSize = HeadDim / WARP_SIZE; + // HeadDim / WARP_SIZE + // 128 -> 4 + // 64, 192 -> 2 + constexpr int kVecSize = HeadDim % 128 == 0 ? 
4 : 2; - using Map = RakedThreadMap; + using Map = RakedThreadMap; static_assert(Map::kIterS == CTA_H); diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h index 8e09da22cd..db836ed184 100644 --- a/src/turbomind/kernels/attention/rotary_embedding.h +++ b/src/turbomind/kernels/attention/rotary_embedding.h @@ -131,6 +131,7 @@ struct FastRoPE { template __device__ void apply(Array& x, float timestep) { +#if 0 PRAGMA_UNROLL for (int i = 0; i < N; i += 2) { float c, s; @@ -144,6 +145,22 @@ struct FastRoPE { x[i + 1] = (T)tmp1; } } +#else + // Most models apply rotary embedding in half precision + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + float c, s; + sincosf(timestep * inv_freq_[i / 2], &s, &c); + s *= attention_scaling_; + c *= attention_scaling_; + T tmp0 = (T)c * x[i] - (T)s * x[i + 1]; + T tmp1 = (T)c * x[i + 1] + (T)s * x[i]; + if (is_valid_) { + x[i] = tmp0; + x[i + 1] = tmp1; + } + } +#endif } }; diff --git a/src/turbomind/kernels/attention/test_attention.cu b/src/turbomind/kernels/attention/test_attention.cu index c6d7b40637..804d4815dc 100644 --- a/src/turbomind/kernels/attention/test_attention.cu +++ b/src/turbomind/kernels/attention/test_attention.cu @@ -218,14 +218,14 @@ void TestBlocks(const thrust::universal_vector& k_cache, // [B, H, S, #define KV_INT4 0 -#define DECODING 1 +#define DECODING 0 template int test_attention() { AttentionParams params{}; - constexpr size_t kHeadDim = 128; + constexpr size_t kHeadDim = 192; #if DECODING // constexpr size_t kHeadNum = 32; @@ -239,11 +239,11 @@ int test_attention() // constexpr size_t kSequenceLen = 511; // constexpr size_t kSequenceLen = 2047; // constexpr size_t kSequenceLen = 4095; - // constexpr size_t kSequenceLen = 8191; + constexpr size_t kSequenceLen = 8191; // constexpr size_t kSequenceLen = 32767; // constexpr size_t kSequenceLen = 65535; // constexpr size_t kSequenceLen = 131071; - constexpr size_t kSequenceLen = 200000; + // constexpr size_t kSequenceLen = 200000; // constexpr size_t kSequenceLen = 262143; // constexpr size_t kSequenceLen = (1 << 20) - 1; // 1M // constexpr size_t kSequenceLen = (1 << 22) - 1; // 4M @@ -451,6 +451,10 @@ int test_attention() params.qk = qk_buf.data().get(); params.pr = pr_buf.data().get(); + params.attention_scaling = 1.f; + params.llama3_inv_scaling_factor = 0; + params.yarn_ramp_inv_factor_div_2 = 0; + Reference reference(kDump ? 
Reference::kUNFUSED : Reference::kFLASH_ATTENTION, {}); // Reference reference(Reference::kUNFUSED, {}); reference.Reshape(kInputLen, kContextLen, kHeadNum, kHeadDim, KvHeadNum, kBatchSize); diff --git a/src/turbomind/kernels/core/array_ops.h b/src/turbomind/kernels/core/array_ops.h index 6b639abc83..ec6e7fb4ed 100644 --- a/src/turbomind/kernels/core/array_ops.h +++ b/src/turbomind/kernels/core/array_ops.h @@ -172,7 +172,7 @@ inline __device__ void copy(const Array (&src)[M], Array (&dst)[M]) } template -inline __device__ void Store(T* __restrict__ dst, const Array& src) +inline __device__ void Store(T* dst, const Array& src) { if constexpr (sizeof(Array) == sizeof(uint4)) { *(uint4*)dst = (const uint4&)src; diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h index a708a34985..054269c27f 100644 --- a/src/turbomind/kernels/core/math.h +++ b/src/turbomind/kernels/core/math.h @@ -5,6 +5,7 @@ #include "src/turbomind/kernels/core/common.h" #include #include +#include namespace turbomind { @@ -41,6 +42,13 @@ TM_HOST_DEVICE constexpr T log2(T x) // static_assert(log2(32) == 5); // static_assert(log2(1) == 0); +template +TM_HOST_DEVICE constexpr T lowbit(T x) +{ + const std::make_signed_t s = x; + return static_cast(s & -s); +} + // https://arxiv.org/abs/1902.01961 template struct FastDivMod { diff --git a/src/turbomind/kernels/core/thread_map.h b/src/turbomind/kernels/core/thread_map.h index 66b691832f..1271aefcc0 100644 --- a/src/turbomind/kernels/core/thread_map.h +++ b/src/turbomind/kernels/core/thread_map.h @@ -3,6 +3,7 @@ #pragma once #include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" #include @@ -51,7 +52,7 @@ struct ThreadMapQ { } }; -template +template struct RakedThreadMap { static constexpr int kDimC = DimC; static constexpr int kDimS = DimS; diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt index d41c391e9d..81c9750584 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt +++ b/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt @@ -8,9 +8,11 @@ add_library(${PROJECT_NAME} STATIC # flash_fwd_hdim64_fp16_sm80.cu flash_fwd_hdim128_fp16_sm80.cu flash_fwd_hdim128_bf16_sm80.cu - # flash_fwd_hdim256_fp16_sm80.cu + flash_fwd_hdim256_bf16_sm80.cu + flash_fwd_hdim256_fp16_sm80.cu ) target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR} / include) target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass) + set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h index e108a55f28..2456496367 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_launch_template.h @@ -147,7 +147,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream) }); } -#if 0 +#if 1 template void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream) { diff --git a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h index fd19a0ea61..b1df29cb7b 100644 --- 
a/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h +++ b/src/turbomind/kernels/flash_attention/flash_attention2/static_switch.h @@ -58,6 +58,18 @@ return __VA_ARGS__(); \ } \ }() +#elif 1 +#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ + [&] { \ + if (HEADDIM <= 128) { \ + constexpr static int kHeadDim = 128; \ + return __VA_ARGS__(); \ + } \ + else if (HEADDIM <= 256) { \ + constexpr static int kHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() #else #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \ [&] { \ diff --git a/src/turbomind/kernels/gemm/context.h b/src/turbomind/kernels/gemm/context.h index 4fec5b732f..bd03917b89 100644 --- a/src/turbomind/kernels/gemm/context.h +++ b/src/turbomind/kernels/gemm/context.h @@ -113,12 +113,7 @@ class DynamicGemmContext: public StaticGemmContext { class MoeGemmContext: public Context { public: - MoeGemmContext(int experts, - int experts_per_token, - // int output_dims, - // int input_dims, - const cudaDeviceProp& prop, - cudaStream_t stream); + MoeGemmContext(int experts, int experts_per_token, const cudaDeviceProp& prop, cudaStream_t stream); ~MoeGemmContext() override; @@ -156,9 +151,11 @@ class MoeGemmContext: public Context { Tape Schedule(const LaunchSpec&) override; - void set_offsets(const int* offsets) + void update(int expert_num, int experts_per_token, const int* offsets) { - offsets_ = offsets; + expert_num_ = expert_num; + experts_per_token_ = experts_per_token; + offsets_ = offsets; } protected: diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index ed8b2ee2ff..e58bfc9b95 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -279,17 +279,44 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool for return {}; } -void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +namespace { + +template +struct Param { + StridedPtr data[N]; + StridedPtr* ptr; + int n; +}; + +template +__global__ void fill_strided_ptrs(Param param) { - std::vector tmp; - for (const auto& [p, s] : ptrs) { - tmp.push_back({p, s}); + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < param.n) { + param.ptr[idx] = param.data[idx]; } +} + +} // namespace + +void* make_blocked_ptrs(const std::vector>& ptrs, cudaStream_t stream) +{ + constexpr int N = 64; + Param param{}; + static_assert(sizeof(param) <= 4096); // max parameter size for cuda11 StridedPtr* ptr{}; cudaMallocAsync(&ptr, sizeof(StridedPtr) * ptrs.size(), stream); - cudaMemcpyAsync(ptr, tmp.data(), sizeof(StridedPtr) * ptrs.size(), cudaMemcpyDefault, stream); - // Sync before tmp can be destructed - cudaStreamSynchronize(stream); + param.ptr = ptr; + for (int i = 0; i < (int)ptrs.size(); i += N) { + const int n = std::min(ptrs.size() - i, N); + for (int j = 0; j < n; ++j) { + auto& [p, s] = ptrs[i + j]; + param.data[j] = StridedPtr{p, s}; + } + param.n = n; + fill_strided_ptrs<<<1, N, 0, stream>>>(param); + param.ptr += N; + } return ptr; } diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 5912c60a8a..a9e4f7da51 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -264,7 +264,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] int token_num_padded, int expert_num, int top_k, - bool norm_topk) + bool norm_topk, + float routed_scale) { constexpr int max_tiles = kMoeGateMaxTiles; constexpr int threads_per_token = max_expert_num 
/ items_per_thread; // 8 @@ -286,8 +287,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; - const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; - const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + // const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + // const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; float data[items_per_thread]; int idxs[items_per_thread]; @@ -413,7 +414,13 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] #endif - constexpr float kLog2e = 1.4426950408889634074; + // constexpr float kLog2e = 1.4426950408889634074; + // if (k == 0) { + // PRAGMA_UNROLL + // for (int i = 0; i < items_per_thread; ++i) { + // data[i] *= kLog2e; + // } + // } unsigned mask = (unsigned)-1; float max_logit; @@ -437,13 +444,6 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); } - if (k == 0) { - PRAGMA_UNROLL - for (int i = 0; i < items_per_thread; ++i) { - data[i] *= kLog2e; - } - } - int g_max_ei = ei; float g_max_val = max_val; if constexpr (threads_per_token > 1) { @@ -486,7 +486,7 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < items_per_thread; ++i) { if (!norm_topk || used[i]) { - data[i] = exp2f(data[i] - max_logit); + data[i] = expf(data[i] - max_logit); sum_prob += data[i]; } } @@ -515,9 +515,11 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] PRAGMA_UNROLL for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { - int e = (i + threadIdx.x) % max_expert_num; - int t = (i + threadIdx.x) / max_expert_num; - smem.shared_accum[t][e] = 0; + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + if (t < max_tiles) { + smem.shared_accum[t][e] = 0; + } } __syncthreads(); @@ -536,10 +538,8 @@ __global__ void MoeGateKernel_v8(float* scales, // [e,n] if (ti2 < token_num && idx < top_k) { masks[expert_id * token_num_padded + ti2] = idx; - scales[idx * token_num + ti2] = scale; + scales[idx * token_num + ti2] = scale * routed_scale; atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); - - // printf("%d %d %f\n", idx, expert_id, scale); } } @@ -569,6 +569,7 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n int experts, // E int experts_per_token, bool norm_topk, + float routed_scale, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -581,14 +582,14 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread, auto vec_size) { constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; constexpr int threads = 256; const int blocks = ceil_div(tokens, threads / thrs_per_tok); cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - MoeGateKernel_v8 + MoeGateKernel_v8 <<>>( // scales, (int8_t*)masks, @@ -600,28 +601,49 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n tokens_padded, experts, experts_per_token, - norm_topk); + norm_topk, + routed_scale); }; auto fail = [&] { - std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << experts + << ", top_k=" << 
experts_per_token << "\n"; std::abort(); }; if (experts <= 8) { if (experts_per_token <= 2) { - invoke(_Int<8>, _Int<2>, _Int<8>); + // MoeGateKernel_V2<2, 128><<>>(scales, + // (int8_t*)masks, + // accum, + // logits, + // log_tile, + // tiles, + // tokens, + // tokens_padded, + // experts); + + // std::cout << tokens << " " << experts << " " << experts_per_token << " " << tokens_padded << "\n"; + invoke(_Int<8>, _Int<2>, _Int<8>, _Int<4>); } else { - invoke(_Int<8>, _Int<8>, _Int<8>); + invoke(_Int<8>, _Int<8>, _Int<8>, _Int<4>); } } else if (experts <= 64) { if (experts_per_token <= 4) { - invoke(_Int<64>, _Int<4>, _Int<16>); + invoke(_Int<64>, _Int<4>, _Int<16>, _Int<4>); } else if (experts_per_token <= 8) { - invoke(_Int<64>, _Int<8>, _Int<16>); + invoke(_Int<64>, _Int<8>, _Int<16>, _Int<4>); + } + else { + fail(); + } + } + else if (experts <= 160) { + if (experts_per_token <= 8) { + invoke(_Int<160>, _Int<8>, _Int<10>, _Int<2>); } else { fail(); @@ -687,7 +709,8 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] const int* en2f, // [ e, n] :: (e,n) -> e*n const float* dst_scales, // [n] int dims, - int tokens) + int tokens, + float dst_scale) { using Vec = Array; @@ -695,7 +718,6 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; - float dst_scale = 0; if (dst_scales) { dst_scale = dst_scales[ti]; dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); @@ -711,8 +733,9 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] } for (int i = threadIdx.x; i < dims; i += block_dim) { +#if 1 Array accum{}; - if (dst_scales) { + if (dst_scale) { Vec v; Ldg(v, dst_ptr[i].data()); using namespace ops; @@ -727,6 +750,24 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] accum = accum + x; } Store(dst_ptr[i].data(), cast(accum)); +#else + Array accum{}; + if (dst_scale) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = v * (T)dst_scale; + } + PRAGMA_UNROLL + for (int e = 0; e < exp_k; ++e) { + Vec v; + Ldg(v, src_ptr[e][i].data()); + using namespace ops; + const auto x = v * (T)scale[e]; + accum = accum + x; + } + Store(dst_ptr[i].data(), accum); +#endif } } @@ -739,6 +780,7 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st) { // std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -754,7 +796,8 @@ void invokeMoeReduce(T* dst, en2f, dst_scales, dims / vec_size, - tokens); + tokens, + dst_scale); }; switch (experts_per_token) { @@ -774,10 +817,11 @@ void invokeMoeReduce(T* dst, } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); -#ifdef ENABLE_BF16 template void -invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); +invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); +#ifdef ENABLE_BF16 +template void invokeMoeReduce( + nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) @@ -833,4 +877,89 @@ std::vector SampleBalanced(int token_num, int expert_num, int exp_per_tok, return ret; } +template +__global__ void MoeMaskTopKGroups(float* logits, int token_num, int expert_num, int top_k) +{ + constexpr int threads_per_token = max_expert_num / items_per_thread; + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + 
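+    // NOTE: group-limited routing (DeepSeek-V2 "group_limited_greedy").
+    // Each token is handled by `threads_per_token` threads; with the only
+    // supported configuration (160 experts, group size 20) each thread owns
+    // the logits of exactly one expert group (`items_per_thread` experts).
+    // `top_k` here is the number of groups to keep (topk_group), not
+    // experts_per_token: per-group maxima are reduced with warp shuffles,
+    // the best `top_k` groups survive, and every other thread overwrites its
+    // logits with -inf so the subsequent invokeMoeGate_V2 top-k can only
+    // pick experts from the selected groups.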
static_assert(items_per_thread % access_size == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + float data[items_per_thread]; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + } + float max_val = -std::numeric_limits::infinity(); + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + PRAGMA_UNROLL + for (int c = 0; c < access_size; ++c) { + max_val = fmaxf(max_val, data[i + c]); + } + } + } + } + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + const int warp_ti_offset = warp_ti * threads_per_token; + + bool alive = false; + + for (int k = 0; k < top_k; ++k) { + int g_max_ei = ei; + float g_max_val = max_val; + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + if (ei == g_max_ei) { + alive = true; + max_val = -std::numeric_limits::infinity(); + } + } + + if (!alive && ti < token_num) { + Array vec; + fill(vec, -std::numeric_limits::infinity()); + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Store(&logits[ti * expert_num + e], vec); + } + } + } +} + +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st) +{ + auto invoke = [&](auto max_expert_num, auto items_per_thread, auto vec_size) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(token_num, threads / thrs_per_tok); + MoeMaskTopKGroups + <<>>(logits, token_num, expert_num, top_k); + }; + if (expert_num == 160 && group_size == 20) { + return invoke(_Int<160>, _Int<20>, _Int<4>); + } + + std::cerr << __FILE__ << "(" << __LINE__ << "): unsupported moe config: expert_num=" << expert_num + << ", group_size=" << group_size << "\n"; + std::abort(); +} + } // namespace turbomind diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 0e4c36af09..d53de1354e 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -22,6 +22,7 @@ void invokeMoeGate_V2(int* f2n, int experts, int exp_per_tok, bool norm_topk, + float routed_scale, cudaStream_t st); template @@ -54,8 +55,11 @@ void invokeMoeReduce(T* dst, int tokens, int experts_per_token, int dims, + float dst_scale, cudaStream_t st); +void invokeMaskMoeTopKGroups(float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st); + // Sample `e` from `E` experts uniformly for every token std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g); diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index 47e3bfdb16..4b2ea6a83a 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -45,72 +45,6 @@ void diff_vecs(const T* data, const T* refs, int m, int k, 
std::string msg) } } -#if 0 -void func() -{ - using thrust::universal_vector; - - // clang-format off - std::vector h_logits{ - 8, 5, 1, 4, 3, 6, 2, 7, - 50, 60, 90, 20, 70, 71, 72, 73, - 0, 1, 0, 0, 0, 1, 0, 1, - 0, 0, 0, 1, 0, 0, 0, 2}; - // clang-format on - - h_logits.resize(8); - - // auto tmp = h_logits; - // for (int i = 0; i < 127; ++i) { - // h_logits.insert(h_logits.end(), tmp.begin(), tmp.end()); - // } - - universal_vector logits(h_logits.begin(), h_logits.end()); - - const int E = 8; - const int n = h_logits.size() / E; - const int e = 2; - - const int n_padded = (n + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; - - universal_vector f2n(e * n); - universal_vector en2f(e * n); - universal_vector offsets(E + 1); - universal_vector accum(E * kMoeGateMaxTiles); - universal_vector scales(n * e); - universal_vector masks(E * n_padded); - - for (int i = 0; i < 10; ++i) { - gemm::CacheFlushing::flush(0); - cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); - invokeMoeGate_V2(f2n.data().get(), - en2f.data().get(), - offsets.data().get(), - scales.data().get(), - masks.data().get(), - accum.data().get(), - logits.data().get(), - n, - n_padded, - E, - e, - 0); - } - - auto err = cudaDeviceSynchronize(); - if (err) { - std::cerr << cudaGetErrorString(err) << "\n"; - } - - print_vecs(scales.data().get(), e, n, "scales", 12); - print_vecs(masks.data().get(), E, n_padded, "tmp"); - print_vecs(accum.data().get(), E, 1, "accum"); - print_vecs(offsets.data().get(), 1, E + 1, "offsets"); - print_vecs(f2n.data().get(), n * e, 1, "f2n"); - print_vecs(en2f.data().get(), e, n, "en2f"); -} -#endif - RNG& gRNG() { static RNG inst{}; @@ -271,6 +205,8 @@ bool test_moe_gate(int tokens, // cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + // invokeMaskMoeTopKGroups(logits.data().get(), tokens, expert_num, expert_num / 8, 3, nullptr); + for (int i = 0; i < 1; ++i) { gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); @@ -286,8 +222,9 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, - true, - 0); + false, + 1.f, + nullptr); } // invokeMoeTiling(coords.data().get(), offsets.data().get(), expert_num, coords.size(), &tiling, 1, 0); @@ -334,6 +271,8 @@ bool test_moe_gate(int tokens, // success = false; } + // print_vecs(logits.data().get(), tokens, expert_num, "logits", 12); + if (!success && 1) { diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); @@ -353,6 +292,15 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); + for (int i = 0; i < tokens; ++i) { + float sum = 0; + for (int j = 0; j < experts_per_token; ++j) { + sum += scales[j * tokens + i]; + } + std::cout << sum << " "; + } + std::cout << "\n"; + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); @@ -393,7 +341,7 @@ int main() // test_moe_gate(32768, 64, 8, tape, tiling); // test_moe_gate(8, 60, 4, tape, tiling); - test_moe_gate(65536, 8, 2, tape, tiling); + test_moe_gate(16, 160, 6, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 7a089fbdf2..4747644f9a 100644 --- 
a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -357,7 +357,7 @@ class Testbed { } } - ((MoeGemmContext*)ctx_.get())->set_offsets(moe_m_offsets_.data().get()); + ((MoeGemmContext*)ctx_.get())->update(experts_, exp_per_tok_, moe_m_offsets_.data().get()); CHECK(batch_dim == 0); CHECK(a_desc_.order == kRowMajor); @@ -518,6 +518,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); invokeMoeReduce(c_ref_.data().get(), @@ -528,6 +529,7 @@ class Testbed { batch_size_, expert_ids_.size() / batch_size_, output_dims_, + 0.f, stream_); cudaDeviceSynchronize(); diff --git a/src/turbomind/kernels/gemm/unpack.cu b/src/turbomind/kernels/gemm/unpack.cu index 92f468d82b..39e6a2e1aa 100644 --- a/src/turbomind/kernels/gemm/unpack.cu +++ b/src/turbomind/kernels/gemm/unpack.cu @@ -71,14 +71,44 @@ void unpack_awq_gemm(uint4_t* dst, const uint4_t* src, int rows, int cols, cudaS permute_u4<0, 1, 3, 2><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); } +__global__ void transpose_u4_kernel(uint4_t* dst, const uint4_t* src, int s, int c) +{ + const int idx_c = 8 * (threadIdx.x + blockIdx.x * blockDim.x); + const int idx_s = 8 * (threadIdx.y + blockIdx.y * blockDim.y); + if (idx_c >= c || idx_s >= s) { + return; + } + uint32_t ivec[8]; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ivec[i] = ((const uint32_t*)src)[((idx_s + i) * c + idx_c) / 8]; + } + uint32_t ovec[8]{}; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + PRAGMA_UNROLL + for (int j = 0; j < 8; ++j) { + ovec[i] |= (((ivec[j] >> (i * 4)) & 0xfu) << (j * 4)); + } + } + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + ((uint32_t*)dst)[((idx_c + i) * s + idx_s) / 8] = ovec[i]; + } +} + void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st) { if (s % 8 || c % 8) { std::cerr << "transpose_u4: invalid shape (" << s << "," << c << "), must be multiple of 8" << std::endl; return; } - Array shape{s, c}; - permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + // Array shape{s, c}; + // permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); + + const dim3 block(16, 16); + const dim3 grid((c + 15) / 16, (s + 15) / 16); + transpose_u4_kernel<<>>(dst, src, s, c); } // load -> unpack -> extend_to_u8 -> manipulation -> compat_to_u4 -> store diff --git a/src/turbomind/kernels/norm/CMakeLists.txt b/src/turbomind/kernels/norm/CMakeLists.txt new file mode 100644 index 0000000000..bc1569c405 --- /dev/null +++ b/src/turbomind/kernels/norm/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +add_library(rms_norm rms_norm.cu) +set_property(TARGET rms_norm PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET rms_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu new file mode 100644 index 0000000000..22fd69f52a --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -0,0 +1,235 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
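+// NOTE: invokeRMSNorm takes separate leading dimensions (dst_ld / src_ld) so
+// that a slice of a larger packed buffer can be normalized in place;
+// presumably this is what the MLA q_a_layernorm / kv_a_layernorm use on the
+// low-rank projections. invokeBiasResidualRMSNorm fuses the residual update
+// r' = r + (h + b) with the normalization h' = norm(r') * w (see the comment
+// on that kernel below).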
+ +#include "cub/block/block_reduce.cuh" + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" + +namespace turbomind { + +template +__global__ void RMSNormKernel(T* dst, + int dst_ld, + const T* src, + int src_ld, + const T* __restrict__ weights, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + src += src_ld * ti; + + Array accum{}; + Array vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Array tmp = cast(vec); + using namespace ops; + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + dst += dst_ld * ti; + + Array sv; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(vec, &src[i]); + Ldg(sv, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < vec_size; ++c) { + vec[c] = (T)((float)vec[c] * sum) * sv[c]; + // vec[c] = (T)((float)vec[c] * sum * (float)sv[c]); + } + Store(&dst[i], vec); + } +} + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + + constexpr int threads = 512; + const int blocks = num; + + RMSNormKernel<<>>(dst, // + dst_ld, + src, + src_ld, + weights, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeRMSNorm(half* dst, + int dst_ld, + const half* src, + int src_ld, + const half* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#if ENABLE_BF16 +template void invokeRMSNorm(nv_bfloat16* dst, + int dst_ld, + const nv_bfloat16* src, + int src_ld, + const nv_bfloat16* weights, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +// r' <- r + (h + b) +// h' <- norm(r') * w +template +__global__ void BiasResidualRMSNormKernel(T* __restrict__ residual, + T* __restrict__ hidden_states, + const T* __restrict__ weights, + const T* __restrict__ bias, + int dims, + int num, + float eps, + float inv_dims) +{ + const int ti = blockIdx.x; + const int di = threadIdx.x * vec_size; + + if (ti >= num) { + return; + } + + residual += dims * ti; + hidden_states += dims * ti; + + Array accum{}; + + Array r_vec; + Array h_vec; + Array b_vec; + + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Load(h_vec, &hidden_states[i]); + + using namespace ops; + r_vec = r_vec + h_vec; + + if (bias) { + Ldg(b_vec, &bias[i]); + r_vec = r_vec + b_vec; + } + + Store(&residual[i], r_vec); + + Array tmp = cast(r_vec); + + accum = accum + tmp * tmp; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += accum[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_sum; + + if (threadIdx.x == 0) { + shared_sum = rsqrtf(sum * inv_dims + eps); + } + + __syncthreads(); + + sum = shared_sum; + + Array w_vec; + for (int i = di; i < dims; i += block_dim * vec_size) { + Load(r_vec, &residual[i]); + Ldg(w_vec, &weights[i]); + PRAGMA_UNROLL + for (int c = 0; c < 
vec_size; ++c) { + r_vec[c] = (T)((float)r_vec[c] * sum) * w_vec[c]; + } + Store(&hidden_states[i], r_vec); + } +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st) +{ + constexpr int vec_size = 16 / sizeof(T); + constexpr int threads = 512; + const int blocks = num; + + BiasResidualRMSNormKernel<<>>(residual, // + hidden_states, + weights, + bias, + dims, + num, + eps, + 1.f / dims); +} + +template void invokeBiasResidualRMSNorm(half* residual, + half* hidden_states, + const half* weights, + const half* bias, + int dims, + int num, + float eps, + cudaStream_t st); + +#if ENABLE_BF16 +template void invokeBiasResidualRMSNorm(nv_bfloat16* residual, + nv_bfloat16* hidden_states, + const nv_bfloat16* weights, + const nv_bfloat16* bias, + int dims, + int num, + float eps, + cudaStream_t st); +#endif + +} // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h new file mode 100644 index 0000000000..83fa0f8263 --- /dev/null +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -0,0 +1,21 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include + +namespace turbomind { + +template +void invokeRMSNorm( + T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); + +template +void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, float eps, cudaStream_t st) +{ + invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st); +} + +template +void invokeBiasResidualRMSNorm( + T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 285fcea31f..3c714bd234 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -20,11 +20,13 @@ add_library(Llama STATIC unified_attention_layer.cc llama_kernels.cu llama_decoder_kernels.cu - llama_utils.cu) + llama_utils.cu + mla_utils.cu) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart gemm2 + rms_norm cublasMMWrapper DynamicDecodeLayer activation_kernels diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 4138174e5d..ea321d06a0 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -20,6 +20,7 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" +#include "src/turbomind/utils/nccl_utils.h" #include #include #include @@ -1041,6 +1042,9 @@ LlamaBatch::LlamaBatch(const EngineParam& param, AllocateBuffer(max_batch_size_, session_len_, cache_block_seq_len); AllocatePersistantBuffer(max_batch_size_, cache_block_seq_len); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template @@ -1990,7 +1994,7 @@ void LlamaBatch::tune() nullptr, nullptr); // implicit barrier for TP - check_cuda_error(cudaStreamSynchronize(stream_)); + ftNcclStreamSynchronize(model_->tensor_para_, {}, stream_); } auto tock = std::chrono::steady_clock::now(); diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 
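// Not part of the patch: a minimal host-side sketch of the semantics assumed for the
// RMSNorm kernels added above, i.e. y[r, d] = x[r, d] * rsqrt(mean_d(x[r, :]^2) + eps) * w[d]
// on contiguous row-major data (the CUDA kernel additionally supports src_ld/dst_ld leading
// dimensions and does the final multiply in the storage type). Useful only for spot checks.
#include <cmath>
#include <cstddef>
#include <vector>

template<class T>
void rms_norm_reference(std::vector<T>&       dst,
                        const std::vector<T>& src,
                        const std::vector<T>& weights,
                        int                   dims,
                        int                   num,
                        float                 eps)
{
    dst.resize(static_cast<size_t>(dims) * num);
    for (int r = 0; r < num; ++r) {
        float sum = 0.f;
        for (int d = 0; d < dims; ++d) {
            const float x = static_cast<float>(src[static_cast<size_t>(r) * dims + d]);
            sum += x * x;
        }
        const float scale = 1.f / std::sqrt(sum / dims + eps);
        for (int d = 0; d < dims; ++d) {
            const float x = static_cast<float>(src[static_cast<size_t>(r) * dims + d]);
            dst[static_cast<size_t>(r) * dims + d] =
                static_cast<T>(x * scale * static_cast<float>(weights[d]));
        }
    }
}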
f6f9ab0efa..0a2a3be175 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -52,28 +52,21 @@ static bool is_fuse_silu_act() } template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - head_num_(head_num), - kv_head_num_(kv_head_num), - size_per_head_(size_per_head), - hidden_units_(hidden_units), - inter_size_(inter_size), - weight_type_(weight_type), - attn_bias_(attn_bias), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank): + head_num_(model.head_num), + kv_head_num_(model.kv_head_num), + size_per_head_(model.head_dim), + hidden_units_(model.hidden_units), + inter_size_(model.inter_size.at(layer_id)), + weight_type_(model.weight_type), + attn_bias_(model.attn_bias), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) { if (lora_param.policy == LoraPolicy::kPlora) { std::vector keys = { @@ -88,7 +81,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, auto& weight = *weights[i]; int rank = lora_param.r; float scale = lora_param.scale; - std::string full_name = "layers." + std::to_string(layer_idx) + "." + name; + std::string full_name = "layers." + std::to_string(layer_id) + "." + name; for (const auto& [re, pr] : lora_param.rank_pattern) { if (std::regex_search(full_name, pr.first)) { @@ -114,36 +107,44 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - self_attn_weights.qkv.input_dims = hidden_units_; - self_attn_weights.qkv.output_dims = (head_num + 2 * kv_head_num) * size_per_head / tensor_para_size_; - self_attn_weights.qkv.type = weight_type; - self_attn_weights.qkv.group_size = group_size; - - self_attn_weights.output.input_dims = (head_num * size_per_head) / tensor_para_size_; - self_attn_weights.output.output_dims = hidden_units_; - self_attn_weights.output.type = weight_type; - self_attn_weights.output.group_size = group_size; + self_attn_weights = LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + model.mla, + attn_bias_, + tensor_para_size_, + weight_type_, + model.group_size}; ffn_weights = LlamaFfnWeight{ hidden_units_, inter_size_, tensor_para_size_, weight_type_, - group_size, + model.group_size, weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), }; - moe_weights = MoeFfnWeight{hidden_units_, - moe_param.inter_size, - moe_param.expert_num, - moe_param.method, - moe_param.shared_gate, - tensor_para_size_, - weight_type, - group_size, - is_fuse_silu_act()}; - - mallocWeights(); + moe_weights = MoeFfnWeight{ + layer_id, moe_param, hidden_units_, weight_type_, model.group_size, tensor_para_size_, is_fuse_silu_act()}; +} + +template +void LlamaDecoderLayerWeight::malloc(cudaStream_t st) +{ + deviceMalloc((T**)&self_attn_norm_weights, hidden_units_, st); + deviceMalloc((T**)&ffn_norm_weights, hidden_units_, st); + + self_attn_weights.malloc(st); + + if (inter_size_) { + ffn_weights.malloc(st); + } + + if (!moe_weights.experts.empty()) { + 
moe_weights.malloc(st); + } } template @@ -168,52 +169,6 @@ size_t LlamaDecoderLayerWeight::workspace_size() const noexcept return size * sizeof(uint16_t); } -template -void freeWeights(LlamaDenseWeight& weights) -{ - cudaFree(weights.kernel); - cudaFree(weights.bias); - cudaFree(weights.scales); - cudaFree(weights.zeros); - - weights.kernel = nullptr; - weights.bias = nullptr; - weights.scales = nullptr; - weights.zeros = nullptr; - - { - cudaFree(weights.lora.a); - cudaFree(weights.lora.b); - weights.lora.a = nullptr; - weights.lora.b = nullptr; - } -} - -template -void LlamaDecoderLayerWeight::mallocWeights(LlamaDenseWeight& weights, bool bias) -{ - if (bias) { - deviceMalloc((T**)&weights.bias, weights.output_dims); - } - const size_t bit_size = getBitSize(weights.type); - if (bit_size >= 16) { // fp16, fp32 - deviceMalloc((T**)&weights.kernel, weights.input_dims * weights.output_dims); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - FT_CHECK(weights.input_dims % factor == 0); - deviceMalloc((int**)&weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMemSetZero((int*)weights.kernel, weights.input_dims * weights.output_dims / factor); - deviceMalloc((T**)&weights.scales, weights.input_dims / weights.group_size * weights.output_dims); - deviceMalloc((T**)&weights.zeros, weights.input_dims / weights.group_size * weights.output_dims); - } - - if (weights.lora.r > 0) { - deviceMalloc((T**)&weights.lora.a, weights.input_dims * weights.lora.r); - deviceMalloc((T**)&weights.lora.b, weights.lora.r * weights.output_dims); - } -} - template std::string concat(FirstArg&& first, Args&&... args) { @@ -342,64 +297,24 @@ void loadWeights(LlamaDenseWeight& w, std::string prefix, FtCudaDataType mode } template -void LlamaDecoderLayerWeight::mallocWeights() +void LlamaDecoderLayerWeight::free(cudaStream_t st) { - deviceMalloc((T**)&self_attn_norm_weights, hidden_units_); - deviceMalloc((T**)&ffn_norm_weights, hidden_units_); + deviceFree(self_attn_norm_weights, st); + deviceFree(ffn_norm_weights, st); - mallocWeights(self_attn_weights.qkv, attn_bias_); - mallocWeights(self_attn_weights.output, attn_bias_); + self_attn_weights.free(st); if (inter_size_) { - mallocWeights(ffn_weights.gating, false); - mallocWeights(ffn_weights.intermediate, false); - mallocWeights(ffn_weights.output, false); + ffn_weights.free(st); } if (!moe_weights.experts.empty()) { - mallocWeights(moe_weights.gate, false); - for (auto& e : moe_weights.experts) { - mallocWeights(e.gating, false); - mallocWeights(e.intermediate, false); - mallocWeights(e.output, false); - } - if (moe_weights.shared_gate.output_dims) { - mallocWeights(moe_weights.shared_gate, false); - } + moe_weights.free(st); } } template -LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() -{ - cudaFree((void*)self_attn_norm_weights); - cudaFree((void*)ffn_norm_weights); - self_attn_norm_weights = nullptr; - ffn_norm_weights = nullptr; - - freeWeights(self_attn_weights.qkv); - freeWeights(self_attn_weights.output); - - if (inter_size_) { - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); - } - - if (!moe_weights.experts.empty()) { - freeWeights(moe_weights.gate); - for (auto& e : moe_weights.experts) { - freeWeights(e.fused_gating_intermediate); - freeWeights(e.gating); - freeWeights(e.intermediate); - freeWeights(e.output); - } - if (moe_weights.shared_gate.kernel) { - 
freeWeights(moe_weights.shared_gate); - } - } -} +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; template void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) @@ -432,6 +347,24 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType } } +template +void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m, int tp_rank) +{ + if (w.q_proj.output_dims) { + getWeightTensor(w.q_proj, false, concat(p, "attention.q_proj", tp_rank), m); + } + else { + getWeightTensor(w.q_a_proj, false, concat(p, "attention.q_a_proj"), m); + getWeightTensor(w.q_b_proj, false, concat(p, "attention.q_b_proj", tp_rank), m); + m.insert(concat(p, "attention.q_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.q_b_proj.input_dims}, w.q_a_layernorm}); + } + getWeightTensor(w.kv_a_proj, false, concat(p, "attention.kv_a_proj"), m); + getWeightTensor(w.kv_b_proj, false, concat(p, "attention.kv_b_proj", tp_rank), m); + m.insert(concat(p, "attention.kv_a_layernorm"), + Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); +} + template TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) { @@ -445,7 +378,12 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) auto get_prefix = [=](std::string_view name) { return concat(prefix, name, tensor_para_rank_); }; - getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + if (self_attn_weights.qkv.output_dims) { + getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); + } + else { + getMLATensor(self_attn_weights, prefix, output, tensor_para_rank_); + } getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); if (inter_size_) { @@ -478,7 +416,8 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) } // template -static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void convert_u4( + LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { FT_CHECK(weight.type == WeightType::kINT4); @@ -488,11 +427,11 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* get_weight_and_scales_layout(gemm::DataType::U4, is_fused_moe, getSMVersion(), use_simt); if (order_b == kColMajor) { - transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims); - cudaMemcpy(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault); + transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims, st); + cudaMemcpyAsync(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault, st); } - extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims); + extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims, st); sync_check_cuda_error(); MatrixLayout w_desc{ @@ -507,25 +446,22 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* k_desc.type = gemm::DataType::U4; k_desc.pack = pack_b; - cudaMemset(weight.kernel, 0, weight.input_dims * weight.output_dims / 2); + cudaMemsetAsync(weight.kernel, 0, weight.input_dims * weight.output_dims / 2, st); - FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, 0) == 0); + 
FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, st) == 0); sync_check_cuda_error(); const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims; // std::cout << "fuse_scales_and_zeros\n"; - fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count); + fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count, st); // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2); sync_check_cuda_error(); - cudaDeviceSynchronize(); - - cudaFree(weight.scales); - cudaFree(weight.zeros); - weight.scales = weight.zeros = nullptr; + deviceFree(weight.scales, st); + deviceFree(weight.zeros, st); - deviceMalloc((half**)&weight.scales_zeros, scale_count * 2); + deviceMalloc((half**)&weight.scales_zeros, scale_count * 2, st); MatrixLayout s_desc{ gemm::DataType::U32, @@ -538,7 +474,7 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* MatrixLayout q_desc = s_desc; q_desc.pack = pack_v; - FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, 0) == 0); + FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, st) == 0); sync_check_cuda_error(); weight.k_desc = k_desc; @@ -548,7 +484,8 @@ static void convert_u4(LlamaDenseWeight& weight, bool is_fused_moe, void* } template -static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void +convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { using namespace gemm; @@ -563,12 +500,13 @@ static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* wor const int output_dim = weight.output_dims; if (order_b == kColMajor) { - invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, nullptr); + invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, st); sync_check_cuda_error(); // FT_CHECK(0); } else { - check_cuda_error(cudaMemcpy(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + check_cuda_error( + cudaMemcpyAsync(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); } MatrixLayout src{ @@ -583,35 +521,42 @@ static void convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* wor dst.pack = pack_b; if (pack_b) { - FT_CHECK(Convert(workspace, src, weight.kernel, dst, nullptr) == 0); + FT_CHECK(Convert(workspace, src, weight.kernel, dst, st) == 0); sync_check_cuda_error(); // FT_CHECK(0); } else { - check_cuda_error(cudaMemcpy(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault)); + check_cuda_error( + cudaMemcpyAsync(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); } weight.k_desc = dst; } template -static void convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt) +static void +convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) { if (weight.type == WeightType::kINT4) { if constexpr (std::is_same_v) { - convert_u4(weight, is_fused_moe, workspace, size, use_simt); + convert_u4(weight, is_fused_moe, workspace, size, use_simt, st); } else { FT_CHECK(0); } } else { - convert_fp(weight, is_fused_moe, workspace, size, use_simt); + convert_fp(weight, is_fused_moe, workspace, size, use_simt, st); } } template -void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, 
void* workspace, size_t size) +void interleave(LlamaDenseWeight& c, + LlamaDenseWeight& a, + LlamaDenseWeight& b, + void* workspace, + size_t size, + cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -628,18 +573,18 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight const auto sentinel = tmp_c + c.output_dims * c.input_dims; FT_CHECK(sentinel <= (uint8_t*)workspace + size); - extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims); - extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims); + extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims, st); + extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims, st); - interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, 0); + interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, st); - compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims); + compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims, st); - interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, 0); - interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, 0); + interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, st); + interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, st); } else { - interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, 0); + interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, st); } // Check at function level @@ -647,7 +592,7 @@ void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight } template -void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t) +void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t, cudaStream_t st) { FT_CHECK(c.input_dims == a.input_dims); FT_CHECK(c.input_dims == b.input_dims); @@ -656,9 +601,11 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& FT_CHECK(c.group_size == a.group_size); FT_CHECK(c.group_size == b.group_size); - auto _chunks = [](auto c, auto a, auto b, int height, int width) { - check_cuda_error(cudaMemcpy2D((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault)); - check_cuda_error(cudaMemcpy2D((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault)); + auto _chunks = [&](auto c, auto a, auto b, int height, int width) { + check_cuda_error( + cudaMemcpy2DAsync((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault, st)); + check_cuda_error( + cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); }; if (c.type == WeightType::kINT4) { @@ -675,37 +622,37 @@ void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& } template -void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop) +void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st) { const bool is_16xx = is_16xx_series(prop.name); - convert(self_attn_weights.qkv, false, workspace, size, is_16xx); - convert(self_attn_weights.output, false, workspace, size, is_16xx); + convert(self_attn_weights.qkv, false, workspace, size, is_16xx, st); + 
convert(self_attn_weights.output, false, workspace, size, is_16xx, st); auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) { if (fused_up_and_gate_) { auto& fused_up_and_gate = ffn.fused_gating_intermediate; - mallocWeights(fused_up_and_gate, false); + fused_up_and_gate.malloc(st); if (ffn.is_fused_silu) { - interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } else { - chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size); + chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); } - convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx, st); - freeWeights(ffn.gating); - freeWeights(ffn.intermediate); + ffn.gating.free(st); + ffn.intermediate.free(st); } else { - convert(ffn.gating, is_fused_moe, workspace, size, is_16xx); - convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx); + convert(ffn.gating, is_fused_moe, workspace, size, is_16xx, st); + convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx, st); } - convert(ffn.output, is_fused_moe, workspace, size, is_16xx); + convert(ffn.output, is_fused_moe, workspace, size, is_16xx, st); }; if (inter_size_) { @@ -722,7 +669,7 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud for (auto& e : moe_weights.experts) { - process_ffn(e, moe_weights.method); + process_ffn(e, moe_weights.method == MoeParam::kFused); const auto& fused = e.fused_gating_intermediate; const auto& output = e.output; @@ -743,12 +690,12 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud auto& output = moe_weights.block.output; // TODO: free these ptrs - fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, nullptr); - output.kernel = gemm::make_blocked_ptrs(output_ptrs, nullptr); + fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, st); + output.kernel = gemm::make_blocked_ptrs(output_ptrs, st); if (!fused_param_ptrs.empty()) { - fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, nullptr); - output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, nullptr); + fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, st); + output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, st); } fused.k_desc.ld = output.k_desc.ld = 0; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index f68a103dd5..9b204ed0dc 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -30,19 +30,14 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = delete; - LlamaDecoderLayerWeight(int layer_idx, - size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - WeightType weight_type, - int group_size, - LoraParam lora_param, - bool attn_bias, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + + LlamaDecoderLayerWeight(int layer_id, + const ModelParam& model, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); + ~LlamaDecoderLayerWeight(); LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete; @@ -51,17 +46,21 @@ struct 
LlamaDecoderLayerWeight { TensorMap getParams(std::string prefix); - void prepare(void* workspace, size_t size, const cudaDeviceProp& prop); + void prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st); size_t workspace_size() const noexcept; - void mallocWeights(LlamaDenseWeight& weights, bool bias); + void malloc(cudaStream_t st); + + void free(cudaStream_t st); + + T* self_attn_norm_weights{}; + T* ffn_norm_weights{}; - T* self_attn_norm_weights{}; - T* ffn_norm_weights{}; LlamaAttentionWeight self_attn_weights{}; - LlamaFfnWeight ffn_weights{}; - MoeFfnWeight moe_weights{}; + + LlamaFfnWeight ffn_weights{}; + MoeFfnWeight moe_weights{}; private: size_t head_num_; @@ -76,8 +75,6 @@ struct LlamaDecoderLayerWeight { size_t tensor_para_rank_; bool is_maintain_buffer_ = false; bool fused_up_and_gate_; - - void mallocWeights(); }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 169fb53bcf..944781bf5d 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -20,64 +20,14 @@ #pragma once #include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/llama/weight_type.h" #include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { -enum class WeightType : int -{ - kFP32, - kFP16, - kFP8, // not supported yet - kBF16, - kINT8, - kINT4 -}; - -template -constexpr WeightType get_default_weight_type() -{ - if constexpr (std::is_same_v) { - return WeightType::kFP16; - } - else if constexpr (std::is_same_v) { - return WeightType::kBF16; - } - else if constexpr (std::is_same_v) { - return WeightType::kFP32; - } - else { - static_assert(sizeof(T) != sizeof(T), "not implemented"); - return {}; - } -} - -inline size_t getBitSize(WeightType type) -{ - switch (type) { - case WeightType::kFP32: - return 32; - case WeightType::kFP16: - return 16; - case WeightType::kFP8: - return 8; - case WeightType::kBF16: - return 16; - case WeightType::kINT8: - return 8; - case WeightType::kINT4: - return 4; - } - return 0; -} - -enum class LoraPolicy : int -{ - kNull, - kPlora, -}; - inline LoraPolicy getLoraPolicy(const std::string& policy) { if (policy == "plora") { @@ -96,20 +46,31 @@ struct LoraWeight { template struct LlamaDenseWeight { - size_t input_dims; - size_t output_dims; - void* kernel; + size_t input_dims = 0; + size_t output_dims = 0; + WeightType type; // uninitialized + void* kernel = nullptr; + T* bias = nullptr; + T* scales = nullptr; + T* zeros = nullptr; + T* scales_zeros = nullptr; + int group_size = 1; + LoraWeight lora; - WeightType type; - T* bias; - T* scales; - T* zeros; - T* scales_zeros; - int group_size; gemm::MatrixLayout k_desc; gemm::MatrixLayout q_desc; + LlamaDenseWeight(): type{}, lora{}, k_desc{}, q_desc{} {} + + LlamaDenseWeight(size_t input_dim, size_t output_dim, WeightType type, int group_size): LlamaDenseWeight{} + { + this->input_dims = input_dim; + this->output_dims = output_dim; + this->type = type; + this->group_size = group_size; + } + size_t kernel_size() const noexcept { return getBitSize(type) * input_dims * output_dims / 8; @@ -129,12 +90,121 @@ struct LlamaDenseWeight { { return {sizeof(T) * input_dims * lora.r, sizeof(T) * lora.r * output_dims}; } + + void malloc(cudaStream_t st, bool with_bias = false) + { + if (with_bias) { + deviceMalloc((T**)&bias, output_dims, 
st); + } + const size_t bit_size = getBitSize(type); + if (bit_size >= 16) { // fp16, fp32 + deviceMalloc((T**)&kernel, input_dims * output_dims, st); + } + else { // int8, int4 + const int factor = sizeof(float) * 8 / bit_size; + FT_CHECK(input_dims % factor == 0); + deviceMalloc((int**)&kernel, input_dims * output_dims / factor, st); + deviceMalloc((T**)&scales, input_dims / group_size * output_dims, st); + deviceMalloc((T**)&zeros, input_dims / group_size * output_dims, st); + } + + if (lora.r > 0) { + deviceMalloc((T**)&lora.a, input_dims * lora.r, st); + deviceMalloc((T**)&lora.b, lora.r * output_dims, st); + } + } + + void free(cudaStream_t st) + { + deviceFree(kernel, st); + deviceFree(bias, st); + deviceFree(scales, st); + deviceFree(zeros, st); + deviceFree(lora.a, st); + deviceFree(lora.b, st); + } }; template struct LlamaAttentionWeight { + + LlamaAttentionWeight() = default; + + LlamaAttentionWeight(size_t hidden_dim, + size_t head_dim, + size_t head_num, + size_t kv_head_num, + MLAParam mla, + bool bias, + size_t tp, + WeightType weight_type, + int group_size) + { + this->bias = bias; + if (mla.kv_lora_rank == 0) { + qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size}; + } + else { + const int qk_nope_dim = head_dim - mla.qk_rope_dim; + if (mla.q_lora_rank) { + q_a_proj = {hidden_dim, mla.q_lora_rank, weight_type, group_size}; + q_b_proj = {mla.q_lora_rank, head_num * head_dim / tp, weight_type, group_size}; + } + else { + q_proj = {hidden_dim, head_num * head_dim / tp, weight_type, group_size}; + } + kv_a_proj = {hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, weight_type, group_size}; + kv_b_proj = {mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp, weight_type, group_size}; + } + output = {(head_num * head_dim) / tp, hidden_dim, weight_type, group_size}; + } + + void malloc(cudaStream_t st) + { + if (qkv.output_dims) { + qkv.malloc(st, bias); + } + else { + if (q_proj.output_dims) { + q_proj.malloc(st); + } + else { + q_a_proj.malloc(st); + q_b_proj.malloc(st); + deviceMalloc((T**)&q_a_layernorm, q_b_proj.input_dims, st); + } + kv_a_proj.malloc(st); + kv_b_proj.malloc(st); + deviceMalloc((T**)&kv_a_layernorm, kv_b_proj.input_dims, st); + } + output.malloc(st, bias); + } + + void free(cudaStream_t st) + { + qkv.free(st); + q_proj.free(st); + q_a_proj.free(st); + q_b_proj.free(st); + kv_a_proj.free(st); + kv_b_proj.free(st); + output.free(st); + deviceFree(q_a_layernorm, st); + deviceFree(kv_a_layernorm, st); + } + LlamaDenseWeight qkv; LlamaDenseWeight output; + bool bias{}; + + LlamaDenseWeight q_proj; + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + + T* q_a_layernorm{}; + T* kv_a_layernorm{}; }; template @@ -172,6 +242,21 @@ struct LlamaFfnWeight { output.group_size = group_size; } + void malloc(cudaStream_t st) + { + gating.malloc(st); + intermediate.malloc(st); + output.malloc(st); + } + + void free(cudaStream_t st) + { + gating.free(st); + intermediate.free(st); + output.free(st); + fused_gating_intermediate.free(st); + } + LlamaDenseWeight gating; LlamaDenseWeight intermediate; LlamaDenseWeight output; @@ -186,23 +271,27 @@ struct MoeFfnWeight { MoeFfnWeight() = default; - MoeFfnWeight(size_t hidden_dim, - int inter_size, - int expert_num, - int method, - bool has_shared_gate, - size_t tp, - WeightType weight_type, - int group_size, - bool fuse_silu_act) + MoeFfnWeight(int layer_id, + const MoeParam& param, + size_t hidden_dim, + WeightType 
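// Not part of the patch: a standalone restatement of the shape arithmetic used by the MLA
// branch of LlamaAttentionWeight above (five projections replace the fused w_qkv). The
// numbers in the usage comment are illustrative placeholders, not taken from any model config.
struct MlaShapes {
    // each pair is {input_dims, output_dims} of the corresponding LlamaDenseWeight
    int q_a[2], q_b[2], q[2], kv_a[2], kv_b[2], out[2];
};

inline MlaShapes mla_shapes(int hidden_dim, int head_dim, int head_num, int tp,
                            int q_lora_rank, int kv_lora_rank, int qk_rope_dim, int v_head_dim)
{
    const int qk_nope_dim = head_dim - qk_rope_dim;
    MlaShapes s{};
    if (q_lora_rank) {  // low-rank Q path: q_a_proj -> q_a_layernorm -> q_b_proj
        s.q_a[0] = hidden_dim;   s.q_a[1] = q_lora_rank;
        s.q_b[0] = q_lora_rank;  s.q_b[1] = head_num * head_dim / tp;
    }
    else {              // plain q_proj
        s.q[0] = hidden_dim;     s.q[1] = head_num * head_dim / tp;
    }
    s.kv_a[0] = hidden_dim;    s.kv_a[1] = kv_lora_rank + qk_rope_dim;                    // not TP-split
    s.kv_b[0] = kv_lora_rank;  s.kv_b[1] = head_num * (qk_nope_dim + v_head_dim) / tp;
    s.out[0]  = head_num * head_dim / tp;  s.out[1] = hidden_dim;
    return s;
}

// hypothetical call, placeholder dimensions only:
//   mla_shapes(/*hidden*/2048, /*head_dim*/192, /*heads*/16, /*tp*/1,
//              /*q_lora*/0, /*kv_lora*/512, /*rope*/64, /*v_head*/128);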
weight_type, + int group_size, + size_t tp, + bool fuse_silu_act) { - // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (param.expert_num.size() <= layer_id) { + return; + } + + const int expert_num = param.expert_num[layer_id]; if (expert_num == 0) { return; } + // printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); + gate.input_dims = hidden_dim; gate.output_dims = expert_num; gate.type = get_default_weight_type(); @@ -210,15 +299,15 @@ struct MoeFfnWeight { experts.resize(expert_num); - this->method = method; - fuse_silu_act = fuse_silu_act && method; + method = param.method; + fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; for (auto& e : experts) { // inter size is divided by tp in `FfnWeight` - e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; + e = LlamaFfnWeight{hidden_dim, (size_t)param.inter_size, tp, weight_type, group_size, fuse_silu_act}; } - if (has_shared_gate) { + if (param.shared_gate) { shared_gate.input_dims = hidden_dim; shared_gate.output_dims = 1; shared_gate.type = get_default_weight_type(); @@ -229,14 +318,36 @@ struct MoeFfnWeight { } } + void malloc(cudaStream_t st) + { + gate.malloc(st); + if (shared_gate.output_dims) { + shared_gate.malloc(st); + } + for (auto& e : experts) { + e.malloc(st); + } + } + + void free(cudaStream_t st) + { + gate.free(st); + shared_gate.free(st); + for (auto& e : experts) { + e.free(st); + } + block.free(st); + } + LlamaDenseWeight gate; std::vector> experts; LlamaDenseWeight shared_gate; + // reference into `experts` LlamaFfnWeight block; - int method{}; + MoeParam::Method method{}; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 8cce207203..907467341a 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -27,21 +27,20 @@ namespace turbomind { template -void LlamaFfnLayer::allocateBuffer(size_t token_num, - int inter_size, - const LlamaDenseWeight* gating, - const LlamaDenseWeight* inter) +void LlamaFfnLayer::allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) { const size_t sz = token_num * inter_size; - const size_t sz_gate = token_num * gating->lora.r; - const size_t sz_inter = token_num * inter->lora.r; + const size_t sz_gate = token_num * gating_lora_r; + const size_t sz_inter = token_num * inter_lora_r; - gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * 2 + sz_gate + sz_inter), false); - inter_buf_ = gating_buf_ + sz; + gating_buf_ = + (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * inter_buf_factor + sz_gate + sz_inter), false); + inter_buf_ = gating_buf_ + sz; // gate & inter is not fused when lora is enabled - if (gating->lora.r) { + if (gating_lora_r) { inter_buf_ += sz_gate; } @@ -93,12 +92,16 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const int layer_id = input_tensors->getVal("layer_id"); const int inter_size = weights->inter_size; - allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); + const bool is_fused_silu = weights->fused_gating_intermediate.kernel && weights->is_fused_silu; + + allocateBuffer(token_num, inter_size, is_fused_silu ? 
1 : 2, weights->gating.lora.r, weights->intermediate.lora.r); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); int* lora_mask = input_tensors->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + const bool all_reduce = input_tensors->getVal("all_reduce", false); + if (weights->fused_gating_intermediate.kernel) { NvtxScope scope("fused_silu_ffn"); @@ -145,7 +148,8 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); - if (all_reduce_ && tensor_para_.world_size_ > 1) { + if (all_reduce && tensor_para_.world_size_ > 1) { + // std::cout << "ffn all reduce " << layer_id << "\n"; NcclGuard nccl_guard(tensor_para_, stream_); ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 2daca2cc95..a72a24701e 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -30,13 +30,12 @@ namespace turbomind { template class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): + LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx): hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()), - all_reduce_(all_reduce) + allocator_(ctx.allocator.get()) { } @@ -48,7 +47,8 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer( + size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r); void freeBuffer(); @@ -59,7 +59,6 @@ class LlamaFfnLayer { cudaStream_t const stream_; LlamaLinear* const linear_; IAllocator* const allocator_; - const bool all_reduce_; bool is_free_buffer_after_forward_{}; T* gating_buf_{}; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 3d50910ad4..05b22deed5 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -72,7 +72,6 @@ LlamaV2::LlamaV2(const ModelParam& model, lora_param_(lora), head_num_(model.head_num), size_per_head_(model.head_dim), - inter_size_(model.inter_size), hidden_units_(model.hidden_units), layer_num_(model.layer_num), vocab_size_(model.vocab_size), diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 6321d09d7c..658282f5e5 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -113,7 +113,6 @@ class LlamaV2 { const size_t head_num_; const size_t size_per_head_; const size_t hidden_units_; - const size_t inter_size_; const size_t layer_num_; const size_t vocab_size_; const size_t vocab_size_padded_; diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 9d62042d62..bcee150977 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -20,36 +20,24 @@ #include "src/turbomind/models/llama/LlamaWeight.h" #include 
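// Not part of the patch: the buffer-size arithmetic implied by the new `inter_buf_factor`
// argument of LlamaFfnLayer::allocateBuffer above. With a fused gate/up GEMM plus fused SiLU
// only one inter_size-wide slab per token is needed; otherwise gating and intermediate each
// need one. LoRA scratch for the two projections is appended as before.
#include <cstddef>

inline size_t ffn_workspace_elems(size_t token_num, size_t inter_size,
                                  bool is_fused_silu, size_t gating_lora_r, size_t inter_lora_r)
{
    const size_t inter_buf_factor = is_fused_silu ? 1 : 2;
    return token_num * inter_size * inter_buf_factor  // gating (+ intermediate) activations
         + token_num * gating_lora_r                  // LoRA scratch for the gating projection
         + token_num * inter_lora_r;                  // LoRA scratch for the intermediate projection
}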
"src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/memory_utils.h" #include namespace turbomind { template -LlamaWeight::LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t embedding_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank): - hidden_units_(hidden_units), - inter_size_(inter_size), - vocab_size_(vocab_size), - vocab_size_padded_(vocab_size), - embedding_size_(embedding_size), - num_layer_(num_layer), - weight_type_(weight_type), - tensor_para_size_(tensor_para_size), - tensor_para_rank_(tensor_para_rank) +LlamaWeight::LlamaWeight( + const ModelParam& model, const LoraParam& lora_param, const MoeParam& moe_param, size_t tp_size, size_t tp_rank): + hidden_units_(model.hidden_units), + inter_size_(model.inter_size), + vocab_size_(model.vocab_size), + vocab_size_padded_(model.vocab_size), + embedding_size_(model.embedding_size), + num_layer_(model.layer_num), + weight_type_(model.weight_type), + tensor_para_size_(tp_size), + tensor_para_rank_(tp_rank) { if (vocab_size_padded_ % tensor_para_size_ != 0) { vocab_size_padded_ = (vocab_size_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_; @@ -61,49 +49,42 @@ LlamaWeight::LlamaWeight(size_t head_num, } FT_CHECK(hidden_units_ % tensor_para_size_ == 0); + check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + decoder_layer_weights.reserve(num_layer_); for (unsigned l = 0; l < num_layer_; ++l) { - decoder_layer_weights.push_back(new LlamaDecoderLayerWeight(l, - head_num, - kv_head_num, - size_per_head, - hidden_units_, - inter_size_, - weight_type_, - group_size, - lora_param, - attn_bias, - moe_param, - tensor_para_size_, - tensor_para_rank_)); + decoder_layer_weights.emplace_back( + new LlamaDecoderLayerWeight(l, model, lora_param, moe_param, tp_size, tp_rank)); + decoder_layer_weights.back()->malloc(stream_); } - mallocWeights(); + FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); + deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_, stream_); + deviceMalloc((T**)&output_norm_weight, hidden_units_, stream_); + deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_, stream_); + + // Wait for allocations + check_cuda_error(cudaStreamSynchronize(stream_)); } template LlamaWeight::~LlamaWeight() { - cudaFree((void*)pre_decoder_embedding_table); - cudaFree((void*)output_norm_weight); - cudaFree((void*)post_decoder_embedding_kernel); - - pre_decoder_embedding_table = nullptr; - output_norm_weight = nullptr; - post_decoder_embedding_kernel = nullptr; + deviceFree(pre_decoder_embedding_table, stream_); + deviceFree(output_norm_weight, stream_); + deviceFree(post_decoder_embedding_kernel, stream_); for (auto& p : decoder_layer_weights) { + p->free(stream_); delete p; } -} -template -void LlamaWeight::mallocWeights() -{ - FT_CHECK(vocab_size_padded_ % tensor_para_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tensor_para_size_); - deviceMalloc((T**)&output_norm_weight, hidden_units_); - deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tensor_para_size_); + decoder_layer_weights.clear(); + + // Wait for deallocations + 
check_cuda_error(cudaStreamSynchronize(stream_)); + check_cuda_error(cudaStreamDestroy(stream_)); + stream_ = {}; } template @@ -179,13 +160,19 @@ void LlamaWeight::prepare(const cudaDeviceProp& prop) TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d\n", workspace_size); + // Wait for the weights to be filled externally + check_cuda_error(cudaDeviceSynchronize()); + if (workspace_size) { - deviceMalloc((char**)&workspace, workspace_size); + deviceMalloc((char**)&workspace, workspace_size, stream_); } for (auto& layer : decoder_layer_weights) { - layer->prepare(workspace, workspace_size, prop); + layer->prepare(workspace, workspace_size, prop, stream_); } - deviceFree(workspace); + + deviceFree(workspace, stream_); + + check_cuda_error(cudaStreamSynchronize(stream_)); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index c30e753565..629cd56120 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -22,28 +22,18 @@ #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/memory_utils.h" namespace turbomind { template struct LlamaWeight { LlamaWeight() = default; - LlamaWeight(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t vocab_size, - size_t embedding_size, - size_t num_layer, - bool attn_bias, - WeightType weight_type, - int group_size, - LoraParam lora_param, - MoeParam moe_param, - size_t tensor_para_size, - size_t tensor_para_rank); + + LlamaWeight(const ModelParam& model_param, + const LoraParam& lora_param, + const MoeParam& moe_param, + size_t tp_size, + size_t tp_rank); ~LlamaWeight(); @@ -57,15 +47,13 @@ struct LlamaWeight { void prepare(const cudaDeviceProp& prop); std::vector*> decoder_layer_weights; - const T* pre_decoder_embedding_table{}; - const T* output_norm_weight{}; - const T* post_decoder_embedding_kernel{}; -private: - void mallocWeights(); + T* pre_decoder_embedding_table{}; + T* output_norm_weight{}; + T* post_decoder_embedding_kernel{}; +private: size_t hidden_units_; - size_t inter_size_; size_t vocab_size_; size_t vocab_size_padded_; size_t embedding_size_; @@ -73,6 +61,10 @@ struct LlamaWeight { WeightType weight_type_; size_t tensor_para_size_; size_t tensor_para_rank_; + + std::vector inter_size_; + + cudaStream_t stream_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_gemm.cc b/src/turbomind/models/llama/llama_gemm.cc index 62952cd715..f9a0191e4b 100644 --- a/src/turbomind/models/llama/llama_gemm.cc +++ b/src/turbomind/models/llama/llama_gemm.cc @@ -84,7 +84,7 @@ int main(int argc, char* argv[]) return -1; } else { - ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, false); + ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, nullptr, false); } if (0) {} diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h index 3b01dee60d..aaade1a513 100644 --- a/src/turbomind/models/llama/llama_kernels.h +++ b/src/turbomind/models/llama/llama_kernels.h @@ -154,7 +154,7 @@ template struct TempBuffer { TempBuffer(size_t size) { - deviceMalloc(&data, size, false); + cudaMalloc(&data, size); } T* data; }; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index e6b9d690ae..0a505b11a9 100644 --- a/src/turbomind/models/llama/llama_params.h +++ 
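// Not part of the patch: the weight classes above now allocate and free on a dedicated
// non-blocking stream and synchronize once, instead of issuing blocking per-tensor calls.
// A minimal illustration of that pattern using plain CUDA stream-ordered allocation
// (requires CUDA 11.2+; the patch itself routes through TurboMind's deviceMalloc/deviceFree
// helpers, and error checking is omitted here for brevity).
#include <cuda_runtime.h>
#include <vector>

inline void alloc_weights_async(std::vector<void*>& bufs, const std::vector<size_t>& sizes)
{
    cudaStream_t stream{};
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    bufs.resize(sizes.size());
    for (size_t i = 0; i < sizes.size(); ++i) {
        cudaMallocAsync(&bufs[i], sizes[i], stream);  // stream-ordered, no device-wide sync
    }
    cudaStreamSynchronize(stream);  // single wait after everything has been queued
    cudaStreamDestroy(stream);
}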
b/src/turbomind/models/llama/llama_params.h @@ -2,28 +2,41 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include #include #include #include +#include "src/turbomind/models/llama/weight_type.h" + namespace turbomind { +struct MLAParam { + size_t q_lora_rank; + size_t kv_lora_rank; + size_t qk_rope_dim; + size_t v_head_dim; +}; + struct ModelParam { - size_t head_num; - size_t head_dim; - size_t kv_head_num; - size_t hidden_units; - size_t layer_num; - size_t inter_size; - size_t vocab_size; - size_t embedding_size; - float norm_eps; - int quant_policy; - // - int start_id; - int end_id; + size_t head_num; + size_t head_dim; + size_t kv_head_num; + size_t hidden_units; + size_t layer_num; + size_t vocab_size; + size_t embedding_size; + float norm_eps; + int quant_policy; + bool attn_bias; + WeightType weight_type; + int group_size; + int start_id; + int end_id; + MLAParam mla; + int tune_layer_num; + + std::vector inter_size; }; struct MoeParam { @@ -32,17 +45,25 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; - bool norm_topk; - bool shared_gate; + + int experts_per_token; + int inter_size; + bool norm_topk_prob; + bool shared_gate; + float routed_scale; + + int topk_group; + std::string topk_method; + int n_group; + + std::vector expert_num; }; struct AttentionParam { int rotary_embedding_dim; float rotary_embedding_base; int max_position_embeddings; + float softmax_scale; std::string rope_scaling_type; int original_max_position_embeddings; float rope_scaling_factor; @@ -74,6 +95,12 @@ struct EngineParam { int max_prefill_iters; }; +enum class LoraPolicy : int +{ + kNull, + kPlora, +}; + struct LoraParam { int r; float scale; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 925c6b8831..eaa450ae20 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -1,47 +1,25 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/cuda_utils.h" #include #include #include #include +#include +#include + #include #include #include #include #include -#include + +#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { CmpMode compare_mode = kCmpRead; - -template -struct abs_diff_t { - using type = T; -}; - -template<> -struct abs_diff_t { - using type = float; -}; - -template<> -struct abs_diff_t<__nv_bfloat16> { - using type = float; -}; - -template -struct abs_diff: public thrust::unary_function, typename abs_diff_t::type> { - __host__ __device__ float operator()(thrust::tuple x) const - { - using R = typename abs_diff_t::type; - auto r = R(thrust::get<0>(x)) - R(thrust::get<1>(x)); - return r < R(0) ? 
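// Not part of the patch: with ModelParam::inter_size and MoeParam::expert_num now per-layer
// vectors (see llama_params.h above), layers may differ, e.g. a dense first block followed by
// MoE blocks. A hypothetical helper sketching how such a spec could be populated; how the
// converter actually fills these fields is defined elsewhere in the patch.
#include <vector>

struct LayerFfnSpec {
    std::vector<int> inter_size;  // dense (or shared-expert) FFN width per layer, 0 if absent
    std::vector<int> expert_num;  // routed experts per layer, 0 for a dense layer
};

inline LayerFfnSpec make_hybrid_spec(int layer_num, int dense_inter, int moe_expert_num)
{
    LayerFfnSpec spec;
    spec.inter_size.assign(layer_num, 0);
    spec.expert_num.assign(layer_num, moe_expert_num);
    spec.inter_size[0] = dense_inter;  // layer 0: ordinary FFN
    spec.expert_num[0] = 0;            // ...and no routed experts
    return spec;
}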
-r : r; - } -}; +// CmpMode compare_mode = kCmpWrite; template void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) @@ -63,10 +41,8 @@ void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream) template void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) { - // wait for b - check_cuda_error(cudaStreamSynchronize(stream)); // read a from file - thrust::host_vector h_a(size); + std::vector h_a(size); { const auto filename = "tmp/" + key + ".cmp"; std::ifstream ifs(filename, std::ios::binary); @@ -85,15 +61,30 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) } ifs.read((char*)h_a.data(), sizeof(T) * h_a.size()); } - // copy a to device - thrust::device_vector a = h_a; - // create abs(a - b) iterator - thrust::device_ptr dev_ptr(ptr); - auto zip_iter = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), dev_ptr)); - auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff{}); - // sum(abs(a - b)) - auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size); - std::cerr << key << ": " << asum << " " << asum / size << "\n"; + std::vector h_b(size); + check_cuda_error(cudaMemcpyAsync(h_b.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); + + using Tacc = std::conditional_t, int64_t, float>; + constexpr Tacc eps = std::is_integral_v ? 1 : 1e-8f; + + Tacc asum{}; + Tacc rsum{}; + Tacc amean{}; + for (size_t i = 0; i < size; ++i) { + Tacc x = (Tacc)h_b[i]; + Tacc r = (Tacc)h_a[i]; + Tacc abs_diff = std::abs(x - r); + Tacc rel_diff = abs_diff / std::max(std::max(std::abs(r), std::abs(x)), eps); + asum += abs_diff; + rsum += rel_diff; + amean += std::abs(r); + } + + std::cerr << key << ": " << amean / size << " " << asum << " " << asum / size << " " << rsum / size << "\n"; + + check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); + check_cuda_error(cudaStreamSynchronize(stream)); } template diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu new file mode 100644 index 0000000000..2f9e786f2a --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.cu @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
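// Not part of the patch: the rewritten CmpRead above replaces the thrust reduction with a
// plain host loop. The same comparison as a standalone helper for host buffers, accumulating
// absolute and relative error in float with an eps guard against division by near-zero.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

template<class T>
void compare_buffers(const T* test, const T* ref, size_t size, const char* key)
{
    const float eps = 1e-8f;
    float abs_sum = 0.f, rel_sum = 0.f, ref_abs_sum = 0.f;
    for (size_t i = 0; i < size; ++i) {
        const float x        = static_cast<float>(test[i]);
        const float r        = static_cast<float>(ref[i]);
        const float abs_diff = std::abs(x - r);
        abs_sum     += abs_diff;
        rel_sum     += abs_diff / std::max({std::abs(r), std::abs(x), eps});
        ref_abs_sum += std::abs(r);
    }
    std::printf("%s: ref_mean=%g abs_sum=%g abs_mean=%g rel_mean=%g\n",
                key, ref_abs_sum / size, abs_sum, abs_sum / size, rel_sum / size);
}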
+#include "src/turbomind/kernels/core/array_ops.h" + +namespace turbomind { + +template +__global__ void mla_copy_qkv_kernel(T* qkv, + const T* q, // [h, head_dim] + const T* kv_a, // [kv_lora_rank, rope_dim] + const T* kv_b, // [h, nope_dim + v_head_dim] + int head_num, + int head_dim, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim) +{ + const int type = blockIdx.y; + + const int64_t ti = blockIdx.x; + const int di = threadIdx.x; + + const int kv_b_dim = nope_dim + v_head_dim; + + // for (int hi = threadIdx.y; hi < head_num; hi += blockDim.y) { + const int hi = threadIdx.y; + Array data{}; + if (type == 0) { // Q + if (di * vec_size < rope_dim) { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + nope_dim + di * vec_size]); + } + else { + Ldg(data, &q[ti * head_num * head_dim + hi * head_dim + di * vec_size - rope_dim]); + } + } + else if (type == 1) { // K + if (di * vec_size < rope_dim) { + Ldg(data, &kv_a[ti * (kv_lora_rank + rope_dim) + kv_lora_rank + di * vec_size]); + } + else { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + di * vec_size - rope_dim]); + } + } + else { // V + if (di * vec_size < v_head_dim) { + Ldg(data, &kv_b[ti * head_num * kv_b_dim + hi * kv_b_dim + nope_dim + di * vec_size]); + } + } + const int stride = 3 * head_num * head_dim; + Store(&qkv[ti * stride + type * head_num * head_dim + hi * head_dim + di * vec_size], data); + // } +} + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + constexpr int vec_size = 16 / sizeof(T); + const int head_dim = nope_dim + rope_dim; + + dim3 block(head_dim / vec_size, head_num); + // make sure block size <= 1024 + while (block.x * block.y > 1024) { + block.y /= 2; + } + const dim3 grid(token_num, 3); + + mla_copy_qkv_kernel<<>>( + qkv, q, kv_a, kv_b, head_num, head_dim, nope_dim, rope_dim, kv_lora_rank, v_head_dim); +} + +template void invokeMLACopyQKV(uint16_t* qkv, + const uint16_t* q, + const uint16_t* kv_a, + const uint16_t* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +} // namespace turbomind diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h new file mode 100644 index 0000000000..bc06a352f9 --- /dev/null +++ b/src/turbomind/models/llama/mla_utils.h @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+#pragma once + +#include +#include + +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind { + +template +void invokeMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); + +template +void dispatchMLACopyQKV(T* qkv, + const T* q, + const T* kv_a, + const T* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + auto invoke = [&](auto x) { + using type = decltype(x); + invokeMLACopyQKV((type*)qkv, + (const type*)q, + (const type*)kv_a, + (const type*)kv_b, + token_num, + head_num, + nope_dim, + rope_dim, + kv_lora_rank, + v_head_dim, + stream); + }; + if constexpr (sizeof(T) == 2) { + return invoke(uint16_t{}); + } + FT_CHECK(0); +} + +} // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index 1ad76839d1..390d147540 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -11,22 +11,21 @@ #include "src/turbomind/utils/nvtx_utils.h" #include "src/turbomind/utils/string_utils.h" #include -#include #include namespace turbomind { template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) +void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor) { char* base = 0; auto allocate = [&](void* base) { Monotonic alloc{base}; alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); - alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * 2); - alloc(&logits_, tokens * param_.expert_num); - alloc(&masks_, param_.expert_num * padded); + alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * inter_buf_factor); + alloc(&logits_, tokens * expert_num); + alloc(&masks_, expert_num * padded); alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); @@ -80,18 +79,42 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama template void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { - const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + const int expert_num = moe.experts.size(); - AllocateBuffer(tokens, padded); + FT_CHECK(expert_num); + + const size_t inter_buf_factor = [&] { + if (param_.method == MoeParam::kNaive) { + return 0; // managed by ffn + } + else if (moe.block.is_fused_silu) { + return 1; + } + else { + return 2; + } + }(); + + AllocateBuffer(tokens, padded, expert_num, inter_buf_factor); gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); - check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); - sync_check_cuda_error(); + // if (tensor_para_.rank_ == 0) { + // Compare(logits_, tokens * expert_num, Concat("logit", layer_id), compare_mode, stream_); + // } + + check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); + check_cuda_error(cudaMemsetAsync(masks_, -1, sizeof(int8_t) * expert_num * padded, stream_)); // dump_logits(tokens, layer_id); + if (param_.topk_method == "group_limited_greedy") { + invokeMaskMoeTopKGroups(logits_, 
tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); + sync_check_cuda_error(); + } + /// TODO: fix illegal memory access even if NaN are present in logits invokeMoeGate_V2(f2n_, en2f_, @@ -102,25 +125,26 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id logits_, tokens, padded, - param_.expert_num, + expert_num, param_.experts_per_token, - param_.norm_topk, + param_.norm_topk_prob, + param_.routed_scale, stream_); sync_check_cuda_error(); if (isTuning()) { std::mt19937 g; - const auto expert_ids = SampleUniform(tokens, param_.expert_num, param_.experts_per_token, g); - std::vector cnt(param_.expert_num); + const auto expert_ids = SampleUniform(tokens, expert_num, param_.experts_per_token, g); + std::vector cnt(expert_num); for (const auto& x : expert_ids) { ++cnt[x]; } h_offsets_[0] = 0; - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { h_offsets_[i + 1] = h_offsets_[i] + cnt[i]; } check_cuda_error( - cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); } if (param_.method == MoeParam::kNaive) { @@ -129,15 +153,15 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id sync_check_cuda_error(); check_cuda_error( - cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (param_.expert_num + 1), cudaMemcpyDefault, stream_)); + cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); - if (h_offsets_[param_.expert_num] != tokens * param_.experts_per_token) { - FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[param_.expert_num], tokens * param_.experts_per_token)); + if (h_offsets_[expert_num] != tokens * param_.experts_per_token) { + FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[expert_num], tokens * param_.experts_per_token)); } - for (int i = 0; i < param_.expert_num; ++i) { + for (int i = 0; i < expert_num; ++i) { FT_CHECK(moe.experts[i].is_fused_silu == false); @@ -153,7 +177,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } } else { - context_->set_offsets(offsets_); + context_->update(expert_num, param_.experts_per_token, offsets_); auto& block = moe.block; @@ -217,7 +241,7 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id } template -void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe) { invokeMoeReduce(output, inout_buf_, @@ -227,19 +251,21 @@ void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) tokens, param_.experts_per_token, hidden_dim_, + output_scale, stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { + // std::cout << "moe all reduce " << layer_id << "\n"; ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } } template -void MoeFfnLayer::dump_logits(int token_num, int layer_id) +void MoeFfnLayer::dump_logits(int token_num, int layer_id, int expert_num) { - std::vector logits(token_num * param_.expert_num); + std::vector logits(token_num * expert_num); check_cuda_error( cudaMemcpyAsync(logits.data(), logits_, sizeof(float) * logits.size(), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); @@ -247,7 
+273,7 @@ void MoeFfnLayer::dump_logits(int token_num, int layer_id) auto ptr = logits.data(); std::cout << "layer_id: " << layer_id << std::endl; for (int i = 0; i < token_num; ++i) { - for (int e = 0; e < param_.expert_num; ++e) { + for (int e = 0; e < expert_num; ++e) { std::cout << *ptr++ << " "; } std::cout << std::endl; diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 0f1713f7b5..74c62d004b 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -9,6 +9,7 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/nccl_utils.h" +#include namespace turbomind { @@ -26,23 +27,24 @@ class MoeFfnLayer { linear_(ctx.linear.get()), allocator_(ctx.allocator.get()) { - model.inter_size = param.inter_size; + FT_CHECK(!param.expert_num.empty()); + const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); if (param_.method == MoeParam::kFused) { context_ = std::make_unique( - param.expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); + max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); } else { - expert_ffn_ = std::make_unique>(model, tp, ctx, false); + expert_ffn_ = std::make_unique>(model, tp, ctx); } - h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1), false, true); + h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); - offsets_ = (int*)allocator_->malloc(sizeof(int) * (param_.expert_num + 1)); - accum_ = (int*)allocator_->malloc(sizeof(int) * param_.expert_num * kMoeGateMaxTiles); + offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1)); + accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); } - void AllocateBuffer(size_t tokens, size_t padded); + void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor); void FreeBuffer(); @@ -53,11 +55,11 @@ class MoeFfnLayer { void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - void reduce(T* output, int tokens, const MoeFfnWeight& moe); + void reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); - void dump_logits(int token_num, int layer_id); + void dump_logits(int token_num, int layer_id, int expert_num); private: const size_t inter_size_; diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 2f99b0c2ce..7a6eddc4ba 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -19,21 +19,24 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc -#include "src/turbomind/models/llama/unified_attention_layer.h" +#include +#include + #include "src/turbomind/kernels/attention/attention.h" #include "src/turbomind/kernels/attention/decoding.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" +#include 
"src/turbomind/models/llama/mla_utils.h" +#include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" -#include -#include +#include "src/turbomind/utils/memory_utils.h" namespace turbomind { @@ -72,17 +75,14 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, } template -void UnifiedAttentionLayer::allocateBuffer(size_t q_count, - size_t k_count, - size_t batch_size, - const WeightType* weights) +void UnifiedAttentionLayer::allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - if (weights->qkv.lora.r) { - size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + weights->qkv.lora.r); + if (qkv_lora_rank) { + size_t sz = sizeof(T) * q_count * (local_q_kv_head_num * size_per_head_ + qkv_lora_rank); qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sz, false); } else { @@ -198,28 +198,38 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa allocateBuffer(token_num, // shared h_cu_k_len[batch_size] - h_cu_k_len[dc_batch_size], // prefill batch_size, - weights); + weights->qkv.lora.r); // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; - static int count = 0; + // static int count = 0; - // if (layer_id == 0 && count == 0) { - // Compare(attention_input, token_num * weights->qkv.input_dims, "qkv_input", compare_mode, stream_); + // if (tensor_para_.rank_ == 0) { + // Compare(attention_input, token_num * hidden_units_, Concat("qkv_input", layer_id), compare_mode, stream_); // } int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - ////////////////////////////////////////////// - /// qkv gemm - // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] - linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); - sync_check_cuda_error(); + + if (weights->qkv.output_dims) { + ////////////////////////////////////////////// + /// qkv gemm + // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] + linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); + } + else { + forward_mla(attention_input, token_num, *weights); + } + + // std::cerr << layer_id << " " << count << " " << tensor_para_.rank_ << "\n"; count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); - // if (layer_id == 0 && count == 0) { - // Compare(qkv_buf_, token_num * weights->qkv.output_dims, "qkv_buf", compare_mode, stream_); + // std::cerr << "token num: " << token_num << "\n"; + + // if (layer_id == 0 && count == 0 && tensor_para_.rank_ == 0) { + // Compare(qkv_buf_, token_num * (3 * local_head_num_ * size_per_head_), "qkv_buf", CMP_MODE, stream_); // } if constexpr (0) { @@ -290,8 +300,15 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.num_heads = local_head_num_; params.num_kv_heads = local_kv_head_num_; params.size_per_head = size_per_head_; + // MSVC does not have M_LOG2E - params.inv_sqrt_dh = (float)std::log2(expf(1.)) / std::sqrt((float)params.size_per_head); + params.inv_sqrt_dh = 
(float)std::log2(expf(1.)); + if (param_.softmax_scale) { // model predefined softmax scale + params.inv_sqrt_dh *= param_.softmax_scale; + } + else { // default value + params.inv_sqrt_dh /= std::sqrt((float)params.size_per_head); + } params.rotary_embedding_dim = param_.rotary_embedding_dim; params.rotary_embedding_base = param_.rotary_embedding_base; @@ -324,8 +341,9 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa }; float low, high; find_correction_range(param_.beta_fast, param_.beta_slow, low, high); + // https://github.com/huggingface/transformers/blob/6c3f168b36882f0beebaa9121eafa1928ba29633/src/transformers/modeling_rope_utils.py#L216 if (low == high) { - high += 0.01f; + high += 0.001f; } params.yarn_ramp_inv_factor_div_2 = 1.0 / (high - low) / 2.0; params.yarn_ramp_inv_factor_mul_min = 1.0 / (high - low) * low; @@ -415,8 +433,6 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); - // ++count; - count_and_fix(attention_out, token_num * weights->output.output_dims, Concat("wo", layer_id), 3); if (tensor_para_.world_size_ > 1) { @@ -425,10 +441,94 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa sync_check_cuda_error(); } + // if (tensor_para_.rank_ == 0) { + // Compare(attention_out, token_num * hidden_units_, Concat("attn_out", layer_id), compare_mode, stream_); + // // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); + // } + if (is_free_buffer_after_forward_ == true) { freeBuffer(); } sync_check_cuda_error(); + + // ++count; +} + +template +void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const WeightType& w) +{ + const int q_lora_rank = w.q_a_proj.output_dims; + const int kv_lora_rank = w.kv_b_proj.input_dims; + const int qk_rope_dim = w.kv_a_proj.output_dims - kv_lora_rank; + const int qk_nope_dim = std::max(w.q_b_proj.output_dims, w.q_proj.output_dims) / local_head_num_ - qk_rope_dim; + const int v_head_dim = w.kv_b_proj.output_dims / local_head_num_ - qk_nope_dim; + + T* q{}; + + if (w.q_proj.kernel) { + deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); + linear_->forward(q, inputs, token_num, w.q_proj); + sync_check_cuda_error(); + } + else { + T* q_a{}; + deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); + + linear_->forward(q_a, inputs, token_num, w.q_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm(q_a, + q_lora_rank, + q_a, + q_lora_rank, + w.q_a_layernorm, + q_lora_rank, + token_num, + model_param_.norm_eps, + stream_); + sync_check_cuda_error(); + + deviceMalloc((T**)&q, (size_t)token_num * w.q_b_proj.output_dims, stream_); + linear_->forward(q, q_a, token_num, w.q_b_proj); + sync_check_cuda_error(); + + deviceFree(q_a, stream_); + } + + T* kv_a{}; + const int kv_a_dim = w.kv_a_proj.output_dims; + deviceMalloc((T**)&kv_a, (size_t)token_num * kv_a_dim, stream_); + + linear_->forward(kv_a, inputs, token_num, w.kv_a_proj); + sync_check_cuda_error(); + + invokeRMSNorm( + kv_a, kv_a_dim, kv_a, kv_a_dim, w.kv_a_layernorm, kv_lora_rank, token_num, model_param_.norm_eps, stream_); + sync_check_cuda_error(); + + T* kv_b{}; + deviceMalloc((T**)&kv_b, (size_t)token_num * w.kv_b_proj.output_dims, stream_); + sync_check_cuda_error(); + + linear_->forward(kv_b, {kv_a, kv_a_dim}, token_num, w.kv_b_proj); + sync_check_cuda_error(); + + dispatchMLACopyQKV(qkv_buf_, + 
q, + kv_a, + kv_b, + token_num, + local_head_num_, + qk_nope_dim, + qk_rope_dim, + kv_lora_rank, + v_head_dim, + stream_); + sync_check_cuda_error(); + + deviceFree(q, stream_); + deviceFree(kv_a, stream_); + deviceFree(kv_b, stream_); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index da0c0e6fc8..7d331b0e41 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -42,7 +42,7 @@ class UnifiedAttentionLayer { static constexpr int kMaxWorkspaceTokens = 4096; void freeBuffer(); - void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, const WeightType* weights); + void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank); void allocateWorkspace(); void freeWorkspace(); @@ -70,7 +70,7 @@ class UnifiedAttentionLayer { const NcclParam& tp, const Context& context); - void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight* weights); + void forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights); void prefill(T* output, T* tmp_kv_buffer, @@ -107,6 +107,9 @@ class UnifiedAttentionLayer { int max_split_k, const WeightType* weights); +private: + void forward_mla(const T* inputs, int token_num, const WeightType& weights); + private: const size_t head_num_; const size_t kv_head_num_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 28e8b5f649..ec0e75b7e5 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -1,13 +1,17 @@ -#include "src/turbomind/models/llama/unified_decoder.h" + +#include + +#include "src/turbomind/kernels/norm/rms_norm.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" +#include "src/turbomind/models/llama/unified_decoder.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" -#include namespace turbomind { @@ -23,17 +27,19 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), allocator_(ctx.allocator.get()), - dtype_(getTensorType()) + tp_(tp), + dtype_(getTensorType()), + tune_layer_num_(model.tune_layer_num) { attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - if (moe.expert_num) { + if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) { moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); } - if (model.inter_size) { - ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { + ffn_layer_ = std::make_unique>(model, tp, ctx); } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); @@ -65,13 +71,13 @@ void UnifiedDecoder::freeBuffer() } template -void UnifiedDecoder::forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight) +void UnifiedDecoder::forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + 
int layer_id, + const WeightType* weight) { TensorMap inputs(*_inputs); inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); @@ -84,7 +90,7 @@ void UnifiedDecoder::forwardSelfAttn(T* attn_io, TensorMap outputs(*_outputs); outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - attn_layer_->forward(&outputs, &inputs, weight); + attn_layer_->forward(&outputs, &inputs, &weight->self_attn_weights); } template @@ -141,19 +147,15 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con const int pf_offset = dc_batch_size; - // Compare(decoder_input_output, token_num * hidden_units_, "decoder_input", kCmpRead, stream_); - - // printf("%d %f\n", (int)token_num, rmsnorm_eps_); - ///////////////////////////////////////////// /// RMSNorm - invokeRootMeanSquareNorm(decoder_output, - decoder_input_output, - weights->at(0)->self_attn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeRMSNorm(decoder_output, + decoder_input_output, + weights->at(0)->self_attn_norm_weights, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", 0), 2); @@ -161,12 +163,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con for (size_t layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous - if (isTuning() && layer != 0) { + if (isTuning() && layer >= tune_layer_num_) { continue; } - // Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); - ///////////////////////////////////////////// /// self-attention forwardSelfAttn(decoder_output, // @@ -175,18 +175,18 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con token_num, batch_size, layer, - &weights->at(layer)->self_attn_weights); + weights->at(layer)); count_and_fix(decoder_output, token_num * hidden_units_, Concat("attn_block", layer), 2); - invokeFusedAddBiasResidualRMSNorm(decoder_input_output, - decoder_output, - weights->at(layer)->self_attn_weights.output.bias, - weights->at(layer)->ffn_norm_weights, - rmsnorm_eps_, - token_num, - hidden_units_, - stream_); + invokeBiasResidualRMSNorm(decoder_input_output, + decoder_output, + weights->at(layer)->ffn_norm_weights, + weights->at(layer)->self_attn_weights.output.bias, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); count_and_fix(decoder_input_output, token_num * hidden_units_, Concat("residual0", layer), 2); @@ -195,14 +195,17 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con //////////////////////////////////////////// /// feed-forward network - if (!weights->at(layer)->moe_weights.experts.empty()) { + const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); + if (is_moe) { moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - if (ffn_layer_) { - int layer_id = layer; // int is needed + if (weights->at(layer)->ffn_weights.output.kernel) { + int layer_id = layer; // int is needed + bool all_reduce = !is_moe; TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; + {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}, + {"all_reduce", {MEMORY_CPU, TYPE_BOOL, {1}, &all_reduce}}}; TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, 
dtype_, {token_num, hidden_units_}, decoder_output}}}; if (inputs->isExist("lora_mask")) { ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); @@ -210,8 +213,8 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } - if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + if (is_moe) { + moe_ffn_layer_->reduce(decoder_output, token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); } count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index f13b4ba842..e08567136d 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -22,7 +22,9 @@ class UnifiedDecoder { const float rmsnorm_eps_; cudaStream_t const stream_; IAllocator* const allocator_; + const NcclParam tp_; const DataType dtype_; + const int tune_layer_num_; bool is_free_buffer_after_forward_{}; int* cu_q_len_{}; @@ -39,13 +41,13 @@ class UnifiedDecoder { using WeightType = LlamaDecoderLayerWeight; - void forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const LlamaAttentionWeight* weight); + void forwardSelfAttn(T* attn_io, + TensorMap* _outputs, + const TensorMap* _inputs, + size_t token_num, + size_t batch_size, + int layer_id, + const WeightType* weight); public: UnifiedDecoder(const ModelParam& model, diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h new file mode 100644 index 0000000000..bc2f49a08e --- /dev/null +++ b/src/turbomind/models/llama/weight_type.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include + +namespace turbomind { + +enum class WeightType : int +{ + kFP32, + kFP16, + kFP8, // not supported yet + kBF16, + kINT8, + kINT4 +}; + +template +constexpr WeightType get_default_weight_type() +{ + if constexpr (std::is_same_v) { + return WeightType::kFP16; + } + else if constexpr (std::is_same_v) { + return WeightType::kBF16; + } + else if constexpr (std::is_same_v) { + return WeightType::kFP32; + } + else { + static_assert(sizeof(T) != sizeof(T), "not implemented"); + return {}; + } +} + +inline size_t getBitSize(WeightType type) +{ + switch (type) { + case WeightType::kFP32: + return 32; + case WeightType::kFP16: + return 16; + case WeightType::kFP8: + return 8; + case WeightType::kBF16: + return 16; + case WeightType::kINT8: + return 8; + case WeightType::kINT4: + return 4; + } + return 0; +} + +} // namespace turbomind diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 4eb34249ff..5a344d9545 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -215,6 +215,51 @@ DLTensor GetDLTensor(py::object obj) return dlmt->dl_tensor; } +static void safe_memcpy(void* dst, const void* src, size_t size) +{ + cudaPointerAttributes dat{}; + cudaPointerAttributes sat{}; + ft::check_cuda_error(cudaPointerGetAttributes(&dat, dst)); + ft::check_cuda_error(cudaPointerGetAttributes(&sat, src)); + try { + if (dat.devicePointer && sat.devicePointer) { + // Both can be accessed from current context + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + else if (dat.type == cudaMemoryTypeDevice && sat.type == 
cudaMemoryTypeDevice) { + if (dat.device != sat.device) { + // On different devices, try peer memcpy + ft::check_cuda_error(cudaMemcpyPeer(dst, dat.device, src, sat.device, size)); + } + else { + // Same device, switch to the device first (this is unlikely) + ft::CudaDeviceGuard guard(dat.device); + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + else { + // Unknown case, give it a try anyway + ft::check_cuda_error(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + } + } + catch (...) { + int device_id{-1}; + cudaGetDevice(&device_id); + TM_LOG_ERROR("cudaMemcpy failed: dst=(%d, %d, %p, %p), src=(%d, %d, %p, %p), size=%s, device=%d", + (int)dat.type, + dat.device, + dat.devicePointer, + dat.hostPointer, + (int)sat.type, + sat.device, + sat.devicePointer, + sat.hostPointer, + std::to_string(size).c_str(), + device_id); + throw; + } +} + PYBIND11_MODULE(_turbomind, m) { // nccl param @@ -293,8 +338,7 @@ PYBIND11_MODULE(_turbomind, m) std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - cudaMemcpy( - const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); + safe_memcpy(const_cast(self->data), src->data, num_bytes); break; } default: diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 2deca46380..1c7c5eb468 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -256,22 +256,30 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_param_.kv_head_num = model_reader["kv_head_num"].as(0); model_param_.hidden_units = model_reader["hidden_units"].as(); model_param_.layer_num = model_reader["num_layer"].as(); - model_param_.inter_size = model_reader["inter_size"].as(); model_param_.vocab_size = model_reader["vocab_size"].as(); model_param_.embedding_size = model_reader["embedding_size"].as(); model_param_.norm_eps = model_reader["norm_eps"].as(); model_param_.start_id = model_reader["start_id"].as(); model_param_.end_id = model_reader["end_id"].as(); + model_param_.tune_layer_num = model_reader["tune_layer_num"].as(1); + model_param_.mla.q_lora_rank = model_reader["q_lora_rank"].as(); + model_param_.mla.kv_lora_rank = model_reader["kv_lora_rank"].as(); + model_param_.mla.qk_rope_dim = model_reader["qk_rope_dim"].as(); + model_param_.mla.v_head_dim = model_reader["v_head_dim"].as(); attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); model_param_.quant_policy = engine_reader["quant_policy"].as(0); - + YAML::Node inter_size = model_reader["inter_size"]; + for (auto it = inter_size.begin(); it != inter_size.end(); ++it) { + model_param_.inter_size.push_back(it->as()); + } // Only weight classes need these - attn_bias_ = model_reader["attn_bias"].as(0); - group_size_ = model_reader["group_size"].as(0); + model_param_.attn_bias = model_reader["attn_bias"].as(0); + model_param_.group_size = model_reader["group_size"].as(0); // rotary embedding parameters attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); attn_param_.rotary_embedding_base = attention_reader["rope_theta"].as(10000.0f); + attn_param_.softmax_scale = attention_reader["softmax_scale"].as(0); attn_param_.attention_factor = attention_reader["attention_factor"].as(-1.f); attn_param_.beta_fast = 
attention_reader["beta_fast"].as(32.f); attn_param_.beta_slow = attention_reader["beta_slow"].as(1.f); @@ -297,19 +305,27 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); - lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); - lora_param_.r = lora_reader["lora_r"].as(0); - lora_param_.scale = lora_reader["lora_scale"].as(0); - lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); - lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), + lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.r = lora_reader["lora_r"].as(0); + lora_param_.scale = lora_reader["lora_scale"].as(0); + lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); + lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), [](const std::string& s) { return std::stoi(s); }); - lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), + lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), [](const std::string& s) { return std::stof(s); }); - moe_param_.expert_num = model_reader["expert_num"].as(0); + moe_param_.experts_per_token = model_reader["experts_per_token"].as(0); moe_param_.inter_size = model_reader["expert_inter_size"].as(0); - moe_param_.shared_gate = model_reader["moe_shared_gate"].as(0); - moe_param_.norm_topk = model_reader["moe_norm_topk"].as(false); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as(); + moe_param_.norm_topk_prob = model_reader["norm_topk_prob"].as(); + moe_param_.routed_scale = model_reader["routed_scale"].as(1.f); + moe_param_.topk_group = model_reader["topk_group"].as(1); + moe_param_.topk_method = model_reader["topk_method"].as("greedy"); + moe_param_.n_group = model_reader["moe_group_num"].as(1); + YAML::Node expert_num = model_reader["expert_num"]; + for (auto it = expert_num.begin(); it != expert_num.end(); ++it) { + moe_param_.expert_num.push_back(it->as()); + } handleMissingParams(); @@ -321,19 +337,19 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - weight_type_ = ft::WeightType::kFP16; + model_param_.weight_type = ft::WeightType::kFP16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - weight_type_ = ft::WeightType::kBF16; + model_param_.weight_type = ft::WeightType::kBF16; } else if (weight_type_str == "fp32") { - weight_type_ = ft::WeightType::kFP32; + model_param_.weight_type = ft::WeightType::kFP32; } else if (weight_type_str == "int8") { - weight_type_ = ft::WeightType::kINT8; + model_param_.weight_type = ft::WeightType::kINT8; } else if (weight_type_str == "int4") { - weight_type_ = ft::WeightType::kINT4; + model_param_.weight_type = ft::WeightType::kINT4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; @@ -418,21 +434,8 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - weights_[device_id] = std::make_shared>(model_param_.head_num, - model_param_.kv_head_num, - model_param_.head_dim, - 
model_param_.hidden_units, - model_param_.inter_size, - model_param_.vocab_size, - model_param_.embedding_size, - model_param_.layer_num, - attn_bias_, - weight_type_, - group_size_, - lora_param_, - moe_param_, - tensor_para_size_, - tensor_para_rank); + weights_[device_id] = std::make_shared>( + model_param_, lora_param_, moe_param_, tensor_para_size_, tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { weights_[device_id]->loadModel(model_dir_); @@ -488,9 +491,11 @@ std::string LlamaTritonModel::toString() std::stringstream ss; ss << "Model: " // << "\nhead_num: " << model_param_.head_num << "\nkv_head_num: " << model_param_.kv_head_num - << "\nsize_per_head: " << model_param_.head_dim << "\ninter_size: " << model_param_.inter_size + << "\nsize_per_head: " + << model_param_.head_dim + // << "\ninter_size: " << model_param_.inter_size << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size - << "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_param_.max_batch_size + << "\nattn_bias: " << model_param_.attn_bias << "\nmax_batch_size: " << engine_param_.max_batch_size << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num << "\nmax_context_token_num: " << engine_param_.max_context_token_num << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter @@ -501,8 +506,9 @@ std::string LlamaTritonModel::toString() << "\nenable_prefix_caching: " << engine_param_.enable_prefix_caching << "\nstart_id: " << model_param_.start_id << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_ - << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy - << "\ngroup_size: " << group_size_ << "\nexpert_num: " << moe_param_.expert_num + << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy << "\ngroup_size: " + << model_param_.group_size + // << "\nexpert_num: " << moe_param_.expert_num << "\nexpert_per_token: " << moe_param_.experts_per_token << "\nmoe_method: " << moe_param_.method << std::endl; return ss.str(); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 19a143e721..a6c1b862ac 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -91,9 +91,6 @@ struct LlamaTritonModel: public AbstractTransformerModel { ft::EngineParam engine_param_; size_t tensor_para_size_; size_t pipeline_para_size_; - ft::WeightType weight_type_; - bool attn_bias_; - int group_size_; std::shared_ptr shared_state_; // Weights & engine instances for the ranks diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index bdcb9bfc46..88c299c3de 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -281,7 +281,8 @@ class Allocator: public IAllocator { pointer_mapping_.erase(address); } else { - TM_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address); + FT_CHECK_WITH_INFO(0, + fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); } } *ptr = nullptr; diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 2148fcc164..8311e6eb9e 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -483,5 +483,24 @@ void compareTwoTensor( bool 
is_16xx_series(const char* name); +class CudaDeviceGuard { +public: + CudaDeviceGuard(int device) + { + cudaGetDevice(&last_device_id_); + if (device != last_device_id_) { + cudaSetDevice(device); + } + } + + ~CudaDeviceGuard() + { + cudaSetDevice(last_device_id_); + } + +private: + int last_device_id_{-1}; +}; + /* ************************** end of common utils ************************** */ } // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index f8bfb8efe0..e9a79ea5a1 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -26,77 +26,71 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize) +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize) { - FT_CHECK_WITH_INFO(size >= ((size_t)0), "Ask deviceMalloc size " + std::to_string(size) + "< 0 is invalid."); - check_cuda_error(cudaMalloc((void**)(ptr), sizeof(T) * size)); + check_cuda_error(cudaMallocAsync((void**)(ptr), sizeof(T) * size, st)); if (is_random_initialize) { - cudaRandomUniform(*ptr, size); + cudaRandomUniform(*ptr, size, st); } } -template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(float** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(half** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_BF16 -template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_bfloat16** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif -template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize); -template void deviceMalloc(int8_t** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(uint16_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(bool** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(char** ptr, size_t size, cudaStream_t, bool is_random_initialize); +template void deviceMalloc(int8_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); #ifdef ENABLE_FP8 -template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_initialize); +template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, cudaStream_t, bool is_random_initialize); #endif template -void deviceMemSetZero(T* ptr, size_t size) -{ - check_cuda_error(cudaMemset(static_cast(ptr), 0, sizeof(T) * size)); -} - -template void deviceMemSetZero(float* ptr, size_t size); -template void deviceMemSetZero(half* ptr, size_t size); -template void deviceMemSetZero(int* ptr, size_t size); -template void deviceMemSetZero(uint32_t* ptr, size_t size); -template void deviceMemSetZero(bool* ptr, size_t size); -#ifdef ENABLE_FP8 -template void deviceMemSetZero(__nv_fp8_e4m3* ptr, size_t size); -#endif -#ifdef ENABLE_BF16 -template void deviceMemSetZero(__nv_bfloat16* ptr, size_t size); -#endif - -template -void deviceFree(T*& ptr) +void deviceFree(T*& ptr, cudaStream_t 
st) { if (ptr != NULL) { - check_cuda_error(cudaFree(ptr)); + check_cuda_error(cudaFreeAsync(ptr, st)); ptr = NULL; } } -template void deviceFree(float*& ptr); -template void deviceFree(half*& ptr); +template void deviceFree(float*& ptr, cudaStream_t); +template void deviceFree(half*& ptr, cudaStream_t); #ifdef ENABLE_BF16 -template void deviceFree(__nv_bfloat16*& ptr); +template void deviceFree(__nv_bfloat16*& ptr, cudaStream_t); #endif -template void deviceFree(unsigned short*& ptr); -template void deviceFree(int*& ptr); -template void deviceFree(bool*& ptr); -template void deviceFree(char*& ptr); -template void deviceFree(int8_t*& ptr); +template void deviceFree(unsigned short*& ptr, cudaStream_t); +template void deviceFree(int*& ptr, cudaStream_t); +template void deviceFree(bool*& ptr, cudaStream_t); +template void deviceFree(char*& ptr, cudaStream_t); +template void deviceFree(int8_t*& ptr, cudaStream_t); +template void deviceFree(void*& ptr, cudaStream_t); #ifdef ENABLE_FP8 -template void deviceFree(__nv_fp8_e4m3*& ptr); +template void deviceFree(__nv_fp8_e4m3*& ptr, cudaStream_t); #endif +namespace { + +template +__global__ void fill_kernel(T* devptr, size_t size, T value) +{ + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + for (size_t i = idx; i < size; i += blockDim.x * gridDim.x) { + devptr[i] = value; + } +} + +} // namespace + template void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream) { - T* arr = new T[size]; - std::fill(arr, arr + size, value); - check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream)); - delete[] arr; + constexpr int threads = 512; + const int blocks = (size + threads - 1) / threads; + fill_kernel<<>>(devptr, size, value); } template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream); @@ -280,23 +274,23 @@ __global__ void cuda_random_uniform_kernel(char* buffer, const size_t size } template -void cudaRandomUniform(T* buffer, const size_t size) +void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t st) { static int seq_offset = 0; - cuda_random_uniform_kernel<<<256, 256>>>(buffer, size, seq_offset); + cuda_random_uniform_kernel<<<256, 256, 0, st>>>(buffer, size, seq_offset); seq_offset += 256 * 256; } -template void cudaRandomUniform(float* buffer, const size_t size); -template void cudaRandomUniform(half* buffer, const size_t size); +template void cudaRandomUniform(float* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(half* buffer, const size_t size, cudaStream_t); #ifdef ENABLE_BF16 -template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size); +template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size, cudaStream_t); #endif -template void cudaRandomUniform(int* buffer, const size_t size); -template void cudaRandomUniform(bool* buffer, const size_t size); -template void cudaRandomUniform(char* buffer, const size_t size); +template void cudaRandomUniform(int* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(bool* buffer, const size_t size, cudaStream_t); +template void cudaRandomUniform(char* buffer, const size_t size, cudaStream_t); #ifdef ENABLE_FP8 -template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size); +template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size, cudaStream_t); #endif // loads data from binary file. If it succeeds, returns a non-empty vector. 
If loading fails or @@ -366,10 +360,10 @@ int loadWeightFromBinFunc(T* ptr, std::vector shape, std::string filenam } else { T_IN* ptr_2 = nullptr; - deviceMalloc(&ptr_2, host_array.size(), false); + deviceMalloc(&ptr_2, host_array.size(), nullptr, false); cudaH2Dcpy(ptr_2, host_array.data(), host_array.size()); invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size()); - deviceFree(ptr_2); + deviceFree(ptr_2, nullptr); } return 0; } diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h index bb7a4f9c03..03a0ef7b33 100644 --- a/src/turbomind/utils/memory_utils.h +++ b/src/turbomind/utils/memory_utils.h @@ -23,16 +23,13 @@ namespace turbomind { template -void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true); +void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize = false); template -void deviceMemSetZero(T* ptr, size_t size); +void deviceFree(T*& ptr, cudaStream_t st); template -void deviceFree(T*& ptr); - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = 0); +void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = {}); template void cudaD2Hcpy(T* tgt, const T* src, const size_t size); @@ -44,10 +41,10 @@ template void cudaD2Dcpy(T* tgt, const T* src, const size_t size); template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = NULL); +void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = {}); template -void cudaRandomUniform(T* buffer, const size_t size); +void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t stream = {}); template int loadWeightFromBin(T* ptr,