From 08ecad804830b95ffd0d85d424feecff701d2611 Mon Sep 17 00:00:00 2001 From: grimoire Date: Mon, 4 Dec 2023 20:20:56 +0800 Subject: [PATCH 01/22] Add bf16 template sp --- CMakeLists.txt | 8 +- lmdeploy/turbomind/deploy/converter.py | 27 + .../turbomind/deploy/target_model/__init__.py | 1 + .../turbomind/deploy/target_model/base.py | 16 +- .../turbomind/deploy/target_model/bf16.py | 5 + ...er_masked_multihead_attention_template.cuh | 83 +++ ...decoder_masked_multihead_attention_utils.h | 528 ++++++++++++++++++ .../decoder_multihead_attention.cu | 4 +- .../decoder_multihead_attention/kv_cache.cu | 18 +- src/turbomind/models/llama/LlamaBatch.cc | 3 + .../llama/LlamaContextAttentionLayer.cc | 3 + .../models/llama/LlamaContextDecoder.cc | 3 + src/turbomind/models/llama/LlamaDecoder.cc | 3 + .../models/llama/LlamaDecoderLayerWeight.cc | 3 + .../llama/LlamaDecoderSelfAttentionLayer.cc | 3 + src/turbomind/models/llama/LlamaFfnLayer.cc | 3 + src/turbomind/models/llama/LlamaV2.cc | 3 + src/turbomind/models/llama/LlamaWeight.cc | 3 + .../llama/flash_attention2/CMakeLists.txt | 1 + .../llama/flash_attention2/flash_api.cpp | 7 +- .../flash_fwd_hdim128_bf16_sm80.cu | 13 + .../flash_fwd_hdim256_bf16_sm80.cu | 13 + .../flash_fwd_hdim32_bf16_sm80.cu | 13 + .../flash_fwd_hdim64_bf16_sm80.cu | 13 + .../llama_flash_attention_kernel.cu | 28 +- .../models/llama/llama_decoder_kernels.cu | 30 +- src/turbomind/models/llama/llama_kernels.cu | 44 ++ src/turbomind/python/bind.cpp | 11 + .../triton_backend/llama/LlamaTritonModel.cc | 3 + .../llama/LlamaTritonModelInstance.cc | 3 + 30 files changed, 879 insertions(+), 17 deletions(-) create mode 100644 lmdeploy/turbomind/deploy/target_model/bf16.py create mode 100644 src/turbomind/models/llama/flash_attention2/flash_fwd_hdim128_bf16_sm80.cu create mode 100644 src/turbomind/models/llama/flash_attention2/flash_fwd_hdim256_bf16_sm80.cu create mode 100644 src/turbomind/models/llama/flash_attention2/flash_fwd_hdim32_bf16_sm80.cu create mode 100644 src/turbomind/models/llama/flash_attention2/flash_fwd_hdim64_bf16_sm80.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 27b6b150e7..5743ff78e3 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,10 +17,10 @@ project(TurboMind LANGUAGES CXX CUDA) find_package(CUDA 10.2 REQUIRED) -# if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11") -# add_definitions("-DENABLE_BF16") -# message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag") -# endif() +if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11") + add_definitions("-DENABLE_BF16") + message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag") +endif() # if((${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11" AND ${CUDA_VERSION_MINOR} VERSION_GREATER_EQUAL "8") OR (${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "12")) # add_definitions("-DENABLE_FP8") diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 5bcab7b537..7ba0e048be 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -113,6 +113,30 @@ def copy_tokenizer(model_path: str, tokenizer_path: str, osp.join(triton_models_path, 'tokenizer')) +def update_output_format(model_name: str, model_format: str, model_path: str, + output_format: str): + """Update output format according to model info.""" + TORCH_DTYPE_MAP = {'bfloat16': 'bf16'} + MODEL_NAME_MAP = {'qwen': 'bf16'} + model_name = model_name.split('-')[0] + 
+ def _infer_output_format(config): + """_infer_output_format.""" + torch_dtype = getattr(config, 'torch_dtype', None) + if torch_dtype: + return TORCH_DTYPE_MAP.get(torch_dtype, output_format) + else: + # get model name prefix + return MODEL_NAME_MAP.get(model_name, output_format) + + if model_format != 'hf': + return MODEL_NAME_MAP.get(model_name, output_format) + else: + from transformers import AutoConfig + config = AutoConfig.from_pretrained(model_path) + return _infer_output_format(config) + + def pack_model_repository(workspace_path: str): """package the model repository. @@ -215,6 +239,9 @@ def main(model_name: str, cfg.weight_type = 'int4' output_format = 'w4' assert group_size > 0, f'group_size: {group_size} should > 0' + else: + output_format = update_output_format(model_name, inferred_model_format, + model_path, output_format) # convert print('model_name ', model_name) diff --git a/lmdeploy/turbomind/deploy/target_model/__init__.py b/lmdeploy/turbomind/deploy/target_model/__init__.py index fe03500e45..2c15fe2514 100644 --- a/lmdeploy/turbomind/deploy/target_model/__init__.py +++ b/lmdeploy/turbomind/deploy/target_model/__init__.py @@ -1,3 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .bf16 import TurbomindBF16Model # noqa: F401 from .fp import TurbomindModel # noqa: F401 from .w4 import TurbomindW4Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 92e6232301..eee724cb82 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -136,19 +136,25 @@ def export_config(self) -> None: def export_weight(self, param: torch.Tensor, name: str) -> None: """export turbomind weight.""" + + def _tofile(tensor, path): + """to file.""" + if tensor.dtype == torch.bfloat16: + tensor = tensor.view(torch.half) + tensor.contiguous().cpu().numpy().tofile(path) + if self.to_file: - if param.dtype in [torch.float, torch.bfloat16]: + if param.dtype in [torch.float]: param = param.half() tprint(name, param.shape) - param.contiguous().cpu().numpy().tofile( - osp.join(self.out_dir, name)) + _tofile(param, osp.join(self.out_dir, name)) elif len(self.tm_params) > 0: tm_params = self.tm_params weight_type = self.cfg.weight_type assert weight_type in ['fp16', 'fp32', 'int4'] # currently, the tensor type should in - # [torch.float, torch.half, torch.int32] + # [torch.float, torch.half, torch.bfloat16, torch.int32] torch_tensor = param.cuda().contiguous() assert torch_tensor.dtype in [ torch.int32, torch.float, torch.half, torch.bfloat16 @@ -156,6 +162,8 @@ def export_weight(self, param: torch.Tensor, name: str) -> None: if torch_tensor.dtype != torch.int32: if weight_type in ['fp16', 'int4']: torch_tensor = torch_tensor.half() + elif weight_type == 'bf16': + torch_tensor = torch_tensor.bfloat16() else: torch_tensor = torch_tensor.float() for tm_tensor in tm_params[name]: diff --git a/lmdeploy/turbomind/deploy/target_model/bf16.py b/lmdeploy/turbomind/deploy/target_model/bf16.py new file mode 100644 index 0000000000..5302d42378 --- /dev/null +++ b/lmdeploy/turbomind/deploy/target_model/bf16.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base import OUTPUT_MODELS +from .fp import TurbomindModel + +TurbomindBF16Model = OUTPUT_MODELS.register_module(name='bf16')(TurbomindModel) diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh index 85ece1fa99..31dbe147c0 100644 --- a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh +++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh @@ -590,6 +590,39 @@ __inline__ __device__ uint4 vec_conversion(const Float8_& a) return b; } +#ifdef ENABLE_BF16 +template<> +__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2& a) +{ + return __float22bfloat162_rn(a); +} +template<> +__inline__ __device__ bf16_4_t vec_conversion(const float4& a) +{ + bf16_4_t b; + float2 val; + val.x = a.x; + val.y = a.y; + b.x = vec_conversion<__nv_bfloat162, float2>(val); + + val.x = a.z; + val.y = a.w; + b.y = vec_conversion<__nv_bfloat162, float2>(val); + + return b; +} +template<> +__inline__ __device__ bf16_8_t vec_conversion(const Float8_& a) +{ + bf16_8_t b; + b.x = vec_conversion<__nv_bfloat162, float2>(a.x); + b.y = vec_conversion<__nv_bfloat162, float2>(a.y); + b.z = vec_conversion<__nv_bfloat162, float2>(a.z); + b.w = vec_conversion<__nv_bfloat162, float2>(a.w); + return b; +} +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// template @@ -1053,6 +1086,56 @@ inline __device__ int64_t quant(uint4 a, const float scale, const float zp) int16[3] = quant(a.w, scale, zp); return int64; } + + +// bfloat16 to int8 +inline __device__ int8_t quant(__nv_bfloat16 a, const float scale, const float zp) +{ + int8_t int8; + float b = bfloat16_to_float(a); + int8 = round(max(-128.f, min(127.f, (b - zp) / scale))); + return int8; +} +// bfloat16x2 to int8x2 +inline __device__ int16_t quant(__nv_bfloat162 a, const float scale, const float zp) +{ + union { + int8_t int8[2]; + short int16; + }; + float2 b = bfloat162_to_float2(a); + + int8[0] = round(max(-128.f, min(127.f, (b.x - zp) / scale))); + int8[1] = round(max(-128.f, min(127.f, (b.y - zp) / scale))); + return int16; +} +// bfloat16x4 to int8x4 +inline __device__ int32_t quant(bf16_4_t a, const float scale, const float zp) +{ + union { + int16_t int16[2]; + int32_t int32; + }; + + int16[0] = quant(a.x, scale, zp); + int16[1] = quant(a.y, scale, zp); + return int32; +} +// bfloat16x8 to int8x8 +inline __device__ int64_t quant(bf16_8_t a, const float scale, const float zp) +{ + union { + int16_t int16[4]; + int64_t int64; + }; + + int16[0] = quant(a.x, scale, zp); + int16[1] = quant(a.y, scale, zp); + int16[2] = quant(a.z, scale, zp); + int16[3] = quant(a.w, scale, zp); + return int64; +} + // int8 to float32, then `vec_conversion` to target format inline __device__ float dequant(int8_t a, const float scale, const float zp) { diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h b/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h index 6479647799..bac144c8cc 100644 --- a/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h +++ b/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h @@ -43,6 +43,24 @@ struct Float4_ { //////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef 
ENABLE_BF16 +struct bf16_4_t { + __nv_bfloat162 x; + __nv_bfloat162 y; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct bf16_8_t { + __nv_bfloat162 x; + __nv_bfloat162 y; + __nv_bfloat162 z; + __nv_bfloat162 w; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + template struct num_elems; template<> @@ -79,6 +97,21 @@ struct num_elems { static constexpr int value = 8; }; +#ifdef ENABLE_BF16 +template<> +struct num_elems<__nv_bfloat162> { + static constexpr int value = 2; +}; +template<> +struct num_elems { + static constexpr int value = 4; +}; +template<> +struct num_elems { + static constexpr int value = 8; +}; +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// template @@ -144,6 +177,44 @@ inline __device__ float4 add(float4 a, float4 b) //////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) +{ + return a + b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) +{ + return bf16hadd2(a, b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_4_t add(bf16_4_t a, bf16_4_t b) +{ + bf16_4_t c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_8_t add(bf16_8_t a, bf16_8_t b) +{ + bf16_8_t c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} +#endif // ENABLE_BF16 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline __device__ uint16_t add(uint16_t a, uint16_t b) { uint16_t c; @@ -236,6 +307,26 @@ inline __device__ float2 half2_to_float2(uint32_t v) //////////////////////////////////////////////////////////////////////////////////////////////////// +inline __device__ float bfloat16_to_float(__nv_bfloat16 h) +{ + return __bfloat162float(h); + // float f; + // asm volatile("cvt.f32.bf16 %0, %1;\n" : "=f"(f) : "h"(h)); + // return f; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 bfloat162_to_float2(__nv_bfloat162 v) +{ + return __bfloat1622float2(v); + // __nv_bfloat16 lo, hi; + // asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); + // return make_float2(bfloat16_to_float(lo), bfloat16_to_float(hi)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline __device__ float add(float a, uint16_t b) { return a + half_to_float(b); @@ -243,6 +334,15 @@ inline __device__ float add(float a, uint16_t b) //////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +inline __device__ float add(float a, __nv_bfloat16 b) +{ + return a + __bfloat162float(b); +} +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline __device__ float2 add(uint32_t a, float2 fb) { float2 fa = half2_to_float2(a); @@ -367,6 +467,37 @@ inline __device__ 
Float8_ fma(float a, Float8_ b, Float8_ c) //////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +inline __device__ float2 add(__nv_bfloat162 a, float2 fb) +{ + float2 fa = bf1622float2(a); + return add(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ add(bf16_4_t a, Float4_ fb) +{ + Float4_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) +{ + Float8_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + fc.z = add(a.z, fb.z); + fc.w = add(a.w, fb.w); + return fc; +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) { uint32_t d; @@ -498,6 +629,134 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) return fd; } +//////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) +{ + return bf16hfma2(a, b, c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) +{ + return bf16hfma2(bf162bf162(a), b, c); +} +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_4_t fma(bf16_4_t a, bf16_4_t b, bf16_4_t c) +{ + bf16_4_t d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_4_t fma(__nv_bfloat16 a, bf16_4_t b, bf16_4_t c) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_4_t d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_8_t fma(bf16_8_t a, bf16_8_t b, bf16_8_t c) +{ + bf16_8_t d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_8_t fma(__nv_bfloat16 a, bf16_8_t b, bf16_8_t c) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_8_t d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + d.z = fma(s, b.z, c.z); + d.w = fma(s, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float fma(__nv_bfloat16 a, __nv_bfloat16 b, float fc) +{ + return __bfloat162float(a) * __bfloat162float(b) + fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(__nv_bfloat162 a, __nv_bfloat162 b, float2 fc) +{ + float2 fa = bf1622float2(a); + float2 fb = bf1622float2(b); + return fma(fa, fb, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(__nv_bfloat16 a, __nv_bfloat162 b, float2 fc) +{ + 
return fma(bf162bf162(a), b, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(bf16_4_t a, bf16_4_t b, Float4_ fc) +{ + Float4_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(__nv_bfloat16 a, bf16_4_t b, Float4_ fc) +{ + __nv_bfloat162 s = bf162bf162(a); + Float4_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(bf16_8_t a, bf16_8_t b, Float8_ fc) +{ + Float8_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + fd.z = fma(a.z, b.z, fc.z); + fd.w = fma(a.w, b.w, fc.w); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) +{ + __nv_bfloat162 s = bf162bf162(a); + Float8_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + fd.z = fma(s, b.z, fc.z); + fd.w = fma(s, b.w, fc.w); + return fd; +} +#endif // ENABLE_BF16 //////////////////////////////////////////////////////////////////////////////////////////////////// template @@ -746,6 +1005,171 @@ inline __device__ float sum(float v) //////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +template<> +inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __hmul(a, b); +#else + return bf16hmul(a, b); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) +{ + return bf16hmul2(a, b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) +{ + return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) +{ + bf16_4_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_4_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) +{ + bf16_8_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); + c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.z, b.z); + c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.w, b.w); + return c; +} + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_8_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); + c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.z); + c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) +{ + float fa = (float)a; + float fb = (float)b; + return fa * fb; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(__nv_bfloat16 a, float b) +{ + return __bfloat162float(a) * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) +{ + float2 fa = bf1622float2(a); + float2 fb = bf1622float2(b); + return mul(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) +{ + return mul(bf162bf162(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) +{ + Float4_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + Float4_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) +{ + Float8_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + fc.z = mul(a.z, b.z); + fc.w = mul(a.w, b.w); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + Float8_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + fc.z = mul(s, b.z); + fc.w = mul(s, b.w); + return fc; +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline __device__ float sum(float2 v) { return v.x + v.y; @@ -758,6 +1182,31 @@ inline __device__ float sum(float4 v) return v.x + v.y + v.z + v.w; } +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ float sum(__nv_bfloat162 v) +{ + float2 vf = bf1622float2(v); + return vf.x + vf.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(bf16_4_t v) +{ + return sum(v.x) + sum(v.y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(bf16_8_t v) +{ + return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w); 
+} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline __device__ float sum(uint16_t v) { return half_to_float(v); @@ -1048,4 +1497,83 @@ inline __device__ void apply_rotary_embedding(uint4& q, uint4& k, int tid, int r k.w = rotary_embedding_transform(k.w, coef3); } +#ifdef ENABLE_BF16 +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, int tid, int rot_embed_dim, float base, int t_step) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, t_step); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void +apply_rotary_embedding(__nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, float base, int t_step) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, base, t_step); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, float base, int t_step) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, t_step); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, t_step); + q.y = rotary_embedding_transform(q.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, float base, int t_step) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, base, t_step); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, base, t_step); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, float base, int t_step) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, t_step); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, t_step); + q.y = rotary_embedding_transform(q.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, t_step); + q.z = rotary_embedding_transform(q.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, t_step); + q.w = rotary_embedding_transform(q.w, coef3); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, float base, int t_step) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, base, t_step); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, base, t_step); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, base, t_step); + q.z = rotary_embedding_transform(q.z, coef2); + k.z = rotary_embedding_transform(k.z, coef2); + const auto coef3 = 
rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, base, t_step); + q.w = rotary_embedding_transform(q.w, coef3); + k.w = rotary_embedding_transform(k.w, coef3); +} +#endif // ENABLE_BF16 } // namespace mmha diff --git a/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu b/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu index 02cc827694..08179b6da6 100644 --- a/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu +++ b/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu @@ -111,5 +111,7 @@ void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& template void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& params); template void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& params); - +#ifdef ENABLE_BF16 +template void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams<__nv_bfloat16>& params); +#endif } // namespace turbomind diff --git a/src/turbomind/kernels/decoder_multihead_attention/kv_cache.cu b/src/turbomind/kernels/decoder_multihead_attention/kv_cache.cu index d9a46c40a7..35406eda6e 100644 --- a/src/turbomind/kernels/decoder_multihead_attention/kv_cache.cu +++ b/src/turbomind/kernels/decoder_multihead_attention/kv_cache.cu @@ -477,5 +477,21 @@ template void ConvertKvCacheBlocksToLinear2(const void** src_k_block_ptrs, int quant_policy, const float* kv_params, cudaStream_t st); - +#ifdef ENABLE_BF16 +template void ConvertKvCacheBlocksToLinear2(const void** src_k_block_ptrs, + const void** src_v_block_ptrs, + __nv_bfloat16** dst_k_ptrs, + __nv_bfloat16** dst_v_ptrs, + const int* src_cu_block_cnts, + const int* seq_lens, + int src_offset, + int src_block_len, + int dst_block_len, + int head_num, + int head_dim, + int batch_size, + int quant_policy, + const float* kv_params, + cudaStream_t st); +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 76394e862c..8bd263387d 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1523,5 +1523,8 @@ void LlamaBatch::OutputThreadEntry() template class LlamaBatch; template class LlamaBatch; +#ifdef ENABLE_BF16 +template class LlamaBatch<__nv_bfloat16>; +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc index 32565d3694..53f2d7730e 100644 --- a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc +++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc @@ -474,5 +474,8 @@ void LlamaContextAttentionLayer::unfusedMultiHeadAttention(T** key_c template class LlamaContextAttentionLayer; template class LlamaContextAttentionLayer; +#ifdef ENABLE_BF16 +template class LlamaContextAttentionLayer<__nv_bfloat16>; +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaContextDecoder.cc b/src/turbomind/models/llama/LlamaContextDecoder.cc index 268ff7ab58..ed97f0d62e 100644 --- a/src/turbomind/models/llama/LlamaContextDecoder.cc +++ b/src/turbomind/models/llama/LlamaContextDecoder.cc @@ -294,5 +294,8 @@ void LlamaContextDecoder::forward(std::unordered_map* template class LlamaContextDecoder; template class LlamaContextDecoder; +#ifdef ENABLE_BF16 +template class LlamaContextDecoder<__nv_bfloat16>; +#endif } // namespace turbomind diff --git 
a/src/turbomind/models/llama/LlamaDecoder.cc b/src/turbomind/models/llama/LlamaDecoder.cc index 071e2e0e0c..5de4b373f9 100644 --- a/src/turbomind/models/llama/LlamaDecoder.cc +++ b/src/turbomind/models/llama/LlamaDecoder.cc @@ -262,5 +262,8 @@ void LlamaDecoder::forward(std::unordered_map* ou template class LlamaDecoder; template class LlamaDecoder; +#ifdef ENABLE_BF16 +template class LlamaDecoder<__nv_bfloat16>; +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index ab3eb783c4..34c0abf86d 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -377,5 +377,8 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) template struct LlamaDecoderLayerWeight; template struct LlamaDecoderLayerWeight; +#ifdef ENABLE_BF16 +template struct LlamaDecoderLayerWeight<__nv_bfloat16>; +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc index 7352c19c52..210ba7429a 100644 --- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc +++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc @@ -231,5 +231,8 @@ void LlamaDecoderSelfAttentionLayer::forward(TensorMap* o template class LlamaDecoderSelfAttentionLayer; template class LlamaDecoderSelfAttentionLayer; +#ifdef ENABLE_BF16 +template class LlamaDecoderSelfAttentionLayer<__nv_bfloat16>; +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 0d78dc4e80..42575af665 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -125,5 +125,8 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, template class LlamaFfnLayer; template class LlamaFfnLayer; +#ifdef ENABLE_BF16 +template class LlamaFfnLayer<__nv_bfloat16>; +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 79abc772ce..54185dd5f7 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -578,5 +578,8 @@ void LlamaV2::forward(std::unordered_map* outputs, template class LlamaV2; template class LlamaV2; +#ifdef ENABLE_BF16 +template class LlamaV2<__nv_bfloat16>; +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index e270d3ba5c..66b5e99e86 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -140,5 +140,8 @@ TensorMap LlamaWeight::getParams() template struct LlamaWeight; template struct LlamaWeight; +#ifdef ENABLE_BF16 +template struct LlamaWeight<__nv_bfloat16>; +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt index 1a1fe37eaa..d41c391e9d 100644 --- a/src/turbomind/models/llama/flash_attention2/CMakeLists.txt +++ b/src/turbomind/models/llama/flash_attention2/CMakeLists.txt @@ -7,6 +7,7 @@ add_library(${PROJECT_NAME} STATIC # flash_fwd_hdim32_fp16_sm80.cu # flash_fwd_hdim64_fp16_sm80.cu flash_fwd_hdim128_fp16_sm80.cu + flash_fwd_hdim128_bf16_sm80.cu # flash_fwd_hdim256_fp16_sm80.cu ) target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR} / include) diff --git 
a/src/turbomind/models/llama/flash_attention2/flash_api.cpp b/src/turbomind/models/llama/flash_attention2/flash_api.cpp index 55bc92c1ff..f0407e04a4 100644 --- a/src/turbomind/models/llama/flash_attention2/flash_api.cpp +++ b/src/turbomind/models/llama/flash_attention2/flash_api.cpp @@ -12,7 +12,7 @@ void run_mha_fwd(Flash_fwd_params& params, cudaStream_t stream) { - FP16_SWITCH(true, + FP16_SWITCH(!params.is_bf16, [&] { FWD_HEADDIM_SWITCH(params.d, [&] { run_mha_fwd_(params, stream); }); }); } @@ -126,7 +126,7 @@ class FlashAttentionOpImpl::impl { fwd_params.blockmask = reinterpret_cast(params.mask); - fwd_params.is_bf16 = false; + fwd_params.is_bf16 = std::is_same::value; fwd_params.is_causal = true; fwd_params.q_enable_seqlen = params.layout_q.use_seqlens; @@ -163,5 +163,8 @@ void FlashAttentionOpImpl::operator()(Params& params, cudaStrea template class FlashAttentionOpImpl; template class FlashAttentionOpImpl; +#ifdef ENABLE_BF16 +template class FlashAttentionOpImpl<__nv_bfloat16, FMHA_VERSION>; +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim128_bf16_sm80.cu b/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim128_bf16_sm80.cu new file mode 100644 index 0000000000..5b63fe086f --- /dev/null +++ b/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim128_bf16_sm80.cu @@ -0,0 +1,13 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +#ifdef ENABLE_BF16 +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) +{ + run_mha_fwd_hdim128(params, stream); +} +#endif diff --git a/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim256_bf16_sm80.cu b/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim256_bf16_sm80.cu new file mode 100644 index 0000000000..00dbdc700b --- /dev/null +++ b/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim256_bf16_sm80.cu @@ -0,0 +1,13 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +#ifdef ENABLE_BF16 +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) +{ + run_mha_fwd_hdim256(params, stream); +} +#endif diff --git a/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim32_bf16_sm80.cu b/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim32_bf16_sm80.cu new file mode 100644 index 0000000000..f3b2df8cd0 --- /dev/null +++ b/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim32_bf16_sm80.cu @@ -0,0 +1,13 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +#ifdef ENABLE_BF16 +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) +{ + run_mha_fwd_hdim32(params, stream); +} +#endif diff --git a/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim64_bf16_sm80.cu b/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim64_bf16_sm80.cu new file mode 100644 index 0000000000..638d86cb71 --- /dev/null +++ b/src/turbomind/models/llama/flash_attention2/flash_fwd_hdim64_bf16_sm80.cu @@ -0,0 +1,13 @@ +// Copyright (c) 2023, Tri Dao. + +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_fwd_launch_template.h" + +#ifdef ENABLE_BF16 +template<> +void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) +{ + run_mha_fwd_hdim64(params, stream); +} +#endif diff --git a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu index 4fae69bd08..8fbfa007f3 100644 --- a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu +++ b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu @@ -13,6 +13,25 @@ namespace turbomind { +template +struct ToCutlassType_ { +}; + +template<> +struct ToCutlassType_ { + using Type = float; +}; + +template<> +struct ToCutlassType_ { + using Type = cutlass::half_t; +}; + +template<> +struct ToCutlassType_<__nv_bfloat16> { + using Type = cutlass::bfloat16_t; +}; + template< // dtype of Q/K/V/M typename Element_, @@ -656,7 +675,7 @@ void invokeFlashAttention_impl(int batc auto group_size = attention_params.group_size; using scalar_t = - typename std::conditional_t::type>::value, cutlass::half_t, T>; + typename ToCutlassType_::Type; const float qk_scale = static_cast(1.f / sqrtf(size_per_head * 1.f)); @@ -743,7 +762,7 @@ template bool get_needs_accum_buffer() { using scalar_t = - typename std::conditional_t::type>::value, cutlass::half_t, T>; + typename ToCutlassType_::Type; #define GET_NEED_ACCUM_BUFFER(sm) \ ATTENTION_KERNEL(scalar_t, sm, kQueriesPerBlock, kKeysPerBlock, false)::kNeedsOutputAccumulatorBuffer @@ -775,7 +794,7 @@ void invoke_attention_impl(bool single_v cudaStream_t st) { using scalar_t = - typename std::conditional_t::type>::value, cutlass::half_t, T>; + typename ToCutlassType_::Type; #define INVOKE_ATTEN_IMPL(sm, single_value) \ { \ @@ -837,7 +856,7 @@ private: static constexpr int kQueriesPerBlock = 32; static constexpr int kKeysPerBlock = 128; using scalar_t = - typename std::conditional_t::type>::value, cutlass::half_t, T>; + typename ToCutlassType_::Type; using Params = typename FlashAttentionOpImpl::Params; int batch_size_; @@ -909,5 +928,6 @@ void FlashAttentionOpImpl::operator()(Params& params, cudaStream_t st) con template class FlashAttentionOpImpl; template class FlashAttentionOpImpl; +template class FlashAttentionOpImpl<__nv_bfloat16, 1>; } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_decoder_kernels.cu b/src/turbomind/models/llama/llama_decoder_kernels.cu index 18c7f4deea..d22ec6d7d7 100644 --- a/src/turbomind/models/llama/llama_decoder_kernels.cu +++ b/src/turbomind/models/llama/llama_decoder_kernels.cu @@ -83,6 +83,32 @@ struct res_norm_ops_t { } }; +#ifdef ENABLE_BF16 +template<> +struct res_norm_ops_t<__nv_bfloat16> { + __device__ float2 cast(const uint& x) const + { + return __bfloat1622float2(reinterpret_cast(x)); + } + __device__ uint cast(const float2& x) const + { + auto y = __float22bfloat162_rn(x); + return reinterpret_cast(y); + } + __device__ float2 add(const float2& a, const float2& b, const float2& bias, float& accum) const + { + float2 c{a.x + b.x + bias.x, a.y + b.y + bias.y}; + accum += c.x * c.x + c.y * c.y; + return c; + } + __device__ float2 norm(const float2& a, const float2& s, float factor) const + { + return {a.x * s.x * factor, a.y * s.y * factor}; + } +}; + +#endif + template __device__ T blockReduceSum(const cg::thread_block& block, T value) { @@ -162,5 +188,7 @@ void invokeFusedAddBiasResidualRMSNorm( template void invokeFusedAddBiasResidualRMSNorm(float*, float*, const 
float*, const float*, float, int, int, cudaStream_t); template void invokeFusedAddBiasResidualRMSNorm(half*, half*, const half*, const half*, float, int, int, cudaStream_t); - +#ifdef ENABLE_BF16 +template void invokeFusedAddBiasResidualRMSNorm(__nv_bfloat16*, __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); +#endif } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_kernels.cu b/src/turbomind/models/llama/llama_kernels.cu index 3ce5ca86b2..8dcb14b3a6 100644 --- a/src/turbomind/models/llama/llama_kernels.cu +++ b/src/turbomind/models/llama/llama_kernels.cu @@ -88,6 +88,9 @@ void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, template void invokeRootMeanSquareNorm(float*, const float*, const float*, float, int, int, cudaStream_t); template void invokeRootMeanSquareNorm(half*, const half*, const half*, float, int, int, cudaStream_t); +#ifdef ENABLE_BF16 +template void invokeRootMeanSquareNorm(__nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); +#endif // #ifdef ENABLE_BF16 @@ -206,6 +209,9 @@ void invokeCreateCausalMasks( template void invokeCreateCausalMasks(float* mask, const int*, const int*, int, int, int, cudaStream_t); template void invokeCreateCausalMasks(half* mask, const int*, const int*, int, int, int, cudaStream_t); +#ifdef ENABLE_BF16 +template void invokeCreateCausalMasks(__nv_bfloat16* mask, const int*, const int*, int, int, int, cudaStream_t); +#endif template struct ExtendKvCache { @@ -375,6 +381,24 @@ template void invokeExtendKVCache(void** k_dst_ptrs, int quant, const float* kv_scale, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeExtendKVCache(void** k_dst_ptrs, + void** v_dst_ptrs, + const __nv_bfloat16* k_src, + const __nv_bfloat16* v_src, + const int* cu_block_counts, + const int* query_length, + const int* history_length, + int batch_size, + int block_length, + size_t dst_layer_offset, + int max_q_len, + int head_dim, + int head_num, + int quant, + const float* kv_scale, + cudaStream_t stream); +#endif template struct TransposeKvCache { @@ -525,6 +549,23 @@ template void invokeTransposeKVCache(half*, cudaStream_t stream, int, const float*); +#ifdef ENABLE_BF16 +template void invokeTransposeKVCache(__nv_bfloat16*, + __nv_bfloat16*, + const __nv_bfloat16**, + const __nv_bfloat16**, + size_t, + int, + const int*, + int, + int, + int, + int, + int, + cudaStream_t stream, + int, + const float*); +#endif __global__ void gatherOutput(int* output_ids, const int* ids, @@ -833,5 +874,8 @@ void FlashAttentionOp::operator()(Params& params, cudaStream_t st) const template class FlashAttentionOp; template class FlashAttentionOp; +#ifdef ENABLE_BF16 +template class FlashAttentionOp<__nv_bfloat16>; +#endif } // namespace turbomind diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 46e8443a86..e488a04695 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -6,6 +6,7 @@ #include "src/turbomind/utils/nccl_utils.h" #include #include +#include #include #include #include @@ -380,6 +381,16 @@ PYBIND11_MODULE(_turbomind, m) model->setFfiLock(gil_control); return model; } + if(data_type == "bf16") { +#ifdef ENABLE_BF16 + auto model = std::make_shared>( + tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); + model->setFfiLock(gil_control); + return model; +#else + throw std::runtime_error("Error: turbomind has not been built with bf16 support."); +#endif + } 
else { auto model = std::make_shared>( tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 580f6f0e7b..378cd3ceb7 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -438,3 +438,6 @@ int LlamaTritonModel::getPipelineParaSize() template struct LlamaTritonModel; template struct LlamaTritonModel; +#ifdef ENABLE_BF16 +template struct LlamaTritonModel<__nv_bfloat16>; +#endif diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc index b4666bd1e7..5513681399 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc @@ -244,3 +244,6 @@ void LlamaTritonModelInstance::freeBuffer() template struct LlamaTritonModelInstance; template struct LlamaTritonModelInstance; +#ifdef ENABLE_BF16 +template struct LlamaTritonModelInstance<__nv_bfloat16>; +#endif From 796859821d929f1f51fe604c96e568feeceb807d Mon Sep 17 00:00:00 2001 From: grimoire Date: Tue, 5 Dec 2023 15:35:15 +0800 Subject: [PATCH 02/22] prepare merge --- lmdeploy/turbomind/deploy/converter.py | 11 +++++++++++ .../turbomind/deploy/target_model/bf16.py | 8 ++++++-- src/turbomind/models/llama/LlamaDenseWeight.h | 3 +++ src/turbomind/models/llama/LlamaLinear.h | 1 + src/turbomind/models/llama/LlamaWeight.cc | 3 +++ .../llama_flash_attention_kernel.cu | 2 ++ src/turbomind/models/llama/llama_kernels.cu | 2 +- .../triton_backend/llama/LlamaTritonModel.cc | 19 +++++++++++++++++++ 8 files changed, 46 insertions(+), 3 deletions(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 7ba0e048be..89df986fe8 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -137,6 +137,16 @@ def _infer_output_format(config): return _infer_output_format(config) +def update_config_weight_type(output_format: str, + config: TurbomindModelConfig): + WEIGHT_TYPE_MAP = {'fp32': 'fp32', + 'fp16': 'fp16', + 'bf16': 'bf16', + 'w4': 'int4', + 'w8': 'int8'} + config.weight_type = WEIGHT_TYPE_MAP[output_format] + + def pack_model_repository(workspace_path: str): """package the model repository. @@ -242,6 +252,7 @@ def main(model_name: str, else: output_format = update_output_format(model_name, inferred_model_format, model_path, output_format) + update_config_weight_type(output_format, cfg) # convert print('model_name ', model_name) diff --git a/lmdeploy/turbomind/deploy/target_model/bf16.py b/lmdeploy/turbomind/deploy/target_model/bf16.py index 5302d42378..f5f9c1a81b 100644 --- a/lmdeploy/turbomind/deploy/target_model/bf16.py +++ b/lmdeploy/turbomind/deploy/target_model/bf16.py @@ -1,5 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .base import OUTPUT_MODELS +from lmdeploy.turbomind.deploy.source_model.base import BaseInputModel +from .base import OUTPUT_MODELS, TurbomindModelConfig from .fp import TurbomindModel -TurbomindBF16Model = OUTPUT_MODELS.register_module(name='bf16')(TurbomindModel) +@OUTPUT_MODELS.register_module(name='bf16') +class TurbomindBF16Model(TurbomindModel): + def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, to_file: bool = True, out_dir: str = ''): + super().__init__(input_model, cfg, to_file, out_dir) diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 410667f1bd..369f26c736 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -30,6 +30,7 @@ enum class WeightType : int kFP32, kFP16, kFP8, // not supported yet + kBF16, kINT8, kINT4 }; @@ -43,6 +44,8 @@ inline size_t getBitSize(WeightType type) return 16; case WeightType::kFP8: return 8; + case WeightType::kBF16: + return 16; case WeightType::kINT8: return 8; case WeightType::kINT4: diff --git a/src/turbomind/models/llama/LlamaLinear.h b/src/turbomind/models/llama/LlamaLinear.h index 0e783df33d..a3717b2a90 100644 --- a/src/turbomind/models/llama/LlamaLinear.h +++ b/src/turbomind/models/llama/LlamaLinear.h @@ -31,6 +31,7 @@ class LlamaLinear { switch (weight.type) { case WeightType::kFP16: case WeightType::kFP32: + case WeightType::kBF16: forwardFp(output_data, input_data, batch_size, weight, type); break; case WeightType::kINT4: diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 66b5e99e86..6e62eaf420 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -90,6 +90,9 @@ template void LlamaWeight::loadModel(std::string dir_path) { FtCudaDataType model_file_type = FtCudaDataType::FP16; + if(weight_type_ == WeightType::kBF16){ + model_file_type = FtCudaDataType::BF16; + } dir_path += '/'; loadWeightFromBin((T*)pre_decoder_embedding_table, diff --git a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu index 8fbfa007f3..ce45b123e7 100644 --- a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu +++ b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu @@ -27,10 +27,12 @@ struct ToCutlassType_ { using Type = cutlass::half_t; }; +#ifdef ENABLE_BF16 template<> struct ToCutlassType_<__nv_bfloat16> { using Type = cutlass::bfloat16_t; }; +#endif template< // dtype of Q/K/V/M diff --git a/src/turbomind/models/llama/llama_kernels.cu b/src/turbomind/models/llama/llama_kernels.cu index 8dcb14b3a6..80d24be682 100644 --- a/src/turbomind/models/llama/llama_kernels.cu +++ b/src/turbomind/models/llama/llama_kernels.cu @@ -837,7 +837,7 @@ FlashAttentionOp::FlashAttentionOp(int batch_size, int head_num, int key_len, #ifdef _MSC_VER op_version_ = 1; #else - op_version_ = std::is_same::type>::value ? 2 : 1; + op_version_ = std::is_same::type>::value ? 
1 : 2; if (op_version_ == 2 && getSMVersion() < 80) { op_version_ = 1; } diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 378cd3ceb7..d6ea9200b0 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -46,6 +46,17 @@ std::shared_ptr AbstractTransformerModel::createLlamaM reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_dir); + }else if (data_type == "bf16") { +#ifdef ENABLE_BF16 + return std::make_shared>( + reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), + reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + model_dir); +#else + TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); + ft::FT_CHECK(false); +#endif } else { return std::make_shared>( @@ -190,6 +201,9 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, if (weight_type_str == "fp16") { weight_type_ = ft::WeightType::kFP16; } + else if (weight_type_str == "bf16") { + weight_type_ = ft::WeightType::kBF16; + } else if (weight_type_str == "fp32") { weight_type_ = ft::WeightType::kFP32; } @@ -245,6 +259,11 @@ std::unique_ptr> LlamaTritonModel::createSh else if (std::is_same::value) { cublas_wrapper->setFP32GemmConfig(); } +#ifdef ENABLE_BF16 + else if (std::is_same::value) { + cublas_wrapper->setBF16GemmConfig(); + } +#endif ft::NcclParam tensor_para = nccl_params.first[comms_rank]; ft::NcclParam pipeline_para = nccl_params.second[comms_rank]; From ca9be206cee4ee7142d35695fd017571f7630f52 Mon Sep 17 00:00:00 2001 From: grimoire Date: Tue, 5 Dec 2023 17:02:06 +0800 Subject: [PATCH 03/22] add enable bf --- .../fused_multi_head_attention/llama_flash_attention_kernel.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu index ce45b123e7..6873f9251a 100644 --- a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu +++ b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu @@ -930,6 +930,8 @@ void FlashAttentionOpImpl::operator()(Params& params, cudaStream_t st) con template class FlashAttentionOpImpl; template class FlashAttentionOpImpl; +#ifdef ENABLE_BF16 template class FlashAttentionOpImpl<__nv_bfloat16, 1>; +#endif // ENABLE_BF16 } // namespace turbomind From 9028da0f3ef763c6f099d16a0925561e6e6dc7b0 Mon Sep 17 00:00:00 2001 From: grimoire Date: Tue, 5 Dec 2023 18:06:08 +0800 Subject: [PATCH 04/22] add bf16 decode attention support --- .../decoder_multihead_attention.cu | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu b/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu index 08179b6da6..4ce72a161e 100644 --- a/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu +++ b/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu @@ -107,11 +107,26 @@ void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& invokeDecoderMultiheadAttention(params); } } + +#ifdef ENABLE_BF16 + if 
constexpr(std::is_same_v) { + int group_size = params.num_heads / params.num_kv_heads; + if (group_size % 4 == 0) { + invokeDecoderMultiheadAttention(params); + } + else if (group_size % 2 == 0) { + invokeDecoderMultiheadAttention(params); + } + else { + invokeDecoderMultiheadAttention(params); + } + } +#endif // ENABLE_BF16 } template void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& params); template void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& params); #ifdef ENABLE_BF16 template void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams<__nv_bfloat16>& params); -#endif +#endif // ENABLE_BF16 } // namespace turbomind From 9c1a4be526fa1930f481995973d65f6dd50e3af3 Mon Sep 17 00:00:00 2001 From: grimoire Date: Tue, 5 Dec 2023 18:11:58 +0800 Subject: [PATCH 05/22] fix python lint --- lmdeploy/turbomind/deploy/target_model/bf16.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/deploy/target_model/bf16.py b/lmdeploy/turbomind/deploy/target_model/bf16.py index f5f9c1a81b..e588baf7b2 100644 --- a/lmdeploy/turbomind/deploy/target_model/bf16.py +++ b/lmdeploy/turbomind/deploy/target_model/bf16.py @@ -1,9 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. from lmdeploy.turbomind.deploy.source_model.base import BaseInputModel + from .base import OUTPUT_MODELS, TurbomindModelConfig from .fp import TurbomindModel + @OUTPUT_MODELS.register_module(name='bf16') class TurbomindBF16Model(TurbomindModel): - def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, to_file: bool = True, out_dir: str = ''): + + def __init__(self, + input_model: BaseInputModel, + cfg: TurbomindModelConfig, + to_file: bool = True, + out_dir: str = ''): super().__init__(input_model, cfg, to_file, out_dir) From 63bf9c76d5c5499716b4d4db745dce9302b70171 Mon Sep 17 00:00:00 2001 From: grimoire Date: Tue, 5 Dec 2023 18:21:00 +0800 Subject: [PATCH 06/22] fix yapf --- lmdeploy/turbomind/deploy/converter.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 89df986fe8..5e60f81864 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -139,11 +139,13 @@ def _infer_output_format(config): def update_config_weight_type(output_format: str, config: TurbomindModelConfig): - WEIGHT_TYPE_MAP = {'fp32': 'fp32', - 'fp16': 'fp16', - 'bf16': 'bf16', - 'w4': 'int4', - 'w8': 'int8'} + WEIGHT_TYPE_MAP = { + 'fp32': 'fp32', + 'fp16': 'fp16', + 'bf16': 'bf16', + 'w4': 'int4', + 'w8': 'int8' + } config.weight_type = WEIGHT_TYPE_MAP[output_format] From bcbd30b5505e2d714c6ec76676510394e9db9d12 Mon Sep 17 00:00:00 2001 From: grimoire Date: Tue, 5 Dec 2023 18:25:23 +0800 Subject: [PATCH 07/22] fix c format --- ...er_masked_multihead_attention_template.cuh | 27 +++++--------- ...decoder_masked_multihead_attention_utils.h | 6 ++- .../decoder_multihead_attention.cu | 6 +-- .../decoder_multihead_attention/kv_cache.cu | 30 +++++++-------- .../llama_flash_attention_kernel.cu | 19 ++++------ .../models/llama/llama_decoder_kernels.cu | 6 +-- src/turbomind/models/llama/llama_kernels.cu | 37 ++++++++++--------- src/turbomind/python/bind.cpp | 4 +- 8 files changed, 63 insertions(+), 72 deletions(-) diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh 
b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh index 31dbe147c0..defb6e94e1 100644 --- a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh +++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh @@ -79,8 +79,7 @@ namespace mmha { //////////////////////////////////////////////////////////////////////////////////////////////////// template -struct Qk_vec_m_ { -}; +struct Qk_vec_m_ {}; template<> struct Qk_vec_m_ { @@ -180,8 +179,7 @@ struct Qk_vec_k_<__nv_fp8_e4m3, 256> { //////////////////////////////////////////////////////////////////////////////////////////////////// template -struct K_vec_m_ { -}; +struct K_vec_m_ {}; template<> struct K_vec_m_ { @@ -262,8 +260,7 @@ struct K_vec_k_<__nv_fp8_e4m3, 1> { //////////////////////////////////////////////////////////////////////////////////////////////////// template -struct V_vec_m_ { -}; +struct V_vec_m_ {}; template<> struct V_vec_m_ { @@ -343,8 +340,7 @@ struct V_vec_k_<__nv_fp8_e4m3, 16> { #ifdef MMHA_USE_FP32_ACUM_FOR_FMA template -struct Qk_vec_acum_fp32_ { -}; +struct Qk_vec_acum_fp32_ {}; template<> struct Qk_vec_acum_fp32_ { @@ -426,8 +422,7 @@ struct Qk_vec_acum_fp32_ { //////////////////////////////////////////////////////////////////////////////////////////////////// template -struct K_vec_acum_fp32_ { -}; +struct K_vec_acum_fp32_ {}; template<> struct K_vec_acum_fp32_ { @@ -489,8 +484,7 @@ struct K_vec_acum_fp32_ { #ifdef MMHA_USE_FP32_ACUM_FOR_OUT template -struct V_vec_acum_fp32_ { -}; +struct V_vec_acum_fp32_ {}; template<> struct V_vec_acum_fp32_ { @@ -600,7 +594,7 @@ template<> __inline__ __device__ bf16_4_t vec_conversion(const float4& a) { bf16_4_t b; - float2 val; + float2 val; val.x = a.x; val.y = a.y; b.x = vec_conversion<__nv_bfloat162, float2>(val); @@ -793,7 +787,7 @@ inline __device__ void convert_from_float(__nv_bfloat162& dst, float2 src) #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 dst = __float22bfloat162_rn(src); #else - dst = __floats2bfloat162_rn(src.x, src.y); + dst = __floats2bfloat162_rn(src.x, src.y); #endif } #endif // ENABLE_BF16 @@ -1087,7 +1081,6 @@ inline __device__ int64_t quant(uint4 a, const float scale, const float zp) return int64; } - // bfloat16 to int8 inline __device__ int8_t quant(__nv_bfloat16 a, const float scale, const float zp) { @@ -1342,7 +1335,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params } Tk* logits_smem = reinterpret_cast(logits_smem_); #else - float* logits_smem = reinterpret_cast(logits_smem_); + float* logits_smem = reinterpret_cast(logits_smem_); #endif // The shared memory to do the final reduction for the output values. Reuse qk_smem. @@ -1738,7 +1731,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params logit = is_mask ? 0.f : __expf(qk_smem[ti - first_step] - qk_max); } #else - float logit = is_mask ? 0.f : __expf(qk_smem[ti - first_step] - qk_max); + float logit = is_mask ? 
0.f : __expf(qk_smem[ti - first_step] - qk_max); #endif sum += logit; qk_smem[ti - first_step] = logit; diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h b/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h index bac144c8cc..8e2df5dccc 100644 --- a/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h +++ b/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h @@ -1529,7 +1529,8 @@ inline __device__ void apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embe q.y = rotary_embedding_transform(q.y, coef1); } -inline __device__ void apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, float base, int t_step) +inline __device__ void +apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, float base, int t_step) { if (4 * tid >= rot_embed_dim) { return; @@ -1557,7 +1558,8 @@ inline __device__ void apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embe q.w = rotary_embedding_transform(q.w, coef3); } -inline __device__ void apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, float base, int t_step) +inline __device__ void +apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, float base, int t_step) { if (8 * tid >= rot_embed_dim) { return; diff --git a/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu b/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu index 4ce72a161e..bb7c54e7f0 100644 --- a/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu +++ b/src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.cu @@ -109,7 +109,7 @@ void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& } #ifdef ENABLE_BF16 - if constexpr(std::is_same_v) { + if constexpr (std::is_same_v) { int group_size = params.num_heads / params.num_kv_heads; if (group_size % 4 == 0) { invokeDecoderMultiheadAttention(params); @@ -121,12 +121,12 @@ void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& invokeDecoderMultiheadAttention(params); } } -#endif // ENABLE_BF16 +#endif // ENABLE_BF16 } template void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& params); template void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams& params); #ifdef ENABLE_BF16 template void DispatchDecoderMultiheadAttention(const DecoderMultiHeadAttentionParams<__nv_bfloat16>& params); -#endif // ENABLE_BF16 +#endif // ENABLE_BF16 } // namespace turbomind diff --git a/src/turbomind/kernels/decoder_multihead_attention/kv_cache.cu b/src/turbomind/kernels/decoder_multihead_attention/kv_cache.cu index 35406eda6e..4fec0eadb6 100644 --- a/src/turbomind/kernels/decoder_multihead_attention/kv_cache.cu +++ b/src/turbomind/kernels/decoder_multihead_attention/kv_cache.cu @@ -478,20 +478,20 @@ template void ConvertKvCacheBlocksToLinear2(const void** src_k_block_ptrs, const float* kv_params, cudaStream_t st); #ifdef ENABLE_BF16 -template void ConvertKvCacheBlocksToLinear2(const void** src_k_block_ptrs, - const void** src_v_block_ptrs, - __nv_bfloat16** dst_k_ptrs, - __nv_bfloat16** dst_v_ptrs, - const int* src_cu_block_cnts, - const int* seq_lens, - int src_offset, - int src_block_len, - int dst_block_len, - int head_num, - int head_dim, - int batch_size, - int quant_policy, - const float* kv_params, - cudaStream_t st); +template void ConvertKvCacheBlocksToLinear2(const void** src_k_block_ptrs, + const 
void** src_v_block_ptrs, + __nv_bfloat16** dst_k_ptrs, + __nv_bfloat16** dst_v_ptrs, + const int* src_cu_block_cnts, + const int* seq_lens, + int src_offset, + int src_block_len, + int dst_block_len, + int head_num, + int head_dim, + int batch_size, + int quant_policy, + const float* kv_params, + cudaStream_t st); #endif } // namespace turbomind diff --git a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu index 6873f9251a..341bf74652 100644 --- a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu +++ b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu @@ -14,8 +14,7 @@ namespace turbomind { template -struct ToCutlassType_ { -}; +struct ToCutlassType_ {}; template<> struct ToCutlassType_ { @@ -676,8 +675,7 @@ void invokeFlashAttention_impl(int batc auto layout_o = attention_params.layout_o; auto group_size = attention_params.group_size; - using scalar_t = - typename ToCutlassType_::Type; + using scalar_t = typename ToCutlassType_::Type; const float qk_scale = static_cast(1.f / sqrtf(size_per_head * 1.f)); @@ -763,8 +761,7 @@ void invokeFlashAttention_impl(int batc template bool get_needs_accum_buffer() { - using scalar_t = - typename ToCutlassType_::Type; + using scalar_t = typename ToCutlassType_::Type; #define GET_NEED_ACCUM_BUFFER(sm) \ ATTENTION_KERNEL(scalar_t, sm, kQueriesPerBlock, kKeysPerBlock, false)::kNeedsOutputAccumulatorBuffer @@ -795,8 +792,7 @@ void invoke_attention_impl(bool single_v typename FlashAttentionOpImpl::Params& params, cudaStream_t st) { - using scalar_t = - typename ToCutlassType_::Type; + using scalar_t = typename ToCutlassType_::Type; #define INVOKE_ATTEN_IMPL(sm, single_value) \ { \ @@ -857,9 +853,8 @@ class FlashAttentionOpImpl::impl { private: static constexpr int kQueriesPerBlock = 32; static constexpr int kKeysPerBlock = 128; - using scalar_t = - typename ToCutlassType_::Type; - using Params = typename FlashAttentionOpImpl::Params; + using scalar_t = typename ToCutlassType_::Type; + using Params = typename FlashAttentionOpImpl::Params; int batch_size_; int head_num_; @@ -932,6 +927,6 @@ template class FlashAttentionOpImpl; template class FlashAttentionOpImpl; #ifdef ENABLE_BF16 template class FlashAttentionOpImpl<__nv_bfloat16, 1>; -#endif // ENABLE_BF16 +#endif // ENABLE_BF16 } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_decoder_kernels.cu b/src/turbomind/models/llama/llama_decoder_kernels.cu index 9ac09b3bf3..6f1f488965 100644 --- a/src/turbomind/models/llama/llama_decoder_kernels.cu +++ b/src/turbomind/models/llama/llama_decoder_kernels.cu @@ -12,8 +12,7 @@ namespace cg = cooperative_groups; namespace turbomind { template -struct res_norm_ops_t { -}; +struct res_norm_ops_t {}; template struct res_norm_t { @@ -191,6 +190,7 @@ template void invokeFusedAddBiasResidualRMSNorm(float*, float*, const float*, const float*, float, int, int, cudaStream_t); template void invokeFusedAddBiasResidualRMSNorm(half*, half*, const half*, const half*, float, int, int, cudaStream_t); #ifdef ENABLE_BF16 -template void invokeFusedAddBiasResidualRMSNorm(__nv_bfloat16*, __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); +template void invokeFusedAddBiasResidualRMSNorm( + __nv_bfloat16*, __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); #endif } // namespace turbomind diff --git 
a/src/turbomind/models/llama/llama_kernels.cu b/src/turbomind/models/llama/llama_kernels.cu index a0f4bb857f..73e78573d5 100644 --- a/src/turbomind/models/llama/llama_kernels.cu +++ b/src/turbomind/models/llama/llama_kernels.cu @@ -91,7 +91,8 @@ void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, template void invokeRootMeanSquareNorm(float*, const float*, const float*, float, int, int, cudaStream_t); template void invokeRootMeanSquareNorm(half*, const half*, const half*, float, int, int, cudaStream_t); #ifdef ENABLE_BF16 -template void invokeRootMeanSquareNorm(__nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); +template void +invokeRootMeanSquareNorm(__nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); #endif // #ifdef ENABLE_BF16 @@ -384,22 +385,22 @@ template void invokeExtendKVCache(void** k_dst_ptrs, const float* kv_scale, cudaStream_t stream); #ifdef ENABLE_BF16 -template void invokeExtendKVCache(void** k_dst_ptrs, - void** v_dst_ptrs, - const __nv_bfloat16* k_src, - const __nv_bfloat16* v_src, - const int* cu_block_counts, - const int* query_length, - const int* history_length, - int batch_size, - int block_length, - size_t dst_layer_offset, - int max_q_len, - int head_dim, - int head_num, - int quant, - const float* kv_scale, - cudaStream_t stream); +template void invokeExtendKVCache(void** k_dst_ptrs, + void** v_dst_ptrs, + const __nv_bfloat16* k_src, + const __nv_bfloat16* v_src, + const int* cu_block_counts, + const int* query_length, + const int* history_length, + int batch_size, + int block_length, + size_t dst_layer_offset, + int max_q_len, + int head_dim, + int head_num, + int quant, + const float* kv_scale, + cudaStream_t stream); #endif template @@ -819,7 +820,7 @@ template void invokeGetFeatureOfLastToken(half*, const half*, const int*, int, i template void invokeGetFeatureOfLastToken(float*, const float*, const int*, int, int, cudaStream_t); #ifdef ENABLE_BF16 template void invokeGetFeatureOfLastToken(__nv_bfloat16*, const __nv_bfloat16*, const int*, int, int, cudaStream_t); -#endif // ENABLE_BF16 +#endif // ENABLE_BF16 template struct BatchedCopyParam { diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index e488a04695..5538f4a7f0 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -6,12 +6,12 @@ #include "src/turbomind/utils/nccl_utils.h" #include #include -#include #include #include #include #include #include +#include namespace py = pybind11; namespace ft = turbomind; @@ -381,7 +381,7 @@ PYBIND11_MODULE(_turbomind, m) model->setFfiLock(gil_control); return model; } - if(data_type == "bf16") { + if (data_type == "bf16") { #ifdef ENABLE_BF16 auto model = std::make_shared>( tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); From 9156e1f73aec2ed0f470f9b81f8e544452f408b6 Mon Sep 17 00:00:00 2001 From: grimoire Date: Tue, 5 Dec 2023 18:29:58 +0800 Subject: [PATCH 08/22] c format11 --- ...er_masked_multihead_attention_template.cuh | 24 ++++++++++++------- .../llama_flash_attention_kernel.cu | 3 ++- .../models/llama/llama_decoder_kernels.cu | 3 ++- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh index defb6e94e1..2bc7d1c908 100644 --- 
a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh +++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh @@ -79,7 +79,8 @@ namespace mmha { //////////////////////////////////////////////////////////////////////////////////////////////////// template -struct Qk_vec_m_ {}; +struct Qk_vec_m_ { +}; template<> struct Qk_vec_m_ { @@ -179,7 +180,8 @@ struct Qk_vec_k_<__nv_fp8_e4m3, 256> { //////////////////////////////////////////////////////////////////////////////////////////////////// template -struct K_vec_m_ {}; +struct K_vec_m_ { +}; template<> struct K_vec_m_ { @@ -260,7 +262,8 @@ struct K_vec_k_<__nv_fp8_e4m3, 1> { //////////////////////////////////////////////////////////////////////////////////////////////////// template -struct V_vec_m_ {}; +struct V_vec_m_ { +}; template<> struct V_vec_m_ { @@ -340,7 +343,8 @@ struct V_vec_k_<__nv_fp8_e4m3, 16> { #ifdef MMHA_USE_FP32_ACUM_FOR_FMA template -struct Qk_vec_acum_fp32_ {}; +struct Qk_vec_acum_fp32_ { +}; template<> struct Qk_vec_acum_fp32_ { @@ -422,7 +426,8 @@ struct Qk_vec_acum_fp32_ { //////////////////////////////////////////////////////////////////////////////////////////////////// template -struct K_vec_acum_fp32_ {}; +struct K_vec_acum_fp32_ { +}; template<> struct K_vec_acum_fp32_ { @@ -484,7 +489,8 @@ struct K_vec_acum_fp32_ { #ifdef MMHA_USE_FP32_ACUM_FOR_OUT template -struct V_vec_acum_fp32_ {}; +struct V_vec_acum_fp32_ { +}; template<> struct V_vec_acum_fp32_ { @@ -787,7 +793,7 @@ inline __device__ void convert_from_float(__nv_bfloat162& dst, float2 src) #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 dst = __float22bfloat162_rn(src); #else - dst = __floats2bfloat162_rn(src.x, src.y); + dst = __floats2bfloat162_rn(src.x, src.y); #endif } #endif // ENABLE_BF16 @@ -1335,7 +1341,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params } Tk* logits_smem = reinterpret_cast(logits_smem_); #else - float* logits_smem = reinterpret_cast(logits_smem_); + float* logits_smem = reinterpret_cast(logits_smem_); #endif // The shared memory to do the final reduction for the output values. Reuse qk_smem. @@ -1731,7 +1737,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params logit = is_mask ? 0.f : __expf(qk_smem[ti - first_step] - qk_max); } #else - float logit = is_mask ? 0.f : __expf(qk_smem[ti - first_step] - qk_max); + float logit = is_mask ? 
0.f : __expf(qk_smem[ti - first_step] - qk_max); #endif sum += logit; qk_smem[ti - first_step] = logit; diff --git a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu index 341bf74652..12d70f6675 100644 --- a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu +++ b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu @@ -14,7 +14,8 @@ namespace turbomind { template -struct ToCutlassType_ {}; +struct ToCutlassType_ { +}; template<> struct ToCutlassType_ { diff --git a/src/turbomind/models/llama/llama_decoder_kernels.cu b/src/turbomind/models/llama/llama_decoder_kernels.cu index 6f1f488965..89f05e90ca 100644 --- a/src/turbomind/models/llama/llama_decoder_kernels.cu +++ b/src/turbomind/models/llama/llama_decoder_kernels.cu @@ -12,7 +12,8 @@ namespace cg = cooperative_groups; namespace turbomind { template -struct res_norm_ops_t {}; +struct res_norm_ops_t { +}; template struct res_norm_t { From 3c48119bcc1cf36ee48ee25d21d7b9e9377c1416 Mon Sep 17 00:00:00 2001 From: grimoire Date: Tue, 5 Dec 2023 19:41:50 +0800 Subject: [PATCH 09/22] fix cast --- .../decoder_masked_multihead_attention_template.cuh | 2 +- .../kernels/decoder_masked_multihead_attention_utils.h | 2 +- src/turbomind/models/llama/llama_decoder_kernels.cu | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh index 2bc7d1c908..e853c242ec 100644 --- a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh +++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh @@ -594,7 +594,7 @@ __inline__ __device__ uint4 vec_conversion(const Float8_& a) template<> __inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2& a) { - return __float22bfloat162_rn(a); + return cuda_cast<__nv_bfloat162, float2>(a); } template<> __inline__ __device__ bf16_4_t vec_conversion(const float4& a) diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h b/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h index 8e2df5dccc..99f68648aa 100644 --- a/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h +++ b/src/turbomind/kernels/decoder_masked_multihead_attention_utils.h @@ -319,7 +319,7 @@ inline __device__ float bfloat16_to_float(__nv_bfloat16 h) inline __device__ float2 bfloat162_to_float2(__nv_bfloat162 v) { - return __bfloat1622float2(v); + return cuda_cast(v); // __nv_bfloat16 lo, hi; // asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); // return make_float2(bfloat16_to_float(lo), bfloat16_to_float(hi)); diff --git a/src/turbomind/models/llama/llama_decoder_kernels.cu b/src/turbomind/models/llama/llama_decoder_kernels.cu index 89f05e90ca..ab78ed1855 100644 --- a/src/turbomind/models/llama/llama_decoder_kernels.cu +++ b/src/turbomind/models/llama/llama_decoder_kernels.cu @@ -3,6 +3,7 @@ #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/cuda_type_utils.cuh" #include #include #include @@ -88,11 +89,11 @@ template<> struct 
res_norm_ops_t<__nv_bfloat16> { __device__ float2 cast(const uint& x) const { - return __bfloat1622float2(reinterpret_cast(x)); + return cuda_cast(reinterpret_cast(x)); } __device__ uint cast(const float2& x) const { - auto y = __float22bfloat162_rn(x); + auto y = cuda_cast<__nv_bfloat162, float2>(x); return reinterpret_cast(y); } __device__ float2 add(const float2& a, const float2& b, const float2& bias, float& accum) const From 61d99cf65d8558138d2aca0683de2b21f2d5881f Mon Sep 17 00:00:00 2001 From: grimoire Date: Tue, 5 Dec 2023 20:16:08 +0800 Subject: [PATCH 10/22] fix on sm<80 --- src/turbomind/kernels/activation_kernels.cu | 2 +- src/turbomind/models/llama/llama_decoder_kernels.cu | 2 +- src/turbomind/utils/cuda_type_utils.cuh | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/turbomind/kernels/activation_kernels.cu b/src/turbomind/kernels/activation_kernels.cu index 97bd7961d1..d42b872ac7 100644 --- a/src/turbomind/kernels/activation_kernels.cu +++ b/src/turbomind/kernels/activation_kernels.cu @@ -117,7 +117,7 @@ struct ReluActivation<__nv_bfloat162> { static __device__ __forceinline__ __nv_bfloat162 apply(const __nv_bfloat162& val) { const __nv_bfloat16 zero_bf16 = static_cast<__nv_bfloat16>(0.0f); - return make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16); + return ::make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16); } }; #endif diff --git a/src/turbomind/models/llama/llama_decoder_kernels.cu b/src/turbomind/models/llama/llama_decoder_kernels.cu index ab78ed1855..6bdfa2c5e6 100644 --- a/src/turbomind/models/llama/llama_decoder_kernels.cu +++ b/src/turbomind/models/llama/llama_decoder_kernels.cu @@ -2,8 +2,8 @@ #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h" -#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_type_utils.cuh" +#include "src/turbomind/utils/cuda_utils.h" #include #include #include diff --git a/src/turbomind/utils/cuda_type_utils.cuh b/src/turbomind/utils/cuda_type_utils.cuh index b1b9c40e87..f7f7b95273 100644 --- a/src/turbomind/utils/cuda_type_utils.cuh +++ b/src/turbomind/utils/cuda_type_utils.cuh @@ -507,12 +507,12 @@ __device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) template<> __device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) { - return fabs(val); + return fabs(cuda_cast(val)); } template<> __device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { - return make_bfloat162(fabs(val.x), fabs(val.y)); + return make_bfloat162(fabs(cuda_cast(val.x)), fabs(cuda_cast(val.y))); } #endif From 1cd6d599465dc880b3d6510502b7bffba749dcc8 Mon Sep 17 00:00:00 2001 From: grimoire Date: Wed, 6 Dec 2023 10:45:40 +0800 Subject: [PATCH 11/22] fix linux bf162 cast --- src/turbomind/kernels/activation_kernels.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/turbomind/kernels/activation_kernels.cu b/src/turbomind/kernels/activation_kernels.cu index d42b872ac7..69b10c78c0 100644 --- a/src/turbomind/kernels/activation_kernels.cu +++ b/src/turbomind/kernels/activation_kernels.cu @@ -117,7 +117,11 @@ struct ReluActivation<__nv_bfloat162> { static __device__ __forceinline__ __nv_bfloat162 apply(const __nv_bfloat162& val) { const __nv_bfloat16 zero_bf16 = static_cast<__nv_bfloat16>(0.0f); - return ::make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? 
val.y : zero_bf16); +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) + return turbomind::make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16); +#else + return make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16); +#endif } }; #endif From aa74af41c3eecf2cdee09dd9c175357db21db95a Mon Sep 17 00:00:00 2001 From: grimoire Date: Wed, 6 Dec 2023 12:00:12 +0800 Subject: [PATCH 12/22] fix type cast --- .../decoder_multihead_attention/array_ops.h | 31 +++++++++++++++++++ src/turbomind/models/llama/llama_kernels.cu | 13 ++++++++ 2 files changed, 44 insertions(+) diff --git a/src/turbomind/kernels/decoder_multihead_attention/array_ops.h b/src/turbomind/kernels/decoder_multihead_attention/array_ops.h index 5a1300ff2d..f4b4133d95 100644 --- a/src/turbomind/kernels/decoder_multihead_attention/array_ops.h +++ b/src/turbomind/kernels/decoder_multihead_attention/array_ops.h @@ -3,6 +3,7 @@ #pragma once #include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/utils/cuda_bf16_wrapper.h" #include #include @@ -487,4 +488,34 @@ struct ConvertKvCache { } }; +#ifdef ENABLE_BF16 +template<> +struct ConvertKvCache { + + float scale_; + float zero_; + + __device__ __host__ ConvertKvCache(float scale, float zero): scale_(scale), zero_(zero) + { + zero_ = zero_ - 32896.f * scale_; + } + + template + inline __device__ auto operator()(const Array& vi) const -> Array<__nv_bfloat16, N> + { + Array<__nv_bfloat16, N> vo; + PRAGMA_UNROLL + for (int i = 0; i < N; i += 4) { + auto& vec = (Array<__nv_bfloat16, 4>&)vo[i]; + auto tmp = fast_i2f_f32_s8((const Array&)vi[i]); + PRAGMA_UNROLL + for (int j = 0; j < 4; ++j) { + vec[j] = __nv_bfloat16(tmp[j] * scale_ + zero_); + // vec[j] = half(tmp[j] * scale_ + (zero_ - 32896.f * scale_)); + } + } + return vo; + } +}; +#endif // ENABLE_BF16 } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_kernels.cu b/src/turbomind/models/llama/llama_kernels.cu index 73e78573d5..69db6cddaf 100644 --- a/src/turbomind/models/llama/llama_kernels.cu +++ b/src/turbomind/models/llama/llama_kernels.cu @@ -213,6 +213,19 @@ void invokeCreateCausalMasks( template void invokeCreateCausalMasks(float* mask, const int*, const int*, int, int, int, cudaStream_t); template void invokeCreateCausalMasks(half* mask, const int*, const int*, int, int, int, cudaStream_t); #ifdef ENABLE_BF16 +template<> +__global__ void createCausalMasks<__nv_bfloat16>(__nv_bfloat16* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len) +{ + const auto q_len = q_lens[blockIdx.x]; + const auto k_len = k_lens[blockIdx.x]; + mask += blockIdx.x * max_q_len * max_k_len; + for (int i = threadIdx.x; i < max_q_len * max_k_len; i += blockDim.x) { + const int q = i / max_k_len; // [0, max_q_len) + const int k = i % max_k_len; // [0, max_k_len) + bool is_valid = q < q_len && k < k_len && k <= q + (k_len - q_len); + mask[i] = static_cast<__nv_bfloat16>(float(is_valid)); + } +} template void invokeCreateCausalMasks(__nv_bfloat16* mask, const int*, const int*, int, int, int, cudaStream_t); #endif From 47ed332f851b26517f3509f07dcfec637f8671a0 Mon Sep 17 00:00:00 2001 From: grimoire Date: Wed, 6 Dec 2023 12:24:23 +0800 Subject: [PATCH 13/22] fix lint --- src/turbomind/kernels/decoder_multihead_attention/array_ops.h | 2 +- src/turbomind/models/llama/llama_kernels.cu | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/turbomind/kernels/decoder_multihead_attention/array_ops.h 
b/src/turbomind/kernels/decoder_multihead_attention/array_ops.h index f4b4133d95..96a43de1e5 100644 --- a/src/turbomind/kernels/decoder_multihead_attention/array_ops.h +++ b/src/turbomind/kernels/decoder_multihead_attention/array_ops.h @@ -517,5 +517,5 @@ struct ConvertKvCache { return vo; } }; -#endif // ENABLE_BF16 +#endif // ENABLE_BF16 } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_kernels.cu b/src/turbomind/models/llama/llama_kernels.cu index 69db6cddaf..1ba17f9f90 100644 --- a/src/turbomind/models/llama/llama_kernels.cu +++ b/src/turbomind/models/llama/llama_kernels.cu @@ -214,7 +214,8 @@ template void invokeCreateCausalMasks(float* mask, const int*, const int*, int, template void invokeCreateCausalMasks(half* mask, const int*, const int*, int, int, int, cudaStream_t); #ifdef ENABLE_BF16 template<> -__global__ void createCausalMasks<__nv_bfloat16>(__nv_bfloat16* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len) +__global__ void createCausalMasks<__nv_bfloat16>( + __nv_bfloat16* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len) { const auto q_len = q_lens[blockIdx.x]; const auto k_len = k_lens[blockIdx.x]; From 7b33258571ec670b42ac6c7b631c1d5a75c48c89 Mon Sep 17 00:00:00 2001 From: grimoire Date: Wed, 6 Dec 2023 14:34:47 +0800 Subject: [PATCH 14/22] support from hf pretrained --- lmdeploy/turbomind/deploy/converter.py | 13 ++++++++-- .../turbomind/deploy/target_model/base.py | 2 +- lmdeploy/turbomind/turbomind.py | 9 ++++++- src/turbomind/python/bind.cpp | 26 +++++++++++-------- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 5e60f81864..d73a29f423 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -6,6 +6,7 @@ from pathlib import Path import fire +import torch from huggingface_hub import snapshot_download from lmdeploy.model import MODELS @@ -124,10 +125,18 @@ def _infer_output_format(config): """_infer_output_format.""" torch_dtype = getattr(config, 'torch_dtype', None) if torch_dtype: - return TORCH_DTYPE_MAP.get(torch_dtype, output_format) + updated_output_format = TORCH_DTYPE_MAP.get( + torch_dtype, output_format) else: # get model name prefix - return MODEL_NAME_MAP.get(model_name, output_format) + updated_output_format = MODEL_NAME_MAP.get(model_name, + output_format) + if updated_output_format == 'bf16': + if not torch.cuda.is_bf16_supported(): + # device does not support bf16 + print('Device does not support bf16.') + updated_output_format = 'fp16' + return updated_output_format if model_format != 'hf': return MODEL_NAME_MAP.get(model_name, output_format) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index eee724cb82..9009e17494 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -151,7 +151,7 @@ def _tofile(tensor, path): elif len(self.tm_params) > 0: tm_params = self.tm_params weight_type = self.cfg.weight_type - assert weight_type in ['fp16', 'fp32', 'int4'] + assert weight_type in ['fp16', 'fp32', 'bf16', 'int4'] # currently, the tensor type should in # [torch.float, torch.half, torch.bfloat16, torch.int32] diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index ad7c0cb518..3d7522462f 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -22,7 +22,8 @@ from 
lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger -from .deploy.converter import get_model_format, supported_formats +from .deploy.converter import (get_model_format, supported_formats, + update_config_weight_type, update_output_format) from .deploy.source_model.base import INPUT_MODELS from .deploy.target_model.base import OUTPUT_MODELS, TurbomindModelConfig from .utils import (ModelSource, check_tm_model_input, create_hf_download_args, @@ -246,6 +247,12 @@ def _from_hf(self, output_format = 'w4' data_type = 'int4' assert group_size > 0, f'group_size: {group_size} should > 0' + else: + output_format = update_output_format(model_name, + inferred_model_format, + model_path, output_format) + data_type = output_format + update_config_weight_type(output_format, cfg) self.config = cfg self.model_name = model_name diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 5538f4a7f0..11d483f6c9 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -290,17 +290,21 @@ PYBIND11_MODULE(_turbomind, m) DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); - if (self->type == triton::TYPE_FP16 || self->type == triton::TYPE_FP32 - || self->type == triton::TYPE_INT32) { - auto num_element = - std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); - auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; - ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - cudaMemcpy( - const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); - } - else { - ft::FT_CHECK(0); + switch (self->type) { + case triton::TYPE_FP16: + case triton::TYPE_FP32: + case triton::TYPE_INT32: + case triton::TYPE_BF16: { + auto num_element = + std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies()); + auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; + ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); + cudaMemcpy( + const_cast(self->data), const_cast(src->data), num_bytes, cudaMemcpyDefault); + break; + } + default: + ft::FT_CHECK(0); } }, "tensor"_a) From 9cd7044ca42374b8d940f540391f623df7952ece Mon Sep 17 00:00:00 2001 From: grimoire Date: Wed, 6 Dec 2023 15:15:42 +0800 Subject: [PATCH 15/22] fix pybind --- src/turbomind/python/bind.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 11d483f6c9..ceed8d6bdd 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -24,12 +24,6 @@ using TensorMap = std::unordered_map; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -template -std::shared_ptr make_shared_nodel(T data) -{ - return std::shared_ptr(&data, [](T*) {}); -} - DLDevice getDLDevice(triton::Tensor& tensor) { int device_id = 0; @@ -47,6 +41,7 @@ DLDevice getDLDevice(triton::Tensor& tensor) break; case triton::MEMORY_CPU_PINNED: device.device_type = DLDeviceType::kDLCUDAHost; + break; case triton::MEMORY_GPU: device.device_type = DLDeviceType::kDLCUDA; break; @@ -133,12 +128,11 @@ std::unique_ptr TritonTensorToDLManagedTensor(triton::Tensor& t triton::MemoryType getMemoryType(DLDevice device) { switch (device.device_type) { - case DLDeviceType::kDLCPU: - return triton::MemoryType::MEMORY_CPU; case DLDeviceType::kDLCUDAHost: return triton::MemoryType::MEMORY_CPU_PINNED; case 
DLDeviceType::kDLCUDA: return triton::MemoryType::MEMORY_GPU; + case DLDeviceType::kDLCPU: default: return triton::MemoryType::MEMORY_CPU; } From f43c37483fc31551f48cfb876e2c55567f57a74d Mon Sep 17 00:00:00 2001 From: grimoire Date: Mon, 11 Dec 2023 11:42:37 +0800 Subject: [PATCH 16/22] fix converter --- lmdeploy/turbomind/deploy/converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index d73a29f423..b75bab2467 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -117,7 +117,7 @@ def copy_tokenizer(model_path: str, tokenizer_path: str, def update_output_format(model_name: str, model_format: str, model_path: str, output_format: str): """Update output format according to model info.""" - TORCH_DTYPE_MAP = {'bfloat16': 'bf16'} + TORCH_DTYPE_MAP = {torch.bfloat16: 'bf16'} MODEL_NAME_MAP = {'qwen': 'bf16'} model_name = model_name.split('-')[0] From c8cdf52180060903e50ebe5124ae39a675ae1832 Mon Sep 17 00:00:00 2001 From: grimoire Date: Mon, 11 Dec 2023 15:27:03 +0800 Subject: [PATCH 17/22] add trust remote code --- lmdeploy/turbomind/deploy/converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index b75bab2467..a250e905e5 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -142,7 +142,7 @@ def _infer_output_format(config): return MODEL_NAME_MAP.get(model_name, output_format) else: from transformers import AutoConfig - config = AutoConfig.from_pretrained(model_path) + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) return _infer_output_format(config) From 232c294c8425b7b742853cf4e92624bb8d6386e1 Mon Sep 17 00:00:00 2001 From: grimoire Date: Thu, 14 Dec 2023 10:42:14 +0800 Subject: [PATCH 18/22] fix comment --- .../turbomind/deploy/target_model/__init__.py | 1 - lmdeploy/turbomind/deploy/target_model/bf16.py | 16 ---------------- lmdeploy/turbomind/deploy/target_model/fp.py | 2 +- .../models/llama/flash_attention2/flash_api.cpp | 4 ++++ src/turbomind/python/bind.cpp | 2 +- .../triton_backend/llama/LlamaTritonModel.cc | 3 ++- 6 files changed, 8 insertions(+), 20 deletions(-) delete mode 100644 lmdeploy/turbomind/deploy/target_model/bf16.py diff --git a/lmdeploy/turbomind/deploy/target_model/__init__.py b/lmdeploy/turbomind/deploy/target_model/__init__.py index 2c15fe2514..fe03500e45 100644 --- a/lmdeploy/turbomind/deploy/target_model/__init__.py +++ b/lmdeploy/turbomind/deploy/target_model/__init__.py @@ -1,4 +1,3 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .bf16 import TurbomindBF16Model # noqa: F401 from .fp import TurbomindModel # noqa: F401 from .w4 import TurbomindW4Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/target_model/bf16.py b/lmdeploy/turbomind/deploy/target_model/bf16.py deleted file mode 100644 index e588baf7b2..0000000000 --- a/lmdeploy/turbomind/deploy/target_model/bf16.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from lmdeploy.turbomind.deploy.source_model.base import BaseInputModel - -from .base import OUTPUT_MODELS, TurbomindModelConfig -from .fp import TurbomindModel - - -@OUTPUT_MODELS.register_module(name='bf16') -class TurbomindBF16Model(TurbomindModel): - - def __init__(self, - input_model: BaseInputModel, - cfg: TurbomindModelConfig, - to_file: bool = True, - out_dir: str = ''): - super().__init__(input_model, cfg, to_file, out_dir) diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py index d9a7783436..ebebc2fae8 100644 --- a/lmdeploy/turbomind/deploy/target_model/fp.py +++ b/lmdeploy/turbomind/deploy/target_model/fp.py @@ -14,7 +14,7 @@ def transpose_tensor(input: List[torch.Tensor]): return output -@OUTPUT_MODELS.register_module(name='fp16') +@OUTPUT_MODELS.register_module(name=['fp16', 'bf16']) class TurbomindModel(BaseOutputModel): """Export to turbomind fp16 format.""" diff --git a/src/turbomind/models/llama/flash_attention2/flash_api.cpp b/src/turbomind/models/llama/flash_attention2/flash_api.cpp index f0407e04a4..7a042b02fc 100644 --- a/src/turbomind/models/llama/flash_attention2/flash_api.cpp +++ b/src/turbomind/models/llama/flash_attention2/flash_api.cpp @@ -126,7 +126,11 @@ class FlashAttentionOpImpl::impl { fwd_params.blockmask = reinterpret_cast(params.mask); +#ifdef ENABLE_BF16 fwd_params.is_bf16 = std::is_same::value; +#else + fwd_params.is_bf16 = false; +#endif fwd_params.is_causal = true; fwd_params.q_enable_seqlen = params.layout_q.use_seqlens; diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index ceed8d6bdd..85a3c8af83 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -379,7 +379,7 @@ PYBIND11_MODULE(_turbomind, m) model->setFfiLock(gil_control); return model; } - if (data_type == "bf16") { + else if (data_type == "bf16") { #ifdef ENABLE_BF16 auto model = std::make_shared>( tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 5096ac7c1a..6127fdf2db 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -46,7 +46,8 @@ std::shared_ptr AbstractTransformerModel::createLlamaM reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), model_dir); - }else if (data_type == "bf16") { + } + else if (data_type == "bf16") { #ifdef ENABLE_BF16 return std::make_shared>( reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), From 32237ff9fc9b364a78469b93d5ba842b832b778d Mon Sep 17 00:00:00 2001 From: grimoire Date: Thu, 14 Dec 2023 11:14:52 +0800 Subject: [PATCH 19/22] fix convert qwen --- lmdeploy/turbomind/deploy/converter.py | 19 ++++++++++++------- .../turbomind/deploy/target_model/base.py | 3 +++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index a250e905e5..7a56b141a3 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -121,6 +121,15 @@ def update_output_format(model_name: str, model_format: str, model_path: str, MODEL_NAME_MAP = {'qwen': 'bf16'} model_name = model_name.split('-')[0] + def _fix_device_support(output_format): + """fix device support.""" + if output_format == 'bf16': + if not 
torch.cuda.is_bf16_supported(): + # device does not support bf16 + print('Device does not support bf16.') + output_format = 'fp16' + return output_format + def _infer_output_format(config): """_infer_output_format.""" torch_dtype = getattr(config, 'torch_dtype', None) @@ -131,15 +140,11 @@ def _infer_output_format(config): # get model name prefix updated_output_format = MODEL_NAME_MAP.get(model_name, output_format) - if updated_output_format == 'bf16': - if not torch.cuda.is_bf16_supported(): - # device does not support bf16 - print('Device does not support bf16.') - updated_output_format = 'fp16' - return updated_output_format + return _fix_device_support(updated_output_format) if model_format != 'hf': - return MODEL_NAME_MAP.get(model_name, output_format) + updated_output_format = MODEL_NAME_MAP.get(model_name, output_format) + return _fix_device_support(updated_output_format) else: from transformers import AutoConfig config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 9009e17494..25f24aed52 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -144,8 +144,11 @@ def _tofile(tensor, path): tensor.contiguous().cpu().numpy().tofile(path) if self.to_file: + weight_type = self.cfg.weight_type if param.dtype in [torch.float]: param = param.half() + if weight_type == 'fp16' and param.dtype == torch.bfloat16: + param = param.half() tprint(name, param.shape) _tofile(param, osp.join(self.out_dir, name)) elif len(self.tm_params) > 0: From 706192f4c35d7aa44e6d608c22186f8d950129c1 Mon Sep 17 00:00:00 2001 From: grimoire Date: Thu, 14 Dec 2023 11:19:15 +0800 Subject: [PATCH 20/22] fix lint --- src/turbomind/models/llama/flash_attention2/flash_api.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/turbomind/models/llama/flash_attention2/flash_api.cpp b/src/turbomind/models/llama/flash_attention2/flash_api.cpp index 7a042b02fc..3fda17428c 100644 --- a/src/turbomind/models/llama/flash_attention2/flash_api.cpp +++ b/src/turbomind/models/llama/flash_attention2/flash_api.cpp @@ -127,9 +127,9 @@ class FlashAttentionOpImpl::impl { fwd_params.blockmask = reinterpret_cast(params.mask); #ifdef ENABLE_BF16 - fwd_params.is_bf16 = std::is_same::value; + fwd_params.is_bf16 = std::is_same::value; #else - fwd_params.is_bf16 = false; + fwd_params.is_bf16 = false; #endif fwd_params.is_causal = true; From f79e080524cc4f88ae8cd0739965d473dda739c4 Mon Sep 17 00:00:00 2001 From: grimoire Date: Thu, 14 Dec 2023 20:32:28 +0800 Subject: [PATCH 21/22] fix baichuan --- lmdeploy/turbomind/deploy/converter.py | 4 ++-- lmdeploy/turbomind/deploy/target_model/base.py | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 7a56b141a3..02a9957f58 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -118,7 +118,7 @@ def update_output_format(model_name: str, model_format: str, model_path: str, output_format: str): """Update output format according to model info.""" TORCH_DTYPE_MAP = {torch.bfloat16: 'bf16'} - MODEL_NAME_MAP = {'qwen': 'bf16'} + MODEL_NAME_MAP = {'qwen': 'bf16', 'llama': 'half'} model_name = model_name.split('-')[0] def _fix_device_support(output_format): @@ -142,7 +142,7 @@ def _infer_output_format(config): output_format) return 
_fix_device_support(updated_output_format) - if model_format != 'hf': + if model_format in MODEL_NAME_MAP: updated_output_format = MODEL_NAME_MAP.get(model_name, output_format) return _fix_device_support(updated_output_format) else: diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 25f24aed52..6f0564a32c 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -83,6 +83,16 @@ def valid(self): return True +_WEIGHT_DTYPE_MAP = dict( + w4=torch.float16, + int4=torch.float16, + half=torch.float16, + fp16=torch.float16, + fp32=torch.float16, + bf16=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16, +) + + class BaseOutputModel(ABC): """Base output model.""" @@ -144,11 +154,9 @@ def _tofile(tensor, path): tensor.contiguous().cpu().numpy().tofile(path) if self.to_file: - weight_type = self.cfg.weight_type - if param.dtype in [torch.float]: - param = param.half() - if weight_type == 'fp16' and param.dtype == torch.bfloat16: - param = param.half() + torch_type = _WEIGHT_DTYPE_MAP.get(self.cfg.weight_type, + torch.float16) + param = param.to(torch_type) tprint(name, param.shape) _tofile(param, osp.join(self.out_dir, name)) elif len(self.tm_params) > 0: From 0935ddb462361d0e1cf3986ee21f8f3e99383ad4 Mon Sep 17 00:00:00 2001 From: grimoire Date: Fri, 15 Dec 2023 12:04:43 +0800 Subject: [PATCH 22/22] update weight map --- lmdeploy/turbomind/deploy/target_model/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 6f0564a32c..a1ca79b98d 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -84,9 +84,7 @@ def valid(self): _WEIGHT_DTYPE_MAP = dict( - w4=torch.float16, int4=torch.float16, - half=torch.float16, fp16=torch.float16, fp32=torch.float16, bf16=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
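
A minimal sketch (illustrative only, not part of the patch series; it assumes `torch` is importable and that `cfg.weight_type` is one of the map keys) of how the weight-type mapping above is consumed when exporting tensors: each parameter is cast through the map, so a `bf16` deployment silently degrades to fp16 on GPUs without bfloat16 support, mirroring the `param.to(torch_type)` cast added to `export()` earlier in this series.

    # Sketch only: mirrors the mapping and the .get() fallback used in export().
    import torch

    _WEIGHT_DTYPE_MAP = dict(
        int4=torch.float16,
        fp16=torch.float16,
        fp32=torch.float16,
        bf16=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    )

    def cast_param(param: torch.Tensor, weight_type: str) -> torch.Tensor:
        # Unknown weight types fall back to fp16, matching the .get() default
        # in the patched export() method.
        return param.to(_WEIGHT_DTYPE_MAP.get(weight_type, torch.float16))

    # cast_param(torch.zeros(4, dtype=torch.float32), 'bf16') yields a bfloat16
    # tensor on devices that support it and a float16 tensor elsewhere.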