diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu
index 5d2f1c0b53..98d20b5ac7 100644
--- a/src/turbomind/kernels/attention/decoding.cu
+++ b/src/turbomind/kernels/attention/decoding.cu
@@ -95,7 +95,6 @@ void dispatchDecoding(const AttentionParams& params)
     FT_CHECK(0);
 }
 
-
 template<>
 void dispatchDecoding(const AttentionParams& params)
 {
@@ -113,10 +112,12 @@ void dispatchDecoding(const AttentionParams& params)
     if (params.arch >= 80) {
         if (0) {}
         else if (query_group_sz % 2 == 0) {
-            return invokeDecoding::Kernel>(params);
+            return invokeDecoding::Kernel>(
+                params);
         }
         else {
-            return invokeDecoding::Kernel>(params);
+            return invokeDecoding::Kernel>(
+                params);
         }
     }
 }
@@ -124,16 +125,20 @@ void dispatchDecoding(const AttentionParams& params)
     if (params.arch >= 80) {
         if (0) {}
         else if (query_group_sz % 8 == 0) {
-            return invokeDecoding::Kernel>(params);
+            return invokeDecoding<
+                typename DecodingConfig::Kernel>(params);
         }
         else if (query_group_sz % 4 == 0) {
-            return invokeDecoding::Kernel>(params);
+            return invokeDecoding<
+                typename DecodingConfig::Kernel>(params);
         }
         else if (query_group_sz % 2 == 0) {
-            return invokeDecoding::Kernel>(params);
+            return invokeDecoding<
+                typename DecodingConfig::Kernel>(params);
        }
         else {
-            return invokeDecoding::Kernel>(params);
+            return invokeDecoding<
+                typename DecodingConfig::Kernel>(params);
         }
     }
 }
diff --git a/src/turbomind/kernels/attention/decoding_simt.h b/src/turbomind/kernels/attention/decoding_simt.h
index cf19d30a6a..ca63b92877 100644
--- a/src/turbomind/kernels/attention/decoding_simt.h
+++ b/src/turbomind/kernels/attention/decoding_simt.h
@@ -208,7 +208,7 @@ struct Impl
-struct Sm80_16816: Arch<80> {};
+struct Sm80_16816: Arch<80> {
+};
 
-struct Sm80_81616: Arch<80> {};
+struct Sm80_81616: Arch<80> {
+};
 
-struct Sm75_1688: Arch<75, 80> {};
+struct Sm75_1688: Arch<75, 80> {
+};
 
-struct Sm70_884: Arch<70, 75> {};
+struct Sm70_884: Arch<70, 75> {
+};
 
-struct Sm70_Simt: Arch<70> {};
+struct Sm70_Simt: Arch<70> {
+};
 
 template
-struct Impl {};
+struct Impl {
+};
 
 } // namespace attention
diff --git a/src/turbomind/kernels/attention/impl_sm70.h b/src/turbomind/kernels/attention/impl_sm70.h
index 3b62ee9082..aa241fef44 100644
--- a/src/turbomind/kernels/attention/impl_sm70.h
+++ b/src/turbomind/kernels/attention/impl_sm70.h
@@ -151,7 +151,7 @@ struct Impl
-                    ((Func&&)func)(0, qi, di, (Array&)frag_O[m][n][d1 * 4 + q * 2]);
+                    ((Func &&) func)(0, qi, di, (Array&)frag_O[m][n][d1 * 4 + q * 2]);
                 }
             }
         }
diff --git a/src/turbomind/kernels/attention/impl_sm80.h b/src/turbomind/kernels/attention/impl_sm80.h
index ba9962975f..8030d834c9 100644
--- a/src/turbomind/kernels/attention/impl_sm80.h
+++ b/src/turbomind/kernels/attention/impl_sm80.h
@@ -224,7 +224,7 @@ struct Impl
-struct Sm80_CpAsync {};
+struct Sm80_CpAsync {
+};
 
 template
 struct Mainloop, Impl_> {
@@ -56,17 +57,17 @@ struct Mainloop, Impl_> {
     template
     __device__ void operator()(Args&&... args)
     {
-        Run(Sm80_CpAsync{}, ((Args&&)args)...);
+        Run(Sm80_CpAsync{}, ((Args &&) args)...);
     }
 
     template
     __device__ static decltype(auto) Select(A&& a, B&& b)
     {
         if constexpr (Idx) {
-            return (B&&)b;
+            return (B &&) b;
         }
         else {
-            return (A&&)a;
+            return (A &&) a;
         }
     }
diff --git a/src/turbomind/kernels/attention/reference.h b/src/turbomind/kernels/attention/reference.h
index 900e6c675b..8d543cb1fa 100644
--- a/src/turbomind/kernels/attention/reference.h
+++ b/src/turbomind/kernels/attention/reference.h
@@ -16,7 +16,8 @@ void invokeApplyRotaryEmbedding(
 template
 class Reference {
 public:
-    enum Type {
+    enum Type
+    {
         kUNFUSED,
         kFLASH_ATTENTION
     };
diff --git a/src/turbomind/kernels/attention/test_attention.cu b/src/turbomind/kernels/attention/test_attention.cu
index 7f8b30cf0d..43201d608a 100644
--- a/src/turbomind/kernels/attention/test_attention.cu
+++ b/src/turbomind/kernels/attention/test_attention.cu
@@ -206,7 +206,7 @@ int test_attention()
     constexpr size_t kSequenceLen = 0;
     constexpr int kMaxSplitK = 1;
 
-    constexpr int kBlockSz = 128;
+    constexpr int kBlockSz    = 128;
 
 #endif
diff --git a/src/turbomind/kernels/attention/test_utils.cu b/src/turbomind/kernels/attention/test_utils.cu
index 04b8fe8899..857b85fb6b 100644
--- a/src/turbomind/kernels/attention/test_utils.cu
+++ b/src/turbomind/kernels/attention/test_utils.cu
@@ -56,10 +56,10 @@ template void Compare(const half* src, const half* ref, size_t stride, int m, in
 template void Compare(const float* src, const float* ref, size_t stride, int m, int n, bool show, float rtol, float atol);
 
 #if ENABLE_BF16
-template void Compare(const nv_bfloat16* src, const nv_bfloat16* ref, size_t stride, int m, int n, bool show, float rtol, float atol);
+template void
+Compare(const nv_bfloat16* src, const nv_bfloat16* ref, size_t stride, int m, int n, bool show, float rtol, float atol);
 #endif
 
-
 void LoadBinary(const std::string& path, size_t size, void* dst)
 {
     std::ifstream ifs(path, std::ios::binary | std::ios::in);
@@ -181,7 +181,6 @@ template void RNG::GenerateNormal(float* out, size_t count, float scale, float s
 template void RNG::GenerateNormal(nv_bfloat16* out, size_t count, float scale, float shift);
 #endif
 
-
 template
 struct SATypeConverter {
     using Type = T;
diff --git a/src/turbomind/kernels/flash_attention/fused_multi_head_attention/llama_flash_attention_kernel.cu b/src/turbomind/kernels/flash_attention/fused_multi_head_attention/llama_flash_attention_kernel.cu
index 01d4f84a8e..f9ffd756b1 100644
--- a/src/turbomind/kernels/flash_attention/fused_multi_head_attention/llama_flash_attention_kernel.cu
+++ b/src/turbomind/kernels/flash_attention/fused_multi_head_attention/llama_flash_attention_kernel.cu
@@ -15,7 +15,8 @@ namespace turbomind {
 
 template
-struct ToCutlassType_ {};
+struct ToCutlassType_ {
+};
 
 template<>
 struct ToCutlassType_ {
diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu
index 2202f35a04..57e00ed7f1 100644
--- a/src/turbomind/models/llama/llama_utils.cu
+++ b/src/turbomind/models/llama/llama_utils.cu
@@ -33,7 +33,6 @@ struct abs_diff_t<__nv_bfloat16> {
     using type = float;
 };
 
-
 template
 struct abs_diff: public thrust::unary_function, typename abs_diff_t::type> {
     __host__ __device__ float operator()(thrust::tuple x) const