diff --git a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index 4e347f6dc7..3ecec478da 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -1,8 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -add_library(attention STATIC - attention.cu - decoding.cu +add_library(attention STATIC + attention.cu + decoding.cu kv_cache_utils.cu utils.cc attention_128_f16_sm80.cu diff --git a/src/turbomind/kernels/attention/arch.h b/src/turbomind/kernels/attention/arch.h index 1380499396..772852dae8 100644 --- a/src/turbomind/kernels/attention/arch.h +++ b/src/turbomind/kernels/attention/arch.h @@ -16,4 +16,4 @@ struct Sm80 { static constexpr int value = 80; }; -} // namespace turbomind::arch \ No newline at end of file +} // namespace turbomind::arch diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index ac1a181466..717540e620 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ b/src/turbomind/kernels/attention/attention.cu @@ -46,4 +46,4 @@ template void dispatchAttention(const AttentionParams& params); // template void dispatchAttention(const AttentionParams& params); // #endif -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/attention_128_bf16_sm80.cu b/src/turbomind/kernels/attention/attention_128_bf16_sm80.cu index 50a6d1cb31..84fdcb380e 100644 --- a/src/turbomind/kernels/attention/attention_128_bf16_sm80.cu +++ b/src/turbomind/kernels/attention/attention_128_bf16_sm80.cu @@ -8,4 +8,4 @@ namespace turbomind { using Kernel = typename attention::AttentionConfig::Kernel; template void invokeAttention(const typename Kernel::ParamType& params); -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/attention_128_f16_sm70.cu b/src/turbomind/kernels/attention/attention_128_f16_sm70.cu index 0f177b419c..2ed93c7e0f 100644 --- a/src/turbomind/kernels/attention/attention_128_f16_sm70.cu +++ b/src/turbomind/kernels/attention/attention_128_f16_sm70.cu @@ -9,4 +9,4 @@ using Kernel = typename attention::AttentionConfig(const typename Kernel::ParamType& params); -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/attention_128_f16_sm80.cu b/src/turbomind/kernels/attention/attention_128_f16_sm80.cu index 24059ded6f..92ed792a64 100644 --- a/src/turbomind/kernels/attention/attention_128_f16_sm80.cu +++ b/src/turbomind/kernels/attention/attention_128_f16_sm80.cu @@ -8,4 +8,4 @@ namespace turbomind { using Kernel = typename attention::AttentionConfig::Kernel; template void invokeAttention(const typename Kernel::ParamType& params); -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/attention_config.h b/src/turbomind/kernels/attention/attention_config.h index 5f1f988f9c..29dc0d1890 100644 --- a/src/turbomind/kernels/attention/attention_config.h +++ b/src/turbomind/kernels/attention/attention_config.h @@ -42,4 +42,4 @@ struct AttentionConfig { using Kernel = AttentionUniversal; }; -} // namespace turbomind::attention \ No newline at end of file +} // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/attention_template.h b/src/turbomind/kernels/attention/attention_template.h index fc8dedb3c6..09f1ab2aec 100644 --- a/src/turbomind/kernels/attention/attention_template.h +++ b/src/turbomind/kernels/attention/attention_template.h @@ -80,4 +80,4 @@ void invokeAttention(const typename Kernel::ParamType& params) } } -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/cta_map.h b/src/turbomind/kernels/attention/cta_map.h index 2500798427..ae8502503c 100644 --- a/src/turbomind/kernels/attention/cta_map.h +++ b/src/turbomind/kernels/attention/cta_map.h @@ -146,4 +146,4 @@ struct ReduceCtaMap { } }; -} // namespace turbomind::attention \ No newline at end of file +} // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 7107b1b115..5d2f1c0b53 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -143,4 +143,4 @@ void dispatchDecoding(const AttentionParams& params) template void dispatchDecoding(const AttentionParams& params); -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding.h b/src/turbomind/kernels/attention/decoding.h index 6c2ae7d223..9886ef9b2c 100644 --- a/src/turbomind/kernels/attention/decoding.h +++ b/src/turbomind/kernels/attention/decoding.h @@ -9,4 +9,4 @@ namespace turbomind { template void dispatchDecoding(const AttentionParams& params); -} \ No newline at end of file +} diff --git a/src/turbomind/kernels/attention/decoding_128_bf16_sm80.cu b/src/turbomind/kernels/attention/decoding_128_bf16_sm80.cu index c28fc30b26..70ceb21ba3 100644 --- a/src/turbomind/kernels/attention/decoding_128_bf16_sm80.cu +++ b/src/turbomind/kernels/attention/decoding_128_bf16_sm80.cu @@ -25,4 +25,4 @@ template void invokeDecoding(const typename sm80_bf16_s8_g using sm80_bf16_s8_g2_d128 = Decoding; template void invokeDecoding(const typename sm80_bf16_s8_g2_d128::ParamType& params); -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding_128_f16_sm70.cu b/src/turbomind/kernels/attention/decoding_128_f16_sm70.cu index 9c7819b6d2..268c9450f8 100644 --- a/src/turbomind/kernels/attention/decoding_128_f16_sm70.cu +++ b/src/turbomind/kernels/attention/decoding_128_f16_sm70.cu @@ -25,4 +25,4 @@ template void invokeDecoding(const typename sm70_f16_s8_g2_ using sm70_f16_s8_g4_d128 = Decoding; template void invokeDecoding(const typename sm70_f16_s8_g4_d128::ParamType& params); -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding_128_f16_sm80.cu b/src/turbomind/kernels/attention/decoding_128_f16_sm80.cu index 616200fb15..aca02ab302 100644 --- a/src/turbomind/kernels/attention/decoding_128_f16_sm80.cu +++ b/src/turbomind/kernels/attention/decoding_128_f16_sm80.cu @@ -25,4 +25,4 @@ template void invokeDecoding(const typename sm80_f16_s8_g1_ using sm80_f16_s8_g2_d128 = Decoding; template void invokeDecoding(const typename sm80_f16_s8_g2_d128::ParamType& params); -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/decoding_config.h b/src/turbomind/kernels/attention/decoding_config.h index 65f309bdf6..3197a71825 100644 --- a/src/turbomind/kernels/attention/decoding_config.h +++ b/src/turbomind/kernels/attention/decoding_config.h @@ -85,4 +85,4 @@ struct DecodingConfig { // using Kernel = AttentionUniversal; // }; -} // namespace turbomind::attention \ No newline at end of file +} // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/decoding_simt.h b/src/turbomind/kernels/attention/decoding_simt.h index 3861363e85..cf19d30a6a 100644 --- a/src/turbomind/kernels/attention/decoding_simt.h +++ b/src/turbomind/kernels/attention/decoding_simt.h @@ -559,4 +559,4 @@ struct Impl& d, const Array& a, const Array& d, const Array& a, const Array { } }; -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/iterator_sm80.h b/src/turbomind/kernels/attention/iterator_sm80.h index 93c185efc0..3810357b2b 100644 --- a/src/turbomind/kernels/attention/iterator_sm80.h +++ b/src/turbomind/kernels/attention/iterator_sm80.h @@ -90,4 +90,4 @@ struct Sm80GmemIterator: BaseGmemIterator { } }; -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/kv_cache_utils.cu b/src/turbomind/kernels/attention/kv_cache_utils.cu index 8f53863224..b5e475c3d7 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils.cu @@ -478,4 +478,4 @@ template void invokeFlattenKV(nv_bfloat16* k, cudaStream_t stream); #endif -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/kv_cache_utils.h b/src/turbomind/kernels/attention/kv_cache_utils.h index 7277725967..5558b0525c 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils.h +++ b/src/turbomind/kernels/attention/kv_cache_utils.h @@ -103,4 +103,4 @@ void invokeFlattenKV_(const AttentionParams& params, int sum_k_len) params.stream); } -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/mainloop.h b/src/turbomind/kernels/attention/mainloop.h index 713616c8b6..beb6cc4f29 100644 --- a/src/turbomind/kernels/attention/mainloop.h +++ b/src/turbomind/kernels/attention/mainloop.h @@ -7,4 +7,4 @@ namespace turbomind::attention { template struct Mainloop {}; -} // namespace turbomind::attention \ No newline at end of file +} // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/mainloop_sm70.h b/src/turbomind/kernels/attention/mainloop_sm70.h index 50f4d86cdd..0da6f7f348 100644 --- a/src/turbomind/kernels/attention/mainloop_sm70.h +++ b/src/turbomind/kernels/attention/mainloop_sm70.h @@ -139,4 +139,4 @@ struct Mainloop { } }; -} // namespace turbomind::attention \ No newline at end of file +} // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/mainloop_sm80.h b/src/turbomind/kernels/attention/mainloop_sm80.h index 1fa936d1cb..b84080d185 100644 --- a/src/turbomind/kernels/attention/mainloop_sm80.h +++ b/src/turbomind/kernels/attention/mainloop_sm80.h @@ -317,7 +317,7 @@ struct Mainloop, Impl_> { // Load : K0,K1 | V0,K2,V1,K3 ... // Compute : K0 | K1,V0,K2,V1 ... // Conclusion: - // - more reigster consumption (209 -> 250) + // - more register consumption (209 -> 250) // - more interleaved HMMA and FMA // - slight performance gain template @@ -457,4 +457,4 @@ struct Mainloop, Impl_> { } }; -} // namespace turbomind::attention \ No newline at end of file +} // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/quantization.h b/src/turbomind/kernels/attention/quantization.h index 2e1197686f..9c89130e9a 100644 --- a/src/turbomind/kernels/attention/quantization.h +++ b/src/turbomind/kernels/attention/quantization.h @@ -391,4 +391,4 @@ __device__ void permute_V(Array (&x)[Map::kIterS][Map::k } } -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/reduce.cu b/src/turbomind/kernels/attention/reduce.cu index e20e399ed4..9dd9c71e78 100644 --- a/src/turbomind/kernels/attention/reduce.cu +++ b/src/turbomind/kernels/attention/reduce.cu @@ -20,4 +20,4 @@ void dispatchReduce(T* out, { } -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/reduce.h b/src/turbomind/kernels/attention/reduce.h index 1f807aa189..b6d5be966b 100644 --- a/src/turbomind/kernels/attention/reduce.h +++ b/src/turbomind/kernels/attention/reduce.h @@ -206,4 +206,4 @@ struct Reduce { } }; -} // namespace turbomind::attention \ No newline at end of file +} // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/reduce_template.h b/src/turbomind/kernels/attention/reduce_template.h index 7ac134f0f4..69d91e2dda 100644 --- a/src/turbomind/kernels/attention/reduce_template.h +++ b/src/turbomind/kernels/attention/reduce_template.h @@ -99,4 +99,4 @@ void dispatchReduce(T* out, invoke(std::true_type{}, stride_k); } -} // namespace turbomind::attention \ No newline at end of file +} // namespace turbomind::attention diff --git a/src/turbomind/kernels/attention/reference.cu b/src/turbomind/kernels/attention/reference.cu index 3dfee23e23..d06e0f37a5 100644 --- a/src/turbomind/kernels/attention/reference.cu +++ b/src/turbomind/kernels/attention/reference.cu @@ -331,4 +331,4 @@ template class Reference; template class Reference; #endif -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/reference.h b/src/turbomind/kernels/attention/reference.h index 05a394909c..900e6c675b 100644 --- a/src/turbomind/kernels/attention/reference.h +++ b/src/turbomind/kernels/attention/reference.h @@ -67,4 +67,4 @@ class Reference { int batch_size_{}; }; -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/smem_layout.h b/src/turbomind/kernels/attention/smem_layout.h index 5134c8c323..9407471b22 100644 --- a/src/turbomind/kernels/attention/smem_layout.h +++ b/src/turbomind/kernels/attention/smem_layout.h @@ -127,4 +127,4 @@ struct SmemAccessor { // } }; -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/test_attention.cu b/src/turbomind/kernels/attention/test_attention.cu index 94bf785923..7f8b30cf0d 100644 --- a/src/turbomind/kernels/attention/test_attention.cu +++ b/src/turbomind/kernels/attention/test_attention.cu @@ -522,4 +522,4 @@ int main(int argc, char* argv[]) // test_attention(); test_attention(); -} \ No newline at end of file +} diff --git a/src/turbomind/kernels/attention/utils.cc b/src/turbomind/kernels/attention/utils.cc index feed1e22a8..aabbecccbd 100644 --- a/src/turbomind/kernels/attention/utils.cc +++ b/src/turbomind/kernels/attention/utils.cc @@ -45,4 +45,4 @@ int GetSplitCount( return std::get(best); } -} // namespace turbomind \ No newline at end of file +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/utils.h b/src/turbomind/kernels/attention/utils.h index 1681d28fd6..e43a78598c 100644 --- a/src/turbomind/kernels/attention/utils.h +++ b/src/turbomind/kernels/attention/utils.h @@ -10,4 +10,4 @@ int GetSplitCount(int max_split_cnt, float alpha = 1, float beta = 1e-3); -} \ No newline at end of file +} diff --git a/src/turbomind/kernels/flash_attention/CMakeLists.txt b/src/turbomind/kernels/flash_attention/CMakeLists.txt index 85749aa0b2..e65ffbe0d5 100644 --- a/src/turbomind/kernels/flash_attention/CMakeLists.txt +++ b/src/turbomind/kernels/flash_attention/CMakeLists.txt @@ -10,4 +10,4 @@ target_link_libraries(flash_attention PRIVATE llama_fmha) if (NOT MSVC) add_subdirectory(flash_attention2) target_link_libraries(flash_attention PRIVATE flash_attention2) -endif() \ No newline at end of file +endif() diff --git a/src/turbomind/kernels/flash_attention/flash_attention.cu b/src/turbomind/kernels/flash_attention/flash_attention.cu index 49d70a872c..cbe0bb8bc1 100644 --- a/src/turbomind/kernels/flash_attention/flash_attention.cu +++ b/src/turbomind/kernels/flash_attention/flash_attention.cu @@ -66,4 +66,4 @@ template class FlashAttentionOp; template class FlashAttentionOp<__nv_bfloat16>; #endif -} // namespace turbomind \ No newline at end of file +} // namespace turbomind