diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp
index 80cce89560f70..30632f20b61c0 100644
--- a/csrc/core/math.hpp
+++ b/csrc/core/math.hpp
@@ -6,6 +6,7 @@
 inline uint32_t next_pow_2(uint32_t const num) {
   if (num <= 1) return num;
   return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+}
 
 template <typename A, typename B>
 static inline constexpr auto div_ceil(A a, B b) {
diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu
index bac41e65e7b30..33610a898af61 100644
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -107,10 +107,9 @@ __global__ void act_and_mul_quant_kernel(
           scale.data_ptr<float>(), d);                              \
         });
 
-void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
-                        torch::Tensor& input,  // [..., 2 * d]
-                        torch::Tensor& scale)
-{
+void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
+                        torch::Tensor& input,  // [..., 2 * d]
+                        torch::Tensor& scale) {
   TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn);
   TORCH_CHECK(input.dtype() == torch::kFloat16 ||
               input.dtype() == torch::kBFloat16);
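
For context, here is a minimal host-side sketch of what `silu_and_mul_quant` is expected to compute: SiLU applied to the first half of the last dimension, multiplied elementwise by the second half, then scaled and saturated to the FP8 E4M3 range. This is an illustration only; the helper name `silu_and_mul_quant_ref`, the `FP8_E4M3_MAX` constant, the divide-by-scale convention, and the saturating-float stand-in for a real FP8 conversion are assumptions, not part of the patch.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Reference semantics (assumed): for each token row,
//   out[i] = quantize(silu(input[i]) * input[d + i]),
// where input has 2 * d columns and out has d columns.
std::vector<float> silu_and_mul_quant_ref(const std::vector<float>& input,
                                          int64_t num_tokens, int64_t d,
                                          float scale) {
  constexpr float FP8_E4M3_MAX = 448.0f;  // largest finite e4m3 value
  std::vector<float> out(num_tokens * d);
  for (int64_t t = 0; t < num_tokens; ++t) {
    for (int64_t i = 0; i < d; ++i) {
      float x = input[t * 2 * d + i];      // gate half
      float y = input[t * 2 * d + d + i];  // up half
      float silu = x / (1.0f + std::exp(-x));
      float v = silu * y / scale;          // assumed scale convention
      // A real kernel would convert to fp8_e4m3 here; we only saturate.
      out[t * d + i] = std::clamp(v, -FP8_E4M3_MAX, FP8_E4M3_MAX);
    }
  }
  return out;
}
```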