From ce9f9941cc29e9d14001395dd631df563b79b2f0 Mon Sep 17 00:00:00 2001
From: stijn
Date: Tue, 20 Jun 2023 12:12:19 +0200
Subject: [PATCH 01/50] Initial rewrite to support N-dimensional tensors

---
 include/kernel_float.h           |  26 +-
 include/kernel_float/base.h      | 328 ++++++++++++++++++++
 include/kernel_float/bf16.h      | 214 -------------
 include/kernel_float/binops.h    | 225 ++++++--------
 include/kernel_float/broadcast.h | 237 +++++++++++++++
 include/kernel_float/cast.h      | 203 -------------
 include/kernel_float/complex.h   | 259 ++++++++++++++++
 include/kernel_float/fp16.h      | 188 ------------
 include/kernel_float/fp8.h       |   0
 include/kernel_float/interface.h | 294 ------------------
 include/kernel_float/iterate.h   | 175 -----------
 include/kernel_float/macros.h    |  12 +-
 include/kernel_float/meta.h      | 183 ++++++-----
 include/kernel_float/reduce.h    |  49 ++-
 include/kernel_float/storage.h   | 503 -------------------------------
 include/kernel_float/swizzle.h   | 218 --------------
 include/kernel_float/tensor.h    | 278 +++++++++++++++++
 include/kernel_float/unops.h     | 129 ++++----
 18 files changed, 1392 insertions(+), 2129 deletions(-)
 create mode 100644 include/kernel_float/base.h
 delete mode 100644 include/kernel_float/bf16.h
 create mode 100644 include/kernel_float/broadcast.h
 delete mode 100644 include/kernel_float/cast.h
 create mode 100644 include/kernel_float/complex.h
 delete mode 100644 include/kernel_float/fp16.h
 delete mode 100644 include/kernel_float/fp8.h
 delete mode 100644 include/kernel_float/interface.h
 delete mode 100644 include/kernel_float/iterate.h
 delete mode 100644 include/kernel_float/storage.h
 delete mode 100644 include/kernel_float/swizzle.h
 create mode 100644 include/kernel_float/tensor.h

diff --git a/include/kernel_float.h b/include/kernel_float.h
index 93ada9c..14be925 100644
--- a/include/kernel_float.h
+++ b/include/kernel_float.h
@@ -1,18 +1,24 @@
 #ifndef KERNEL_FLOAT_H
 #define KERNEL_FLOAT_H
 
-#include "kernel_float/bf16.h"
-#include "kernel_float/binops.h"
-#include "kernel_float/cast.h"
-#include "kernel_float/fp16.h"
-#include "kernel_float/fp8.h"
-#include "kernel_float/interface.h"
-#include "kernel_float/iterate.h"
+//#include "kernel_float/bf16.h"
+//#include "kernel_float/binops.h"
+//#include "kernel_float/broadcast.h"
+//#include "kernel_float/fp16.h"
+//#include "kernel_float/fp8.h"
+//#include "kernel_float/interface.h"
+//#include "kernel_float/iterate.h"
+//#include "kernel_float/macros.h"
+//#include "kernel_float/meta.h"
+//#include "kernel_float/reduce.h"
+//#include "kernel_float/storage.h"
+//#include "kernel_float/swizzle.h"
+//#include "kernel_float/unops.h"
+
+#include "kernel_float/base.h"
 #include "kernel_float/macros.h"
 #include "kernel_float/meta.h"
-#include "kernel_float/reduce.h"
-#include "kernel_float/storage.h"
-#include "kernel_float/swizzle.h"
+#include "kernel_float/tensor.h"
 #include "kernel_float/unops.h"
 
 #endif
\ No newline at end of file
diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h
new file mode 100644
index 0000000..1563ac7
--- /dev/null
+++ b/include/kernel_float/base.h
@@ -0,0 +1,328 @@
+#ifndef KERNEL_FLOAT_BASE
+#define KERNEL_FLOAT_BASE
+
+#include "macros.h"
+#include "meta.h"
+
+namespace kernel_float {
+
+template<typename T, size_t N, size_t Alignment = alignof(T)>
+struct alignas(Alignment) array {
+    KERNEL_FLOAT_INLINE
+    T* data() {
+        return items_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    const T* data() const {
+        return items_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    T& operator[](size_t i) {
+        return items_[i];
+    }
+
+    KERNEL_FLOAT_INLINE
+    const T& operator[](size_t i) const {
+        return items_[i];
+    }
+
+    T items_[N];
+};
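The explicit `Alignment` parameter on this primary template is what `tensor_storage` (defined further down) uses to over-align small arrays for vectorized memory access. A minimal standalone sketch of what the `alignas` buys you, assuming only this header:

```cpp
#include "kernel_float/base.h"

using kernel_float::array;

// The third template parameter controls the alignment of the whole aggregate.
// Four packed floats with a requested 16-byte alignment have the same size and
// alignment as CUDA's built-in float4, so a single 128-bit load/store can be used.
static_assert(sizeof(array<float, 4, 16>) == 16, "four packed floats");
static_assert(alignof(array<float, 4, 16>) == 16, "over-aligned via alignas");
```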
+
+template<typename T, size_t Alignment>
+struct array<T, 1, Alignment> {
+    KERNEL_FLOAT_INLINE
+    array(T value = {}) : value_(value) {}
+
+    KERNEL_FLOAT_INLINE
+    operator T() const {
+        return value_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    T* data() {
+        return &value_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    const T* data() const {
+        return &value_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    T& operator[](size_t) {
+        return value_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    const T& operator[](size_t) const {
+        return value_;
+    }
+
+    T value_;
+};
+
+template<typename T, size_t Alignment>
+struct array<T, 0, Alignment> {
+    KERNEL_FLOAT_INLINE
+    T* data() {
+        while (true)
+            ;
+    }
+
+    KERNEL_FLOAT_INLINE
+    const T* data() const {
+        while (true)
+            ;
+    }
+
+    KERNEL_FLOAT_INLINE
+    T& operator[](size_t i) {
+        while (true)
+            ;
+    }
+
+    KERNEL_FLOAT_INLINE
+    const T& operator[](size_t i) const {
+        while (true)
+            ;
+    }
+};
+
+template<size_t N>
+using ndindex = array<size_t, N>;
+
+KERNEL_FLOAT_INLINE
+static constexpr size_t compute_max_alignment(size_t total_size, size_t min_align) {
+    if (total_size % 32 == 0 || min_align >= 32) {
+        return 32;
+    } else if (total_size % 16 == 0 || min_align == 16) {
+        return 16;
+    } else if (total_size % 8 == 0 || min_align == 8) {
+        return 8;
+    } else if (total_size % 4 == 0 || min_align == 4) {
+        return 4;
+    } else if (total_size % 2 == 0 || min_align == 2) {
+        return 2;
+    } else {
+        return 1;
+    }
+}
+
+template<typename T, size_t N>
+using tensor_storage = array<T, N, compute_max_alignment(sizeof(T) * N, alignof(T))>;
+
+template<typename T, typename D, template<typename, size_t> class S = tensor_storage>
+struct tensor;
+
+template<size_t... Ns>
+struct extents;
+
+template<>
+struct extents<> {
+    static constexpr size_t rank = 0;
+    static constexpr size_t volume = 1;
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis) {
+        return 1;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t stride(size_t axis) {
+        return 1;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static size_t ravel_index(ndindex<0>) {
+        return 0;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static ndindex<0> unravel_index(size_t i) {
+        return {};
+    }
+};
+
+template<size_t N>
+struct extents<N> {
+    static constexpr size_t rank = 1;
+    static constexpr size_t volume = N;
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis) {
+        return axis == 0 ? N : 1;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t stride(size_t axis) {
+        return 1;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static size_t ravel_index(ndindex<1> ind) {
+        return ind[0];
+    }
+
+    KERNEL_FLOAT_INLINE
+    static ndindex<1> unravel_index(size_t i) {
+        return {i};
+    }
+};
+
+template<size_t N, size_t M>
+struct extents<N, M> {
+    static constexpr size_t rank = 2;
+    static constexpr size_t volume = N * M;
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis) {
+        return axis == 0 ? N : axis == 1 ? M : 1;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t stride(size_t axis) {
+        return axis == 0 ? M : 1;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static size_t ravel_index(ndindex<2> x) {
+        return x[0] * M + x[1];
+    }
+
+    KERNEL_FLOAT_INLINE
+    static ndindex<2> unravel_index(size_t i) {
+        return {i / M, i % M};
+    }
+};
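The two-dimensional specialization above commits to a row-major layout: `stride(0) == M` and `stride(1) == 1`. A small sketch of the resulting index arithmetic (only the `constexpr` members can be checked at compile time; `ravel_index` is not `constexpr` here):

```cpp
#include "kernel_float/base.h"

using E = kernel_float::extents<2, 3>;

static_assert(E::rank == 2 && E::volume == 6, "2 x 3 elements");
static_assert(E::stride(0) == 3 && E::stride(1) == 1, "row-major strides");

// At runtime: element (1, 2) lives at flat offset 1 * 3 + 2 == 5,
// and unravel_index(5) recovers {1, 2}.
```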
+
+template<size_t N, size_t M, size_t K>
+struct extents<N, M, K> {
+    static constexpr size_t rank = 3;
+    static constexpr size_t volume = N * M * K;
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t size(size_t axis) {
+        return axis == 0 ? N : axis == 1 ? M : axis == 2 ? K : 1;
+    }
+
+    KERNEL_FLOAT_INLINE
+    static constexpr size_t stride(size_t axis) {
+        return axis == 0 ? M * K  //
+             : axis == 1 ? K      //
+                         : 1;     //
+    }
+
+    KERNEL_FLOAT_INLINE
+    static size_t ravel_index(ndindex<3> x) {
+        return (x[0] * M + x[1]) * K + x[2];
+    }
+
+    KERNEL_FLOAT_INLINE
+    static ndindex<3> unravel_index(size_t i) {
+        return {i / (K * M), (i / K) % M, i % K};
+    }
+};
+
+template<typename V>
+struct into_tensor_traits;
+
+template<typename V>
+struct into_tensor_traits<const V> {
+    using type = typename into_tensor_traits<V>::type;
+
+    KERNEL_FLOAT_INLINE
+    static type call(const V input) {
+        return into_tensor_traits<V>::call(input);
+    }
+};
+
+template<typename V>
+struct into_tensor_traits<V&> {
+    using type = typename into_tensor_traits<V>::type;
+
+    KERNEL_FLOAT_INLINE
+    static type call(V& input) {
+        return into_tensor_traits<V>::call(input);
+    }
+};
+
+template<typename V>
+struct into_tensor_traits<const V&> {
+    using type = typename into_tensor_traits<V>::type;
+
+    KERNEL_FLOAT_INLINE
+    static type call(const V& input) {
+        return into_tensor_traits<V>::call(input);
+    }
+};
+
+template<typename V>
+struct into_tensor_traits<V&&> {
+    using type = typename into_tensor_traits<V>::type;
+
+    KERNEL_FLOAT_INLINE
+    static type call(V&& input) {
+        return into_tensor_traits<V>::call(std::move(input));
+    }
+};
+
+template<typename T, typename D, template<typename, size_t> class S>
+struct into_tensor_traits<tensor<T, D, S>> {
+    using type = tensor<T, D>;
+
+    KERNEL_FLOAT_INLINE
+    static type call(const tensor<T, D, S>& input) {
+        return input;
+    }
+};
+
+template<typename T, size_t N>
+struct into_tensor_traits<array<T, N>> {
+    using type = tensor<T, extents<N>>;
+
+    KERNEL_FLOAT_INLINE
+    static type call(const array<T, N>& input) {
+        return input;
+    }
+};
+
+template<typename T>
+struct tensor_traits;
+
+template<typename T, typename D, template<typename, size_t> class S>
+struct tensor_traits<tensor<T, D, S>> {
+    using value_type = T;
+    using extents_type = D;
+    using storage_type = S<T, D::volume>;
+};
+
+template<typename V>
+using into_tensor_type = typename into_tensor_traits<V>::type;
+
+template<typename V>
+KERNEL_FLOAT_INLINE into_tensor_type<V> into_tensor(V&& input) {
+    return into_tensor_traits<V>::call(std::forward<V>(input));
+}
+
+template<typename V>
+using tensor_extents = typename tensor_traits<into_tensor_type<V>>::extents_type;
+
+template<typename V>
+static constexpr size_t tensor_rank = tensor_extents<V>::rank;
+
+template<typename V>
+static constexpr size_t tensor_volume = tensor_extents<V>::volume;
+
+template<typename V>
+using tensor_value_type = typename tensor_traits<into_tensor_type<V>>::value_type;
+
+template<typename... Vs>
+using tensor_promoted_value_type =
+    promote_t<typename tensor_traits<into_tensor_type<Vs>>::value_type...>;
+
+} // namespace kernel_float
+
+#endif
\ No newline at end of file
diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h
deleted file mode 100644
index 8406615..0000000
--- a/include/kernel_float/bf16.h
+++ /dev/null
@@ -1,214 +0,0 @@
-#ifndef KERNEL_FLOAT_BF16_H
-#define KERNEL_FLOAT_BF16_H
-
-#include "macros.h"
-
-#if KERNEL_FLOAT_BF16_AVAILABLE
-#include <cuda_bf16.h>
-
-#include "binops.h"
-#include "cast.h"
-#include "interface.h"
-#include "storage.h"
-#include "unops.h"
-
-namespace kernel_float {
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(__nv_bfloat16, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(float, __nv_bfloat16)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, __nv_bfloat16)
-
-template<>
-struct vector_traits<__nv_bfloat162> {
-    using value_type = __nv_bfloat16;
-    static constexpr size_t size = 2;
-
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat162 fill(__nv_bfloat16 value) {
-#if KERNEL_FLOAT_ON_DEVICE
-        return __bfloat162bfloat162(value);
-#else
-        return {value, value};
-#endif
-    }
-
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat162 create(__nv_bfloat16 low, __nv_bfloat16 high) {
-#if KERNEL_FLOAT_ON_DEVICE
-        return __halves2bfloat162(low, high);
-#else
-        return {low, high};
-#endif
-    }
-
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat16 get(__nv_bfloat162 self, size_t index) {
-#if KERNEL_FLOAT_ON_DEVICE
-        if (index == 0) {
-            return __low2bfloat16(self);
-        } else {
-            return
__high2bfloat16(self); - } -#else - if (index == 0) { - return self.x; - } else { - return self.y; - } -#endif - } - - KERNEL_FLOAT_INLINE - static void set(__nv_bfloat162& self, size_t index, __nv_bfloat16 value) { - if (index == 0) { - self.x = value; - } else { - self.y = value; - } - } -}; - -template -struct default_storage<__nv_bfloat16, N, Alignment::Maximum, enabled_t<(N >= 2)>> { - using type = nested_array<__nv_bfloat162, N>; -}; - -template -struct default_storage<__nv_bfloat16, N, Alignment::Packed, enabled_t<(N >= 2 && N % 2 == 0)>> { - using type = nested_array<__nv_bfloat162, N>; -}; - -#if KERNEL_FLOAT_ON_DEVICE -#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_helper, __nv_bfloat162, __nv_bfloat162> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ - return FUN2(input); \ - } \ - }; \ - } - -KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, ::h2exp); -KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); - -#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 \ - operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_helper, __nv_bfloat162, __nv_bfloat162, __nv_bfloat162> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) { \ - return FUN2(left, right); \ - } \ - }; \ - } - -KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2) -KERNEL_FLOAT_BF16_BINARY_FUN(subtract, __hsub, __hsub2) -KERNEL_FLOAT_BF16_BINARY_FUN(multiply, __hmul, __hmul2) -KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div) -KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2) -KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2) - -KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2) -KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __heq, __heq2) -KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2) -KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2) -KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2) -KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2) - -#endif - -#define KERNEL_FLOAT_BF16_CAST(T, TO_HALF, FROM_HALF) \ - namespace ops { \ - template<> \ - struct cast { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(T input) { \ - return TO_HALF; \ - } \ - }; \ - template<> \ - struct cast<__nv_bfloat16, T> { \ - KERNEL_FLOAT_INLINE T operator()(__nv_bfloat16 input) { \ - return FROM_HALF; \ - } \ - }; \ - } - -KERNEL_FLOAT_BF16_CAST(double, __double2bfloat16(input), 
double(__bfloat162float(input)));
-KERNEL_FLOAT_BF16_CAST(float, __float2bfloat16(input), __bfloat162float(input));
-
-// there are no official char casts. Instead, cast to int and then to char
-KERNEL_FLOAT_BF16_CAST(char, __int2bfloat16_rn(input), (char)__bfloat162int_rz(input));
-KERNEL_FLOAT_BF16_CAST(
-    signed char,
-    __int2bfloat16_rn(input),
-    (signed char)__bfloat162int_rz(input));
-KERNEL_FLOAT_BF16_CAST(
-    unsigned char,
-    __int2bfloat16_rn(input),
-    (unsigned char)__bfloat162int_rz(input));
-
-KERNEL_FLOAT_BF16_CAST(signed short, __bfloat162short_rz(input), __short2bfloat16_rn(input));
-KERNEL_FLOAT_BF16_CAST(signed int, __bfloat162int_rz(input), __int2bfloat16_rn(input));
-KERNEL_FLOAT_BF16_CAST(
-    signed long,
-    __ll2bfloat16_rn(input),
-    (signed long)(__bfloat162ll_rz(input)));
-KERNEL_FLOAT_BF16_CAST(signed long long, __ll2bfloat16_rn(input), __bfloat162ll_rz(input));
-
-KERNEL_FLOAT_BF16_CAST(unsigned short, __bfloat162ushort_rz(input), __ushort2bfloat16_rn(input));
-KERNEL_FLOAT_BF16_CAST(unsigned int, __bfloat162uint_rz(input), __uint2bfloat16_rn(input));
-KERNEL_FLOAT_BF16_CAST(
-    unsigned long,
-    __ull2bfloat16_rn(input),
-    (unsigned long)(__bfloat162ull_rz(input)));
-KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input));
-
-using bfloat16 = __nv_bfloat16;
-//KERNEL_FLOAT_TYPE_ALIAS(half, __nv_bfloat16)
-//KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16)
-//KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16)
-
-} // namespace kernel_float
-
-#if KERNEL_FLOAT_FP16_AVAILABLE
-#include "fp16.h"
-
-namespace kernel_float {
-KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input));
-}
-
-#endif // KERNEL_FLOAT_FP16_AVAILABLE
-#endif
-
-#endif //KERNEL_FLOAT_BF16_H
diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h
index 6b9bf69..4252c4e 100644
--- a/include/kernel_float/binops.h
+++ b/include/kernel_float/binops.h
@@ -1,148 +1,98 @@
 #ifndef KERNEL_FLOAT_BINOPS_H
 #define KERNEL_FLOAT_BINOPS_H
 
+#include "broadcast.h"
 #include "unops.h"
 
 namespace kernel_float {
 namespace detail {
-template<typename F, typename Output, typename Left, typename Right>
-struct zip_helper {
-    KERNEL_FLOAT_INLINE static Output call(F fun, const Left& left, const Right& right) {
-        return call_with_indices(fun, left, right, make_index_sequence<vector_size<Output>> {});
-    }
-
-  private:
-    template<size_t... Is>
-    KERNEL_FLOAT_INLINE static Output
-    call_with_indices(F fun, const Left& left, const Right& right, index_sequence<Is...> = {}) {
-        return vector_traits<Output>::create(fun(vector_get<Is>(left), vector_get<Is>(right))...);
-    }
-};
-
-template<typename F, typename V, size_t N>
-struct zip_helper<F, nested_array<V, N>, nested_array<V, N>, nested_array<V, N>> {
-    KERNEL_FLOAT_INLINE static nested_array<V, N>
-    call(F fun, const nested_array<V, N>& left, const nested_array<V, N>& right) {
-        return call(fun, left, right, make_index_sequence<nested_array<V, N>::num_packets> {});
+template<typename F, typename O, typename A, typename B, size_t N>
+struct zip_helper {
+    KERNEL_FLOAT_INLINE static tensor_storage<O, N>
+    call(F fun, const tensor_storage<A, N>& left, const tensor_storage<B, N>& right) {
+        return call(fun, left, right, make_index_sequence<N> {});
     }
 
  private:
    template<size_t... Is>
-    KERNEL_FLOAT_INLINE static nested_array<V, N> call(
+    KERNEL_FLOAT_INLINE static tensor_storage<O, N> call(
        F fun,
-        const nested_array<V, N>& left,
-        const nested_array<V, N>& right,
+        const tensor_storage<A, N>& left,
+        const tensor_storage<B, N>& right,
        index_sequence<Is...>) {
-        return {zip_helper<F, V, V, V>::call(fun, left[Is], right[Is])...};
+        return {fun(left[Is], right[Is])...};
    }
 };
-};  // namespace detail
-
-template<typename... Ts>
-using common_vector_value_type = common_t<vector_value_type<Ts>...>;
-
-template<typename... Ts>
-static constexpr size_t common_vector_size = common_size<vector_size<Ts>...>;
+}  // namespace detail
 
-template<typename F, typename Left, typename Right>
-using zip_type = default_storage_type<
-    result_t<F, vector_value_type<Left>, vector_value_type<Right>>,
-    common_vector_size<Left, Right>>;
+template<typename F, typename L, typename R>
+using zip_type =
+    tensor<result_t<F, tensor_value_type<L>, tensor_value_type<R>>, broadcast_tensor_extents<L, R>>;
 
-/**
- * Applies ``fun`` to each pair of two elements from ``left`` and ``right`` and returns a new
- * vector with the results.
- *
- * If ``left`` and ``right`` are not the same size, they will first be broadcast into a
- * common size using ``resize``.
- *
- * Note that this function does **not** cast the input vectors to a common element type. See
- * ``zip_common`` for that functionality.
- */
-template<typename F, typename Left, typename Right, typename Output = zip_type<F, Left, Right>>
-KERNEL_FLOAT_INLINE vector<Output> zip(F fun, Left&& left, Right&& right) {
-    static constexpr size_t N = vector_size<Output>;
-    using LeftInput = default_storage_type<vector_value_type<Left>, N>;
-    using RightInput = default_storage_type<vector_value_type<Right>, N>;
+template<typename F, typename L, typename R>
+KERNEL_FLOAT_INLINE zip_type<F, L, R> zip(F fun, const L& left, const R& right) {
+    using A = tensor_value_type<L>;
+    using B = tensor_value_type<R>;
+    using C = result_t<F, A, B>;
+    using E = broadcast_tensor_extents<L, R>;
 
-    return detail::zip_helper<F, Output, LeftInput, RightInput>::call(
+    return detail::zip_helper<F, C, A, B, E::volume>::call(
         fun,
-        broadcast<LeftInput>(std::forward<Left>(left)),
-        broadcast<RightInput>(std::forward<Right>(right)));
+        broadcast(left, E {}).storage(),
+        broadcast(right, E {}).storage());
 }
 
 template<typename F, typename L, typename R>
-using zip_common_type = default_storage_type<
-    result_t<F, common_vector_value_type<L, R>, common_vector_value_type<L, R>>,
-    common_vector_size<L, R>>;
+using zip_common_type = tensor<
+    result_t<F, tensor_promoted_value_type<L, R>, tensor_promoted_value_type<L, R>>,
+    broadcast_tensor_extents<L, R>>;
 
-/**
- * Applies ``fun`` to each pair of two elements from ``left`` and ``right`` and returns a new
- * vector with the results.
- *
- * If ``left`` and ``right`` are not the same size, they will first be broadcast into a
- * common size using ``resize``.
- *
- * If ``left`` and ``right`` are not of the same type, they will first be cast into a common
- * data type. For example, zipping ``float`` and ``double`` first casts the vectors to ``double``.
- *
- * Example
- * =======
- * ```
- * vec<int, 4> x = {1, 2, 3, 4};
- * vec<int, 1> y = {8};
- * vec<int, 4> z = zip_common([](auto a, auto b){ return a + b; }, x, y); // [9, 10, 11, 12]
- * ```
- */
-template<
-    typename F,
-    typename Left,
-    typename Right,
-    typename Output = zip_common_type<F, Left, Right>>
-KERNEL_FLOAT_INLINE vector<Output> zip_common(F fun, Left&& left, Right&& right) {
-    static constexpr size_t N = vector_size<Output>;
-    using C = common_t<vector_value_type<Left>, vector_value_type<Right>>;
-    using Input = default_storage_type<C, N>;
-
-    return detail::zip_helper<F, Output, Input, Input>::call(
-        fun,
-        broadcast<Input>(std::forward<Left>(left)),
-        broadcast<Input>(std::forward<Right>(right)));
+template<typename F, typename L, typename R>
+KERNEL_FLOAT_INLINE zip_common_type<F, L, R> zip_common(F fun, const L& left, const R& right) {
+    while (1)
+        ;
+    // TODO
 }
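The `while (1)` body above is a placeholder that is presumably filled in later in this 50-patch series. A minimal sketch of what the finished function might look like; the `detail::cast_helper` used here is hypothetical (this commit deletes cast.h and does not yet provide an element-wise conversion for `tensor_storage`):

```cpp
// Sketch only: detail::cast_helper<From, To, N> is an assumed helper that
// converts tensor_storage<From, N> to tensor_storage<To, N> element-wise.
template<typename F, typename L, typename R>
KERNEL_FLOAT_INLINE zip_common_type<F, L, R> zip_common(F fun, const L& left, const R& right) {
    using C = tensor_promoted_value_type<L, R>;  // promoted element type
    using O = result_t<F, C, C>;                 // element type of the result
    using E = broadcast_tensor_extents<L, R>;    // broadcast output shape

    // Broadcast both operands to the common extents, cast both element types
    // to C, then apply `fun` pairwise over the flattened storage.
    return detail::zip_helper<F, O, C, C, E::volume>::call(
        fun,
        detail::cast_helper<tensor_value_type<L>, C, E::volume>::call(
            broadcast(left, E {}).storage()),
        detail::cast_helper<tensor_value_type<R>, C, E::volume>::call(
            broadcast(right, E {}).storage()));
}
```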
 
#define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR)                                              \
    namespace ops {                                                                         \
    template<typename T>                                                                    \
    struct NAME {                                                                           \
        KERNEL_FLOAT_INLINE T operator()(T left, T right) {                                 \
            return T(EXPR);                                                                 \
        }                                                                                   \
    };                                                                                      \
    }                                                                                       \
-    template<typename L, typename R, typename C = common_t<L, R>>                           \
-    KERNEL_FLOAT_INLINE vector<zip_common_type<ops::NAME<C>, L, R>> NAME(L&& left, R&& right) { \
-        return zip_common(ops::NAME<C> {}, std::forward<L>(left), std::forward<R>(right));  \
+    template<typename L, typename R, typename C = tensor_promoted_value_type<L, R>>         \
+    KERNEL_FLOAT_INLINE zip_common_type<ops::NAME<C>, L, R> NAME(L&& left, R&& right) {     \
+        return zip_common(ops::NAME<C> {}, std::forward<L>(left), std::forward<R>(right));  \
    }
 
#define KERNEL_FLOAT_DEFINE_BINARY_OP(NAME, OP)                                             \
    KERNEL_FLOAT_DEFINE_BINARY(NAME, left OP right)                                         \
-    template<typename L, typename R, typename C = common_t<L, R>>                           \
-    KERNEL_FLOAT_INLINE vector<zip_common_type<ops::NAME<C>, L, R>> operator OP(            \
-        const vector<L>& left,                                                              \
-        const vector<R>& right) {                                                           \
-        return zip_common(ops::NAME<C> {}, left, right);                                    \
-    }                                                                                       \
-    template<typename L, typename R, typename C = common_t<L, R>>                           \
-    KERNEL_FLOAT_INLINE vector<zip_common_type<ops::NAME<C>, L, R>> operator OP(            \
-        const vector<L>& left,                                                              \
-        const R& right) {                                                                   \
-        return zip_common(ops::NAME<C> {}, left, right);                                    \
-    }                                                                                       \
-    template<typename L, typename R, typename C = common_t<L, R>>                           \
-    KERNEL_FLOAT_INLINE vector<zip_common_type<ops::NAME<C>, L, R>> operator OP(            \
-        const L& left,                                                                      \
-        const vector<R>& right) {                                                           \
-        return zip_common(ops::NAME<C> {}, left, right);                                    \
+    template<                                                                               \
+        typename L,                                                                         \
+        typename R,                                                                         \
+        typename C = tensor_promoted_value_type<L, R>,                                      \
+        typename E1,                                                                        \
+        typename E2>                                                                        \
+    KERNEL_FLOAT_INLINE zip_common_type<ops::NAME<C>, L, R> operator OP(                    \
+        const tensor<L, E1>& left,                                                          \
+        const tensor<R, E2>& right) {                                                       \
+        return zip_common(ops::NAME<C> {}, left, right);                                    \
+    }                                                                                       \
+    template<typename L, typename R, typename C = tensor_promoted_value_type<L, R>, typename E> \
+    KERNEL_FLOAT_INLINE zip_common_type<ops::NAME<C>, L, R> operator OP(                    \
+        const tensor<L, E>& left,                                                           \
+        const R& right) {                                                                   \
+        return zip_common(ops::NAME<C> {}, left, right);                                    \
+    }                                                                                       \
+    template<typename L, typename R, typename C = tensor_promoted_value_type<L, R>, typename E> \
+    KERNEL_FLOAT_INLINE zip_common_type<ops::NAME<C>, L, R> operator OP(                    \
+        const L& left,                                                                      \
+        const tensor<R, E>& right) {                                                        \
+        return zip_common(ops::NAME<C> {}, left, right);                                    \
    }
 
 KERNEL_FLOAT_DEFINE_BINARY_OP(add, +)
@@ -163,28 +113,29 @@ KERNEL_FLOAT_DEFINE_BINARY_OP(bit_or, |)
 KERNEL_FLOAT_DEFINE_BINARY_OP(bit_xor, ^)
 
 // clang-format off
-template<template<typename> typename F, typename L, typename R>
-static constexpr bool vector_assign_allowed =
-    common_vector_size<L, R> == vector_size<L> &&
-    is_implicit_convertible<
-        result_t<
-            F<common_t<vector_value_type<L>, vector_value_type<R>>>,
-            vector_value_type<L>,
-            vector_value_type<R>
-        >,
-        vector_value_type<L>
-    >;
+template<template<typename> typename F, typename T, typename E, typename R>
+static constexpr bool is_tensor_assign_allowed =
+    is_tensor_broadcastable<R, E> &&
+    is_implicit_convertible<
+        result_t<
+            F<promote_t<T, tensor_value_type<R>>>,
+            T,
+            tensor_value_type<R>
+        >,
+        T
+    >;
 // clang-format on
 
-#define KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(NAME, OP)                                      \
-    template<                                                                               \
-
typename L,                                                                          \
-        typename R,                                                                         \
-        typename T = enabled_t<vector_assign_allowed<ops::NAME, L, R>, vector_value_type<L>>> \
-    KERNEL_FLOAT_INLINE vector<L>& operator OP(vector<L>& lhs, const R& rhs) {              \
-        using F = ops::NAME<T>;                                                             \
-        lhs = zip_common(F {}, lhs.storage(), rhs);                                         \
-        return lhs;                                                                         \
+#define KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(NAME, OP)                                      \
+    template<                                                                               \
+        typename T,                                                                         \
+        typename E,                                                                         \
+        typename R,                                                                         \
+        typename = enabled_t<is_tensor_assign_allowed<ops::NAME, T, E, R>>>                 \
+    KERNEL_FLOAT_INLINE tensor<T, E>& operator OP(tensor<T, E>& lhs, const R& rhs) {        \
+        using F = ops::NAME<T>;                                                             \
+        lhs = zip_common(F {}, lhs, rhs);                                                   \
+        return lhs;                                                                         \
    }
 
 KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(add, +=)
@@ -271,4 +222,4 @@ struct bit_xor {
 
 } // namespace kernel_float
 
-#endif //KERNEL_FLOAT_BINOPS_H
+#endif
\ No newline at end of file
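Taken together, once `zip_common` is actually implemented, the macros above give `tensor` the usual arithmetic surface. A hedged usage sketch (it assumes `tensor`, defined in tensor.h outside this excerpt, is constructible and assignable in the obvious way):

```cpp
#include "kernel_float.h"

namespace kf = kernel_float;

KERNEL_FLOAT_INLINE void example(kf::tensor<float, kf::extents<4>>& x) {
    // From KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(add, +=): the scalar is
    // broadcast to extents<4> and promoted to float before the addition.
    x += 1;

    // From KERNEL_FLOAT_DEFINE_BINARY_OP(multiply, *): elementwise product
    // of two tensors that already share the same extents.
    auto y = x * x;
    (void) y;
}
```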
diff --git a/include/kernel_float/broadcast.h b/include/kernel_float/broadcast.h
new file mode 100644
index 0000000..5bc47b1
--- /dev/null
+++ b/include/kernel_float/broadcast.h
@@ -0,0 +1,237 @@
+#ifndef KERNEL_FLOAT_CAST_H
+#define KERNEL_FLOAT_CAST_H
+
+#include "base.h"
+
+namespace kernel_float {
+namespace detail {
+
+template<size_t N, size_t M>
+struct unify_dimension_helper;
+
+template<>
+struct unify_dimension_helper<1, 1> {
+    static constexpr size_t value = 1;
+};
+
+template<size_t N>
+struct unify_dimension_helper<N, N> {
+    static constexpr size_t value = N;
+};
+
+template<size_t N>
+struct unify_dimension_helper<N, 1> {
+    static constexpr size_t value = N;
+};
+
+template<size_t N>
+struct unify_dimension_helper<1, N> {
+    static constexpr size_t value = N;
+};
+
+template<typename A, typename B>
+struct unify_extents_helper;
+
+template<size_t... Ns, size_t... Ms>
+struct unify_extents_helper<extents<Ns...>, extents<Ms...>> {
+    using type = extents<unify_dimension_helper<Ns, Ms>::value...>;
+};
+
+template<typename E, size_t N, typename = void>
+struct extents_to_rank {
+    using type = E;
+};
+
+template<size_t... Ns, size_t N>
+struct extents_to_rank<extents<Ns...>, N, enabled_t<(sizeof...(Ns) < N)>>:
+    extents_to_rank<extents<1, Ns...>, N> {};
+
+template<typename A, typename B>
+struct broadcast_extents_helper {
+    using type = typename unify_extents_helper<
+        typename extents_to_rank<A, B::rank>::type,  //
+        typename extents_to_rank<B, A::rank>::type   //
+        >::type;
+};
+
+template<typename E>
+struct broadcast_extents_helper<E, E> {
+    using type = E;
+};
+
+}  // namespace detail
+
+template<typename A, typename B>
+using broadcast_extents = typename detail::broadcast_extents_helper<A, B>::type;
+
+template<typename A, typename B>
+using broadcast_tensor_extents = broadcast_extents<tensor_extents<A>, tensor_extents<B>>;
+
+template<typename From, typename To>
+static constexpr bool is_broadcastable = is_same<broadcast_extents<From, To>, To>;
+
+template<typename V, typename To>
+static constexpr bool is_tensor_broadcastable = is_broadcastable<tensor_extents<V>, To>;
+
+namespace detail {
+
+template<typename E, typename IS, typename OS>
+struct copy_helper;
+
+template<typename IS, typename OS>
+struct copy_helper<extents<>, IS, OS> {
+    template<typename T>
+    static void call(T* output, const T* input) {
+        ndindex<0> x;
+        size_t input_index = IS::call(x);
+        size_t output_index = OS::call(x);
+        output[output_index] = input[input_index];
+    }
+};
+
+template<size_t N, typename IS, typename OS>
+struct copy_helper<extents<N>, IS, OS> {
+    template<typename T>
+    static void call(T* output, const T* input) {
+        for (size_t i = 0; i < N; i++) {
+            ndindex<1> x = {i};
+            size_t input_index = IS::call(x);
+            size_t output_index = OS::call(x);
+            output[output_index] = input[input_index];
+        }
+    }
+};
+
+template<size_t N, size_t M, typename IS, typename OS>
+struct copy_helper<extents<N, M>, IS, OS> {
+    template<typename T>
+    static void call(T* output, const T* input) {
+        for (size_t i = 0; i < N; i++) {
+            for (size_t j = 0; j < M; j++) {
+                ndindex<2> x = {i, j};
+                size_t input_index = IS::call(x);
+                size_t output_index = OS::call(x);
+                output[output_index] = input[input_index];
+            }
+        }
+    }
+};
+
+template<size_t N, size_t M, size_t K, typename IS, typename OS>
+struct copy_helper<extents<N, M, K>, IS, OS> {
+    template<typename T>
+    static void call(T* output, const T* input) {
+        for (size_t i = 0; i < N; i++) {
+            for (size_t j = 0; j < M; j++) {
+                for (size_t k = 0; k < K; k++) {
+                    ndindex<3> x = {i, j, k};
+                    size_t input_index = IS::call(x);
+                    size_t output_index = OS::call(x);
+                    output[output_index] = input[input_index];
+                }
+            }
+        }
+    }
+};
+
+template<typename E>
+struct strides_helper;
+
+template<>
+struct strides_helper<extents<>> {
+    KERNEL_FLOAT_INLINE
+    static size_t call(ndindex<0>) {
+        return 0;
+    }
+};
+
+template<size_t N>
+struct strides_helper<extents<N>> {
+    KERNEL_FLOAT_INLINE
+    static size_t call(ndindex<1> x) {
+        return (N != 1 ? x[0] : 0);
+    }
+};
+
+template<size_t N, size_t M>
+struct strides_helper<extents<N, M>> {
+    KERNEL_FLOAT_INLINE
+    static size_t call(ndindex<2> x) {
+        return (N != 1 ? x[0] * M : 0) +  //
+            (M != 1 ? x[1] : 0);
+    }
+};
+
+template<size_t N, size_t M, size_t K>
+struct strides_helper<extents<N, M, K>> {
+    KERNEL_FLOAT_INLINE
+    static size_t call(ndindex<3> x) {
+        return (N != 1 ? x[0] * M * K : 0) +  //
+            (M != 1 ? x[1] * K : 0) +         //
+            (K != 1 ? x[2] : 0);
+    }
+};
+
+template<typename T, typename From, typename To>
+struct broadcast_helper {
+    KERNEL_FLOAT_INLINE static tensor_storage<T, To::volume>
+    call(tensor_storage<T, From::volume> input) {
+        static_assert(is_broadcastable<From, To>, "cannot broadcast to required shape");
+        using IS = strides_helper<typename extents_to_rank<From, To::rank>::type>;
+        using OS = strides_helper<To>;
+
+        tensor_storage<T, To::volume> output;
+        copy_helper<To, IS, OS>::call(output.data(), input.data());
+        return output;
+    }
+};
+
+template<typename T, typename E>
+struct broadcast_helper<T, E, E> {
+    KERNEL_FLOAT_INLINE static tensor_storage<T, E::volume>
+    call(tensor_storage<T, E::volume> input) {
+        return input;
+    }
+};
+
+}  // namespace detail
+
+template<size_t... Ns, typename V>
+tensor<tensor_value_type<V>, extents<Ns...>>
+broadcast(const V& input, extents<Ns...> new_extents = {}) {
+    using T = tensor_value_type<V>;
+    return detail::broadcast_helper<T, tensor_extents<V>, extents<Ns...>>::call(
+        into_tensor(input).storage());
+}
+
+template<typename T, size_t... Ns>
+tensor<T, extents<Ns...>> fill(T value = {}, extents<Ns...> = {}) {
+    tensor_storage<T, 1> input = {value};
+    return detail::broadcast_helper<T, extents<>, extents<Ns...>>::call(input);
+}
+
+template<typename T, size_t... Ns>
+tensor<T, extents<Ns...>> zeros(extents<Ns...> = {}) {
+    tensor_storage<T, 1> input = {T {}};
+    return detail::broadcast_helper<T, extents<>, extents<Ns...>>::call(input);
+}
+
+template<typename T, size_t... Ns>
+tensor<T, extents<Ns...>> ones(extents<Ns...> = {}) {
+    tensor_storage<T, 1> input = {T {1}};
+    return detail::broadcast_helper<T, extents<>, extents<Ns...>>::call(input);
+}
+
+template<typename V, typename T = tensor_value_type<V>, typename E = tensor_extents<V>>
+tensor<T, E> zeros_like(const V&) {
+    return zeros<T>(E {});
+}
+
+template<typename V, typename T = tensor_value_type<V>, typename E = tensor_extents<V>>
+tensor<T, E> ones_like(const V&) {
+    return ones<T>(E {});
+}
+
+} // namespace kernel_float
+
+#endif
\ No newline at end of file
diff --git a/include/kernel_float/cast.h b/include/kernel_float/cast.h
deleted file mode 100644
index f88ebc8..0000000
--- a/include/kernel_float/cast.h
+++ /dev/null
@@ -1,203 +0,0 @@
-#ifndef KERNEL_FLOAT_CAST_H
-#define KERNEL_FLOAT_CAST_H
-
-#include "storage.h"
-
-namespace kernel_float {
-namespace ops {
-template<typename T, typename R>
-struct cast {
-    KERNEL_FLOAT_INLINE R operator()(T input) noexcept {
-        return R(input);
-    }
-};
-
-template<typename T>
-struct cast<T, T> {
-    KERNEL_FLOAT_INLINE T operator()(T input) noexcept {
-        return input;
-    }
-};
-}  // namespace ops
-
-namespace detail {
-
-// Cast a vector of type `Input` to type `Output`. Vectors must have the same size.
-// The input vector has value type `T`
-// The output vector has value type `R`
-template<
-    typename Input,
-    typename Output,
-    typename T = vector_value_type<Input>,
-    typename R = vector_value_type<Output>>
-struct cast_helper {
-    static_assert(vector_size<Input> == vector_size<Output>, "sizes must match");
-    static constexpr size_t N = vector_size<Input>;
-
-    KERNEL_FLOAT_INLINE static Output call(const Input& input) {
-        return call(input, make_index_sequence<N> {});
-    }
-
-  private:
-    template<size_t... Is>
-    KERNEL_FLOAT_INLINE static Output call(const Input& input, index_sequence<Is...>) {
-        ops::cast<T, R> fun;
-        return vector_traits<Output>::create(fun(vector_get<Is>(input))...);
-    }
-};
-
-// Cast a vector of type `Input` to type `Output`.
-// The input vector has value type `T` and size `N`.
-// The output vector has value type `R` and size `M`. -template< - typename Input, - typename Output, - typename T = vector_value_type, - size_t N = vector_size, - typename R = vector_value_type, - size_t M = vector_size> -struct broadcast_helper; - -// T[1] => T[1] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Vector call(Vector input) { - return input; - } -}; - -// T[N] => T[N] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Vector call(Vector input) { - return input; - } -}; - -// T[1] => T[N] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return vector_traits::fill(vector_get<0>(input)); - } -}; - -// T[1] => T[1], but different vector types -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return vector_traits::create(vector_get<0>(input)); - } -}; - -// T[N] => T[N], but different vector types -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return cast_helper::call(input); - } -}; - -// T[1] => R[N] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return vector_traits::fill(ops::cast {}(vector_get<0>(input))); - } -}; - -// T[1] => R[1] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return vector_traits::create(ops::cast {}(vector_get<0>(input))); - } -}; - -// T[N] => R[N] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return cast_helper::call(input); - } -}; -} // namespace detail - -/** - * Cast the elements of the given vector ``input`` to the given type ``R`` and then widen the - * vector to length ``N``. The cast may lead to a loss in precision if ``R`` is a smaller data - * type. Widening is only possible if the input vector has size ``1`` or ``N``, other sizes - * will lead to a compilation error. - * - * Example - * ======= - * ``` - * vec x = {6}; - * vec y = broadcast(x); - * vec z = broadcast(y); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector broadcast(Input&& input) { - return detail::broadcast_helper, Output>::call( - into_storage(std::forward(input))); -} - -#ifndef DOXYGEN_SHOULD_SKIP_THIS -template< - size_t N, - typename Input, - typename Output = default_storage_type, N>> -KERNEL_FLOAT_INLINE vector broadcast(Input&& input) { - return detail::broadcast_helper, Output>::call( - into_storage(std::forward(input))); -} - -template -KERNEL_FLOAT_INLINE vector broadcast(Input&& input) { - return detail::broadcast_helper, Output>::call( - into_storage(std::forward(input))); -} -#endif - -/** - * Widen the given vector ``input`` to length ``N``. Widening is only possible if the input vector - * has size ``1`` or ``N``, other sizes will lead to a compilation error. - * - * Example - * ======= - * ``` - * vec x = {6}; - * vec y = resize<3>(x); - * ``` - */ -template< - size_t N, - typename Input, - typename Output = default_storage_type, N>> -KERNEL_FLOAT_INLINE vector resize(Input&& input) noexcept { - return detail::broadcast_helper::call(std::forward(input)); -} - -template -using cast_type = default_storage_type>; - -/** - * Cast the elements of given vector ``input`` to the given type ``R``. Note that this cast may - * lead to a loss in precision if ``R`` is a smaller data type. 
- *
- * Example
- * =======
- * ```
- * vec<float, 3> x = {1.0f, 2.0f, 3.0f};
- * vec<double, 3> y = cast<double>(x);
- * vec<int, 3> z = cast<int>(x);
- * ```
- */
-template<typename R, typename Input, typename Output = cast_type<R, Input>>
-KERNEL_FLOAT_INLINE vector<Output> cast(Input&& input) noexcept {
-    return detail::broadcast_helper<Input, Output>::call(std::forward<Input>(input));
-}
-} // namespace kernel_float
-
-#endif //KERNEL_FLOAT_CAST_H
diff --git a/include/kernel_float/complex.h b/include/kernel_float/complex.h
new file mode 100644
index 0000000..ace0901
--- /dev/null
+++ b/include/kernel_float/complex.h
@@ -0,0 +1,259 @@
+#ifndef KERNEL_FLOAT_COMPLEX_TYPE_H
+#define KERNEL_FLOAT_COMPLEX_TYPE_H
+
+#include "macros.h"
+
+namespace kernel_float {
+
+template<typename T>
+struct alignas(2 * alignof(T)) complex_storage {
+    T re;
+    T im;
+};
+
+template<typename T>
+struct complex_type: complex_storage<T> {
+    using base_type = complex_storage<T>;
+
+    template<typename T2>
+    KERNEL_FLOAT_INLINE complex_type(complex_type<T2> that) :
+        base_type {that.real(), that.imag()} {}
+
+    KERNEL_FLOAT_INLINE
+    complex_type(T real = {}, T imag = {}) : base_type {real, imag} {}
+
+    KERNEL_FLOAT_INLINE
+    T real() const {
+        return this->re;
+    }
+
+    KERNEL_FLOAT_INLINE
+    T imag() const {
+        return this->im;
+    }
+
+    KERNEL_FLOAT_INLINE
+    T norm() const {
+        return this->re * this->re + this->im * this->im;
+    }
+
+    KERNEL_FLOAT_INLINE
+    complex_type conj() const {
+        return {this->re, -this->im};
+    }
+};
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator+(complex_type<T> v) {
+    return v;
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator+(complex_type<T> a, complex_type<T> b) {
+    return {a.real() + b.real(), a.imag() + b.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator+(T a, complex_type<T> b) {
+    return {a + b.real(), b.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator+(complex_type<T> a, T b) {
+    return {a.real() + b, a.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator+=(complex_type<T>& a, complex_type<T> b) {
+    return (a = a + b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator+=(complex_type<T>& a, T b) {
+    return (a = a + b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator-(complex_type<T> v) {
+    return {-v.real(), -v.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator-(complex_type<T> a, complex_type<T> b) {
+    return {a.real() - b.real(), a.imag() - b.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator-(T a, complex_type<T> b) {
+    return {a - b.real(), -b.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator-(complex_type<T> a, T b) {
+    return {a.real() - b, a.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator-=(complex_type<T>& a, complex_type<T> b) {
+    return (a = a - b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator-=(complex_type<T>& a, T b) {
+    return (a = a - b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator*(complex_type<T> a, complex_type<T> b) {
+    return {a.real() * b.real() - a.imag() * b.imag(), a.real() * b.imag() + a.imag() * b.real()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator*(complex_type<T> a, T b) {
+    return {a.real() * b, a.imag() * b};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator*=(complex_type<T>& a, complex_type<T> b) {
+    return (a = a * b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator*=(complex_type<T>& a, T b) {
+    return (a = a * b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator*(T a, complex_type<T> b) {
+    return {
+        a * b.real(),
+        a * b.imag(),
+    };
+}
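A quick numeric check of the product formula above; `complex_type<float> {re, im}` goes through the two-argument constructor:

```cpp
#include "kernel_float/complex.h"

using kernel_float::complex_type;

KERNEL_FLOAT_INLINE complex_type<float> demo() {
    complex_type<float> a {1.0f, 2.0f};   // 1 + 2i
    complex_type<float> b {3.0f, -1.0f};  // 3 - 1i
    // re: 1*3 - 2*(-1) = 5, im: 1*(-1) + 2*3 = 5, so the result is 5 + 5i.
    return a * b;
}
```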
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator/(complex_type<T> a, complex_type<T> b) {
+    T normi = 1 / b.norm();
+
+    return {
+        (a.real() * b.real() + a.imag() * b.imag()) * normi,
+        (a.imag() * b.real() - a.real() * b.imag()) * normi};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator/(complex_type<T> a, T b) {
+    return {a.real() * (1 / b), a.imag() * (1 / b)};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator/(T a, complex_type<T> b) {
+    T normi = 1 / b.norm();
+
+    return {a * b.real() * normi, -a * b.imag() * normi};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator/=(complex_type<T>& a, complex_type<T> b) {
+    return (a = a / b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator/=(complex_type<T>& a, T b) {
+    return (a = a / b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T real(complex_type<T> v) {
+    return v.real();
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T imag(complex_type<T> v) {
+    return v.imag();
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T abs(complex_type<T> v) {
+    return hypot(v.real(), v.imag());
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T arg(complex_type<T> v) {
+    return atan2(v.imag(), v.real());
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> sqrt(complex_type<T> v) {
+    T radius = abs(v);
+    T cosA = v.real() / radius;
+
+    complex_type<T> out = {
+        sqrt(radius * (cosA + T(1)) * T(.5)),
+        sqrt(radius * (T(1) - cosA) * T(.5))};
+
+    // the imaginary part of the root takes the sign of v.imag()
+    if (v.imag() < 0) {
+        out = complex_type<T> {out.real(), -out.imag()};
+    }
+
+    return out;
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T norm(complex_type<T> v) {
+    return v.real() * v.real() + v.imag() * v.imag();
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> conj(complex_type<T> v) {
+    return {v.real(), -v.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> exp(complex_type<T> v) {
+    // TODO: Handle nan and inf correctly
+    T e = exp(v.real());
+    T a = v.imag();
+    return complex_type<T>(e * cos(a), e * sin(a));
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> log(complex_type<T> v) {
+    return {log(abs(v)), arg(v)};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> pow(complex_type<T> a, T b) {
+    return exp(b * log(a));
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> pow(complex_type<T> a, complex_type<T> b) {
+    return exp(b * log(a));
+}
+
+template<typename L, typename R>
+struct promote_type<complex_type<L>, complex_type<R>> {
+    using type = complex_type<promote_t<L, R>>;
+};
+
+template<typename L, typename R>
+struct promote_type<complex_type<L>, R> {
+    using type = complex_type<promote_t<L, R>>;
+};
+
+template<typename L, typename R>
+struct promote_type<L, complex_type<R>> {
+    using type = complex_type<promote_t<L, R>>;
+};
+
+} // namespace kernel_float
+
+#endif
\ No newline at end of file
diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h
deleted file mode 100644
index d95edce..0000000
--- a/include/kernel_float/fp16.h
+++ /dev/null
@@ -1,188 +0,0 @@
-#ifndef KERNEL_FLOAT_FP16_H
-#define KERNEL_FLOAT_FP16_H
-
-#include "macros.h"
-
-#if KERNEL_FLOAT_FP16_AVAILABLE
-#include <cuda_fp16.h>
-
-#include "interface.h"
-
-namespace kernel_float {
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(__half, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(float, __half)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, __half)
-
-template<>
-struct vector_traits<__half2> {
-    using value_type = __half;
-    static constexpr size_t size = 2;
-
-    KERNEL_FLOAT_INLINE
-    static __half2 fill(__half value) {
-#if KERNEL_FLOAT_ON_DEVICE
-        return __half2half2(value);
-#else
-        return {value, value};
-#endif
-    }
-
-    KERNEL_FLOAT_INLINE
-    static __half2 create(__half low, __half high) {
-#if KERNEL_FLOAT_ON_DEVICE
-        return __halves2half2(low, high);
-#else
-        return {low, high};
-#endif
-    }
-
-    KERNEL_FLOAT_INLINE
-    static __half get(__half2 self, size_t index) {
-#if KERNEL_FLOAT_ON_DEVICE
-        if (index == 0) {
-            return __low2half(self);
-        } else {
-            return __high2half(self);
-        }
-#else
-        if (index == 0) {
-            return self.x;
-        }
else { - return self.y; - } -#endif - } - - KERNEL_FLOAT_INLINE - static void set(__half2& self, size_t index, __half value) { - if (index == 0) { - self.x = value; - } else { - self.y = value; - } - } -}; - -template -struct default_storage<__half, N, Alignment::Maximum, enabled_t<(N >= 2)>> { - using type = nested_array<__half2, N>; -}; - -template -struct default_storage<__half, N, Alignment::Packed, enabled_t<(N >= 2 && N % 2 == 0)>> { - using type = nested_array<__half2, N>; -}; - -#if KERNEL_FLOAT_ON_DEVICE -#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_helper, __half2, __half2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) { \ - return FUN2(input); \ - } \ - }; \ - } - -KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp); -KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); - -#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_helper, __half2, __half2, __half2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \ - return FUN2(left, right); \ - } \ - }; \ - } - -KERNEL_FLOAT_FP16_BINARY_FUN(add, __hadd, __hadd2) -KERNEL_FLOAT_FP16_BINARY_FUN(subtract, __hsub, __hsub2) -KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2) -KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div) -KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2) -KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2) - -KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2) -KERNEL_FLOAT_FP16_BINARY_FUN(not_equal_to, __heq, __heq2) -KERNEL_FLOAT_FP16_BINARY_FUN(less, __hlt, __hlt2) -KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2) -KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2) -KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hgt2) - -#endif - -#define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \ - namespace ops { \ - template<> \ - struct cast { \ - KERNEL_FLOAT_INLINE __half operator()(T input) { \ - return TO_HALF; \ - } \ - }; \ - template<> \ - struct cast<__half, T> { \ - KERNEL_FLOAT_INLINE T operator()(__half input) { \ - return FROM_HALF; \ - } \ - }; \ - } - -KERNEL_FLOAT_FP16_CAST(double, __double2half(input), double(__half2float(input))); -KERNEL_FLOAT_FP16_CAST(float, __float2half(input), __half2float(input)); - -// there are no official char casts. 
Instead, cast to int and then to char -KERNEL_FLOAT_FP16_CAST(char, __int2half_rn(input), (char)__half2int_rz(input)); -KERNEL_FLOAT_FP16_CAST(signed char, __int2half_rn(input), (signed char)__half2int_rz(input)); -KERNEL_FLOAT_FP16_CAST(unsigned char, __int2half_rn(input), (unsigned char)__half2int_rz(input)); - -KERNEL_FLOAT_FP16_CAST(signed short, __short2half_rn(input), __half2short_rz(input)); -KERNEL_FLOAT_FP16_CAST(signed int, __int2half_rn(input), __half2int_rz(input)); -KERNEL_FLOAT_FP16_CAST(signed long, __ll2half_rn(input), (signed long)(__half2ll_rz(input))); -KERNEL_FLOAT_FP16_CAST(signed long long, __ll2half_rn(input), __half2ll_rz(input)); - -KERNEL_FLOAT_FP16_CAST(unsigned int, __uint2half_rn(input), __half2uint_rz(input)); -KERNEL_FLOAT_FP16_CAST(unsigned short, __ushort2half_rn(input), __half2ushort_rz(input)); -KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__half2ull_rz(input))); -KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); - -using half = __half; -using float16 = __half; -//KERNEL_FLOAT_TYPE_ALIAS(half, __half) -//KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) -//KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) - -} // namespace kernel_float - -#endif - -#endif //KERNEL_FLOAT_FP16_H diff --git a/include/kernel_float/fp8.h b/include/kernel_float/fp8.h deleted file mode 100644 index e69de29..0000000 diff --git a/include/kernel_float/interface.h b/include/kernel_float/interface.h deleted file mode 100644 index d191b65..0000000 --- a/include/kernel_float/interface.h +++ /dev/null @@ -1,294 +0,0 @@ -#ifndef KERNEL_FLOAT_INTERFACE_H -#define KERNEL_FLOAT_INTERFACE_H - -#include "binops.h" -#include "iterate.h" -#include "reduce.h" -#include "storage.h" -#include "swizzle.h" -#include "unops.h" - -namespace kernel_float { - -template -KERNEL_FLOAT_INLINE vector broadcast(Input&& input); - -template -struct index_proxy { - using value_type = typename vector_traits::value_type; - - KERNEL_FLOAT_INLINE - index_proxy(V& storage, I index) : storage_(storage), index_(index) {} - - KERNEL_FLOAT_INLINE - index_proxy& operator=(value_type value) { - vector_traits::set(storage_, index_, value); - return *this; - } - - KERNEL_FLOAT_INLINE - operator value_type() const { - return vector_traits::get(storage_, index_); - } - - private: - V& storage_; - I index_; -}; - -template -struct index_proxy> { - using value_type = typename vector_traits::value_type; - - KERNEL_FLOAT_INLINE - index_proxy(V& storage, const_index) : storage_(storage) {} - - KERNEL_FLOAT_INLINE - index_proxy& operator=(value_type value) { - vector_index::set(storage_, value); - return *this; - } - - KERNEL_FLOAT_INLINE - operator value_type() const { - return vector_index::get(storage_); - } - - private: - V& storage_; -}; - -template -struct vector { - using storage_type = V; - using traits_type = vector_traits; - using value_type = typename traits_type::value_type; - static constexpr size_t const_size = traits_type::size; - - vector(const vector&) = default; - vector(vector&) = default; - vector(vector&&) = default; - - vector& operator=(const vector&) = default; - vector& operator=(vector&) = default; - vector& operator=(vector&&) = default; - - KERNEL_FLOAT_INLINE - vector() : storage_(traits_type::fill(value_type {})) {} - - KERNEL_FLOAT_INLINE - vector(storage_type storage) : storage_(storage) {} - - template< - typename U, - enabled_t, value_type>, int> = 0> - KERNEL_FLOAT_INLINE vector(U&& init) : vector(broadcast(std::forward(init))) {} - - template = 0> 
- KERNEL_FLOAT_INLINE vector(Args&&... args) : storage_(traits_type::create(args...)) {} - - KERNEL_FLOAT_INLINE - operator storage_type() const { - return storage_; - } - - KERNEL_FLOAT_INLINE - storage_type& storage() { - return storage_; - } - - KERNEL_FLOAT_INLINE - const storage_type& storage() const { - return storage_; - } - - KERNEL_FLOAT_INLINE - value_type get(size_t index) const { - return traits_type::get(storage_, index); - } - - KERNEL_FLOAT_INLINE - void set(size_t index, value_type value) { - traits_type::set(storage_, index, value); - } - - template - KERNEL_FLOAT_INLINE value_type get(const_index) const { - return vector_index::get(storage_); - } - - template - KERNEL_FLOAT_INLINE void set(const_index, value_type value) { - return vector_index::set(storage_, value); - } - - KERNEL_FLOAT_INLINE - value_type operator[](size_t index) const { - return get(index); - } - - template - KERNEL_FLOAT_INLINE value_type operator[](const_index) const { - return get(const_index {}); - } - - KERNEL_FLOAT_INLINE - index_proxy operator[](size_t index) { - return {storage_, index}; - } - - template - KERNEL_FLOAT_INLINE index_proxy> operator[](const_index) { - return {storage_, const_index {}}; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t size() { - return const_size; - } - - private: - storage_type storage_; -}; - -template -struct vector_traits> { - using value_type = vector_value_type; - static constexpr size_t size = vector_size; - - KERNEL_FLOAT_INLINE - static vector fill(value_type value) { - return vector_traits::fill(value); - } - - template - KERNEL_FLOAT_INLINE static vector create(Args... args) { - return vector_traits::create(args...); - } - - KERNEL_FLOAT_INLINE - static value_type get(const vector& self, size_t index) { - return vector_traits::get(self.storage(), index); - } - - KERNEL_FLOAT_INLINE - static void set(vector& self, size_t index, value_type value) { - vector_traits::set(self.storage(), index, value); - } -}; - -template -struct vector_index, I> { - using value_type = vector_value_type; - - KERNEL_FLOAT_INLINE - static value_type get(const vector& self) { - return vector_index::get(self.storage()); - } - - KERNEL_FLOAT_INLINE - static void set(vector& self, value_type value) { - vector_index::set(self.storage(), value); - } -}; - -template -struct into_storage_traits> { - using type = V; - - KERNEL_FLOAT_INLINE - static constexpr type call(const vector& self) { - return self.storage(); - } -}; - -template -struct vector_swizzle, index_sequence> { - KERNEL_FLOAT_INLINE static Output call(const vector& self) { - return vector_swizzle>::call(self.storage()); - } -}; - -template -using vec = vector>; - -template -using unaligned_vec = vector>; - -template -KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... 
args) { - using value_type = common_t; - using vector_type = default_storage_type; - return vector_traits::create(value_type(args)...); -} - -template -KERNEL_FLOAT_INLINE vector> into_vec(V&& input) { - return into_storage(input); -} - -using float32 = float; -using float64 = double; - -template -using vec1 = vec; -template -using vec2 = vec; -template -using vec3 = vec; -template -using vec4 = vec; -template -using vec5 = vec; -template -using vec6 = vec; -template -using vec7 = vec; -template -using vec8 = vec; - -#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ - template \ - using NAME##N = vec; \ - using NAME##1 = vec; \ - using NAME##2 = vec; \ - using NAME##3 = vec; \ - using NAME##4 = vec; \ - using NAME##5 = vec; \ - using NAME##6 = vec; \ - using NAME##7 = vec; \ - using NAME##8 = vec; \ - template \ - using unaligned_##NAME##X = unaligned_vec; \ - using unaligned_##NAME##1 = unaligned_vec; \ - using unaligned_##NAME##2 = unaligned_vec; \ - using unaligned_##NAME##3 = unaligned_vec; \ - using unaligned_##NAME##4 = unaligned_vec; \ - using unaligned_##NAME##5 = unaligned_vec; \ - using unaligned_##NAME##6 = unaligned_vec; \ - using unaligned_##NAME##7 = unaligned_vec; \ - using unaligned_##NAME##8 = unaligned_vec; - -KERNEL_FLOAT_TYPE_ALIAS(char, char) -KERNEL_FLOAT_TYPE_ALIAS(short, short) -KERNEL_FLOAT_TYPE_ALIAS(int, int) -KERNEL_FLOAT_TYPE_ALIAS(long, long) -KERNEL_FLOAT_TYPE_ALIAS(longlong, long long) - -KERNEL_FLOAT_TYPE_ALIAS(uchar, unsigned char) -KERNEL_FLOAT_TYPE_ALIAS(ushort, unsigned short) -KERNEL_FLOAT_TYPE_ALIAS(uint, unsigned int) -KERNEL_FLOAT_TYPE_ALIAS(ulong, unsigned long) -KERNEL_FLOAT_TYPE_ALIAS(ulonglong, unsigned long long) - -KERNEL_FLOAT_TYPE_ALIAS(float, float) -KERNEL_FLOAT_TYPE_ALIAS(f32x, float) -KERNEL_FLOAT_TYPE_ALIAS(float32x, float) - -KERNEL_FLOAT_TYPE_ALIAS(double, double) -KERNEL_FLOAT_TYPE_ALIAS(f64x, double) -KERNEL_FLOAT_TYPE_ALIAS(float64x, double) - -} // namespace kernel_float - -#endif //KERNEL_FLOAT_INTERFACE_H diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h deleted file mode 100644 index 2b98194..0000000 --- a/include/kernel_float/iterate.h +++ /dev/null @@ -1,175 +0,0 @@ -#ifndef KERNEL_FLOAT_ITERATE_H -#define KERNEL_FLOAT_ITERATE_H - -#include "storage.h" -#include "unops.h" - -namespace kernel_float { - -namespace detail { -template>> -struct range_helper; - -template -struct range_helper> { - KERNEL_FLOAT_INLINE static V call(F fun) { - return vector_traits::create(fun(const_index {})...); - } -}; -} // namespace detail - -/** - * Generate vector of length ``N`` by applying the given function ``fun`` to - * each index ``0...N-1``. - * - * Example - * ======= - * ``` - * // returns [0, 2, 4] - * vector vec = range<3>([](auto i) { return float(i * 2); }); - * ``` - */ -template< - size_t N, - typename F, - typename T = result_t, - typename Output = default_storage_type> -KERNEL_FLOAT_INLINE vector range(F fun) { - return detail::range_helper::call(fun); -} - -/** - * Generate vector consisting of the numbers ``0...N-1`` of type ``T``. - * - * Example - * ======= - * ``` - * // Returns [0, 1, 2] - * vector vec = range(); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector range() { - using F = ops::cast; - return detail::range_helper::call(F {}); -} - -/** - * Generate vector having same size and type as ``V``, but filled with the numbers ``0..N-1``. 
- */ -template> -KERNEL_FLOAT_INLINE vector range_like(const Input&) { - using F = ops::cast>; - return detail::range_helper::call(F {}); -} - -/** - * Generate vector of `N` elements of type `T` - * - * Example - * ======= - * ``` - * // Returns [1.0, 1.0, 1.0] - * vector = fill(1.0f); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector fill(T value) { - return vector_traits::fill(value); -} - -/** - * Generate vector having same size and type as ``V``, but filled with the given ``value``. - */ -template -KERNEL_FLOAT_INLINE vector fill_like(const Output&, vector_value_type value) { - return vector_traits::fill(value); -} - -/** - * Generate vector of ``N`` zeros of type ``T`` - * - * Example - * ======= - * ``` - * // Returns [0.0, 0.0, 0.0] - * vector = zeros(); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector zeros() { - return vector_traits::fill(T(0)); -} - -/** - * Generate vector having same size and type as ``V``, but filled with zeros. - * - */ -template -KERNEL_FLOAT_INLINE vector zeros_like(const Output& output = {}) { - return vector_traits::fill(0); -} - -/** - * Generate vector of ``N`` ones of type ``T`` - * - * Example - * ======= - * ``` - * // Returns [1.0, 1.0, 1.0] - * vector = ones(); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector ones() { - return vector_traits::fill(T(1)); -} - -/** - * Generate vector having same size and type as ``V``, but filled with ones. - * - */ -template -KERNEL_FLOAT_INLINE vector ones_like(const Output& output = {}) { - return vector_traits::fill(1); -} - -namespace detail { -template>> -struct iterate_helper; - -template -struct iterate_helper> { - KERNEL_FLOAT_INLINE - static void call(F fun, const V& input) {} -}; - -template -struct iterate_helper> { - KERNEL_FLOAT_INLINE - static void call(F fun, const V& input) { - fun(vector_get(input)); - iterate_helper>::call(fun, input); - } -}; -} // namespace detail - -/** - * Apply the function ``fun`` for each element from ``input``. 
- *
- * Example
- * =======
- * ```
- * for_each(range<3>(), [&](auto i) {
- *   printf("element: %d\n", i);
- * });
- * ```
- */
-template<typename V, typename F>
-KERNEL_FLOAT_INLINE void for_each(const V& input, F fun) {
-    detail::iterate_helper<F, into_storage_type<V>>::call(fun, into_storage(input));
}
-
-} // namespace kernel_float
-
-#endif //KERNEL_FLOAT_ITERATE_H
diff --git a/include/kernel_float/macros.h b/include/kernel_float/macros.h
index 761360e..ab70d2a 100644
--- a/include/kernel_float/macros.h
+++ b/include/kernel_float/macros.h
@@ -6,20 +6,20 @@
 
 #ifdef __CUDA_ARCH__
 #define KERNEL_FLOAT_INLINE __forceinline__ __device__
-#define KERNEL_FLOAT_ON_DEVICE (1)
-#define KERNEL_FLOAT_ON_HOST (0)
+#define KERNEL_FLOAT_IS_DEVICE (1)
+#define KERNEL_FLOAT_IS_HOST (0)
 #define KERNEL_FLOAT_CUDA_ARCH (__CUDA_ARCH__)
 #else
 #define KERNEL_FLOAT_INLINE __forceinline__ __host__
-#define KERNEL_FLOAT_ON_DEVICE (0)
-#define KERNEL_FLOAT_ON_HOST (1)
+#define KERNEL_FLOAT_IS_DEVICE (0)
+#define KERNEL_FLOAT_IS_HOST (1)
 #define KERNEL_FLOAT_CUDA_ARCH (0)
 #endif
 #else
 #define KERNEL_FLOAT_INLINE inline
 #define KERNEL_FLOAT_CUDA (0)
-#define KERNEL_FLOAT_ON_HOST (1)
-#define KERNEL_FLOAT_ON_DEVICE (0)
+#define KERNEL_FLOAT_IS_HOST (1)
+#define KERNEL_FLOAT_IS_DEVICE (0)
 #define KERNEL_FLOAT_CUDA_ARCH (0)
 #endif
 
diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h
index e8bba21..e6cb31e 100644
--- a/include/kernel_float/meta.h
+++ b/include/kernel_float/meta.h
@@ -5,15 +5,6 @@
 
 namespace kernel_float {
 
-template<size_t I>
-struct const_index {
-    static constexpr size_t value = I;
-
-    KERNEL_FLOAT_INLINE constexpr operator size_t() const noexcept {
-        return I;
-    }
-};
-
 template<size_t... Is>
 struct index_sequence {
     static constexpr size_t size = sizeof...(Is);
@@ -85,130 +76,126 @@ struct decay_helper {
 template<typename T>
 using decay_t = typename detail::decay_helper<T>::type;
 
 template<typename T, typename U>
-struct common_type;
+struct promote_type;
 
 template<typename T>
-struct common_type<T, T> {
+struct promote_type<T, T> {
     using type = T;
 };
 
-#define KERNEL_FLOAT_DEFINE_COMMON_TYPE(T, U) \
-    template<>                                \
-    struct common_type<T, U> {                \
-        using type = T;                       \
-    };                                        \
-    template<>                                \
-    struct common_type<U, T> {                \
-        using type = T;                       \
+#define KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, U) \
+    template<>                                  \
+    struct promote_type<T, U> {                 \
+        using type = T;                         \
+    };                                          \
+    template<>                                  \
+    struct promote_type<U, T> {                 \
+        using type = T;                         \
    };
 
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(long double, double)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(long double, float)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, float)
-//KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, half)
-//KERNEL_FLOAT_DEFINE_COMMON_TYPE(float, half)
-
-#define KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(T, U)            \
-    KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed T, signed U)      \
-    KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned T, unsigned U)
-
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long long, long)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long long, int)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long long, short)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long long, char)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long, int)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long, short)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long, char)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(int, short)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(int, char)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(short, char)
-
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(long double, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(float, bool)
-
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed long long, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed long, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed int, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed short, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed char, bool)
-
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned long long, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned long, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned int, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned short, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned char, bool)
+// promoting T and bool gives T
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(char, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed char, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed short, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed int, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed long, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed long long, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned char, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned short, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned int, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned long, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned long long, bool)
+
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, float)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(long double, float)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(long double, double)
+
+#define KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(T) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, char) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed char) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed short) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed int) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed long) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed long long) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned char) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned short) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned int) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned long) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned long long) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, bool)
+
+KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(float)
+KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(double)
+KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(long double)
+
+#define KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(T, U) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed T, signed U) \
+    KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned T, unsigned U)
+
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(short, char)
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(int, char)
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(int, short)
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long, char)
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long, short)
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long, int)
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, char)
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, short)
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, int)
+KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, long)
+
+// half precision
+// KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(half)
+// KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(half, bool)
+// KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, half)
+// KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, half)
+// KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(long double, half)
 
 namespace detail {
 template<typename... Ts>
-struct common_type_helper;
+struct multi_promote_type;
 
 template<typename T>
-struct common_type_helper<T> {
+struct multi_promote_type<T> {
     using type = T;
 };
 
-template<typename T, typename U>
-struct common_type_helper<T, U> {
-    using type = typename common_type<T, U>::type;
-};
+template<typename T, typename U>
+struct multi_promote_type<T, U>: promote_type<T, U> {};
+
+template<typename T, typename U, typename C, typename... Rest>
+struct multi_promote_type<T, U, C, Rest...>:
+    multi_promote_type<typename promote_type<T, U>::type, C, Rest...> {};
 
-template<typename L, typename R, typename... Rest>
-struct common_type_helper<L, R, Rest...>:
-    common_type_helper<typename common_type<L, R>::type, R, Rest...> {};
 } // namespace detail
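+// For illustration (hypothetical uses, assuming only the pairwise rules
+// above): the alias promote_t introduced just below folds promote_type
+// over its arguments left-to-right, so one would expect, e.g.:
+//
+//   promote_t<int, float>         -> float
+//   promote_t<bool, int>          -> int
+//   promote_t<short, int, double> -> double   (two pairwise steps)
+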
template -using common_t = typename detail::common_type_helper...>::type; +using promote_t = typename detail::multi_promote_type::type; namespace detail { -template -struct common_size_helper; - -template<> -struct common_size_helper<> { - static constexpr size_t value = 1; -}; - -template -struct common_size_helper { - static constexpr size_t value = N; -}; -template -struct common_size_helper { - static constexpr size_t value = N; -}; - -template -struct common_size_helper { - static constexpr size_t value = N; -}; - -template -struct common_size_helper<1, N> { - static constexpr size_t value = N; +template +struct is_same_helper { + static constexpr bool value = false; }; -template<> -struct common_size_helper<1, 1> { - static constexpr size_t value = 1; +template +struct is_same_helper { + static constexpr bool value = true; }; } // namespace detail -template -static constexpr size_t common_size = detail::common_size_helper::value; +template +static constexpr bool is_same = detail::is_same_helper::value; namespace detail { - template struct is_implicit_convertible_helper { static constexpr bool value = false; }; template -struct is_implicit_convertible_helper::type> { +struct is_implicit_convertible_helper::type> { static constexpr bool value = true; }; } // namespace detail diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index f3bc520..ddfabef 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -5,29 +5,22 @@ namespace kernel_float { namespace detail { -template +template struct reduce_helper { - using value_type = vector_value_type; - - KERNEL_FLOAT_INLINE static value_type call(F fun, const V& input) { - return call(fun, input, make_index_sequence> {}); + KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage& input) { + return call(fun, input, make_index_sequence {}); } private: template - KERNEL_FLOAT_INLINE static value_type call(F fun, const V& vector, index_sequence<0, Is...>) { - return call(fun, vector, vector_get<0>(vector), index_sequence {}); - } - - template - KERNEL_FLOAT_INLINE static value_type - call(F fun, const V& vector, value_type accum, index_sequence) { - return call(fun, vector, fun(accum, vector_get(vector)), index_sequence {}); - } - - KERNEL_FLOAT_INLINE static value_type - call(F fun, const V& vector, value_type accum, index_sequence<>) { - return accum; + KERNEL_FLOAT_INLINE static T + call(F fun, const tensor_storage& input, index_sequence<0, Is...>) { + T result = input[0]; +#pragma unroll + for (size_t i = 1; i < N; i++) { + result = fun(result, input[i]); + } + return result; } }; } // namespace detail @@ -47,8 +40,10 @@ struct reduce_helper { * ``` */ template -KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { - return detail::reduce_helper>::call(fun, into_storage(input)); +KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { + return detail::reduce_helper, tensor_value_type>::call( + fun, + into_tensor(input)); } /** @@ -61,7 +56,7 @@ KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { * int y = min(x); // Returns 0 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T min(const V& input) { return reduce(ops::min {}, input); } @@ -76,7 +71,7 @@ KERNEL_FLOAT_INLINE T min(const V& input) { * int y = max(x); // Returns 5 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T max(const V& input) { return reduce(ops::max {}, input); } @@ -91,7 +86,7 @@ KERNEL_FLOAT_INLINE T max(const V& input) { * int y = sum(x); // Returns 8 * ``` */ 
-template> +template> KERNEL_FLOAT_INLINE T sum(const V& input) { return reduce(ops::add {}, input); } @@ -106,7 +101,7 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { * int y = sum(x); // Returns 5*0*2*1*0 = 0 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T product(const V& input) { return reduce(ops::multiply {}, input); } @@ -116,7 +111,7 @@ KERNEL_FLOAT_INLINE T product(const V& input) { * non-zero if ``bool(v)==true``. */ template -KERNEL_FLOAT_INLINE bool all(V&& input) { +KERNEL_FLOAT_INLINE bool all(const V& input) { return reduce(ops::bit_and {}, cast(input)); } @@ -125,7 +120,7 @@ KERNEL_FLOAT_INLINE bool all(V&& input) { * non-zero if ``bool(v)==true``. */ template -KERNEL_FLOAT_INLINE bool any(V&& input) { +KERNEL_FLOAT_INLINE bool any(const V& input) { return reduce(ops::bit_or {}, cast(input)); } @@ -141,7 +136,7 @@ KERNEL_FLOAT_INLINE bool any(V&& input) { * ``` */ template -KERNEL_FLOAT_INLINE int count(V&& input) { +KERNEL_FLOAT_INLINE int count(const V& input) { return sum(cast(cast(input))); } } // namespace kernel_float diff --git a/include/kernel_float/storage.h b/include/kernel_float/storage.h deleted file mode 100644 index 4d28339..0000000 --- a/include/kernel_float/storage.h +++ /dev/null @@ -1,503 +0,0 @@ -#ifndef KERNEL_FLOAT_STORAGE -#define KERNEL_FLOAT_STORAGE - -#include "meta.h" - -namespace kernel_float { - -template -struct vector_traits { - using value_type = V; - static constexpr size_t size = 1; - - KERNEL_FLOAT_INLINE - static V fill(value_type value) { - return value; - } - - KERNEL_FLOAT_INLINE - static V create(value_type value) { - return value; - } - - KERNEL_FLOAT_INLINE - static value_type get(const V& self, size_t index) { - KERNEL_FLOAT_ASSERT(index == 0); - return self; - } - - KERNEL_FLOAT_INLINE - static void set(V& self, size_t index, value_type value) { - KERNEL_FLOAT_ASSERT(index == 0); - self = value; - } -}; - -template -struct into_storage_traits { - using type = V; - - KERNEL_FLOAT_INLINE - static constexpr type call(V self) { - return self; - } -}; - -template -struct into_storage_traits: into_storage_traits {}; - -template -struct into_storage_traits: into_storage_traits {}; - -template -struct into_storage_traits: into_storage_traits {}; - -template -using into_storage_type = typename into_storage_traits::type; - -template -KERNEL_FLOAT_INLINE into_storage_type into_storage(V&& input) { - return into_storage_traits::call(input); -} - -template -static constexpr size_t vector_size = vector_traits>::size; - -template -using vector_value_type = typename vector_traits>::value_type; - -template -struct vector_index { - using value_type = vector_value_type; - - KERNEL_FLOAT_INLINE - static value_type get(const V& self) { - return vector_traits::get(self, I); - } - - KERNEL_FLOAT_INLINE - static void set(V& self, value_type value) { - return vector_traits::set(self, I, value); - } -}; - -template -KERNEL_FLOAT_INLINE vector_value_type vector_get(const V& self, size_t index) { - return vector_traits::get(self, index); -} - -template -KERNEL_FLOAT_INLINE vector_value_type vector_get(const V& self, const_index = {}) { - return vector_index::get(self); -} - -template -struct vector_swizzle; - -template -struct vector_swizzle> { - KERNEL_FLOAT_INLINE static Output call(const Input& storage) { - return vector_traits::create(vector_get(storage)...); - } -}; - -template -struct vector; - -template -struct alignas(alignment) array { - T items_[N]; - - KERNEL_FLOAT_INLINE - T& operator[](size_t i) { - KERNEL_FLOAT_ASSERT(i < N); - return 
items_[i]; - } - - KERNEL_FLOAT_INLINE - const T& operator[](size_t i) const { - KERNEL_FLOAT_ASSERT(i < N); - return items_[i]; - } -}; - -template -struct vector_traits> { - using self_type = array; - using value_type = T; - static constexpr size_t size = N; - - template - KERNEL_FLOAT_INLINE static self_type create(Args&&... args) { - return {args...}; - } - - KERNEL_FLOAT_INLINE - static self_type fill(value_type value) { - self_type result; - for (size_t i = 0; i < N; i++) { - result[i] = value; - } - return result; - } - - KERNEL_FLOAT_INLINE - static value_type get(const self_type& self, size_t index) { - KERNEL_FLOAT_ASSERT(index < N); - return self[index]; - } - - KERNEL_FLOAT_INLINE - static void set(self_type& self, size_t index, value_type value) { - KERNEL_FLOAT_ASSERT(index < N); - self[index] = value; - } -}; - -template -struct array {}; - -template -struct vector_traits> { - using self_type = array; - using value_type = T; - static constexpr size_t size = 0; - - KERNEL_FLOAT_INLINE - static self_type create() { - return {}; - } - - KERNEL_FLOAT_INLINE - static self_type fill(value_type value) { - return {}; - } - - KERNEL_FLOAT_INLINE - static value_type get(const self_type& self, size_t index) { - KERNEL_FLOAT_UNREACHABLE; - } - - KERNEL_FLOAT_INLINE - static void set(self_type& self, size_t index, value_type value) { - KERNEL_FLOAT_UNREACHABLE; - } -}; - -enum struct Alignment { - Minimum, - Packed, - Maximum, -}; - -constexpr size_t calculate_alignment(Alignment required, size_t min_alignment, size_t total_size) { - size_t alignment = 1; - - if (required == Alignment::Maximum) { - if (total_size <= 1) { - alignment = 1; - } else if (total_size <= 2) { - alignment = 2; - } else if (total_size <= 4) { - alignment = 4; - } else if (total_size <= 8) { - alignment = 8; - } else { - alignment = 16; - } - } else if (required == Alignment::Packed) { - if (total_size % 16 == 0) { - alignment = 16; - } else if (total_size % 8 == 0) { - alignment = 8; - } else if (total_size % 4 == 0) { - alignment = 4; - } else if (total_size % 2 == 0) { - alignment = 2; - } else { - alignment = 1; - } - } - - if (min_alignment > alignment) { - alignment = min_alignment; - } - - return alignment; -} - -template -struct default_storage { - using type = array; -}; - -template -struct default_storage { - using type = T; -}; - -template -using default_storage_type = typename default_storage::type; - -#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ - template<> \ - struct vector_traits { \ - using value_type = T; \ - static constexpr size_t size = 1; \ - \ - KERNEL_FLOAT_INLINE \ - static T1 create(T x) { \ - return {x}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T1 fill(T v) { \ - return {v}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T get(const T1& self, size_t index) { \ - switch (index) { \ - case 0: \ - return self.x; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static void set(T1& self, size_t index, T value) { \ - switch (index) { \ - case 0: \ - self.x = value; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - }; \ - \ - template<> \ - struct vector_traits { \ - using value_type = T; \ - static constexpr size_t size = 2; \ - \ - KERNEL_FLOAT_INLINE \ - static T2 create(T x, T y) { \ - return {x, y}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T2 fill(T v) { \ - return {v, v}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T get(const T2& self, size_t index) { \ - switch (index) { \ - case 0: \ - return self.x; \ - case 1: \ - 
return self.y; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static void set(T2& self, size_t index, T value) { \ - switch (index) { \ - case 0: \ - self.x = value; \ - case 1: \ - self.y = value; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - }; \ - \ - template<> \ - struct vector_traits { \ - using value_type = T; \ - static constexpr size_t size = 3; \ - \ - KERNEL_FLOAT_INLINE \ - static T3 create(T x, T y, T z) { \ - return {x, y, z}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T3 fill(T v) { \ - return {v, v, v}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T get(const T3& self, size_t index) { \ - switch (index) { \ - case 0: \ - return self.x; \ - case 1: \ - return self.y; \ - case 2: \ - return self.z; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static void set(T3& self, size_t index, T value) { \ - switch (index) { \ - case 0: \ - self.x = value; \ - return; \ - case 1: \ - self.y = value; \ - return; \ - case 2: \ - self.z = value; \ - return; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - }; \ - \ - template<> \ - struct vector_traits { \ - using value_type = T; \ - static constexpr size_t size = 4; \ - \ - KERNEL_FLOAT_INLINE \ - static T4 create(T x, T y, T z, T w) { \ - return {x, y, z, w}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T4 fill(T v) { \ - return {v, v, v, v}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T get(const T4& self, size_t index) { \ - switch (index) { \ - case 0: \ - return self.x; \ - case 1: \ - return self.y; \ - case 2: \ - return self.z; \ - case 3: \ - return self.w; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static void set(T4& self, size_t index, T value) { \ - switch (index) { \ - case 0: \ - self.x = value; \ - return; \ - case 1: \ - self.y = value; \ - return; \ - case 2: \ - self.z = value; \ - return; \ - case 3: \ - self.w = value; \ - return; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - }; - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) - -template -struct nested_array { - static constexpr size_t num_packets = (N + vector_size - 1) / vector_size; - static_assert(num_packets * vector_size >= N, "internal error"); - - V packets[num_packets]; - - KERNEL_FLOAT_INLINE - V& operator[](size_t i) { - KERNEL_FLOAT_ASSERT(i < num_packets); - return packets[i]; - } - - KERNEL_FLOAT_INLINE - const V& operator[](size_t i) const { - KERNEL_FLOAT_ASSERT(i < num_packets); - return packets[i]; - } -}; - -template -struct vector_traits> { - using self_type = nested_array; - using value_type = vector_value_type; - static 
constexpr size_t size = N; - - template - KERNEL_FLOAT_INLINE static self_type create(Args&&... args) { - value_type items[N] = {args...}; - self_type output; - - size_t i = 0; - for (; i + vector_size - 1 < N; i += vector_size) { - // How to generalize this? - output.packets[i / vector_size] = vector_traits::create(items[i], items[i + 1]); - } - - for (; i < N; i++) { - vector_traits::set(output.packets[i / vector_size], i % vector_size, items[i]); - } - - return output; - } - - KERNEL_FLOAT_INLINE - static self_type fill(value_type value) { - self_type output; - - for (size_t i = 0; i < self_type::num_packets; i++) { - output.packets[i] = vector_traits::fill(value); - } - - return output; - } - - KERNEL_FLOAT_INLINE - static value_type get(const self_type& self, size_t index) { - KERNEL_FLOAT_ASSERT(index < N); - return vector_traits::get(self.packets[index / vector_size], index % vector_size); - } - - KERNEL_FLOAT_INLINE - static void set(self_type& self, size_t index, value_type value) { - KERNEL_FLOAT_ASSERT(index < N); - vector_traits::set(self.packets[index / vector_size], index % vector_size, value); - } -}; - -}; // namespace kernel_float - -#endif \ No newline at end of file diff --git a/include/kernel_float/swizzle.h b/include/kernel_float/swizzle.h deleted file mode 100644 index 50a023a..0000000 --- a/include/kernel_float/swizzle.h +++ /dev/null @@ -1,218 +0,0 @@ -#ifndef KERNEL_FLOAT_SWIZZLE_H -#define KERNEL_FLOAT_SWIZZLE_H - -#include "storage.h" - -namespace kernel_float { - -/** - * "Swizzles" the vector. Returns a new vector where the elements are provided by the given indices. - * - * # Example - * ``` - * vec x = {0, 1, 2, 3, 4, 5, 6}; - * vec a = swizzle<0, 1, 2>(x); // 0, 1, 2 - * vec b = swizzle<2, 1, 0>(x); // 2, 1, 0 - * vec c = swizzle<1, 1, 1>(x); // 1, 1, 1 - * vec d = swizzle<0, 2, 4, 6>(x); // 0, 2, 4, 6 - * ``` - */ -template< - size_t... Is, - typename V, - typename Output = default_storage_type, sizeof...(Is)>> -KERNEL_FLOAT_INLINE vector swizzle(const V& input, index_sequence _ = {}) { - return vector_swizzle, index_sequence>::call( - into_storage(input)); -} - -/** - * Takes the first ``N`` elements from the given vector and returns a new vector of length ``N``. - * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = first<3>(x); // 1, 2, 3 - * int z = first(x); // 1 - * ``` - */ -template, K>> -KERNEL_FLOAT_INLINE vector first(const V& input) { - static_assert(K <= vector_size, "K cannot exceed vector size"); - using Indices = make_index_sequence; - return vector_swizzle, Indices>::call(into_storage(input)); -} - -namespace detail { -template -struct offset_index_sequence_helper; - -template -struct offset_index_sequence_helper> { - using type = index_sequence; -}; -} // namespace detail - -/** - * Takes the last ``N`` elements from the given vector and returns a new vector of length ``N``. 
- * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = last<3>(x); // 4, 5, 6 - * int z = last(x); // 6 - * ``` - */ -template, K>> -KERNEL_FLOAT_INLINE vector last(const V& input) { - static_assert(K <= vector_size, "K cannot exceed vector size"); - using Indices = typename detail::offset_index_sequence_helper< // - vector_size - K, - make_index_sequence>::type; - - return vector_swizzle, Indices>::call(into_storage(input)); -} - -namespace detail { -template -struct reverse_index_sequence_helper: reverse_index_sequence_helper {}; - -template -struct reverse_index_sequence_helper<0, Is...> { - using type = index_sequence; -}; -} // namespace detail - -/** - * Reverses the elements in the given vector. - * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = reversed(x); // 6, 5, 4, 3, 2, 1 - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector reversed(const V& input) { - using Indices = typename detail::reverse_index_sequence_helper>::type; - - return vector_swizzle, Indices>::call(into_storage(input)); -} - -namespace detail { -template -struct concat_index_sequence_helper {}; - -template -struct concat_index_sequence_helper, index_sequence> { - using type = index_sequence; -}; -} // namespace detail - -/** - * Rotate the given vector ``K`` steps to the right. In other words, this move the front element to the back - * ``K`` times. This is the inverse of ``rotate_left``. - * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = rotate_right<2>(x); // 5, 6, 1, 2, 3, 4 - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector rotate_right(const V& input) { - static constexpr size_t N = vector_size; - static constexpr size_t I = (N > 0) ? (K % N) : 0; - - using First = - typename detail::offset_index_sequence_helper>::type; - using Second = make_index_sequence; - using Indices = typename detail::concat_index_sequence_helper::type; - - return vector_swizzle, Indices>::call(into_storage(input)); -} - -/** - * Rotate the given vector ``K`` steps to the left. In other words, this move the back element to the front - * ``K`` times. This is the inverse of ``rotate_right``. - * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = rotate_left<4>(x); // 5, 6, 1, 2, 3, 4 - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector rotate_left(const V& input) { - static constexpr size_t N = vector_size; - static constexpr size_t K_rev = N > 0 ? (N - K % N) : 0; - - return rotate_right(input); -} - -namespace detail { -template< - typename U, - typename V, - typename Is = make_index_sequence>, - typename Js = make_index_sequence>> -struct concat_helper; - -template -struct concat_helper, index_sequence> { - using type = default_storage_type< - common_t, vector_value_type>, - vector_size + vector_size>; - - KERNEL_FLOAT_INLINE static type call(const U& left, const V& right) { - return vector_traits::create(vector_get(left)..., vector_get(right)...); - } -}; - -template -struct recur_concat_helper; - -template -struct recur_concat_helper { - using type = U; - - KERNEL_FLOAT_INLINE static U call(U&& input) { - return input; - } -}; - -template -struct recur_concat_helper { - using recur_helper = recur_concat_helper::type, Rest...>; - using type = typename recur_helper::type; - - KERNEL_FLOAT_INLINE static type call(const U& left, const V& right, const Rest&... 
rest) { - return recur_helper::call(concat_helper::call(left, right), rest...); - } -}; -} // namespace detail - -template -using concat_type = typename detail::recur_concat_helper...>::type; - -/** - * Concatenate the given vectors into one large vector. For example, given vectors of size 3, size 2 and size 5, - * this function returns a new vector of size 3+2+5=8. If the vectors are not of the same element type, they - * will first be cast into a common data type. - * - * # Examples - * ``` - * vec x = {1, 2, 3}; - * int y = 4; - * vec z = {5, 6, 7, 8}; - * vec xyz = concat(x, y, z); // 1, 2, 3, 4, 5, 6, 7, 8 - * ``` - */ -template -KERNEL_FLOAT_INLINE vector> concat(const Vs&... inputs) { - return detail::recur_concat_helper...>::call(into_storage(inputs)...); -} - -} // namespace kernel_float - -#endif //KERNEL_FLOAT_SWIZZLE_H diff --git a/include/kernel_float/tensor.h b/include/kernel_float/tensor.h new file mode 100644 index 0000000..c111f42 --- /dev/null +++ b/include/kernel_float/tensor.h @@ -0,0 +1,278 @@ +#ifndef KERNEL_FLOAT_BASE_H +#define KERNEL_FLOAT_BASE_H + +#include "base.h" +#include "broadcast.h" +#include "macros.h" +#include "reduce.h" +#include "unops.h" + +namespace kernel_float { + +template class S> +struct tensor { + static constexpr size_t rank = E::rank; + static constexpr size_t volume = E::volume; + + using value_type = T; + using extents_type = E; + using ndindex_type = ndindex; + using storage_type = S; + + KERNEL_FLOAT_INLINE + static constexpr size_t size() { + return E::volume; + } + + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + return E::size(axis); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t stride(size_t axis) { + return E::stride(axis); + } + + KERNEL_FLOAT_INLINE + static constexpr size_t linearize_index(ndindex_type index) { + return E::ravel_index(index); + } + + KERNEL_FLOAT_INLINE + tensor(T init = {}) { + for (size_t i = 0; i < size(); i++) { + storage_[i] = init; + } + } + + KERNEL_FLOAT_INLINE + tensor(storage_type storage) : storage_(storage) {} + + KERNEL_FLOAT_INLINE + storage_type& storage() { + return storage_; + } + + KERNEL_FLOAT_INLINE + const storage_type& storage() const { + return storage_; + } + + KERNEL_FLOAT_INLINE + T* data() { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + const T* data() const { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + const T* cdata() const { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + T* begin() { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + const T* begin() const { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + const T* cbegin() const { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + T* end() { + return storage_.data() + E::volume; + } + + KERNEL_FLOAT_INLINE + const T* end() const { + return storage_.data() + E::volume; + } + + KERNEL_FLOAT_INLINE + const T* cend() const { + return storage_.data() + E::volume; + } + + KERNEL_FLOAT_INLINE + T& at(ndindex_type x) { + return *(data() + linearize_index(x)); + } + + KERNEL_FLOAT_INLINE + const T& at(ndindex_type x) const { + return *(data() + linearize_index(x)); + } + + KERNEL_FLOAT_INLINE + T get(ndindex_type x) const { + return at(x); + } + + KERNEL_FLOAT_INLINE + void set(ndindex_type x, T value) { + at(x) = std::move(value); + } + + KERNEL_FLOAT_INLINE + T& operator[](ndindex_type x) { + return at(x); + } + + KERNEL_FLOAT_INLINE + const T& operator[](ndindex_type x) const { + return at(x); + } + + KERNEL_FLOAT_INLINE + T& operator()(ndindex_type x) { + return 
at(x); + } + + KERNEL_FLOAT_INLINE + const T& operator()(ndindex_type x) const { + return at(x); + } + + KERNEL_FLOAT_INLINE + tensor> flatten() const { + return storage_; + } + + template + KERNEL_FLOAT_INLINE tensor> reshape(extents = {}) const { + static_assert(extents::volume == volume, "invalid reshape shape"); + return storage_; + } + + template + KERNEL_FLOAT_INLINE tensor> broadcast(extents new_shape = {}) const { + return kernel_float::broadcast(*this, new_shape); + } + + template + KERNEL_FLOAT_INLINE tensor, E> map(F fun = {}) const { + return kernel_float::map(fun, *this); + } + + template + KERNEL_FLOAT_INLINE T reduce(F fun = {}) const { + return kernel_float::reduce(fun, *this); + } + + private: + storage_type storage_; +}; + +#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ + template<> \ + struct into_tensor_traits<::T2> { \ + using type = tensor>; \ + \ + KERNEL_FLOAT_INLINE \ + static type call(::T2 v) { \ + return tensor_storage {v.x, v.y}; \ + } \ + }; \ + \ + template<> \ + struct into_tensor_traits<::T3> { \ + using type = tensor>; \ + \ + KERNEL_FLOAT_INLINE \ + static type call(::T3 v) { \ + return tensor_storage {v.x, v.y, v.z}; \ + } \ + }; \ + \ + template<> \ + struct into_tensor_traits<::T4> { \ + using type = tensor>; \ + \ + KERNEL_FLOAT_INLINE \ + static type call(::T4 v) { \ + return tensor_storage {v.x, v.y, v.z, v.w}; \ + } \ + }; + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) + +template +using scalar = tensor>; + +template +using vec = tensor>; + +template +using mat = tensor>; + +// clang-format off +template using vec1 = vec; +template using vec2 = vec; +template using vec3 = vec; +template using vec4 = vec; +template using vec5 = vec; +template using vec6 = vec; +template using vec7 = vec; +template using vec8 = vec; +// clang-format on + +#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ + using k##NAME = scalar; \ + template \ + using NAME##X = vec; \ + using NAME##1 = vec; \ + using NAME##2 = vec; \ + using NAME##3 = vec; \ + using NAME##4 = vec; \ + using NAME##5 = vec; \ + using NAME##6 = vec; \ + using NAME##7 = vec; \ + using NAME##8 = vec; + +KERNEL_FLOAT_TYPE_ALIAS(char, char) +KERNEL_FLOAT_TYPE_ALIAS(short, short) +KERNEL_FLOAT_TYPE_ALIAS(int, int) +KERNEL_FLOAT_TYPE_ALIAS(long, long) +KERNEL_FLOAT_TYPE_ALIAS(longlong, long long) + +KERNEL_FLOAT_TYPE_ALIAS(uchar, unsigned char) +KERNEL_FLOAT_TYPE_ALIAS(ushort, unsigned short) +KERNEL_FLOAT_TYPE_ALIAS(uint, unsigned int) +KERNEL_FLOAT_TYPE_ALIAS(ulong, unsigned long) +KERNEL_FLOAT_TYPE_ALIAS(ulonglong, unsigned long long) + +KERNEL_FLOAT_TYPE_ALIAS(float, float) +KERNEL_FLOAT_TYPE_ALIAS(f32x, float) 
+KERNEL_FLOAT_TYPE_ALIAS(float32x, float) + +KERNEL_FLOAT_TYPE_ALIAS(double, double) +KERNEL_FLOAT_TYPE_ALIAS(f64x, double) +KERNEL_FLOAT_TYPE_ALIAS(float64x, double) + +} // namespace kernel_float + +#endif \ No newline at end of file diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h index 6f1b4fd..41759f6 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -1,77 +1,58 @@ #ifndef KERNEL_FLOAT_UNOPS_H #define KERNEL_FLOAT_UNOPS_H -#include "cast.h" -#include "storage.h" +#include "base.h" namespace kernel_float { namespace detail { -template +template struct map_helper { - KERNEL_FLOAT_INLINE static Output call(F fun, const Input& input) { - return call(fun, input, make_index_sequence> {}); + KERNEL_FLOAT_INLINE static tensor_storage + call(F fun, const tensor_storage& input) { + return call(fun, input, make_index_sequence {}); } private: template - KERNEL_FLOAT_INLINE static Output call(F fun, const Input& input, index_sequence) { - return vector_traits::create(fun(vector_get(input))...); - } -}; - -template -struct map_helper, nested_array> { - KERNEL_FLOAT_INLINE static nested_array call(F fun, const nested_array& input) { - return call(fun, input, make_index_sequence::num_packets> {}); - } - - private: - template - KERNEL_FLOAT_INLINE static nested_array - call(F fun, const nested_array& input, index_sequence) { - return {map_helper::call(fun, input[Is])...}; + KERNEL_FLOAT_INLINE static tensor_storage + call(F fun, const tensor_storage& input, index_sequence) { + return {fun(input[Is])...}; } }; } // namespace detail -template -using map_type = default_storage_type>, vector_size>; - -/** - * Applies ``fun`` to each element from vector ``input`` and returns a new vector with the results. - * This function is the basis for all unary operators like ``sin`` and ``sqrt``. 
- * - * Example - * ======= - * ``` - * vector v = {1, 2, 3}; - * vector w = map([](auto i) { return i * 2; }); // 2, 4, 6 - * ``` - */ -template> -KERNEL_FLOAT_INLINE Output map(F fun, const Input& input) { - return detail::map_helper>::call(fun, into_storage(input)); +template +using map_type = tensor>, tensor_extents>; + +template +KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { + using Input = tensor_value_type; + using Output = result_t; + return detail::map_helper, Output, Input>::call( + fun, + into_tensor(input).storage()); } -#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - namespace ops { \ - template \ - struct NAME { \ - KERNEL_FLOAT_INLINE T operator()(T input) { \ - return T(EXPR); \ - } \ - }; \ - } \ - template \ - KERNEL_FLOAT_INLINE vector> NAME(const V& input) { \ - return map>, V, into_storage_type>({}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + namespace ops { \ + template \ + struct NAME { \ + KERNEL_FLOAT_INLINE T operator()(T input) { \ + return T(EXPR); \ + } \ + }; \ + } \ + template \ + KERNEL_FLOAT_INLINE into_tensor_type NAME(const V& input) { \ + using F = ops::NAME>; \ + return map(F {}, input); \ } -#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ - KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - template \ - KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ - return NAME(vec); \ +#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ + KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + template \ + KERNEL_FLOAT_INLINE tensor operator OP(const tensor& vec) { \ + return NAME(vec); \ } KERNEL_FLOAT_DEFINE_UNARY_OP(negate, -, -input) @@ -128,6 +109,42 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN(signbit) KERNEL_FLOAT_DEFINE_UNARY_FUN(isinf) KERNEL_FLOAT_DEFINE_UNARY_FUN(isnan) +enum struct RoundingMode { ANY, DOWN, UP, NEAREST, TOWARD_ZERO }; + +namespace ops { +template +struct cast; + +template +struct cast { + KERNEL_FLOAT_INLINE R operator()(T input) noexcept { + return R(input); + } +}; + +template +struct cast { + KERNEL_FLOAT_INLINE T operator()(T input) noexcept { + return input; + } +}; +} // namespace ops + +namespace detail { +template +struct map_helper, N, T, T> { + KERNEL_FLOAT_INLINE static tensor_storage + call(ops::cast fun, const tensor_storage& input) { + return input; + } +}; +} // namespace detail + +template +KERNEL_FLOAT_INLINE tensor> cast(const V& input) { + using F = ops::cast, R, Mode>; + return map(F {}, input); +} } // namespace kernel_float -#endif //KERNEL_FLOAT_UNOPS_H +#endif //KERNEL_FLOAT_UNOPS_H \ No newline at end of file From 1a5fd7464a76374c013847a7f114133e57f8a080 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 22 Jun 2023 10:10:17 +0200 Subject: [PATCH 02/50] Tests pass for new tensor implementation --- include/kernel_float.h | 10 +- include/kernel_float/base.h | 20 +- include/kernel_float/bf16.h | 227 ++ include/kernel_float/binops.h | 68 +- include/kernel_float/broadcast.h | 58 +- include/kernel_float/fp16.h | 202 ++ include/kernel_float/meta.h | 2 +- include/kernel_float/reduce.h | 2 +- include/kernel_float/tensor.h | 9 + include/kernel_float/unops.h | 21 +- single_include/kernel_float.h | 3565 ++++++++++++++---------------- tests/basic.cu | 4 +- tests/binops.cu | 2 +- tests/common.h | 4 +- tests/swizzle.cu | 3 +- 15 files changed, 2220 insertions(+), 1977 deletions(-) create mode 100644 include/kernel_float/bf16.h create mode 100644 include/kernel_float/fp16.h diff --git a/include/kernel_float.h b/include/kernel_float.h index 14be925..51ae25b 100644 --- a/include/kernel_float.h +++ 
b/include/kernel_float.h
@@ -1,23 +1,21 @@
 #ifndef KERNEL_FLOAT_H
 #define KERNEL_FLOAT_H
 
-//#include "kernel_float/bf16.h"
-//#include "kernel_float/binops.h"
 //#include "kernel_float/broadcast.h"
-//#include "kernel_float/fp16.h"
 //#include "kernel_float/fp8.h"
 //#include "kernel_float/interface.h"
 //#include "kernel_float/iterate.h"
 //#include "kernel_float/macros.h"
 //#include "kernel_float/storage.h"
 //#include "kernel_float/swizzle.h"
 
 #include "kernel_float/base.h"
+#include "kernel_float/bf16.h"
+#include "kernel_float/binops.h"
+#include "kernel_float/fp16.h"
 #include "kernel_float/macros.h"
 #include "kernel_float/meta.h"
+#include "kernel_float/reduce.h"
 #include "kernel_float/tensor.h"
 #include "kernel_float/unops.h"
 
diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h
index 1563ac7..aeb5fc6 100644
--- a/include/kernel_float/base.h
+++ b/include/kernel_float/base.h
@@ -302,11 +302,6 @@ struct tensor_traits<tensor<T, E, S>> {
 template<typename V>
 using into_tensor_type = typename into_tensor_traits<V>::type;
 
-template<typename V>
-KERNEL_FLOAT_INLINE into_tensor_type<V> into_tensor(V&& input) {
-    return into_tensor_traits<V>::call(std::forward<V>(input));
-}
-
 template<typename V>
 using tensor_extents = typename tensor_traits<into_tensor_type<V>>::extents_type;
 
@@ -319,10 +314,23 @@ static constexpr size_t tensor_volume = tensor_extents<V>::volume;
 template<typename V>
 using tensor_value_type = typename tensor_traits<into_tensor_type<V>>::value_type;
 
+template<typename V>
+using tensor_storage_type = tensor_storage<tensor_value_type<V>, tensor_volume<V>>;
+
 template<typename... Vs>
-using tensor_promoted_value_type =
+using promoted_tensor_value_type =
     promote_t<typename tensor_traits<into_tensor_type<Vs>>::value_type...>;
 
+template<typename V>
+KERNEL_FLOAT_INLINE into_tensor_type<V> into_tensor(V&& input) {
+    return into_tensor_traits<V>::call(std::forward<V>(input));
+}
+
+template<typename V>
+KERNEL_FLOAT_INLINE tensor_storage_type<V> into_tensor_storage(V&& input) {
+    return into_tensor_traits<V>::call(std::forward<V>(input)).storage();
+}
+
 } // namespace kernel_float
 
 #endif
\ No newline at end of file
diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h
new file mode 100644
index 0000000..4ff9905
--- /dev/null
+++ b/include/kernel_float/bf16.h
@@ -0,0 +1,227 @@
+#ifndef KERNEL_FLOAT_BF16_H
+#define KERNEL_FLOAT_BF16_H
+
+#include "macros.h"
+
+#if KERNEL_FLOAT_BF16_AVAILABLE
+#include <cuda_bf16.h>
+
+#include "binops.h"
+
+namespace kernel_float {
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(__nv_bfloat16, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16)
+
+template<>
+struct into_tensor_traits<__nv_bfloat162> {
+    using type = tensor<__nv_bfloat16, extents<2>>;
+
+    KERNEL_FLOAT_INLINE
+    static type call(__nv_bfloat162 input) {
+        return tensor_storage<__nv_bfloat16, 2> {input.x, input.y};
+    }
+};
+
+namespace detail {
+template<typename F>
+struct map_bfloat16x2 {
+    KERNEL_FLOAT_INLINE
+    static __nv_bfloat162 call(F fun, __nv_bfloat162 input) {
+        __nv_bfloat16 a = fun(input.x);
+        __nv_bfloat16 b = fun(input.y);
+        return {a, b};
+    }
+};
+
+template<typename F>
+struct zip_bfloat16x2 {
+    KERNEL_FLOAT_INLINE
+    static __nv_bfloat162 call(F fun, __nv_bfloat162 left, __nv_bfloat162 right) {
+        // combine lanes pairwise: x with x, y with y
+        __nv_bfloat16 a = fun(left.x, right.x);
+        __nv_bfloat16 b = fun(left.y, right.y);
+        return {a, b};
+    }
+};
+
+template<typename F, size_t N>
+struct apply_impl<F, N, __nv_bfloat16, __nv_bfloat16> {
+    KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N>
+    call(F fun, const tensor_storage<__nv_bfloat16, N>& input) {
+        tensor_storage<__nv_bfloat16, N> result;
+
+#pragma unroll
+        for (size_t i = 0; i + 1 < N; i += 2) {
+            __nv_bfloat162 a = {input[i], input[i + 1]};
+            __nv_bfloat162 b = map_bfloat16x2<F>::call(fun, a);
+            result[i + 0] = b.x;
+            result[i + 1] = b.y;
+        }
+
+        // odd tail: the pairwise loop above stops before the last element
+        if (N % 2 != 0) {
+            result[N - 1] = fun(input[N - 1]);
+        }
+
+        return result;
+    }
+};
+
+template<typename F, size_t N>
+struct apply_impl<F, N, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> {
+    KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> call(
+        F fun,
+        const tensor_storage<__nv_bfloat16, N>& left,
+        const tensor_storage<__nv_bfloat16, N>& right) {
+        tensor_storage<__nv_bfloat16, N> result;
+#pragma unroll
+        for (size_t i = 0; i + 1 < N; i += 2) {
+            __nv_bfloat162 a = {left[i], left[i + 1]};
+            __nv_bfloat162 b = {right[i], right[i + 1]};
+            __nv_bfloat162 c = zip_bfloat16x2<F>::call(fun, a, b);
+            result[i + 0] = c.x;
+            result[i + 1] = c.y;
+        }
+
+        // odd tail, handled as a scalar operation
+        if (N % 2 != 0) {
+            result[N - 1] = fun(left[N - 1], right[N - 1]);
+        }
+
+        return result;
+    }
+};
+} // namespace detail
+
+#if KERNEL_FLOAT_IS_DEVICE
+#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \
+    namespace ops { \
+    template<> \
+    struct NAME<__nv_bfloat16> { \
+        KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \
+            return FUN1(input); \
+        } \
+    }; \
+    } \
+    namespace detail { \
+    template<> \
+    struct map_bfloat16x2<ops::NAME<__nv_bfloat16>> { \
+        KERNEL_FLOAT_INLINE static __nv_bfloat162 \
+        call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \
+            return FUN2(input); \
+        } \
+    }; \
+    }

+KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2);
+KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2);
+KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil);
+KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos);
+KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, ::h2exp);
+KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10);
+KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor);
+KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log);
+KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log10);
+KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint);
+KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt);
+KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin);
+KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt);
+KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc);
+
+#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \
+    namespace ops { \
+    template<> \
+    struct NAME<__nv_bfloat16> { \
+        KERNEL_FLOAT_INLINE __nv_bfloat16 \
+        operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \
+            return FUN1(left, right); \
+        } \
+    }; \
+    } \
+    namespace detail { \
+    template<> \
+    struct zip_bfloat16x2<ops::NAME<__nv_bfloat16>> { \
+        KERNEL_FLOAT_INLINE static __nv_bfloat162 \
+        call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) { \
+            return FUN2(left, right); \
+        } \
+    }; \
+    }
+
+KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2)
+KERNEL_FLOAT_BF16_BINARY_FUN(subtract, __hsub, __hsub2)
+KERNEL_FLOAT_BF16_BINARY_FUN(multiply, __hmul, __hmul2)
+KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div)
+KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2)
+KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2)
+
+KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2)
+KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __hne, __hne2)
+KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2)
+KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2)
+KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2)
+KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hge2)
+
+#endif
+
+#define KERNEL_FLOAT_BF16_CAST(T, TO_HALF, FROM_HALF) \
+    namespace ops { \
+    template<> \
+    struct cast<T, __nv_bfloat16> { \
+        KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(T input) { \
+            return TO_HALF; \
+        } \
+    }; \
+    template<> \
+    struct cast<__nv_bfloat16, T> { 
\ + KERNEL_FLOAT_INLINE T operator()(__nv_bfloat16 input) { \ + return FROM_HALF; \ + } \ + }; \ + } + +KERNEL_FLOAT_BF16_CAST(double, __double2bfloat16(input), double(__bfloat162float(input))); +KERNEL_FLOAT_BF16_CAST(float, __float2bfloat16(input), __bfloat162float(input)); + +// there are no official char casts. Instead, cast to int and then to char +KERNEL_FLOAT_BF16_CAST(char, __int2bfloat16_rn(input), (char)__bfloat162int_rz(input)); +KERNEL_FLOAT_BF16_CAST( + signed char, + __int2bfloat16_rn(input), + (signed char)__bfloat162int_rz(input)); +KERNEL_FLOAT_BF16_CAST( + unsigned char, + __int2bfloat16_rn(input), + (unsigned char)__bfloat162int_rz(input)); + +KERNEL_FLOAT_BF16_CAST(signed short, __bfloat162short_rz(input), __short2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST(signed int, __bfloat162int_rz(input), __int2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST( + signed long, + __ll2bfloat16_rn(input), + (signed long)(__bfloat162ll_rz(input))); +KERNEL_FLOAT_BF16_CAST(signed long long, __ll2bfloat16_rn(input), __bfloat162ll_rz(input)); + +KERNEL_FLOAT_BF16_CAST(unsigned short, __bfloat162ushort_rz(input), __ushort2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST(unsigned int, __bfloat162uint_rz(input), __uint2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST( + unsigned long, + __ull2bfloat16_rn(input), + (unsigned long)(__bfloat162ull_rz(input))); +KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input)); + +using bfloat16 = __nv_bfloat16; +//KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) +//KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) + +} // namespace kernel_float + +#if KERNEL_FLOAT_FP16_AVAILABLE +#include "fp16.h" + +namespace kernel_float { +KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)); +} + +#endif // KERNEL_FLOAT_FP16_AVAILABLE +#endif + +#endif //KERNEL_FLOAT_BF16_H diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 4252c4e..8c50499 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -7,8 +7,8 @@ namespace kernel_float { namespace detail { -template -struct zip_helper { +template +struct apply_impl { KERNEL_FLOAT_INLINE static tensor_storage call(F fun, const tensor_storage& left, const tensor_storage& right) { return call(fun, left, right, make_index_sequence {}); @@ -34,10 +34,10 @@ template KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) { using A = tensor_value_type; using B = tensor_value_type; - using C = result_t; + using O = result_t; using E = broadcast_tensor_extents; - return detail::zip_helper::call( + return detail::apply_impl::call( fun, broadcast(left).storage(), broadcast(right).storage()); @@ -45,14 +45,21 @@ KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) template using zip_common_type = tensor< - result_t, tensor_promoted_value_type>, + result_t, promoted_tensor_value_type>, broadcast_tensor_extents>; template KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, const R& right) { - while (1) - ; - // TODO + using T = promoted_tensor_value_type; + using O = result_t; + using E = broadcast_tensor_extents; + + return detail::apply_impl::call( + fun, + detail::convert_helper, tensor_extents, T, E>::call( + into_tensor_storage(left)), + detail::convert_helper, tensor_extents, T, E>::call( + into_tensor_storage(right))); } #define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR) \ @@ -64,35 +71,30 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co } \ }; \ } \ - 
template> \ + template> \ KERNEL_FLOAT_INLINE zip_common_type, L, R> NAME(L&& left, R&& right) { \ return zip_common(ops::NAME {}, std::forward(left), std::forward(right)); \ } -#define KERNEL_FLOAT_DEFINE_BINARY_OP(NAME, OP) \ - KERNEL_FLOAT_DEFINE_BINARY(NAME, left OP right) \ - template< \ - typename L, \ - typename R, \ - typename C = tensor_promoted_value_type, \ - typename E1, \ - typename E2> \ - KERNEL_FLOAT_INLINE zip_common_type, L, R> operator OP( \ - const tensor& left, \ - const tensor& right) { \ - return zip_common(ops::NAME {}, left, right); \ - } \ - template, typename E> \ - KERNEL_FLOAT_INLINE zip_common_type, L, R> operator OP( \ - const tensor& left, \ - const R& right) { \ - return zip_common(ops::NAME {}, left, right); \ - } \ - template, typename E> \ - KERNEL_FLOAT_INLINE zip_common_type, L, R> operator OP( \ - const L& left, \ - const tensor& right) { \ - return zip_common(ops::NAME {}, left, right); \ +#define KERNEL_FLOAT_DEFINE_BINARY_OP(NAME, OP) \ + KERNEL_FLOAT_DEFINE_BINARY(NAME, left OP right) \ + template, typename E1, typename E2> \ + KERNEL_FLOAT_INLINE zip_common_type, tensor, tensor> operator OP( \ + const tensor& left, \ + const tensor& right) { \ + return zip_common(ops::NAME {}, left, right); \ + } \ + template>, typename E> \ + KERNEL_FLOAT_INLINE zip_common_type, tensor, R> operator OP( \ + const tensor& left, \ + const R& right) { \ + return zip_common(ops::NAME {}, left, right); \ + } \ + template, R>, typename E> \ + KERNEL_FLOAT_INLINE zip_common_type, L, tensor> operator OP( \ + const L& left, \ + const tensor& right) { \ + return zip_common(ops::NAME {}, left, right); \ } KERNEL_FLOAT_DEFINE_BINARY_OP(add, +) diff --git a/include/kernel_float/broadcast.h b/include/kernel_float/broadcast.h index 5bc47b1..976488b 100644 --- a/include/kernel_float/broadcast.h +++ b/include/kernel_float/broadcast.h @@ -2,6 +2,7 @@ #define KERNEL_FLOAT_CAST_H #include "base.h" +#include "unops.h" namespace kernel_float { namespace detail { @@ -173,7 +174,7 @@ struct strides_helper> { }; template -struct broadcast_helper { +struct broadcast_impl { KERNEL_FLOAT_INLINE static tensor_storage call(tensor_storage input) { static_assert(is_broadcastable, "cannot broadcast to required shape"); @@ -187,7 +188,7 @@ struct broadcast_helper { }; template -struct broadcast_helper { +struct broadcast_impl { KERNEL_FLOAT_INLINE static tensor_storage call(tensor_storage input) { return input; @@ -200,26 +201,26 @@ template tensor, extents> broadcast(const V& input, extents new_extents = {}) { using T = tensor_value_type; - return detail::broadcast_helper, extents>::call( + return detail::broadcast_impl, extents>::call( into_tensor(input).storage()); } template tensor> fill(T value = {}, extents = {}) { tensor_storage input = {value}; - return detail::broadcast_helper, extents>::call(input); + return detail::broadcast_impl, extents>::call(input); } template tensor> zeros(extents = {}) { tensor_storage input = {T {}}; - return detail::broadcast_helper, extents>::call(input); + return detail::broadcast_impl, extents>::call(input); } template tensor> ones(extents = {}) { tensor_storage input = {T {1}}; - return detail::broadcast_helper, extents>::call(input); + return detail::broadcast_impl, extents>::call(input); } template, typename E = tensor_extents> @@ -232,6 +233,51 @@ tensor ones_like(const V&) { return ones(E {}); } +namespace detail { +template +struct convert_helper { + KERNEL_FLOAT_INLINE + static tensor_storage call(tensor_storage input) { + tensor_storage intermediate 
=
+            detail::apply_impl<ops::cast<T, T2, M>, E::volume, T2, T>::call(input);
+        return detail::broadcast_impl<T2, E, E2>::call(intermediate);
+    }
+};
+
+template<typename T, typename E, RoundingMode M>
+struct convert_helper<T, E, T, E, M> {
+    KERNEL_FLOAT_INLINE
+    static tensor_storage<T, E::volume> call(tensor_storage<T, E::volume> input) {
+        return input;
+    }
+};
+
+template<typename T, typename E, typename E2, RoundingMode M>
+struct convert_helper<T, E, T, E2, M> {
+    KERNEL_FLOAT_INLINE
+    static tensor_storage<T, E2::volume> call(tensor_storage<T, E::volume> input) {
+        return detail::broadcast_impl<T, E, E2>::call(input);
+    }
+};
+
+template<typename T, typename E, typename T2, RoundingMode M>
+struct convert_helper<T, E, T2, E, M> {
+    KERNEL_FLOAT_INLINE
+    static tensor_storage<T2, E::volume> call(tensor_storage<T, E::volume> input) {
+        return detail::apply_impl<ops::cast<T, T2, M>, E::volume, T2, T>::call(input);
+    }
+};
+} // namespace detail
+
+/**
+ * Cast the values of the given input tensor to type `R` and then broadcast the result to the given shape `(Ns...)`.
+ *
+ * For example, `convert<float, 4>(x)` yields a `tensor<float, extents<4>>`.
+ */
+template<typename R, size_t... Ns, RoundingMode M = RoundingMode::ANY, typename V>
+tensor<R, extents<Ns...>> convert(const V& input, extents<Ns...> new_shape = {}) {
+    return detail::convert_helper<tensor_value_type<V>, tensor_extents<V>, R, extents<Ns...>, M>::
+        call(into_tensor(input).storage());
+}
+
+} // namespace kernel_float
+
+#endif
\ No newline at end of file
diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h
new file mode 100644
index 0000000..cf8e154
--- /dev/null
+++ b/include/kernel_float/fp16.h
@@ -0,0 +1,202 @@
+#ifndef KERNEL_FLOAT_FP16_H
+#define KERNEL_FLOAT_FP16_H
+
+#include "macros.h"
+
+#if KERNEL_FLOAT_FP16_AVAILABLE
+#include <cuda_fp16.h>
+
+#include "binops.h"
+
+namespace kernel_float {
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(__half, bool)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half)
+KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half)
+
+template<>
+struct into_tensor_traits<__half2> {
+    using type = tensor<__half, extents<2>>;
+
+    KERNEL_FLOAT_INLINE
+    static type call(__half2 input) {
+        return tensor_storage<__half, 2> {input.x, input.y};
+    }
+};
+
+namespace detail {
+template<typename F>
+struct map_halfx2 {
+    KERNEL_FLOAT_INLINE
+    static __half2 call(F fun, __half2 input) {
+        __half a = fun(input.x);
+        __half b = fun(input.y);
+        return {a, b};
+    }
+};
+
+template<typename F>
+struct zip_halfx2 {
+    KERNEL_FLOAT_INLINE
+    static __half2 call(F fun, __half2 left, __half2 right) {
+        // combine lanes pairwise: x with x, y with y
+        __half a = fun(left.x, right.x);
+        __half b = fun(left.y, right.y);
+        return {a, b};
+    }
+};
+
+template<typename F, size_t N>
+struct apply_impl<F, N, __half, __half> {
+    KERNEL_FLOAT_INLINE static tensor_storage<__half, N>
+    call(F fun, const tensor_storage<__half, N>& input) {
+        tensor_storage<__half, N> result;
+
+#pragma unroll
+        for (size_t i = 0; i + 1 < N; i += 2) {
+            __half2 a = {input[i], input[i + 1]};
+            __half2 b = map_halfx2<F>::call(fun, a);
+            result[i + 0] = b.x;
+            result[i + 1] = b.y;
+        }
+
+        // odd tail: the pairwise loop above stops before the last element
+        if (N % 2 != 0) {
+            result[N - 1] = fun(input[N - 1]);
+        }
+
+        return result;
+    }
+};
+
+template<typename F, size_t N>
+struct apply_impl<F, N, __half, __half, __half> {
+    KERNEL_FLOAT_INLINE static tensor_storage<__half, N>
+    call(F fun, const tensor_storage<__half, N>& left, const tensor_storage<__half, N>& right) {
+        tensor_storage<__half, N> result;
+#pragma unroll
+        for (size_t i = 0; i + 1 < N; i += 2) {
+            __half2 a = {left[i], left[i + 1]};
+            __half2 b = {right[i], right[i + 1]};
+            __half2 c = zip_halfx2<F>::call(fun, a, b);
+            result[i + 0] = c.x;
+            result[i + 1] = c.y;
+        }
+
+        // odd tail, handled as a scalar operation
+        if (N % 2 != 0) {
+            result[N - 1] = fun(left[N - 1], right[N - 1]);
+        }
+
+        return result;
+    }
+};
+} // namespace detail
+
+#if KERNEL_FLOAT_IS_DEVICE
+#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \
+    namespace ops { \
+    template<> \
+    struct NAME<__half> { \
+        KERNEL_FLOAT_INLINE __half operator()(__half input) { \
+            return FUN1(input); \
+        } \
+    }; \
+    } \
+    namespace detail { \
+    template<> \
+    struct map_halfx2<ops::NAME<__half>> { \
+        KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) { \
+            return FUN2(input); \
+        } \
+    }; \
+    }
+
+KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2);
+KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2);
+KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil);
+KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos);
+KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp);
+KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10);
+KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor);
+KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log);
+KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log10);
+KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint);
+KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt);
+KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin);
+KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt);
+KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc);
+
+#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \
+    namespace ops { \
+    template<> \
+    struct NAME<__half> { \
+        KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \
+            return FUN1(left, right); \
+        } \
+    }; \
+    } \
+    namespace detail { \
+    template<> \
+    struct zip_halfx2<ops::NAME<__half>> { \
+        KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \
+            return FUN2(left, right); \
+        } \
+    }; \
+    }
+
+KERNEL_FLOAT_FP16_BINARY_FUN(add, __hadd, __hadd2)
+KERNEL_FLOAT_FP16_BINARY_FUN(subtract, __hsub, __hsub2)
+KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2)
+KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div)
+KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2)
+KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2)
+
+KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2)
+KERNEL_FLOAT_FP16_BINARY_FUN(not_equal_to, __hne, __hne2)
+KERNEL_FLOAT_FP16_BINARY_FUN(less, __hlt, __hlt2)
+KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2)
+KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2)
+KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hge2)
+
+#endif
+
+#define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \
+    namespace ops { \
+    template<> \
+    struct cast<T, __half> { \
+        KERNEL_FLOAT_INLINE __half operator()(T input) { \
+            return TO_HALF; \
+        } \
+    }; \
+    template<> \
+    struct cast<__half, T> { \
+        KERNEL_FLOAT_INLINE T operator()(__half input) { \
+            return FROM_HALF; \
+        } \
+    }; \
+    }
+
+KERNEL_FLOAT_FP16_CAST(double, __double2half(input), double(__half2float(input)));
+KERNEL_FLOAT_FP16_CAST(float, __float2half(input), __half2float(input));
+
+// there are no official char casts. 
Instead, cast to int and then to char +KERNEL_FLOAT_FP16_CAST(char, __int2half_rn(input), (char)__half2int_rz(input)); +KERNEL_FLOAT_FP16_CAST(signed char, __int2half_rn(input), (signed char)__half2int_rz(input)); +KERNEL_FLOAT_FP16_CAST(unsigned char, __int2half_rn(input), (unsigned char)__half2int_rz(input)); + +KERNEL_FLOAT_FP16_CAST(signed short, __half2short_rz(input), __short2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(signed int, __half2int_rz(input), __int2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(signed long, __ll2half_rn(input), (signed long)(__half2ll_rz(input))); +KERNEL_FLOAT_FP16_CAST(signed long long, __ll2half_rn(input), __half2ll_rz(input)); + +KERNEL_FLOAT_FP16_CAST(unsigned short, __half2ushort_rz(input), __ushort2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(unsigned int, __half2uint_rz(input), __uint2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__half2ull_rz(input))); +KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); + +using half = __half; +//KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) +//KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) + +} // namespace kernel_float + +#endif + +#endif //KERNEL_FLOAT_FP16_H diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h index e6cb31e..54873ba 100644 --- a/include/kernel_float/meta.h +++ b/include/kernel_float/meta.h @@ -170,7 +170,7 @@ struct multi_promote_type: } // namespace detail template -using promote_t = typename detail::multi_promote_type::type; +using promote_t = typename detail::multi_promote_type...>::type; namespace detail { diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index ddfabef..7738071 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -43,7 +43,7 @@ template KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { return detail::reduce_helper, tensor_value_type>::call( fun, - into_tensor(input)); + into_tensor_storage(input)); } /** diff --git a/include/kernel_float/tensor.h b/include/kernel_float/tensor.h index c111f42..ba5bdc8 100644 --- a/include/kernel_float/tensor.h +++ b/include/kernel_float/tensor.h @@ -39,6 +39,9 @@ struct tensor { return E::ravel_index(index); } + template = 0> + KERNEL_FLOAT_INLINE tensor(Args&&... args) : storage_ {std::forward(args)...} {} + KERNEL_FLOAT_INLINE tensor(T init = {}) { for (size_t i = 0; i < size(); i++) { @@ -229,6 +232,12 @@ using vec = tensor>; template using mat = tensor>; +template +KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... 
diff --git a/include/kernel_float/tensor.h b/include/kernel_float/tensor.h
index c111f42..ba5bdc8 100644
--- a/include/kernel_float/tensor.h
+++ b/include/kernel_float/tensor.h
@@ -39,6 +39,9 @@ struct tensor {
         return E::ravel_index(index);
     }
 
+    template<typename... Args, enabled_t<sizeof...(Args) == E::volume && (E::volume > 1), int> = 0>
+    KERNEL_FLOAT_INLINE tensor(Args&&... args) : storage_ {std::forward<Args>(args)...} {}
+
     KERNEL_FLOAT_INLINE
     tensor(T init = {}) {
         for (size_t i = 0; i < size(); i++) {
@@ -229,6 +232,12 @@ using vec = tensor<T, extents<N>>;
 template<typename T, size_t N, size_t M>
 using mat = tensor<T, extents<N, M>>;
 
+template<typename... Args>
+KERNEL_FLOAT_INLINE vec<promote_t<Args...>, sizeof...(Args)> make_vec(Args&&... args) {
+    using T = promote_t<Args...>;
+    return tensor_storage<T, sizeof...(Args)> {T {args}...};
+};
+
 // clang-format off
 template<typename T> using vec1 = vec<T, 1>;
 template<typename T> using vec2 = vec<T, 2>;
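Usage of the two additions above might look as follows (a hypothetical snippet; vec is the rank-1 alias, and make_vec computes its element type with promote_t):

```cpp
#include "kernel_float.h"

namespace kf = kernel_float;

// The variadic constructor takes exactly one argument per element:
kf::vec<float, 3> a = {1.0f, 2.0f, 3.0f};

// make_vec deduces both length and element type;
// promote_t<int, float, double> is double, so b is vec<double, 3>:
auto b = kf::make_vec(1, 2.5f, 3.0);
```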
diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h
index 41759f6..185eba4 100644
--- a/include/kernel_float/unops.h
+++ b/include/kernel_float/unops.h
@@ -5,8 +5,12 @@ namespace kernel_float {
 
 namespace detail {
-template<typename F, size_t N, typename Output, typename Input>
-struct map_helper {
+
+template<typename F, size_t N, typename Output, typename... Args>
+struct apply_impl;
+
+template<typename F, size_t N, typename Output, typename Input>
+struct apply_impl<F, N, Output, Input> {
     KERNEL_FLOAT_INLINE static tensor_storage<Output, N>
     call(F fun, const tensor_storage<Input, N>& input) {
         return call(fun, input, make_index_sequence<N> {});
@@ -28,7 +32,7 @@ template<typename F, typename V>
 KERNEL_FLOAT_INLINE map_type<F, V> map(F fun, const V& input) {
     using Input = tensor_value_type<V>;
     using Output = result_t<F, Input>;
-    return detail::map_helper<F, tensor_volume<V>, Output, Input>::call(
+    return detail::apply_impl<F, tensor_volume<V>, Output, Input>::call(
         fun,
         into_tensor(input).storage());
 }
@@ -128,17 +132,14 @@ struct cast {
         return input;
     }
 };
-} // namespace ops
 
-namespace detail {
-template<typename T, size_t N>
-struct map_helper<ops::cast<T, T>, N, T, T> {
-    KERNEL_FLOAT_INLINE static tensor_storage<T, N>
-    call(ops::cast<T, T> fun, const tensor_storage<T, N>& input) {
+template<typename T, RoundingMode m>
+struct cast<T, T, m> {
+    KERNEL_FLOAT_INLINE T operator()(T input) noexcept {
         return input;
     }
 };
-} // namespace detail
+} // namespace ops
 
 template<typename R, typename V>
 KERNEL_FLOAT_INLINE tensor<R, tensor_extents<V>> cast(const V& input) {
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
index 72edd39..687970f 100644
--- a/single_include/kernel_float.h
+++ b/single_include/kernel_float.h
@@ -1,10 +1,9 @@
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
-// date: 2023-03-31 16:04:17.777000
-// git hash: 5a6b682ac483b61ec8a1697bf6adf4e929021574
+// date: 2023-06-22 10:09:53.221460
+// git hash: ce9f9941cc29e9d14001395dd631df563b79b2f0
 //================================================================================
-
 #ifndef KERNEL_FLOAT_MACROS_H
 #define KERNEL_FLOAT_MACROS_H
 
@@ -13,20 +12,20 @@
 #ifdef __CUDA_ARCH__
 #define KERNEL_FLOAT_INLINE __forceinline__ __device__
-#define KERNEL_FLOAT_ON_DEVICE (1)
-#define KERNEL_FLOAT_ON_HOST (0)
+#define KERNEL_FLOAT_IS_DEVICE (1)
+#define KERNEL_FLOAT_IS_HOST (0)
 #define KERNEL_FLOAT_CUDA_ARCH (__CUDA_ARCH__)
 #else
 #define KERNEL_FLOAT_INLINE __forceinline__ __host__
-#define KERNEL_FLOAT_ON_DEVICE (0)
-#define KERNEL_FLOAT_ON_HOST (1)
+#define KERNEL_FLOAT_IS_DEVICE (0)
+#define KERNEL_FLOAT_IS_HOST (1)
 #define KERNEL_FLOAT_CUDA_ARCH (0)
 #endif
 #else
 #define KERNEL_FLOAT_INLINE inline
 #define KERNEL_FLOAT_CUDA (0)
-#define KERNEL_FLOAT_ON_HOST (1)
-#define KERNEL_FLOAT_ON_DEVICE (0)
+#define KERNEL_FLOAT_IS_HOST (1)
+#define KERNEL_FLOAT_IS_DEVICE (0)
 #define KERNEL_FLOAT_CUDA_ARCH (0)
 #endif
@@ -48,22 +47,272 @@
 #define KERNEL_FLOAT_UNREACHABLE __builtin_unreachable()
 
 #endif //KERNEL_FLOAT_MACROS_H
-#ifndef KERNEL_FLOAT_CORE_H
-#define KERNEL_FLOAT_CORE_H
+#ifndef KERNEL_FLOAT_COMPLEX_TYPE_H
+#define KERNEL_FLOAT_COMPLEX_TYPE_H
 
 namespace kernel_float {
 
-template<size_t I>
-struct const_index {
-    static constexpr size_t value = I;
+template<typename T>
+struct alignas(2 * alignof(T)) complex_storage {
+    T re;
+    T im;
+};
+
+template<typename T>
+struct complex_type: complex_storage<T> {
+    using base_type = complex_storage<T>;
+
+    template<typename T2>
+    KERNEL_FLOAT_INLINE complex_type(complex_type<T2> that) :
+        base_type {T(that.real()), T(that.imag())} {}
+
+    KERNEL_FLOAT_INLINE
+    complex_type(T real = {}, T imag = {}) : base_type {real, imag} {}
+
+    KERNEL_FLOAT_INLINE
+    T real() const {
+        return this->re;
+    }
+
+    KERNEL_FLOAT_INLINE
+    T imag() const {
+        return this->im;
+    }
+
+    KERNEL_FLOAT_INLINE
+    T norm() const {
+        return this->re * this->re + this->im * this->im;
+    }
+
+    KERNEL_FLOAT_INLINE
+    complex_type conj() const {
+        return {this->re, -this->im};
+    }
+};
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator+(complex_type<T> v) {
+    return v;
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator+(complex_type<T> a, complex_type<T> b) {
+    return {a.real() + b.real(), a.imag() + b.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator+(T a, complex_type<T> b) {
+    return {a + b.real(), b.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator+(complex_type<T> a, T b) {
+    return {a.real() + b, a.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator+=(complex_type<T>& a, complex_type<T> b) {
+    return (a = a + b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator+=(complex_type<T>& a, T b) {
+    return (a = a + b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator-(complex_type<T> v) {
+    return {-v.real(), -v.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator-(complex_type<T> a, complex_type<T> b) {
+    return {a.real() - b.real(), a.imag() - b.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator-(T a, complex_type<T> b) {
+    return {a - b.real(), -b.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator-(complex_type<T> a, T b) {
+    return {a.real() - b, a.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator-=(complex_type<T>& a, complex_type<T> b) {
+    return (a = a - b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator-=(complex_type<T>& a, T b) {
+    return (a = a - b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator*(complex_type<T> a, complex_type<T> b) {
+    return {a.real() * b.real() - a.imag() * b.imag(), a.real() * b.imag() + a.imag() * b.real()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator*(complex_type<T> a, T b) {
+    return {a.real() * b, a.imag() * b};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator*=(complex_type<T>& a, complex_type<T> b) {
+    return (a = a * b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator*=(complex_type<T>& a, T b) {
+    return (a = a * b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator*(T a, complex_type<T> b) {
+    return {
+        a * b.real(),
+        a * b.imag(),
+    };
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator/(complex_type<T> a, complex_type<T> b) {
+    T normi = T(1) / b.norm();
+
+    return {
+        (a.real() * b.real() + a.imag() * b.imag()) * normi,
+        (a.imag() * b.real() - a.real() * b.imag()) * normi};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator/(complex_type<T> a, T b) {
+    return {a.real() * (T(1) / b), a.imag() * (T(1) / b)};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> operator/(T a, complex_type<T> b) {
+    T normi = T(1) / b.norm();
+
+    return {a * b.real() * normi, -a * b.imag() * normi};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator/=(complex_type<T>& a, complex_type<T> b) {
+    return (a = a / b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T>& operator/=(complex_type<T>& a, T b) {
+    return (a = a / b);
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T real(complex_type<T> v) {
+    return v.real();
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T imag(complex_type<T> v) {
+    return v.imag();
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T abs(complex_type<T> v) {
+    return hypot(v.real(), v.imag());
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T arg(complex_type<T> v) {
+    return atan2(v.imag(), v.real());
+}
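A worked example of the division rule above: a / b is computed as a * conj(b) scaled by 1 / norm(b). The snippet below is a sketch with values checked by hand, not part of the patch:

```cpp
#include "kernel_float.h"

using kernel_float::complex_type;

KERNEL_FLOAT_INLINE complex_type<float> divide_example() {
    complex_type<float> a {1.0f, 2.0f};   // 1 + 2i
    complex_type<float> b {3.0f, -1.0f};  // 3 - i
    // norm(b) = 9 + 1 = 10, conj(b) = 3 + i
    // a / b = (1 + 2i)(3 + i) / 10 = (1 + 7i) / 10
    return a / b;  // 0.1 + 0.7i
}
```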
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> sqrt(complex_type<T> v) {
+    T radius = abs(v);
+    T cosA = v.real() / radius;
+
+    complex_type<T> out = {
+        sqrt(radius * (cosA + T(1)) * T(.5)),
+        sqrt(radius * (T(1) - cosA) * T(.5))};
 
-    KERNEL_FLOAT_INLINE constexpr operator size_t() const noexcept {
-        return I;
+    // the imaginary part of the result takes the sign of v.imag()
+    if (v.imag() < 0) {
+        out = complex_type<T> {out.real(), -out.imag()};
     }
+
+    return out;
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE T norm(complex_type<T> v) {
+    return v.real() * v.real() + v.imag() * v.imag();
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> conj(complex_type<T> v) {
+    return {v.real(), -v.imag()};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> exp(complex_type<T> v) {
+    // TODO: Handle nan and inf correctly
+    T e = exp(v.real());
+    T a = v.imag();
+    return complex_type<T>(e * cos(a), e * sin(a));
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> log(complex_type<T> v) {
+    return {log(abs(v)), arg(v)};
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> pow(complex_type<T> a, T b) {
+    return exp(b * log(a));
+}
+
+template<typename T>
+KERNEL_FLOAT_INLINE complex_type<T> pow(complex_type<T> a, complex_type<T> b) {
+    return exp(b * log(a));
+}
+
+template<typename L, typename R>
+struct promote_type<complex_type<L>, complex_type<R>> {
+    using type = complex_type<promote_t<L, R>>;
+};
+
+template<typename L, typename R>
+struct promote_type<complex_type<L>, R> {
+    using type = complex_type<promote_t<L, R>>;
+};
+
+template<typename L, typename R>
+struct promote_type<L, complex_type<R>> {
+    using type = complex_type<promote_t<L, R>>;
+};
+
+} // namespace kernel_float
+
+#endif
+#ifndef KERNEL_FLOAT_CORE_H
+#define KERNEL_FLOAT_CORE_H
+
+
+
+namespace kernel_float {
+
 template<size_t... Is>
 struct index_sequence {
     static constexpr size_t size = sizeof...(Is);
@@ -135,130 +384,126 @@ struct decay_helper {
 template<typename T>
 using decay_t = typename detail::decay_helper<T>::type;
 
-template<typename T, typename U>
-struct common_type;
+template<typename T, typename U>
+struct promote_type;
 
 template<typename T>
-struct common_type<T, T> {
+struct promote_type<T, T> {
     using type = T;
 };
 
-#define KERNEL_FLOAT_DEFINE_COMMON_TYPE(T, U) \
-    template<> \
-    struct common_type<T, U> { \
-        using type = T; \
-    }; \
-    template<> \
-    struct common_type<U, T> { \
-        using type = T; \
+#define KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, U) \
+    template<> \
+    struct promote_type<T, U> { \
+        using type = T; \
+    }; \
+    template<> \
+    struct promote_type<U, T> { \
+        using type = T; \
     };
 
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(long double, double)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(long double, float)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, float)
-//KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, half)
-//KERNEL_FLOAT_DEFINE_COMMON_TYPE(float, half)
-
-#define KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(T, U) \
-    KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed T, signed U) \
-    KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned T, unsigned U)
-
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long long, long)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long long, int)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long long, short)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long long, char)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long, int)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long, short)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(long, char)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(int, short)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(int, char)
-KERNEL_FLOAT_DEFINE_COMMON_INTEGRAL(short, char)
-
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(long double, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(float, bool)
-
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed long long, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed long, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed int, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed short, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(signed char, bool)
-
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned long long, bool)
-KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned long, 
bool) -KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned int, bool) -KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned short, bool) -KERNEL_FLOAT_DEFINE_COMMON_TYPE(unsigned char, bool) +// T and bool becomes T +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(char, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed char, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed short, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed int, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed long, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed long long, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned char, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned short, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned int, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned long, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned long long, bool) + +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, float) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(long double, float) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(long double, double) + +#define KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(T) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, char) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed char) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed short) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed int) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed long) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, signed long long) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned char) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned short) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned int) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned long) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, unsigned long long) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, bool) + +KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(float) +KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(double) +KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(long double) + +#define KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(T, U) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed T, signed U) \ + KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(unsigned T, unsigned U) + +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(short, char) +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(int, char) +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(int, short) +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long, char) +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long, short) +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long, int) +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, char) +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, short) +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, int) +KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, long) + +// half precision +// KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(half) +// KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(half, bool) +// KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, half) +// KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, half) +// KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(long double, half) namespace detail { template -struct common_type_helper; +struct multi_promote_type; template -struct common_type_helper { +struct multi_promote_type { using type = T; }; -template -struct common_type_helper { - using type = typename common_type::type; -}; +template +struct multi_promote_type: promote_type {}; + +template +struct multi_promote_type: + multi_promote_type::type, C, Rest...> {}; -template -struct common_type_helper: - common_type_helper::type, R, Rest...> {}; } // namespace detail template -using common_t = typename detail::common_type_helper...>::type; +using promote_t = typename detail::multi_promote_type...>::type; namespace detail { -template -struct common_size_helper; - -template<> -struct common_size_helper<> { - static constexpr size_t 
value = 1; -}; - -template -struct common_size_helper { - static constexpr size_t value = N; -}; - -template -struct common_size_helper { - static constexpr size_t value = N; -}; -template -struct common_size_helper { - static constexpr size_t value = N; -}; - -template -struct common_size_helper<1, N> { - static constexpr size_t value = N; +template +struct is_same_helper { + static constexpr bool value = false; }; -template<> -struct common_size_helper<1, 1> { - static constexpr size_t value = 1; +template +struct is_same_helper { + static constexpr bool value = true; }; } // namespace detail -template -static constexpr size_t common_size = detail::common_size_helper::value; +template +static constexpr bool is_same = detail::is_same_helper::value; namespace detail { - template struct is_implicit_convertible_helper { static constexpr bool value = false; }; template -struct is_implicit_convertible_helper::type> { +struct is_implicit_convertible_helper::type> { static constexpr bool value = true; }; } // namespace detail @@ -294,1004 +539,401 @@ using enabled_t = typename detail::enabled_helper::type; } // namespace kernel_float #endif -#ifndef KERNEL_FLOAT_STORAGE -#define KERNEL_FLOAT_STORAGE +#ifndef KERNEL_FLOAT_BASE +#define KERNEL_FLOAT_BASE -namespace kernel_float { -template -struct vector_traits { - using value_type = V; - static constexpr size_t size = 1; +namespace kernel_float { +template +struct alignas(Alignment) array { KERNEL_FLOAT_INLINE - static V fill(value_type value) { - return value; + T* data() { + return items_; } KERNEL_FLOAT_INLINE - static V create(value_type value) { - return value; + const T* data() const { + return items_; } KERNEL_FLOAT_INLINE - static value_type get(const V& self, size_t index) { - KERNEL_FLOAT_ASSERT(index == 0); - return self; + T& operator[](size_t i) { + return items_[i]; } KERNEL_FLOAT_INLINE - static void set(V& self, size_t index, value_type value) { - KERNEL_FLOAT_ASSERT(index == 0); - self = value; + const T& operator[](size_t i) const { + return items_[i]; } + + T items_[N]; }; -template -struct into_storage_traits { - using type = V; +template +struct array { + KERNEL_FLOAT_INLINE + array(T value = {}) : value_(value) {} KERNEL_FLOAT_INLINE - static constexpr type call(V self) { - return self; + operator T() const { + return value_; } -}; -template -struct into_storage_traits: into_storage_traits {}; - -template -struct into_storage_traits: into_storage_traits {}; + KERNEL_FLOAT_INLINE + T* data() { + return &value_; + } -template -struct into_storage_traits: into_storage_traits {}; + KERNEL_FLOAT_INLINE + const T* data() const { + return &value_; + } -template -using into_storage_type = typename into_storage_traits::type; + KERNEL_FLOAT_INLINE + T& operator[](size_t) { + return value_; + } -template -KERNEL_FLOAT_INLINE into_storage_type into_storage(V&& input) { - return into_storage_traits::call(input); -} + KERNEL_FLOAT_INLINE + const T& operator[](size_t) const { + return value_; + } -template -static constexpr size_t vector_size = vector_traits>::size; + T value_; +}; -template -using vector_value_type = typename vector_traits>::value_type; +template +struct array { + KERNEL_FLOAT_INLINE + T* data() { + while (true) + ; + } -template -struct vector_index { - using value_type = vector_value_type; + KERNEL_FLOAT_INLINE + const T* data() const { + while (true) + ; + } KERNEL_FLOAT_INLINE - static value_type get(const V& self) { - return vector_traits::get(self, I); + T& operator[](size_t i) { + while (true) + ; } 
KERNEL_FLOAT_INLINE - static void set(V& self, value_type value) { - return vector_traits::set(self, I, value); + const T& operator[](size_t i) const { + while (true) + ; } }; -template -KERNEL_FLOAT_INLINE vector_value_type vector_get(const V& self, size_t index) { - return vector_traits::get(self, index); +template +using ndindex = array; + +KERNEL_FLOAT_INLINE +static constexpr size_t compute_max_alignment(size_t total_size, size_t min_align) { + if (total_size % 32 == 0 || min_align >= 32) { + return 32; + } else if (total_size % 16 == 0 || min_align == 16) { + return 16; + } else if (total_size % 8 == 0 || min_align == 8) { + return 8; + } else if (total_size % 4 == 0 || min_align == 4) { + return 4; + } else if (total_size % 2 == 0 || min_align == 2) { + return 2; + } else { + return 1; + } } -template -KERNEL_FLOAT_INLINE vector_value_type vector_get(const V& self, const_index = {}) { - return vector_index::get(self); -} +template +using tensor_storage = array; -template -struct vector_swizzle; +template class S = tensor_storage> +struct tensor; -template -struct vector_swizzle> { - KERNEL_FLOAT_INLINE static Output call(const Input& storage) { - return vector_traits::create(vector_get(storage)...); - } -}; +template +struct extents; -template -struct vector; +template<> +struct extents<> { + static constexpr size_t rank = 0; + static constexpr size_t volume = 1; -template -struct alignas(alignment) array { - T items_[N]; + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + return 1; + } KERNEL_FLOAT_INLINE - T& operator[](size_t i) { - KERNEL_FLOAT_ASSERT(i < N); - return items_[i]; + static constexpr size_t stride(size_t axis) { + return 1; } KERNEL_FLOAT_INLINE - const T& operator[](size_t i) const { - KERNEL_FLOAT_ASSERT(i < N); - return items_[i]; + static size_t ravel_index(ndindex<0>) { + return 0; + } + + KERNEL_FLOAT_INLINE + static ndindex<0> unravel_index(size_t i) { + return {}; } }; -template -struct vector_traits> { - using self_type = array; - using value_type = T; - static constexpr size_t size = N; +template +struct extents { + static constexpr size_t rank = 1; + static constexpr size_t volume = N; - template - KERNEL_FLOAT_INLINE static self_type create(Args&&... args) { - return {args...}; + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + return axis == 0 ? N : 1; } KERNEL_FLOAT_INLINE - static self_type fill(value_type value) { - self_type result; - for (size_t i = 0; i < N; i++) { - result[i] = value; - } - return result; + static constexpr size_t stride(size_t axis) { + return 1; } KERNEL_FLOAT_INLINE - static value_type get(const self_type& self, size_t index) { - KERNEL_FLOAT_ASSERT(index < N); - return self[index]; + static size_t ravel_index(ndindex<1> ind) { + return ind[0]; } KERNEL_FLOAT_INLINE - static void set(self_type& self, size_t index, value_type value) { - KERNEL_FLOAT_ASSERT(index < N); - self[index] = value; + static ndindex<1> unravel_index(size_t i) { + return {i}; } }; -template -struct array {}; - -template -struct vector_traits> { - using self_type = array; - using value_type = T; - static constexpr size_t size = 0; +template +struct extents { + static constexpr size_t rank = 2; + static constexpr size_t volume = N * M; KERNEL_FLOAT_INLINE - static self_type create() { - return {}; + static constexpr size_t size(size_t axis) { + return axis == 0 ? N : axis == 1 ? 
M : 1; } KERNEL_FLOAT_INLINE - static self_type fill(value_type value) { - return {}; + static constexpr size_t stride(size_t axis) { + return axis == 0 ? M : 1; } KERNEL_FLOAT_INLINE - static value_type get(const self_type& self, size_t index) { - KERNEL_FLOAT_UNREACHABLE; + static size_t ravel_index(ndindex<2> x) { + return x[0] * M + x[1]; } KERNEL_FLOAT_INLINE - static void set(self_type& self, size_t index, value_type value) { - KERNEL_FLOAT_UNREACHABLE; + static ndindex<2> unravel_index(size_t i) { + return {i / M, i % M}; } }; -enum struct Alignment { - Minimum, - Packed, - Maximum, -}; - -constexpr size_t calculate_alignment(Alignment required, size_t min_alignment, size_t total_size) { - size_t alignment = 1; +template +struct extents { + static constexpr size_t rank = 3; + static constexpr size_t volume = N * M * K; - if (required == Alignment::Maximum) { - if (total_size <= 1) { - alignment = 1; - } else if (total_size <= 2) { - alignment = 2; - } else if (total_size <= 4) { - alignment = 4; - } else if (total_size <= 8) { - alignment = 8; - } else { - alignment = 16; - } - } else if (required == Alignment::Packed) { - if (total_size % 16 == 0) { - alignment = 16; - } else if (total_size % 8 == 0) { - alignment = 8; - } else if (total_size % 4 == 0) { - alignment = 4; - } else if (total_size % 2 == 0) { - alignment = 2; - } else { - alignment = 1; - } + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + return axis == 0 ? N : axis == 1 ? M : axis == 2 ? K : 1; } - if (min_alignment > alignment) { - alignment = min_alignment; + KERNEL_FLOAT_INLINE + static constexpr size_t stride(size_t axis) { + return axis == 0 ? M * K // + : axis == 1 ? K // + : 1; // } - return alignment; -} + KERNEL_FLOAT_INLINE + static size_t ravel_index(ndindex<3> x) { + return (x[0] * M + x[1]) * K + x[2]; + } -template -struct default_storage { - using type = array; + KERNEL_FLOAT_INLINE + static ndindex<3> unravel_index(size_t i) { + return {i / (K * M), (i / K) % M, i % K}; + } }; -template -struct default_storage { - using type = T; -}; +template +struct into_tensor_traits; -template -using default_storage_type = typename default_storage::type; - -#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ - template<> \ - struct vector_traits { \ - using value_type = T; \ - static constexpr size_t size = 1; \ - \ - KERNEL_FLOAT_INLINE \ - static T1 create(T x) { \ - return {x}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T1 fill(T v) { \ - return {v}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T get(const T1& self, size_t index) { \ - switch (index) { \ - case 0: \ - return self.x; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static void set(T1& self, size_t index, T value) { \ - switch (index) { \ - case 0: \ - self.x = value; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - }; \ - \ - template<> \ - struct vector_traits { \ - using value_type = T; \ - static constexpr size_t size = 2; \ - \ - KERNEL_FLOAT_INLINE \ - static T2 create(T x, T y) { \ - return {x, y}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T2 fill(T v) { \ - return {v, v}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T get(const T2& self, size_t index) { \ - switch (index) { \ - case 0: \ - return self.x; \ - case 1: \ - return self.y; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static void set(T2& self, size_t index, T value) { \ - switch (index) { \ - case 0: \ - self.x = value; \ - case 1: \ - self.y = value; \ - 
default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - }; \ - \ - template<> \ - struct vector_traits { \ - using value_type = T; \ - static constexpr size_t size = 3; \ - \ - KERNEL_FLOAT_INLINE \ - static T3 create(T x, T y, T z) { \ - return {x, y, z}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T3 fill(T v) { \ - return {v, v, v}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T get(const T3& self, size_t index) { \ - switch (index) { \ - case 0: \ - return self.x; \ - case 1: \ - return self.y; \ - case 2: \ - return self.z; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static void set(T3& self, size_t index, T value) { \ - switch (index) { \ - case 0: \ - self.x = value; \ - return; \ - case 1: \ - self.y = value; \ - return; \ - case 2: \ - self.z = value; \ - return; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - }; \ - \ - template<> \ - struct vector_traits { \ - using value_type = T; \ - static constexpr size_t size = 4; \ - \ - KERNEL_FLOAT_INLINE \ - static T4 create(T x, T y, T z, T w) { \ - return {x, y, z, w}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T4 fill(T v) { \ - return {v, v, v, v}; \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static T get(const T4& self, size_t index) { \ - switch (index) { \ - case 0: \ - return self.x; \ - case 1: \ - return self.y; \ - case 2: \ - return self.z; \ - case 3: \ - return self.w; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - \ - KERNEL_FLOAT_INLINE \ - static void set(T4& self, size_t index, T value) { \ - switch (index) { \ - case 0: \ - self.x = value; \ - return; \ - case 1: \ - self.y = value; \ - return; \ - case 2: \ - self.z = value; \ - return; \ - case 3: \ - self.w = value; \ - return; \ - default: \ - KERNEL_FLOAT_UNREACHABLE; \ - } \ - } \ - }; - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) - -template -struct nested_array { - static constexpr size_t num_packets = (N + vector_size - 1) / vector_size; - static_assert(num_packets * vector_size >= N, "internal error"); - - V packets[num_packets]; - - KERNEL_FLOAT_INLINE - V& operator[](size_t i) { - KERNEL_FLOAT_ASSERT(i < num_packets); - return packets[i]; - } +template +struct into_tensor_traits { + using type = typename into_tensor_traits::type; KERNEL_FLOAT_INLINE - const V& operator[](size_t i) const { - KERNEL_FLOAT_ASSERT(i < num_packets); - return packets[i]; + static type call(const V input) { + return into_tensor_traits::call(input); } }; -template -struct vector_traits> { - using self_type = nested_array; - using value_type = vector_value_type; - static constexpr size_t size = N; - - template - KERNEL_FLOAT_INLINE static self_type 
create(Args&&... args) { - value_type items[N] = {args...}; - self_type output; - - size_t i = 0; - for (; i + vector_size - 1 < N; i += vector_size) { - // How to generalize this? - output.packets[i / vector_size] = vector_traits::create(items[i], items[i + 1]); - } - - for (; i < N; i++) { - vector_traits::set(output.packets[i / vector_size], i % vector_size, items[i]); - } - - return output; - } +template +struct into_tensor_traits { + using type = typename into_tensor_traits::type; KERNEL_FLOAT_INLINE - static self_type fill(value_type value) { - self_type output; - - for (size_t i = 0; i < self_type::num_packets; i++) { - output.packets[i] = vector_traits::fill(value); - } - - return output; + static type call(V& input) { + return into_tensor_traits::call(input); } +}; - KERNEL_FLOAT_INLINE - static value_type get(const self_type& self, size_t index) { - KERNEL_FLOAT_ASSERT(index < N); - return vector_traits::get(self.packets[index / vector_size], index % vector_size); - } +template +struct into_tensor_traits { + using type = typename into_tensor_traits::type; KERNEL_FLOAT_INLINE - static void set(self_type& self, size_t index, value_type value) { - KERNEL_FLOAT_ASSERT(index < N); - vector_traits::set(self.packets[index / vector_size], index % vector_size, value); + static type call(const V& input) { + return into_tensor_traits::call(input); } }; -}; // namespace kernel_float - -#endif -#ifndef KERNEL_FLOAT_CAST_H -#define KERNEL_FLOAT_CAST_H - - - -namespace kernel_float { -namespace ops { -template -struct cast { - KERNEL_FLOAT_INLINE R operator()(T input) noexcept { - return R(input); - } -}; +template +struct into_tensor_traits { + using type = typename into_tensor_traits::type; -template -struct cast { - KERNEL_FLOAT_INLINE T operator()(T input) noexcept { - return input; + KERNEL_FLOAT_INLINE + static type call(V&& input) { + return into_tensor_traits::call(std::move(input)); } }; -} // namespace ops - -namespace detail { - -// Cast a vector of type `Input` to type `Output`. Vectors must have the same size. -// The input vector has value type `T` -// The output vector has value type `R` -template< - typename Input, - typename Output, - typename T = vector_value_type, - typename R = vector_value_type> -struct cast_helper { - static_assert(vector_size == vector_size, "sizes must match"); - static constexpr size_t N = vector_size; - KERNEL_FLOAT_INLINE static Output call(const Input& input) { - return call(input, make_index_sequence {}); - } - - private: - template - KERNEL_FLOAT_INLINE static Output call(const Input& input, index_sequence) { - ops::cast fun; - return vector_traits::create(fun(vector_get(input))...); - } -}; - -// Cast a vector of type `Input` to type `Output`. -// The input vector has value type `T` and size `N`. -// The output vector has value type `R` and size `M`. 
-template< - typename Input, - typename Output, - typename T = vector_value_type, - size_t N = vector_size, - typename R = vector_value_type, - size_t M = vector_size> -struct broadcast_helper; - -// T[1] => T[1] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Vector call(Vector input) { - return input; - } -}; +template class S> +struct into_tensor_traits> { + using type = tensor; -// T[N] => T[N] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Vector call(Vector input) { + KERNEL_FLOAT_INLINE + static type call(const tensor& input) { return input; } }; -// T[1] => T[N] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return vector_traits::fill(vector_get<0>(input)); - } -}; - -// T[1] => T[1], but different vector types -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return vector_traits::create(vector_get<0>(input)); - } -}; - -// T[N] => T[N], but different vector types -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return cast_helper::call(input); - } -}; - -// T[1] => R[N] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return vector_traits::fill(ops::cast {}(vector_get<0>(input))); - } -}; - -// T[1] => R[1] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return vector_traits::create(ops::cast {}(vector_get<0>(input))); - } -}; +template +struct into_tensor_traits> { + using type = tensor>; -// T[N] => R[N] -template -struct broadcast_helper { - KERNEL_FLOAT_INLINE static Output call(Input input) { - return cast_helper::call(input); + KERNEL_FLOAT_INLINE + static type call(const array& input) { + return input; } }; -} // namespace detail - -/** - * Cast the elements of the given vector ``input`` to the given type ``R`` and then widen the - * vector to length ``N``. The cast may lead to a loss in precision if ``R`` is a smaller data - * type. Widening is only possible if the input vector has size ``1`` or ``N``, other sizes - * will lead to a compilation error. - * - * Example - * ======= - * ``` - * vec x = {6}; - * vec y = broadcast(x); - * vec z = broadcast(y); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector broadcast(Input&& input) { - return detail::broadcast_helper, Output>::call( - into_storage(std::forward(input))); -} - -#ifndef DOXYGEN_SHOULD_SKIP_THIS -template< - size_t N, - typename Input, - typename Output = default_storage_type, N>> -KERNEL_FLOAT_INLINE vector broadcast(Input&& input) { - return detail::broadcast_helper, Output>::call( - into_storage(std::forward(input))); -} - -template -KERNEL_FLOAT_INLINE vector broadcast(Input&& input) { - return detail::broadcast_helper, Output>::call( - into_storage(std::forward(input))); -} -#endif - -/** - * Widen the given vector ``input`` to length ``N``. Widening is only possible if the input vector - * has size ``1`` or ``N``, other sizes will lead to a compilation error. - * - * Example - * ======= - * ``` - * vec x = {6}; - * vec y = resize<3>(x); - * ``` - */ -template< - size_t N, - typename Input, - typename Output = default_storage_type, N>> -KERNEL_FLOAT_INLINE vector resize(Input&& input) noexcept { - return detail::broadcast_helper::call(std::forward(input)); -} - -template -using cast_type = default_storage_type>; - -/** - * Cast the elements of given vector ``input`` to the given type ``R``. 
Note that this cast may - * lead to a loss in precision if ``R`` is a smaller data type. - * - * Example - * ======= - * ``` - * vec x = {1.0f, 2.0f, 3.0f}; - * vec y = cast(x); - * vec z = cast(x); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector cast(Input&& input) noexcept { - return detail::broadcast_helper::call(std::forward(input)); -} -} // namespace kernel_float - -#endif //KERNEL_FLOAT_CAST_H -#ifndef KERNEL_FLOAT_SWIZZLE_H -#define KERNEL_FLOAT_SWIZZLE_H - - - -namespace kernel_float { - -/** - * "Swizzles" the vector. Returns a new vector where the elements are provided by the given indices. - * - * # Example - * ``` - * vec x = {0, 1, 2, 3, 4, 5, 6}; - * vec a = swizzle<0, 1, 2>(x); // 0, 1, 2 - * vec b = swizzle<2, 1, 0>(x); // 2, 1, 0 - * vec c = swizzle<1, 1, 1>(x); // 1, 1, 1 - * vec d = swizzle<0, 2, 4, 6>(x); // 0, 2, 4, 6 - * ``` - */ -template< - size_t... Is, - typename V, - typename Output = default_storage_type, sizeof...(Is)>> -KERNEL_FLOAT_INLINE vector swizzle(const V& input, index_sequence _ = {}) { - return vector_swizzle, index_sequence>::call( - into_storage(input)); -} - -/** - * Takes the first ``N`` elements from the given vector and returns a new vector of length ``N``. - * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = first<3>(x); // 1, 2, 3 - * int z = first(x); // 1 - * ``` - */ -template, K>> -KERNEL_FLOAT_INLINE vector first(const V& input) { - static_assert(K <= vector_size, "K cannot exceed vector size"); - using Indices = make_index_sequence; - return vector_swizzle, Indices>::call(into_storage(input)); -} - -namespace detail { -template -struct offset_index_sequence_helper; - -template -struct offset_index_sequence_helper> { - using type = index_sequence; -}; -} // namespace detail -/** - * Takes the last ``N`` elements from the given vector and returns a new vector of length ``N``. - * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = last<3>(x); // 4, 5, 6 - * int z = last(x); // 6 - * ``` - */ -template, K>> -KERNEL_FLOAT_INLINE vector last(const V& input) { - static_assert(K <= vector_size, "K cannot exceed vector size"); - using Indices = typename detail::offset_index_sequence_helper< // - vector_size - K, - make_index_sequence>::type; - - return vector_swizzle, Indices>::call(into_storage(input)); -} - -namespace detail { -template -struct reverse_index_sequence_helper: reverse_index_sequence_helper {}; +template +struct tensor_traits; -template -struct reverse_index_sequence_helper<0, Is...> { - using type = index_sequence; +template class S> +struct tensor_traits> { + using value_type = T; + using extents_type = D; + using storage_type = S; }; -} // namespace detail - -/** - * Reverses the elements in the given vector. - * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = reversed(x); // 6, 5, 4, 3, 2, 1 - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector reversed(const V& input) { - using Indices = typename detail::reverse_index_sequence_helper>::type; - return vector_swizzle, Indices>::call(into_storage(input)); -} +template +using into_tensor_type = typename into_tensor_traits::type; -namespace detail { -template -struct concat_index_sequence_helper {}; +template +using tensor_extents = typename tensor_traits>::extents_type; -template -struct concat_index_sequence_helper, index_sequence> { - using type = index_sequence; -}; -} // namespace detail +template +static constexpr size_t tensor_rank = tensor_extents::rank; -/** - * Rotate the given vector ``K`` steps to the right. 
In other words, this move the front element to the back - * ``K`` times. This is the inverse of ``rotate_left``. - * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = rotate_right<2>(x); // 5, 6, 1, 2, 3, 4 - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector rotate_right(const V& input) { - static constexpr size_t N = vector_size; - static constexpr size_t I = (N > 0) ? (K % N) : 0; +template +static constexpr size_t tensor_volume = tensor_extents::volume; - using First = - typename detail::offset_index_sequence_helper>::type; - using Second = make_index_sequence; - using Indices = typename detail::concat_index_sequence_helper::type; +template +using tensor_value_type = typename tensor_traits>::value_type; - return vector_swizzle, Indices>::call(into_storage(input)); -} +template +using tensor_storage_type = tensor_storage, tensor_volume>; -/** - * Rotate the given vector ``K`` steps to the left. In other words, this move the back element to the front - * ``K`` times. This is the inverse of ``rotate_right``. - * - * # Example - * ``` - * vec x = {1, 2, 3, 4, 5, 6}; - * vec y = rotate_left<4>(x); // 5, 6, 1, 2, 3, 4 - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector rotate_left(const V& input) { - static constexpr size_t N = vector_size; - static constexpr size_t K_rev = N > 0 ? (N - K % N) : 0; +template +using promoted_tensor_value_type = + promote_t>::value_type...>; - return rotate_right(input); +template +KERNEL_FLOAT_INLINE into_tensor_type into_tensor(V&& input) { + return into_tensor_traits::call(std::forward(input)); } -namespace detail { -template< - typename U, - typename V, - typename Is = make_index_sequence>, - typename Js = make_index_sequence>> -struct concat_helper; - -template -struct concat_helper, index_sequence> { - using type = default_storage_type< - common_t, vector_value_type>, - vector_size + vector_size>; - - KERNEL_FLOAT_INLINE static type call(const U& left, const V& right) { - return vector_traits::create(vector_get(left)..., vector_get(right)...); - } -}; - -template -struct recur_concat_helper; - -template -struct recur_concat_helper { - using type = U; - - KERNEL_FLOAT_INLINE static U call(U&& input) { - return input; - } -}; - -template -struct recur_concat_helper { - using recur_helper = recur_concat_helper::type, Rest...>; - using type = typename recur_helper::type; - - KERNEL_FLOAT_INLINE static type call(const U& left, const V& right, const Rest&... rest) { - return recur_helper::call(concat_helper::call(left, right), rest...); - } -}; -} // namespace detail - -template -using concat_type = typename detail::recur_concat_helper...>::type; - -/** - * Concatenate the given vectors into one large vector. For example, given vectors of size 3, size 2 and size 5, - * this function returns a new vector of size 3+2+5=8. If the vectors are not of the same element type, they - * will first be cast into a common data type. - * - * # Examples - * ``` - * vec x = {1, 2, 3}; - * int y = 4; - * vec z = {5, 6, 7, 8}; - * vec xyz = concat(x, y, z); // 1, 2, 3, 4, 5, 6, 7, 8 - * ``` - */ -template -KERNEL_FLOAT_INLINE vector> concat(const Vs&... 
inputs) { - return detail::recur_concat_helper...>::call(into_storage(inputs)...); +template +KERNEL_FLOAT_INLINE tensor_storage_type into_tensor_storage(V&& input) { + return into_tensor_traits::call(std::forward(input)).storage(); } } // namespace kernel_float -#endif //KERNEL_FLOAT_SWIZZLE_H +#endif #ifndef KERNEL_FLOAT_UNOPS_H #define KERNEL_FLOAT_UNOPS_H - namespace kernel_float { namespace detail { -template -struct map_helper { - KERNEL_FLOAT_INLINE static Output call(F fun, const Input& input) { - return call(fun, input, make_index_sequence> {}); - } - private: - template - KERNEL_FLOAT_INLINE static Output call(F fun, const Input& input, index_sequence) { - return vector_traits::create(fun(vector_get(input))...); - } -}; +template +struct apply_impl; -template -struct map_helper, nested_array> { - KERNEL_FLOAT_INLINE static nested_array call(F fun, const nested_array& input) { - return call(fun, input, make_index_sequence::num_packets> {}); +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage + call(F fun, const tensor_storage& input) { + return call(fun, input, make_index_sequence {}); } private: template - KERNEL_FLOAT_INLINE static nested_array - call(F fun, const nested_array& input, index_sequence) { - return {map_helper::call(fun, input[Is])...}; + KERNEL_FLOAT_INLINE static tensor_storage + call(F fun, const tensor_storage& input, index_sequence) { + return {fun(input[Is])...}; } }; } // namespace detail -template -using map_type = default_storage_type>, vector_size>; +template +using map_type = tensor>, tensor_extents>; -/** - * Applies ``fun`` to each element from vector ``input`` and returns a new vector with the results. - * This function is the basis for all unary operators like ``sin`` and ``sqrt``. - * - * Example - * ======= - * ``` - * vector v = {1, 2, 3}; - * vector w = map([](auto i) { return i * 2; }); // 2, 4, 6 - * ``` - */ -template> -KERNEL_FLOAT_INLINE Output map(F fun, const Input& input) { - return detail::map_helper>::call(fun, into_storage(input)); +template +KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { + using Input = tensor_value_type; + using Output = result_t; + return detail::apply_impl, Output, Input>::call( + fun, + into_tensor(input).storage()); } -#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - namespace ops { \ - template \ - struct NAME { \ - KERNEL_FLOAT_INLINE T operator()(T input) { \ - return T(EXPR); \ - } \ - }; \ - } \ - template \ - KERNEL_FLOAT_INLINE vector> NAME(const V& input) { \ - return map>, V, into_storage_type>({}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + namespace ops { \ + template \ + struct NAME { \ + KERNEL_FLOAT_INLINE T operator()(T input) { \ + return T(EXPR); \ + } \ + }; \ + } \ + template \ + KERNEL_FLOAT_INLINE into_tensor_type NAME(const V& input) { \ + using F = ops::NAME>; \ + return map(F {}, input); \ } -#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ - KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - template \ - KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ - return NAME(vec); \ +#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ + KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + template \ + KERNEL_FLOAT_INLINE tensor operator OP(const tensor& vec) { \ + return NAME(vec); \ } KERNEL_FLOAT_DEFINE_UNARY_OP(negate, -, -input) @@ -1348,154 +990,422 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN(signbit) KERNEL_FLOAT_DEFINE_UNARY_FUN(isinf) KERNEL_FLOAT_DEFINE_UNARY_FUN(isnan) -} // namespace kernel_float - -#endif //KERNEL_FLOAT_UNOPS_H -#ifndef 
KERNEL_FLOAT_BINOPS_H -#define KERNEL_FLOAT_BINOPS_H - - +enum struct RoundingMode { ANY, DOWN, UP, NEAREST, TOWARD_ZERO }; -namespace kernel_float { -namespace detail { -template -struct zip_helper { - KERNEL_FLOAT_INLINE static Output call(F fun, const Left& left, const Right& right) { - return call_with_indices(fun, left, right, make_index_sequence> {}); - } +namespace ops { +template +struct cast; - private: - template - KERNEL_FLOAT_INLINE static Output - call_with_indices(F fun, const Left& left, const Right& right, index_sequence = {}) { - return vector_traits::create(fun(vector_get(left), vector_get(right))...); +template +struct cast { + KERNEL_FLOAT_INLINE R operator()(T input) noexcept { + return R(input); } }; -template -struct zip_helper, nested_array, nested_array> { - KERNEL_FLOAT_INLINE static nested_array - call(F fun, const nested_array& left, const nested_array& right) { - return call(fun, left, right, make_index_sequence::num_packets> {}); +template +struct cast { + KERNEL_FLOAT_INLINE T operator()(T input) noexcept { + return input; } +}; - private: - template - KERNEL_FLOAT_INLINE static nested_array call( - F fun, - const nested_array& left, - const nested_array& right, - index_sequence) { - return {zip_helper::call(fun, left[Is], right[Is])...}; +template +struct cast { + KERNEL_FLOAT_INLINE T operator()(T input) noexcept { + return input; } }; -}; // namespace detail +} // namespace ops -template -using common_vector_value_type = common_t...>; +template +KERNEL_FLOAT_INLINE tensor> cast(const V& input) { + using F = ops::cast, R, Mode>; + return map(F {}, input); +} +} // namespace kernel_float -template -static constexpr size_t common_vector_size = common_size...>; +#endif //KERNEL_FLOAT_UNOPS_H +#ifndef KERNEL_FLOAT_CAST_H +#define KERNEL_FLOAT_CAST_H -template -using zip_type = default_storage_type< - result_t, vector_value_type>, - common_vector_size>; + + + +namespace kernel_float { +namespace detail { + +template +struct unify_dimension_helper; + +template<> +struct unify_dimension_helper<1, 1> { + static constexpr size_t value = 1; +}; + +template +struct unify_dimension_helper { + static constexpr size_t value = N; +}; + +template +struct unify_dimension_helper { + static constexpr size_t value = N; +}; + +template +struct unify_dimension_helper<1, N> { + static constexpr size_t value = N; +}; + +template +struct unify_extents_helper; + +template +struct unify_extents_helper, extents> { + using type = extents::value...>; +}; + +template +struct extents_to_rank { + using type = E; +}; + +template +struct extents_to_rank, N, enabled_t<(sizeof...(Ns) < N)>>: + extents_to_rank, N> {}; + +template +struct broadcast_extents_helper { + using type = typename unify_extents_helper< + typename extents_to_rank::type, // + typename extents_to_rank::type // + >::type; +}; + +template +struct broadcast_extents_helper { + using type = E; +}; + +} // namespace detail + +template +using broadcast_extents = typename detail::broadcast_extents_helper::type; + +template +using broadcast_tensor_extents = broadcast_extents, tensor_extents>; + +template +static constexpr bool is_broadcastable = is_same, To>; + +template +static constexpr bool is_tensor_broadcastable = is_broadcastable, To>; + +namespace detail { + +template +struct copy_helper; + +template +struct copy_helper, IS, OS> { + template + static void call(T* output, const T* input) { + ndindex<0> x; + size_t input_index = IS::call(x); + size_t output_index = OS::call(x); + output[output_index] = input[input_index]; + } 
+}; + +template +struct copy_helper, IS, OS> { + template + static void call(T* output, const T* input) { + for (size_t i = 0; i < N; i++) { + ndindex<1> x = {i}; + size_t input_index = IS::call(x); + size_t output_index = OS::call(x); + output[output_index] = input[input_index]; + } + } +}; + +template +struct copy_helper, IS, OS> { + template + static void call(T* output, const T* input) { + for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < M; j++) { + ndindex<2> x = {i, j}; + size_t input_index = IS::call(x); + size_t output_index = OS::call(x); + output[output_index] = input[input_index]; + } + } + } +}; + +template +struct copy_helper, IS, OS> { + template + static void call(T* output, const T* input) { + for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < M; j++) { + for (size_t k = 0; k < K; k++) { + ndindex<3> x = {i, j, k}; + size_t input_index = IS::call(x); + size_t output_index = OS::call(x); + output[output_index] = input[input_index]; + } + } + } + } +}; + +template +struct strides_helper; + +template<> +struct strides_helper> { + KERNEL_FLOAT_INLINE + static size_t call(ndindex<0>) { + return 0; + } +}; + +template +struct strides_helper> { + KERNEL_FLOAT_INLINE + static size_t call(ndindex<1> x) { + return (N != 1 ? x[0] : 0); + } +}; + +template +struct strides_helper> { + KERNEL_FLOAT_INLINE + static size_t call(ndindex<2> x) { + return (N != 1 ? x[0] * M : 0) + // + (M != 1 ? x[1] : 0); + } +}; + +template +struct strides_helper> { + KERNEL_FLOAT_INLINE + static size_t call(ndindex<3> x) { + return (N != 1 ? x[0] * M * K : 0) + // + (M != 1 ? x[1] * K : 0) + // + (K != 1 ? x[2] : 0); + } +}; + +template +struct broadcast_impl { + KERNEL_FLOAT_INLINE static tensor_storage + call(tensor_storage input) { + static_assert(is_broadcastable, "cannot broadcast to required shape"); + using IS = strides_helper>; + using OS = strides_helper; + + tensor_storage output; + copy_helper::call(output.data(), input.data()); + return output; + } +}; + +template +struct broadcast_impl { + KERNEL_FLOAT_INLINE static tensor_storage + call(tensor_storage input) { + return input; + } +}; + +} // namespace detail + +template +tensor, extents> +broadcast(const V& input, extents new_extents = {}) { + using T = tensor_value_type; + return detail::broadcast_impl, extents>::call( + into_tensor(input).storage()); +} + +template +tensor> fill(T value = {}, extents = {}) { + tensor_storage input = {value}; + return detail::broadcast_impl, extents>::call(input); +} + +template +tensor> zeros(extents = {}) { + tensor_storage input = {T {}}; + return detail::broadcast_impl, extents>::call(input); +} + +template +tensor> ones(extents = {}) { + tensor_storage input = {T {1}}; + return detail::broadcast_impl, extents>::call(input); +} + +template, typename E = tensor_extents> +tensor zeros_like(const V&) { + return zeros(E {}); +} + +template, typename E = tensor_extents> +tensor ones_like(const V&) { + return ones(E {}); +} + +namespace detail { +template +struct convert_helper { + KERNEL_FLOAT_INLINE + static tensor_storage call(tensor_storage input) { + tensor_storage intermediate = + detail::apply_impl, E::volume, T2, T>::call(input); + return detail::broadcast_impl::call(intermediate); + } +}; + +template +struct convert_helper { + KERNEL_FLOAT_INLINE + static tensor_storage call(tensor_storage input) { + return input; + } +}; + +template +struct convert_helper { + KERNEL_FLOAT_INLINE + static tensor_storage call(tensor_storage input) { + return detail::broadcast_impl::call(input); + } +}; 
+ +template +struct convert_helper { + KERNEL_FLOAT_INLINE + static tensor_storage call(tensor_storage input) { + return detail::apply_impl, E::volume, T2, T>::call(input); + } +}; +} // namespace detail /** - * Applies ``fun`` to each pair of two elements from ``left`` and ``right`` and returns a new - * vector with the results. - * - * If ``left`` and ``right`` are not the same size, they will first be broadcast into a - * common size using ``resize``. - * - * Note that this function does **not** cast the input vectors to a common element type. See - * ``zip_common`` for that functionality. + * Cast the values of the given input tensor to type `R` and then broadcast the result to the given shape `(Ns...)`. */ -template> -KERNEL_FLOAT_INLINE vector zip(F fun, Left&& left, Right&& right) { - static constexpr size_t N = vector_size; - using LeftInput = default_storage_type, N>; - using RightInput = default_storage_type, N>; +template +tensor> convert(const V& input, extents new_shape = {}) { + return detail::convert_helper, tensor_extents, R, extents, M, >( + into_tensor(input).storage()); +} + +} // namespace kernel_float + +#endif +#ifndef KERNEL_FLOAT_BINOPS_H +#define KERNEL_FLOAT_BINOPS_H + + + + +namespace kernel_float { +namespace detail { - return detail::zip_helper::call( +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage + call(F fun, const tensor_storage& left, const tensor_storage& right) { + return call(fun, left, right, make_index_sequence {}); + } + + private: + template + KERNEL_FLOAT_INLINE static tensor_storage call( + F fun, + const tensor_storage& left, + const tensor_storage& right, + index_sequence) { + return {fun(left[Is], right[Is])...}; + } +}; +} // namespace detail + +template +using zip_type = + tensor, tensor_value_type>, broadcast_tensor_extents>; + +template +KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) { + using A = tensor_value_type; + using B = tensor_value_type; + using O = result_t; + using E = broadcast_tensor_extents; + + return detail::apply_impl::call( fun, - broadcast(std::forward(left)), - broadcast(std::forward(right))); + broadcast(left).storage(), + broadcast(right).storage()); } template -using zip_common_type = default_storage_type< - result_t, common_vector_value_type>, - common_vector_size>; +using zip_common_type = tensor< + result_t, promoted_tensor_value_type>, + broadcast_tensor_extents>; -/** - * Applies ``fun`` to each pair of two elements from ``left`` and ``right`` and returns a new - * vector with the results. - * - * If ``left`` and ``right`` are not the same size, they will first be broadcast into a - * common size using ``resize``. - * - * If ``left`` and ``right`` are not of the same type, they will first be case into a common - * data type. For example, zipping ``float`` and ``double`` first cast vectors to ``double``. 
- * - * Example - * ======= - * ``` - * vec x = {1, 2, 3, 4}; - * vec = {8}; - * vec = zip_common([](auto a, auto b){ return a + b; }, x, y); // [9, 10, 11, 12] - * ``` - */ -template< - typename F, - typename Left, - typename Right, - typename Output = zip_common_type> -KERNEL_FLOAT_INLINE vector zip_common(F fun, Left&& left, Right&& right) { - static constexpr size_t N = vector_size; - using C = common_t, vector_value_type>; - using Input = default_storage_type; - - return detail::zip_helper::call( +template +KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, const R& right) { + using T = promoted_tensor_value_type; + using O = result_t; + using E = broadcast_tensor_extents; + + return detail::apply_impl::call( fun, - broadcast(std::forward(left)), - broadcast(std::forward(right))); + detail::convert_helper, tensor_extents, T, E>::call( + into_tensor_storage(left)), + detail::convert_helper, tensor_extents, T, E>::call( + into_tensor_storage(right))); } -#define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR) \ - namespace ops { \ - template \ - struct NAME { \ - KERNEL_FLOAT_INLINE T operator()(T left, T right) { \ - return T(EXPR); \ - } \ - }; \ - } \ - template> \ - KERNEL_FLOAT_INLINE vector, L, R>> NAME(L&& left, R&& right) { \ - return zip_common(ops::NAME {}, std::forward(left), std::forward(right)); \ - } - -#define KERNEL_FLOAT_DEFINE_BINARY_OP(NAME, OP) \ - KERNEL_FLOAT_DEFINE_BINARY(NAME, left OP right) \ - template> \ - KERNEL_FLOAT_INLINE vector, L, R>> operator OP( \ - const vector& left, \ - const vector& right) { \ - return zip_common(ops::NAME {}, left, right); \ - } \ - template> \ - KERNEL_FLOAT_INLINE vector, L, R>> operator OP( \ - const vector& left, \ - const R& right) { \ - return zip_common(ops::NAME {}, left, right); \ - } \ - template> \ - KERNEL_FLOAT_INLINE vector, L, R>> operator OP( \ - const L& left, \ - const vector& right) { \ - return zip_common(ops::NAME {}, left, right); \ +#define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR) \ + namespace ops { \ + template \ + struct NAME { \ + KERNEL_FLOAT_INLINE T operator()(T left, T right) { \ + return T(EXPR); \ + } \ + }; \ + } \ + template> \ + KERNEL_FLOAT_INLINE zip_common_type, L, R> NAME(L&& left, R&& right) { \ + return zip_common(ops::NAME {}, std::forward(left), std::forward(right)); \ + } + +#define KERNEL_FLOAT_DEFINE_BINARY_OP(NAME, OP) \ + KERNEL_FLOAT_DEFINE_BINARY(NAME, left OP right) \ + template, typename E1, typename E2> \ + KERNEL_FLOAT_INLINE zip_common_type, tensor, tensor> operator OP( \ + const tensor& left, \ + const tensor& right) { \ + return zip_common(ops::NAME {}, left, right); \ + } \ + template>, typename E> \ + KERNEL_FLOAT_INLINE zip_common_type, tensor, R> operator OP( \ + const tensor& left, \ + const R& right) { \ + return zip_common(ops::NAME {}, left, right); \ + } \ + template, R>, typename E> \ + KERNEL_FLOAT_INLINE zip_common_type, L, tensor> operator OP( \ + const L& left, \ + const tensor& right) { \ + return zip_common(ops::NAME {}, left, right); \ } KERNEL_FLOAT_DEFINE_BINARY_OP(add, +) @@ -1516,28 +1426,29 @@ KERNEL_FLOAT_DEFINE_BINARY_OP(bit_or, |) KERNEL_FLOAT_DEFINE_BINARY_OP(bit_xor, ^) // clang-format off -template typename F, typename L, typename R> -static constexpr bool vector_assign_allowed = - common_vector_size == vector_size && - is_implicit_convertible< - result_t< - F, vector_value_type>>, - vector_value_type, - vector_value_type - >, - vector_value_type - >; +template typename F, typename T, typename E, typename R> +static constexpr bool 
is_tensor_assign_allowed = + is_tensor_broadcastable && + is_implicit_convertible< + result_t< + F>>, + T, + tensor_value_type + >, + T + >; // clang-format on -#define KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(NAME, OP) \ - template< \ - typename L, \ - typename R, \ - typename T = enabled_t, vector_value_type>> \ - KERNEL_FLOAT_INLINE vector& operator OP(vector& lhs, const R& rhs) { \ - using F = ops::NAME; \ - lhs = zip_common(F {}, lhs.storage(), rhs); \ - return lhs; \ +#define KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(NAME, OP) \ + template< \ + typename T, \ + typename E, \ + typename R, \ + typename = enabled_t>> \ + KERNEL_FLOAT_INLINE tensor& operator OP(tensor& lhs, const R& rhs) { \ + using F = ops::NAME; \ + lhs = zip_common(F {}, lhs, rhs); \ + return lhs; \ } KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(add, +=) @@ -1624,182 +1535,436 @@ struct bit_xor { } // namespace kernel_float -#endif //KERNEL_FLOAT_BINOPS_H -#ifndef KERNEL_FLOAT_ITERATE_H -#define KERNEL_FLOAT_ITERATE_H +#endif +#ifndef KERNEL_FLOAT_FP16_H +#define KERNEL_FLOAT_FP16_H + + +#if KERNEL_FLOAT_FP16_AVAILABLE +#include namespace kernel_float { +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(__half, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) -namespace detail { -template>> -struct range_helper; +template<> +struct into_tensor_traits<__half2> { + using type = tensor<__half, extents<2>>; -template -struct range_helper> { - KERNEL_FLOAT_INLINE static V call(F fun) { - return vector_traits::create(fun(const_index {})...); + KERNEL_FLOAT_INLINE + static type call(__half2 input) { + return tensor_storage<__half, 2> {input.x, input.y}; } }; -} // namespace detail - -/** - * Generate vector of length ``N`` by applying the given function ``fun`` to - * each index ``0...N-1``. - * - * Example - * ======= - * ``` - * // returns [0, 2, 4] - * vector vec = range<3>([](auto i) { return float(i * 2); }); - * ``` - */ -template< - size_t N, - typename F, - typename T = result_t, - typename Output = default_storage_type> -KERNEL_FLOAT_INLINE vector range(F fun) { - return detail::range_helper::call(fun); -} -/** - * Generate vector consisting of the numbers ``0...N-1`` of type ``T``. - * - * Example - * ======= - * ``` - * // Returns [0, 1, 2] - * vector vec = range(); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector range() { - using F = ops::cast; - return detail::range_helper::call(F {}); -} +namespace detail { +template +struct map_halfx2 { + KERNEL_FLOAT_INLINE + static __half2 call(F fun, __half2 input) { + __half a = fun(input.x); + __half b = fun(input.y); + return {a, b}; + } +}; -/** - * Generate vector having same size and type as ``V``, but filled with the numbers ``0..N-1``. 
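As a side note on the `into_tensor_traits<__half2>` specialization above: it is what lets a packed `__half2` be passed anywhere a tensor argument is accepted. A small device-side sketch (hypothetical values; `into_tensor` is the entry point used by the broadcast/convert helpers elsewhere in this patch):

__half2 packed = __halves2half2(__half(1.0f), __half(2.0f));
auto t = kernel_float::into_tensor(packed);  // tensor<__half, extents<2>>
__half lane0 = t.storage()[0];               // the .x lane
__half lane1 = t.storage()[1];               // the .y lane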
- */ -template> -KERNEL_FLOAT_INLINE vector range_like(const Input&) { - using F = ops::cast>; - return detail::range_helper::call(F {}); -} +template +struct zip_halfx2 { + KERNEL_FLOAT_INLINE + static __half2 call(F fun, __half2 left, __half2 right) { + __half a = fun(left.x, left.y); + __half b = fun(right.y, right.y); + return {a, b}; + } +}; -/** - * Generate vector of `N` elements of type `T` - * - * Example - * ======= - * ``` - * // Returns [1.0, 1.0, 1.0] - * vector = fill(1.0f); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector fill(T value) { - return vector_traits::fill(value); -} +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage<__half, N> + call(F fun, const tensor_storage<__half, N>& input) { + tensor_storage<__half, N> result; -/** - * Generate vector having same size and type as ``V``, but filled with the given ``value``. - */ -template -KERNEL_FLOAT_INLINE vector fill_like(const Output&, vector_value_type value) { - return vector_traits::fill(value); -} +#pragma unroll + for (size_t i = 0; i < N; i += 2) { + __half2 a = {input[i], input[i + 1]}; + __half2 b = map_halfx2::call(fun, a); + result[i + 0] = b.x; + result[i + 1] = b.y; + } -/** - * Generate vector of ``N`` zeros of type ``T`` - * - * Example - * ======= - * ``` - * // Returns [0.0, 0.0, 0.0] - * vector = zeros(); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector zeros() { - return vector_traits::fill(T(0)); -} + if (N % 2 != 0) { + result[N - 1] = fun(input[N - 1]); + } -/** - * Generate vector having same size and type as ``V``, but filled with zeros. - * - */ -template -KERNEL_FLOAT_INLINE vector zeros_like(const Output& output = {}) { - return vector_traits::fill(0); -} + return result; + } +}; -/** - * Generate vector of ``N`` ones of type ``T`` - * - * Example - * ======= - * ``` - * // Returns [1.0, 1.0, 1.0] - * vector = ones(); - * ``` - */ -template> -KERNEL_FLOAT_INLINE vector ones() { - return vector_traits::fill(T(1)); -} +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage<__half, N> + call(F fun, const tensor_storage<__half, N>& left, const tensor_storage<__half, N>& right) { + tensor_storage<__half, N> result; +#pragma unroll + for (size_t i = 0; i < N; i += 2) { + __half2 a = {left[i], left[i + 1]}; + __half2 b = {right[i], right[i + 1]}; + __half2 c = zip_halfx2::call(fun, a, b); + result[i + 0] = c.x; + result[i + 1] = c.y; + } -/** - * Generate vector having same size and type as ``V``, but filled with ones. 
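One caveat in the pairwise `__half2` loops above (and in their bfloat16 twins further below): `for (size_t i = 0; i < N; i += 2)` packs `input[i]` and `input[i + 1]` into a `__half2`, so when `N` is odd the last iteration appears to read one element past the end of the storage before the scalar tail overwrites `result[N - 1]`. A bounds-safe sketch of the unary case, keeping the same two-lane strategy (hypothetical helper name, not part of this patch):

template<typename F, size_t N>
struct apply_halfx2_safe {
    KERNEL_FLOAT_INLINE static kernel_float::tensor_storage<__half, N>
    call(F fun, const kernel_float::tensor_storage<__half, N>& input) {
        kernel_float::tensor_storage<__half, N> result;

#pragma unroll
        for (size_t i = 0; i + 1 < N; i += 2) {  // stop before a lone tail element
            __half2 a = {input[i], input[i + 1]};
            __half2 b = kernel_float::detail::map_halfx2<F>::call(fun, a);
            result[i + 0] = b.x;
            result[i + 1] = b.y;
        }

        if (N % 2 != 0) {  // odd N: handle the last element as a scalar
            result[N - 1] = fun(input[N - 1]);
        }

        return result;
    }
};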
- * - */ -template -KERNEL_FLOAT_INLINE vector ones_like(const Output& output = {}) { - return vector_traits::fill(1); -} + if (N % 2 != 0) { + result[N - 1] = fun(left[N - 1], right[N - 1]); + } -namespace detail { -template>> -struct iterate_helper; + return result; + } +}; +} // namespace detail -template -struct iterate_helper> { +#if KERNEL_FLOAT_IS_DEVICE +#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half input) { \ + return FUN1(input); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct map_halfx2> { \ + KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) { \ + return FUN2(input); \ + } \ + }; \ + } + +KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2); +KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2); +KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil); +KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos); +KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp); +KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); +KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor); +KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log); +KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log2); +KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint); +KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); +KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin); +KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); +KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); + +#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ + return FUN1(left, right); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct zip_halfx2> { \ + KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \ + return FUN2(left, right); \ + } \ + }; \ + } + +KERNEL_FLOAT_FP16_BINARY_FUN(add, __hadd, __hadd2) +KERNEL_FLOAT_FP16_BINARY_FUN(subtract, __hsub, __hsub2) +KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2) +KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div) +KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2) +KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2) + +KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2) +KERNEL_FLOAT_FP16_BINARY_FUN(not_equal_to, __heq, __heq2) +KERNEL_FLOAT_FP16_BINARY_FUN(less, __hlt, __hlt2) +KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2) +KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2) +KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hgt2) + +#endif + +#define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \ + namespace ops { \ + template<> \ + struct cast { \ + KERNEL_FLOAT_INLINE __half operator()(T input) { \ + return TO_HALF; \ + } \ + }; \ + template<> \ + struct cast<__half, T> { \ + KERNEL_FLOAT_INLINE T operator()(__half input) { \ + return FROM_HALF; \ + } \ + }; \ + } + +KERNEL_FLOAT_FP16_CAST(double, __double2half(input), double(__half2float(input))); +KERNEL_FLOAT_FP16_CAST(float, __float2half(input), __half2float(input)); + +// there are no official char casts. 
Instead, cast to int and then to char +KERNEL_FLOAT_FP16_CAST(char, __int2half_rn(input), (char)__half2int_rz(input)); +KERNEL_FLOAT_FP16_CAST(signed char, __int2half_rn(input), (signed char)__half2int_rz(input)); +KERNEL_FLOAT_FP16_CAST(unsigned char, __int2half_rn(input), (unsigned char)__half2int_rz(input)); + +KERNEL_FLOAT_FP16_CAST(signed short, __half2short_rz(input), __short2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(signed int, __half2int_rz(input), __int2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(signed long, __ll2half_rn(input), (signed long)(__half2ll_rz(input))); +KERNEL_FLOAT_FP16_CAST(signed long long, __ll2half_rn(input), __half2ll_rz(input)); + +KERNEL_FLOAT_FP16_CAST(unsigned short, __half2ushort_rz(input), __ushort2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(unsigned int, __half2uint_rz(input), __uint2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__half2ull_rz(input))); +KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); + +using half = __half; +//KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) +//KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) + +} // namespace kernel_float + +#endif + +#endif //KERNEL_FLOAT_FP16_H +#ifndef KERNEL_FLOAT_BF16_H +#define KERNEL_FLOAT_BF16_H + + + +#if KERNEL_FLOAT_BF16_AVAILABLE +#include + + + +namespace kernel_float { +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(__nv_bfloat16, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) + +template<> +struct into_tensor_traits<__nv_bfloat162> { + using type = tensor<__nv_bfloat16, extents<2>>; + + KERNEL_FLOAT_INLINE + static type call(__nv_bfloat162 input) { + return tensor_storage<__nv_bfloat16, 2> {input.x, input.y}; + } +}; + +namespace detail { +template +struct map_bfloat16x2 { KERNEL_FLOAT_INLINE - static void call(F fun, const V& input) {} + static __nv_bfloat162 call(F fun, __nv_bfloat162 input) { + __nv_bfloat16 a = fun(input.x); + __nv_bfloat16 b = fun(input.y); + return {a, b}; + } }; -template -struct iterate_helper> { +template +struct zip_bfloat16x2 { KERNEL_FLOAT_INLINE - static void call(F fun, const V& input) { - fun(vector_get(input)); - iterate_helper>::call(fun, input); + static __nv_bfloat162 call(F fun, __nv_bfloat162 left, __nv_bfloat162 right) { + __nv_bfloat16 a = fun(left.x, left.y); + __nv_bfloat16 b = fun(right.y, right.y); + return {a, b}; + } +}; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> + call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { + tensor_storage<__nv_bfloat16, N> result; + +#pragma unroll + for (size_t i = 0; i < N; i += 2) { + __nv_bfloat162 a = {input[i], input[i + 1]}; + __nv_bfloat162 b = map_bfloat16x2::call(fun, a); + result[i + 0] = b.x; + result[i + 1] = b.y; + } + + if (N % 2 != 0) { + result[N - 1] = fun(input[N - 1]); + } + + return result; + } +}; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> call( + F fun, + const tensor_storage<__nv_bfloat16, N>& left, + const tensor_storage<__nv_bfloat16, N>& right) { + tensor_storage<__nv_bfloat16, N> result; +#pragma unroll + for (size_t i = 0; i < N; i += 2) { + __nv_bfloat162 a = {left[i], left[i + 1]}; + __nv_bfloat162 b = {right[i], right[i + 1]}; + __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b); + result[i + 0] = c.x; + result[i + 1] = c.y; + } + + if (N % 2 != 0) { + result[N - 1] = fun(left[N - 1], right[N - 1]); + } + + return result; } }; } // namespace detail 
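A few details worth flagging in the half-precision helpers. First, the generic fallbacks of `zip_halfx2` and `zip_bfloat16x2` above compute `fun(left.x, left.y)` and `fun(right.y, right.y)`, which mixes the lanes of a single operand; the intended behaviour is presumably lane-wise across the two operands, as sketched below for the `__half2` case (the bfloat16 version is analogous). Second, some entries in the surrounding intrinsic tables look like copy-paste slips: `log10` is mapped to `::h2log2` (presumably `::h2log10`), `not_equal_to` reuses `__heq`/`__heq2` (presumably `__hne`/`__hne2`), and `greater_equal` is paired with `__hgt2` (presumably `__hge2`). Third, the signed/unsigned short and int entries in the KERNEL_FLOAT_FP16_CAST list above appear to have their TO_HALF/FROM_HALF arguments swapped relative to the macro's parameter order (compare the long/long long entries); the same swap appears in the KERNEL_FLOAT_BF16_CAST list further below.

// Lane-wise pairing across the two operands (corrective sketch of the
// zip_halfx2 fallback; not part of this patch as committed):
template<typename F>
struct zip_halfx2 {
    KERNEL_FLOAT_INLINE
    static __half2 call(F fun, __half2 left, __half2 right) {
        __half a = fun(left.x, right.x);  // combine lane 0 of both operands
        __half b = fun(left.y, right.y);  // combine lane 1 of both operands
        return {a, b};
    }
};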
-/** - * Apply the function ``fun`` for each element from ``input``. - * - * Example - * ======= - * ``` - * for_each(range<3>(), [&](auto i) { - * printf("element: %d\n", i); - * }); - * ``` - */ -template -KERNEL_FLOAT_INLINE void for_each(const V& input, F fun) { - detail::iterate_helper>::call(fun, into_storage(input)); -} +#if KERNEL_FLOAT_IS_DEVICE +#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ + return FUN1(input); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct map_bfloat16x2> { \ + KERNEL_FLOAT_INLINE static __nv_bfloat162 \ + call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ + return FUN2(input); \ + } \ + }; \ + } + +KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2); +KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2); +KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil); +KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos); +KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, ::h2exp); +KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); +KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor); +KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log); +KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log2); +KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint); +KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); +KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin); +KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); +KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); + +#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 \ + operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ + return FUN1(left, right); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct zip_bfloat16x2> { \ + KERNEL_FLOAT_INLINE static __nv_bfloat162 \ + call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) { \ + return FUN2(left, right); \ + } \ + }; \ + } + +KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2) +KERNEL_FLOAT_BF16_BINARY_FUN(subtract, __hsub, __hsub2) +KERNEL_FLOAT_BF16_BINARY_FUN(multiply, __hmul, __hmul2) +KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div) +KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2) +KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2) + +KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2) +KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __heq, __heq2) +KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2) +KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2) +KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2) +KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2) + +#endif + +#define KERNEL_FLOAT_BF16_CAST(T, TO_HALF, FROM_HALF) \ + namespace ops { \ + template<> \ + struct cast { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(T input) { \ + return TO_HALF; \ + } \ + }; \ + template<> \ + struct cast<__nv_bfloat16, T> { \ + KERNEL_FLOAT_INLINE T operator()(__nv_bfloat16 input) { \ + return FROM_HALF; \ + } \ + }; \ + } + +KERNEL_FLOAT_BF16_CAST(double, __double2bfloat16(input), double(__bfloat162float(input))); +KERNEL_FLOAT_BF16_CAST(float, __float2bfloat16(input), __bfloat162float(input)); + +// there are no official char casts. 
Instead, cast to int and then to char +KERNEL_FLOAT_BF16_CAST(char, __int2bfloat16_rn(input), (char)__bfloat162int_rz(input)); +KERNEL_FLOAT_BF16_CAST( + signed char, + __int2bfloat16_rn(input), + (signed char)__bfloat162int_rz(input)); +KERNEL_FLOAT_BF16_CAST( + unsigned char, + __int2bfloat16_rn(input), + (unsigned char)__bfloat162int_rz(input)); + +KERNEL_FLOAT_BF16_CAST(signed short, __bfloat162short_rz(input), __short2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST(signed int, __bfloat162int_rz(input), __int2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST( + signed long, + __ll2bfloat16_rn(input), + (signed long)(__bfloat162ll_rz(input))); +KERNEL_FLOAT_BF16_CAST(signed long long, __ll2bfloat16_rn(input), __bfloat162ll_rz(input)); + +KERNEL_FLOAT_BF16_CAST(unsigned short, __bfloat162ushort_rz(input), __ushort2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST(unsigned int, __bfloat162uint_rz(input), __uint2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST( + unsigned long, + __ull2bfloat16_rn(input), + (unsigned long)(__bfloat162ull_rz(input))); +KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input)); + +using bfloat16 = __nv_bfloat16; +//KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) +//KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) } // namespace kernel_float -#endif //KERNEL_FLOAT_ITERATE_H +#if KERNEL_FLOAT_FP16_AVAILABLE + + +namespace kernel_float { +KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)); +} + +#endif // KERNEL_FLOAT_FP16_AVAILABLE +#endif + +#endif //KERNEL_FLOAT_BF16_H #ifndef KERNEL_FLOAT_REDUCE_H #define KERNEL_FLOAT_REDUCE_H @@ -1807,29 +1972,22 @@ KERNEL_FLOAT_INLINE void for_each(const V& input, F fun) { namespace kernel_float { namespace detail { -template +template struct reduce_helper { - using value_type = vector_value_type; - - KERNEL_FLOAT_INLINE static value_type call(F fun, const V& input) { - return call(fun, input, make_index_sequence> {}); + KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage& input) { + return call(fun, input, make_index_sequence {}); } private: template - KERNEL_FLOAT_INLINE static value_type call(F fun, const V& vector, index_sequence<0, Is...>) { - return call(fun, vector, vector_get<0>(vector), index_sequence {}); - } - - template - KERNEL_FLOAT_INLINE static value_type - call(F fun, const V& vector, value_type accum, index_sequence) { - return call(fun, vector, fun(accum, vector_get(vector)), index_sequence {}); - } - - KERNEL_FLOAT_INLINE static value_type - call(F fun, const V& vector, value_type accum, index_sequence<>) { - return accum; + KERNEL_FLOAT_INLINE static T + call(F fun, const tensor_storage& input, index_sequence<0, Is...>) { + T result = input[0]; +#pragma unroll + for (size_t i = 1; i < N; i++) { + result = fun(result, input[i]); + } + return result; } }; } // namespace detail @@ -1849,8 +2007,10 @@ struct reduce_helper { * ``` */ template -KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { - return detail::reduce_helper>::call(fun, into_storage(input)); +KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { + return detail::reduce_helper, tensor_value_type>::call( + fun, + into_tensor_storage(input)); } /** @@ -1863,7 +2023,7 @@ KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { * int y = min(x); // Returns 0 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T min(const V& input) { return reduce(ops::min {}, input); } @@ -1878,7 +2038,7 @@ KERNEL_FLOAT_INLINE T min(const V& input) { * int y = 
max(x); // Returns 5 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T max(const V& input) { return reduce(ops::max {}, input); } @@ -1893,7 +2053,7 @@ KERNEL_FLOAT_INLINE T max(const V& input) { * int y = sum(x); // Returns 8 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T sum(const V& input) { return reduce(ops::add {}, input); } @@ -1908,7 +2068,7 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { * int y = sum(x); // Returns 5*0*2*1*0 = 0 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T product(const V& input) { return reduce(ops::multiply {}, input); } @@ -1918,7 +2078,7 @@ KERNEL_FLOAT_INLINE T product(const V& input) { * non-zero if ``bool(v)==true``. */ template -KERNEL_FLOAT_INLINE bool all(V&& input) { +KERNEL_FLOAT_INLINE bool all(const V& input) { return reduce(ops::bit_and {}, cast(input)); } @@ -1927,7 +2087,7 @@ KERNEL_FLOAT_INLINE bool all(V&& input) { * non-zero if ``bool(v)==true``. */ template -KERNEL_FLOAT_INLINE bool any(V&& input) { +KERNEL_FLOAT_INLINE bool any(const V& input) { return reduce(ops::bit_or {}, cast(input)); } @@ -1943,705 +2103,296 @@ KERNEL_FLOAT_INLINE bool any(V&& input) { * ``` */ template -KERNEL_FLOAT_INLINE int count(V&& input) { +KERNEL_FLOAT_INLINE int count(const V& input) { return sum(cast(cast(input))); } } // namespace kernel_float #endif //KERNEL_FLOAT_REDUCE_H -#ifndef KERNEL_FLOAT_INTERFACE_H -#define KERNEL_FLOAT_INTERFACE_H - - - - - - - - -namespace kernel_float { - -template -KERNEL_FLOAT_INLINE vector broadcast(Input&& input); - -template -struct index_proxy { - using value_type = typename vector_traits::value_type; - - KERNEL_FLOAT_INLINE - index_proxy(V& storage, I index) : storage_(storage), index_(index) {} - - KERNEL_FLOAT_INLINE - index_proxy& operator=(value_type value) { - vector_traits::set(storage_, index_, value); - return *this; - } - - KERNEL_FLOAT_INLINE - operator value_type() const { - return vector_traits::get(storage_, index_); - } - - private: - V& storage_; - I index_; -}; - -template -struct index_proxy> { - using value_type = typename vector_traits::value_type; - - KERNEL_FLOAT_INLINE - index_proxy(V& storage, const_index) : storage_(storage) {} - - KERNEL_FLOAT_INLINE - index_proxy& operator=(value_type value) { - vector_index::set(storage_, value); - return *this; - } - - KERNEL_FLOAT_INLINE - operator value_type() const { - return vector_index::get(storage_); - } - - private: - V& storage_; -}; - -template -struct vector { - using storage_type = V; - using traits_type = vector_traits; - using value_type = typename traits_type::value_type; - static constexpr size_t const_size = traits_type::size; - - vector(const vector&) = default; - vector(vector&) = default; - vector(vector&&) = default; - - vector& operator=(const vector&) = default; - vector& operator=(vector&) = default; - vector& operator=(vector&&) = default; - - KERNEL_FLOAT_INLINE - vector() : storage_(traits_type::fill(value_type {})) {} - - KERNEL_FLOAT_INLINE - vector(storage_type storage) : storage_(storage) {} - - template< - typename U, - enabled_t, value_type>, int> = 0> - KERNEL_FLOAT_INLINE vector(U&& init) : vector(broadcast(std::forward(init))) {} - - template = 0> - KERNEL_FLOAT_INLINE vector(Args&&... 
args) : storage_(traits_type::create(args...)) {} - - KERNEL_FLOAT_INLINE - operator storage_type() const { - return storage_; - } - - KERNEL_FLOAT_INLINE - storage_type& storage() { - return storage_; - } - - KERNEL_FLOAT_INLINE - const storage_type& storage() const { - return storage_; - } - - KERNEL_FLOAT_INLINE - value_type get(size_t index) const { - return traits_type::get(storage_, index); - } - - KERNEL_FLOAT_INLINE - void set(size_t index, value_type value) { - traits_type::set(storage_, index, value); - } - - template - KERNEL_FLOAT_INLINE value_type get(const_index) const { - return vector_index::get(storage_); - } - - template - KERNEL_FLOAT_INLINE void set(const_index, value_type value) { - return vector_index::set(storage_, value); - } - - KERNEL_FLOAT_INLINE - value_type operator[](size_t index) const { - return get(index); - } - - template - KERNEL_FLOAT_INLINE value_type operator[](const_index) const { - return get(const_index {}); - } - - KERNEL_FLOAT_INLINE - index_proxy operator[](size_t index) { - return {storage_, index}; - } - - template - KERNEL_FLOAT_INLINE index_proxy> operator[](const_index) { - return {storage_, const_index {}}; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t size() { - return const_size; - } - - private: - storage_type storage_; -}; - -template -struct vector_traits> { - using value_type = vector_value_type; - static constexpr size_t size = vector_size; - - KERNEL_FLOAT_INLINE - static vector fill(value_type value) { - return vector_traits::fill(value); - } - - template - KERNEL_FLOAT_INLINE static vector create(Args... args) { - return vector_traits::create(args...); - } - - KERNEL_FLOAT_INLINE - static value_type get(const vector& self, size_t index) { - return vector_traits::get(self.storage(), index); - } - - KERNEL_FLOAT_INLINE - static void set(vector& self, size_t index, value_type value) { - vector_traits::set(self.storage(), index, value); - } -}; - -template -struct vector_index, I> { - using value_type = vector_value_type; - - KERNEL_FLOAT_INLINE - static value_type get(const vector& self) { - return vector_index::get(self.storage()); - } - - KERNEL_FLOAT_INLINE - static void set(vector& self, value_type value) { - vector_index::set(self.storage(), value); - } -}; - -template -struct into_storage_traits> { - using type = V; - - KERNEL_FLOAT_INLINE - static constexpr type call(const vector& self) { - return self.storage(); - } -}; - -template -struct vector_swizzle, index_sequence> { - KERNEL_FLOAT_INLINE static Output call(const vector& self) { - return vector_swizzle>::call(self.storage()); - } -}; - -template -using vec = vector>; - -template -using unaligned_vec = vector>; +#ifndef KERNEL_FLOAT_BASE_H +#define KERNEL_FLOAT_BASE_H -template -KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... 
args) { - using value_type = common_t; - using vector_type = default_storage_type; - return vector_traits::create(value_type(args)...); -} - -template -KERNEL_FLOAT_INLINE vector> into_vec(V&& input) { - return into_storage(input); -} - -using float32 = float; -using float64 = double; - -template -using vec1 = vec; -template -using vec2 = vec; -template -using vec3 = vec; -template -using vec4 = vec; -template -using vec5 = vec; -template -using vec6 = vec; -template -using vec7 = vec; -template -using vec8 = vec; - -#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ - template \ - using NAME##N = vec; \ - using NAME##1 = vec; \ - using NAME##2 = vec; \ - using NAME##3 = vec; \ - using NAME##4 = vec; \ - using NAME##5 = vec; \ - using NAME##6 = vec; \ - using NAME##7 = vec; \ - using NAME##8 = vec; \ - template \ - using unaligned_##NAME##X = unaligned_vec; \ - using unaligned_##NAME##1 = unaligned_vec; \ - using unaligned_##NAME##2 = unaligned_vec; \ - using unaligned_##NAME##3 = unaligned_vec; \ - using unaligned_##NAME##4 = unaligned_vec; \ - using unaligned_##NAME##5 = unaligned_vec; \ - using unaligned_##NAME##6 = unaligned_vec; \ - using unaligned_##NAME##7 = unaligned_vec; \ - using unaligned_##NAME##8 = unaligned_vec; - -KERNEL_FLOAT_TYPE_ALIAS(char, char) -KERNEL_FLOAT_TYPE_ALIAS(short, short) -KERNEL_FLOAT_TYPE_ALIAS(int, int) -KERNEL_FLOAT_TYPE_ALIAS(long, long) -KERNEL_FLOAT_TYPE_ALIAS(longlong, long long) - -KERNEL_FLOAT_TYPE_ALIAS(uchar, unsigned char) -KERNEL_FLOAT_TYPE_ALIAS(ushort, unsigned short) -KERNEL_FLOAT_TYPE_ALIAS(uint, unsigned int) -KERNEL_FLOAT_TYPE_ALIAS(ulong, unsigned long) -KERNEL_FLOAT_TYPE_ALIAS(ulonglong, unsigned long long) - -KERNEL_FLOAT_TYPE_ALIAS(float, float) -KERNEL_FLOAT_TYPE_ALIAS(f32x, float) -KERNEL_FLOAT_TYPE_ALIAS(float32x, float) - -KERNEL_FLOAT_TYPE_ALIAS(double, double) -KERNEL_FLOAT_TYPE_ALIAS(f64x, double) -KERNEL_FLOAT_TYPE_ALIAS(float64x, double) - -} // namespace kernel_float - -#endif //KERNEL_FLOAT_INTERFACE_H -#ifndef KERNEL_FLOAT_FP16_H -#define KERNEL_FLOAT_FP16_H - - - -#if KERNEL_FLOAT_FP16_AVAILABLE -#include - - - -namespace kernel_float { -KERNEL_FLOAT_DEFINE_COMMON_TYPE(__half, bool) -KERNEL_FLOAT_DEFINE_COMMON_TYPE(float, __half) -KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, __half) - -template<> -struct vector_traits<__half2> { - using value_type = __half; - static constexpr size_t size = 2; - - KERNEL_FLOAT_INLINE - static __half2 fill(__half value) { -#if KERNEL_FLOAT_ON_DEVICE - return __half2half2(value); -#else - return {value, value}; -#endif - } - - KERNEL_FLOAT_INLINE - static __half2 create(__half low, __half high) { -#if KERNEL_FLOAT_ON_DEVICE - return __halves2half2(low, high); -#else - return {low, high}; -#endif - } - - KERNEL_FLOAT_INLINE - static __half get(__half2 self, size_t index) { -#if KERNEL_FLOAT_ON_DEVICE - if (index == 0) { - return __low2half(self); - } else { - return __high2half(self); - } -#else - if (index == 0) { - return self.x; - } else { - return self.y; - } -#endif - } - - KERNEL_FLOAT_INLINE - static void set(__half2& self, size_t index, __half value) { - if (index == 0) { - self.x = value; - } else { - self.y = value; - } - } -}; - -template -struct default_storage<__half, N, Alignment::Maximum, enabled_t<(N >= 2)>> { - using type = nested_array<__half2, N>; -}; - -template -struct default_storage<__half, N, Alignment::Packed, enabled_t<(N >= 2 && N % 2 == 0)>> { - using type = nested_array<__half2, N>; -}; - -#if KERNEL_FLOAT_ON_DEVICE -#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ - 
namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_helper, __half2, __half2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) { \ - return FUN2(input); \ - } \ - }; \ - } - -KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp); -KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); - -#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_helper, __half2, __half2, __half2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \ - return FUN2(left, right); \ - } \ - }; \ - } -KERNEL_FLOAT_FP16_BINARY_FUN(add, __hadd, __hadd2) -KERNEL_FLOAT_FP16_BINARY_FUN(subtract, __hsub, __hsub2) -KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2) -KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div) -KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2) -KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2) -KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2) -KERNEL_FLOAT_FP16_BINARY_FUN(not_equal_to, __heq, __heq2) -KERNEL_FLOAT_FP16_BINARY_FUN(less, __hlt, __hlt2) -KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2) -KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2) -KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hgt2) -#endif -#define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \ - namespace ops { \ - template<> \ - struct cast { \ - KERNEL_FLOAT_INLINE __half operator()(T input) { \ - return TO_HALF; \ - } \ - }; \ - template<> \ - struct cast<__half, T> { \ - KERNEL_FLOAT_INLINE T operator()(__half input) { \ - return FROM_HALF; \ - } \ - }; \ - } -KERNEL_FLOAT_FP16_CAST(double, __double2half(input), double(__half2float(input))); -KERNEL_FLOAT_FP16_CAST(float, __float2half(input), __half2float(input)); -// there are no official char casts. 
Instead, cast to int and then to char -KERNEL_FLOAT_FP16_CAST(char, __int2half_rn(input), (char)__half2int_rz(input)); -KERNEL_FLOAT_FP16_CAST(signed char, __int2half_rn(input), (signed char)__half2int_rz(input)); -KERNEL_FLOAT_FP16_CAST(unsigned char, __int2half_rn(input), (unsigned char)__half2int_rz(input)); +namespace kernel_float { -KERNEL_FLOAT_FP16_CAST(signed short, __short2half_rn(input), __half2short_rz(input)); -KERNEL_FLOAT_FP16_CAST(signed int, __int2half_rn(input), __half2int_rz(input)); -KERNEL_FLOAT_FP16_CAST(signed long, __ll2half_rn(input), (signed long)(__half2ll_rz(input))); -KERNEL_FLOAT_FP16_CAST(signed long long, __ll2half_rn(input), __half2ll_rz(input)); +template class S> +struct tensor { + static constexpr size_t rank = E::rank; + static constexpr size_t volume = E::volume; -KERNEL_FLOAT_FP16_CAST(unsigned int, __uint2half_rn(input), __half2uint_rz(input)); -KERNEL_FLOAT_FP16_CAST(unsigned short, __ushort2half_rn(input), __half2ushort_rz(input)); -KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__half2ull_rz(input))); -KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); + using value_type = T; + using extents_type = E; + using ndindex_type = ndindex; + using storage_type = S; -using half = __half; -using float16 = __half; -//KERNEL_FLOAT_TYPE_ALIAS(half, __half) -//KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) -//KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) + KERNEL_FLOAT_INLINE + static constexpr size_t size() { + return E::volume; + } -} // namespace kernel_float + KERNEL_FLOAT_INLINE + static constexpr size_t size(size_t axis) { + return E::size(axis); + } -#endif + KERNEL_FLOAT_INLINE + static constexpr size_t stride(size_t axis) { + return E::stride(axis); + } -#endif //KERNEL_FLOAT_FP16_H -#ifndef KERNEL_FLOAT_BF16_H -#define KERNEL_FLOAT_BF16_H + KERNEL_FLOAT_INLINE + static constexpr size_t linearize_index(ndindex_type index) { + return E::ravel_index(index); + } + template = 0> + KERNEL_FLOAT_INLINE tensor(Args&&... 
args) : storage_ {std::forward(args)...} {} + KERNEL_FLOAT_INLINE + tensor(T init = {}) { + for (size_t i = 0; i < size(); i++) { + storage_[i] = init; + } + } -#if KERNEL_FLOAT_BF16_AVAILABLE -#include + KERNEL_FLOAT_INLINE + tensor(storage_type storage) : storage_(storage) {} + KERNEL_FLOAT_INLINE + storage_type& storage() { + return storage_; + } + KERNEL_FLOAT_INLINE + const storage_type& storage() const { + return storage_; + } + KERNEL_FLOAT_INLINE + T* data() { + return storage_.data(); + } + KERNEL_FLOAT_INLINE + const T* data() const { + return storage_.data(); + } + KERNEL_FLOAT_INLINE + const T* cdata() const { + return storage_.data(); + } + KERNEL_FLOAT_INLINE + T* begin() { + return storage_.data(); + } -namespace kernel_float { -KERNEL_FLOAT_DEFINE_COMMON_TYPE(__nv_bfloat16, bool) -KERNEL_FLOAT_DEFINE_COMMON_TYPE(float, __nv_bfloat16) -KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, __nv_bfloat16) + KERNEL_FLOAT_INLINE + const T* begin() const { + return storage_.data(); + } -template<> -struct vector_traits<__nv_bfloat162> { - using value_type = __nv_bfloat16; - static constexpr size_t size = 2; + KERNEL_FLOAT_INLINE + const T* cbegin() const { + return storage_.data(); + } KERNEL_FLOAT_INLINE - static __nv_bfloat162 fill(__nv_bfloat16 value) { -#if KERNEL_FLOAT_ON_DEVICE - return __bfloat162bfloat162(value); -#else - return {value, value}; -#endif + T* end() { + return storage_.data() + E::volume; } KERNEL_FLOAT_INLINE - static __nv_bfloat162 create(__nv_bfloat16 low, __nv_bfloat16 high) { -#if KERNEL_FLOAT_ON_DEVICE - return __halves2bfloat162(low, high); -#else - return {low, high}; -#endif + const T* end() const { + return storage_.data() + E::volume; } KERNEL_FLOAT_INLINE - static __nv_bfloat16 get(__nv_bfloat162 self, size_t index) { -#if KERNEL_FLOAT_ON_DEVICE - if (index == 0) { - return __low2bfloat16(self); - } else { - return __high2bfloat16(self); - } -#else - if (index == 0) { - return self.x; - } else { - return self.y; - } -#endif + const T* cend() const { + return storage_.data() + E::volume; } KERNEL_FLOAT_INLINE - static void set(__nv_bfloat162& self, size_t index, __nv_bfloat16 value) { - if (index == 0) { - self.x = value; - } else { - self.y = value; - } + T& at(ndindex_type x) { + return *(data() + linearize_index(x)); } -}; -template -struct default_storage<__nv_bfloat16, N, Alignment::Maximum, enabled_t<(N >= 2)>> { - using type = nested_array<__nv_bfloat162, N>; -}; + KERNEL_FLOAT_INLINE + const T& at(ndindex_type x) const { + return *(data() + linearize_index(x)); + } -template -struct default_storage<__nv_bfloat16, N, Alignment::Packed, enabled_t<(N >= 2 && N % 2 == 0)>> { - using type = nested_array<__nv_bfloat162, N>; -}; - -#if KERNEL_FLOAT_ON_DEVICE -#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_helper, __nv_bfloat162, __nv_bfloat162> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ - return FUN2(input); \ - } \ - }; \ + KERNEL_FLOAT_INLINE + T get(ndindex_type x) const { + return at(x); } -KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, 
::h2exp); -KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); + KERNEL_FLOAT_INLINE + void set(ndindex_type x, T value) { + at(x) = std::move(value); + } -#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 \ - operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_helper, __nv_bfloat162, __nv_bfloat162, __nv_bfloat162> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) { \ - return FUN2(left, right); \ - } \ - }; \ + KERNEL_FLOAT_INLINE + T& operator[](ndindex_type x) { + return at(x); } -KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2) -KERNEL_FLOAT_BF16_BINARY_FUN(subtract, __hsub, __hsub2) -KERNEL_FLOAT_BF16_BINARY_FUN(multiply, __hmul, __hmul2) -KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div) -KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2) -KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2) + KERNEL_FLOAT_INLINE + const T& operator[](ndindex_type x) const { + return at(x); + } -KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2) -KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __heq, __heq2) -KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2) -KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2) -KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2) -KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2) + KERNEL_FLOAT_INLINE + T& operator()(ndindex_type x) { + return at(x); + } -#endif + KERNEL_FLOAT_INLINE + const T& operator()(ndindex_type x) const { + return at(x); + } -#define KERNEL_FLOAT_BF16_CAST(T, TO_HALF, FROM_HALF) \ - namespace ops { \ - template<> \ - struct cast { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(T input) { \ - return TO_HALF; \ - } \ - }; \ - template<> \ - struct cast<__nv_bfloat16, T> { \ - KERNEL_FLOAT_INLINE T operator()(__nv_bfloat16 input) { \ - return FROM_HALF; \ - } \ - }; \ + KERNEL_FLOAT_INLINE + tensor> flatten() const { + return storage_; } -KERNEL_FLOAT_BF16_CAST(double, __double2bfloat16(input), double(__bfloat162float(input))); -KERNEL_FLOAT_BF16_CAST(float, __float2bfloat16(input), __bfloat162float(input)); + template + KERNEL_FLOAT_INLINE tensor> reshape(extents = {}) const { + static_assert(extents::volume == volume, "invalid reshape shape"); + return storage_; + } -// there are no official char casts. 
Instead, cast to int and then to char -KERNEL_FLOAT_BF16_CAST(char, __int2bfloat16_rn(input), (char)__bfloat162int_rz(input)); -KERNEL_FLOAT_BF16_CAST( - signed char, - __int2bfloat16_rn(input), - (signed char)__bfloat162int_rz(input)); -KERNEL_FLOAT_BF16_CAST( - unsigned char, - __int2bfloat16_rn(input), - (unsigned char)__bfloat162int_rz(input)); + template + KERNEL_FLOAT_INLINE tensor> broadcast(extents new_shape = {}) const { + return kernel_float::broadcast(*this, new_shape); + } -KERNEL_FLOAT_BF16_CAST(signed short, __bfloat162short_rz(input), __short2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST(signed int, __bfloat162int_rz(input), __int2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST( - signed long, - __ll2bfloat16_rn(input), - (signed long)(__bfloat162ll_rz(input))); -KERNEL_FLOAT_BF16_CAST(signed long long, __ll2bfloat16_rn(input), __bfloat162ll_rz(input)); + template + KERNEL_FLOAT_INLINE tensor, E> map(F fun = {}) const { + return kernel_float::map(fun, *this); + } -KERNEL_FLOAT_BF16_CAST(unsigned short, __bfloat162ushort_rz(input), __ushort2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST(unsigned int, __bfloat162uint_rz(input), __uint2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST( - unsigned long, - __ull2bfloat16_rn(input), - (unsigned long)(__bfloat162ull_rz(input))); -KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input)); + template + KERNEL_FLOAT_INLINE T reduce(F fun = {}) const { + return kernel_float::reduce(fun, *this); + } -using bfloat16 = __nv_bfloat16; -//KERNEL_FLOAT_TYPE_ALIAS(half, __nv_bfloat16) -//KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) -//KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) + private: + storage_type storage_; +}; -} // namespace kernel_float +#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ + template<> \ + struct into_tensor_traits<::T2> { \ + using type = tensor>; \ + \ + KERNEL_FLOAT_INLINE \ + static type call(::T2 v) { \ + return tensor_storage {v.x, v.y}; \ + } \ + }; \ + \ + template<> \ + struct into_tensor_traits<::T3> { \ + using type = tensor>; \ + \ + KERNEL_FLOAT_INLINE \ + static type call(::T3 v) { \ + return tensor_storage {v.x, v.y, v.z}; \ + } \ + }; \ + \ + template<> \ + struct into_tensor_traits<::T4> { \ + using type = tensor>; \ + \ + KERNEL_FLOAT_INLINE \ + static type call(::T4 v) { \ + return tensor_storage {v.x, v.y, v.z, v.w}; \ + } \ + }; -#if KERNEL_FLOAT_FP16_AVAILABLE +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) -namespace kernel_float { -KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)); -} +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) -#endif // KERNEL_FLOAT_FP16_AVAILABLE -#endif +template +using scalar = 
tensor>; -#endif //KERNEL_FLOAT_BF16_H +template +using vec = tensor>; + +template +using mat = tensor>; + +template +KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... args) { + using T = promote_t; + return tensor_storage {T {args}...}; +}; + +// clang-format off +template using vec1 = vec; +template using vec2 = vec; +template using vec3 = vec; +template using vec4 = vec; +template using vec5 = vec; +template using vec6 = vec; +template using vec7 = vec; +template using vec8 = vec; +// clang-format on + +#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ + using k##NAME = scalar; \ + template \ + using NAME##X = vec; \ + using NAME##1 = vec; \ + using NAME##2 = vec; \ + using NAME##3 = vec; \ + using NAME##4 = vec; \ + using NAME##5 = vec; \ + using NAME##6 = vec; \ + using NAME##7 = vec; \ + using NAME##8 = vec; + +KERNEL_FLOAT_TYPE_ALIAS(char, char) +KERNEL_FLOAT_TYPE_ALIAS(short, short) +KERNEL_FLOAT_TYPE_ALIAS(int, int) +KERNEL_FLOAT_TYPE_ALIAS(long, long) +KERNEL_FLOAT_TYPE_ALIAS(longlong, long long) + +KERNEL_FLOAT_TYPE_ALIAS(uchar, unsigned char) +KERNEL_FLOAT_TYPE_ALIAS(ushort, unsigned short) +KERNEL_FLOAT_TYPE_ALIAS(uint, unsigned int) +KERNEL_FLOAT_TYPE_ALIAS(ulong, unsigned long) +KERNEL_FLOAT_TYPE_ALIAS(ulonglong, unsigned long long) + +KERNEL_FLOAT_TYPE_ALIAS(float, float) +KERNEL_FLOAT_TYPE_ALIAS(f32x, float) +KERNEL_FLOAT_TYPE_ALIAS(float32x, float) + +KERNEL_FLOAT_TYPE_ALIAS(double, double) +KERNEL_FLOAT_TYPE_ALIAS(f64x, double) +KERNEL_FLOAT_TYPE_ALIAS(float64x, double) + +} // namespace kernel_float + +#endif diff --git a/tests/basic.cu b/tests/basic.cu index b76580d..ee49d28 100644 --- a/tests/basic.cu +++ b/tests/basic.cu @@ -14,13 +14,11 @@ struct basic_test> { // check if getters work ASSERT(equals(a.get(Is), items[Is]) && ...); - ASSERT(equals(a.get(kf::const_index {}), items[Is]) && ...); ASSERT(equals(a[Is], items[Is]) && ...); - ASSERT(equals(a[kf::const_index {}], items[Is]) && ...); // check if setter works T new_items[N] = {gen.next(Is)...}; - (a.set(kf::const_index {}, new_items[Is]), ...); + (a.set(Is, new_items[Is]), ...); ASSERT(equals(a.get(Is), new_items[Is]) && ...); // check if setter works diff --git a/tests/binops.cu b/tests/binops.cu index 8409b71..5124020 100644 --- a/tests/binops.cu +++ b/tests/binops.cu @@ -4,7 +4,7 @@ namespace kf = kernel_float; template> -struct arithmetic_test; + struct / arithmetic_test; template struct arithmetic_test> { diff --git a/tests/common.h b/tests/common.h index 712d945..40c70ce 100644 --- a/tests/common.h +++ b/tests/common.h @@ -215,7 +215,7 @@ void run_on_host(type_sequence, size_sequence) { template class F, typename... Ts> void run_on_host(type_sequence = {}) { - run_on_host(type_sequence {}, size_sequence<1, 2, 3, 4, 5, 6, 7, 8> {}); + run_on_host(type_sequence {}, size_sequence<1, 2, 3, 4, 7, 8> {}); } template @@ -266,7 +266,7 @@ void run_on_device(type_sequence, size_sequence) { template class F, typename... Ts> void run_on_device(type_sequence = {}) { - run_on_device(type_sequence {}, size_sequence<1, 2, 3, 4, 5, 6, 7, 8> {}); + run_on_device(type_sequence {}, size_sequence<1, 2, 3, 4, 7, 8> {}); } template class F, typename... 
Ts> diff --git a/tests/swizzle.cu b/tests/swizzle.cu index e44394e..0fc7e46 100644 --- a/tests/swizzle.cu +++ b/tests/swizzle.cu @@ -1,6 +1,6 @@ #include "common.h" #include "kernel_float.h" - +/* namespace kf = kernel_float; template> @@ -40,3 +40,4 @@ struct swizzle_test> { TEST_CASE("swizzle") { run_on_host_and_device(); } +*/ \ No newline at end of file From 9f4a8e610fbfefc3b67f36b501f913c36a81f67e Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 27 Jun 2023 15:26:56 +0200 Subject: [PATCH 03/50] Various features add for tensor support --- examples/vector_add/main.cu | 28 +- include/kernel_float.h | 7 +- include/kernel_float/base.h | 11 +- include/kernel_float/bf16.h | 25 +- include/kernel_float/binops.h | 17 +- include/kernel_float/broadcast.h | 49 +- include/kernel_float/complex.h | 6 +- include/kernel_float/fp16.h | 26 +- include/kernel_float/meta.h | 29 +- include/kernel_float/prelude.h | 86 +++ include/kernel_float/tensor.h | 74 +- include/kernel_float/unops.h | 14 +- single_include/kernel_float.h | 1197 ++++++++++++++++-------------- tests/binops.cu | 2 +- tests/broadcast.cu | 67 ++ 15 files changed, 975 insertions(+), 663 deletions(-) create mode 100644 include/kernel_float/prelude.h create mode 100644 tests/broadcast.cu diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index fe69857..82bad8a 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -4,9 +4,7 @@ #include #include "kernel_float.h" -namespace kf = kernel_float; - -using x = kf::half; +using namespace kernel_float::prelude; void cuda_check(cudaError_t code) { if (code != cudaSuccess) { @@ -15,39 +13,35 @@ void cuda_check(cudaError_t code) { } template -__global__ void my_kernel( - int length, - const kf::unaligned_vec<__half, N>* input, - double constant, - kf::unaligned_vec* output) { +__global__ void my_kernel(int length, const vhalf* input, kdouble constant, vfloat* output) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i * N < length) { - output[i] = kf::cast((input[i] * input[i]) * constant); + output[i] = kernel_float::cast((input[i] * input[i]) * constant); } } template void run_kernel(int n) { double constant = 1.0; - std::vector<__half> input(n); + std::vector input(n); std::vector output_expected; std::vector output_result; // Generate input data for (int i = 0; i < n; i++) { - input[i] = __half(i); + input[i] = half(i); output_expected[i] = float(i + constant); } // Allocate device memory - kf::unaligned_vec<__half, items_per_thread>* input_dev; - kf::unaligned_vec* output_dev; - cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n)); - cuda_check(cudaMalloc(&output_dev, sizeof(float) * n)); + vhalf* input_dev; + vfloat* output_dev; + cuda_check(cudaMalloc(&input_dev, sizeof(khalf) * n)); + cuda_check(cudaMalloc(&output_dev, sizeof(kfloat) * n)); // Copy device memory - cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(__half) * n, cudaMemcpyDefault)); + cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(khalf) * n, cudaMemcpyDefault)); // Launch kernel! 
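// Note on the host-side buffers in this example: output_expected and
// output_result are declared above without a size, so the fill loop and
// the result copy below operate on empty vectors; the intent is
// presumably std::vector<float> output_expected(n), output_result(n);.
// Likewise, the 'Copy results back' call below passes (output_dev,
// output_result.data()) as (dst, src), i.e. host-to-device; reading the
// results back presumably needs the arguments swapped:
//
//     cuda_check(cudaMemcpy(output_result.data(), output_dev,
//                           sizeof(kfloat) * n, cudaMemcpyDefault));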
int block_size = 256; @@ -56,7 +50,7 @@ void run_kernel(int n) { my_kernel<<>>(n, input_dev, constant, output_dev); // Copy results back - cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault)); + cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(kfloat) * n, cudaMemcpyDefault)); // Check results for (int i = 0; i < n; i++) { diff --git a/include/kernel_float.h b/include/kernel_float.h index 51ae25b..888ca72 100644 --- a/include/kernel_float.h +++ b/include/kernel_float.h @@ -1,20 +1,17 @@ #ifndef KERNEL_FLOAT_H #define KERNEL_FLOAT_H -//#include "kernel_float/broadcast.h" //#include "kernel_float/fp8.h" -//#include "kernel_float/interface.h" -//#include "kernel_float/iterate.h" -//#include "kernel_float/macros.h" -//#include "kernel_float/storage.h" //#include "kernel_float/swizzle.h" #include "kernel_float/base.h" #include "kernel_float/bf16.h" #include "kernel_float/binops.h" +#include "kernel_float/broadcast.h" #include "kernel_float/fp16.h" #include "kernel_float/macros.h" #include "kernel_float/meta.h" +#include "kernel_float/prelude.h" #include "kernel_float/reduce.h" #include "kernel_float/tensor.h" #include "kernel_float/unops.h" diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h index aeb5fc6..88bdffa 100644 --- a/include/kernel_float/base.h +++ b/include/kernel_float/base.h @@ -226,8 +226,15 @@ struct extents { } }; -template -struct into_tensor_traits; +template +struct into_tensor_traits { + using type = tensor>; + + KERNEL_FLOAT_INLINE + static type call(const T& input) { + return tensor_storage {input}; + } +}; template struct into_tensor_traits { diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 4ff9905..1a23282 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -7,9 +7,10 @@ #include #include "binops.h" +#include "tensor.h" namespace kernel_float { -KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(__nv_bfloat16, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) @@ -89,6 +90,28 @@ struct apply_impl { return result; } }; + +template +struct reduce_helper= 2)>> { + KERNEL_FLOAT_INLINE static __nv_bfloat16 + call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { + __nv_bfloat162 accum = {input[0], input[1]}; + +#pragma unroll + for (size_t i = 2; i < N; i += 2) { + __nv_bfloat162 a = {input[i], input[i + 1]}; + accum = zip_bfloat16x2::call(fun, accum, a); + } + + __nv_bfloat16 result = fun(accum.x, accum.y); + + if (N % 2 != 0) { + result = fun(result, input[N - 1]); + } + + return result; + } +}; } // namespace detail #if KERNEL_FLOAT_IS_DEVICE diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 8c50499..8d87f92 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -11,17 +11,14 @@ template struct apply_impl { KERNEL_FLOAT_INLINE static tensor_storage call(F fun, const tensor_storage& left, const tensor_storage& right) { - return call(fun, left, right, make_index_sequence {}); - } + tensor_storage result; + +#pragma unroll + for (size_t i = 0; i < N; i++) { + result[i] = fun(left[i], right[i]); + } - private: - template - KERNEL_FLOAT_INLINE static tensor_storage call( - F fun, - const tensor_storage& left, - const tensor_storage& right, - index_sequence) { - return {fun(left[Is], right[Is])...}; + return result; } }; } // namespace detail diff --git a/include/kernel_float/broadcast.h 
b/include/kernel_float/broadcast.h index 976488b..596bb27 100644 --- a/include/kernel_float/broadcast.h +++ b/include/kernel_float/broadcast.h @@ -82,7 +82,7 @@ struct copy_helper; template struct copy_helper, IS, OS> { template - static void call(T* output, const T* input) { + KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { ndindex<0> x; size_t input_index = IS::call(x); size_t output_index = OS::call(x); @@ -93,7 +93,7 @@ struct copy_helper, IS, OS> { template struct copy_helper, IS, OS> { template - static void call(T* output, const T* input) { + KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { for (size_t i = 0; i < N; i++) { ndindex<1> x = {i}; size_t input_index = IS::call(x); @@ -106,7 +106,7 @@ struct copy_helper, IS, OS> { template struct copy_helper, IS, OS> { template - static void call(T* output, const T* input) { + KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { for (size_t i = 0; i < N; i++) { for (size_t j = 0; j < M; j++) { ndindex<2> x = {i, j}; @@ -121,7 +121,7 @@ struct copy_helper, IS, OS> { template struct copy_helper, IS, OS> { template - static void call(T* output, const T* input) { + KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { for (size_t i = 0; i < N; i++) { for (size_t j = 0; j < M; j++) { for (size_t k = 0; k < K; k++) { @@ -175,13 +175,13 @@ struct strides_helper> { template struct broadcast_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(tensor_storage input) { + KERNEL_FLOAT_INLINE static tensor_storage + call(tensor_storage input) { static_assert(is_broadcastable, "cannot broadcast to required shape"); - using IS = strides_helper>; + using IS = strides_helper::type>; using OS = strides_helper; - tensor_storage output; + tensor_storage output; copy_helper::call(output.data(), input.data()); return output; } @@ -198,38 +198,46 @@ struct broadcast_impl { } // namespace detail template -tensor, extents> +KERNEL_FLOAT_INLINE tensor, extents> broadcast(const V& input, extents new_extents = {}) { using T = tensor_value_type; return detail::broadcast_impl, extents>::call( into_tensor(input).storage()); } +template +KERNEL_FLOAT_INLINE tensor, tensor_extents> +broadcast_like(const V& input, const R&) { + using T = tensor_value_type; + return detail::broadcast_impl, tensor_extents>::call( + into_tensor(input).storage()); +} + template -tensor> fill(T value = {}, extents = {}) { +KERNEL_FLOAT_INLINE tensor> fill(T value = {}, extents = {}) { tensor_storage input = {value}; return detail::broadcast_impl, extents>::call(input); } template -tensor> zeros(extents = {}) { +KERNEL_FLOAT_INLINE tensor> zeros(extents = {}) { tensor_storage input = {T {}}; return detail::broadcast_impl, extents>::call(input); } template -tensor> ones(extents = {}) { +KERNEL_FLOAT_INLINE tensor> ones(extents = {}) { tensor_storage input = {T {1}}; return detail::broadcast_impl, extents>::call(input); } template, typename E = tensor_extents> -tensor zeros_like(const V&) { +KERNEL_FLOAT_INLINE tensor zeros_like(const V&) { return zeros(E {}); } template, typename E = tensor_extents> -tensor ones_like(const V&) { +KERNEL_FLOAT_INLINE tensor ones_like(const V&) { return ones(E {}); } @@ -238,8 +246,9 @@ template call(tensor_storage input) { + using F = ops::cast; tensor_storage intermediate = - detail::apply_impl, E::volume, T2, T>::call(input); + detail::apply_impl::call(F {}, input); return detail::broadcast_impl::call(intermediate); } }; @@ -264,7 +273,8 @@ template struct convert_helper { KERNEL_FLOAT_INLINE 
static tensor_storage call(tensor_storage input) { - return detail::apply_impl, E::volume, T2, T>::call(input); + using F = ops::cast; + return detail::apply_impl::call(F {}, input); } }; } // namespace detail @@ -273,9 +283,10 @@ struct convert_helper { * Cast the values of the given input tensor to type `R` and then broadcast the result to the given shape `(Ns...)`. */ template -tensor> convert(const V& input, extents new_shape = {}) { - return detail::convert_helper, tensor_extents, R, extents, M, >( - into_tensor(input).storage()); +KERNEL_FLOAT_INLINE tensor> +convert(const V& input, extents new_shape = {}) { + return detail::convert_helper, tensor_extents, R, extents, M>:: + call(into_tensor(input).storage()); } } // namespace kernel_float diff --git a/include/kernel_float/complex.h b/include/kernel_float/complex.h index ace0901..25ddb7f 100644 --- a/include/kernel_float/complex.h +++ b/include/kernel_float/complex.h @@ -6,14 +6,14 @@ namespace kernel_float { template -struct alignas(2 * alignof(T)) complex_storage { +struct alignas(2 * alignof(T)) complex_type_storage { T re; T im; }; template -struct complex_type: complex_storage { - using base_type = complex_storage; +struct complex_type: complex_type_storage { + using base_type = complex_type_storage; template KERNEL_FLOAT_INLINE complex_type(complex_type that) : base_type(that.real(), that.imag()) {} diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index cf8e154..b31991e 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -9,7 +9,7 @@ #include "binops.h" namespace kernel_float { -KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(__half, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) @@ -87,7 +87,29 @@ struct apply_impl { return result; } }; -} // namespace detail + +template +struct reduce_helper= 2)>> { + KERNEL_FLOAT_INLINE static __half call(F fun, const tensor_storage<__half, N>& input) { + __half2 accum = {input[0], input[1]}; + +#pragma unroll + for (size_t i = 2; i < N; i += 2) { + __half2 a = {input[i], input[i + 1]}; + accum = zip_halfx2::call(fun, accum, a); + } + + __half result = fun(accum.x, accum.y); + + if (N % 2 != 0) { + result = fun(result, input[N - 1]); + } + + return result; + } +}; + +}; // namespace detail #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h index 54873ba..04242d2 100644 --- a/include/kernel_float/meta.h +++ b/include/kernel_float/meta.h @@ -94,7 +94,7 @@ struct promote_type { using type = T; \ }; -// T and bool becomes T +// T + bool becomes T KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(char, bool) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed char, bool) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed short, bool) @@ -144,6 +144,33 @@ KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, short) KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, int) KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, long) +template +struct promote_type { + using type = T*; +}; + +#define KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(I) \ + template \ + struct promote_type { \ + using type = T*; \ + }; \ + template \ + struct promote_type { \ + using type = T*; \ + }; + +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(char) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed char) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed short) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed int) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed long) 
+KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed long long) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned char) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned short) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned int) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned long) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned long long) + // half precision // KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(half) // KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(half, bool) diff --git a/include/kernel_float/prelude.h b/include/kernel_float/prelude.h new file mode 100644 index 0000000..e2c54d5 --- /dev/null +++ b/include/kernel_float/prelude.h @@ -0,0 +1,86 @@ +#ifndef KERNEL_FLOAT_PRELUDE_H +#define KERNEL_FLOAT_PRELUDE_H + +#include "tensor.h" + +namespace kernel_float { +namespace prelude { + +template +using kscalar = tensor>; + +template +using kvec = tensor>; + +template +using kmat = tensor>; + +template +using ktensor = tensor>; + +// clang-format off +template using kvec1 = kvec; +template using kvec2 = kvec; +template using kvec3 = kvec; +template using kvec4 = kvec; +template using kvec5 = kvec; +template using kvec6 = kvec; +template using kvec7 = kvec; +template using kvec8 = kvec; +// clang-format on + +#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ + using k##NAME = scalar; \ + template \ + using v##NAME = vec; \ + using v##NAME##1 = vec; \ + using v##NAME##2 = vec; \ + using v##NAME##3 = vec; \ + using v##NAME##4 = vec; \ + using v##NAME##5 = vec; \ + using v##NAME##6 = vec; \ + using v##NAME##7 = vec; \ + using v##NAME##8 = vec; + +KERNEL_FLOAT_TYPE_ALIAS(char, char) +KERNEL_FLOAT_TYPE_ALIAS(short, short) +KERNEL_FLOAT_TYPE_ALIAS(int, int) +KERNEL_FLOAT_TYPE_ALIAS(long, long) +KERNEL_FLOAT_TYPE_ALIAS(longlong, long long) + +KERNEL_FLOAT_TYPE_ALIAS(uchar, unsigned char) +KERNEL_FLOAT_TYPE_ALIAS(ushort, unsigned short) +KERNEL_FLOAT_TYPE_ALIAS(uint, unsigned int) +KERNEL_FLOAT_TYPE_ALIAS(ulong, unsigned long) +KERNEL_FLOAT_TYPE_ALIAS(ulonglong, unsigned long long) + +KERNEL_FLOAT_TYPE_ALIAS(float, float) +KERNEL_FLOAT_TYPE_ALIAS(f32x, float) +KERNEL_FLOAT_TYPE_ALIAS(float32x, float) + +KERNEL_FLOAT_TYPE_ALIAS(double, double) +KERNEL_FLOAT_TYPE_ALIAS(f64x, double) +KERNEL_FLOAT_TYPE_ALIAS(float64x, double) + +#if KERNEL_FLOAT_FP16_AVAILABLE +KERNEL_FLOAT_TYPE_ALIAS(half, __half) +KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) +KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) +#endif + +#if KERNEL_FLOAT_BF16_AVAILABLE +KERNEL_FLOAT_TYPE_ALIAS(bfloat16, __nv_bfloat16) +KERNEL_FLOAT_TYPE_ALIAS(bf16, __nv_bfloat16) +#endif + +template +static constexpr extents kshape = {}; + +template +KERNEL_FLOAT_INLINE kvec, sizeof...(Args)> make_kvec(Args&&... args) { + return make_vec(std::forward(args)...); +}; +} // namespace prelude +} // namespace kernel_float + +#endif \ No newline at end of file diff --git a/include/kernel_float/tensor.h b/include/kernel_float/tensor.h index ba5bdc8..09ff0a5 100644 --- a/include/kernel_float/tensor.h +++ b/include/kernel_float/tensor.h @@ -29,6 +29,11 @@ struct tensor { return E::size(axis); } + KERNEL_FLOAT_INLINE + static constexpr extents_type shape() { + return {}; + } + KERNEL_FLOAT_INLINE static constexpr size_t stride(size_t axis) { return E::stride(axis); @@ -39,19 +44,26 @@ struct tensor { return E::ravel_index(index); } - template = 0> - KERNEL_FLOAT_INLINE tensor(Args&&... 
args) : storage_ {std::forward(args)...} {} - - KERNEL_FLOAT_INLINE - tensor(T init = {}) { - for (size_t i = 0; i < size(); i++) { - storage_[i] = init; - } - } + tensor(const tensor&) = default; KERNEL_FLOAT_INLINE tensor(storage_type storage) : storage_(storage) {} + template= 2, int> = 0> + KERNEL_FLOAT_INLINE tensor(Args&&... args) : storage_ {std::forward(args)...} {} + + template< + typename U, + typename F, + enabled_t< + is_implicit_convertible && is_tensor_broadcastable, + int> = 0> + KERNEL_FLOAT_INLINE tensor(const tensor& input) : + tensor(convert(input, extents_type {})) {} + + KERNEL_FLOAT_INLINE tensor(const value_type& input = {}) : + tensor(convert(input, extents_type {})) {} + KERNEL_FLOAT_INLINE storage_type& storage() { return storage_; @@ -238,50 +250,6 @@ KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... return tensor_storage {T {args}...}; }; -// clang-format off -template using vec1 = vec; -template using vec2 = vec; -template using vec3 = vec; -template using vec4 = vec; -template using vec5 = vec; -template using vec6 = vec; -template using vec7 = vec; -template using vec8 = vec; -// clang-format on - -#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ - using k##NAME = scalar; \ - template \ - using NAME##X = vec; \ - using NAME##1 = vec; \ - using NAME##2 = vec; \ - using NAME##3 = vec; \ - using NAME##4 = vec; \ - using NAME##5 = vec; \ - using NAME##6 = vec; \ - using NAME##7 = vec; \ - using NAME##8 = vec; - -KERNEL_FLOAT_TYPE_ALIAS(char, char) -KERNEL_FLOAT_TYPE_ALIAS(short, short) -KERNEL_FLOAT_TYPE_ALIAS(int, int) -KERNEL_FLOAT_TYPE_ALIAS(long, long) -KERNEL_FLOAT_TYPE_ALIAS(longlong, long long) - -KERNEL_FLOAT_TYPE_ALIAS(uchar, unsigned char) -KERNEL_FLOAT_TYPE_ALIAS(ushort, unsigned short) -KERNEL_FLOAT_TYPE_ALIAS(uint, unsigned int) -KERNEL_FLOAT_TYPE_ALIAS(ulong, unsigned long) -KERNEL_FLOAT_TYPE_ALIAS(ulonglong, unsigned long long) - -KERNEL_FLOAT_TYPE_ALIAS(float, float) -KERNEL_FLOAT_TYPE_ALIAS(f32x, float) -KERNEL_FLOAT_TYPE_ALIAS(float32x, float) - -KERNEL_FLOAT_TYPE_ALIAS(double, double) -KERNEL_FLOAT_TYPE_ALIAS(f64x, double) -KERNEL_FLOAT_TYPE_ALIAS(float64x, double) - } // namespace kernel_float #endif \ No newline at end of file diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h index 185eba4..860b5a6 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -13,14 +13,14 @@ template struct apply_impl { KERNEL_FLOAT_INLINE static tensor_storage call(F fun, const tensor_storage& input) { - return call(fun, input, make_index_sequence {}); - } + tensor_storage result; - private: - template - KERNEL_FLOAT_INLINE static tensor_storage - call(F fun, const tensor_storage& input, index_sequence) { - return {fun(input[Is])...}; +#pragma unroll + for (size_t i = 0; i < N; i++) { + result[i] = fun(input[i]); + } + + return result; } }; } // namespace detail diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 687970f..37547b6 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-06-22 10:09:53.221460 -// git hash: ce9f9941cc29e9d14001395dd631df563b79b2f0 +// date: 2023-06-27 14:02:57.585212 +// git hash: 1a5fd7464a76374c013847a7f114133e57f8a080 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -55,14 +55,14 @@ namespace kernel_float { template -struct alignas(2 * alignof(T)) complex_storage { +struct alignas(2 * alignof(T)) complex_type_storage { T re; T im; }; template -struct complex_type: complex_storage { - using base_type = complex_storage; +struct complex_type: complex_type_storage { + using base_type = complex_type_storage; template KERNEL_FLOAT_INLINE complex_type(complex_type that) : base_type(that.real(), that.imag()) {} @@ -402,7 +402,7 @@ struct promote_type { using type = T; \ }; -// T and bool becomes T +// T + bool becomes T KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(char, bool) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed char, bool) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(signed short, bool) @@ -452,6 +452,33 @@ KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, short) KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, int) KERNEL_FLOAT_DEFINE_PROMOTED_INTEGRAL(long long, long) +template +struct promote_type { + using type = T*; +}; + +#define KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(I) \ + template \ + struct promote_type { \ + using type = T*; \ + }; \ + template \ + struct promote_type { \ + using type = T*; \ + }; + +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(char) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed char) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed short) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed int) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed long) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(signed long long) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned char) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned short) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned int) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned long) +KERNEL_FLOAT_DEFINE_PROMOTED_POINTER(unsigned long long) + // half precision // KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(half) // KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(half, bool) @@ -632,9 +659,6 @@ struct array { } }; -template -using ndindex = array; - KERNEL_FLOAT_INLINE static constexpr size_t compute_max_alignment(size_t total_size, size_t min_align) { if (total_size % 32 == 0 || min_align >= 32) { @@ -767,8 +791,15 @@ struct extents { } }; -template -struct into_tensor_traits; +template +struct into_tensor_traits { + using type = tensor>; + + KERNEL_FLOAT_INLINE + static type call(const T& input) { + return input; + } +}; template struct into_tensor_traits { @@ -1110,7 +1141,7 @@ struct copy_helper; template struct copy_helper, IS, OS> { template - static void call(T* output, const T* input) { + KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { ndindex<0> x; size_t input_index = IS::call(x); size_t output_index = OS::call(x); @@ -1121,7 +1152,7 @@ struct copy_helper, IS, OS> { template struct copy_helper, IS, OS> { template - static void call(T* output, const T* input) { + KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { for (size_t i = 0; i < N; i++) { ndindex<1> x = {i}; size_t input_index = IS::call(x); @@ -1134,7 +1165,7 @@ struct copy_helper, IS, OS> { template struct copy_helper, IS, OS> { template - static void call(T* output, const T* input) { + KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { for (size_t i = 0; i < N; i++) { for (size_t j = 0; j < M; j++) { ndindex<2> x = {i, j}; @@ -1149,7 +1180,7 @@ struct 
copy_helper, IS, OS> { template struct copy_helper, IS, OS> { template - static void call(T* output, const T* input) { + KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { for (size_t i = 0; i < N; i++) { for (size_t j = 0; j < M; j++) { for (size_t k = 0; k < K; k++) { @@ -1203,13 +1234,13 @@ struct strides_helper> { template struct broadcast_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(tensor_storage input) { + KERNEL_FLOAT_INLINE static tensor_storage + call(tensor_storage input) { static_assert(is_broadcastable, "cannot broadcast to required shape"); - using IS = strides_helper>; + using IS = strides_helper::type>; using OS = strides_helper; - tensor_storage output; + tensor_storage output; copy_helper::call(output.data(), input.data()); return output; } @@ -1266,8 +1297,9 @@ template call(tensor_storage input) { + using F = ops::cast; tensor_storage intermediate = - detail::apply_impl, E::volume, T2, T>::call(input); + detail::apply_impl::call(F {}, input); return detail::broadcast_impl::call(intermediate); } }; @@ -1292,7 +1324,8 @@ template struct convert_helper { KERNEL_FLOAT_INLINE static tensor_storage call(tensor_storage input) { - return detail::apply_impl, E::volume, T2, T>::call(input); + using F = ops::cast; + return detail::apply_impl::call(F {}, input); } }; } // namespace detail @@ -1547,7 +1580,7 @@ struct bit_xor { namespace kernel_float { -KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(__half, bool) +KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) @@ -1625,7 +1658,29 @@ struct apply_impl { return result; } }; -} // namespace detail + +template +struct reduce_helper= 2)>> { + KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage<__half, N>& input) { + __half2 accum = {input[0], input[1]}; + +#pragma unroll + for (size_t i = 2; i < N; i += 2) { + __half2 a = {input[i], input[i + 1]}; + accum = zip_halfx2::call(fun, accum, a); + } + + __half result = fun(accum.x, accum.y); + + if (N % 2 != 0) { + result = fun(result, input[N - 1]); + } + + return result; + } +}; + +}; // namespace detail #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ @@ -1738,640 +1793,690 @@ using half = __half; #endif #endif //KERNEL_FLOAT_FP16_H -#ifndef KERNEL_FLOAT_BF16_H -#define KERNEL_FLOAT_BF16_H +#ifndef KERNEL_FLOAT_REDUCE_H +#define KERNEL_FLOAT_REDUCE_H + + + +namespace kernel_float { +namespace detail { +template +struct reduce_helper { + KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage& input) { + return call(fun, input, make_index_sequence {}); + } + + private: + template + KERNEL_FLOAT_INLINE static T + call(F fun, const tensor_storage& input, index_sequence<0, Is...>) { + T result = input[0]; +#pragma unroll + for (size_t i = 1; i < N; i++) { + result = fun(result, input[i]); + } + return result; + } +}; +} // namespace detail + +/** + * Reduce the elements of the given vector ``input`` into a single value using + * the function ``fun``. This function should be a binary function that takes + * two elements and returns one element. The order in which the elements + * are reduced is not specified and depends on the reduction function and + * the vector type. 
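+ * In particular, the `__half` and `__nv_bfloat16` specializations of
+ * `reduce_helper` accumulate two elements at a time through the packed
+ * `__half2` and `__nv_bfloat162` intrinsics, so `fun` should be associative
+ * and commutative if the result must not depend on the traversal order.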
+ * + * Example + * ======= + * ``` + * vec x = {5, 2, 1}; + * int y = reduce(x, [](int a, int b) { return a + b; }); // returns 5+2+1=8 + * ``` + */ +template +KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { + return detail::reduce_helper, tensor_value_type>::call( + fun, + into_tensor_storage(input)); +} + +/** + * Find the minimum element in the given vector ``input``. + * + * Example + * ======= + * ``` + * vec x = {5, 0, 2, 1, 0}; + * int y = min(x); // Returns 0 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T min(const V& input) { + return reduce(ops::min {}, input); +} + +/** + * Find the maximum element in the given vector ``input``. + * + * Example + * ======= + * ``` + * vec x = {5, 0, 2, 1, 0}; + * int y = max(x); // Returns 5 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T max(const V& input) { + return reduce(ops::max {}, input); +} + +/** + * Sum the items in the given vector ``input``. + * + * Example + * ======= + * ``` + * vec x = {5, 0, 2, 1, 0}; + * int y = sum(x); // Returns 8 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T sum(const V& input) { + return reduce(ops::add {}, input); +} + +/** + * Multiply the items in the given vector ``input``. + * + * Example + * ======= + * ``` + * vec x = {5, 0, 2, 1, 0}; + * int y = sum(x); // Returns 5*0*2*1*0 = 0 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T product(const V& input) { + return reduce(ops::multiply {}, input); +} + +/** + * Check if all elements in the given vector ``input`` are non-zero. An element ``v`` is considered + * non-zero if ``bool(v)==true``. + */ +template +KERNEL_FLOAT_INLINE bool all(const V& input) { + return reduce(ops::bit_and {}, cast(input)); +} + +/** + * Check if any element in the given vector ``input`` is non-zero. An element ``v`` is considered + * non-zero if ``bool(v)==true``. + */ +template +KERNEL_FLOAT_INLINE bool any(const V& input) { + return reduce(ops::bit_or {}, cast(input)); +} + +/** + * Count the number of non-zero items in the given vector ``input``. An element ``v`` is considered + * non-zero if ``bool(v)==true``. 
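+ * The count is implemented as `sum(cast<int>(cast<bool>(input)))`: every
+ * element is first collapsed to `bool` and then widened to `int` before
+ * summing.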
+ * + * Example + * ======= + * ``` + * vec x = {5, 0, 2, 1, 0}; + * int y = count(x); // Returns 3 (5, 2, 1 are non-zero) + * ``` + */ +template +KERNEL_FLOAT_INLINE int count(const V& input) { + return sum(cast(cast(input))); +} +} // namespace kernel_float + +#endif //KERNEL_FLOAT_REDUCE_H +#ifndef KERNEL_FLOAT_BASE_H +#define KERNEL_FLOAT_BASE_H + -#if KERNEL_FLOAT_BF16_AVAILABLE -#include namespace kernel_float { -KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(__nv_bfloat16, bool) -KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) -KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) -template<> -struct into_tensor_traits<__nv_bfloat162> { - using type = tensor<__nv_bfloat16, extents<2>>; +template class S> +struct tensor { + static constexpr size_t rank = E::rank; + static constexpr size_t volume = E::volume; + + using value_type = T; + using extents_type = E; + using ndindex_type = ndindex; + using storage_type = S; KERNEL_FLOAT_INLINE - static type call(__nv_bfloat162 input) { - return tensor_storage<__nv_bfloat16, 2> {input.x, input.y}; + static constexpr size_t size() { + return E::volume; } -}; -namespace detail { -template -struct map_bfloat16x2 { KERNEL_FLOAT_INLINE - static __nv_bfloat162 call(F fun, __nv_bfloat162 input) { - __nv_bfloat16 a = fun(input.x); - __nv_bfloat16 b = fun(input.y); - return {a, b}; + static constexpr size_t size(size_t axis) { + return E::size(axis); } -}; -template -struct zip_bfloat16x2 { KERNEL_FLOAT_INLINE - static __nv_bfloat162 call(F fun, __nv_bfloat162 left, __nv_bfloat162 right) { - __nv_bfloat16 a = fun(left.x, left.y); - __nv_bfloat16 b = fun(right.y, right.y); - return {a, b}; + static constexpr extents_type shape() { + return {}; } -}; -template -struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> - call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { - tensor_storage<__nv_bfloat16, N> result; + KERNEL_FLOAT_INLINE + static constexpr size_t stride(size_t axis) { + return E::stride(axis); + } -#pragma unroll - for (size_t i = 0; i < N; i += 2) { - __nv_bfloat162 a = {input[i], input[i + 1]}; - __nv_bfloat162 b = map_bfloat16x2::call(fun, a); - result[i + 0] = b.x; - result[i + 1] = b.y; - } + KERNEL_FLOAT_INLINE + static constexpr size_t linearize_index(ndindex_type index) { + return E::ravel_index(index); + } - if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); - } + template = 0> + KERNEL_FLOAT_INLINE tensor(Args&&... 
args) : storage_ {std::forward(args)...} {} - return result; + KERNEL_FLOAT_INLINE + tensor(T init = {}) { + for (size_t i = 0; i < size(); i++) { + storage_[i] = init; + } } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> call( - F fun, - const tensor_storage<__nv_bfloat16, N>& left, - const tensor_storage<__nv_bfloat16, N>& right) { - tensor_storage<__nv_bfloat16, N> result; -#pragma unroll - for (size_t i = 0; i < N; i += 2) { - __nv_bfloat162 a = {left[i], left[i + 1]}; - __nv_bfloat162 b = {right[i], right[i + 1]}; - __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b); - result[i + 0] = c.x; - result[i + 1] = c.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], right[N - 1]); - } - - return result; - } -}; -} // namespace detail - -#if KERNEL_FLOAT_IS_DEVICE -#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_bfloat16x2> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ - return FUN2(input); \ - } \ - }; \ - } - -KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, ::h2exp); -KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); - -#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__nv_bfloat16> { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 \ - operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_bfloat16x2> { \ - KERNEL_FLOAT_INLINE static __nv_bfloat162 \ - call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) { \ - return FUN2(left, right); \ - } \ - }; \ - } - -KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2) -KERNEL_FLOAT_BF16_BINARY_FUN(subtract, __hsub, __hsub2) -KERNEL_FLOAT_BF16_BINARY_FUN(multiply, __hmul, __hmul2) -KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div) -KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2) -KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2) - -KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2) -KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __heq, __heq2) -KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2) -KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2) -KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2) -KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2) - -#endif - -#define KERNEL_FLOAT_BF16_CAST(T, TO_HALF, FROM_HALF) \ - namespace ops { \ - template<> \ - struct cast { \ - KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(T input) { \ - return TO_HALF; \ - } \ - }; \ - template<> \ - struct cast<__nv_bfloat16, T> { \ - KERNEL_FLOAT_INLINE T 
operator()(__nv_bfloat16 input) { \ - return FROM_HALF; \ - } \ - }; \ - } - -KERNEL_FLOAT_BF16_CAST(double, __double2bfloat16(input), double(__bfloat162float(input))); -KERNEL_FLOAT_BF16_CAST(float, __float2bfloat16(input), __bfloat162float(input)); - -// there are no official char casts. Instead, cast to int and then to char -KERNEL_FLOAT_BF16_CAST(char, __int2bfloat16_rn(input), (char)__bfloat162int_rz(input)); -KERNEL_FLOAT_BF16_CAST( - signed char, - __int2bfloat16_rn(input), - (signed char)__bfloat162int_rz(input)); -KERNEL_FLOAT_BF16_CAST( - unsigned char, - __int2bfloat16_rn(input), - (unsigned char)__bfloat162int_rz(input)); - -KERNEL_FLOAT_BF16_CAST(signed short, __bfloat162short_rz(input), __short2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST(signed int, __bfloat162int_rz(input), __int2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST( - signed long, - __ll2bfloat16_rn(input), - (signed long)(__bfloat162ll_rz(input))); -KERNEL_FLOAT_BF16_CAST(signed long long, __ll2bfloat16_rn(input), __bfloat162ll_rz(input)); - -KERNEL_FLOAT_BF16_CAST(unsigned short, __bfloat162ushort_rz(input), __ushort2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST(unsigned int, __bfloat162uint_rz(input), __uint2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST( - unsigned long, - __ull2bfloat16_rn(input), - (unsigned long)(__bfloat162ull_rz(input))); -KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input)); - -using bfloat16 = __nv_bfloat16; -//KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) -//KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) - -} // namespace kernel_float - -#if KERNEL_FLOAT_FP16_AVAILABLE - - -namespace kernel_float { -KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)); -} - -#endif // KERNEL_FLOAT_FP16_AVAILABLE -#endif - -#endif //KERNEL_FLOAT_BF16_H -#ifndef KERNEL_FLOAT_REDUCE_H -#define KERNEL_FLOAT_REDUCE_H - + KERNEL_FLOAT_INLINE + tensor(storage_type storage) : storage_(storage) {} -namespace kernel_float { -namespace detail { -template -struct reduce_helper { - KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage& input) { - return call(fun, input, make_index_sequence {}); + KERNEL_FLOAT_INLINE + storage_type& storage() { + return storage_; } - private: - template - KERNEL_FLOAT_INLINE static T - call(F fun, const tensor_storage& input, index_sequence<0, Is...>) { - T result = input[0]; -#pragma unroll - for (size_t i = 1; i < N; i++) { - result = fun(result, input[i]); - } - return result; + KERNEL_FLOAT_INLINE + const storage_type& storage() const { + return storage_; } -}; -} // namespace detail - -/** - * Reduce the elements of the given vector ``input`` into a single value using - * the function ``fun``. This function should be a binary function that takes - * two elements and returns one element. The order in which the elements - * are reduced is not specified and depends on the reduction function and - * the vector type. - * - * Example - * ======= - * ``` - * vec x = {5, 2, 1}; - * int y = reduce(x, [](int a, int b) { return a + b; }); // returns 5+2+1=8 - * ``` - */ -template -KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { - return detail::reduce_helper, tensor_value_type>::call( - fun, - into_tensor_storage(input)); -} - -/** - * Find the minimum element in the given vector ``input``. 
- * - * Example - * ======= - * ``` - * vec x = {5, 0, 2, 1, 0}; - * int y = min(x); // Returns 0 - * ``` - */ -template> -KERNEL_FLOAT_INLINE T min(const V& input) { - return reduce(ops::min {}, input); -} - -/** - * Find the maximum element in the given vector ``input``. - * - * Example - * ======= - * ``` - * vec x = {5, 0, 2, 1, 0}; - * int y = max(x); // Returns 5 - * ``` - */ -template> -KERNEL_FLOAT_INLINE T max(const V& input) { - return reduce(ops::max {}, input); -} - -/** - * Sum the items in the given vector ``input``. - * - * Example - * ======= - * ``` - * vec x = {5, 0, 2, 1, 0}; - * int y = sum(x); // Returns 8 - * ``` - */ -template> -KERNEL_FLOAT_INLINE T sum(const V& input) { - return reduce(ops::add {}, input); -} - -/** - * Multiply the items in the given vector ``input``. - * - * Example - * ======= - * ``` - * vec x = {5, 0, 2, 1, 0}; - * int y = sum(x); // Returns 5*0*2*1*0 = 0 - * ``` - */ -template> -KERNEL_FLOAT_INLINE T product(const V& input) { - return reduce(ops::multiply {}, input); -} - -/** - * Check if all elements in the given vector ``input`` are non-zero. An element ``v`` is considered - * non-zero if ``bool(v)==true``. - */ -template -KERNEL_FLOAT_INLINE bool all(const V& input) { - return reduce(ops::bit_and {}, cast(input)); -} - -/** - * Check if any element in the given vector ``input`` is non-zero. An element ``v`` is considered - * non-zero if ``bool(v)==true``. - */ -template -KERNEL_FLOAT_INLINE bool any(const V& input) { - return reduce(ops::bit_or {}, cast(input)); -} - -/** - * Count the number of non-zero items in the given vector ``input``. An element ``v`` is considered - * non-zero if ``bool(v)==true``. - * - * Example - * ======= - * ``` - * vec x = {5, 0, 2, 1, 0}; - * int y = count(x); // Returns 3 (5, 2, 1 are non-zero) - * ``` - */ -template -KERNEL_FLOAT_INLINE int count(const V& input) { - return sum(cast(cast(input))); -} -} // namespace kernel_float -#endif //KERNEL_FLOAT_REDUCE_H -#ifndef KERNEL_FLOAT_BASE_H -#define KERNEL_FLOAT_BASE_H + KERNEL_FLOAT_INLINE + T* data() { + return storage_.data(); + } + KERNEL_FLOAT_INLINE + const T* data() const { + return storage_.data(); + } + KERNEL_FLOAT_INLINE + const T* cdata() const { + return storage_.data(); + } + KERNEL_FLOAT_INLINE + T* begin() { + return storage_.data(); + } + KERNEL_FLOAT_INLINE + const T* begin() const { + return storage_.data(); + } + KERNEL_FLOAT_INLINE + const T* cbegin() const { + return storage_.data(); + } + KERNEL_FLOAT_INLINE + T* end() { + return storage_.data() + E::volume; + } -namespace kernel_float { + KERNEL_FLOAT_INLINE + const T* end() const { + return storage_.data() + E::volume; + } -template class S> -struct tensor { - static constexpr size_t rank = E::rank; - static constexpr size_t volume = E::volume; + KERNEL_FLOAT_INLINE + const T* cend() const { + return storage_.data() + E::volume; + } - using value_type = T; - using extents_type = E; - using ndindex_type = ndindex; - using storage_type = S; + KERNEL_FLOAT_INLINE + T& at(ndindex_type x) { + return *(data() + linearize_index(x)); + } KERNEL_FLOAT_INLINE - static constexpr size_t size() { - return E::volume; + const T& at(ndindex_type x) const { + return *(data() + linearize_index(x)); } KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return E::size(axis); + T get(ndindex_type x) const { + return at(x); } KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return E::stride(axis); + void set(ndindex_type x, T value) { + at(x) = std::move(value); } 
KERNEL_FLOAT_INLINE - static constexpr size_t linearize_index(ndindex_type index) { - return E::ravel_index(index); + T& operator[](ndindex_type x) { + return at(x); } - template = 0> - KERNEL_FLOAT_INLINE tensor(Args&&... args) : storage_ {std::forward(args)...} {} + KERNEL_FLOAT_INLINE + const T& operator[](ndindex_type x) const { + return at(x); + } KERNEL_FLOAT_INLINE - tensor(T init = {}) { - for (size_t i = 0; i < size(); i++) { - storage_[i] = init; - } + T& operator()(ndindex_type x) { + return at(x); } KERNEL_FLOAT_INLINE - tensor(storage_type storage) : storage_(storage) {} + const T& operator()(ndindex_type x) const { + return at(x); + } KERNEL_FLOAT_INLINE - storage_type& storage() { + tensor> flatten() const { return storage_; } - KERNEL_FLOAT_INLINE - const storage_type& storage() const { + template + KERNEL_FLOAT_INLINE tensor> reshape(extents = {}) const { + static_assert(extents::volume == volume, "invalid reshape shape"); return storage_; } - KERNEL_FLOAT_INLINE - T* data() { - return storage_.data(); + template + KERNEL_FLOAT_INLINE tensor> broadcast(extents new_shape = {}) const { + return kernel_float::broadcast(*this, new_shape); + } + + template + KERNEL_FLOAT_INLINE tensor, E> map(F fun = {}) const { + return kernel_float::map(fun, *this); + } + + template + KERNEL_FLOAT_INLINE T reduce(F fun = {}) const { + return kernel_float::reduce(fun, *this); } + private: + storage_type storage_; +}; + +#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ + template<> \ + struct into_tensor_traits<::T2> { \ + using type = tensor>; \ + \ + KERNEL_FLOAT_INLINE \ + static type call(::T2 v) { \ + return tensor_storage {v.x, v.y}; \ + } \ + }; \ + \ + template<> \ + struct into_tensor_traits<::T3> { \ + using type = tensor>; \ + \ + KERNEL_FLOAT_INLINE \ + static type call(::T3 v) { \ + return tensor_storage {v.x, v.y, v.z}; \ + } \ + }; \ + \ + template<> \ + struct into_tensor_traits<::T4> { \ + using type = tensor>; \ + \ + KERNEL_FLOAT_INLINE \ + static type call(::T4 v) { \ + return tensor_storage {v.x, v.y, v.z, v.w}; \ + } \ + }; + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) + +template +using scalar = tensor>; + +template +using vec = tensor>; + +template +using mat = tensor>; + +template +KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... 
args) { + using T = promote_t; + return tensor_storage {T {args}...}; +}; + +} // namespace kernel_float + +#endif +#ifndef KERNEL_FLOAT_BF16_H +#define KERNEL_FLOAT_BF16_H + + + +#if KERNEL_FLOAT_BF16_AVAILABLE +#include + + + + +namespace kernel_float { +KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__nv_bfloat16) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) + +template<> +struct into_tensor_traits<__nv_bfloat162> { + using type = tensor<__nv_bfloat16, extents<2>>; + KERNEL_FLOAT_INLINE - const T* data() const { - return storage_.data(); + static type call(__nv_bfloat162 input) { + return tensor_storage<__nv_bfloat16, 2> {input.x, input.y}; } +}; +namespace detail { +template +struct map_bfloat16x2 { KERNEL_FLOAT_INLINE - const T* cdata() const { - return storage_.data(); + static __nv_bfloat162 call(F fun, __nv_bfloat162 input) { + __nv_bfloat16 a = fun(input.x); + __nv_bfloat16 b = fun(input.y); + return {a, b}; } +}; +template +struct zip_bfloat16x2 { KERNEL_FLOAT_INLINE - T* begin() { - return storage_.data(); + static __nv_bfloat162 call(F fun, __nv_bfloat162 left, __nv_bfloat162 right) { + __nv_bfloat16 a = fun(left.x, left.y); + __nv_bfloat16 b = fun(right.y, right.y); + return {a, b}; + } +}; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> + call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { + tensor_storage<__nv_bfloat16, N> result; + +#pragma unroll + for (size_t i = 0; i < N; i += 2) { + __nv_bfloat162 a = {input[i], input[i + 1]}; + __nv_bfloat162 b = map_bfloat16x2::call(fun, a); + result[i + 0] = b.x; + result[i + 1] = b.y; + } + + if (N % 2 != 0) { + result[N - 1] = fun(input[N - 1]); + } + + return result; } +}; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> call( + F fun, + const tensor_storage<__nv_bfloat16, N>& left, + const tensor_storage<__nv_bfloat16, N>& right) { + tensor_storage<__nv_bfloat16, N> result; +#pragma unroll + for (size_t i = 0; i < N; i += 2) { + __nv_bfloat162 a = {left[i], left[i + 1]}; + __nv_bfloat162 b = {right[i], right[i + 1]}; + __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b); + result[i + 0] = c.x; + result[i + 1] = c.y; + } + + if (N % 2 != 0) { + result[N - 1] = fun(left[N - 1], right[N - 1]); + } - KERNEL_FLOAT_INLINE - const T* begin() const { - return storage_.data(); + return result; } +}; - KERNEL_FLOAT_INLINE - const T* cbegin() const { - return storage_.data(); - } +template +struct reduce_helper= 2)>> { + KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { + __nv_bfloat162 accum = {input[0], input[1]}; - KERNEL_FLOAT_INLINE - T* end() { - return storage_.data() + E::volume; - } +#pragma unroll + for (size_t i = 2; i < N; i += 2) { + __nv_bfloat162 a = {input[i], input[i + 1]}; + accum = zip_bfloat16x2::call(fun, accum, a); + } - KERNEL_FLOAT_INLINE - const T* end() const { - return storage_.data() + E::volume; - } + __nv_bfloat16 result = fun(accum.x, accum.y); - KERNEL_FLOAT_INLINE - const T* cend() const { - return storage_.data() + E::volume; - } + if (N % 2 != 0) { + result = fun(result, input[N - 1]); + } - KERNEL_FLOAT_INLINE - T& at(ndindex_type x) { - return *(data() + linearize_index(x)); + return result; } +}; +} // namespace detail - KERNEL_FLOAT_INLINE - const T& at(ndindex_type x) const { - return *(data() + linearize_index(x)); +#if KERNEL_FLOAT_IS_DEVICE +#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ + 
namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ + return FUN1(input); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct map_bfloat16x2> { \ + KERNEL_FLOAT_INLINE static __nv_bfloat162 \ + call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ + return FUN2(input); \ + } \ + }; \ } - KERNEL_FLOAT_INLINE - T get(ndindex_type x) const { - return at(x); - } +KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2); +KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2); +KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil); +KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos); +KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, ::h2exp); +KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); +KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor); +KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log); +KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log2); +KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint); +KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); +KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin); +KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); +KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); - KERNEL_FLOAT_INLINE - void set(ndindex_type x, T value) { - at(x) = std::move(value); +#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 \ + operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ + return FUN1(left, right); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct zip_bfloat16x2> { \ + KERNEL_FLOAT_INLINE static __nv_bfloat162 \ + call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) { \ + return FUN2(left, right); \ + } \ + }; \ } - KERNEL_FLOAT_INLINE - T& operator[](ndindex_type x) { - return at(x); - } +KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2) +KERNEL_FLOAT_BF16_BINARY_FUN(subtract, __hsub, __hsub2) +KERNEL_FLOAT_BF16_BINARY_FUN(multiply, __hmul, __hmul2) +KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div) +KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2) +KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2) - KERNEL_FLOAT_INLINE - const T& operator[](ndindex_type x) const { - return at(x); - } +KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2) +KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __heq, __heq2) +KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2) +KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2) +KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2) +KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2) - KERNEL_FLOAT_INLINE - T& operator()(ndindex_type x) { - return at(x); - } +#endif - KERNEL_FLOAT_INLINE - const T& operator()(ndindex_type x) const { - return at(x); +#define KERNEL_FLOAT_BF16_CAST(T, TO_HALF, FROM_HALF) \ + namespace ops { \ + template<> \ + struct cast { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(T input) { \ + return TO_HALF; \ + } \ + }; \ + template<> \ + struct cast<__nv_bfloat16, T> { \ + KERNEL_FLOAT_INLINE T operator()(__nv_bfloat16 input) { \ + return FROM_HALF; \ + } \ + }; \ } - KERNEL_FLOAT_INLINE - tensor> flatten() const { - return storage_; - } +KERNEL_FLOAT_BF16_CAST(double, __double2bfloat16(input), double(__bfloat162float(input))); +KERNEL_FLOAT_BF16_CAST(float, __float2bfloat16(input), __bfloat162float(input)); - template - KERNEL_FLOAT_INLINE tensor> reshape(extents = {}) const { - static_assert(extents::volume == volume, 
"invalid reshape shape"); - return storage_; - } +// there are no official char casts. Instead, cast to int and then to char +KERNEL_FLOAT_BF16_CAST(char, __int2bfloat16_rn(input), (char)__bfloat162int_rz(input)); +KERNEL_FLOAT_BF16_CAST( + signed char, + __int2bfloat16_rn(input), + (signed char)__bfloat162int_rz(input)); +KERNEL_FLOAT_BF16_CAST( + unsigned char, + __int2bfloat16_rn(input), + (unsigned char)__bfloat162int_rz(input)); - template - KERNEL_FLOAT_INLINE tensor> broadcast(extents new_shape = {}) const { - return kernel_float::broadcast(*this, new_shape); - } +KERNEL_FLOAT_BF16_CAST(signed short, __bfloat162short_rz(input), __short2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST(signed int, __bfloat162int_rz(input), __int2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST( + signed long, + __ll2bfloat16_rn(input), + (signed long)(__bfloat162ll_rz(input))); +KERNEL_FLOAT_BF16_CAST(signed long long, __ll2bfloat16_rn(input), __bfloat162ll_rz(input)); - template - KERNEL_FLOAT_INLINE tensor, E> map(F fun = {}) const { - return kernel_float::map(fun, *this); - } +KERNEL_FLOAT_BF16_CAST(unsigned short, __bfloat162ushort_rz(input), __ushort2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST(unsigned int, __bfloat162uint_rz(input), __uint2bfloat16_rn(input)); +KERNEL_FLOAT_BF16_CAST( + unsigned long, + __ull2bfloat16_rn(input), + (unsigned long)(__bfloat162ull_rz(input))); +KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input)); - template - KERNEL_FLOAT_INLINE T reduce(F fun = {}) const { - return kernel_float::reduce(fun, *this); - } +using bfloat16 = __nv_bfloat16; +//KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) +//KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) - private: - storage_type storage_; -}; +} // namespace kernel_float -#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ - template<> \ - struct into_tensor_traits<::T2> { \ - using type = tensor>; \ - \ - KERNEL_FLOAT_INLINE \ - static type call(::T2 v) { \ - return tensor_storage {v.x, v.y}; \ - } \ - }; \ - \ - template<> \ - struct into_tensor_traits<::T3> { \ - using type = tensor>; \ - \ - KERNEL_FLOAT_INLINE \ - static type call(::T3 v) { \ - return tensor_storage {v.x, v.y, v.z}; \ - } \ - }; \ - \ - template<> \ - struct into_tensor_traits<::T4> { \ - using type = tensor>; \ - \ - KERNEL_FLOAT_INLINE \ - static type call(::T4 v) { \ - return tensor_storage {v.x, v.y, v.z, v.w}; \ - } \ - }; +#if KERNEL_FLOAT_FP16_AVAILABLE -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) +namespace kernel_float { +KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)); +} + +#endif // KERNEL_FLOAT_FP16_AVAILABLE +#endif + +#endif //KERNEL_FLOAT_BF16_H +#ifndef KERNEL_FLOAT_PRELUDE_H +#define KERNEL_FLOAT_PRELUDE_H -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, 
float1, float2, float3, float4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) + + +namespace kernel_float { +namespace prelude { template -using scalar = tensor>; +using kscalar = tensor>; template -using vec = tensor>; +using kvec = tensor>; template -using mat = tensor>; +using kvec = tensor>; -template -KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... args) { - using T = promote_t; - return tensor_storage {T {args}...}; -}; +template +using ktensor = tensor>; // clang-format off -template using vec1 = vec; -template using vec2 = vec; -template using vec3 = vec; -template using vec4 = vec; -template using vec5 = vec; -template using vec6 = vec; -template using vec7 = vec; -template using vec8 = vec; +template using kvec1 = kvec; +template using kvec2 = kvec; +template using kvec3 = kvec; +template using kvec4 = kvec; +template using kvec5 = kvec; +template using kvec6 = kvec; +template using kvec7 = kvec; +template using kvec8 = kvec; // clang-format on #define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ using k##NAME = scalar; \ template \ - using NAME##X = vec; \ - using NAME##1 = vec; \ - using NAME##2 = vec; \ - using NAME##3 = vec; \ - using NAME##4 = vec; \ - using NAME##5 = vec; \ - using NAME##6 = vec; \ - using NAME##7 = vec; \ - using NAME##8 = vec; + using k##NAME##X = vec; \ + using k##NAME##1 = vec; \ + using k##NAME##2 = vec; \ + using k##NAME##3 = vec; \ + using k##NAME##4 = vec; \ + using k##NAME##5 = vec; \ + using k##NAME##6 = vec; \ + using k##NAME##7 = vec; \ + using k##NAME##8 = vec; KERNEL_FLOAT_TYPE_ALIAS(char, char) KERNEL_FLOAT_TYPE_ALIAS(short, short) @@ -2393,6 +2498,14 @@ KERNEL_FLOAT_TYPE_ALIAS(double, double) KERNEL_FLOAT_TYPE_ALIAS(f64x, double) KERNEL_FLOAT_TYPE_ALIAS(float64x, double) +template +static constexpr extents kshape = {}; + +template +KERNEL_FLOAT_INLINE kvec, sizeof...(Args)> make_kvec(Args&&... 
args) { + return make_vec(std::forward(args)...); +}; +} // namespace prelude } // namespace kernel_float #endif diff --git a/tests/binops.cu b/tests/binops.cu index 5124020..8409b71 100644 --- a/tests/binops.cu +++ b/tests/binops.cu @@ -4,7 +4,7 @@ namespace kf = kernel_float; template> - struct / arithmetic_test; +struct arithmetic_test; template struct arithmetic_test> { diff --git a/tests/broadcast.cu b/tests/broadcast.cu new file mode 100644 index 0000000..c70c6ce --- /dev/null +++ b/tests/broadcast.cu @@ -0,0 +1,67 @@ +#include "common.h" +#include "kernel_float.h" + +namespace kf = kernel_float; + +template< + typename T, + size_t N, + typename = std::make_index_sequence, + typename = std::make_index_sequence> +struct broadcast_test; + +template +struct broadcast_test, std::index_sequence> { + __host__ __device__ void operator()(generator gen) { + { + kf::tensor> x = gen.next(); + T y = gen.next(); + kf::tensor> z = x + y; + } + + { + kf::tensor> x = {gen.next(Is)...}; + T y = gen.next(); + kf::tensor> z = x + y; + } + + { + kf::tensor> x = {gen.next(IIs)...}; + T y = gen.next(); + kf::tensor> z = x + y; + } + + { + kf::tensor> x = gen.next(); + kf::tensor> y = {gen.next(Is)...}; + kf::tensor> z = x + y; + } + + { + kf::tensor> x = {gen.next(Is)...}; + kf::tensor> y = {gen.next(Is)...}; + kf::tensor> z = x - y; + } + + { + kf::tensor> x = gen.next(); + kf::tensor> y = {gen.next(IIs)...}; + kf::tensor> z = x * y; + } + + { + kf::tensor> x = {gen.next(Is)...}; + kf::tensor> y = {gen.next(Is)...}; + kf::tensor> z = x / y; + } + + { + kf::tensor> x; + kf::tensor> y = x; + } + } +}; + +TEST_CASE("cast operators") { + run_on_host_and_device(); +} From 1812de6d6fd205d35e0f07a5eb4fc3e2c190bdfd Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 27 Jun 2023 15:33:37 +0200 Subject: [PATCH 04/50] Rename aliases in prelude --- examples/vector_add/main.cu | 16 ++-- include/kernel_float/prelude.h | 24 +++--- single_include/kernel_float.h | 132 ++++++++++++++++++++------------- 3 files changed, 100 insertions(+), 72 deletions(-) diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index 82bad8a..ea78d1a 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -13,11 +13,11 @@ void cuda_check(cudaError_t code) { } template -__global__ void my_kernel(int length, const vhalf* input, kdouble constant, vfloat* output) { +__global__ void my_kernel(int length, const khalf* input, double constant, kfloat* output) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i * N < length) { - output[i] = kernel_float::cast((input[i] * input[i]) * constant); + output[i] = kf::cast((input[i] * input[i]) * constant); } } @@ -35,13 +35,13 @@ void run_kernel(int n) { } // Allocate device memory - vhalf* input_dev; - vfloat* output_dev; - cuda_check(cudaMalloc(&input_dev, sizeof(khalf) * n)); - cuda_check(cudaMalloc(&output_dev, sizeof(kfloat) * n)); + khalf* input_dev; + kfloat* output_dev; + cuda_check(cudaMalloc(&input_dev, sizeof(half) * n)); + cuda_check(cudaMalloc(&output_dev, sizeof(float) * n)); // Copy device memory - cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(khalf) * n, cudaMemcpyDefault)); + cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault)); // Launch kernel! 
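    // Each thread processes N elements at once (the kernel guards with
    // `i * N < length`), so the grid can be roughly a factor N smaller
    // than a scalar kernel would need.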
int block_size = 256; @@ -50,7 +50,7 @@ void run_kernel(int n) { my_kernel<<>>(n, input_dev, constant, output_dev); // Copy results back - cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(kfloat) * n, cudaMemcpyDefault)); + cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault)); // Check results for (int i = 0; i < n; i++) { diff --git a/include/kernel_float/prelude.h b/include/kernel_float/prelude.h index e2c54d5..c1d1fe7 100644 --- a/include/kernel_float/prelude.h +++ b/include/kernel_float/prelude.h @@ -5,6 +5,7 @@ namespace kernel_float { namespace prelude { +namespace kf = ::kernel_float; template using kscalar = tensor>; @@ -29,18 +30,17 @@ template using kvec7 = kvec; template using kvec8 = kvec; // clang-format on -#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ - using k##NAME = scalar; \ - template \ - using v##NAME = vec; \ - using v##NAME##1 = vec; \ - using v##NAME##2 = vec; \ - using v##NAME##3 = vec; \ - using v##NAME##4 = vec; \ - using v##NAME##5 = vec; \ - using v##NAME##6 = vec; \ - using v##NAME##7 = vec; \ - using v##NAME##8 = vec; +#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ + template \ + using k##NAME = tensor>; \ + using k##NAME##1 = vec; \ + using k##NAME##2 = vec; \ + using k##NAME##3 = vec; \ + using k##NAME##4 = vec; \ + using k##NAME##5 = vec; \ + using k##NAME##6 = vec; \ + using k##NAME##7 = vec; \ + using k##NAME##8 = vec; KERNEL_FLOAT_TYPE_ALIAS(char, char) KERNEL_FLOAT_TYPE_ALIAS(short, short) diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 37547b6..b1dda5e 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-06-27 14:02:57.585212 -// git hash: 1a5fd7464a76374c013847a7f114133e57f8a080 +// date: 2023-06-27 15:32:45.699220 +// git hash: 9f4a8e610fbfefc3b67f36b501f913c36a81f67e //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -659,6 +659,9 @@ struct array { } }; +template +using ndindex = array; + KERNEL_FLOAT_INLINE static constexpr size_t compute_max_alignment(size_t total_size, size_t min_align) { if (total_size % 32 == 0 || min_align >= 32) { @@ -797,7 +800,7 @@ struct into_tensor_traits { KERNEL_FLOAT_INLINE static type call(const T& input) { - return input; + return tensor_storage {input}; } }; @@ -921,14 +924,14 @@ template struct apply_impl { KERNEL_FLOAT_INLINE static tensor_storage call(F fun, const tensor_storage& input) { - return call(fun, input, make_index_sequence {}); - } + tensor_storage result; - private: - template - KERNEL_FLOAT_INLINE static tensor_storage - call(F fun, const tensor_storage& input, index_sequence) { - return {fun(input[Is])...}; +#pragma unroll + for (size_t i = 0; i < N; i++) { + result[i] = fun(input[i]); + } + + return result; } }; } // namespace detail @@ -1257,38 +1260,46 @@ struct broadcast_impl { } // namespace detail template -tensor, extents> +KERNEL_FLOAT_INLINE tensor, extents> broadcast(const V& input, extents new_extents = {}) { using T = tensor_value_type; return detail::broadcast_impl, extents>::call( into_tensor(input).storage()); } +template +KERNEL_FLOAT_INLINE tensor, tensor_extents> +broadcast_like(const V& input, const R&) { + using T = tensor_value_type; + return detail::broadcast_impl, tensor_extents>::call( + into_tensor(input).storage()); +} + template -tensor> fill(T value = {}, extents = {}) { +KERNEL_FLOAT_INLINE tensor> fill(T value = {}, extents = {}) { tensor_storage input = {value}; return detail::broadcast_impl, extents>::call(input); } template -tensor> zeros(extents = {}) { +KERNEL_FLOAT_INLINE tensor> zeros(extents = {}) { tensor_storage input = {T {}}; return detail::broadcast_impl, extents>::call(input); } template -tensor> ones(extents = {}) { +KERNEL_FLOAT_INLINE tensor> ones(extents = {}) { tensor_storage input = {T {1}}; return detail::broadcast_impl, extents>::call(input); } template, typename E = tensor_extents> -tensor zeros_like(const V&) { +KERNEL_FLOAT_INLINE tensor zeros_like(const V&) { return zeros(E {}); } template, typename E = tensor_extents> -tensor ones_like(const V&) { +KERNEL_FLOAT_INLINE tensor ones_like(const V&) { return ones(E {}); } @@ -1334,9 +1345,10 @@ struct convert_helper { * Cast the values of the given input tensor to type `R` and then broadcast the result to the given shape `(Ns...)`. 
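 *
 * Example
 * =======
 * ```
 * vec<float, 2> x = {1.0f, 2.0f};
 * vec<double, 2> y = convert<double>(x, extents<2> {}); // cast float -> double, same shape
 * ```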
*/ template -tensor> convert(const V& input, extents new_shape = {}) { - return detail::convert_helper, tensor_extents, R, extents, M, >( - into_tensor(input).storage()); +KERNEL_FLOAT_INLINE tensor> +convert(const V& input, extents new_shape = {}) { + return detail::convert_helper, tensor_extents, R, extents, M>:: + call(into_tensor(input).storage()); } } // namespace kernel_float @@ -1355,17 +1367,14 @@ template struct apply_impl { KERNEL_FLOAT_INLINE static tensor_storage call(F fun, const tensor_storage& left, const tensor_storage& right) { - return call(fun, left, right, make_index_sequence {}); - } + tensor_storage result; - private: - template - KERNEL_FLOAT_INLINE static tensor_storage call( - F fun, - const tensor_storage& left, - const tensor_storage& right, - index_sequence) { - return {fun(left[Is], right[Is])...}; +#pragma unroll + for (size_t i = 0; i < N; i++) { + result[i] = fun(left[i], right[i]); + } + + return result; } }; } // namespace detail @@ -1661,7 +1670,7 @@ struct apply_impl { template struct reduce_helper= 2)>> { - KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage<__half, N>& input) { + KERNEL_FLOAT_INLINE static __half call(F fun, const tensor_storage<__half, N>& input) { __half2 accum = {input[0], input[1]}; #pragma unroll @@ -1983,19 +1992,26 @@ struct tensor { return E::ravel_index(index); } - template = 0> - KERNEL_FLOAT_INLINE tensor(Args&&... args) : storage_ {std::forward(args)...} {} - - KERNEL_FLOAT_INLINE - tensor(T init = {}) { - for (size_t i = 0; i < size(); i++) { - storage_[i] = init; - } - } + tensor(const tensor&) = default; KERNEL_FLOAT_INLINE tensor(storage_type storage) : storage_(storage) {} + template= 2, int> = 0> + KERNEL_FLOAT_INLINE tensor(Args&&... args) : storage_ {std::forward(args)...} {} + + template< + typename U, + typename F, + enabled_t< + is_implicit_convertible && is_tensor_broadcastable, + int> = 0> + KERNEL_FLOAT_INLINE tensor(const tensor& input) : + tensor(convert(input, extents_type {})) {} + + KERNEL_FLOAT_INLINE tensor(const value_type& input = {}) : + tensor(convert(input, extents_type {})) {} + KERNEL_FLOAT_INLINE storage_type& storage() { return storage_; @@ -2280,7 +2296,8 @@ struct apply_impl { template struct reduce_helper= 2)>> { - KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { + KERNEL_FLOAT_INLINE static __nv_bfloat16 + call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { __nv_bfloat162 accum = {input[0], input[1]}; #pragma unroll @@ -2441,6 +2458,7 @@ KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)) namespace kernel_float { namespace prelude { +namespace kf = ::kernel_float; template using kscalar = tensor>; @@ -2449,7 +2467,7 @@ template using kvec = tensor>; template -using kvec = tensor>; +using kmat = tensor>; template using ktensor = tensor>; @@ -2465,17 +2483,16 @@ template using kvec7 = kvec; template using kvec8 = kvec; // clang-format on -#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ - using k##NAME = scalar; \ - template \ - using k##NAME##X = vec; \ - using k##NAME##1 = vec; \ - using k##NAME##2 = vec; \ - using k##NAME##3 = vec; \ - using k##NAME##4 = vec; \ - using k##NAME##5 = vec; \ - using k##NAME##6 = vec; \ - using k##NAME##7 = vec; \ +#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ + template \ + using k##NAME = tensor>; \ + using k##NAME##1 = vec; \ + using k##NAME##2 = vec; \ + using k##NAME##3 = vec; \ + using k##NAME##4 = vec; \ + using k##NAME##5 = vec; \ + using k##NAME##6 = vec; \ + using 
k##NAME##7 = vec; \ using k##NAME##8 = vec; KERNEL_FLOAT_TYPE_ALIAS(char, char) @@ -2498,6 +2515,17 @@ KERNEL_FLOAT_TYPE_ALIAS(double, double) KERNEL_FLOAT_TYPE_ALIAS(f64x, double) KERNEL_FLOAT_TYPE_ALIAS(float64x, double) +#if KERNEL_FLOAT_FP16_AVAILABLE +KERNEL_FLOAT_TYPE_ALIAS(half, __half) +KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) +KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) +#endif + +#if KERNEL_FLOAT_BF16_AVAILABLE +KERNEL_FLOAT_TYPE_ALIAS(bfloat16, __nv_bfloat16) +KERNEL_FLOAT_TYPE_ALIAS(bf16, __nv_bfloat16) +#endif + template static constexpr extents kshape = {}; From 3c63bbc2bf97e91ae0b8b81dfd7640e037d8be75 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 17 Jul 2023 15:02:21 +0200 Subject: [PATCH 05/50] Add missing include in complex.h --- include/kernel_float/complex.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/kernel_float/complex.h b/include/kernel_float/complex.h index 25ddb7f..ca55076 100644 --- a/include/kernel_float/complex.h +++ b/include/kernel_float/complex.h @@ -2,6 +2,7 @@ #define KERNEL_FLOAT_COMPLEX_TYPE_H #include "macros.h" +#include "meta.h" namespace kernel_float { @@ -256,4 +257,4 @@ struct promote_type> { } // namespace kernel_float -#endif \ No newline at end of file +#endif From 8fb8b94a0c1c385cc596c5df458c3219b8f40937 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 17 Jul 2023 15:03:29 +0200 Subject: [PATCH 06/50] Allow implicit conversion of `tensor` to type `T` --- include/kernel_float/tensor.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/include/kernel_float/tensor.h b/include/kernel_float/tensor.h index 09ff0a5..89ba96e 100644 --- a/include/kernel_float/tensor.h +++ b/include/kernel_float/tensor.h @@ -9,8 +9,11 @@ namespace kernel_float { +template +struct tensor_extension {}; + template class S> -struct tensor { +struct tensor: tensor_extension, T, E::volume> { static constexpr size_t rank = E::rank; static constexpr size_t volume = E::volume; @@ -61,6 +64,15 @@ struct tensor { KERNEL_FLOAT_INLINE tensor(const tensor& input) : tensor(convert(input, extents_type {})) {} + template< + typename U, + typename F, + enabled_t< + !is_implicit_convertible && is_tensor_broadcastable, + int> = 0> + explicit KERNEL_FLOAT_INLINE tensor(const tensor& input) : + tensor(convert(input, extents_type {})) {} + KERNEL_FLOAT_INLINE tensor(const value_type& input = {}) : tensor(convert(input, extents_type {})) {} @@ -189,6 +201,24 @@ struct tensor { storage_type storage_; }; +template +struct tensor_extension { + KERNEL_FLOAT_INLINE + T get() const { + return static_cast(this)->get({}); + } + + KERNEL_FLOAT_INLINE + void set(T value) { + static_cast(this)->set({}, value); + } + + KERNEL_FLOAT_INLINE + operator T() const { + return get(); + } +}; + #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ template<> \ struct into_tensor_traits<::T2> { \ @@ -252,4 +282,4 @@ KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... 
} // namespace kernel_float -#endif \ No newline at end of file +#endif From bdfed8013d0b15349459ba92a43746c8bcb7857c Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 17 Jul 2023 15:05:14 +0200 Subject: [PATCH 07/50] Rename aliases in prelude --- include/kernel_float/fp16.h | 2 +- include/kernel_float/prelude.h | 23 +- single_include/kernel_float.h | 1052 ++++++++++++++++---------------- 3 files changed, 555 insertions(+), 522 deletions(-) diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index b31991e..479b36a 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -6,7 +6,7 @@ #if KERNEL_FLOAT_FP16_AVAILABLE #include -#include "binops.h" +#include "tensor.h" namespace kernel_float { KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) diff --git a/include/kernel_float/prelude.h b/include/kernel_float/prelude.h index c1d1fe7..d4b314a 100644 --- a/include/kernel_float/prelude.h +++ b/include/kernel_float/prelude.h @@ -30,16 +30,17 @@ template using kvec7 = kvec; template using kvec8 = kvec; // clang-format on -#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ - template \ - using k##NAME = tensor>; \ - using k##NAME##1 = vec; \ - using k##NAME##2 = vec; \ - using k##NAME##3 = vec; \ - using k##NAME##4 = vec; \ - using k##NAME##5 = vec; \ - using k##NAME##6 = vec; \ - using k##NAME##7 = vec; \ +#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ + using k##NAME = tensor>; \ + template \ + using k##NAME##N = tensor>; \ + using k##NAME##1 = vec; \ + using k##NAME##2 = vec; \ + using k##NAME##3 = vec; \ + using k##NAME##4 = vec; \ + using k##NAME##5 = vec; \ + using k##NAME##6 = vec; \ + using k##NAME##7 = vec; \ using k##NAME##8 = vec; KERNEL_FLOAT_TYPE_ALIAS(char, char) @@ -83,4 +84,4 @@ KERNEL_FLOAT_INLINE kvec, sizeof...(Args)> make_kvec(Args&&.. } // namespace prelude } // namespace kernel_float -#endif \ No newline at end of file +#endif diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index b1dda5e..853e471 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
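As an illustration of the renamed aliases, a minimal sketch (hypothetical kernel; assumes the arithmetic operators from `binops.h` and the prelude's `kf` namespace alias):

```
#include "kernel_float.h"
using namespace kernel_float::prelude;

// kfloat     -> tensor<float, extents<>>   (the new scalar alias)
// kfloatN<4> -> tensor<float, extents<4>>  (the generic N-wide alias)
// kfloat4    -> vec<float, 4>              (fixed-width shorthand)
__device__ kfloat4 scale(kfloat4 x, kfloat factor) {
    return x * factor;  // the scalar broadcasts against the 4-wide vector
}
```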
-// date: 2023-06-27 15:32:45.699220 -// git hash: 9f4a8e610fbfefc3b67f36b501f913c36a81f67e +// date: 2023-07-17 15:01:51.588582 +// git hash: 1812de6d6fd205d35e0f07a5eb4fc3e2c190bdfd //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -47,265 +47,6 @@ #define KERNEL_FLOAT_UNREACHABLE __builtin_unreachable() #endif //KERNEL_FLOAT_MACROS_H -#ifndef KERNEL_FLOAT_COMPLEX_TYPE_H -#define KERNEL_FLOAT_COMPLEX_TYPE_H - - - -namespace kernel_float { - -template -struct alignas(2 * alignof(T)) complex_type_storage { - T re; - T im; -}; - -template -struct complex_type: complex_type_storage { - using base_type = complex_type_storage; - - template - KERNEL_FLOAT_INLINE complex_type(complex_type that) : base_type(that.real(), that.imag()) {} - - KERNEL_FLOAT_INLINE - complex_type(T real = {}, T imag = {}) : base_type(real, im) {} - - KERNEL_FLOAT_INLINE - T real() const { - return re; - } - - KERNEL_FLOAT_INLINE - T imag() const { - return im; - } - - KERNEL_FLOAT_INLINE - T norm() const { - return re * re + im * im; - } - - KERNEL_FLOAT_INLINE - complex_type conj() const { - return {re, -im}; - } -}; - -template -KERNEL_FLOAT_INLINE complex_type operator+(complex_type v) { - return v; -} - -template -KERNEL_FLOAT_INLINE complex_type operator+(complex_type a, complex_type b) { - return {a.real() + b.real(), a.imag() + b.imag()}; -} - -template -KERNEL_FLOAT_INLINE complex_type operator+(T a, complex_type b) { - return {a + b.real(), b.imag()}; -} - -template -KERNEL_FLOAT_INLINE complex_type operator+(complex_type a, T b) { - return {a.real() + b, a.imag()}; -} - -template -KERNEL_FLOAT_INLINE complex_type& operator+=(complex_type& a, complex_type b) { - return (a = a + b); -} - -template -KERNEL_FLOAT_INLINE complex_type& operator+=(complex_type& a, T b) { - return (a = a + b); -} - -template -KERNEL_FLOAT_INLINE complex_type operator-(complex_type v) { - return {-v.real(), -v.imag()}; -} - -template -KERNEL_FLOAT_INLINE complex_type operator-(complex_type a, complex_type b) { - return { - a.real() - b.real(), a.imag() - b.imag() - } -} - -template -KERNEL_FLOAT_INLINE complex_type operator-(T a, complex_type b) { - return { - a - b.real(), -b.imag() - } -} - -template -KERNEL_FLOAT_INLINE complex_type operator-(complex_type a, T b) { - return { - a.real() - b, a.imag() - } -} - -template -KERNEL_FLOAT_INLINE complex_type& operator-=(complex_type& a, complex_type b) { - return (a = a - b); -} - -template -KERNEL_FLOAT_INLINE complex_type& operator-=(complex_type& a, T b) { - return (a = a - b); -} - -template -KERNEL_FLOAT_INLINE complex_type operator*(complex_type a, complex_type b) { - return { - a.real() * b.real() - a.imag() * b.imag(), a.real() * b.imag() + a.imag() * b.real() - } -} - -template -KERNEL_FLOAT_INLINE complex_type operator*(complex_type a, T b) { - return {a.real() * b, a.imag() * b}; -} - -template -KERNEL_FLOAT_INLINE complex_type* operator*=(complex_type& a, complex_type b) { - return (a = a * b); -} - -template -KERNEL_FLOAT_INLINE complex_type& operator*=(complex_type& a, T b) { - return (a = a * b); -} - -template -KERNEL_FLOAT_INLINE complex_type operator*(T a, complex_type b) { - return { - a * b.real(), - a * b.imag(), - }; -} - -template -KERNEL_FLOAT_INLINE complex_type operator/(complex_type a, complex_type b) { - T normi = 1 / b.norm(); - - return { - (a.real() * b.real() + a.imag() * b.imag()) * normi, - (a.imag() * b.real() - a.real() * b.imag()) * normi}; -} - -template -KERNEL_FLOAT_INLINE 
complex_type operator/(complex_type a, T b) { - return {a.real() * (1 / b), a.imag() * (1 / b)}; -} - -template -KERNEL_FLOAT_INLINE complex_type operator/(T a, complex_type b) { - T normi = 1 / b.norm(); - - return {a * b.real() * normi, -a * b.imag() * normi}; -} - -template -KERNEL_FLOAT_INLINE complex_type* operator/=(complex_type& a, complex_type b) { - return (a = a / b); -} - -template -KERNEL_FLOAT_INLINE complex_type& operator/=(complex_type& a, T b) { - return (a = a / b); -} - -template -KERNEL_FLOAT_INLINE T real(complex_type v) { - return v.real(); -} - -template -KERNEL_FLOAT_INLINE T imag(complex_type v) { - return v.real(); -} - -template -KERNEL_FLOAT_INLINE T abs(complex_type v) { - return hypot(v.real(), v.imag()); -} - -template -KERNEL_FLOAT_INLINE T arg(complex_type v) { - return atan2(v.imag(), v.real()); -} - -template -KERNEL_FLOAT_INLINE complex_type sqrt(complex_type v) { - T radius = abs(v); - T cosA = v.real() / radius; - - complex_type out = { - sqrt(radius * (cosA + T(1)) * T(.5)), - sqrt(radius * (T(1) - cosA) * T(.5))}; - - // signbit should be false if x.y is negative - if (v.imag() < 0) { - out = complex_type {out.real, -out.im}; - } - - return out; -} - -template -KERNEL_FLOAT_INLINE complex_type norm(complex_type v) { - return v.real() * v.real() + v.imag() * v.imag(); -} - -template -KERNEL_FLOAT_INLINE complex_type conj(complex_type v) { - return {v.real(), -v.imag()}; -} - -template -KERNEL_FLOAT_INLINE complex_type exp(complex_type v) { - // TODO: Handle nan and inf correctly - T e = exp(v.real()); - T a = v.imag(); - return complex_type(e * cos(a), e * sin(a)); -} - -template -KERNEL_FLOAT_INLINE complex_type log(complex_type v) { - return {log(abs(v)), arg(v)}; -} - -template -KERNEL_FLOAT_INLINE complex_type pow(complex_type a, T b) { - return exp(a * log(b)); -} - -template -KERNEL_FLOAT_INLINE complex_type pow(complex_type a, complex_type b) { - return exp(a * log(b)); -} - -template -struct promote_type, complex_type> { - using type = complex_type>; -}; - -template -struct promote_type, R> { - using type = complex_type>; -}; - -template -struct promote_type> { - using type = complex_type>; -}; - -} // namespace kernel_float - -#endif #ifndef KERNEL_FLOAT_CORE_H #define KERNEL_FLOAT_CORE_H @@ -896,16 +637,276 @@ template using promoted_tensor_value_type = promote_t>::value_type...>; -template -KERNEL_FLOAT_INLINE into_tensor_type into_tensor(V&& input) { - return into_tensor_traits::call(std::forward(input)); +template +KERNEL_FLOAT_INLINE into_tensor_type into_tensor(V&& input) { + return into_tensor_traits::call(std::forward(input)); +} + +template +KERNEL_FLOAT_INLINE tensor_storage_type into_tensor_storage(V&& input) { + return into_tensor_traits::call(std::forward(input)).storage(); +} + +} // namespace kernel_float + +#endif +#ifndef KERNEL_FLOAT_COMPLEX_TYPE_H +#define KERNEL_FLOAT_COMPLEX_TYPE_H + + + + +namespace kernel_float { + +template +struct alignas(2 * alignof(T)) complex_type_storage { + T re; + T im; +}; + +template +struct complex_type: complex_type_storage { + using base_type = complex_type_storage; + + template + KERNEL_FLOAT_INLINE complex_type(complex_type that) : base_type(that.real(), that.imag()) {} + + KERNEL_FLOAT_INLINE + complex_type(T real = {}, T imag = {}) : base_type(real, im) {} + + KERNEL_FLOAT_INLINE + T real() const { + return re; + } + + KERNEL_FLOAT_INLINE + T imag() const { + return im; + } + + KERNEL_FLOAT_INLINE + T norm() const { + return re * re + im * im; + } + + KERNEL_FLOAT_INLINE + complex_type 
conj() const { + return {re, -im}; + } +}; + +template +KERNEL_FLOAT_INLINE complex_type operator+(complex_type v) { + return v; +} + +template +KERNEL_FLOAT_INLINE complex_type operator+(complex_type a, complex_type b) { + return {a.real() + b.real(), a.imag() + b.imag()}; +} + +template +KERNEL_FLOAT_INLINE complex_type operator+(T a, complex_type b) { + return {a + b.real(), b.imag()}; +} + +template +KERNEL_FLOAT_INLINE complex_type operator+(complex_type a, T b) { + return {a.real() + b, a.imag()}; +} + +template +KERNEL_FLOAT_INLINE complex_type& operator+=(complex_type& a, complex_type b) { + return (a = a + b); +} + +template +KERNEL_FLOAT_INLINE complex_type& operator+=(complex_type& a, T b) { + return (a = a + b); +} + +template +KERNEL_FLOAT_INLINE complex_type operator-(complex_type v) { + return {-v.real(), -v.imag()}; +} + +template +KERNEL_FLOAT_INLINE complex_type operator-(complex_type a, complex_type b) { + return { + a.real() - b.real(), a.imag() - b.imag() + } +} + +template +KERNEL_FLOAT_INLINE complex_type operator-(T a, complex_type b) { + return { + a - b.real(), -b.imag() + } +} + +template +KERNEL_FLOAT_INLINE complex_type operator-(complex_type a, T b) { + return { + a.real() - b, a.imag() + } +} + +template +KERNEL_FLOAT_INLINE complex_type& operator-=(complex_type& a, complex_type b) { + return (a = a - b); +} + +template +KERNEL_FLOAT_INLINE complex_type& operator-=(complex_type& a, T b) { + return (a = a - b); +} + +template +KERNEL_FLOAT_INLINE complex_type operator*(complex_type a, complex_type b) { + return { + a.real() * b.real() - a.imag() * b.imag(), a.real() * b.imag() + a.imag() * b.real() + } +} + +template +KERNEL_FLOAT_INLINE complex_type operator*(complex_type a, T b) { + return {a.real() * b, a.imag() * b}; +} + +template +KERNEL_FLOAT_INLINE complex_type* operator*=(complex_type& a, complex_type b) { + return (a = a * b); +} + +template +KERNEL_FLOAT_INLINE complex_type& operator*=(complex_type& a, T b) { + return (a = a * b); +} + +template +KERNEL_FLOAT_INLINE complex_type operator*(T a, complex_type b) { + return { + a * b.real(), + a * b.imag(), + }; +} + +template +KERNEL_FLOAT_INLINE complex_type operator/(complex_type a, complex_type b) { + T normi = 1 / b.norm(); + + return { + (a.real() * b.real() + a.imag() * b.imag()) * normi, + (a.imag() * b.real() - a.real() * b.imag()) * normi}; +} + +template +KERNEL_FLOAT_INLINE complex_type operator/(complex_type a, T b) { + return {a.real() * (1 / b), a.imag() * (1 / b)}; +} + +template +KERNEL_FLOAT_INLINE complex_type operator/(T a, complex_type b) { + T normi = 1 / b.norm(); + + return {a * b.real() * normi, -a * b.imag() * normi}; +} + +template +KERNEL_FLOAT_INLINE complex_type* operator/=(complex_type& a, complex_type b) { + return (a = a / b); +} + +template +KERNEL_FLOAT_INLINE complex_type& operator/=(complex_type& a, T b) { + return (a = a / b); +} + +template +KERNEL_FLOAT_INLINE T real(complex_type v) { + return v.real(); +} + +template +KERNEL_FLOAT_INLINE T imag(complex_type v) { + return v.real(); +} + +template +KERNEL_FLOAT_INLINE T abs(complex_type v) { + return hypot(v.real(), v.imag()); +} + +template +KERNEL_FLOAT_INLINE T arg(complex_type v) { + return atan2(v.imag(), v.real()); +} + +template +KERNEL_FLOAT_INLINE complex_type sqrt(complex_type v) { + T radius = abs(v); + T cosA = v.real() / radius; + + complex_type out = { + sqrt(radius * (cosA + T(1)) * T(.5)), + sqrt(radius * (T(1) - cosA) * T(.5))}; + + // signbit should be false if x.y is negative + if (v.imag() < 0) 
{ + out = complex_type {out.real, -out.im}; + } + + return out; +} + +template +KERNEL_FLOAT_INLINE complex_type norm(complex_type v) { + return v.real() * v.real() + v.imag() * v.imag(); +} + +template +KERNEL_FLOAT_INLINE complex_type conj(complex_type v) { + return {v.real(), -v.imag()}; +} + +template +KERNEL_FLOAT_INLINE complex_type exp(complex_type v) { + // TODO: Handle nan and inf correctly + T e = exp(v.real()); + T a = v.imag(); + return complex_type(e * cos(a), e * sin(a)); +} + +template +KERNEL_FLOAT_INLINE complex_type log(complex_type v) { + return {log(abs(v)), arg(v)}; +} + +template +KERNEL_FLOAT_INLINE complex_type pow(complex_type a, T b) { + return exp(a * log(b)); } -template -KERNEL_FLOAT_INLINE tensor_storage_type into_tensor_storage(V&& input) { - return into_tensor_traits::call(std::forward(input)).storage(); +template +KERNEL_FLOAT_INLINE complex_type pow(complex_type a, complex_type b) { + return exp(a * log(b)); } +template +struct promote_type, complex_type> { + using type = complex_type>; +}; + +template +struct promote_type, R> { + using type = complex_type>; +}; + +template +struct promote_type> { + using type = complex_type>; +}; + } // namespace kernel_float #endif @@ -1565,243 +1566,19 @@ struct bit_or { KERNEL_FLOAT_INLINE double operator()(double left, double right) { return double(bool(left) || bool(right)); } -}; - -template<> -struct bit_xor { - KERNEL_FLOAT_INLINE double operator()(double left, double right) { - return double(bool(left) ^ bool(right)); - } -}; -}; // namespace ops - -} // namespace kernel_float - -#endif -#ifndef KERNEL_FLOAT_FP16_H -#define KERNEL_FLOAT_FP16_H - - - -#if KERNEL_FLOAT_FP16_AVAILABLE -#include - - - -namespace kernel_float { -KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) -KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) -KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) - -template<> -struct into_tensor_traits<__half2> { - using type = tensor<__half, extents<2>>; - - KERNEL_FLOAT_INLINE - static type call(__half2 input) { - return tensor_storage<__half, 2> {input.x, input.y}; - } -}; - -namespace detail { -template -struct map_halfx2 { - KERNEL_FLOAT_INLINE - static __half2 call(F fun, __half2 input) { - __half a = fun(input.x); - __half b = fun(input.y); - return {a, b}; - } -}; - -template -struct zip_halfx2 { - KERNEL_FLOAT_INLINE - static __half2 call(F fun, __half2 left, __half2 right) { - __half a = fun(left.x, left.y); - __half b = fun(right.y, right.y); - return {a, b}; - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__half, N> - call(F fun, const tensor_storage<__half, N>& input) { - tensor_storage<__half, N> result; - -#pragma unroll - for (size_t i = 0; i < N; i += 2) { - __half2 a = {input[i], input[i + 1]}; - __half2 b = map_halfx2::call(fun, a); - result[i + 0] = b.x; - result[i + 1] = b.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); - } - - return result; - } -}; - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__half, N> - call(F fun, const tensor_storage<__half, N>& left, const tensor_storage<__half, N>& right) { - tensor_storage<__half, N> result; -#pragma unroll - for (size_t i = 0; i < N; i += 2) { - __half2 a = {left[i], left[i + 1]}; - __half2 b = {right[i], right[i + 1]}; - __half2 c = zip_halfx2::call(fun, a, b); - result[i + 0] = c.x; - result[i + 1] = c.y; - } - - if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], right[N - 1]); - } - - return result; - } -}; - -template -struct reduce_helper= 2)>> { - 
KERNEL_FLOAT_INLINE static __half call(F fun, const tensor_storage<__half, N>& input) { - __half2 accum = {input[0], input[1]}; - -#pragma unroll - for (size_t i = 2; i < N; i += 2) { - __half2 a = {input[i], input[i + 1]}; - accum = zip_halfx2::call(fun, accum, a); - } - - __half result = fun(accum.x, accum.y); - - if (N % 2 != 0) { - result = fun(result, input[N - 1]); - } - - return result; - } -}; - -}; // namespace detail - -#if KERNEL_FLOAT_IS_DEVICE -#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half input) { \ - return FUN1(input); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct map_halfx2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) { \ - return FUN2(input); \ - } \ - }; \ - } - -KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp); -KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); - -#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ - namespace ops { \ - template<> \ - struct NAME<__half> { \ - KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ - return FUN1(left, right); \ - } \ - }; \ - } \ - namespace detail { \ - template<> \ - struct zip_halfx2> { \ - KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \ - return FUN2(left, right); \ - } \ - }; \ - } - -KERNEL_FLOAT_FP16_BINARY_FUN(add, __hadd, __hadd2) -KERNEL_FLOAT_FP16_BINARY_FUN(subtract, __hsub, __hsub2) -KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2) -KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div) -KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2) -KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2) - -KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2) -KERNEL_FLOAT_FP16_BINARY_FUN(not_equal_to, __heq, __heq2) -KERNEL_FLOAT_FP16_BINARY_FUN(less, __hlt, __hlt2) -KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2) -KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2) -KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hgt2) - -#endif - -#define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \ - namespace ops { \ - template<> \ - struct cast { \ - KERNEL_FLOAT_INLINE __half operator()(T input) { \ - return TO_HALF; \ - } \ - }; \ - template<> \ - struct cast<__half, T> { \ - KERNEL_FLOAT_INLINE T operator()(__half input) { \ - return FROM_HALF; \ - } \ - }; \ - } - -KERNEL_FLOAT_FP16_CAST(double, __double2half(input), double(__half2float(input))); -KERNEL_FLOAT_FP16_CAST(float, __float2half(input), __half2float(input)); - -// there are no official char casts. 
Instead, cast to int and then to char -KERNEL_FLOAT_FP16_CAST(char, __int2half_rn(input), (char)__half2int_rz(input)); -KERNEL_FLOAT_FP16_CAST(signed char, __int2half_rn(input), (signed char)__half2int_rz(input)); -KERNEL_FLOAT_FP16_CAST(unsigned char, __int2half_rn(input), (unsigned char)__half2int_rz(input)); - -KERNEL_FLOAT_FP16_CAST(signed short, __half2short_rz(input), __short2half_rn(input)); -KERNEL_FLOAT_FP16_CAST(signed int, __half2int_rz(input), __int2half_rn(input)); -KERNEL_FLOAT_FP16_CAST(signed long, __ll2half_rn(input), (signed long)(__half2ll_rz(input))); -KERNEL_FLOAT_FP16_CAST(signed long long, __ll2half_rn(input), __half2ll_rz(input)); - -KERNEL_FLOAT_FP16_CAST(unsigned short, __half2ushort_rz(input), __ushort2half_rn(input)); -KERNEL_FLOAT_FP16_CAST(unsigned int, __half2uint_rz(input), __uint2half_rn(input)); -KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__half2ull_rz(input))); -KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); +}; -using half = __half; -//KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) -//KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) +template<> +struct bit_xor { + KERNEL_FLOAT_INLINE double operator()(double left, double right) { + return double(bool(left) ^ bool(right)); + } +}; +}; // namespace ops } // namespace kernel_float #endif - -#endif //KERNEL_FLOAT_FP16_H #ifndef KERNEL_FLOAT_REDUCE_H #define KERNEL_FLOAT_REDUCE_H @@ -1957,8 +1734,11 @@ KERNEL_FLOAT_INLINE int count(const V& input) { namespace kernel_float { +template +struct tensor_extension {}; + template class S> -struct tensor { +struct tensor: tensor_extension, T, E::volume> { static constexpr size_t rank = E::rank; static constexpr size_t volume = E::volume; @@ -2009,6 +1789,15 @@ struct tensor { KERNEL_FLOAT_INLINE tensor(const tensor& input) : tensor(convert(input, extents_type {})) {} + template< + typename U, + typename F, + enabled_t< + !is_implicit_convertible && is_tensor_broadcastable, + int> = 0> + explicit KERNEL_FLOAT_INLINE tensor(const tensor& input) : + tensor(convert(input, extents_type {})) {} + KERNEL_FLOAT_INLINE tensor(const value_type& input = {}) : tensor(convert(input, extents_type {})) {} @@ -2137,6 +1926,24 @@ struct tensor { storage_type storage_; }; +template +struct tensor_extension { + KERNEL_FLOAT_INLINE + T get() const { + return static_cast(this)->get({}); + } + + KERNEL_FLOAT_INLINE + void set(T value) { + static_cast(this)->set({}, value); + } + + KERNEL_FLOAT_INLINE + operator T() const { + return get(); + } +}; + #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ template<> \ struct into_tensor_traits<::T2> { \ @@ -2201,6 +2008,230 @@ KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... 
} // namespace kernel_float #endif +#ifndef KERNEL_FLOAT_FP16_H +#define KERNEL_FLOAT_FP16_H + + + +#if KERNEL_FLOAT_FP16_AVAILABLE +#include + + + +namespace kernel_float { +KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) +KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) + +template<> +struct into_tensor_traits<__half2> { + using type = tensor<__half, extents<2>>; + + KERNEL_FLOAT_INLINE + static type call(__half2 input) { + return tensor_storage<__half, 2> {input.x, input.y}; + } +}; + +namespace detail { +template +struct map_halfx2 { + KERNEL_FLOAT_INLINE + static __half2 call(F fun, __half2 input) { + __half a = fun(input.x); + __half b = fun(input.y); + return {a, b}; + } +}; + +template +struct zip_halfx2 { + KERNEL_FLOAT_INLINE + static __half2 call(F fun, __half2 left, __half2 right) { + __half a = fun(left.x, left.y); + __half b = fun(right.y, right.y); + return {a, b}; + } +}; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage<__half, N> + call(F fun, const tensor_storage<__half, N>& input) { + tensor_storage<__half, N> result; + +#pragma unroll + for (size_t i = 0; i < N; i += 2) { + __half2 a = {input[i], input[i + 1]}; + __half2 b = map_halfx2::call(fun, a); + result[i + 0] = b.x; + result[i + 1] = b.y; + } + + if (N % 2 != 0) { + result[N - 1] = fun(input[N - 1]); + } + + return result; + } +}; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage<__half, N> + call(F fun, const tensor_storage<__half, N>& left, const tensor_storage<__half, N>& right) { + tensor_storage<__half, N> result; +#pragma unroll + for (size_t i = 0; i < N; i += 2) { + __half2 a = {left[i], left[i + 1]}; + __half2 b = {right[i], right[i + 1]}; + __half2 c = zip_halfx2::call(fun, a, b); + result[i + 0] = c.x; + result[i + 1] = c.y; + } + + if (N % 2 != 0) { + result[N - 1] = fun(left[N - 1], right[N - 1]); + } + + return result; + } +}; + +template +struct reduce_helper= 2)>> { + KERNEL_FLOAT_INLINE static __half call(F fun, const tensor_storage<__half, N>& input) { + __half2 accum = {input[0], input[1]}; + +#pragma unroll + for (size_t i = 2; i < N; i += 2) { + __half2 a = {input[i], input[i + 1]}; + accum = zip_halfx2::call(fun, accum, a); + } + + __half result = fun(accum.x, accum.y); + + if (N % 2 != 0) { + result = fun(result, input[N - 1]); + } + + return result; + } +}; + +}; // namespace detail + +#if KERNEL_FLOAT_IS_DEVICE +#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half input) { \ + return FUN1(input); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct map_halfx2> { \ + KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 input) { \ + return FUN2(input); \ + } \ + }; \ + } + +KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2); +KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2); +KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil); +KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos); +KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp); +KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); +KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor); +KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log); +KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log2); +KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint); +KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); +KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin); 
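// Note on the mappings above: for __half tensors, apply_impl walks the elements
// two at a time, so each KERNEL_FLOAT_FP16_UNARY_FUN entry turns N scalar
// intrinsic calls (FUN1) into N/2 calls of the paired intrinsic (FUN2), with a
// single scalar call left over when N is odd. Illustrative sketch (device code,
// names as defined earlier in this header):
//
//     kf::vec<__half, 4> x = ...;
//     auto y = kf::sin(x);  // lowers to two ::h2sin calls instead of four ::hsin calls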
+KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); +KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); + +#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ + return FUN1(left, right); \ + } \ + }; \ + } \ + namespace detail { \ + template<> \ + struct zip_halfx2> { \ + KERNEL_FLOAT_INLINE static __half2 call(ops::NAME<__half>, __half2 left, __half2 right) { \ + return FUN2(left, right); \ + } \ + }; \ + } + +KERNEL_FLOAT_FP16_BINARY_FUN(add, __hadd, __hadd2) +KERNEL_FLOAT_FP16_BINARY_FUN(subtract, __hsub, __hsub2) +KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2) +KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div) +KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2) +KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2) + +KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2) +KERNEL_FLOAT_FP16_BINARY_FUN(not_equal_to, __heq, __heq2) +KERNEL_FLOAT_FP16_BINARY_FUN(less, __hlt, __hlt2) +KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2) +KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2) +KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hgt2) + +#endif + +#define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \ + namespace ops { \ + template<> \ + struct cast { \ + KERNEL_FLOAT_INLINE __half operator()(T input) { \ + return TO_HALF; \ + } \ + }; \ + template<> \ + struct cast<__half, T> { \ + KERNEL_FLOAT_INLINE T operator()(__half input) { \ + return FROM_HALF; \ + } \ + }; \ + } + +KERNEL_FLOAT_FP16_CAST(double, __double2half(input), double(__half2float(input))); +KERNEL_FLOAT_FP16_CAST(float, __float2half(input), __half2float(input)); + +// there are no official char casts. Instead, cast to int and then to char +KERNEL_FLOAT_FP16_CAST(char, __int2half_rn(input), (char)__half2int_rz(input)); +KERNEL_FLOAT_FP16_CAST(signed char, __int2half_rn(input), (signed char)__half2int_rz(input)); +KERNEL_FLOAT_FP16_CAST(unsigned char, __int2half_rn(input), (unsigned char)__half2int_rz(input)); + +KERNEL_FLOAT_FP16_CAST(signed short, __half2short_rz(input), __short2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(signed int, __half2int_rz(input), __int2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(signed long, __ll2half_rn(input), (signed long)(__half2ll_rz(input))); +KERNEL_FLOAT_FP16_CAST(signed long long, __ll2half_rn(input), __half2ll_rz(input)); + +KERNEL_FLOAT_FP16_CAST(unsigned short, __half2ushort_rz(input), __ushort2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(unsigned int, __half2uint_rz(input), __uint2half_rn(input)); +KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__half2ull_rz(input))); +KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); + +using half = __half; +//KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) +//KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) + +} // namespace kernel_float + +#endif + +#endif //KERNEL_FLOAT_FP16_H #ifndef KERNEL_FLOAT_BF16_H #define KERNEL_FLOAT_BF16_H @@ -2483,16 +2514,17 @@ template using kvec7 = kvec; template using kvec8 = kvec; // clang-format on -#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ - template \ - using k##NAME = tensor>; \ - using k##NAME##1 = vec; \ - using k##NAME##2 = vec; \ - using k##NAME##3 = vec; \ - using k##NAME##4 = vec; \ - using k##NAME##5 = vec; \ - using k##NAME##6 = vec; \ - using k##NAME##7 = vec; \ +#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ + using k##NAME = tensor>; \ + template \ + using k##NAME##N = tensor>; \ + using 
k##NAME##1 = vec; \ + using k##NAME##2 = vec; \ + using k##NAME##3 = vec; \ + using k##NAME##4 = vec; \ + using k##NAME##5 = vec; \ + using k##NAME##6 = vec; \ + using k##NAME##7 = vec; \ using k##NAME##8 = vec; KERNEL_FLOAT_TYPE_ALIAS(char, char) From 96f18a6360d66ce8b016e80dcb96580dea25fc02 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 17 Jul 2023 15:26:44 +0200 Subject: [PATCH 08/50] Add `dot` function --- include/kernel_float/reduce.h | 22 +++++++++++++++++++--- single_include/kernel_float.h | 26 +++++++++++++++++++++----- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 7738071..c909ea0 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -52,7 +52,7 @@ KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { * Example * ======= * ``` - * vec x = {5, 0, 2, 1, 0}; + * vec x = {5, 0, 2, 1, 0}; * int y = min(x); // Returns 0 * ``` */ @@ -67,7 +67,7 @@ KERNEL_FLOAT_INLINE T min(const V& input) { * Example * ======= * ``` - * vec x = {5, 0, 2, 1, 0}; + * vec x = {5, 0, 2, 1, 0}; * int y = max(x); // Returns 5 * ``` */ @@ -82,7 +82,7 @@ KERNEL_FLOAT_INLINE T max(const V& input) { * Example * ======= * ``` - * vec x = {5, 0, 2, 1, 0}; + * vec x = {5, 0, 2, 1, 0}; * int y = sum(x); // Returns 8 * ``` */ @@ -91,6 +91,22 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { return reduce(ops::add {}, input); } +/** + * Compute the dot product of the given vectors ``left`` and ``right`` + * + * Example + * ======= + * ``` + * vec x = {1, 2, 3}; + * vec y = {4, 5, 6}; + * int y = dot(x, y); // Returns 1*4+2*5+3*6 = 32 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { + return reduce(ops::add {}, zip_common(ops::multiply {}, left, right)); +} + /** * Multiply the items in the given vector ``input``. * diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 853e471..e1cbaaa 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
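A short usage sketch for the new `dot` reduction (illustrative device-side helper; assumes the `kf` namespace alias from the prelude):

```
#include "kernel_float.h"
namespace kf = kernel_float;

__device__ float squared_norm(const kf::vec<float, 3>& v) {
    // dot multiplies elementwise via zip_common(ops::multiply {}, ...) and
    // then reduces the partial products with ops::add.
    return kf::dot(v, v);
}
```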
-// date: 2023-07-17 15:01:51.588582 -// git hash: 1812de6d6fd205d35e0f07a5eb4fc3e2c190bdfd +// date: 2023-07-17 15:26:35.501561 +// git hash: bdfed8013d0b15349459ba92a43746c8bcb7857c //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -1633,7 +1633,7 @@ KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { * Example * ======= * ``` - * vec x = {5, 0, 2, 1, 0}; + * vec x = {5, 0, 2, 1, 0}; * int y = min(x); // Returns 0 * ``` */ @@ -1648,7 +1648,7 @@ KERNEL_FLOAT_INLINE T min(const V& input) { * Example * ======= * ``` - * vec x = {5, 0, 2, 1, 0}; + * vec x = {5, 0, 2, 1, 0}; * int y = max(x); // Returns 5 * ``` */ @@ -1663,7 +1663,7 @@ KERNEL_FLOAT_INLINE T max(const V& input) { * Example * ======= * ``` - * vec x = {5, 0, 2, 1, 0}; + * vec x = {5, 0, 2, 1, 0}; * int y = sum(x); // Returns 8 * ``` */ @@ -1672,6 +1672,22 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { return reduce(ops::add {}, input); } +/** + * Compute the dot product of the given vectors ``left`` and ``right`` + * + * Example + * ======= + * ``` + * vec x = {1, 2, 3}; + * vec y = {4, 5, 6}; + * int y = dot(x, y); // Returns 1*4+2*5+3*6 = 32 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { + return reduce(ops::add {}, zip_common(ops::multiply {}, left, right)); +} + /** * Multiply the items in the given vector ``input``. * From a7489e474c820e3796d333d0417dbd8c2c0b77c0 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 24 Jul 2023 16:59:42 +0200 Subject: [PATCH 09/50] Add ternary `where` function --- examples/vector_add/main.cu | 6 +- include/kernel_float.h | 1 + include/kernel_float/triops.h | 111 ++++++++++++++++++++++++++++++++ single_include/kernel_float.h | 115 +++++++++++++++++++++++++++++++++- tests/broadcast.cu | 2 +- tests/triops.cu | 29 +++++++++ 6 files changed, 258 insertions(+), 6 deletions(-) create mode 100644 include/kernel_float/triops.h create mode 100644 tests/triops.cu diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index ea78d1a..d9268d5 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -13,7 +13,7 @@ void cuda_check(cudaError_t code) { } template -__global__ void my_kernel(int length, const khalf* input, double constant, kfloat* output) { +__global__ void my_kernel(int length, const khalfN* input, double constant, kfloatN* output) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i * N < length) { @@ -35,8 +35,8 @@ void run_kernel(int n) { } // Allocate device memory - khalf* input_dev; - kfloat* output_dev; + khalfN* input_dev; + kfloatN* output_dev; cuda_check(cudaMalloc(&input_dev, sizeof(half) * n)); cuda_check(cudaMalloc(&output_dev, sizeof(float) * n)); diff --git a/include/kernel_float.h b/include/kernel_float.h index 888ca72..5eef892 100644 --- a/include/kernel_float.h +++ b/include/kernel_float.h @@ -14,6 +14,7 @@ #include "kernel_float/prelude.h" #include "kernel_float/reduce.h" #include "kernel_float/tensor.h" +#include "kernel_float/triops.h" #include "kernel_float/unops.h" #endif \ No newline at end of file diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h new file mode 100644 index 0000000..85658fd --- /dev/null +++ b/include/kernel_float/triops.h @@ -0,0 +1,111 @@ +#ifndef KERNEL_FLOAT_TRIOPS_H +#define KERNEL_FLOAT_TRIOPS_H + +#include "broadcast.h" +#include "unops.h" + +namespace kernel_float { + +namespace detail { + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage 
<Output, N> call(
+        F fun,
+        const tensor_storage<A, N>& a,
+        const tensor_storage<B, N>& b,
+        const tensor_storage<C, N>& c) {
+        tensor_storage<Output, N> result;
+
+#pragma unroll
+        for (size_t i = 0; i < N; i++) {
+            result[i] = fun(a[i], b[i], c[i]);
+        }
+
+        return result;
+    }
+};
+} // namespace detail
+
+namespace ops {
+
+template<typename T>
+struct conditional {
+    KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) {
+        if (cond) {
+            return true_value;
+        } else {
+            return false_value;
+        }
+    }
+};
+
+} // namespace ops
+
+/**
+ * Return elements chosen from `true_values` and `false_values` depending on `cond`.
+ *
+ * This function broadcasts all arguments to the same shape, promotes the values of `true_values` and
+ * `false_values` to the same type, casts the values of `cond` to booleans, and returns a tensor where
+ * the values are taken from `true_values` if the condition is true and from `false_values` otherwise.
+ *
+ * @param cond The condition used for selection.
+ * @param true_values The tensor of values to choose from when the condition is true.
+ * @param false_values The tensor of values to choose from when the condition is false.
+ * @return A tensor containing selected elements as per the condition.
+ */
+template<
+    typename C,
+    typename L,
+    typename R,
+    typename T = promoted_tensor_value_type<L, R>,
+    typename E = broadcast_extents<tensor_extents<C>, broadcast_tensor_extents<L, R>>>
+KERNEL_FLOAT_INLINE tensor<T, E> where(const C& cond, const L& true_values, const R& false_values) {
+    using F = ops::conditional<T>;
+
+    return detail::apply_impl<F, E::volume, T, bool, T, T>::call(
+        F {},
+        detail::convert_helper<tensor_value_type<C>, tensor_extents<C>, bool, E>::call(
+            into_tensor_storage(cond)),
+        detail::convert_helper<tensor_value_type<L>, tensor_extents<L>, T, E>::call(
+            into_tensor_storage(true_values)),
+        detail::convert_helper<tensor_value_type<R>, tensor_extents<R>, T, E>::call(
+            into_tensor_storage(false_values)));
+}
+
+/**
+ * Selects elements from `true_values` depending on `cond`.
+ *
+ * This function returns a tensor where the values are taken from `true_values` where `cond` is `true` and `0` where
+ * `cond` is `false`.
+ *
+ * @param cond The condition used for selection.
+ * @param true_values The tensor of values to choose from when the condition is true.
+ * @return A tensor containing selected elements as per the condition.
+ */
+template<
+    typename C,
+    typename L,
+    typename T = tensor_value_type<L>,
+    typename E = broadcast_extents<tensor_extents<C>, tensor_extents<L>>>
+KERNEL_FLOAT_INLINE tensor<T, E> where(const C& cond, const L& true_values) {
+    tensor<T, extents<>> false_values = T {};
+    return where(cond, true_values, false_values);
+}
+
+/**
+ * Returns a tensor where the values are `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`.
+ *
+ * @param cond The condition used for selection.
+ * @return A tensor containing elements as per the condition.
+ */
+template<typename T = bool, typename C, typename E = tensor_extents<C>>
+KERNEL_FLOAT_INLINE tensor<T, E> where(const C& cond) {
+    tensor<T, extents<>> true_values = T {true};
+    tensor<T, extents<>> false_values = T {false};
+    return where(cond, true_values, false_values);
+}
+
+} // namespace kernel_float
+
+#endif //KERNEL_FLOAT_TRIOPS_H
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
index e1cbaaa..bebf0eb 100644
--- a/single_include/kernel_float.h
+++ b/single_include/kernel_float.h
@@ -1,7 +1,7 @@
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
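A usage sketch for the three `where` overloads above (illustrative values, mirroring the `tests/triops.cu` cases included in this patch):

```
#include "kernel_float.h"
namespace kf = kernel_float;

__device__ void where_examples() {
    kf::vec<bool, 4> cond = {true, false, true, false};
    kf::vec<int, 4> a = {1, 2, 3, 4};
    kf::vec<int, 4> b = {10, 20, 30, 40};

    kf::vec<int, 4> r3 = kf::where(cond, a, b);  // {1, 20, 3, 40}
    kf::vec<int, 4> r2 = kf::where(cond, a);     // {1, 0, 3, 0}: zero where cond is false
    kf::vec<bool, 4> r1 = kf::where(cond);       // {1, 0, 1, 0}: T defaults to bool
}
```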
-// date: 2023-07-17 15:26:35.501561 -// git hash: bdfed8013d0b15349459ba92a43746c8bcb7857c +// date: 2023-07-24 17:00:55.602691 +// git hash: 73f1cb7e69c4fde74b7535a3429c1b877fb79fef //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -2585,3 +2585,114 @@ KERNEL_FLOAT_INLINE kvec, sizeof...(Args)> make_kvec(Args&&.. } // namespace kernel_float #endif +#ifndef KERNEL_FLOAT_TRIOPS_H +#define KERNEL_FLOAT_TRIOPS_H + + + + +namespace kernel_float { + +namespace detail { + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static tensor_storage call( + F fun, + const tensor_storage& a, + const tensor_storage& b, + const tensor_storage& c) { + tensor_storage result; + +#pragma unroll + for (size_t i = 0; i < N; i++) { + result[i] = fun(a[i], b[i], c[i]); + } + + return result; + } +}; +} // namespace detail + +namespace ops { + +template +struct conditional { + KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) { + if (cond) { + return true_value; + } else { + return false_value; + } + } +}; + +} // namespace ops + +/** + * Return elements chosen from `true_values` and `false_values` depending on `cond`. + * + * This function broadcasts all arguments to the same shape and it promotes the values of `true_values` and + * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a tensor where + * the values are taken from `true_values` if the condition is true and `false_values` otherwise. + * + * @param cond The condition used for selection. + * @param true_values The tensor of values to choose from when the condition is true. + * @param false_values The tensor of values to choose from when the condition is false. + * @return A tensor containing selected elements as per the condition. + */ +template< + typename C, + typename L, + typename R, + typename T = promoted_tensor_value_type, + typename E = broadcast_extents, broadcast_tensor_extents>> +KERNEL_FLOAT_INLINE tensor where(const C& cond, const L& true_values, const R& false_values) { + using F = ops::conditional; + + return detail::apply_impl::call( + F {}, + detail::convert_helper, tensor_extents, bool, E>::call( + into_tensor_storage(cond)), + detail::convert_helper, tensor_extents, T, E>::call( + into_tensor_storage(true_values)), + detail::convert_helper, tensor_extents, T, E>::call( + into_tensor_storage(false_values))); +} + +/** + * Selects elements from `true_values` depending on `cond`. + * + * This function returns a tensor where the values are taken from `true_values` where `cond` is `true` and `0` where + * `cond is `false`. + * + * @param cond The condition used for selection. + * @param true_values The tensor of values to choose from when the condition is true. + * @return A tensor containing selected elements as per the condition. + */ +template< + typename C, + typename L, + typename T = tensor_value_type, + typename E = broadcast_extents, tensor_extents>> +KERNEL_FLOAT_INLINE tensor where(const C& cond, const L& true_values) { + tensor> false_values = T {}; + return where(cond, true_values, false_values); +} + +/** + * Returns a tensor where the values are `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. + * + * @param cond The condition used for selection. + * @return A tensor containing elements as per the condition. 
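+ *
+ * Example (illustrative)
+ * =======
+ * ```
+ * vec<bool, 4> cond = {true, false, true, false};
+ * vec<bool, 4> r = where(cond); // returns {1, 0, 1, 0}
+ * ```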
+ */ +template> +KERNEL_FLOAT_INLINE tensor where(const C& cond) { + tensor> true_values = T {true}; + tensor> false_values = T {false}; + return where(cond, true_values, false_values); +} + +} // namespace kernel_float + +#endif //KERNEL_FLOAT_TRIOPS_H diff --git a/tests/broadcast.cu b/tests/broadcast.cu index c70c6ce..205dd0f 100644 --- a/tests/broadcast.cu +++ b/tests/broadcast.cu @@ -62,6 +62,6 @@ struct broadcast_test, std::index_sequence(); } diff --git a/tests/triops.cu b/tests/triops.cu new file mode 100644 index 0000000..1268802 --- /dev/null +++ b/tests/triops.cu @@ -0,0 +1,29 @@ +#include "common.h" +#include "kernel_float.h" + +namespace kf = kernel_float; + +template> +struct where_test; + +template +struct where_test> { + __host__ __device__ void operator()(generator gen) { + kf::vec cond = {gen.next(Is)...}; + kf::vec left = {gen.next(Is)...}; + kf::vec right = {gen.next(Is)...}; + + auto result = kf::where(cond, left, right); + ASSERT(equals(result[Is], cond[Is] ? left[Is] : right[Is]) && ...); + + result = kf::where(cond, left); + ASSERT(equals(result[Is], cond[Is] ? left[Is] : T {0}) && ...); + + result = kf::where(cond); + ASSERT(equals(result[Is], cond[Is] ? T {1} : T {0}) && ...); + } +}; + +TEST_CASE("conditional") { + run_on_host_and_device(); +} From eacf58401b70e1d94bf315e7e192708267f4e9e0 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 24 Jul 2023 17:06:20 +0200 Subject: [PATCH 10/50] Simplify code for `kernel_float::apply_impl` --- include/kernel_float/binops.h | 17 ----------------- include/kernel_float/triops.h | 23 ----------------------- include/kernel_float/unops.h | 9 +++------ 3 files changed, 3 insertions(+), 46 deletions(-) diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 8d87f92..0060800 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -5,23 +5,6 @@ #include "unops.h" namespace kernel_float { -namespace detail { - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(F fun, const tensor_storage& left, const tensor_storage& right) { - tensor_storage result; - -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = fun(left[i], right[i]); - } - - return result; - } -}; -} // namespace detail template using zip_type = diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h index 85658fd..1e8f1e1 100644 --- a/include/kernel_float/triops.h +++ b/include/kernel_float/triops.h @@ -6,29 +6,7 @@ namespace kernel_float { -namespace detail { - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage call( - F fun, - const tensor_storage& a, - const tensor_storage& b, - const tensor_storage& c) { - tensor_storage result; - -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = fun(a[i], b[i], c[i]); - } - - return result; - } -}; -} // namespace detail - namespace ops { - template struct conditional { KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) { @@ -39,7 +17,6 @@ struct conditional { } } }; - } // namespace ops /** diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h index 860b5a6..c737c80 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -7,17 +7,14 @@ namespace kernel_float { namespace detail { template -struct apply_impl; - -template -struct apply_impl { +struct apply_impl { KERNEL_FLOAT_INLINE static tensor_storage - call(F fun, const tensor_storage& input) { + call(F fun, const tensor_storage&... 
inputs) { tensor_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result[i] = fun(input[i]); + result[i] = fun(inputs[i]...); } return result; From 5475ee68121976f8b9be212c15de4dcdd66248ad Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 25 Jul 2023 14:25:37 +0200 Subject: [PATCH 11/50] Added `fast_*` functions for fast math --- include/kernel_float/bf16.h | 35 +++++--- include/kernel_float/binops.h | 19 ++++ include/kernel_float/fp16.h | 35 +++++--- include/kernel_float/unops.h | 22 +++++ single_include/kernel_float.h | 164 ++++++++++++++++++---------------- 5 files changed, 171 insertions(+), 104 deletions(-) diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 1a23282..be0ce41 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -134,20 +134,25 @@ struct reduce_helper= 2)>> { }; \ } -KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, ::h2exp); -KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); +KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2) +KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2) +KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil) +KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos) +KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, ::h2exp) +KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10) +KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor) +KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log) +KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log2) +KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint) +KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt) +KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin) +KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt) +KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc) + +KERNEL_FLOAT_BF16_UNARY_FUN(fast_exp, ::hexp, ::h2exp) +KERNEL_FLOAT_BF16_UNARY_FUN(fast_log, ::hlog, ::h2log) +KERNEL_FLOAT_BF16_UNARY_FUN(fast_cos, ::hcos, ::h2cos) +KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) #define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ namespace ops { \ @@ -176,6 +181,8 @@ KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div) KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2) KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2) +KERNEL_FLOAT_BF16_BINARY_FUN(fast_div, __hdiv, __h2div) + KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2) KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __heq, __heq2) KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2) diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 0060800..0141a0a 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -144,6 +144,25 @@ KERNEL_FLOAT_DEFINE_BINARY_FUN(remainder) KERNEL_FLOAT_DEFINE_BINARY_FUN(rhypot) #endif +#if KERNEL_FLOAT_IS_DEVICE +#define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_BINARY(FUN_NAME, ops::OP_NAME {}(left, 
right)) \ + namespace ops { \ + template<> \ + struct OP_NAME { \ + KERNEL_FLOAT_INLINE float operator()(float left, float right) { \ + return FLOAT_FUN(left, right); \ + } \ + }; \ + } +#else +#define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_BINARY(FUN_NAME, ops::OP_NAME {}(left, right)) +#endif + +KERNEL_FLOAT_DEFINE_BINARY_FAST(fast_div, divide, __fdividef) +KERNEL_FLOAT_DEFINE_BINARY_FAST(fast_pow, pow, __powf) + namespace ops { template<> struct add { diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index 479b36a..d64f436 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -130,20 +130,25 @@ struct reduce_helper= 2)>> { }; \ } -KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp); -KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); +KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2) +KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2) +KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil) +KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos) +KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp) +KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10) +KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor) +KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log) +KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log2) +KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint) +KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt) +KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin) +KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt) +KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc) + +KERNEL_FLOAT_FP16_UNARY_FUN(fast_exp, ::hexp, ::h2exp) +KERNEL_FLOAT_FP16_UNARY_FUN(fast_log, ::hlog, ::h2log) +KERNEL_FLOAT_FP16_UNARY_FUN(fast_cos, ::hcos, ::h2cos) +KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) #define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ namespace ops { \ @@ -170,6 +175,8 @@ KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div) KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2) KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2) +KERNEL_FLOAT_FP16_BINARY_FUN(fast_div, __hdiv, __h2div) + KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2) KERNEL_FLOAT_FP16_BINARY_FUN(not_equal_to, __heq, __heq2) KERNEL_FLOAT_FP16_BINARY_FUN(less, __hlt, __hlt2) diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h index c737c80..72cac18 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -110,6 +110,28 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN(signbit) KERNEL_FLOAT_DEFINE_UNARY_FUN(isinf) KERNEL_FLOAT_DEFINE_UNARY_FUN(isnan) +#if KERNEL_FLOAT_IS_DEVICE +#define KERNEL_FLOAT_DEFINE_UNARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_UNARY(FUN_NAME, ops::OP_NAME {}(input)) \ + namespace ops { \ + template<> \ + struct OP_NAME { \ + KERNEL_FLOAT_INLINE float operator()(float input) { \ 
+ return FLOAT_FUN(input); \ + } \ + }; \ + } +#else +#define KERNEL_FLOAT_DEFINE_UNARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_UNARY(FUN_NAME, ops::OP_NAME {}(input)) +#endif + +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_exp, exp, __expf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_log, log, __logf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_cos, cos, __cosf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_sin, sin, __sinf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_tan, tan, __tanf) + enum struct RoundingMode { ANY, DOWN, UP, NEAREST, TOWARD_ZERO }; namespace ops { diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index bebf0eb..f786c00 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-07-24 17:00:55.602691 -// git hash: 73f1cb7e69c4fde74b7535a3429c1b877fb79fef +// date: 2023-07-25 14:25:22.766896 +// git hash: eacf58401b70e1d94bf315e7e192708267f4e9e0 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -919,17 +919,14 @@ namespace kernel_float { namespace detail { template -struct apply_impl; - -template -struct apply_impl { +struct apply_impl { KERNEL_FLOAT_INLINE static tensor_storage - call(F fun, const tensor_storage& input) { + call(F fun, const tensor_storage&... inputs) { tensor_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result[i] = fun(input[i]); + result[i] = fun(inputs[i]...); } return result; @@ -1025,6 +1022,28 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN(signbit) KERNEL_FLOAT_DEFINE_UNARY_FUN(isinf) KERNEL_FLOAT_DEFINE_UNARY_FUN(isnan) +#if KERNEL_FLOAT_IS_DEVICE +#define KERNEL_FLOAT_DEFINE_UNARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_UNARY(FUN_NAME, ops::OP_NAME {}(input)) \ + namespace ops { \ + template<> \ + struct OP_NAME { \ + KERNEL_FLOAT_INLINE float operator()(float input) { \ + return FLOAT_FUN(input); \ + } \ + }; \ + } +#else +#define KERNEL_FLOAT_DEFINE_UNARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_UNARY(FUN_NAME, ops::OP_NAME {}(input)) +#endif + +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_exp, exp, __expf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_log, log, __logf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_cos, cos, __cosf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_sin, sin, __sinf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_tan, tan, __tanf) + enum struct RoundingMode { ANY, DOWN, UP, NEAREST, TOWARD_ZERO }; namespace ops { @@ -1362,23 +1381,6 @@ convert(const V& input, extents new_shape = {}) { namespace kernel_float { -namespace detail { - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(F fun, const tensor_storage& left, const tensor_storage& right) { - tensor_storage result; - -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = fun(left[i], right[i]); - } - - return result; - } -}; -} // namespace detail template using zip_type = @@ -1518,6 +1520,25 @@ KERNEL_FLOAT_DEFINE_BINARY_FUN(remainder) KERNEL_FLOAT_DEFINE_BINARY_FUN(rhypot) #endif +#if KERNEL_FLOAT_IS_DEVICE +#define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_BINARY(FUN_NAME, ops::OP_NAME {}(left, right)) \ + namespace ops { \ + template<> \ + struct OP_NAME { \ + KERNEL_FLOAT_INLINE float operator()(float left, float right) { \ + return FLOAT_FUN(left, right); \ + } \ + }; \ + } +#else +#define 
KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_BINARY(FUN_NAME, ops::OP_NAME {}(left, right)) +#endif + +KERNEL_FLOAT_DEFINE_BINARY_FAST(fast_div, divide, __fdividef) +KERNEL_FLOAT_DEFINE_BINARY_FAST(fast_pow, pow, __powf) + namespace ops { template<> struct add { @@ -2156,20 +2177,25 @@ struct reduce_helper= 2)>> { }; \ } -KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp); -KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); +KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2) +KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2) +KERNEL_FLOAT_FP16_UNARY_FUN(ceil, ::hceil, ::h2ceil) +KERNEL_FLOAT_FP16_UNARY_FUN(cos, ::hcos, ::h2cos) +KERNEL_FLOAT_FP16_UNARY_FUN(exp, ::hexp, ::h2exp) +KERNEL_FLOAT_FP16_UNARY_FUN(exp10, ::hexp10, ::h2exp10) +KERNEL_FLOAT_FP16_UNARY_FUN(floor, ::hfloor, ::h2floor) +KERNEL_FLOAT_FP16_UNARY_FUN(log, ::hlog, ::h2log) +KERNEL_FLOAT_FP16_UNARY_FUN(log10, ::hlog10, ::h2log2) +KERNEL_FLOAT_FP16_UNARY_FUN(rint, ::hrint, ::h2rint) +KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt) +KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin) +KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt) +KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc) + +KERNEL_FLOAT_FP16_UNARY_FUN(fast_exp, ::hexp, ::h2exp) +KERNEL_FLOAT_FP16_UNARY_FUN(fast_log, ::hlog, ::h2log) +KERNEL_FLOAT_FP16_UNARY_FUN(fast_cos, ::hcos, ::h2cos) +KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) #define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ namespace ops { \ @@ -2196,6 +2222,8 @@ KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div) KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2) KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2) +KERNEL_FLOAT_FP16_BINARY_FUN(fast_div, __hdiv, __h2div) + KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2) KERNEL_FLOAT_FP16_BINARY_FUN(not_equal_to, __heq, __heq2) KERNEL_FLOAT_FP16_BINARY_FUN(less, __hlt, __hlt2) @@ -2384,20 +2412,25 @@ struct reduce_helper= 2)>> { }; \ } -KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2); -KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2); -KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil); -KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos); -KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, ::h2exp); -KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10); -KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor); -KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log); -KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log2); -KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint); -KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin); -KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt); -KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc); +KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2) +KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2) 
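// The bf16 entries in this list mirror the fp16 ones earlier in this header:
// operations are applied two elements at a time through the __nv_bfloat162
// intrinsics. The fast_* entries added by this patch map to the same
// intrinsics as their exact counterparts (e.g. fast_exp -> ::hexp); only the
// float overloads (__expf, __fdividef, ...) select genuinely cheaper paths.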
+KERNEL_FLOAT_BF16_UNARY_FUN(ceil, ::hceil, ::h2ceil)
+KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos)
+KERNEL_FLOAT_BF16_UNARY_FUN(exp, ::hexp, ::h2exp)
+KERNEL_FLOAT_BF16_UNARY_FUN(exp10, ::hexp10, ::h2exp10)
+KERNEL_FLOAT_BF16_UNARY_FUN(floor, ::hfloor, ::h2floor)
+KERNEL_FLOAT_BF16_UNARY_FUN(log, ::hlog, ::h2log)
+KERNEL_FLOAT_BF16_UNARY_FUN(log10, ::hlog10, ::h2log10)
+KERNEL_FLOAT_BF16_UNARY_FUN(rint, ::hrint, ::h2rint)
+KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt)
+KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin)
+KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt)
+KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc)
+
+KERNEL_FLOAT_BF16_UNARY_FUN(fast_exp, ::hexp, ::h2exp)
+KERNEL_FLOAT_BF16_UNARY_FUN(fast_log, ::hlog, ::h2log)
+KERNEL_FLOAT_BF16_UNARY_FUN(fast_cos, ::hcos, ::h2cos)
+KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
 
 #define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \
     namespace ops { \
@@ -2426,6 +2459,8 @@ KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div)
 KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2)
 KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2)
 
+KERNEL_FLOAT_BF16_BINARY_FUN(fast_div, __hdiv, __h2div)
+
 KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2)
 KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __hne, __hne2)
 KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2)
@@ -2593,29 +2628,7 @@ KERNEL_FLOAT_INLINE kvec<promote_t<Args...>, sizeof...(Args)> make_kvec(Args&&..
 
 namespace kernel_float {
 
-namespace detail {
-
-template<typename F, size_t N, typename Output, typename A, typename B, typename C>
-struct apply_impl<F, N, Output, A, B, C> {
-    KERNEL_FLOAT_INLINE static tensor_storage<Output, N> call(
-        F fun,
-        const tensor_storage<A, N>& a,
-        const tensor_storage<B, N>& b,
-        const tensor_storage<C, N>& c) {
-        tensor_storage<Output, N> result;
-
-#pragma unroll
-        for (size_t i = 0; i < N; i++) {
-            result[i] = fun(a[i], b[i], c[i]);
-        }
-
-        return result;
-    }
-};
-} // namespace detail
-
 namespace ops {
-
 template<typename T>
 struct conditional {
     KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) {
@@ -2626,7 +2639,6 @@ struct conditional {
         }
     }
 };
-
 } // namespace ops
 
 /**

From df48350ff5f4362e8220188c09f48c37ba9d0335 Mon Sep 17 00:00:00 2001
From: stijn 
Date: Tue, 25 Jul 2023 14:36:28 +0200
Subject: [PATCH 12/50] Add fma function

---
 include/kernel_float/triops.h | 47 ++++++++++++++++++++++++++++++++++
 single_include/kernel_float.h | 51 +++++++++++++++++++++++++++++++++--
 2 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h
index 1e8f1e1..4445468 100644
--- a/include/kernel_float/triops.h
+++ b/include/kernel_float/triops.h
@@ -83,6 +83,53 @@ KERNEL_FLOAT_INLINE tensor<T, E> where(const C& cond) {
     return where(cond, true_values, false_values);
 }
 
+namespace ops {
+template<typename T>
+struct fma {
+    KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) {
+        return a * b + c;
+    }
+};
+
+#if KERNEL_FLOAT_IS_DEVICE
+template<>
+struct fma<float> {
+    KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) {
+        return __fmaf_rn(a, b, c);
+    }
+};
+
+template<>
+struct fma<double> {
+    KERNEL_FLOAT_INLINE double operator()(double a, double b, double c) {
+        return __fma_rn(a, b, c);
+    }
+};
+#endif
+} // namespace ops
+
+/**
+ * Computes the result of `a * b + c`. This is done in a single operation if possible.
+ */
+template<
+    typename A,
+    typename B,
+    typename C,
+    typename T = promoted_tensor_value_type<A, B, C>,
+    typename E = broadcast_extents<tensor_extents<A>, broadcast_tensor_extents<B, C>>>
+KERNEL_FLOAT_INLINE tensor<T, E> fma(const A& a, const B& b, const C& c) {
+    using F = ops::fma<T>;
+
+    return detail::apply_impl<F, E::volume, T, T, T, T>::call(
+        F {},
+        detail::convert_helper<tensor_value_type<A>, tensor_extents<A>, T, E>::call(
+            into_tensor_storage(a)),
+        detail::convert_helper<tensor_value_type<B>, tensor_extents<B>, T, E>::call(
+            into_tensor_storage(b)),
+        detail::convert_helper<tensor_value_type<C>, tensor_extents<C>, T, E>::call(
+            into_tensor_storage(c)));
+}
+
 } // namespace kernel_float
 
 #endif //KERNEL_FLOAT_TRIOPS_H
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
index f786c00..5c19f71 100644
--- a/single_include/kernel_float.h
+++ b/single_include/kernel_float.h
@@ -1,7 +1,7 @@
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
-// date: 2023-07-25 14:25:22.766896
-// git hash: eacf58401b70e1d94bf315e7e192708267f4e9e0
+// date: 2023-07-25 14:36:17.388029
+// git hash: 5475ee68121976f8b9be212c15de4dcdd66248ad
 //================================================================================
 
 #ifndef KERNEL_FLOAT_MACROS_H
@@ -2705,6 +2705,53 @@ KERNEL_FLOAT_INLINE tensor<T, E> where(const C& cond) {
     return where(cond, true_values, false_values);
 }
 
+namespace ops {
+template<typename T>
+struct fma {
+    KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) {
+        return a * b + c;
+    }
+};
+
+#if KERNEL_FLOAT_IS_DEVICE
+template<>
+struct fma<float> {
+    KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) {
+        return __fmaf_rn(a, b, c);
+    }
+};
+
+template<>
+struct fma<double> {
+    KERNEL_FLOAT_INLINE double operator()(double a, double b, double c) {
+        return __fma_rn(a, b, c);
+    }
+};
+#endif
+} // namespace ops
+
+/**
+ * Computes the result of `a * b + c`. This is done in a single operation if possible.
+ */
+template<
+    typename A,
+    typename B,
+    typename C,
+    typename T = promoted_tensor_value_type<A, B, C>,
+    typename E = broadcast_extents<tensor_extents<A>, broadcast_tensor_extents<B, C>>>
+KERNEL_FLOAT_INLINE tensor<T, E> fma(const A& a, const B& b, const C& c) {
+    using F = ops::fma<T>;
+
+    return detail::apply_impl<F, E::volume, T, T, T, T>::call(
+        F {},
+        detail::convert_helper<tensor_value_type<A>, tensor_extents<A>, T, E>::call(
+            into_tensor_storage(a)),
+        detail::convert_helper<tensor_value_type<B>, tensor_extents<B>, T, E>::call(
+            into_tensor_storage(b)),
+        detail::convert_helper<tensor_value_type<C>, tensor_extents<C>, T, E>::call(
+            into_tensor_storage(c)));
+}
+
 } // namespace kernel_float
 
 #endif //KERNEL_FLOAT_TRIOPS_H

From f905f62c8ecfffef0fc774808388a54d662fa19b Mon Sep 17 00:00:00 2001
From: stijn 
Date: Tue, 25 Jul 2023 14:50:27 +0200
Subject: [PATCH 13/50] Add `kconstant` type

---
 include/kernel_float/constant.h | 58 ++++++++++++++++++++++++++++
 include/kernel_float/prelude.h  | 10 +++++
 single_include/kernel_float.h   | 72 ++++++++++++++++++++++++++++++++-
 3 files changed, 138 insertions(+), 2 deletions(-)
 create mode 100644 include/kernel_float/constant.h

diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h
new file mode 100644
index 0000000..12bec35
--- /dev/null
+++ b/include/kernel_float/constant.h
@@ -0,0 +1,58 @@
+#ifndef KERNEL_FLOAT_CONSTANT
+#define KERNEL_FLOAT_CONSTANT
+
+#include "broadcast.h"
+
+namespace kernel_float {
+
+template<typename T>
+struct constant {
+    KERNEL_FLOAT_INLINE
+    constexpr constant(T value = {}) : value_(value) {}
+
+    KERNEL_FLOAT_INLINE
+    constexpr T get() const {
+        return value_;
+    }
+
+    KERNEL_FLOAT_INLINE
+    constexpr operator T() const {
+        return value_;
+    }
+
+  private:
+    T value_;
+};
+
+template<typename T>
+KERNEL_FLOAT_INLINE constexpr constant<T> make_constant(T value) {
+    return value;
+}
+
+template<typename L, typename R>
+struct promote_type<constant<L>, constant<R>> {
+    using type = typename promote_type<L, R>::type;
+};
+
+template<typename L, typename R>
+struct promote_type<constant<L>, R> {
+    using type = typename promote_type<L, R>::type;
+};
+
+template<typename L, typename R>
+struct promote_type<L, constant<R>> {
+    using type = typename promote_type<L, R>::type;
+};
+
+namespace ops {
+template<typename T, typename R, RoundingMode m>
+struct cast<constant<T>, R, m> {
+    KERNEL_FLOAT_INLINE R operator()(const T& input) noexcept {
+        return cast<T, R, m> {}(input);
+    }
+};
+} // namespace ops
+
+} // namespace kernel_float
+
+#endif
\ No newline at end of file
diff --git a/include/kernel_float/prelude.h b/include/kernel_float/prelude.h
index d4b314a..210f059 100644
--- a/include/kernel_float/prelude.h
+++ b/include/kernel_float/prelude.h
@@ -1,6 +1,7 @@
 #ifndef KERNEL_FLOAT_PRELUDE_H
 #define KERNEL_FLOAT_PRELUDE_H
 
+#include "constant.h"
 #include "tensor.h"
 
 namespace kernel_float {
@@ -81,6 +82,15 @@ template<typename... Args>
 KERNEL_FLOAT_INLINE kvec<promote_t<Args...>, sizeof...(Args)> make_kvec(Args&&... args) {
     return make_vec(std::forward<Args>(args)...);
 };
+
+template<typename T>
+using kconstant = constant<T>;
+
+template<typename T>
+KERNEL_FLOAT_INLINE constexpr kconstant<T> kconst(T value) {
+    return value;
+}
+
 } // namespace prelude
 } // namespace kernel_float
 
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
index 5c19f71..202adde 100644
--- a/single_include/kernel_float.h
+++ b/single_include/kernel_float.h
@@ -1,7 +1,7 @@
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
-// date: 2023-07-25 14:36:17.388029 -// git hash: 5475ee68121976f8b9be212c15de4dcdd66248ad +// date: 2023-07-25 14:50:15.560873 +// git hash: df48350ff5f4362e8220188c09f48c37ba9d0335 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -1599,6 +1599,64 @@ struct bit_xor { } // namespace kernel_float +#endif +#ifndef KERNEL_FLOAT_CONSTANT +#define KERNEL_FLOAT_CONSTANT + + + +namespace kernel_float { + +template +struct constant { + KERNEL_FLOAT_INLINE + constexpr constant(T value = {}) : value_(value) {} + + KERNEL_FLOAT_INLINE + constexpr T get() const { + return value_; + } + + KERNEL_FLOAT_INLINE + constexpr operator T() const { + return value_; + } + + private: + T value_; +}; + +template +KERNEL_FLOAT_INLINE constexpr constant make_constant(T value) { + return value; +} + +template +struct promote_type, constant> { + using type = typename promote_type::type; +}; + +template +struct promote_type, R> { + using type = typename promote_type::type; +}; + +template +struct promote_type> { + using type = typename promote_type::type; +}; + +namespace ops { +template +struct cast, R, m> { + KERNEL_FLOAT_INLINE R operator()(const T& input) noexcept { + return cast {}(input); + } +}; +} // namespace ops + +} // namespace kernel_float + #endif #ifndef KERNEL_FLOAT_REDUCE_H #define KERNEL_FLOAT_REDUCE_H @@ -2538,6 +2596,7 @@ KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)) + namespace kernel_float { namespace prelude { namespace kf = ::kernel_float; @@ -2616,6 +2675,15 @@ template KERNEL_FLOAT_INLINE kvec, sizeof...(Args)> make_kvec(Args&&... args) { return make_vec(std::forward(args)...); }; + +template +using kconstant = constant; + +template +KERNEL_FLOAT_INLINE constexpr kconstant kconst(T value) { + return value; +} + } // namespace prelude } // namespace kernel_float From 0cf26708f3b17b2f74940e5806bad2aeaae4c076 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 25 Jul 2023 17:16:02 +0200 Subject: [PATCH 14/50] Remove support for N-d tensors and only support 1D vectors --- examples/vector_add/main.cu | 6 +- include/kernel_float.h | 4 +- include/kernel_float/base.h | 262 ++++++---------------------- include/kernel_float/bf16.h | 52 +++--- include/kernel_float/binops.h | 67 ++++---- include/kernel_float/broadcast.h | 257 +++++++--------------------- include/kernel_float/fp16.h | 49 +++--- include/kernel_float/meta.h | 2 +- include/kernel_float/prelude.h | 38 ++--- include/kernel_float/reduce.h | 24 +-- include/kernel_float/tensor.h | 285 ------------------------------- include/kernel_float/triops.h | 76 ++++----- include/kernel_float/unops.h | 28 +-- include/kernel_float/vector.h | 250 +++++++++++++++++++++++++++ tests/broadcast.cu | 4 +- 15 files changed, 537 insertions(+), 867 deletions(-) delete mode 100644 include/kernel_float/tensor.h create mode 100644 include/kernel_float/vector.h diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index d9268d5..ea78d1a 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -13,7 +13,7 @@ void cuda_check(cudaError_t code) { } template -__global__ void my_kernel(int length, const khalfN* input, double constant, kfloatN* output) { +__global__ void my_kernel(int length, const khalf* input, double constant, kfloat* output) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i * N < length) { @@ -35,8 +35,8 @@ void run_kernel(int n) { } // Allocate device memory - khalfN* input_dev; - kfloatN* output_dev; + khalf* 
input_dev; + kfloat* output_dev; cuda_check(cudaMalloc(&input_dev, sizeof(half) * n)); cuda_check(cudaMalloc(&output_dev, sizeof(float) * n)); diff --git a/include/kernel_float.h b/include/kernel_float.h index 5eef892..db9249a 100644 --- a/include/kernel_float.h +++ b/include/kernel_float.h @@ -13,8 +13,8 @@ #include "kernel_float/meta.h" #include "kernel_float/prelude.h" #include "kernel_float/reduce.h" -#include "kernel_float/tensor.h" #include "kernel_float/triops.h" #include "kernel_float/unops.h" +#include "kernel_float/vector.h" -#endif \ No newline at end of file +#endif diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h index 88bdffa..33ee2f1 100644 --- a/include/kernel_float/base.h +++ b/include/kernel_float/base.h @@ -1,5 +1,5 @@ -#ifndef KERNEL_FLOAT_BASE -#define KERNEL_FLOAT_BASE +#ifndef KERNEL_FLOAT_BASE_H +#define KERNEL_FLOAT_BASE_H #include "macros.h" #include "meta.h" @@ -7,7 +7,7 @@ namespace kernel_float { template -struct alignas(Alignment) array { +struct alignas(Alignment) aligned_array { KERNEL_FLOAT_INLINE T* data() { return items_; @@ -18,23 +18,13 @@ struct alignas(Alignment) array { return items_; } - KERNEL_FLOAT_INLINE - T& operator[](size_t i) { - return items_[i]; - } - - KERNEL_FLOAT_INLINE - const T& operator[](size_t i) const { - return items_[i]; - } - T items_[N]; }; template -struct array { +struct aligned_array { KERNEL_FLOAT_INLINE - array(T value = {}) : value_(value) {} + aligned_array(T value = {}) : value_(value) {} KERNEL_FLOAT_INLINE operator T() const { @@ -51,21 +41,11 @@ struct array { return &value_; } - KERNEL_FLOAT_INLINE - T& operator[](size_t) { - return value_; - } - - KERNEL_FLOAT_INLINE - const T& operator[](size_t) const { - return value_; - } - T value_; }; template -struct array { +struct aligned_array { KERNEL_FLOAT_INLINE T* data() { while (true) @@ -77,23 +57,8 @@ struct array { while (true) ; } - - KERNEL_FLOAT_INLINE - T& operator[](size_t i) { - while (true) - ; - } - - KERNEL_FLOAT_INLINE - const T& operator[](size_t i) const { - while (true) - ; - } }; -template -using ndindex = array; - KERNEL_FLOAT_INLINE static constexpr size_t compute_max_alignment(size_t total_size, size_t min_align) { if (total_size % 32 == 0 || min_align >= 32) { @@ -112,232 +77,101 @@ static constexpr size_t compute_max_alignment(size_t total_size, size_t min_alig } template -using tensor_storage = array; - -template class S = tensor_storage> -struct tensor; +using vector_storage = aligned_array; -template -struct extents; - -template<> -struct extents<> { - static constexpr size_t rank = 0; - static constexpr size_t volume = 1; - - KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return 1; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return 1; - } - - KERNEL_FLOAT_INLINE - static size_t ravel_index(ndindex<0>) { - return 0; - } - - KERNEL_FLOAT_INLINE - static ndindex<0> unravel_index(size_t i) { - return {}; - } -}; +template class S = vector_storage> +struct vector; template -struct extents { - static constexpr size_t rank = 1; - static constexpr size_t volume = N; - - KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return axis == 0 ? 
N : 1; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return 1; - } - - KERNEL_FLOAT_INLINE - static size_t ravel_index(ndindex<1> ind) { - return ind[0]; - } - - KERNEL_FLOAT_INLINE - static ndindex<1> unravel_index(size_t i) { - return {i}; - } -}; - -template -struct extents { - static constexpr size_t rank = 2; - static constexpr size_t volume = N * M; - - KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return axis == 0 ? N : axis == 1 ? M : 1; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return axis == 0 ? M : 1; - } - - KERNEL_FLOAT_INLINE - static size_t ravel_index(ndindex<2> x) { - return x[0] * M + x[1]; - } - - KERNEL_FLOAT_INLINE - static ndindex<2> unravel_index(size_t i) { - return {i / M, i % M}; - } -}; - -template -struct extents { - static constexpr size_t rank = 3; - static constexpr size_t volume = N * M * K; - - KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return axis == 0 ? N : axis == 1 ? M : axis == 2 ? K : 1; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return axis == 0 ? M * K // - : axis == 1 ? K // - : 1; // - } - - KERNEL_FLOAT_INLINE - static size_t ravel_index(ndindex<3> x) { - return (x[0] * M + x[1]) * K + x[2]; - } - - KERNEL_FLOAT_INLINE - static ndindex<3> unravel_index(size_t i) { - return {i / (K * M), (i / K) % M, i % K}; - } +struct extent { + static constexpr size_t value = N; + static constexpr size_t size = N; }; template -struct into_tensor_traits { - using type = tensor>; +struct into_vector_traits { + using value_type = T; + using extent_type = extent<1>; KERNEL_FLOAT_INLINE - static type call(const T& input) { - return tensor_storage {input}; + static vector_storage call(const T& input) { + return vector_storage {input}; } }; template -struct into_tensor_traits { - using type = typename into_tensor_traits::type; - - KERNEL_FLOAT_INLINE - static type call(const V input) { - return into_tensor_traits::call(input); - } -}; +struct into_vector_traits: into_vector_traits {}; template -struct into_tensor_traits { - using type = typename into_tensor_traits::type; - - KERNEL_FLOAT_INLINE - static type call(V& input) { - return into_tensor_traits::call(input); - } -}; +struct into_vector_traits: into_vector_traits {}; template -struct into_tensor_traits { - using type = typename into_tensor_traits::type; - - KERNEL_FLOAT_INLINE - static type call(const V& input) { - return into_tensor_traits::call(input); - } -}; +struct into_vector_traits: into_vector_traits {}; template -struct into_tensor_traits { - using type = typename into_tensor_traits::type; +struct into_vector_traits: into_vector_traits {}; - KERNEL_FLOAT_INLINE - static type call(V&& input) { - return into_tensor_traits::call(std::move(input)); - } -}; - -template class S> -struct into_tensor_traits> { - using type = tensor; +template class S> +struct into_vector_traits> { + using value_type = T; + using extent_type = E; KERNEL_FLOAT_INLINE - static type call(const tensor& input) { - return input; + static vector_storage call(const vector& input) { + return input.storage(); } }; template -struct into_tensor_traits> { - using type = tensor>; +struct into_vector_traits> { + using value_type = T; + using extent_type = extent; KERNEL_FLOAT_INLINE - static type call(const array& input) { + static vector_storage call(const aligned_array& input) { return input; } }; template -struct tensor_traits; +struct vector_traits; -template class S> -struct tensor_traits> 
{ +template class S> +struct vector_traits> { using value_type = T; - using extents_type = D; - using storage_type = S; + using extent_type = E; + using storage_type = S; + using vector_type = vector; }; template -using into_tensor_type = typename into_tensor_traits::type; - -template -using tensor_extents = typename tensor_traits>::extents_type; +using vector_value_type = typename into_vector_traits::value_type; template -static constexpr size_t tensor_rank = tensor_extents::rank; +using vector_extent_type = typename into_vector_traits::extent_type; template -static constexpr size_t tensor_volume = tensor_extents::volume; +static constexpr size_t vector_extent = vector_extent_type::value; template -using tensor_value_type = typename tensor_traits>::value_type; +using into_vector_type = vector, vector_extent_type>; template -using tensor_storage_type = tensor_storage, tensor_volume>; +using vector_storage_type = vector_storage, vector_extent>; template -using promoted_tensor_value_type = - promote_t>::value_type...>; +using promoted_vector_value_type = promote_t...>; template -KERNEL_FLOAT_INLINE into_tensor_type into_tensor(V&& input) { - return into_tensor_traits::call(std::forward(input)); +KERNEL_FLOAT_INLINE vector_storage_type into_vector_storage(V&& input) { + return into_vector_traits::call(std::forward(input)); } template -KERNEL_FLOAT_INLINE tensor_storage_type into_tensor_storage(V&& input) { - return into_tensor_traits::call(std::forward(input)).storage(); +KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { + return into_vector_traits::call(std::forward(input)); } } // namespace kernel_float -#endif \ No newline at end of file +#endif diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index be0ce41..2f8e870 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -7,7 +7,8 @@ #include #include "binops.h" -#include "tensor.h" +#include "reduce.h" +#include "vector.h" namespace kernel_float { KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__nv_bfloat16) @@ -15,12 +16,13 @@ KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) template<> -struct into_tensor_traits<__nv_bfloat162> { - using type = tensor<__nv_bfloat16, extents<2>>; +struct into_vector_traits<__nv_bfloat162> { + using value_type = __nv_bfloat16; + using extent_type = extent<2>; KERNEL_FLOAT_INLINE - static type call(__nv_bfloat162 input) { - return tensor_storage<__nv_bfloat16, 2> {input.x, input.y}; + static vector_storage<__nv_bfloat16, 2> call(__nv_bfloat162 input) { + return {input.x, input.y}; } }; @@ -47,20 +49,20 @@ struct zip_bfloat16x2 { template struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> - call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { - tensor_storage<__nv_bfloat16, N> result; + KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N> + call(F fun, const vector_storage<__nv_bfloat16, N>& input) { + vector_storage<__nv_bfloat16, N> result; #pragma unroll for (size_t i = 0; i < N; i += 2) { - __nv_bfloat162 a = {input[i], input[i + 1]}; + __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]}; __nv_bfloat162 b = map_bfloat16x2::call(fun, a); - result[i + 0] = b.x; - result[i + 1] = b.y; + result.data()[i + 0] = b.x; + result.data()[i + 1] = b.y; } if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); + result.data()[N - 1] = fun(input.data()[N - 1]); } return result; @@ -69,22 +71,22 @@ struct apply_impl { template struct apply_impl { - 
KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> call( + KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N> call( F fun, - const tensor_storage<__nv_bfloat16, N>& left, - const tensor_storage<__nv_bfloat16, N>& right) { - tensor_storage<__nv_bfloat16, N> result; + const vector_storage<__nv_bfloat16, N>& left, + const vector_storage<__nv_bfloat16, N>& right) { + vector_storage<__nv_bfloat16, N> result; #pragma unroll for (size_t i = 0; i < N; i += 2) { - __nv_bfloat162 a = {left[i], left[i + 1]}; - __nv_bfloat162 b = {right[i], right[i + 1]}; + __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; + __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b); - result[i + 0] = c.x; - result[i + 1] = c.y; + result.data()[i + 0] = c.x; + result.data()[i + 1] = c.y; } if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], right[N - 1]); + result.data()[N - 1] = fun(left.data()[N - 1], right.data()[N - 1]); } return result; @@ -94,19 +96,19 @@ struct apply_impl { template struct reduce_helper= 2)>> { KERNEL_FLOAT_INLINE static __nv_bfloat16 - call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { - __nv_bfloat162 accum = {input[0], input[1]}; + call(F fun, const vector_storage<__nv_bfloat16, N>& input) { + __nv_bfloat162 accum = {input.data()[0], input.data()[1]}; #pragma unroll for (size_t i = 2; i < N; i += 2) { - __nv_bfloat162 a = {input[i], input[i + 1]}; + __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]}; accum = zip_bfloat16x2::call(fun, accum, a); } __nv_bfloat16 result = fun(accum.x, accum.y); if (N % 2 != 0) { - result = fun(result, input[N - 1]); + result = fun(result, input.data()[N - 1]); } return result; diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 0141a0a..db589b4 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -7,39 +7,40 @@ namespace kernel_float { template -using zip_type = - tensor, tensor_value_type>, broadcast_tensor_extents>; +using zip_type = vector< + result_t, vector_value_type>, + broadcast_vector_extent_type>; template KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) { - using A = tensor_value_type; - using B = tensor_value_type; + using A = vector_value_type; + using B = vector_value_type; using O = result_t; - using E = broadcast_tensor_extents; + using E = broadcast_vector_extent_type; - return detail::apply_impl::call( + return detail::apply_impl::call( fun, broadcast(left).storage(), broadcast(right).storage()); } template -using zip_common_type = tensor< - result_t, promoted_tensor_value_type>, - broadcast_tensor_extents>; +using zip_common_type = vector< + result_t, promoted_vector_value_type>, + broadcast_vector_extent_type>; template KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, const R& right) { - using T = promoted_tensor_value_type; + using T = promoted_vector_value_type; using O = result_t; - using E = broadcast_tensor_extents; + using E = broadcast_vector_extent_type; - return detail::apply_impl::call( + return detail::apply_impl::call( fun, - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(left)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(right))); + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(left)), + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(right))); } #define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR) \ @@ -51,7 +52,7 @@ 
KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co } \ }; \ } \ - template> \ + template> \ KERNEL_FLOAT_INLINE zip_common_type, L, R> NAME(L&& left, R&& right) { \ return zip_common(ops::NAME {}, std::forward(left), std::forward(right)); \ } @@ -59,21 +60,21 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co #define KERNEL_FLOAT_DEFINE_BINARY_OP(NAME, OP) \ KERNEL_FLOAT_DEFINE_BINARY(NAME, left OP right) \ template, typename E1, typename E2> \ - KERNEL_FLOAT_INLINE zip_common_type, tensor, tensor> operator OP( \ - const tensor& left, \ - const tensor& right) { \ + KERNEL_FLOAT_INLINE zip_common_type, vector, vector> operator OP( \ + const vector& left, \ + const vector& right) { \ return zip_common(ops::NAME {}, left, right); \ } \ - template>, typename E> \ - KERNEL_FLOAT_INLINE zip_common_type, tensor, R> operator OP( \ - const tensor& left, \ + template>, typename E> \ + KERNEL_FLOAT_INLINE zip_common_type, vector, R> operator OP( \ + const vector& left, \ const R& right) { \ return zip_common(ops::NAME {}, left, right); \ } \ - template, R>, typename E> \ - KERNEL_FLOAT_INLINE zip_common_type, L, tensor> operator OP( \ + template, R>, typename E> \ + KERNEL_FLOAT_INLINE zip_common_type, L, vector> operator OP( \ const L& left, \ - const tensor& right) { \ + const vector& right) { \ return zip_common(ops::NAME {}, left, right); \ } @@ -96,13 +97,13 @@ KERNEL_FLOAT_DEFINE_BINARY_OP(bit_xor, ^) // clang-format off template typename F, typename T, typename E, typename R> -static constexpr bool is_tensor_assign_allowed = - is_tensor_broadcastable && +static constexpr bool is_vector_assign_allowed = + is_vector_broadcastable && is_implicit_convertible< result_t< - F>>, + F>>, T, - tensor_value_type + vector_value_type >, T >; @@ -113,8 +114,8 @@ static constexpr bool is_tensor_assign_allowed = typename T, \ typename E, \ typename R, \ - typename = enabled_t>> \ - KERNEL_FLOAT_INLINE tensor& operator OP(tensor& lhs, const R& rhs) { \ + typename = enabled_t>> \ + KERNEL_FLOAT_INLINE vector& operator OP(vector& lhs, const R& rhs) { \ using F = ops::NAME; \ lhs = zip_common(F {}, lhs, rhs); \ return lhs; \ @@ -223,4 +224,4 @@ struct bit_xor { } // namespace kernel_float -#endif \ No newline at end of file +#endif diff --git a/include/kernel_float/broadcast.h b/include/kernel_float/broadcast.h index 596bb27..aac1a45 100644 --- a/include/kernel_float/broadcast.h +++ b/include/kernel_float/broadcast.h @@ -7,237 +7,109 @@ namespace kernel_float { namespace detail { -template -struct unify_dimension_helper; - -template<> -struct unify_dimension_helper<1, 1> { - static constexpr size_t value = 1; -}; +template +struct broadcast_extent_helper; template -struct unify_dimension_helper { - static constexpr size_t value = N; +struct broadcast_extent_helper, extent> { + using type = extent; }; template -struct unify_dimension_helper { - static constexpr size_t value = N; +struct broadcast_extent_helper, extent> { + using type = extent; }; template -struct unify_dimension_helper<1, N> { - static constexpr size_t value = N; -}; - -template -struct unify_extents_helper; - -template -struct unify_extents_helper, extents> { - using type = extents::value...>; -}; - -template -struct extents_to_rank { - using type = E; -}; - -template -struct extents_to_rank, N, enabled_t<(sizeof...(Ns) < N)>>: - extents_to_rank, N> {}; - -template -struct broadcast_extents_helper { - using type = typename unify_extents_helper< - typename extents_to_rank::type, // - typename 
extents_to_rank::type // - >::type; +struct broadcast_extent_helper, extent<1>> { + using type = extent; }; -template -struct broadcast_extents_helper { - using type = E; +template<> +struct broadcast_extent_helper, extent<1>> { + using type = extent<1>; }; } // namespace detail template -using broadcast_extents = typename detail::broadcast_extents_helper::type; +using broadcast_extent = typename detail::broadcast_extent_helper::type; template -using broadcast_tensor_extents = broadcast_extents, tensor_extents>; +using broadcast_vector_extent_type = broadcast_extent, vector_extent_type>; template -static constexpr bool is_broadcastable = is_same, To>; +static constexpr bool is_broadcastable = is_same, To>; template -static constexpr bool is_tensor_broadcastable = is_broadcastable, To>; +static constexpr bool is_vector_broadcastable = is_broadcastable, To>; namespace detail { -template -struct copy_helper; - -template -struct copy_helper, IS, OS> { - template - KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { - ndindex<0> x; - size_t input_index = IS::call(x); - size_t output_index = OS::call(x); - output[output_index] = input[input_index]; - } -}; - -template -struct copy_helper, IS, OS> { - template - KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { - for (size_t i = 0; i < N; i++) { - ndindex<1> x = {i}; - size_t input_index = IS::call(x); - size_t output_index = OS::call(x); - output[output_index] = input[input_index]; - } - } -}; - -template -struct copy_helper, IS, OS> { - template - KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { - for (size_t i = 0; i < N; i++) { - for (size_t j = 0; j < M; j++) { - ndindex<2> x = {i, j}; - size_t input_index = IS::call(x); - size_t output_index = OS::call(x); - output[output_index] = input[input_index]; - } - } - } -}; +template +struct broadcast_impl; -template -struct copy_helper, IS, OS> { - template - KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(const vector_storage& input) { + vector_storage output; for (size_t i = 0; i < N; i++) { - for (size_t j = 0; j < M; j++) { - for (size_t k = 0; k < K; k++) { - ndindex<3> x = {i, j, k}; - size_t input_index = IS::call(x); - size_t output_index = OS::call(x); - output[output_index] = input[input_index]; - } - } + output.data()[i] = input.data()[0]; } - } -}; - -template -struct strides_helper; - -template<> -struct strides_helper> { - KERNEL_FLOAT_INLINE - static size_t call(ndindex<0>) { - return 0; - } -}; - -template -struct strides_helper> { - KERNEL_FLOAT_INLINE - static size_t call(ndindex<1> x) { - return (N != 1 ? x[0] : 0); - } -}; - -template -struct strides_helper> { - KERNEL_FLOAT_INLINE - static size_t call(ndindex<2> x) { - return (N != 1 ? x[0] * M : 0) + // - (M != 1 ? x[1] : 0); - } -}; - -template -struct strides_helper> { - KERNEL_FLOAT_INLINE - static size_t call(ndindex<3> x) { - return (N != 1 ? x[0] * M * K : 0) + // - (M != 1 ? x[1] * K : 0) + // - (K != 1 ? 
x[2] : 0); - } -}; - -template -struct broadcast_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(tensor_storage input) { - static_assert(is_broadcastable, "cannot broadcast to required shape"); - using IS = strides_helper::type>; - using OS = strides_helper; - - tensor_storage output; - copy_helper::call(output.data(), input.data()); return output; } }; -template -struct broadcast_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(tensor_storage input) { +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { return input; } }; } // namespace detail -template -KERNEL_FLOAT_INLINE tensor, extents> -broadcast(const V& input, extents new_extents = {}) { - using T = tensor_value_type; - return detail::broadcast_impl, extents>::call( - into_tensor(input).storage()); +template +KERNEL_FLOAT_INLINE vector, extent> +broadcast(const V& input, extent new_size = {}) { + using T = vector_value_type; + return detail::broadcast_impl, extent>::call( + into_vector(input).storage()); } template -KERNEL_FLOAT_INLINE tensor, tensor_extents> +KERNEL_FLOAT_INLINE vector, vector_extent_type> broadcast_like(const V& input, const R&) { - using T = tensor_value_type; - return detail::broadcast_impl, tensor_extents>::call( - into_tensor(input).storage()); + using T = vector_value_type; + return detail::broadcast_impl, vector_extent_type>::call( + into_vector(input).storage()); } -template -KERNEL_FLOAT_INLINE tensor> fill(T value = {}, extents = {}) { - tensor_storage input = {value}; - return detail::broadcast_impl, extents>::call(input); +template +KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { + vector_storage input = {value}; + return detail::broadcast_impl, extent>::call(input); } -template -KERNEL_FLOAT_INLINE tensor> zeros(extents = {}) { - tensor_storage input = {T {}}; - return detail::broadcast_impl, extents>::call(input); +template +KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { + vector_storage input = {T {}}; + return detail::broadcast_impl, extent>::call(input); } -template -KERNEL_FLOAT_INLINE tensor> ones(extents = {}) { - tensor_storage input = {T {1}}; - return detail::broadcast_impl, extents>::call(input); +template +KERNEL_FLOAT_INLINE vector> ones(extent = {}) { + vector_storage input = {T {1}}; + return detail::broadcast_impl, extent>::call(input); } -template, typename E = tensor_extents> -KERNEL_FLOAT_INLINE tensor zeros_like(const V&) { +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector zeros_like(const V&) { return zeros(E {}); } -template, typename E = tensor_extents> -KERNEL_FLOAT_INLINE tensor ones_like(const V&) { +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector ones_like(const V&) { return ones(E {}); } @@ -245,10 +117,10 @@ namespace detail { template struct convert_helper { KERNEL_FLOAT_INLINE - static tensor_storage call(tensor_storage input) { + static vector_storage call(vector_storage input) { using F = ops::cast; - tensor_storage intermediate = - detail::apply_impl::call(F {}, input); + vector_storage intermediate = + detail::apply_impl::call(F {}, input); return detail::broadcast_impl::call(intermediate); } }; @@ -256,7 +128,7 @@ struct convert_helper { template struct convert_helper { KERNEL_FLOAT_INLINE - static tensor_storage call(tensor_storage input) { + static vector_storage call(vector_storage input) { return input; } }; @@ -264,7 +136,7 @@ struct convert_helper { template struct convert_helper { KERNEL_FLOAT_INLINE - static 
tensor_storage call(tensor_storage input) { + static vector_storage call(vector_storage input) { return detail::broadcast_impl::call(input); } }; @@ -272,23 +144,22 @@ struct convert_helper { template struct convert_helper { KERNEL_FLOAT_INLINE - static tensor_storage call(tensor_storage input) { + static vector_storage call(vector_storage input) { using F = ops::cast; - return detail::apply_impl::call(F {}, input); + return detail::apply_impl::call(F {}, input); } }; } // namespace detail /** - * Cast the values of the given input tensor to type `R` and then broadcast the result to the given shape `(Ns...)`. + * Cast the values of the given input vector to type `R` and then broadcast the result to the given size `N`. */ -template -KERNEL_FLOAT_INLINE tensor> -convert(const V& input, extents new_shape = {}) { - return detail::convert_helper, tensor_extents, R, extents, M>:: - call(into_tensor(input).storage()); +template +KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_size = {}) { + return detail::convert_helper, vector_extent_type, R, extent, M>:: + call(into_vector(input).storage()); } } // namespace kernel_float -#endif \ No newline at end of file +#endif diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index d64f436..ee90ebf 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -6,7 +6,7 @@ #if KERNEL_FLOAT_FP16_AVAILABLE #include -#include "tensor.h" +#include "vector.h" namespace kernel_float { KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) @@ -14,12 +14,13 @@ KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) template<> -struct into_tensor_traits<__half2> { - using type = tensor<__half, extents<2>>; +struct into_vector_traits<__half2> { + using value_type = __half; + using extent_type = extent<2>; KERNEL_FLOAT_INLINE - static type call(__half2 input) { - return tensor_storage<__half, 2> {input.x, input.y}; + static vector_storage<__half, 2> call(__half2 input) { + return {input.x, input.y}; } }; @@ -46,20 +47,20 @@ struct zip_halfx2 { template struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__half, N> - call(F fun, const tensor_storage<__half, N>& input) { - tensor_storage<__half, N> result; + KERNEL_FLOAT_INLINE static vector_storage<__half, N> + call(F fun, const vector_storage<__half, N>& input) { + vector_storage<__half, N> result; #pragma unroll for (size_t i = 0; i < N; i += 2) { - __half2 a = {input[i], input[i + 1]}; + __half2 a = {input.data()[i], input.data()[i + 1]}; __half2 b = map_halfx2::call(fun, a); - result[i + 0] = b.x; - result[i + 1] = b.y; + result.data()[i + 0] = b.x; + result.data()[i + 1] = b.y; } if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); + result.data()[N - 1] = fun(input.data()[N - 1]); } return result; @@ -68,20 +69,20 @@ struct apply_impl { template struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__half, N> - call(F fun, const tensor_storage<__half, N>& left, const tensor_storage<__half, N>& right) { - tensor_storage<__half, N> result; + KERNEL_FLOAT_INLINE static vector_storage<__half, N> + call(F fun, const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { + vector_storage<__half, N> result; #pragma unroll for (size_t i = 0; i < N; i += 2) { - __half2 a = {left[i], left[i + 1]}; - __half2 b = {right[i], right[i + 1]}; + __half2 a = {left.data()[i], left.data()[i + 1]}; + __half2 b = {right.data()[i], right.data()[i + 1]}; __half2 c = zip_halfx2::call(fun, a, b); - result[i + 0] = 
c.x; - result[i + 1] = c.y; + result.data()[i + 0] = c.x; + result.data()[i + 1] = c.y; } if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], right[N - 1]); + result.data()[N - 1] = fun(left.data()[N - 1], right.data()[N - 1]); } return result; @@ -90,19 +91,19 @@ struct apply_impl { template struct reduce_helper= 2)>> { - KERNEL_FLOAT_INLINE static __half call(F fun, const tensor_storage<__half, N>& input) { - __half2 accum = {input[0], input[1]}; + KERNEL_FLOAT_INLINE static __half call(F fun, const vector_storage<__half, N>& input) { + __half2 accum = {input.data()[0], input.data()[1]}; #pragma unroll for (size_t i = 2; i < N; i += 2) { - __half2 a = {input[i], input[i + 1]}; + __half2 a = {input.data()[i], input.data()[i + 1]}; accum = zip_halfx2::call(fun, accum, a); } __half result = fun(accum.x, accum.y); if (N % 2 != 0) { - result = fun(result, input[N - 1]); + result = fun(result, input.data()[N - 1]); } return result; diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h index 04242d2..1bfb8a7 100644 --- a/include/kernel_float/meta.h +++ b/include/kernel_float/meta.h @@ -257,4 +257,4 @@ using enabled_t = typename detail::enabled_helper::type; } // namespace kernel_float -#endif \ No newline at end of file +#endif diff --git a/include/kernel_float/prelude.h b/include/kernel_float/prelude.h index 210f059..20f8598 100644 --- a/include/kernel_float/prelude.h +++ b/include/kernel_float/prelude.h @@ -2,23 +2,20 @@ #define KERNEL_FLOAT_PRELUDE_H #include "constant.h" -#include "tensor.h" +#include "vector.h" namespace kernel_float { namespace prelude { namespace kf = ::kernel_float; template -using kscalar = tensor>; +using kscalar = vector>; template -using kvec = tensor>; +using kvec = vector>; -template -using kmat = tensor>; - -template -using ktensor = tensor>; +template +using kvector = vector>; // clang-format off template using kvec1 = kvec; @@ -31,17 +28,16 @@ template using kvec7 = kvec; template using kvec8 = kvec; // clang-format on -#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ - using k##NAME = tensor>; \ - template \ - using k##NAME##N = tensor>; \ - using k##NAME##1 = vec; \ - using k##NAME##2 = vec; \ - using k##NAME##3 = vec; \ - using k##NAME##4 = vec; \ - using k##NAME##5 = vec; \ - using k##NAME##6 = vec; \ - using k##NAME##7 = vec; \ +#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ + template \ + using k##NAME = vector>; \ + using k##NAME##1 = vec; \ + using k##NAME##2 = vec; \ + using k##NAME##3 = vec; \ + using k##NAME##4 = vec; \ + using k##NAME##5 = vec; \ + using k##NAME##6 = vec; \ + using k##NAME##7 = vec; \ using k##NAME##8 = vec; KERNEL_FLOAT_TYPE_ALIAS(char, char) @@ -75,8 +71,8 @@ KERNEL_FLOAT_TYPE_ALIAS(bfloat16, __nv_bfloat16) KERNEL_FLOAT_TYPE_ALIAS(bf16, __nv_bfloat16) #endif -template -static constexpr extents kshape = {}; +template +static constexpr extent kextent = {}; template KERNEL_FLOAT_INLINE kvec, sizeof...(Args)> make_kvec(Args&&... 
args) { diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index c909ea0..3dc22ad 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -7,18 +7,18 @@ namespace kernel_float { namespace detail { template struct reduce_helper { - KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage& input) { + KERNEL_FLOAT_INLINE static T call(F fun, const vector_storage& input) { return call(fun, input, make_index_sequence {}); } private: template KERNEL_FLOAT_INLINE static T - call(F fun, const tensor_storage& input, index_sequence<0, Is...>) { - T result = input[0]; + call(F fun, const vector_storage& input, index_sequence<0, Is...>) { + T result = input.data()[0]; #pragma unroll for (size_t i = 1; i < N; i++) { - result = fun(result, input[i]); + result = fun(result, input.data()[i]); } return result; } @@ -40,10 +40,10 @@ struct reduce_helper { * ``` */ template -KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { - return detail::reduce_helper, tensor_value_type>::call( +KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { + return detail::reduce_helper, vector_value_type>::call( fun, - into_tensor_storage(input)); + into_vector_storage(input)); } /** @@ -56,7 +56,7 @@ KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { * int y = min(x); // Returns 0 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T min(const V& input) { return reduce(ops::min {}, input); } @@ -71,7 +71,7 @@ KERNEL_FLOAT_INLINE T min(const V& input) { * int y = max(x); // Returns 5 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T max(const V& input) { return reduce(ops::max {}, input); } @@ -86,7 +86,7 @@ KERNEL_FLOAT_INLINE T max(const V& input) { * int y = sum(x); // Returns 8 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T sum(const V& input) { return reduce(ops::add {}, input); } @@ -102,7 +102,7 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { * int y = dot(x, y); // Returns 1*4+2*5+3*6 = 32 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { return reduce(ops::add {}, zip_common(ops::multiply {}, left, right)); } @@ -117,7 +117,7 @@ KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { * int y = sum(x); // Returns 5*0*2*1*0 = 0 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T product(const V& input) { return reduce(ops::multiply {}, input); } diff --git a/include/kernel_float/tensor.h b/include/kernel_float/tensor.h deleted file mode 100644 index 89ba96e..0000000 --- a/include/kernel_float/tensor.h +++ /dev/null @@ -1,285 +0,0 @@ -#ifndef KERNEL_FLOAT_BASE_H -#define KERNEL_FLOAT_BASE_H - -#include "base.h" -#include "broadcast.h" -#include "macros.h" -#include "reduce.h" -#include "unops.h" - -namespace kernel_float { - -template -struct tensor_extension {}; - -template class S> -struct tensor: tensor_extension, T, E::volume> { - static constexpr size_t rank = E::rank; - static constexpr size_t volume = E::volume; - - using value_type = T; - using extents_type = E; - using ndindex_type = ndindex; - using storage_type = S; - - KERNEL_FLOAT_INLINE - static constexpr size_t size() { - return E::volume; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return E::size(axis); - } - - KERNEL_FLOAT_INLINE - static constexpr extents_type shape() { - return {}; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return E::stride(axis); - } - - KERNEL_FLOAT_INLINE - static constexpr size_t linearize_index(ndindex_type 
index) { - return E::ravel_index(index); - } - - tensor(const tensor&) = default; - - KERNEL_FLOAT_INLINE - tensor(storage_type storage) : storage_(storage) {} - - template= 2, int> = 0> - KERNEL_FLOAT_INLINE tensor(Args&&... args) : storage_ {std::forward(args)...} {} - - template< - typename U, - typename F, - enabled_t< - is_implicit_convertible && is_tensor_broadcastable, - int> = 0> - KERNEL_FLOAT_INLINE tensor(const tensor& input) : - tensor(convert(input, extents_type {})) {} - - template< - typename U, - typename F, - enabled_t< - !is_implicit_convertible && is_tensor_broadcastable, - int> = 0> - explicit KERNEL_FLOAT_INLINE tensor(const tensor& input) : - tensor(convert(input, extents_type {})) {} - - KERNEL_FLOAT_INLINE tensor(const value_type& input = {}) : - tensor(convert(input, extents_type {})) {} - - KERNEL_FLOAT_INLINE - storage_type& storage() { - return storage_; - } - - KERNEL_FLOAT_INLINE - const storage_type& storage() const { - return storage_; - } - - KERNEL_FLOAT_INLINE - T* data() { - return storage_.data(); - } - - KERNEL_FLOAT_INLINE - const T* data() const { - return storage_.data(); - } - - KERNEL_FLOAT_INLINE - const T* cdata() const { - return storage_.data(); - } - - KERNEL_FLOAT_INLINE - T* begin() { - return storage_.data(); - } - - KERNEL_FLOAT_INLINE - const T* begin() const { - return storage_.data(); - } - - KERNEL_FLOAT_INLINE - const T* cbegin() const { - return storage_.data(); - } - - KERNEL_FLOAT_INLINE - T* end() { - return storage_.data() + E::volume; - } - - KERNEL_FLOAT_INLINE - const T* end() const { - return storage_.data() + E::volume; - } - - KERNEL_FLOAT_INLINE - const T* cend() const { - return storage_.data() + E::volume; - } - - KERNEL_FLOAT_INLINE - T& at(ndindex_type x) { - return *(data() + linearize_index(x)); - } - - KERNEL_FLOAT_INLINE - const T& at(ndindex_type x) const { - return *(data() + linearize_index(x)); - } - - KERNEL_FLOAT_INLINE - T get(ndindex_type x) const { - return at(x); - } - - KERNEL_FLOAT_INLINE - void set(ndindex_type x, T value) { - at(x) = std::move(value); - } - - KERNEL_FLOAT_INLINE - T& operator[](ndindex_type x) { - return at(x); - } - - KERNEL_FLOAT_INLINE - const T& operator[](ndindex_type x) const { - return at(x); - } - - KERNEL_FLOAT_INLINE - T& operator()(ndindex_type x) { - return at(x); - } - - KERNEL_FLOAT_INLINE - const T& operator()(ndindex_type x) const { - return at(x); - } - - KERNEL_FLOAT_INLINE - tensor> flatten() const { - return storage_; - } - - template - KERNEL_FLOAT_INLINE tensor> reshape(extents = {}) const { - static_assert(extents::volume == volume, "invalid reshape shape"); - return storage_; - } - - template - KERNEL_FLOAT_INLINE tensor> broadcast(extents new_shape = {}) const { - return kernel_float::broadcast(*this, new_shape); - } - - template - KERNEL_FLOAT_INLINE tensor, E> map(F fun = {}) const { - return kernel_float::map(fun, *this); - } - - template - KERNEL_FLOAT_INLINE T reduce(F fun = {}) const { - return kernel_float::reduce(fun, *this); - } - - private: - storage_type storage_; -}; - -template -struct tensor_extension { - KERNEL_FLOAT_INLINE - T get() const { - return static_cast(this)->get({}); - } - - KERNEL_FLOAT_INLINE - void set(T value) { - static_cast(this)->set({}, value); - } - - KERNEL_FLOAT_INLINE - operator T() const { - return get(); - } -}; - -#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ - template<> \ - struct into_tensor_traits<::T2> { \ - using type = tensor>; \ - \ - KERNEL_FLOAT_INLINE \ - static type call(::T2 v) { \ - return 
tensor_storage {v.x, v.y}; \ - } \ - }; \ - \ - template<> \ - struct into_tensor_traits<::T3> { \ - using type = tensor>; \ - \ - KERNEL_FLOAT_INLINE \ - static type call(::T3 v) { \ - return tensor_storage {v.x, v.y, v.z}; \ - } \ - }; \ - \ - template<> \ - struct into_tensor_traits<::T4> { \ - using type = tensor>; \ - \ - KERNEL_FLOAT_INLINE \ - static type call(::T4 v) { \ - return tensor_storage {v.x, v.y, v.z, v.w}; \ - } \ - }; - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) - -template -using scalar = tensor>; - -template -using vec = tensor>; - -template -using mat = tensor>; - -template -KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... args) { - using T = promote_t; - return tensor_storage {T {args}...}; -}; - -} // namespace kernel_float - -#endif diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h index 4445468..9e059ad 100644 --- a/include/kernel_float/triops.h +++ b/include/kernel_float/triops.h @@ -22,64 +22,64 @@ struct conditional { /** * Return elements chosen from `true_values` and `false_values` depending on `cond`. * - * This function broadcasts all arguments to the same shape and it promotes the values of `true_values` and - * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a tensor where + * This function broadcasts all arguments to the same size and it promotes the values of `true_values` and + * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a vector where * the values are taken from `true_values` if the condition is true and `false_values` otherwise. * * @param cond The condition used for selection. - * @param true_values The tensor of values to choose from when the condition is true. - * @param false_values The tensor of values to choose from when the condition is false. - * @return A tensor containing selected elements as per the condition. + * @param true_values The vector of values to choose from when the condition is true. + * @param false_values The vector of values to choose from when the condition is false. + * @return A vector containing selected elements as per the condition. 
*/ template< typename C, typename L, typename R, - typename T = promoted_tensor_value_type, - typename E = broadcast_extents, broadcast_tensor_extents>> -KERNEL_FLOAT_INLINE tensor where(const C& cond, const L& true_values, const R& false_values) { + typename T = promoted_vector_value_type, + typename E = broadcast_extent, broadcast_vector_extent_type>> +KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { using F = ops::conditional; - return detail::apply_impl::call( + return detail::apply_impl::call( F {}, - detail::convert_helper, tensor_extents, bool, E>::call( - into_tensor_storage(cond)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(true_values)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(false_values))); + detail::convert_helper, vector_extent_type, bool, E>::call( + into_vector_storage(cond)), + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(true_values)), + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(false_values))); } /** * Selects elements from `true_values` depending on `cond`. * - * This function returns a tensor where the values are taken from `true_values` where `cond` is `true` and `0` where + * This function returns a vector where the values are taken from `true_values` where `cond` is `true` and `0` where * `cond is `false`. * * @param cond The condition used for selection. - * @param true_values The tensor of values to choose from when the condition is true. - * @return A tensor containing selected elements as per the condition. + * @param true_values The vector of values to choose from when the condition is true. + * @return A vector containing selected elements as per the condition. */ template< typename C, typename L, - typename T = tensor_value_type, - typename E = broadcast_extents, tensor_extents>> -KERNEL_FLOAT_INLINE tensor where(const C& cond, const L& true_values) { - tensor> false_values = T {}; + typename T = vector_value_type, + typename E = broadcast_extent, vector_extent_type>> +KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { + vector> false_values = T {}; return where(cond, true_values, false_values); } /** - * Returns a tensor where the values are `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. + * Returns a vector where the values are `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. * * @param cond The condition used for selection. - * @return A tensor containing elements as per the condition. + * @return A vector containing elements as per the condition. 
*/ -template> -KERNEL_FLOAT_INLINE tensor where(const C& cond) { - tensor> true_values = T {true}; - tensor> false_values = T {false}; +template> +KERNEL_FLOAT_INLINE vector where(const C& cond) { + vector> true_values = T {true}; + vector> false_values = T {false}; return where(cond, true_values, false_values); } @@ -115,19 +115,19 @@ template< typename A, typename B, typename C, - typename T = promoted_tensor_value_type, - typename E = broadcast_extents, broadcast_tensor_extents>> -KERNEL_FLOAT_INLINE tensor fma(const A& a, const B& b, const C& c) { + typename T = promoted_vector_value_type, + typename E = broadcast_extent, broadcast_vector_extent_type>> +KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { using F = ops::fma; - return detail::apply_impl::call( + return detail::apply_impl::call( F {}, - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(a)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(b)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(c))); + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(a)), + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(b)), + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(c))); } } // namespace kernel_float diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h index 72cac18..94e3f11 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -8,13 +8,13 @@ namespace detail { template struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(F fun, const tensor_storage&... inputs) { - tensor_storage result; + KERNEL_FLOAT_INLINE static vector_storage + call(F fun, const vector_storage&... 
inputs) { + vector_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result[i] = fun(inputs[i]...); + result.data()[i] = fun(inputs.data()[i]...); } return result; @@ -23,15 +23,15 @@ struct apply_impl { } // namespace detail template -using map_type = tensor>, tensor_extents>; +using map_type = vector>, vector_extent_type>; template KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { - using Input = tensor_value_type; + using Input = vector_value_type; using Output = result_t; - return detail::apply_impl, Output, Input>::call( + return detail::apply_impl, Output, Input>::call( fun, - into_tensor(input).storage()); + into_vector(input).storage()); } #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ @@ -44,15 +44,15 @@ KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { }; \ } \ template \ - KERNEL_FLOAT_INLINE into_tensor_type NAME(const V& input) { \ - using F = ops::NAME>; \ + KERNEL_FLOAT_INLINE into_vector_type NAME(const V& input) { \ + using F = ops::NAME>; \ return map(F {}, input); \ } #define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ template \ - KERNEL_FLOAT_INLINE tensor operator OP(const tensor& vec) { \ + KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ return NAME(vec); \ } @@ -161,10 +161,10 @@ struct cast { } // namespace ops template -KERNEL_FLOAT_INLINE tensor> cast(const V& input) { - using F = ops::cast, R, Mode>; +KERNEL_FLOAT_INLINE vector> cast(const V& input) { + using F = ops::cast, R, Mode>; return map(F {}, input); } } // namespace kernel_float -#endif //KERNEL_FLOAT_UNOPS_H \ No newline at end of file +#endif //KERNEL_FLOAT_UNOPS_H diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h new file mode 100644 index 0000000..9e6539a --- /dev/null +++ b/include/kernel_float/vector.h @@ -0,0 +1,250 @@ +#ifndef KERNEL_FLOAT_VECTOR_H +#define KERNEL_FLOAT_VECTOR_H + +#include "base.h" +#include "broadcast.h" +#include "macros.h" +#include "reduce.h" +#include "unops.h" + +namespace kernel_float { + +template +struct vector_extension {}; + +template class S> +struct vector: vector_extension, T, E::value> { + using value_type = T; + using extent_type = E; + using storage_type = S; + + KERNEL_FLOAT_INLINE + static constexpr size_t size() { + return E::value; + } + + vector(const vector&) = default; + + KERNEL_FLOAT_INLINE + vector(storage_type storage) : storage_(storage) {} + + template= 2, int> = 0> + KERNEL_FLOAT_INLINE vector(Args&&... 
args) : storage_ {std::forward(args)...} {} + + template< + typename U, + typename F, + enabled_t< + is_implicit_convertible && is_vector_broadcastable, + int> = 0> + KERNEL_FLOAT_INLINE vector(const vector& input) : + vector(convert(input, extent_type {})) {} + + template< + typename U, + typename F, + enabled_t< + !is_implicit_convertible && is_vector_broadcastable, + int> = 0> + explicit KERNEL_FLOAT_INLINE vector(const vector& input) : + vector(convert(input, extent_type {})) {} + + KERNEL_FLOAT_INLINE vector(const value_type& input = {}) : + vector(convert(input, extent_type {})) {} + + KERNEL_FLOAT_INLINE + storage_type& storage() { + return storage_; + } + + KERNEL_FLOAT_INLINE + const storage_type& storage() const { + return storage_; + } + + KERNEL_FLOAT_INLINE + T* data() { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + const T* data() const { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + const T* cdata() const { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + T* begin() { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + const T* begin() const { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + const T* cbegin() const { + return storage_.data(); + } + + KERNEL_FLOAT_INLINE + T* end() { + return storage_.data() + size(); + } + + KERNEL_FLOAT_INLINE + const T* end() const { + return storage_.data() + size(); + } + + KERNEL_FLOAT_INLINE + const T* cend() const { + return storage_.data() + size(); + } + + KERNEL_FLOAT_INLINE + T& at(size_t x) { + return *(data() + x); + } + + KERNEL_FLOAT_INLINE + const T& at(size_t x) const { + return *(data() + x); + } + + KERNEL_FLOAT_INLINE + T get(size_t x) const { + return at(x); + } + + KERNEL_FLOAT_INLINE + void set(size_t x, T value) { + at(x) = std::move(value); + } + + KERNEL_FLOAT_INLINE + T& operator[](size_t x) { + return at(x); + } + + KERNEL_FLOAT_INLINE + const T& operator[](size_t x) const { + return at(x); + } + + KERNEL_FLOAT_INLINE + T& operator()(size_t x) { + return at(x); + } + + KERNEL_FLOAT_INLINE + const T& operator()(size_t x) const { + return at(x); + } + + template + KERNEL_FLOAT_INLINE vector> broadcast(extent new_size = {}) const { + return kernel_float::broadcast(*this, new_size); + } + + template + KERNEL_FLOAT_INLINE vector, E> map(F fun = {}) const { + return kernel_float::map(fun, *this); + } + + template + KERNEL_FLOAT_INLINE T reduce(F fun = {}) const { + return kernel_float::reduce(fun, *this); + } + + private: + storage_type storage_; +}; + +template +struct vector_extension { + KERNEL_FLOAT_INLINE + T get() const { + return static_cast(this)->get({}); + } + + KERNEL_FLOAT_INLINE + void set(T value) { + static_cast(this)->set({}, value); + } + + KERNEL_FLOAT_INLINE + operator T() const { + return get(); + } +}; + +#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ + template<> \ + struct into_vector_traits<::T2> { \ + using value_type = T; \ + using extent_type = extent<2>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T2 v) { \ + return {v.x, v.y}; \ + } \ + }; \ + \ + template<> \ + struct into_vector_traits<::T3> { \ + using value_type = T; \ + using extent_type = extent<3>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T3 v) { \ + return {v.x, v.y, v.z}; \ + } \ + }; \ + \ + template<> \ + struct into_vector_traits<::T4> { \ + using value_type = T; \ + using extent_type = extent<4>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T4 v) { \ + return {v.x, v.y, v.z, v.w}; \ + } \ + }; + 
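+
+// The macro above specializes `into_vector_traits` for one family of CUDA's
+// built-in vector types; the invocations below register each family. A short
+// illustrative sketch of what this enables (the values are hypothetical):
+//
+//   float4 raw = make_float4(1.0f, 2.0f, 3.0f, 4.0f);
+//   vec<float, 4> v = raw;    // converted through into_vector_traits<float4>
+//   float first = v[0];
+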
+KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) + +template +using scalar = vector>; + +template +using vec = vector>; + +template +KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... args) { + using T = promote_t; + return vector_storage {T {args}...}; +}; + +} // namespace kernel_float + +#endif diff --git a/tests/broadcast.cu b/tests/broadcast.cu index 205dd0f..aece5f7 100644 --- a/tests/broadcast.cu +++ b/tests/broadcast.cu @@ -12,7 +12,7 @@ struct broadcast_test; template struct broadcast_test, std::index_sequence> { - __host__ __device__ void operator()(generator gen) { + __host__ __device__ void operator()(generator gen) { /* { kf::tensor> x = gen.next(); T y = gen.next(); @@ -58,7 +58,7 @@ struct broadcast_test, std::index_sequence> x; kf::tensor> y = x; - } + }*/ } }; From e503d23594419ae4340e817c6d86f8a5485a657f Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 1 Aug 2023 16:04:07 +0200 Subject: [PATCH 15/50] Clean up of vector constructors --- include/kernel_float/base.h | 145 +++- include/kernel_float/broadcast.h | 13 +- include/kernel_float/complex.h | 2 +- include/kernel_float/unops.h | 2 +- include/kernel_float/vector.h | 119 ++- single_include/kernel_float.h | 1357 ++++++++++++------------------ 6 files changed, 717 insertions(+), 921 deletions(-) diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h index 33ee2f1..c194186 100644 --- a/include/kernel_float/base.h +++ b/include/kernel_float/base.h @@ -18,45 +18,129 @@ struct alignas(Alignment) aligned_array { return items_; } - T items_[N]; + T items_[N] = {}; }; template -struct aligned_array { +struct aligned_array { KERNEL_FLOAT_INLINE - aligned_array(T value = {}) : value_(value) {} + T* data() { + while (true) + ; + } + + KERNEL_FLOAT_INLINE + const T* data() const { + while (true) + ; + } +}; + +template +struct alignas(Alignment) aligned_array { + KERNEL_FLOAT_INLINE + aligned_array(T value = {}) : x(value) {} KERNEL_FLOAT_INLINE operator T() const { - return value_; + return x; } KERNEL_FLOAT_INLINE T* data() { - return &value_; + return &x; } KERNEL_FLOAT_INLINE const T* data() const { - return &value_; + return &x; } - T value_; + T x; }; template -struct aligned_array { +struct alignas(Alignment) aligned_array { + KERNEL_FLOAT_INLINE + aligned_array(T x, T y) : x(x), y(y) {} + + KERNEL_FLOAT_INLINE + aligned_array() : aligned_array(T {}, T {}) {} + KERNEL_FLOAT_INLINE T* data() { - while (true) - ; + return items; } KERNEL_FLOAT_INLINE const T* data() const { - while (true) - ; + return items; } + + union { + T items[2]; + struct { + T x; + T y; + }; + }; +}; + +template +struct alignas(Alignment) aligned_array { + 
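+    // Three-element specialization: the union below overlays items[3] with
+    // named x/y/z members, mirroring the layout of CUDA's built-in 3-wide types.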
KERNEL_FLOAT_INLINE + aligned_array(T x, T y, T z) : x(x), y(y), z(z) {} + + KERNEL_FLOAT_INLINE + aligned_array() : aligned_array(T {}, T {}, T {}) {} + + KERNEL_FLOAT_INLINE + T* data() { + return items; + } + + KERNEL_FLOAT_INLINE + const T* data() const { + return items; + } + + union { + T items[3]; + struct { + T x; + T y; + T z; + }; + }; +}; + +template +struct alignas(Alignment) aligned_array { + KERNEL_FLOAT_INLINE + aligned_array(T x, T y, T z, T w) : x(x), y(y), z(z), w(w) {} + + KERNEL_FLOAT_INLINE + aligned_array() : aligned_array(T {}, T {}, T {}, T {}) {} + + KERNEL_FLOAT_INLINE + T* data() { + return items; + } + + KERNEL_FLOAT_INLINE + const T* data() const { + return items; + } + + union { + T items[4]; + struct { + T x; + T y; + T z; + T w; + }; + }; }; KERNEL_FLOAT_INLINE @@ -79,9 +163,6 @@ static constexpr size_t compute_max_alignment(size_t total_size, size_t min_alig template using vector_storage = aligned_array; -template class S = vector_storage> -struct vector; - template struct extent { static constexpr size_t value = N; @@ -111,17 +192,6 @@ struct into_vector_traits: into_vector_traits {}; template struct into_vector_traits: into_vector_traits {}; -template class S> -struct into_vector_traits> { - using value_type = T; - using extent_type = E; - - KERNEL_FLOAT_INLINE - static vector_storage call(const vector& input) { - return input.storage(); - } -}; - template struct into_vector_traits> { using value_type = T; @@ -136,11 +206,25 @@ struct into_vector_traits> { template struct vector_traits; -template class S> +template> +struct vector; + +template +struct into_vector_traits> { + using value_type = T; + using extent_type = E; + + KERNEL_FLOAT_INLINE + static vector_storage call(const vector& input) { + return input.storage(); + } +}; + +template struct vector_traits> { using value_type = T; using extent_type = E; - using storage_type = S; + using storage_type = S; using vector_type = vector; }; @@ -167,11 +251,6 @@ KERNEL_FLOAT_INLINE vector_storage_type into_vector_storage(V&& input) { return into_vector_traits::call(std::forward(input)); } -template -KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { - return into_vector_traits::call(std::forward(input)); -} - } // namespace kernel_float #endif diff --git a/include/kernel_float/broadcast.h b/include/kernel_float/broadcast.h index aac1a45..aa6fa9d 100644 --- a/include/kernel_float/broadcast.h +++ b/include/kernel_float/broadcast.h @@ -74,7 +74,7 @@ KERNEL_FLOAT_INLINE vector, extent> broadcast(const V& input, extent new_size = {}) { using T = vector_value_type; return detail::broadcast_impl, extent>::call( - into_vector(input).storage()); + into_vector_storage(input)); } template @@ -82,7 +82,7 @@ KERNEL_FLOAT_INLINE vector, vector_extent_type> broadcast_like(const V& input, const R&) { using T = vector_value_type; return detail::broadcast_impl, vector_extent_type>::call( - into_vector(input).storage()); + into_vector_storage(input)); } template @@ -151,13 +151,18 @@ struct convert_helper { }; } // namespace detail +template +KERNEL_FLOAT_INLINE vector_storage convert_storage(const V& input, extent new_size = {}) { + return detail::convert_helper, vector_extent_type, R, extent, M>:: + call(into_vector_storage(input)); +} + /** * Cast the values of the given input vector to type `R` and then broadcast the result to the given size `N`. 
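 *
 * Example (illustrative only):
 * ```
 * vec<float, 1> x = {3.5f};
 * vec<int, 4> y = convert<int, 4>(x);  // casts to int, then broadcasts to size 4
 * ```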
 */
template
KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_size = {}) {
-    return detail::convert_helper, vector_extent_type, R, extent, M>::
-        call(into_vector(input).storage());
+    return convert_storage(input);
}
} // namespace kernel_float
diff --git a/include/kernel_float/complex.h b/include/kernel_float/complex.h
index ca55076..a2d2062 100644
--- a/include/kernel_float/complex.h
+++ b/include/kernel_float/complex.h
@@ -177,7 +177,7 @@ KERNEL_FLOAT_INLINE T real(complex_type v) {

 template
 KERNEL_FLOAT_INLINE T imag(complex_type v) {
-    return v.real();
+    return v.imag();
 }

 template
diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h
index 94e3f11..eb1d305 100644
--- a/include/kernel_float/unops.h
+++ b/include/kernel_float/unops.h
@@ -31,7 +31,7 @@ KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) {
     using Output = result_t;
     return detail::apply_impl, Output, Input>::call(
         fun,
-        into_vector(input).storage());
+        into_vector_storage(input));
 }

 #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \
diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h
index 9e6539a..5ad09a0 100644
--- a/include/kernel_float/vector.h
+++ b/include/kernel_float/vector.h
@@ -9,112 +9,99 @@

 namespace kernel_float {

-template
-struct vector_extension {};
-
-template class S>
-struct vector: vector_extension, T, E::value> {
+template
+struct vector: S {
     using value_type = T;
     using extent_type = E;
-    using storage_type = S;
-
-    KERNEL_FLOAT_INLINE
-    static constexpr size_t size() {
-        return E::value;
-    }
+    using storage_type = S;

+    // Copy another `vector`
     vector(const vector&) = default;

+    // Copy anything of type `storage_type`
     KERNEL_FLOAT_INLINE
-    vector(storage_type storage) : storage_(storage) {}
+    vector(const storage_type& storage) : storage_type(storage) {}

-    template= 2, int> = 0>
-    KERNEL_FLOAT_INLINE vector(Args&&... args) : storage_ {std::forward(args)...} {}
+    // Broadcast a single `value_type` to every element
+    KERNEL_FLOAT_INLINE
+    vector(const value_type& input = {}) :
+        storage_type(detail::broadcast_impl, E>::call(input)) {}

-    template<
-        typename U,
-        typename F,
-        enabled_t<
-            is_implicit_convertible && is_vector_broadcastable,
-            int> = 0>
-    KERNEL_FLOAT_INLINE vector(const vector& input) :
-        vector(convert(input, extent_type {})) {}
+    // For all other arguments, we convert them using `convert_storage` according to the broadcast rules
+    template, T>, int> = 0>
+    KERNEL_FLOAT_INLINE vector(U&& input) : storage_type(convert_storage(input)) {}

-    template<
-        typename U,
-        typename F,
-        enabled_t<
-            !is_implicit_convertible && is_vector_broadcastable,
-            int> = 0>
-    explicit KERNEL_FLOAT_INLINE vector(const vector& input) :
-        vector(convert(input, extent_type {})) {}
+    template, T>, int> = 0>
+    KERNEL_FLOAT_INLINE explicit vector(U&& input) :
+        storage_type(convert_storage(input)) {}

-    KERNEL_FLOAT_INLINE vector(const value_type& input = {}) :
-        vector(convert(input, extent_type {})) {}
+    // A list of N arguments (N >= 2) is passed straight through to the storage
+    template<
+        typename A,
+        typename B,
+        typename... Rest,
+        typename = enabled_t>
+    KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... 
rest) : + storage_type {a, b, rest...} {} KERNEL_FLOAT_INLINE - const storage_type& storage() const { - return storage_; + static constexpr size_t size() { + return E::size; } KERNEL_FLOAT_INLINE - T* data() { - return storage_.data(); + storage_type& storage() { + return *this; } KERNEL_FLOAT_INLINE - const T* data() const { - return storage_.data(); + const storage_type& storage() const { + return *this; } KERNEL_FLOAT_INLINE const T* cdata() const { - return storage_.data(); + return this->data(); } KERNEL_FLOAT_INLINE T* begin() { - return storage_.data(); + return this->data(); } KERNEL_FLOAT_INLINE const T* begin() const { - return storage_.data(); + return this->data(); } KERNEL_FLOAT_INLINE const T* cbegin() const { - return storage_.data(); + return this->data(); } KERNEL_FLOAT_INLINE T* end() { - return storage_.data() + size(); + return this->data() + size(); } KERNEL_FLOAT_INLINE const T* end() const { - return storage_.data() + size(); + return this->data() + size(); } KERNEL_FLOAT_INLINE const T* cend() const { - return storage_.data() + size(); + return this->data() + size(); } KERNEL_FLOAT_INLINE T& at(size_t x) { - return *(data() + x); + return *(this->data() + x); } KERNEL_FLOAT_INLINE const T& at(size_t x) const { - return *(data() + x); + return *(this->data() + x); } KERNEL_FLOAT_INLINE @@ -147,6 +134,11 @@ struct vector: vector_extension, T, E::value> { return at(x); } + template + KERNEL_FLOAT_INLINE vector cast() const { + return kernel_float::cast(*this); + } + template KERNEL_FLOAT_INLINE vector> broadcast(extent new_size = {}) const { return kernel_float::broadcast(*this, new_size); @@ -166,24 +158,6 @@ struct vector: vector_extension, T, E::value> { storage_type storage_; }; -template -struct vector_extension { - KERNEL_FLOAT_INLINE - T get() const { - return static_cast(this)->get({}); - } - - KERNEL_FLOAT_INLINE - void set(T value) { - static_cast(this)->set({}, value); - } - - KERNEL_FLOAT_INLINE - operator T() const { - return get(); - } -}; - #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ template<> \ struct into_vector_traits<::T2> { \ @@ -218,6 +192,11 @@ struct vector_extension { } \ }; +template +KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { + return into_vector_traits::call(std::forward(input)); +} + KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 202adde..ceb0344 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-07-25 14:50:15.560873 -// git hash: df48350ff5f4362e8220188c09f48c37ba9d0335 +// date: 2023-08-01 16:03:54.596340 +// git hash: 0cf26708f3b17b2f74940e5806bad2aeaae4c076 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -307,8 +307,8 @@ using enabled_t = typename detail::enabled_helper::type; } // namespace kernel_float #endif -#ifndef KERNEL_FLOAT_BASE -#define KERNEL_FLOAT_BASE +#ifndef KERNEL_FLOAT_BASE_H +#define KERNEL_FLOAT_BASE_H @@ -316,7 +316,7 @@ using enabled_t = typename detail::enabled_helper::type; namespace kernel_float { template -struct alignas(Alignment) array { +struct alignas(Alignment) aligned_array { KERNEL_FLOAT_INLINE T* data() { return items_; @@ -327,81 +327,130 @@ struct alignas(Alignment) array { return items_; } + T items_[N] = {}; +}; + +template +struct aligned_array { KERNEL_FLOAT_INLINE - T& operator[](size_t i) { - return items_[i]; + T* data() { + while (true) + ; } KERNEL_FLOAT_INLINE - const T& operator[](size_t i) const { - return items_[i]; + const T* data() const { + while (true) + ; } - - T items_[N]; }; template -struct array { +struct alignas(Alignment) aligned_array { KERNEL_FLOAT_INLINE - array(T value = {}) : value_(value) {} + aligned_array(T value = {}) : x(value) {} KERNEL_FLOAT_INLINE operator T() const { - return value_; + return x; } KERNEL_FLOAT_INLINE T* data() { - return &value_; + return &x; } KERNEL_FLOAT_INLINE const T* data() const { - return &value_; + return &x; } + T x; +}; + +template +struct alignas(Alignment) aligned_array { KERNEL_FLOAT_INLINE - T& operator[](size_t) { - return value_; + aligned_array(T x, T y) : x(x), y(y) {} + + KERNEL_FLOAT_INLINE + aligned_array() : aligned_array(T {}, T {}) {} + + KERNEL_FLOAT_INLINE + T* data() { + return items; } KERNEL_FLOAT_INLINE - const T& operator[](size_t) const { - return value_; + const T* data() const { + return items; } - T value_; + union { + T items[2]; + struct { + T x; + T y; + }; + }; }; template -struct array { +struct alignas(Alignment) aligned_array { + KERNEL_FLOAT_INLINE + aligned_array(T x, T y, T z) : x(x), y(y), z(z) {} + + KERNEL_FLOAT_INLINE + aligned_array() : aligned_array(T {}, T {}, T {}) {} + KERNEL_FLOAT_INLINE T* data() { - while (true) - ; + return items; } KERNEL_FLOAT_INLINE const T* data() const { - while (true) - ; + return items; } + union { + T items[3]; + struct { + T x; + T y; + T z; + }; + }; +}; + +template +struct alignas(Alignment) aligned_array { KERNEL_FLOAT_INLINE - T& operator[](size_t i) { - while (true) - ; + aligned_array(T x, T y, T z, T w) : x(x), y(y), z(z), w(w) {} + + KERNEL_FLOAT_INLINE + aligned_array() : aligned_array(T {}, T {}, T {}, T {}) {} + + KERNEL_FLOAT_INLINE + T* data() { + return items; } KERNEL_FLOAT_INLINE - const T& operator[](size_t i) const { - while (true) - ; + const T* data() const { + return items; } -}; -template -using ndindex = array; + union { + T items[4]; + struct { + T x; + T y; + T z; + T w; + }; + }; +}; KERNEL_FLOAT_INLINE static constexpr size_t compute_max_alignment(size_t total_size, size_t min_align) { @@ -421,230 +470,94 @@ static constexpr size_t compute_max_alignment(size_t total_size, size_t min_alig } template -using tensor_storage = array; - -template class S = tensor_storage> -struct tensor; - -template -struct extents; - -template<> -struct extents<> { - static constexpr size_t rank = 0; - static constexpr size_t volume = 1; - - KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return 1; 
- } - - KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return 1; - } - - KERNEL_FLOAT_INLINE - static size_t ravel_index(ndindex<0>) { - return 0; - } - - KERNEL_FLOAT_INLINE - static ndindex<0> unravel_index(size_t i) { - return {}; - } -}; +using vector_storage = aligned_array; template -struct extents { - static constexpr size_t rank = 1; - static constexpr size_t volume = N; - - KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return axis == 0 ? N : 1; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return 1; - } - - KERNEL_FLOAT_INLINE - static size_t ravel_index(ndindex<1> ind) { - return ind[0]; - } - - KERNEL_FLOAT_INLINE - static ndindex<1> unravel_index(size_t i) { - return {i}; - } -}; - -template -struct extents { - static constexpr size_t rank = 2; - static constexpr size_t volume = N * M; - - KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return axis == 0 ? N : axis == 1 ? M : 1; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return axis == 0 ? M : 1; - } - - KERNEL_FLOAT_INLINE - static size_t ravel_index(ndindex<2> x) { - return x[0] * M + x[1]; - } - - KERNEL_FLOAT_INLINE - static ndindex<2> unravel_index(size_t i) { - return {i / M, i % M}; - } -}; - -template -struct extents { - static constexpr size_t rank = 3; - static constexpr size_t volume = N * M * K; - - KERNEL_FLOAT_INLINE - static constexpr size_t size(size_t axis) { - return axis == 0 ? N : axis == 1 ? M : axis == 2 ? K : 1; - } - - KERNEL_FLOAT_INLINE - static constexpr size_t stride(size_t axis) { - return axis == 0 ? M * K // - : axis == 1 ? K // - : 1; // - } - - KERNEL_FLOAT_INLINE - static size_t ravel_index(ndindex<3> x) { - return (x[0] * M + x[1]) * K + x[2]; - } - - KERNEL_FLOAT_INLINE - static ndindex<3> unravel_index(size_t i) { - return {i / (K * M), (i / K) % M, i % K}; - } +struct extent { + static constexpr size_t value = N; + static constexpr size_t size = N; }; template -struct into_tensor_traits { - using type = tensor>; +struct into_vector_traits { + using value_type = T; + using extent_type = extent<1>; KERNEL_FLOAT_INLINE - static type call(const T& input) { - return tensor_storage {input}; + static vector_storage call(const T& input) { + return vector_storage {input}; } }; template -struct into_tensor_traits { - using type = typename into_tensor_traits::type; - - KERNEL_FLOAT_INLINE - static type call(const V input) { - return into_tensor_traits::call(input); - } -}; +struct into_vector_traits: into_vector_traits {}; template -struct into_tensor_traits { - using type = typename into_tensor_traits::type; - - KERNEL_FLOAT_INLINE - static type call(V& input) { - return into_tensor_traits::call(input); - } -}; +struct into_vector_traits: into_vector_traits {}; template -struct into_tensor_traits { - using type = typename into_tensor_traits::type; - - KERNEL_FLOAT_INLINE - static type call(const V& input) { - return into_tensor_traits::call(input); - } -}; +struct into_vector_traits: into_vector_traits {}; template -struct into_tensor_traits { - using type = typename into_tensor_traits::type; +struct into_vector_traits: into_vector_traits {}; - KERNEL_FLOAT_INLINE - static type call(V&& input) { - return into_tensor_traits::call(std::move(input)); - } -}; - -template class S> -struct into_tensor_traits> { - using type = tensor; +template +struct into_vector_traits> { + using value_type = T; + using extent_type = extent; KERNEL_FLOAT_INLINE - static type call(const tensor& 
input) { + static vector_storage call(const aligned_array& input) { return input; } }; -template -struct into_tensor_traits> { - using type = tensor>; +template +struct vector_traits; + +template> +struct vector; + +template +struct into_vector_traits> { + using value_type = T; + using extent_type = E; KERNEL_FLOAT_INLINE - static type call(const array& input) { - return input; + static vector_storage call(const vector& input) { + return input.storage(); } }; -template -struct tensor_traits; - -template class S> -struct tensor_traits> { +template +struct vector_traits> { using value_type = T; - using extents_type = D; - using storage_type = S; + using extent_type = E; + using storage_type = S; + using vector_type = vector; }; template -using into_tensor_type = typename into_tensor_traits::type; - -template -using tensor_extents = typename tensor_traits>::extents_type; +using vector_value_type = typename into_vector_traits::value_type; template -static constexpr size_t tensor_rank = tensor_extents::rank; +using vector_extent_type = typename into_vector_traits::extent_type; template -static constexpr size_t tensor_volume = tensor_extents::volume; +static constexpr size_t vector_extent = vector_extent_type::value; template -using tensor_value_type = typename tensor_traits>::value_type; +using into_vector_type = vector, vector_extent_type>; template -using tensor_storage_type = tensor_storage, tensor_volume>; +using vector_storage_type = vector_storage, vector_extent>; template -using promoted_tensor_value_type = - promote_t>::value_type...>; - -template -KERNEL_FLOAT_INLINE into_tensor_type into_tensor(V&& input) { - return into_tensor_traits::call(std::forward(input)); -} +using promoted_vector_value_type = promote_t...>; template -KERNEL_FLOAT_INLINE tensor_storage_type into_tensor_storage(V&& input) { - return into_tensor_traits::call(std::forward(input)).storage(); +KERNEL_FLOAT_INLINE vector_storage_type into_vector_storage(V&& input) { + return into_vector_traits::call(std::forward(input)); } } // namespace kernel_float @@ -829,7 +742,7 @@ KERNEL_FLOAT_INLINE T real(complex_type v) { template KERNEL_FLOAT_INLINE T imag(complex_type v) { - return v.real(); + return v.imag(); } template @@ -920,13 +833,13 @@ namespace detail { template struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(F fun, const tensor_storage&... inputs) { - tensor_storage result; + KERNEL_FLOAT_INLINE static vector_storage + call(F fun, const vector_storage&... 
inputs) { + vector_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result[i] = fun(inputs[i]...); + result.data()[i] = fun(inputs.data()[i]...); } return result; @@ -935,15 +848,15 @@ struct apply_impl { } // namespace detail template -using map_type = tensor>, tensor_extents>; +using map_type = vector>, vector_extent_type>; template KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { - using Input = tensor_value_type; + using Input = vector_value_type; using Output = result_t; - return detail::apply_impl, Output, Input>::call( + return detail::apply_impl, Output, Input>::call( fun, - into_tensor(input).storage()); + into_vector_storage(input)); } #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ @@ -956,15 +869,15 @@ KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { }; \ } \ template \ - KERNEL_FLOAT_INLINE into_tensor_type NAME(const V& input) { \ - using F = ops::NAME>; \ + KERNEL_FLOAT_INLINE into_vector_type NAME(const V& input) { \ + using F = ops::NAME>; \ return map(F {}, input); \ } #define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ template \ - KERNEL_FLOAT_INLINE tensor operator OP(const tensor& vec) { \ + KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ return NAME(vec); \ } @@ -1073,8 +986,8 @@ struct cast { } // namespace ops template -KERNEL_FLOAT_INLINE tensor> cast(const V& input) { - using F = ops::cast, R, Mode>; +KERNEL_FLOAT_INLINE vector> cast(const V& input) { + using F = ops::cast, R, Mode>; return map(F {}, input); } } // namespace kernel_float @@ -1089,237 +1002,109 @@ KERNEL_FLOAT_INLINE tensor> cast(const V& input) { namespace kernel_float { namespace detail { -template -struct unify_dimension_helper; - -template<> -struct unify_dimension_helper<1, 1> { - static constexpr size_t value = 1; -}; +template +struct broadcast_extent_helper; template -struct unify_dimension_helper { - static constexpr size_t value = N; +struct broadcast_extent_helper, extent> { + using type = extent; }; template -struct unify_dimension_helper { - static constexpr size_t value = N; +struct broadcast_extent_helper, extent> { + using type = extent; }; template -struct unify_dimension_helper<1, N> { - static constexpr size_t value = N; -}; - -template -struct unify_extents_helper; - -template -struct unify_extents_helper, extents> { - using type = extents::value...>; -}; - -template -struct extents_to_rank { - using type = E; +struct broadcast_extent_helper, extent<1>> { + using type = extent; }; -template -struct extents_to_rank, N, enabled_t<(sizeof...(Ns) < N)>>: - extents_to_rank, N> {}; - -template -struct broadcast_extents_helper { - using type = typename unify_extents_helper< - typename extents_to_rank::type, // - typename extents_to_rank::type // - >::type; -}; - -template -struct broadcast_extents_helper { - using type = E; +template<> +struct broadcast_extent_helper, extent<1>> { + using type = extent<1>; }; } // namespace detail template -using broadcast_extents = typename detail::broadcast_extents_helper::type; +using broadcast_extent = typename detail::broadcast_extent_helper::type; template -using broadcast_tensor_extents = broadcast_extents, tensor_extents>; +using broadcast_vector_extent_type = broadcast_extent, vector_extent_type>; template -static constexpr bool is_broadcastable = is_same, To>; +static constexpr bool is_broadcastable = is_same, To>; template -static constexpr bool is_tensor_broadcastable = is_broadcastable, To>; +static constexpr bool is_vector_broadcastable = 
is_broadcastable, To>; namespace detail { -template -struct copy_helper; - -template -struct copy_helper, IS, OS> { - template - KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { - ndindex<0> x; - size_t input_index = IS::call(x); - size_t output_index = OS::call(x); - output[output_index] = input[input_index]; - } -}; - -template -struct copy_helper, IS, OS> { - template - KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { - for (size_t i = 0; i < N; i++) { - ndindex<1> x = {i}; - size_t input_index = IS::call(x); - size_t output_index = OS::call(x); - output[output_index] = input[input_index]; - } - } -}; - -template -struct copy_helper, IS, OS> { - template - KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { - for (size_t i = 0; i < N; i++) { - for (size_t j = 0; j < M; j++) { - ndindex<2> x = {i, j}; - size_t input_index = IS::call(x); - size_t output_index = OS::call(x); - output[output_index] = input[input_index]; - } - } - } -}; +template +struct broadcast_impl; -template -struct copy_helper, IS, OS> { - template - KERNEL_FLOAT_INLINE static void call(T* output, const T* input) { +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(const vector_storage& input) { + vector_storage output; for (size_t i = 0; i < N; i++) { - for (size_t j = 0; j < M; j++) { - for (size_t k = 0; k < K; k++) { - ndindex<3> x = {i, j, k}; - size_t input_index = IS::call(x); - size_t output_index = OS::call(x); - output[output_index] = input[input_index]; - } - } + output.data()[i] = input.data()[0]; } - } -}; - -template -struct strides_helper; - -template<> -struct strides_helper> { - KERNEL_FLOAT_INLINE - static size_t call(ndindex<0>) { - return 0; - } -}; - -template -struct strides_helper> { - KERNEL_FLOAT_INLINE - static size_t call(ndindex<1> x) { - return (N != 1 ? x[0] : 0); - } -}; - -template -struct strides_helper> { - KERNEL_FLOAT_INLINE - static size_t call(ndindex<2> x) { - return (N != 1 ? x[0] * M : 0) + // - (M != 1 ? x[1] : 0); - } -}; - -template -struct strides_helper> { - KERNEL_FLOAT_INLINE - static size_t call(ndindex<3> x) { - return (N != 1 ? x[0] * M * K : 0) + // - (M != 1 ? x[1] * K : 0) + // - (K != 1 ? 
x[2] : 0); - } -}; - -template -struct broadcast_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(tensor_storage input) { - static_assert(is_broadcastable, "cannot broadcast to required shape"); - using IS = strides_helper::type>; - using OS = strides_helper; - - tensor_storage output; - copy_helper::call(output.data(), input.data()); return output; } }; -template -struct broadcast_impl { - KERNEL_FLOAT_INLINE static tensor_storage - call(tensor_storage input) { +template +struct broadcast_impl, extent> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { return input; } }; } // namespace detail -template -KERNEL_FLOAT_INLINE tensor, extents> -broadcast(const V& input, extents new_extents = {}) { - using T = tensor_value_type; - return detail::broadcast_impl, extents>::call( - into_tensor(input).storage()); +template +KERNEL_FLOAT_INLINE vector, extent> +broadcast(const V& input, extent new_size = {}) { + using T = vector_value_type; + return detail::broadcast_impl, extent>::call( + into_vector_storage(input)); } template -KERNEL_FLOAT_INLINE tensor, tensor_extents> +KERNEL_FLOAT_INLINE vector, vector_extent_type> broadcast_like(const V& input, const R&) { - using T = tensor_value_type; - return detail::broadcast_impl, tensor_extents>::call( - into_tensor(input).storage()); + using T = vector_value_type; + return detail::broadcast_impl, vector_extent_type>::call( + into_vector_storage(input)); } -template -KERNEL_FLOAT_INLINE tensor> fill(T value = {}, extents = {}) { - tensor_storage input = {value}; - return detail::broadcast_impl, extents>::call(input); +template +KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { + vector_storage input = {value}; + return detail::broadcast_impl, extent>::call(input); } -template -KERNEL_FLOAT_INLINE tensor> zeros(extents = {}) { - tensor_storage input = {T {}}; - return detail::broadcast_impl, extents>::call(input); +template +KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { + vector_storage input = {T {}}; + return detail::broadcast_impl, extent>::call(input); } -template -KERNEL_FLOAT_INLINE tensor> ones(extents = {}) { - tensor_storage input = {T {1}}; - return detail::broadcast_impl, extents>::call(input); +template +KERNEL_FLOAT_INLINE vector> ones(extent = {}) { + vector_storage input = {T {1}}; + return detail::broadcast_impl, extent>::call(input); } -template, typename E = tensor_extents> -KERNEL_FLOAT_INLINE tensor zeros_like(const V&) { +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector zeros_like(const V&) { return zeros(E {}); } -template, typename E = tensor_extents> -KERNEL_FLOAT_INLINE tensor ones_like(const V&) { +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector ones_like(const V&) { return ones(E {}); } @@ -1327,10 +1112,10 @@ namespace detail { template struct convert_helper { KERNEL_FLOAT_INLINE - static tensor_storage call(tensor_storage input) { + static vector_storage call(vector_storage input) { using F = ops::cast; - tensor_storage intermediate = - detail::apply_impl::call(F {}, input); + vector_storage intermediate = + detail::apply_impl::call(F {}, input); return detail::broadcast_impl::call(intermediate); } }; @@ -1338,7 +1123,7 @@ struct convert_helper { template struct convert_helper { KERNEL_FLOAT_INLINE - static tensor_storage call(tensor_storage input) { + static vector_storage call(vector_storage input) { return input; } }; @@ -1346,7 +1131,7 @@ struct convert_helper { template struct convert_helper { KERNEL_FLOAT_INLINE - static 
tensor_storage call(tensor_storage input) { + static vector_storage call(vector_storage input) { return detail::broadcast_impl::call(input); } }; @@ -1354,21 +1139,25 @@ struct convert_helper { template struct convert_helper { KERNEL_FLOAT_INLINE - static tensor_storage call(tensor_storage input) { + static vector_storage call(vector_storage input) { using F = ops::cast; - return detail::apply_impl::call(F {}, input); + return detail::apply_impl::call(F {}, input); } }; } // namespace detail +template +KERNEL_FLOAT_INLINE vector_storage convert_storage(const V& input, extent new_size = {}) { + return detail::convert_helper, vector_extent_type, R, extent, M>:: + call(into_vector_storage(input)); +} + /** - * Cast the values of the given input tensor to type `R` and then broadcast the result to the given shape `(Ns...)`. + * Cast the values of the given input vector to type `R` and then broadcast the result to the given size `N`. */ -template -KERNEL_FLOAT_INLINE tensor> -convert(const V& input, extents new_shape = {}) { - return detail::convert_helper, tensor_extents, R, extents, M>:: - call(into_tensor(input).storage()); +template +KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_size = {}) { + return convert_storage(input); } } // namespace kernel_float @@ -1383,39 +1172,40 @@ convert(const V& input, extents new_shape = {}) { namespace kernel_float { template -using zip_type = - tensor, tensor_value_type>, broadcast_tensor_extents>; +using zip_type = vector< + result_t, vector_value_type>, + broadcast_vector_extent_type>; template KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) { - using A = tensor_value_type; - using B = tensor_value_type; + using A = vector_value_type; + using B = vector_value_type; using O = result_t; - using E = broadcast_tensor_extents; + using E = broadcast_vector_extent_type; - return detail::apply_impl::call( + return detail::apply_impl::call( fun, broadcast(left).storage(), broadcast(right).storage()); } template -using zip_common_type = tensor< - result_t, promoted_tensor_value_type>, - broadcast_tensor_extents>; +using zip_common_type = vector< + result_t, promoted_vector_value_type>, + broadcast_vector_extent_type>; template KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, const R& right) { - using T = promoted_tensor_value_type; + using T = promoted_vector_value_type; using O = result_t; - using E = broadcast_tensor_extents; + using E = broadcast_vector_extent_type; - return detail::apply_impl::call( + return detail::apply_impl::call( fun, - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(left)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(right))); + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(left)), + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(right))); } #define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR) \ @@ -1427,7 +1217,7 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co } \ }; \ } \ - template> \ + template> \ KERNEL_FLOAT_INLINE zip_common_type, L, R> NAME(L&& left, R&& right) { \ return zip_common(ops::NAME {}, std::forward(left), std::forward(right)); \ } @@ -1435,21 +1225,21 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co #define KERNEL_FLOAT_DEFINE_BINARY_OP(NAME, OP) \ KERNEL_FLOAT_DEFINE_BINARY(NAME, left OP right) \ template, typename E1, typename E2> \ - KERNEL_FLOAT_INLINE zip_common_type, tensor, tensor> 
operator OP( \ - const tensor& left, \ - const tensor& right) { \ + KERNEL_FLOAT_INLINE zip_common_type, vector, vector> operator OP( \ + const vector& left, \ + const vector& right) { \ return zip_common(ops::NAME {}, left, right); \ } \ - template>, typename E> \ - KERNEL_FLOAT_INLINE zip_common_type, tensor, R> operator OP( \ - const tensor& left, \ + template>, typename E> \ + KERNEL_FLOAT_INLINE zip_common_type, vector, R> operator OP( \ + const vector& left, \ const R& right) { \ return zip_common(ops::NAME {}, left, right); \ } \ - template, R>, typename E> \ - KERNEL_FLOAT_INLINE zip_common_type, L, tensor> operator OP( \ + template, R>, typename E> \ + KERNEL_FLOAT_INLINE zip_common_type, L, vector> operator OP( \ const L& left, \ - const tensor& right) { \ + const vector& right) { \ return zip_common(ops::NAME {}, left, right); \ } @@ -1472,13 +1262,13 @@ KERNEL_FLOAT_DEFINE_BINARY_OP(bit_xor, ^) // clang-format off template typename F, typename T, typename E, typename R> -static constexpr bool is_tensor_assign_allowed = - is_tensor_broadcastable && +static constexpr bool is_vector_assign_allowed = + is_vector_broadcastable && is_implicit_convertible< result_t< - F>>, + F>>, T, - tensor_value_type + vector_value_type >, T >; @@ -1489,8 +1279,8 @@ static constexpr bool is_tensor_assign_allowed = typename T, \ typename E, \ typename R, \ - typename = enabled_t>> \ - KERNEL_FLOAT_INLINE tensor& operator OP(tensor& lhs, const R& rhs) { \ + typename = enabled_t>> \ + KERNEL_FLOAT_INLINE vector& operator OP(vector& lhs, const R& rhs) { \ using F = ops::NAME; \ lhs = zip_common(F {}, lhs, rhs); \ return lhs; \ @@ -1667,18 +1457,18 @@ namespace kernel_float { namespace detail { template struct reduce_helper { - KERNEL_FLOAT_INLINE static T call(F fun, const tensor_storage& input) { + KERNEL_FLOAT_INLINE static T call(F fun, const vector_storage& input) { return call(fun, input, make_index_sequence {}); } private: template KERNEL_FLOAT_INLINE static T - call(F fun, const tensor_storage& input, index_sequence<0, Is...>) { - T result = input[0]; + call(F fun, const vector_storage& input, index_sequence<0, Is...>) { + T result = input.data()[0]; #pragma unroll for (size_t i = 1; i < N; i++) { - result = fun(result, input[i]); + result = fun(result, input.data()[i]); } return result; } @@ -1700,10 +1490,10 @@ struct reduce_helper { * ``` */ template -KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { - return detail::reduce_helper, tensor_value_type>::call( +KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { + return detail::reduce_helper, vector_value_type>::call( fun, - into_tensor_storage(input)); + into_vector_storage(input)); } /** @@ -1716,7 +1506,7 @@ KERNEL_FLOAT_INLINE tensor_value_type reduce(F fun, const V& input) { * int y = min(x); // Returns 0 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T min(const V& input) { return reduce(ops::min {}, input); } @@ -1731,7 +1521,7 @@ KERNEL_FLOAT_INLINE T min(const V& input) { * int y = max(x); // Returns 5 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T max(const V& input) { return reduce(ops::max {}, input); } @@ -1746,7 +1536,7 @@ KERNEL_FLOAT_INLINE T max(const V& input) { * int y = sum(x); // Returns 8 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T sum(const V& input) { return reduce(ops::add {}, input); } @@ -1762,7 +1552,7 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { * int y = dot(x, y); // Returns 1*4+2*5+3*6 = 32 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T dot(const L& 
left, const R& right) { return reduce(ops::add {}, zip_common(ops::multiply {}, left, right)); } @@ -1777,7 +1567,7 @@ KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { * int y = sum(x); // Returns 5*0*2*1*0 = 0 * ``` */ -template> +template> KERNEL_FLOAT_INLINE T product(const V& input) { return reduce(ops::multiply {}, input); } @@ -1818,197 +1608,289 @@ KERNEL_FLOAT_INLINE int count(const V& input) { } // namespace kernel_float #endif //KERNEL_FLOAT_REDUCE_H -#ifndef KERNEL_FLOAT_BASE_H -#define KERNEL_FLOAT_BASE_H - +#ifndef KERNEL_FLOAT_TRIOPS_H +#define KERNEL_FLOAT_TRIOPS_H +namespace kernel_float { +namespace ops { +template +struct conditional { + KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) { + if (cond) { + return true_value; + } else { + return false_value; + } + } +}; +} // namespace ops -namespace kernel_float { +/** + * Return elements chosen from `true_values` and `false_values` depending on `cond`. + * + * This function broadcasts all arguments to the same size and it promotes the values of `true_values` and + * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a vector where + * the values are taken from `true_values` if the condition is true and `false_values` otherwise. + * + * @param cond The condition used for selection. + * @param true_values The vector of values to choose from when the condition is true. + * @param false_values The vector of values to choose from when the condition is false. + * @return A vector containing selected elements as per the condition. + */ +template< + typename C, + typename L, + typename R, + typename T = promoted_vector_value_type, + typename E = broadcast_extent, broadcast_vector_extent_type>> +KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { + using F = ops::conditional; -template -struct tensor_extension {}; + return detail::apply_impl::call( + F {}, + detail::convert_helper, vector_extent_type, bool, E>::call( + into_vector_storage(cond)), + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(true_values)), + detail::convert_helper, vector_extent_type, T, E>::call( + into_vector_storage(false_values))); +} -template class S> -struct tensor: tensor_extension, T, E::volume> { - static constexpr size_t rank = E::rank; - static constexpr size_t volume = E::volume; +/** + * Selects elements from `true_values` depending on `cond`. + * + * This function returns a vector where the values are taken from `true_values` where `cond` is `true` and `0` where + * `cond is `false`. + * + * @param cond The condition used for selection. + * @param true_values The vector of values to choose from when the condition is true. + * @return A vector containing selected elements as per the condition. + */ +template< + typename C, + typename L, + typename T = vector_value_type, + typename E = broadcast_extent, vector_extent_type>> +KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { + vector> false_values = T {}; + return where(cond, true_values, false_values); +} - using value_type = T; - using extents_type = E; - using ndindex_type = ndindex; - using storage_type = S; +/** + * Returns a vector where the values are `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. + * + * @param cond The condition used for selection. + * @return A vector containing elements as per the condition. 
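+ *
+ * For instance (illustrative): `where<float>(mask)` evaluates to `1.0f` in the
+ * lanes where `mask` is `true` and to `0.0f` elsewhere.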
+ */
+template>
+KERNEL_FLOAT_INLINE vector where(const C& cond) {
+    vector> true_values = T {true};
+    vector> false_values = T {false};
+    return where(cond, true_values, false_values);
+}

namespace ops {
template
struct fma {
    KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) {
        return a * b + c;
    }
};

#if KERNEL_FLOAT_IS_DEVICE
template<>
struct fma {
    KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) {
        return __fmaf_rn(a, b, c);
    }
};

template<>
struct fma {
    KERNEL_FLOAT_INLINE double operator()(double a, double b, double c) {
        return __fma_rn(a, b, c);
    }
};
#endif
} // namespace ops

/**
 * Computes the result of `a * b + c`. This is done in a single operation if possible.
 */
template<
    typename A,
    typename B,
    typename C,
    typename T = promoted_vector_value_type,
    typename E = broadcast_extent, broadcast_vector_extent_type>>
KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) {
    using F = ops::fma;

    return detail::apply_impl::call(
        F {},
        detail::convert_helper, vector_extent_type, T, E>::call(
            into_vector_storage(a)),
        detail::convert_helper, vector_extent_type, T, E>::call(
            into_vector_storage(b)),
        detail::convert_helper, vector_extent_type, T, E>::call(
            into_vector_storage(c)));
}

} // namespace kernel_float

#endif //KERNEL_FLOAT_TRIOPS_H
#ifndef KERNEL_FLOAT_VECTOR_H
#define KERNEL_FLOAT_VECTOR_H

-    tensor(const tensor&) = default;

-    KERNEL_FLOAT_INLINE
-    tensor(storage_type storage) : storage_(storage) {}

-    template= 2, int> = 0>
-    KERNEL_FLOAT_INLINE tensor(Args&&... 
args) : storage_ {std::forward(args)...} {} - template< - typename U, - typename F, - enabled_t< - is_implicit_convertible && is_tensor_broadcastable, - int> = 0> - KERNEL_FLOAT_INLINE tensor(const tensor& input) : - tensor(convert(input, extents_type {})) {} - template< - typename U, - typename F, - enabled_t< - !is_implicit_convertible && is_tensor_broadcastable, - int> = 0> - explicit KERNEL_FLOAT_INLINE tensor(const tensor& input) : - tensor(convert(input, extents_type {})) {} - KERNEL_FLOAT_INLINE tensor(const value_type& input = {}) : - tensor(convert(input, extents_type {})) {} + +namespace kernel_float { + +template +struct vector: S { + using value_type = T; + using extent_type = E; + using storage_type = S; + + // Copy another `vector` + vector(const vector&) = default; + + // Copy anything of type `storage_type` KERNEL_FLOAT_INLINE - storage_type& storage() { - return storage_; - } + vector(const storage_type& storage) : storage_type(storage) {} + // Copy anything of type `storage_type` KERNEL_FLOAT_INLINE - const storage_type& storage() const { - return storage_; + vector(const value_type& input = {}) : + storage_type(detail::broadcast_impl, E>::call(input)) {} + + // For all other arguments, we convert it using `convert_storage` according to broadcast rules + template, T>, int> = 0> + KERNEL_FLOAT_INLINE vector(U&& input) : storage_type(convert_storage(input)) {} + + template, T>, int> = 0> + KERNEL_FLOAT_INLINE explicit vector(U&& input) : + storage_type(convert_storage(input)) {} + + // List of `N` (where N >= 2), simply pass forward to the storage + template< + typename A, + typename B, + typename... Rest, + typename = enabled_t> + KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... rest) : + storage_type {a, b, rest...} {} + + KERNEL_FLOAT_INLINE + static constexpr size_t size() { + return E::size; } KERNEL_FLOAT_INLINE - T* data() { - return storage_.data(); + storage_type& storage() { + return *this; } KERNEL_FLOAT_INLINE - const T* data() const { - return storage_.data(); + const storage_type& storage() const { + return *this; } KERNEL_FLOAT_INLINE const T* cdata() const { - return storage_.data(); + return this->data(); } KERNEL_FLOAT_INLINE T* begin() { - return storage_.data(); + return this->data(); } KERNEL_FLOAT_INLINE const T* begin() const { - return storage_.data(); + return this->data(); } KERNEL_FLOAT_INLINE const T* cbegin() const { - return storage_.data(); + return this->data(); } KERNEL_FLOAT_INLINE T* end() { - return storage_.data() + E::volume; + return this->data() + size(); } KERNEL_FLOAT_INLINE const T* end() const { - return storage_.data() + E::volume; + return this->data() + size(); } KERNEL_FLOAT_INLINE const T* cend() const { - return storage_.data() + E::volume; + return this->data() + size(); } KERNEL_FLOAT_INLINE - T& at(ndindex_type x) { - return *(data() + linearize_index(x)); + T& at(size_t x) { + return *(this->data() + x); } KERNEL_FLOAT_INLINE - const T& at(ndindex_type x) const { - return *(data() + linearize_index(x)); + const T& at(size_t x) const { + return *(this->data() + x); } KERNEL_FLOAT_INLINE - T get(ndindex_type x) const { + T get(size_t x) const { return at(x); } KERNEL_FLOAT_INLINE - void set(ndindex_type x, T value) { + void set(size_t x, T value) { at(x) = std::move(value); } KERNEL_FLOAT_INLINE - T& operator[](ndindex_type x) { + T& operator[](size_t x) { return at(x); } KERNEL_FLOAT_INLINE - const T& operator[](ndindex_type x) const { + const T& operator[](size_t x) const { return at(x); } 
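     // operator() mirrors operator[]: both index directly into the underlying storage.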
KERNEL_FLOAT_INLINE - T& operator()(ndindex_type x) { + T& operator()(size_t x) { return at(x); } KERNEL_FLOAT_INLINE - const T& operator()(ndindex_type x) const { + const T& operator()(size_t x) const { return at(x); } - KERNEL_FLOAT_INLINE - tensor> flatten() const { - return storage_; + template + KERNEL_FLOAT_INLINE vector cast() const { + return kernel_float::cast(*this); } - template - KERNEL_FLOAT_INLINE tensor> reshape(extents = {}) const { - static_assert(extents::volume == volume, "invalid reshape shape"); - return storage_; - } - - template - KERNEL_FLOAT_INLINE tensor> broadcast(extents new_shape = {}) const { - return kernel_float::broadcast(*this, new_shape); + template + KERNEL_FLOAT_INLINE vector> broadcast(extent new_size = {}) const { + return kernel_float::broadcast(*this, new_size); } template - KERNEL_FLOAT_INLINE tensor, E> map(F fun = {}) const { + KERNEL_FLOAT_INLINE vector, E> map(F fun = {}) const { return kernel_float::map(fun, *this); } @@ -2021,55 +1903,45 @@ struct tensor: tensor_extension, T, E::volume> { storage_type storage_; }; -template -struct tensor_extension { - KERNEL_FLOAT_INLINE - T get() const { - return static_cast(this)->get({}); - } - - KERNEL_FLOAT_INLINE - void set(T value) { - static_cast(this)->set({}, value); - } - - KERNEL_FLOAT_INLINE - operator T() const { - return get(); - } -}; - -#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ - template<> \ - struct into_tensor_traits<::T2> { \ - using type = tensor>; \ - \ - KERNEL_FLOAT_INLINE \ - static type call(::T2 v) { \ - return tensor_storage {v.x, v.y}; \ - } \ - }; \ - \ - template<> \ - struct into_tensor_traits<::T3> { \ - using type = tensor>; \ - \ - KERNEL_FLOAT_INLINE \ - static type call(::T3 v) { \ - return tensor_storage {v.x, v.y, v.z}; \ - } \ - }; \ - \ - template<> \ - struct into_tensor_traits<::T4> { \ - using type = tensor>; \ - \ - KERNEL_FLOAT_INLINE \ - static type call(::T4 v) { \ - return tensor_storage {v.x, v.y, v.z, v.w}; \ - } \ +#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ + template<> \ + struct into_vector_traits<::T2> { \ + using value_type = T; \ + using extent_type = extent<2>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T2 v) { \ + return {v.x, v.y}; \ + } \ + }; \ + \ + template<> \ + struct into_vector_traits<::T3> { \ + using value_type = T; \ + using extent_type = extent<3>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T3 v) { \ + return {v.x, v.y, v.z}; \ + } \ + }; \ + \ + template<> \ + struct into_vector_traits<::T4> { \ + using value_type = T; \ + using extent_type = extent<4>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T4 v) { \ + return {v.x, v.y, v.z, v.w}; \ + } \ }; +template +KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { + return into_vector_traits::call(std::forward(input)); +} + KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) @@ -2086,18 +1958,15 @@ KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) template -using scalar = tensor>; +using scalar = vector>; template -using vec = tensor>; - -template -using mat = tensor>; +using vec = vector>; template KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... 
args) { using T = promote_t; - return tensor_storage {T {args}...}; + return vector_storage {T {args}...}; }; } // namespace kernel_float @@ -2119,12 +1988,13 @@ KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) template<> -struct into_tensor_traits<__half2> { - using type = tensor<__half, extents<2>>; +struct into_vector_traits<__half2> { + using value_type = __half; + using extent_type = extent<2>; KERNEL_FLOAT_INLINE - static type call(__half2 input) { - return tensor_storage<__half, 2> {input.x, input.y}; + static vector_storage<__half, 2> call(__half2 input) { + return {input.x, input.y}; } }; @@ -2151,20 +2021,20 @@ struct zip_halfx2 { template struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__half, N> - call(F fun, const tensor_storage<__half, N>& input) { - tensor_storage<__half, N> result; + KERNEL_FLOAT_INLINE static vector_storage<__half, N> + call(F fun, const vector_storage<__half, N>& input) { + vector_storage<__half, N> result; #pragma unroll for (size_t i = 0; i < N; i += 2) { - __half2 a = {input[i], input[i + 1]}; + __half2 a = {input.data()[i], input.data()[i + 1]}; __half2 b = map_halfx2::call(fun, a); - result[i + 0] = b.x; - result[i + 1] = b.y; + result.data()[i + 0] = b.x; + result.data()[i + 1] = b.y; } if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); + result.data()[N - 1] = fun(input.data()[N - 1]); } return result; @@ -2173,20 +2043,20 @@ struct apply_impl { template struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__half, N> - call(F fun, const tensor_storage<__half, N>& left, const tensor_storage<__half, N>& right) { - tensor_storage<__half, N> result; + KERNEL_FLOAT_INLINE static vector_storage<__half, N> + call(F fun, const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { + vector_storage<__half, N> result; #pragma unroll for (size_t i = 0; i < N; i += 2) { - __half2 a = {left[i], left[i + 1]}; - __half2 b = {right[i], right[i + 1]}; + __half2 a = {left.data()[i], left.data()[i + 1]}; + __half2 b = {right.data()[i], right.data()[i + 1]}; __half2 c = zip_halfx2::call(fun, a, b); - result[i + 0] = c.x; - result[i + 1] = c.y; + result.data()[i + 0] = c.x; + result.data()[i + 1] = c.y; } if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], right[N - 1]); + result.data()[N - 1] = fun(left.data()[N - 1], right.data()[N - 1]); } return result; @@ -2195,19 +2065,19 @@ struct apply_impl { template struct reduce_helper= 2)>> { - KERNEL_FLOAT_INLINE static __half call(F fun, const tensor_storage<__half, N>& input) { - __half2 accum = {input[0], input[1]}; + KERNEL_FLOAT_INLINE static __half call(F fun, const vector_storage<__half, N>& input) { + __half2 accum = {input.data()[0], input.data()[1]}; #pragma unroll for (size_t i = 2; i < N; i += 2) { - __half2 a = {input[i], input[i + 1]}; + __half2 a = {input.data()[i], input.data()[i + 1]}; accum = zip_halfx2::call(fun, accum, a); } __half result = fun(accum.x, accum.y); if (N % 2 != 0) { - result = fun(result, input[N - 1]); + result = fun(result, input.data()[N - 1]); } return result; @@ -2345,18 +2215,20 @@ using half = __half; + namespace kernel_float { KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) template<> -struct into_tensor_traits<__nv_bfloat162> { - using type = tensor<__nv_bfloat16, extents<2>>; +struct into_vector_traits<__nv_bfloat162> { + using value_type = __nv_bfloat16; + using 
extent_type = extent<2>; KERNEL_FLOAT_INLINE - static type call(__nv_bfloat162 input) { - return tensor_storage<__nv_bfloat16, 2> {input.x, input.y}; + static vector_storage<__nv_bfloat16, 2> call(__nv_bfloat162 input) { + return {input.x, input.y}; } }; @@ -2383,20 +2255,20 @@ struct zip_bfloat16x2 { template struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> - call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { - tensor_storage<__nv_bfloat16, N> result; + KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N> + call(F fun, const vector_storage<__nv_bfloat16, N>& input) { + vector_storage<__nv_bfloat16, N> result; #pragma unroll for (size_t i = 0; i < N; i += 2) { - __nv_bfloat162 a = {input[i], input[i + 1]}; + __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]}; __nv_bfloat162 b = map_bfloat16x2::call(fun, a); - result[i + 0] = b.x; - result[i + 1] = b.y; + result.data()[i + 0] = b.x; + result.data()[i + 1] = b.y; } if (N % 2 != 0) { - result[N - 1] = fun(input[N - 1]); + result.data()[N - 1] = fun(input.data()[N - 1]); } return result; @@ -2405,22 +2277,22 @@ struct apply_impl { template struct apply_impl { - KERNEL_FLOAT_INLINE static tensor_storage<__nv_bfloat16, N> call( + KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N> call( F fun, - const tensor_storage<__nv_bfloat16, N>& left, - const tensor_storage<__nv_bfloat16, N>& right) { - tensor_storage<__nv_bfloat16, N> result; + const vector_storage<__nv_bfloat16, N>& left, + const vector_storage<__nv_bfloat16, N>& right) { + vector_storage<__nv_bfloat16, N> result; #pragma unroll for (size_t i = 0; i < N; i += 2) { - __nv_bfloat162 a = {left[i], left[i + 1]}; - __nv_bfloat162 b = {right[i], right[i + 1]}; + __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; + __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b); - result[i + 0] = c.x; - result[i + 1] = c.y; + result.data()[i + 0] = c.x; + result.data()[i + 1] = c.y; } if (N % 2 != 0) { - result[N - 1] = fun(left[N - 1], right[N - 1]); + result.data()[N - 1] = fun(left.data()[N - 1], right.data()[N - 1]); } return result; @@ -2430,19 +2302,19 @@ struct apply_impl { template struct reduce_helper= 2)>> { KERNEL_FLOAT_INLINE static __nv_bfloat16 - call(F fun, const tensor_storage<__nv_bfloat16, N>& input) { - __nv_bfloat162 accum = {input[0], input[1]}; + call(F fun, const vector_storage<__nv_bfloat16, N>& input) { + __nv_bfloat162 accum = {input.data()[0], input.data()[1]}; #pragma unroll for (size_t i = 2; i < N; i += 2) { - __nv_bfloat162 a = {input[i], input[i + 1]}; + __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]}; accum = zip_bfloat16x2::call(fun, accum, a); } __nv_bfloat16 result = fun(accum.x, accum.y); if (N % 2 != 0) { - result = fun(result, input[N - 1]); + result = fun(result, input.data()[N - 1]); } return result; @@ -2602,16 +2474,13 @@ namespace prelude { namespace kf = ::kernel_float; template -using kscalar = tensor>; +using kscalar = vector>; template -using kvec = tensor>; +using kvec = vector>; -template -using kmat = tensor>; - -template -using ktensor = tensor>; +template +using kvector = vector>; // clang-format off template using kvec1 = kvec; @@ -2624,17 +2493,16 @@ template using kvec7 = kvec; template using kvec8 = kvec; // clang-format on -#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ - using k##NAME = tensor>; \ - template \ - using k##NAME##N = tensor>; \ - using k##NAME##1 = vec; \ - using k##NAME##2 = vec; \ - using k##NAME##3 
= vec; \ - using k##NAME##4 = vec; \ - using k##NAME##5 = vec; \ - using k##NAME##6 = vec; \ - using k##NAME##7 = vec; \ +#define KERNEL_FLOAT_TYPE_ALIAS(NAME, T) \ + template \ + using k##NAME = vector>; \ + using k##NAME##1 = vec; \ + using k##NAME##2 = vec; \ + using k##NAME##3 = vec; \ + using k##NAME##4 = vec; \ + using k##NAME##5 = vec; \ + using k##NAME##6 = vec; \ + using k##NAME##7 = vec; \ using k##NAME##8 = vec; KERNEL_FLOAT_TYPE_ALIAS(char, char) @@ -2668,8 +2536,8 @@ KERNEL_FLOAT_TYPE_ALIAS(bfloat16, __nv_bfloat16) KERNEL_FLOAT_TYPE_ALIAS(bf16, __nv_bfloat16) #endif -template -static constexpr extents kshape = {}; +template +static constexpr extent kextent = {}; template KERNEL_FLOAT_INLINE kvec, sizeof...(Args)> make_kvec(Args&&... args) { @@ -2688,138 +2556,3 @@ KERNEL_FLOAT_INLINE constexpr kconstant kconst(T value) { } // namespace kernel_float #endif -#ifndef KERNEL_FLOAT_TRIOPS_H -#define KERNEL_FLOAT_TRIOPS_H - - - - -namespace kernel_float { - -namespace ops { -template -struct conditional { - KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) { - if (cond) { - return true_value; - } else { - return false_value; - } - } -}; -} // namespace ops - -/** - * Return elements chosen from `true_values` and `false_values` depending on `cond`. - * - * This function broadcasts all arguments to the same shape and it promotes the values of `true_values` and - * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a tensor where - * the values are taken from `true_values` if the condition is true and `false_values` otherwise. - * - * @param cond The condition used for selection. - * @param true_values The tensor of values to choose from when the condition is true. - * @param false_values The tensor of values to choose from when the condition is false. - * @return A tensor containing selected elements as per the condition. - */ -template< - typename C, - typename L, - typename R, - typename T = promoted_tensor_value_type, - typename E = broadcast_extents, broadcast_tensor_extents>> -KERNEL_FLOAT_INLINE tensor where(const C& cond, const L& true_values, const R& false_values) { - using F = ops::conditional; - - return detail::apply_impl::call( - F {}, - detail::convert_helper, tensor_extents, bool, E>::call( - into_tensor_storage(cond)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(true_values)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(false_values))); -} - -/** - * Selects elements from `true_values` depending on `cond`. - * - * This function returns a tensor where the values are taken from `true_values` where `cond` is `true` and `0` where - * `cond is `false`. - * - * @param cond The condition used for selection. - * @param true_values The tensor of values to choose from when the condition is true. - * @return A tensor containing selected elements as per the condition. - */ -template< - typename C, - typename L, - typename T = tensor_value_type, - typename E = broadcast_extents, tensor_extents>> -KERNEL_FLOAT_INLINE tensor where(const C& cond, const L& true_values) { - tensor> false_values = T {}; - return where(cond, true_values, false_values); -} - -/** - * Returns a tensor where the values are `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. - * - * @param cond The condition used for selection. - * @return A tensor containing elements as per the condition. 
- */ -template> -KERNEL_FLOAT_INLINE tensor where(const C& cond) { - tensor> true_values = T {true}; - tensor> false_values = T {false}; - return where(cond, true_values, false_values); -} - -namespace ops { -template -struct fma { - KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) { - return a + b * c; - } -}; - -#if KERNEL_FLOAT_IS_DEVICE -template<> -struct fma { - KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) { - return __fmaf_rn(a, b, c); - } -}; - -template<> -struct fma { - KERNEL_FLOAT_INLINE double operator()(double a, double b, double c) { - return __fma_rn(a, b, c); - } -}; -#endif -} // namespace ops - -/** - * Computes the result of `a * b + c`. This is done in a single operation if possible. - */ -template< - typename A, - typename B, - typename C, - typename T = promoted_tensor_value_type, - typename E = broadcast_extents, broadcast_tensor_extents>> -KERNEL_FLOAT_INLINE tensor fma(const A& a, const B& b, const C& c) { - using F = ops::fma; - - return detail::apply_impl::call( - F {}, - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(a)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(b)), - detail::convert_helper, tensor_extents, T, E>::call( - into_tensor_storage(c))); -} - -} // namespace kernel_float - -#endif //KERNEL_FLOAT_TRIOPS_H From 3b34b7e0c69015b61ca88387f2430b845dff3c4a Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 1 Aug 2023 16:35:53 +0200 Subject: [PATCH 16/50] Write more documentation --- docs/api.rst | 1 + docs/build_api.py | 48 +++++++++++++++----------------- include/kernel_float/binops.h | 8 ++++++ include/kernel_float/broadcast.h | 43 ++++++++++++++++++++++++---- include/kernel_float/prelude.h | 3 -- include/kernel_float/triops.h | 10 +++---- include/kernel_float/vector.h | 11 ++++++++ 7 files changed, 84 insertions(+), 40 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index e525b1c..4e5176e 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -8,3 +8,4 @@ API Reference api/reductions.rst api/shuffling.rst api/mathematical.rst + api/conditional.rst diff --git a/docs/build_api.py b/docs/build_api.py index 178769f..10c4a01 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -65,22 +65,15 @@ def build_index_page(groups): return filename -aliases = [] -for ty in ["vec", "float", "double", "half", "bfloat16x", ""]: - if ty != "vec": - aliases.append(f"{ty}X") - +aliases = ["scalar", "vec"] +for ty in ["vec"]: for i in range(2, 8 + 1): aliases.append(f"{ty}{i}") groups = { "Types": [ ("vector", "vector", "struct"), - ("Aliases", [ - "unaligned_vec", - "vec", - ] + aliases, - "typedef"), + ("Aliases", aliases, "typedef"), ], "Primitives": [ ("range", "range()"), @@ -91,17 +84,17 @@ def build_index_page(groups): "zip_common", "cast", "broadcast", - "resize", + "convert", "for_each", ], "Shuffling": [ - "concat", - "swizzle", - "first", - "last", - "reversed", - "rotate_left", - "rotate_right", + # "concat", + # "swizzle", + # "first", + # "last", + # "reversed", + # "rotate_left", + # "rotate_right", ], "Unary Operators": [ "fill", @@ -135,21 +128,21 @@ def build_index_page(groups): ("min", "min(L&&, R&&)"), "nextafter", "modf", - "pow", + ("pow", "pow(L&&, R&&)"), "remainder", #"rhypot", ], "Reductions": [ "sum", - ("max", "max(V&&)"), - ("min", "min(V&&)"), + ("max", "max(const V&)"), + ("min", "min(const V&)"), "product", "all", "any", "count", ], "Mathematical": [ - "abs", + ("abs", "abs(const V&)"), "acos", "acosh", "asin", @@ -166,14 +159,14 @@ def build_index_page(groups): 
"erfcinv", "erfcx", "erfinv", - "exp", + ("exp", "exp(const V&)"), "exp10", "exp2", "fabs", "floor", "ilogb", "lgamma", - "log", + ("log", "log(const V&)"), "log10", "logb", "nearbyint", @@ -181,7 +174,7 @@ def build_index_page(groups): "rcbrt", "sin", "sinh", - "sqrt", + ("sqrt", "sqrt(const V&)"), "tan", "tanh", "tgamma", @@ -193,6 +186,11 @@ def build_index_page(groups): "isinf", "isnan", ], + "Conditional": [ + ("where", "where(const C&, const L&, const R&)"), + ("where", "where(const C&, const L&)"), + ("where", "where(const C&)"), + ] } build_index_page(groups) diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index db589b4..af64ddd 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -11,6 +11,10 @@ using zip_type = vector< result_t, vector_value_type>, broadcast_vector_extent_type>; +/** + * Combines the elements from the two inputs (`left` and `right`) element-wise, applying a provided binary + * function (`fun`) to each pair of corresponding elements. + */ template KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) { using A = vector_value_type; @@ -29,6 +33,10 @@ using zip_common_type = vector< result_t, promoted_vector_value_type>, broadcast_vector_extent_type>; +/** + * Similar to `zip`, except `zip_common` promotes the element types of the inputs to a common type before applying the + * binary function. + */ template KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, const R& right) { using T = promoted_vector_value_type; diff --git a/include/kernel_float/broadcast.h b/include/kernel_float/broadcast.h index aa6fa9d..a8410e2 100644 --- a/include/kernel_float/broadcast.h +++ b/include/kernel_float/broadcast.h @@ -69,6 +69,10 @@ struct broadcast_impl, extent> { } // namespace detail +/** + * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` + * is 1 or `N`. + */ template KERNEL_FLOAT_INLINE vector, extent> broadcast(const V& input, extent new_size = {}) { @@ -77,39 +81,64 @@ broadcast(const V& input, extent new_size = {}) { into_vector_storage(input)); } +/** + * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the + * size of `input` is 1 or the same as `other`. + */ template KERNEL_FLOAT_INLINE vector, vector_extent_type> -broadcast_like(const V& input, const R&) { - using T = vector_value_type; - return detail::broadcast_impl, vector_extent_type>::call( - into_vector_storage(input)); +broadcast_like(const V& input, const R& other) { + return broadcast(input, vector_extent_type {}); } +/** + * Returns a vector containing `N` copies of `value`. + */ template KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { vector_storage input = {value}; return detail::broadcast_impl, extent>::call(input); } +/** + * Returns a vector containing `N` copies of `T(0)`. + */ template KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { vector_storage input = {T {}}; return detail::broadcast_impl, extent>::call(input); } +/** + * Returns a vector containing `N` copies of `T(1)`. + */ template KERNEL_FLOAT_INLINE vector> ones(extent = {}) { vector_storage input = {T {1}}; return detail::broadcast_impl, extent>::call(input); } +/** + * Returns a vector filled with `value` having the same type and size as input vector `V`. 
+ */ +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { + return fill(value, E {}); +} + +/** + * Returns a vector filled with zeros having the same type and size as input vector `V`. + */ template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector zeros_like(const V&) { +KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { return zeros(E {}); } +/** + * Returns a vector filled with ones having the same type and size as input vector `V`. + */ template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector ones_like(const V&) { +KERNEL_FLOAT_INLINE vector ones_like(const V& = {}) { return ones(E {}); } @@ -159,6 +188,8 @@ KERNEL_FLOAT_INLINE vector_storage convert_storage(const V& input, extent< /** * Cast the values of the given input vector to type `R` and then broadcast the result to the given size `N`. + * + * This function is essentially a `cast` followed by a `broadcast`. */ template KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_size = {}) { diff --git a/include/kernel_float/prelude.h b/include/kernel_float/prelude.h index 20f8598..feaa7f4 100644 --- a/include/kernel_float/prelude.h +++ b/include/kernel_float/prelude.h @@ -14,9 +14,6 @@ using kscalar = vector>; template using kvec = vector>; -template -using kvector = vector>; - // clang-format off template using kvec1 = kvec; template using kvec2 = kvec; diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h index 9e059ad..fc93a94 100644 --- a/include/kernel_float/triops.h +++ b/include/kernel_float/triops.h @@ -22,9 +22,9 @@ struct conditional { /** * Return elements chosen from `true_values` and `false_values` depending on `cond`. * - * This function broadcasts all arguments to the same size and it promotes the values of `true_values` and + * This function broadcasts all arguments to the same size and then promotes the values of `true_values` and * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a vector where - * the values are taken from `true_values` if the condition is true and `false_values` otherwise. + * the values are taken from `true_values` where the condition is true and `false_values` otherwise. * * @param cond The condition used for selection. * @param true_values The vector of values to choose from when the condition is true. @@ -71,16 +71,14 @@ KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { } /** - * Returns a vector where the values are `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. + * Returns a vector having the value `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. * * @param cond The condition used for selection. * @return A vector containing elements as per the condition. 
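+ *
+ * Example
+ * =======
+ * ```
+ * // illustrative sketch
+ * vec<bool, 3> m = {true, false, true};
+ * vec<float, 3> f = where<float>(m);  // returns [1.0f, 0.0f, 1.0f]
+ * ```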
*/ template> KERNEL_FLOAT_INLINE vector where(const C& cond) { - vector> true_values = T {true}; - vector> false_values = T {false}; - return where(cond, true_values, false_values); + return cast(cast(cond)); } namespace ops { diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 5ad09a0..716a48b 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -218,6 +218,17 @@ using scalar = vector>; template using vec = vector>; +// clang-format off +template using vec1 = vec; +template using vec2 = vec; +template using vec3 = vec; +template using vec4 = vec; +template using vec5 = vec; +template using vec6 = vec; +template using vec7 = vec; +template using vec8 = vec; +// clang-format on + template KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... args) { using T = promote_t; From 2405950e91d545eda4affdfad84336d4247a3553 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 8 Aug 2023 14:35:41 +0200 Subject: [PATCH 17/50] Small fixes and changes --- include/kernel_float/broadcast.h | 7 ++ include/kernel_float/iterate.h | 51 +++++++++++ include/kernel_float/vector.h | 8 +- single_include/kernel_float.h | 145 ++++++++++++++++++++++++++----- 4 files changed, 188 insertions(+), 23 deletions(-) create mode 100644 include/kernel_float/iterate.h diff --git a/include/kernel_float/broadcast.h b/include/kernel_float/broadcast.h index a8410e2..5aa9438 100644 --- a/include/kernel_float/broadcast.h +++ b/include/kernel_float/broadcast.h @@ -67,6 +67,13 @@ struct broadcast_impl, extent> { } }; +template +struct broadcast_impl, extent<1>> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + } // namespace detail /** diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h new file mode 100644 index 0000000..45c6fd8 --- /dev/null +++ b/include/kernel_float/iterate.h @@ -0,0 +1,51 @@ +#ifndef KERNEL_FLOAT_ITERATE_H +#define KERNEL_FLOAT_ITERATE_H + +#include "base.h" + +namespace kernel_float { + +namespace detail { +template, size_t N = vector_extent> +struct flatten_helper { + using value_type = typename flatten_helper::value_type; + static constexpr size_t size = N * flatten_helper::size; + + static void call(const V& input, value_type* output) { + vector_storage storage = into_vector_storage(input); + + for (size_t i = 0; i < N; i++) { + flatten_helper::call(storage.data()[i], output + flatten_helper::size * i); + } + } +}; + +template +struct flatten_helper { + using value_type = T; + static constexpr size_t size = 1; + + static void call(const T& input, T* output) { + *output = input; + } +}; +} // namespace detail + +template +using flatten_value_type = typename detail::flatten_helper::value_type; + +template +static constexpr size_t flatten_size = detail::flatten_helper::size; + +template +using flatten_type = vector, extent>>; + +template +flatten_type flatten(const V& input) { + vector_storage, flatten_size> output; + detail::flatten_helper::call(input, output.data()); + return output; +} +} // namespace kernel_float + +#endif \ No newline at end of file diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 716a48b..85d765c 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -3,6 +3,7 @@ #include "base.h" #include "broadcast.h" +#include "iterate.h" #include "macros.h" #include "reduce.h" #include "unops.h" @@ -135,7 +136,7 @@ struct vector: S { } template - KERNEL_FLOAT_INLINE vector cast() const { + KERNEL_FLOAT_INLINE 
vector cast() const { return kernel_float::cast(*this); } @@ -154,8 +155,9 @@ struct vector: S { return kernel_float::reduce(fun, *this); } - private: - storage_type storage_; + KERNEL_FLOAT_INLINE flatten_type flatten() const { + return kernel_float::flatten(*this); + } }; #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index ceb0344..4dd6052 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-08-01 16:03:54.596340 -// git hash: 0cf26708f3b17b2f74940e5806bad2aeaae4c076 +// date: 2023-08-08 14:35:09.747868 +// git hash: 3b34b7e0c69015b61ca88387f2430b845dff3c4a //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -822,6 +822,57 @@ struct promote_type> { } // namespace kernel_float +#endif +#ifndef KERNEL_FLOAT_ITERATE_H +#define KERNEL_FLOAT_ITERATE_H + + + +namespace kernel_float { + +namespace detail { +template, size_t N = vector_extent> +struct flatten_helper { + using value_type = typename flatten_helper::value_type; + static constexpr size_t size = N * flatten_helper::size; + + static void call(const V& input, value_type* output) { + vector_storage storage = into_vector_storage(input); + + for (size_t i = 0; i < N; i++) { + flatten_helper::call(storage.data()[i], output + flatten_helper::size * i); + } + } +}; + +template +struct flatten_helper { + using value_type = T; + static constexpr size_t size = 1; + + static void call(const T& input, T* output) { + *output = input; + } +}; +} // namespace detail + +template +using flatten_value_type = typename detail::flatten_helper::value_type; + +template +static constexpr size_t flatten_size = detail::flatten_helper::size; + +template +using flatten_type = vector, extent>>; + +template +flatten_type flatten(const V& input) { + vector_storage, flatten_size> output; + detail::flatten_helper::call(input, output.data()); + return output; +} +} // namespace kernel_float + #endif #ifndef KERNEL_FLOAT_UNOPS_H #define KERNEL_FLOAT_UNOPS_H @@ -1062,8 +1113,19 @@ struct broadcast_impl, extent> { } }; +template +struct broadcast_impl, extent<1>> { + KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { + return input; + } +}; + } // namespace detail +/** + * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` + * is 1 or `N`. + */ template KERNEL_FLOAT_INLINE vector, extent> broadcast(const V& input, extent new_size = {}) { @@ -1072,39 +1134,64 @@ broadcast(const V& input, extent new_size = {}) { into_vector_storage(input)); } +/** + * Takes the given vector `input` and extends its size to the same length as vector `other`. This is only valid if the + * size of `input` is 1 or the same as `other`. + */ template KERNEL_FLOAT_INLINE vector, vector_extent_type> -broadcast_like(const V& input, const R&) { - using T = vector_value_type; - return detail::broadcast_impl, vector_extent_type>::call( - into_vector_storage(input)); +broadcast_like(const V& input, const R& other) { + return broadcast(input, vector_extent_type {}); } +/** + * Returns a vector containing `N` copies of `value`. 
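+ *
+ * Example
+ * =======
+ * ```
+ * // illustrative sketch
+ * vec<float, 3> a = fill<3>(1.5f);  // returns [1.5f, 1.5f, 1.5f]
+ * ```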
+ */ template KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { vector_storage input = {value}; return detail::broadcast_impl, extent>::call(input); } +/** + * Returns a vector containing `N` copies of `T(0)`. + */ template KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { vector_storage input = {T {}}; return detail::broadcast_impl, extent>::call(input); } +/** + * Returns a vector containing `N` copies of `T(1)`. + */ template KERNEL_FLOAT_INLINE vector> ones(extent = {}) { vector_storage input = {T {1}}; return detail::broadcast_impl, extent>::call(input); } +/** + * Returns a vector filled with `value` having the same type and size as input vector `V`. + */ template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector zeros_like(const V&) { +KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { + return fill(value, E {}); +} + +/** + * Returns a vector filled with zeros having the same type and size as input vector `V`. + */ +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { return zeros(E {}); } +/** + * Returns a vector filled with ones having the same type and size as input vector `V`. + */ template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector ones_like(const V&) { +KERNEL_FLOAT_INLINE vector ones_like(const V& = {}) { return ones(E {}); } @@ -1154,6 +1241,8 @@ KERNEL_FLOAT_INLINE vector_storage convert_storage(const V& input, extent< /** * Cast the values of the given input vector to type `R` and then broadcast the result to the given size `N`. + * + * This function is essentially a `cast` followed by a `broadcast`. */ template KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_size = {}) { @@ -1176,6 +1265,10 @@ using zip_type = vector< result_t, vector_value_type>, broadcast_vector_extent_type>; +/** + * Combines the elements from the two inputs (`left` and `right`) element-wise, applying a provided binary + * function (`fun`) to each pair of corresponding elements. + */ template KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) { using A = vector_value_type; @@ -1194,6 +1287,10 @@ using zip_common_type = vector< result_t, promoted_vector_value_type>, broadcast_vector_extent_type>; +/** + * Similar to `zip`, except `zip_common` promotes the element types of the inputs to a common type before applying the + * binary function. + */ template KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, const R& right) { using T = promoted_vector_value_type; @@ -1632,9 +1729,9 @@ struct conditional { /** * Return elements chosen from `true_values` and `false_values` depending on `cond`. * - * This function broadcasts all arguments to the same size and it promotes the values of `true_values` and + * This function broadcasts all arguments to the same size and then promotes the values of `true_values` and * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a vector where - * the values are taken from `true_values` if the condition is true and `false_values` otherwise. + * the values are taken from `true_values` where the condition is true and `false_values` otherwise. * * @param cond The condition used for selection. * @param true_values The vector of values to choose from when the condition is true. @@ -1681,16 +1778,14 @@ KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { } /** - * Returns a vector where the values are `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. 
+ * Returns a vector having the value `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. * * @param cond The condition used for selection. * @return A vector containing elements as per the condition. */ template> KERNEL_FLOAT_INLINE vector where(const C& cond) { - vector> true_values = T {true}; - vector> false_values = T {false}; - return where(cond, true_values, false_values); + return cast(cast(cond)); } namespace ops { @@ -1752,6 +1847,7 @@ KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { + namespace kernel_float { template @@ -1880,7 +1976,7 @@ struct vector: S { } template - KERNEL_FLOAT_INLINE vector cast() const { + KERNEL_FLOAT_INLINE vector cast() const { return kernel_float::cast(*this); } @@ -1899,8 +1995,9 @@ struct vector: S { return kernel_float::reduce(fun, *this); } - private: - storage_type storage_; + KERNEL_FLOAT_INLINE flatten_type flatten() const { + return kernel_float::flatten(*this); + } }; #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ @@ -1963,6 +2060,17 @@ using scalar = vector>; template using vec = vector>; +// clang-format off +template using vec1 = vec; +template using vec2 = vec; +template using vec3 = vec; +template using vec4 = vec; +template using vec5 = vec; +template using vec6 = vec; +template using vec7 = vec; +template using vec8 = vec; +// clang-format on + template KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... args) { using T = promote_t; @@ -2479,9 +2587,6 @@ using kscalar = vector>; template using kvec = vector>; -template -using kvector = vector>; - // clang-format off template using kvec1 = kvec; template using kvec2 = kvec; From 2ce8b7dabc15c263791b6ad736915c5a724f1d35 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 8 Aug 2023 18:29:10 +0200 Subject: [PATCH 18/50] Add cast specialization for `constant` --- include/kernel_float/constant.h | 9 ++++++++- single_include/kernel_float.h | 11 +++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h index 12bec35..4fe7f35 100644 --- a/include/kernel_float/constant.h +++ b/include/kernel_float/constant.h @@ -45,6 +45,13 @@ struct promote_type> { }; namespace ops { +template +struct cast, R> { + KERNEL_FLOAT_INLINE R operator()(const T& input) noexcept { + return cast {}(input); + } +}; + template struct cast, R, m> { KERNEL_FLOAT_INLINE R operator()(const T& input) noexcept { @@ -55,4 +62,4 @@ struct cast, R, m> { } // namespace kernel_float -#endif \ No newline at end of file +#endif diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 4dd6052..bf19962 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-08-08 14:35:09.747868 -// git hash: 3b34b7e0c69015b61ca88387f2430b845dff3c4a +// date: 2023-08-08 18:25:16.753085 +// git hash: 2405950e91d545eda4affdfad84336d4247a3553 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -1534,6 +1534,13 @@ struct promote_type> { }; namespace ops { +template +struct cast, R> { + KERNEL_FLOAT_INLINE R operator()(const T& input) noexcept { + return cast {}(input); + } +}; + template struct cast, R, m> { KERNEL_FLOAT_INLINE R operator()(const T& input) noexcept { From b012dbe70def9e6c7da70ee771d5be2304ddc53f Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 14 Aug 2023 12:32:32 +0200 Subject: [PATCH 19/50] Add `for_each` and `range` functions --- include/kernel_float.h | 4 +- include/kernel_float/iterate.h | 81 +++++++- include/kernel_float/vector.h | 9 +- single_include/kernel_float.h | 362 ++++++++++++++++++++++++++++++++- 4 files changed, 445 insertions(+), 11 deletions(-) diff --git a/include/kernel_float.h b/include/kernel_float.h index db9249a..744d499 100644 --- a/include/kernel_float.h +++ b/include/kernel_float.h @@ -1,14 +1,12 @@ #ifndef KERNEL_FLOAT_H #define KERNEL_FLOAT_H -//#include "kernel_float/fp8.h" -//#include "kernel_float/swizzle.h" - #include "kernel_float/base.h" #include "kernel_float/bf16.h" #include "kernel_float/binops.h" #include "kernel_float/broadcast.h" #include "kernel_float/fp16.h" +#include "kernel_float/iterate.h" #include "kernel_float/macros.h" #include "kernel_float/meta.h" #include "kernel_float/prelude.h" diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 45c6fd8..b9b2f81 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -5,15 +5,93 @@ namespace kernel_float { +/** + * Apply the function fun for each element from input. 
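+ * Note that `fun` receives the elements of a local copy of the vector's
+ * storage, so writes to those elements are not reflected in `input`.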
+ * + * ``` + * for_each(range(), [&](auto i) { + * printf("element: %d\n", i); + * }); + * ``` + */ +template +void for_each(V&& input, F fun) { + auto storage = into_vector_storage(input); + +#pragma unroll + for (size_t i = 0; i < vector_extent; i++) { + fun(storage.data()[i]); + } +} + +namespace detail { +template +struct range_helper { + KERNEL_FLOAT_INLINE + static vector_storage call() { + vector_storage result; + +#pragma unroll + for (size_t i = 0; i < N; i++) { + result.data()[i] = T(i); + } + + return result; + } +}; +} // namespace detail + +/** + * Generate vector consisting of the numbers 0...N-1 of type T + * + * ``` + * // Returns [0, 1, 2] + * vector vec = range(); + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> range() { + return detail::range_helper::call(); +} + +/** + * Takes a vector of size ``N`` and element type ``T`` and returns a new vector consisting of the numbers ``0...N-1`` + * of type ``T`` + * + * ``` + * // Returns [0.0f, 1.0f, 2.0f] + * vector vec = range(); + * ``` + */ +template +KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { + return detail::range_helper, vector_extent>::call(); +} + +/** + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1`` of type ``size_t`` + * + * ``` + * // Returns [0, 1, 2] + * vector vec = enumerate(float3(6, 4, 2)); + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { + return detail::range_helper>::call(); +} + namespace detail { template, size_t N = vector_extent> struct flatten_helper { using value_type = typename flatten_helper::value_type; static constexpr size_t size = N * flatten_helper::size; + KERNEL_FLOAT_INLINE static void call(const V& input, value_type* output) { vector_storage storage = into_vector_storage(input); +#pragma unroll for (size_t i = 0; i < N; i++) { flatten_helper::call(storage.data()[i], output + flatten_helper::size * i); } @@ -25,6 +103,7 @@ struct flatten_helper { using value_type = T; static constexpr size_t size = 1; + KERNEL_FLOAT_INLINE static void call(const T& input, T* output) { *output = input; } @@ -41,7 +120,7 @@ template using flatten_type = vector, extent>>; template -flatten_type flatten(const V& input) { +KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { vector_storage, flatten_size> output; detail::flatten_helper::call(input, output.data()); return output; diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 85d765c..8e8ce1c 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -146,18 +146,23 @@ struct vector: S { } template - KERNEL_FLOAT_INLINE vector, E> map(F fun = {}) const { + KERNEL_FLOAT_INLINE vector, E> map(F fun) const { return kernel_float::map(fun, *this); } template - KERNEL_FLOAT_INLINE T reduce(F fun = {}) const { + KERNEL_FLOAT_INLINE T reduce(F fun) const { return kernel_float::reduce(fun, *this); } KERNEL_FLOAT_INLINE flatten_type flatten() const { return kernel_float::flatten(*this); } + + template + KERNEL_FLOAT_INLINE void for_each(F fun) const { + return kernel_float::for_each(*this, std::move(fun)); + } }; #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index bf19962..2f7a557 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-08-08 18:25:16.753085 -// git hash: 2405950e91d545eda4affdfad84336d4247a3553 +// date: 2023-08-14 12:28:08.921323 +// git hash: 2ce8b7dabc15c263791b6ad736915c5a724f1d35 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -830,15 +830,93 @@ struct promote_type> { namespace kernel_float { +/** + * Apply the function fun for each element from input. + * + * ``` + * for_each(range(), [&](auto i) { + * printf("element: %d\n", i); + * }); + * ``` + */ +template +void for_each(V&& input, F fun) { + auto storage = into_vector_storage(input); + +#pragma unroll + for (size_t i = 0; i < vector_extent; i++) { + fun(storage.data()[i]); + } +} + +namespace detail { +template +struct range_helper { + KERNEL_FLOAT_INLINE + static vector_storage call() { + vector_storage result; + +#pragma unroll + for (size_t i = 0; i < N; i++) { + result.data()[i] = T(i); + } + + return result; + } +}; +} // namespace detail + +/** + * Generate vector consisting of the numbers 0...N-1 of type T + * + * ``` + * // Returns [0, 1, 2] + * vector vec = range(); + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> range() { + return detail::range_helper::call(); +} + +/** + * Takes a vector of size ``N`` and element type ``T`` and returns a new vector consisting of the numbers ``0...N-1`` + * of type ``T`` + * + * ``` + * // Returns [0.0f, 1.0f, 2.0f] + * vector vec = range(); + * ``` + */ +template +KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { + return detail::range_helper, vector_extent>::call(); +} + +/** + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1`` of type ``size_t`` + * + * ``` + * // Returns [0, 1, 2] + * vector vec = enumerate(float3(6, 4, 2)); + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { + return detail::range_helper>::call(); +} + namespace detail { template, size_t N = vector_extent> struct flatten_helper { using value_type = typename flatten_helper::value_type; static constexpr size_t size = N * flatten_helper::size; + KERNEL_FLOAT_INLINE static void call(const V& input, value_type* output) { vector_storage storage = into_vector_storage(input); +#pragma unroll for (size_t i = 0; i < N; i++) { flatten_helper::call(storage.data()[i], output + flatten_helper::size * i); } @@ -850,6 +928,7 @@ struct flatten_helper { using value_type = T; static constexpr size_t size = 1; + KERNEL_FLOAT_INLINE static void call(const T& input, T* output) { *output = input; } @@ -866,7 +945,7 @@ template using flatten_type = vector, extent>>; template -flatten_type flatten(const V& input) { +KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { vector_storage, flatten_size> output; detail::flatten_helper::call(input, output.data()); return output; @@ -1552,6 +1631,274 @@ struct cast, R, m> { } // namespace kernel_float #endif +#ifndef KERNEL_FLOAT_MEMORY_H +#define KERNEL_FLOAT_MEMORY_H + +/* + + + + +namespace kernel_float { + + namespace detail { + template > + struct load_helper; + + template + struct load_helper> { + KERNEL_FLOAT_INLINE + vector_storage call( + T* base, + vector_storage offsets + ) { + return {base[offsets.data()[Is]]...}; + } + + KERNEL_FLOAT_INLINE + vector_storage call( + T* base, + vector_storage offsets, + vector_storage mask + ) { + if (all(mask)) { + return call(base, offsets); + } else { + return { + (mask.data()[Is] ? base[offsets.data()[Is]] : T())... 
+ }; + } + } + }; + } + + template < + typename T, + typename I, + typename M, + typename E = broadcast_vector_extent_type + > + KERNEL_FLOAT_INLINE + vector load(const T* ptr, const I& indices, const M& mask) { + static constexpr E new_size = {}; + + return detail::load_helper::call( + ptr, + convert_storage(indices, new_size), + convert_storage(mask, new_size) + ); + } + + template + KERNEL_FLOAT_INLINE + vector> load(const T* ptr, const I& indices) { + return detail::load_helper::value>::call( + ptr, + cast(indices) + ); + } + + template + KERNEL_FLOAT_INLINE + vector> load(const T* ptr, ptrdiff_t length) { + using index_type = vector_value_type; + return load_masked(ptr, range(), range() < length); + } + + template + KERNEL_FLOAT_INLINE + vector> load(const T* ptr) { + return load(ptr, range()); + } + + namespace detail { + template + struct store_helper { + KERNEL_FLOAT_INLINE + vector_storage call( + T* base, + vector_storage offsets, + vector_storage mask, + vector_storage values + ) { + for (size_t i = 0; i < N; i++) { + if (mask.data()[i]) { + base[offset.data()[i]] = values.data()[i]; + } + } + } + + KERNEL_FLOAT_INLINE + vector_storage call( + T* base, + vector_storage offsets, + vector_storage values + ) { + for (size_t i = 0; i < N; i++) { + base[offset.data()[i]] = values.data()[i]; + } + } + }; + } + + template < + typename T, + typename I, + typename M, + typename V, + typename E = broadcast_extent, broadcast_vector_extent_type>> + > + KERNEL_FLOAT_INLINE + void store(const T* ptr, const I& indices, const M& mask, const V& values) { + static constexpr E new_size = {}; + + return detail::store_helper::call( + ptr, + convert_storage(indices, new_size), + convert_storage(mask, new_size), + convert_storage(values, new_size) + ); + } + + template < + typename T, + typename I, + typename V, + typename E = broadcast_vector_extent_type + > + KERNEL_FLOAT_INLINE + void store(const T* ptr, const I& indices, const V& values) { + static constexpr E new_size = {}; + + return detail::store_helper::call( + ptr, + convert_storage(indices, new_size), + convert_storage(values, new_size) + ); + } + + + template < + typename T, + typename V + > + KERNEL_FLOAT_INLINE + void store(const T* ptr, const V& values) { + using E = vector_extent; + return store(ptr, range(), values); + } + + template + KERNEL_FLOAT_INLINE + void store(const T* ptr, const I& indices, const S& length, const V& values) { + using index_type = vector_value_type; + return store(ptr, indices, (indices >= I(0)) & (indices < length), values); + } + + + template + struct aligned_ptr_base { + static_assert(alignof(T) % alignment == 0, "invalid alignment, must be multiple of alignment of `T`"); + + KERNEL_FLOAT_INLINE + aligned_ptr_base(): ptr_(nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr_base(T* ptr): ptr_(ptr) {} + + KERNEL_FLOAT_INLINE + T* get() const { + // TOOD: check if this way is support across all compilers +#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) + return __builtin_assume_aligned(ptr_, alignment); +#else + return ptr_; +#endif + } + + KERNEL_FLOAT_INLINE + operator T*() const { + return get(); + } + + KERNEL_FLOAT_INLINE + T& operator*() const { + return *get(); + } + + template + KERNEL_FLOAT_INLINE + T& operator[](I index) const { + return get()[index); + } + + private: + T* ptr_ = nullptr; + }; + + template + struct aligned_ptr; + + template + struct aligned_ptr: aligned_ptr_base { + using base_type = aligned_ptr_base; + + KERNEL_FLOAT_INLINE + aligned_ptr(): 
base_type(nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr): base_type(ptr) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(aligned_ptr ptr): base_type(ptr.get()) {} + }; + + template + struct aligned_ptr: aligned_ptr_base { + using base_type = aligned_ptr_base; + + KERNEL_FLOAT_INLINE + aligned_ptr(): base_type(nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr): base_type(ptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(const T* ptr): base_type(ptr) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(aligned_ptr ptr): base_type(ptr.get()) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(aligned_ptr ptr): base_type(ptr.get()) {} + }; + + + template + KERNEL_FLOAT_INLINE + T* operator+(aligned_ptr ptr, ptrdiff_t index) { + return ptr.get() + index; + } + + template + KERNEL_FLOAT_INLINE + T* operator+(ptrdiff_t index, aligned_ptr ptr) { + return ptr.get() + index; + } + + template + KERNEL_FLOAT_INLINE + ptrdiff_t operator-(aligned_ptr left, aligned_ptr right) { + return left.get() - right.get(); + } + + template + using unaligned_ptr = aligned_ptr; + +} +*/ + +#endif //KERNEL_FLOAT_MEMORY_H #ifndef KERNEL_FLOAT_REDUCE_H #define KERNEL_FLOAT_REDUCE_H @@ -1993,18 +2340,23 @@ struct vector: S { } template - KERNEL_FLOAT_INLINE vector, E> map(F fun = {}) const { + KERNEL_FLOAT_INLINE vector, E> map(F fun) const { return kernel_float::map(fun, *this); } template - KERNEL_FLOAT_INLINE T reduce(F fun = {}) const { + KERNEL_FLOAT_INLINE T reduce(F fun) const { return kernel_float::reduce(fun, *this); } KERNEL_FLOAT_INLINE flatten_type flatten() const { return kernel_float::flatten(*this); } + + template + KERNEL_FLOAT_INLINE void for_each(F fun) const { + return kernel_float::for_each(*this, std::move(fun)); + } }; #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ From b3e93f16bc6caaaf46571085c774d1af3dec8265 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 14 Aug 2023 13:10:23 +0200 Subject: [PATCH 20/50] Add cross product function --- include/kernel_float/binops.h | 28 ++++ include/kernel_float/memory.h | 268 ++++++++++++++++++++++++++++++++++ single_include/kernel_float.h | 32 +++- 3 files changed, 326 insertions(+), 2 deletions(-) create mode 100644 include/kernel_float/memory.h diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index af64ddd..74d7163 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -230,6 +230,34 @@ struct bit_xor { }; }; // namespace ops +namespace detail { +template +struct cross_helper { + KERNEL_FLOAT_INLINE + static vector call(const vector_storage& a, const vector_storage& b) { + vector v0 = {a[1], a[2], a[0], a[2], a[0], a[1]}; + vector v1 = {b[2], b[0], b[1], b[1], b[2], b[0]}; + vector r = v0 * v1; + + vector r0 = {r[0], r[1], r[2]}; + vector r1 = {r[3], r[4], r[5]}; + return r0 - r1; + } +}; +}; // namespace detail + +/** + * Calculates the cross-product between two vectors of length 3. 
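+ *
+ * Example
+ * =======
+ * ```
+ * // illustrative sketch
+ * vec<float, 3> a = {1.0f, 0.0f, 0.0f};
+ * vec<float, 3> b = {0.0f, 1.0f, 0.0f};
+ * vec<float, 3> c = cross(a, b);  // returns [0.0f, 0.0f, 1.0f]
+ * ```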
+ */ +template< + typename L, + typename R, + typename T = promoted_vector_value_type, + typename = enabled_t::value == 3>> +KERNEL_FLOAT_INLINE vector cross(const L& left, const R& right) { + return detail::cross_helper::call(convert_storage(left), convert_storage(right)); +} + } // namespace kernel_float #endif diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h new file mode 100644 index 0000000..3602e15 --- /dev/null +++ b/include/kernel_float/memory.h @@ -0,0 +1,268 @@ +#ifndef KERNEL_FLOAT_MEMORY_H +#define KERNEL_FLOAT_MEMORY_H + +/* +#include "binops.h" +#include "broadcast.h" +#include "iterate.h" + +namespace kernel_float { + + namespace detail { + template > + struct load_helper; + + template + struct load_helper> { + KERNEL_FLOAT_INLINE + vector_storage call( + T* base, + vector_storage offsets + ) { + return {base[offsets.data()[Is]]...}; + } + + KERNEL_FLOAT_INLINE + vector_storage call( + T* base, + vector_storage offsets, + vector_storage mask + ) { + if (all(mask)) { + return call(base, offsets); + } else { + return { + (mask.data()[Is] ? base[offsets.data()[Is]] : T())... + }; + } + } + }; + } + + template < + typename T, + typename I, + typename M, + typename E = broadcast_vector_extent_type + > + KERNEL_FLOAT_INLINE + vector load(const T* ptr, const I& indices, const M& mask) { + static constexpr E new_size = {}; + + return detail::load_helper::call( + ptr, + convert_storage(indices, new_size), + convert_storage(mask, new_size) + ); + } + + template + KERNEL_FLOAT_INLINE + vector> load(const T* ptr, const I& indices) { + return detail::load_helper::value>::call( + ptr, + cast(indices) + ); + } + + template + KERNEL_FLOAT_INLINE + vector> load(const T* ptr, ptrdiff_t length) { + using index_type = vector_value_type; + return load_masked(ptr, range(), range() < length); + } + + template + KERNEL_FLOAT_INLINE + vector> load(const T* ptr) { + return load(ptr, range()); + } + + namespace detail { + template + struct store_helper { + KERNEL_FLOAT_INLINE + vector_storage call( + T* base, + vector_storage offsets, + vector_storage mask, + vector_storage values + ) { + for (size_t i = 0; i < N; i++) { + if (mask.data()[i]) { + base[offset.data()[i]] = values.data()[i]; + } + } + } + + KERNEL_FLOAT_INLINE + vector_storage call( + T* base, + vector_storage offsets, + vector_storage values + ) { + for (size_t i = 0; i < N; i++) { + base[offset.data()[i]] = values.data()[i]; + } + } + }; + } + + template < + typename T, + typename I, + typename M, + typename V, + typename E = broadcast_extent, broadcast_vector_extent_type>> + > + KERNEL_FLOAT_INLINE + void store(const T* ptr, const I& indices, const M& mask, const V& values) { + static constexpr E new_size = {}; + + return detail::store_helper::call( + ptr, + convert_storage(indices, new_size), + convert_storage(mask, new_size), + convert_storage(values, new_size) + ); + } + + template < + typename T, + typename I, + typename V, + typename E = broadcast_vector_extent_type + > + KERNEL_FLOAT_INLINE + void store(const T* ptr, const I& indices, const V& values) { + static constexpr E new_size = {}; + + return detail::store_helper::call( + ptr, + convert_storage(indices, new_size), + convert_storage(values, new_size) + ); + } + + + template < + typename T, + typename V + > + KERNEL_FLOAT_INLINE + void store(const T* ptr, const V& values) { + using E = vector_extent; + return store(ptr, range(), values); + } + + template + KERNEL_FLOAT_INLINE + void store(const T* ptr, const I& indices, const S& length, const V& 
values) { + using index_type = vector_value_type; + return store(ptr, indices, (indices >= I(0)) & (indices < length), values); + } + + + template + struct aligned_ptr_base { + static_assert(alignof(T) % alignment == 0, "invalid alignment, must be multiple of alignment of `T`"); + + KERNEL_FLOAT_INLINE + aligned_ptr_base(): ptr_(nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr_base(T* ptr): ptr_(ptr) {} + + KERNEL_FLOAT_INLINE + T* get() const { + // TOOD: check if this way is support across all compilers +#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) + return __builtin_assume_aligned(ptr_, alignment); +#else + return ptr_; +#endif + } + + KERNEL_FLOAT_INLINE + operator T*() const { + return get(); + } + + KERNEL_FLOAT_INLINE + T& operator*() const { + return *get(); + } + + template + KERNEL_FLOAT_INLINE + T& operator[](I index) const { + return get()[index); + } + + private: + T* ptr_ = nullptr; + }; + + template + struct aligned_ptr; + + template + struct aligned_ptr: aligned_ptr_base { + using base_type = aligned_ptr_base; + + KERNEL_FLOAT_INLINE + aligned_ptr(): base_type(nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr): base_type(ptr) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(aligned_ptr ptr): base_type(ptr.get()) {} + }; + + template + struct aligned_ptr: aligned_ptr_base { + using base_type = aligned_ptr_base; + + KERNEL_FLOAT_INLINE + aligned_ptr(): base_type(nullptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(T* ptr): base_type(ptr) {} + + KERNEL_FLOAT_INLINE + explicit aligned_ptr(const T* ptr): base_type(ptr) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(aligned_ptr ptr): base_type(ptr.get()) {} + + KERNEL_FLOAT_INLINE + aligned_ptr(aligned_ptr ptr): base_type(ptr.get()) {} + }; + + + template + KERNEL_FLOAT_INLINE + T* operator+(aligned_ptr ptr, ptrdiff_t index) { + return ptr.get() + index; + } + + template + KERNEL_FLOAT_INLINE + T* operator+(ptrdiff_t index, aligned_ptr ptr) { + return ptr.get() + index; + } + + template + KERNEL_FLOAT_INLINE + ptrdiff_t operator-(aligned_ptr left, aligned_ptr right) { + return left.get() - right.get(); + } + + template + using unaligned_ptr = aligned_ptr; + +} +*/ + +#endif //KERNEL_FLOAT_MEMORY_H diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 2f7a557..3f5d818 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-08-14 12:28:08.921323 -// git hash: 2ce8b7dabc15c263791b6ad736915c5a724f1d35 +// date: 2023-08-14 13:10:09.230788 +// git hash: b012dbe70def9e6c7da70ee771d5be2304ddc53f //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -1563,6 +1563,34 @@ struct bit_xor { }; }; // namespace ops +namespace detail { +template +struct cross_helper { + KERNEL_FLOAT_INLINE + static vector call(const vector_storage& a, const vector_storage& b) { + vector v0 = {a[1], a[2], a[0], a[2], a[0], a[1]}; + vector v1 = {b[2], b[0], b[1], b[1], b[2], b[0]}; + vector r = v0 * v1; + + vector r0 = {r[0], r[1], r[2]}; + vector r1 = {r[3], r[4], r[5]}; + return r0 - r1; + } +}; +}; // namespace detail + +/** + * Calculates the cross-product between two vectors of length 3. 
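+ *
+ * Example
+ * =======
+ * ```
+ * // illustrative sketch
+ * vec<float, 3> a = {1.0f, 0.0f, 0.0f};
+ * vec<float, 3> b = {0.0f, 1.0f, 0.0f};
+ * vec<float, 3> c = cross(a, b);  // returns [0.0f, 0.0f, 1.0f]
+ * ```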
+ */
+template<
+    typename L,
+    typename R,
+    typename T = promoted_vector_value_type,
+    typename = enabled_t::value == 3>>
+KERNEL_FLOAT_INLINE vector cross(const L& left, const R& right) {
+    return detail::cross_helper::call(convert_storage(left), convert_storage(right));
+}
+
 } // namespace kernel_float

 #endif

From cc83e9218cc246853b9137cfb6835171f47bb5b0 Mon Sep 17 00:00:00 2001
From: stijn
Date: Mon, 14 Aug 2023 13:34:59 +0200
Subject: [PATCH 21/50] Improve how dot product is computed for 16bit floats

---
 include/kernel_float/bf16.h   |  45 ++++++++++-
 include/kernel_float/fp16.h   |  44 ++++++++++-
 include/kernel_float/reduce.h |  51 +++++++-----
 single_include/kernel_float.h | 144 +++++++++++++++++++++++++++-------
 4 files changed, 232 insertions(+), 52 deletions(-)

diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h
index 2f8e870..31ba9c2 100644
--- a/include/kernel_float/bf16.h
+++ b/include/kernel_float/bf16.h
@@ -54,7 +54,7 @@ struct apply_impl {
         vector_storage<__nv_bfloat16, N> result;

 #pragma unroll
-        for (size_t i = 0; i < N; i += 2) {
+        for (size_t i = 0; i + 2 <= N; i += 2) {
             __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]};
             __nv_bfloat162 b = map_bfloat16x2::call(fun, a);
             result.data()[i + 0] = b.x;
@@ -77,7 +77,7 @@ struct apply_impl {
         const vector_storage<__nv_bfloat16, N>& right) {
         vector_storage<__nv_bfloat16, N> result;
 #pragma unroll
-        for (size_t i = 0; i < N; i += 2) {
+        for (size_t i = 0; i + 2 <= N; i += 2) {
             __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]};
             __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]};
             __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b);
@@ -100,7 +100,7 @@ struct reduce_helper= 2)>> {
         __nv_bfloat162 accum = {input.data()[0], input.data()[1]};

 #pragma unroll
-        for (size_t i = 2; i < N; i += 2) {
+        for (size_t i = 2; i + 2 <= N; i += 2) {
             __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]};
             accum = zip_bfloat16x2::call(fun, accum, a);
         }
@@ -244,6 +244,45 @@ using bfloat16 = __nv_bfloat16;
 //KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16)
 //KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16)

+#if KERNEL_FLOAT_IS_DEVICE
+namespace detail {
+template
+struct dot_helper<__nv_bfloat16, N> {
+    KERNEL_FLOAT_INLINE
+    static __nv_bfloat16 call(
+        const vector_storage<__nv_bfloat16, N>& left,
+        const vector_storage<__nv_bfloat16, N>& right) {
+        if constexpr (N == 0) {
+            return __nv_bfloat16(0);
+        } else if constexpr (N == 1) {
+            return __hmul(left.data()[0], right.data()[0]);
+        } else {
+            __nv_bfloat162 first_a = {left.data()[0], left.data()[1]};
+            __nv_bfloat162 first_b = {right.data()[0], right.data()[1]};
+            __nv_bfloat162 accum = __hmul2(first_a, first_b);
+
+#pragma unroll
+            for (size_t i = 2; i + 2 <= N; i += 2) {
+                __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]};
+                __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]};
+                accum = __hfma2(a, b, accum);
+            }
+
+            __nv_bfloat16 result = __hadd(accum.x, accum.y);
+
+            if constexpr (N % 2 != 0) {
+                __nv_bfloat16 a = left.data()[N - 1];
+                __nv_bfloat16 b = right.data()[N - 1];
+                result = __hfma(a, b, result);
+            }
+
+            return result;
+        }
+    }
+};
+} // namespace detail
+#endif
+
 } // namespace kernel_float

 #if KERNEL_FLOAT_FP16_AVAILABLE
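A minimal device-side sketch of what the new `dot_helper` specialization enables (illustrative only; it assumes a CUDA device context with `cuda_bf16.h` available, and `dot_kernel` is a hypothetical name):

```
#include "kernel_float.h"
namespace kf = kernel_float;

// On the device, kf::dot over __nv_bfloat16 packs element pairs into
// __nv_bfloat162 and accumulates them with __hfma2; an odd trailing
// element is folded in with a scalar __hfma.
__global__ void dot_kernel(__nv_bfloat16* out) {
    kf::vec<__nv_bfloat16, 4> x = {
        __nv_bfloat16(1.0f), __nv_bfloat16(2.0f), __nv_bfloat16(3.0f), __nv_bfloat16(4.0f)};
    kf::vec<__nv_bfloat16, 4> y = {
        __nv_bfloat16(4.0f), __nv_bfloat16(3.0f), __nv_bfloat16(2.0f), __nv_bfloat16(1.0f)};
    *out = kf::dot(x, y);  // 1*4 + 2*3 + 3*2 + 4*1 = 20
}
```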
i += 2) { __half2 a = {input.data()[i], input.data()[i + 1]}; __half2 b = map_halfx2::call(fun, a); result.data()[i + 0] = b.x; @@ -73,7 +73,7 @@ struct apply_impl { call(F fun, const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { vector_storage<__half, N> result; #pragma unroll - for (size_t i = 0; i < N; i += 2) { + for (size_t i = 0; i + 2 <= N; i += 2) { __half2 a = {left.data()[i], left.data()[i + 1]}; __half2 b = {right.data()[i], right.data()[i + 1]}; __half2 c = zip_halfx2::call(fun, a, b); @@ -95,7 +95,7 @@ struct reduce_helper= 2)>> { __half2 accum = {input.data()[0], input.data()[1]}; #pragma unroll - for (size_t i = 2; i < N; i += 2) { + for (size_t i = 2; i + 2 <= N; i += 2) { __half2 a = {input.data()[i], input.data()[i + 1]}; accum = zip_halfx2::call(fun, accum, a); } @@ -225,6 +225,44 @@ using half = __half; //KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) +#if KERNEL_FLOAT_IS_DEVICE +namespace detail { +template +struct dot_helper<__half, N> { + KERNEL_FLOAT_INLINE + static __half + call(const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { + if constexpr (N == 0) { + return __half(0); + } else if constexpr (N == 1) { + return __hmul(left.data()[0], right.data()[0], ); + } else { + __half2 first_a = {left.data()[0], left.data()[1]}; + __half2 first_b = {right.data()[0], right.data()[1]}; + __half2 accum = __hmul2(first_a, first_b); + +#pragma unroll + for (size_t i = 2; i + 2 <= N; i += 2) { + __half2 a = {left.data()[i], left.data()[i + 1]}; + __half2 b = {right.data()[i], right.data()[i + 1]}; + accum = __hfma2(a, b, accum); + } + + __half result = __hadd(accum.x, accum.y); + + if constexpr (N % 2 != 0) { + __half a = left.data()[N - 1]; + __half b = right.data()[N - 1]); + result = __hfma(a, b, result); + } + + return result; + } + } +}; +} // namespace detail +#endif + } // namespace kernel_float #endif diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 3dc22ad..2def742 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -91,22 +91,6 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { return reduce(ops::add {}, input); } -/** - * Compute the dot product of the given vectors ``left`` and ``right`` - * - * Example - * ======= - * ``` - * vec x = {1, 2, 3}; - * vec y = {4, 5, 6}; - * int y = dot(x, y); // Returns 1*4+2*5+3*6 = 32 - * ``` - */ -template> -KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { - return reduce(ops::add {}, zip_common(ops::multiply {}, left, right)); -} - /** * Multiply the items in the given vector ``input``. 
* @@ -151,9 +135,38 @@ KERNEL_FLOAT_INLINE bool any(const V& input) { * int y = count(x); // Returns 3 (5, 2, 1 are non-zero) * ``` */ -template -KERNEL_FLOAT_INLINE int count(const V& input) { - return sum(cast(cast(input))); +template +KERNEL_FLOAT_INLINE T count(const V& input) { + return sum(cast(cast(input))); +} + +namespace detail { +template +struct dot_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& left, const vector_storage& right) { + return reduce(ops::add {}, zip(ops::multiply {}, left, right)); + } +}; +} // namespace detail + +/** + * Compute the dot product of the given vectors ``left`` and ``right`` + * + * Example + * ======= + * ``` + * vec x = {1, 2, 3}; + * vec y = {4, 5, 6}; + * int y = dot(x, y); // Returns 1*4+2*5+3*6 = 32 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { + using E = broadcast_vector_extent_type; + return detail::dot_helper::call( + convert_storage(left, E {}), + convert_storage(right, E {})); } } // namespace kernel_float diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 3f5d818..a2ed493 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-08-14 13:10:09.230788 -// git hash: b012dbe70def9e6c7da70ee771d5be2304ddc53f +// date: 2023-08-14 13:32:29.526813 +// git hash: b3e93f16bc6caaaf46571085c774d1af3dec8265 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -2020,22 +2020,6 @@ KERNEL_FLOAT_INLINE T sum(const V& input) { return reduce(ops::add {}, input); } -/** - * Compute the dot product of the given vectors ``left`` and ``right`` - * - * Example - * ======= - * ``` - * vec x = {1, 2, 3}; - * vec y = {4, 5, 6}; - * int y = dot(x, y); // Returns 1*4+2*5+3*6 = 32 - * ``` - */ -template> -KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { - return reduce(ops::add {}, zip_common(ops::multiply {}, left, right)); -} - /** * Multiply the items in the given vector ``input``. 
* @@ -2080,9 +2064,38 @@ KERNEL_FLOAT_INLINE bool any(const V& input) { * int y = count(x); // Returns 3 (5, 2, 1 are non-zero) * ``` */ -template -KERNEL_FLOAT_INLINE int count(const V& input) { - return sum(cast(cast(input))); +template +KERNEL_FLOAT_INLINE T count(const V& input) { + return sum(cast(cast(input))); +} + +namespace detail { +template +struct dot_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& left, const vector_storage& right) { + return reduce(ops::add {}, zip(ops::multiply {}, left, right)); + } +}; +} // namespace detail + +/** + * Compute the dot product of the given vectors ``left`` and ``right`` + * + * Example + * ======= + * ``` + * vec x = {1, 2, 3}; + * vec y = {4, 5, 6}; + * int y = dot(x, y); // Returns 1*4+2*5+3*6 = 32 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { + using E = broadcast_vector_extent_type; + return detail::dot_helper::call( + convert_storage(left, E {}), + convert_storage(right, E {})); } } // namespace kernel_float @@ -2521,7 +2534,7 @@ struct apply_impl { vector_storage<__half, N> result; #pragma unroll - for (size_t i = 0; i < N; i += 2) { + for (size_t i = 0; i + 2 <= N; i += 2) { __half2 a = {input.data()[i], input.data()[i + 1]}; __half2 b = map_halfx2::call(fun, a); result.data()[i + 0] = b.x; @@ -2542,7 +2555,7 @@ struct apply_impl { call(F fun, const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { vector_storage<__half, N> result; #pragma unroll - for (size_t i = 0; i < N; i += 2) { + for (size_t i = 0; i + 2 <= N; i += 2) { __half2 a = {left.data()[i], left.data()[i + 1]}; __half2 b = {right.data()[i], right.data()[i + 1]}; __half2 c = zip_halfx2::call(fun, a, b); @@ -2564,7 +2577,7 @@ struct reduce_helper= 2)>> { __half2 accum = {input.data()[0], input.data()[1]}; #pragma unroll - for (size_t i = 2; i < N; i += 2) { + for (size_t i = 2; i + 2 <= N; i += 2) { __half2 a = {input.data()[i], input.data()[i + 1]}; accum = zip_halfx2::call(fun, accum, a); } @@ -2694,6 +2707,44 @@ using half = __half; //KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) +#if KERNEL_FLOAT_IS_DEVICE +namespace detail { +template +struct dot_helper<__half, N> { + KERNEL_FLOAT_INLINE + static __half + call(const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { + if constexpr (N == 0) { + return __half(0); + } else if constexpr (N == 1) { + return __hmul(left.data()[0], right.data()[0], ); + } else { + __half2 first_a = {left.data()[0], left.data()[1]}; + __half2 first_b = {right.data()[0], right.data()[1]}; + __half2 accum = __hmul2(first_a, first_b); + +#pragma unroll + for (size_t i = 2; i + 2 <= N; i += 2) { + __half2 a = {left.data()[i], left.data()[i + 1]}; + __half2 b = {right.data()[i], right.data()[i + 1]}; + accum = __hfma2(a, b, accum); + } + + __half result = __hadd(accum.x, accum.y); + + if constexpr (N % 2 != 0) { + __half a = left.data()[N - 1]; + __half b = right.data()[N - 1]); + result = __hfma(a, b, result); + } + + return result; + } + } +}; +} // namespace detail +#endif + } // namespace kernel_float #endif @@ -2755,7 +2806,7 @@ struct apply_impl { vector_storage<__nv_bfloat16, N> result; #pragma unroll - for (size_t i = 0; i < N; i += 2) { + for (size_t i = 0; i + 2 <= N; i += 2) { __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]}; __nv_bfloat162 b = map_bfloat16x2::call(fun, a); result.data()[i + 0] = b.x; @@ -2778,7 +2829,7 @@ struct apply_impl { const vector_storage<__nv_bfloat16, N>& 
right) { vector_storage<__nv_bfloat16, N> result; #pragma unroll - for (size_t i = 0; i < N; i += 2) { + for (size_t i = 0; i + 2 <= N; i += 2) { __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b); @@ -2801,7 +2852,7 @@ struct reduce_helper= 2)>> { __nv_bfloat162 accum = {input.data()[0], input.data()[1]}; #pragma unroll - for (size_t i = 2; i < N; i += 2) { + for (size_t i = 2; i + 2 <= N; i += 2) { __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]}; accum = zip_bfloat16x2::call(fun, accum, a); } @@ -2945,6 +2996,45 @@ using bfloat16 = __nv_bfloat16; //KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) +#if KERNEL_FLOAT_IS_DEVICE +namespace detail { +template +struct dot_helper<__nv_bfloat16, N> { + KERNEL_FLOAT_INLINE + static __nv_bfloat16 call( + const vector_storage<__nv_bfloat16, N>& left, + const vector_storage<__nv_bfloat16, N>& right) { + if constexpr (N == 0) { + return __nv_bfloat16(0); + } else if constexpr (N == 1) { + return __hmul(left.data()[0], right.data()[0], ); + } else { + __nv_bfloat162 first_a = {left.data()[0], left.data()[1]}; + __nv_bfloat162 first_b = {right.data()[0], right.data()[1]}; + __nv_bfloat162 accum = __hmul2(first_a, first_b); + +#pragma unroll + for (size_t i = 2; i + 2 <= N; i += 2) { + __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; + __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; + accum = __hfma2(a, b, accum); + } + + __nv_bfloat16 result = __hadd(accum.x, accum.y); + + if constexpr (N % 2 != 0) { + __nv_bfloat16 a = left.data()[N - 1]; + __nv_bfloat16 b = right.data()[N - 1]); + result = __hfma(a, b, result); + } + + return result; + } + } +}; +} // namespace detail +#endif + } // namespace kernel_float #if KERNEL_FLOAT_FP16_AVAILABLE From d9efc31a72c17f7885f305176f2ab7e08e088e18 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 14 Aug 2023 13:39:09 +0200 Subject: [PATCH 22/50] Fix compilation error --- include/kernel_float/binops.h | 17 +++++++++-------- include/kernel_float/iterate.h | 2 +- single_include/kernel_float.h | 23 ++++++++++++----------- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 74d7163..169493c 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -234,13 +234,13 @@ namespace detail { template struct cross_helper { KERNEL_FLOAT_INLINE - static vector call(const vector_storage& a, const vector_storage& b) { - vector v0 = {a[1], a[2], a[0], a[2], a[0], a[1]}; - vector v1 = {b[2], b[0], b[1], b[1], b[2], b[0]}; - vector r = v0 * v1; + static vector> call(const vector_storage& a, const vector_storage& b) { + vector> v0 = {a[1], a[2], a[0], a[2], a[0], a[1]}; + vector> v1 = {b[2], b[0], b[1], b[1], b[2], b[0]}; + vector> r = v0 * v1; - vector r0 = {r[0], r[1], r[2]}; - vector r1 = {r[3], r[4], r[5]}; + vector> r0 = {r[0], r[1], r[2]}; + vector> r1 = {r[3], r[4], r[5]}; return r0 - r1; } }; @@ -253,8 +253,9 @@ template< typename L, typename R, typename T = promoted_vector_value_type, - typename = enabled_t::value == 3>> -KERNEL_FLOAT_INLINE vector cross(const L& left, const R& right) { + typename = + enabled_t> && is_vector_broadcastable>>> +KERNEL_FLOAT_INLINE vector> cross(const L& left, const R& right) { return detail::cross_helper::call(convert_storage(left), convert_storage(right)); } diff --git a/include/kernel_float/iterate.h 
b/include/kernel_float/iterate.h index b9b2f81..49691b0 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -77,7 +77,7 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { * ``` */ template -KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { +KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { return detail::range_helper>::call(); } diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index a2ed493..78cfb6a 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-08-14 13:32:29.526813 -// git hash: b3e93f16bc6caaaf46571085c774d1af3dec8265 +// date: 2023-08-14 13:38:46.811169 +// git hash: cc83e9218cc246853b9137cfb6835171f47bb5b0 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -902,7 +902,7 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { * ``` */ template -KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { +KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { return detail::range_helper>::call(); } @@ -1567,13 +1567,13 @@ namespace detail { template struct cross_helper { KERNEL_FLOAT_INLINE - static vector call(const vector_storage& a, const vector_storage& b) { - vector v0 = {a[1], a[2], a[0], a[2], a[0], a[1]}; - vector v1 = {b[2], b[0], b[1], b[1], b[2], b[0]}; - vector r = v0 * v1; + static vector> call(const vector_storage& a, const vector_storage& b) { + vector> v0 = {a[1], a[2], a[0], a[2], a[0], a[1]}; + vector> v1 = {b[2], b[0], b[1], b[1], b[2], b[0]}; + vector> r = v0 * v1; - vector r0 = {r[0], r[1], r[2]}; - vector r1 = {r[3], r[4], r[5]}; + vector> r0 = {r[0], r[1], r[2]}; + vector> r1 = {r[3], r[4], r[5]}; return r0 - r1; } }; @@ -1586,8 +1586,9 @@ template< typename L, typename R, typename T = promoted_vector_value_type, - typename = enabled_t::value == 3>> -KERNEL_FLOAT_INLINE vector cross(const L& left, const R& right) { + typename = + enabled_t> && is_vector_broadcastable>>> +KERNEL_FLOAT_INLINE vector> cross(const L& left, const R& right) { return detail::cross_helper::call(convert_storage(left), convert_storage(right)); } From 3c3059aeb5ebc8f951b0d91b761c02cd0bb7a539 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 14 Aug 2023 14:47:19 +0200 Subject: [PATCH 23/50] Add functions to calculate magnitude --- include/kernel_float/complex.h | 6 +-- include/kernel_float/iterate.h | 6 +-- include/kernel_float/reduce.h | 73 +++++++++++++++++++++++++++- single_include/kernel_float.h | 89 ++++++++++++++++++++++++++++++---- 4 files changed, 158 insertions(+), 16 deletions(-) diff --git a/include/kernel_float/complex.h b/include/kernel_float/complex.h index a2d2062..37dbdfc 100644 --- a/include/kernel_float/complex.h +++ b/include/kernel_float/complex.h @@ -141,7 +141,7 @@ KERNEL_FLOAT_INLINE complex_type operator*(T a, complex_type b) { template KERNEL_FLOAT_INLINE complex_type operator/(complex_type a, complex_type b) { - T normi = 1 / b.norm(); + T normi = T(1) / b.norm(); return { (a.real() * b.real() + a.imag() * b.imag()) * normi, @@ -150,12 +150,12 @@ KERNEL_FLOAT_INLINE complex_type operator/(complex_type a, complex_type template KERNEL_FLOAT_INLINE complex_type operator/(complex_type a, T b) { - return {a.real() * (1 / b), a.imag() * (1 / b)}; + return a * (T(1) / b); } template KERNEL_FLOAT_INLINE 
complex_type operator/(T a, complex_type b) { - T normi = 1 / b.norm(); + T normi = T(1) / b.norm(); return {a * b.real() * normi, -a * b.imag() * normi}; } diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 49691b0..3fb0ffd 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -76,9 +76,9 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { * vector vec = enumerate(float3(6, 4, 2)); * ``` */ -template -KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { - return detail::range_helper>::call(); +template +KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { + return detail::range_helper>::call(); } namespace detail { diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 2def742..c8708e4 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -145,7 +145,7 @@ template struct dot_helper { KERNEL_FLOAT_INLINE static T call(const vector_storage& left, const vector_storage& right) { - return reduce(ops::add {}, zip(ops::multiply {}, left, right)); + return sum(zip(ops::multiply {}, left, right)); } }; } // namespace detail @@ -170,4 +170,75 @@ KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { } } // namespace kernel_float +namespace detail { +template +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& input) { + return ops::sqrt {}(detail::dot_helper::call(input, input)); + } +}; + +template +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& input) { + return T {}; + } +}; + +template +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& input) { + return ops::abs {}(input); + } +}; + +template +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& input) { + return ops::hypot {}(input[0], input[1]); + } +}; + +// The 3-argument overload of hypot is only available from C++17 +#ifdef __cpp_lib_hypot +template<> +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static float call(const vector_storage& input) { + return std::hypot(input[0], input[1], input[2]); + } +}; + +template<> +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static float call(const vector_storage& input) { + return std::hypot(input[0], input[1], input[2]); + } +}; +#endif + +} // namespace detail + +/** + * Compute the magnitude of the given input vector. This calculates the square root of the sum of squares, also + * known as the Euclidian norm of the vector. + * + * Example + * ======= + * ``` + * vec x = {2, 3, 6}; + * float y = mag(x); // Returns sqrt(2*2 + 3*3 + 6*6) = 7 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T mag(const V& input) { + return detail::magnitude_helper>::call(into_vector_storage(input)); +} +} // namespace kernel_float + #endif //KERNEL_FLOAT_REDUCE_H diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 78cfb6a..2b1c003 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-08-14 13:38:46.811169 -// git hash: cc83e9218cc246853b9137cfb6835171f47bb5b0 +// date: 2023-08-14 14:47:10.123460 +// git hash: d9efc31a72c17f7885f305176f2ab7e08e088e18 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -706,7 +706,7 @@ KERNEL_FLOAT_INLINE complex_type operator*(T a, complex_type b) { template KERNEL_FLOAT_INLINE complex_type operator/(complex_type a, complex_type b) { - T normi = 1 / b.norm(); + T normi = T(1) / b.norm(); return { (a.real() * b.real() + a.imag() * b.imag()) * normi, @@ -715,12 +715,12 @@ KERNEL_FLOAT_INLINE complex_type operator/(complex_type a, complex_type template KERNEL_FLOAT_INLINE complex_type operator/(complex_type a, T b) { - return {a.real() * (1 / b), a.imag() * (1 / b)}; + return a * (T(1) / b); } template KERNEL_FLOAT_INLINE complex_type operator/(T a, complex_type b) { - T normi = 1 / b.norm(); + T normi = T(1) / b.norm(); return {a * b.real() * normi, -a * b.imag() * normi}; } @@ -901,9 +901,9 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { * vector vec = enumerate(float3(6, 4, 2)); * ``` */ -template -KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { - return detail::range_helper>::call(); +template +KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { + return detail::range_helper>::call(); } namespace detail { @@ -2075,7 +2075,7 @@ template struct dot_helper { KERNEL_FLOAT_INLINE static T call(const vector_storage& left, const vector_storage& right) { - return reduce(ops::add {}, zip(ops::multiply {}, left, right)); + return sum(zip(ops::multiply {}, left, right)); } }; } // namespace detail @@ -2100,6 +2100,77 @@ KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { } } // namespace kernel_float +namespace detail { +template +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& input) { + return ops::sqrt {}(detail::dot_helper::call(input, input)); + } +}; + +template +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& input) { + return T {}; + } +}; + +template +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& input) { + return ops::abs {}(input); + } +}; + +template +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static T call(const vector_storage& input) { + return ops::hypot {}(input[0], input[1]); + } +}; + +// The 3-argument overload of hypot is only available from C++17 +#ifdef __cpp_lib_hypot +template<> +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static float call(const vector_storage& input) { + return std::hypot(input[0], input[1], input[2]); + } +}; + +template<> +struct magnitude_helper { + KERNEL_FLOAT_INLINE + static float call(const vector_storage& input) { + return std::hypot(input[0], input[1], input[2]); + } +}; +#endif + +} // namespace detail + +/** + * Compute the magnitude of the given input vector. This calculates the square root of the sum of squares, also + * known as the Euclidian norm of the vector. 
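+ * For small inputs the helper specializations above apply directly: an empty vector yields zero, a single
+ * element yields its absolute value, and two elements are combined with `hypot`.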
+ * + * Example + * ======= + * ``` + * vec x = {2, 3, 6}; + * float y = mag(x); // Returns sqrt(2*2 + 3*3 + 6*6) = 7 + * ``` + */ +template> +KERNEL_FLOAT_INLINE T mag(const V& input) { + return detail::magnitude_helper>::call(into_vector_storage(input)); +} +} // namespace kernel_float + #endif //KERNEL_FLOAT_REDUCE_H #ifndef KERNEL_FLOAT_TRIOPS_H #define KERNEL_FLOAT_TRIOPS_H From 9b71242d6c1cd9c4f8f6309824fe0a774bf9719d Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 14 Aug 2023 15:39:39 +0200 Subject: [PATCH 24/50] Fix several compilation issues --- include/kernel_float/bf16.h | 12 ++++----- include/kernel_float/binops.h | 12 ++++++--- include/kernel_float/fp16.h | 12 ++++----- include/kernel_float/reduce.h | 7 +++-- single_include/kernel_float.h | 49 +++++++++++++++++++---------------- 5 files changed, 49 insertions(+), 43 deletions(-) diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 31ba9c2..2b6fa3b 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -252,10 +252,10 @@ struct dot_helper<__nv_bfloat16, N> { static __nv_bfloat16 call( const vector_storage<__nv_bfloat16, N>& left, const vector_storage<__nv_bfloat16, N>& right) { - if constexpr (N == 0) { + if (N == 0) { return __nv_bfloat16(0); - } else if constexpr (N == 1) { - return __hmul(left.data()[0], right.data()[0], ); + } else if (N == 1) { + return __hmul(left.data()[0], right.data()[0]); } else { __nv_bfloat162 first_a = {left.data()[0], left.data()[1]}; __nv_bfloat162 first_b = {right.data()[0], right.data()[1]}; @@ -270,10 +270,10 @@ struct dot_helper<__nv_bfloat16, N> { __nv_bfloat16 result = __hadd(accum.x, accum.y); - if constexpr (N % 2 != 0) { + if (N % 2 != 0) { __nv_bfloat16 a = left.data()[N - 1]; - __nv_bfloat16 b = right.data()[N - 1]); - result = __hfma(a, b, result); + __nv_bfloat16 b = right.data()[N - 1]; + result = __hfma(a, b, result); } return result; diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 169493c..a4f0d9b 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -24,8 +24,8 @@ KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) return detail::apply_impl::call( fun, - broadcast(left).storage(), - broadcast(right).storage()); + detail::broadcast_impl, E>::call(into_vector_storage(left)), + detail::broadcast_impl, E>::call(into_vector_storage(right))); } template @@ -234,11 +234,15 @@ namespace detail { template struct cross_helper { KERNEL_FLOAT_INLINE - static vector> call(const vector_storage& a, const vector_storage& b) { + static vector> + call(const vector_storage& av, const vector_storage& bv) { + auto a = av.data(); + auto b = bv.data(); vector> v0 = {a[1], a[2], a[0], a[2], a[0], a[1]}; vector> v1 = {b[2], b[0], b[1], b[1], b[2], b[0]}; - vector> r = v0 * v1; + vector> rv = v0 * v1; + auto r = rv.data(); vector> r0 = {r[0], r[1], r[2]}; vector> r1 = {r[3], r[4], r[5]}; return r0 - r1; diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index 17adda4..f80978b 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -232,10 +232,10 @@ struct dot_helper<__half, N> { KERNEL_FLOAT_INLINE static __half call(const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { - if constexpr (N == 0) { + if (N == 0) { return __half(0); - } else if constexpr (N == 1) { - return __hmul(left.data()[0], right.data()[0], ); + } else if (N == 1) { + return __hmul(left.data()[0], right.data()[0]); } else { __half2 first_a = 
{left.data()[0], left.data()[1]}; __half2 first_b = {right.data()[0], right.data()[1]}; @@ -250,10 +250,10 @@ struct dot_helper<__half, N> { __half result = __hadd(accum.x, accum.y); - if constexpr (N % 2 != 0) { + if (N % 2 != 0) { __half a = left.data()[N - 1]; - __half b = right.data()[N - 1]); - result = __hfma(a, b, result); + __half b = right.data()[N - 1]; + result = __hfma(a, b, result); } return result; diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index c8708e4..8b8da51 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -168,7 +168,6 @@ KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { convert_storage(left, E {}), convert_storage(right, E {})); } -} // namespace kernel_float namespace detail { template @@ -199,7 +198,7 @@ template struct magnitude_helper { KERNEL_FLOAT_INLINE static T call(const vector_storage& input) { - return ops::hypot {}(input[0], input[1]); + return ops::hypot {}(input.data()[0], input.data()[1]); } }; @@ -209,7 +208,7 @@ template<> struct magnitude_helper { KERNEL_FLOAT_INLINE static float call(const vector_storage& input) { - return std::hypot(input[0], input[1], input[2]); + return std::hypot(input.data()[0], input.data()[1], input.data()[2]); } }; @@ -217,7 +216,7 @@ template<> struct magnitude_helper { KERNEL_FLOAT_INLINE static float call(const vector_storage& input) { - return std::hypot(input[0], input[1], input[2]); + return std::hypot(input.data()[0], input.data()[1], input.data()[2]); } }; #endif diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 2b1c003..e6f915b 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-08-14 14:47:10.123460 -// git hash: d9efc31a72c17f7885f305176f2ab7e08e088e18 +// date: 2023-08-14 15:47:42.801950 +// git hash: d13ee37ff80691e77dab5f71cf27600dbdad6f2f //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -1357,8 +1357,8 @@ KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) return detail::apply_impl::call( fun, - broadcast(left).storage(), - broadcast(right).storage()); + detail::broadcast_impl, E>::call(into_vector_storage(left)), + detail::broadcast_impl, E>::call(into_vector_storage(right))); } template @@ -1567,11 +1567,15 @@ namespace detail { template struct cross_helper { KERNEL_FLOAT_INLINE - static vector> call(const vector_storage& a, const vector_storage& b) { + static vector> + call(const vector_storage& av, const vector_storage& bv) { + auto a = av.data(); + auto b = bv.data(); vector> v0 = {a[1], a[2], a[0], a[2], a[0], a[1]}; vector> v1 = {b[2], b[0], b[1], b[1], b[2], b[0]}; - vector> r = v0 * v1; + vector> rv = v0 * v1; + auto r = rv.data(); vector> r0 = {r[0], r[1], r[2]}; vector> r1 = {r[3], r[4], r[5]}; return r0 - r1; @@ -2098,7 +2102,6 @@ KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { convert_storage(left, E {}), convert_storage(right, E {})); } -} // namespace kernel_float namespace detail { template @@ -2129,7 +2132,7 @@ template struct magnitude_helper { KERNEL_FLOAT_INLINE static T call(const vector_storage& input) { - return ops::hypot {}(input[0], input[1]); + return ops::hypot {}(input.data()[0], input.data()[1]); } }; @@ -2139,15 +2142,15 @@ template<> struct magnitude_helper { KERNEL_FLOAT_INLINE static float call(const vector_storage& input) { - return std::hypot(input[0], input[1], input[2]); + return std::hypot(input.data()[0], input.data()[1], input.data()[2]); } }; template<> struct magnitude_helper { KERNEL_FLOAT_INLINE - static float call(const vector_storage& input) { - return std::hypot(input[0], input[1], input[2]); + static float call(const vector_storage& input) { + return std::hypot(input.data()[0], input.data()[1], input.data()[2]); } }; #endif @@ -2786,10 +2789,10 @@ struct dot_helper<__half, N> { KERNEL_FLOAT_INLINE static __half call(const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { - if constexpr (N == 0) { + if (N == 0) { return __half(0); - } else if constexpr (N == 1) { - return __hmul(left.data()[0], right.data()[0], ); + } else if (N == 1) { + return __hmul(left.data()[0], right.data()[0]); } else { __half2 first_a = {left.data()[0], left.data()[1]}; __half2 first_b = {right.data()[0], right.data()[1]}; @@ -2804,10 +2807,10 @@ struct dot_helper<__half, N> { __half result = __hadd(accum.x, accum.y); - if constexpr (N % 2 != 0) { + if (N % 2 != 0) { __half a = left.data()[N - 1]; - __half b = right.data()[N - 1]); - result = __hfma(a, b, result); + __half b = right.data()[N - 1]; + result = __hfma(a, b, result); } return result; @@ -3076,10 +3079,10 @@ struct dot_helper<__nv_bfloat16, N> { static __nv_bfloat16 call( const vector_storage<__nv_bfloat16, N>& left, const vector_storage<__nv_bfloat16, N>& right) { - if constexpr (N == 0) { + if (N == 0) { return __nv_bfloat16(0); - } else if constexpr (N == 1) { - return __hmul(left.data()[0], right.data()[0], ); + } else if (N == 1) { + return __hmul(left.data()[0], right.data()[0]); } else { __nv_bfloat162 first_a = {left.data()[0], left.data()[1]}; __nv_bfloat162 first_b = {right.data()[0], right.data()[1]}; @@ -3094,10 +3097,10 @@ 
struct dot_helper<__nv_bfloat16, N> { __nv_bfloat16 result = __hadd(accum.x, accum.y); - if constexpr (N % 2 != 0) { + if (N % 2 != 0) { __nv_bfloat16 a = left.data()[N - 1]; - __nv_bfloat16 b = right.data()[N - 1]); - result = __hfma(a, b, result); + __nv_bfloat16 b = right.data()[N - 1]; + result = __hfma(a, b, result); } return result; From e1568c097af1d63ac20eab4032da36b68d7fb431 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 15 Aug 2023 12:29:25 +0200 Subject: [PATCH 25/50] Wrote documentation for many functions --- docs/api.rst | 3 +- docs/build_api.py | 20 +- include/kernel_float.h | 2 +- include/kernel_float/base.h | 58 +- include/kernel_float/binops.h | 23 +- include/kernel_float/constant.h | 26 +- .../{broadcast.h => conversion.h} | 234 +++++-- include/kernel_float/iterate.h | 47 +- include/kernel_float/memory.h | 2 +- include/kernel_float/prelude.h | 8 + include/kernel_float/triops.h | 8 +- include/kernel_float/unops.h | 79 +-- include/kernel_float/vector.h | 222 +++--- single_include/kernel_float.h | 653 +++++++++++++----- 14 files changed, 978 insertions(+), 407 deletions(-) rename include/kernel_float/{broadcast.h => conversion.h} (68%) diff --git a/docs/api.rst b/docs/api.rst index 4e5176e..85b407a 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -3,9 +3,10 @@ API Reference .. toctree:: api/types.rst api/primitives.rst + api/generation.rst api/unary_operators.rst api/binary_operators.rst api/reductions.rst - api/shuffling.rst api/mathematical.rst api/conditional.rst + diff --git a/docs/build_api.py b/docs/build_api.py index 10c4a01..eba749c 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -76,8 +76,6 @@ def build_index_page(groups): ("Aliases", aliases, "typedef"), ], "Primitives": [ - ("range", "range()"), - ("range", "range(F)"), "map", "reduce", "zip", @@ -87,6 +85,18 @@ def build_index_page(groups): "convert", "for_each", ], + "Generation": [ + "make_vec", + "range", + "range_like", + "each_index", + "fill", + "fill_like", + "zeros", + "zeros_like", + "ones", + "ones_like", + ], "Shuffling": [ # "concat", # "swizzle", @@ -97,12 +107,6 @@ def build_index_page(groups): # "rotate_right", ], "Unary Operators": [ - "fill", - "fill_like", - "zeros", - "zeros_like", - "ones", - "ones_like", "negate", "bit_not", "logical_not", diff --git a/include/kernel_float.h b/include/kernel_float.h index 744d499..f2b796d 100644 --- a/include/kernel_float.h +++ b/include/kernel_float.h @@ -4,7 +4,7 @@ #include "kernel_float/base.h" #include "kernel_float/bf16.h" #include "kernel_float/binops.h" -#include "kernel_float/broadcast.h" +#include "kernel_float/conversion.h" #include "kernel_float/fp16.h" #include "kernel_float/iterate.h" #include "kernel_float/macros.h" diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h index c194186..20a4735 100644 --- a/include/kernel_float/base.h +++ b/include/kernel_float/base.h @@ -163,8 +163,11 @@ static constexpr size_t compute_max_alignment(size_t total_size, size_t min_alig template using vector_storage = aligned_array; +template +struct extent; + template -struct extent { +struct extent { static constexpr size_t value = N; static constexpr size_t size = N; }; @@ -203,8 +206,54 @@ struct into_vector_traits> { } }; -template -struct vector_traits; +#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ + template<> \ + struct into_vector_traits<::T2> { \ + using value_type = T; \ + using extent_type = extent<2>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T2 v) { \ + return {v.x, v.y}; \ + } \ + }; \ + \ 
+ template<> \ + struct into_vector_traits<::T3> { \ + using value_type = T; \ + using extent_type = extent<3>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T3 v) { \ + return {v.x, v.y, v.z}; \ + } \ + }; \ + \ + template<> \ + struct into_vector_traits<::T4> { \ + using value_type = T; \ + using extent_type = extent<4>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T4 v) { \ + return {v.x, v.y, v.z, v.w}; \ + } \ + }; + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) template> struct vector; @@ -220,6 +269,9 @@ struct into_vector_traits> { } }; +template +struct vector_traits; + template struct vector_traits> { using value_type = T; diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index a4f0d9b..378df5a 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -1,7 +1,7 @@ #ifndef KERNEL_FLOAT_BINOPS_H #define KERNEL_FLOAT_BINOPS_H -#include "broadcast.h" +#include "conversion.h" #include "unops.h" namespace kernel_float { @@ -14,6 +14,14 @@ using zip_type = vector< /** * Combines the elements from the two inputs (`left` and `right`) element-wise, applying a provided binary * function (`fun`) to each pair of corresponding elements. + * + * Example + * ======= + * ``` + * vec make_negative = {true, false, true}; + * vec input = {1, 2, 3}; + * vec output = zip([](bool b, int n){ return b ? -n : +n; }, make_negative, input); // returns [-1, 2, -3] + * ``` */ template KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) { @@ -34,8 +42,17 @@ using zip_common_type = vector< broadcast_vector_extent_type>; /** - * Similar to `zip`, except `zip_common` promotes the element types of the inputs to a common type before applying the - * binary function. + * Combines the elements from the two inputs (`left` and `right`) element-wise, applying a provided binary + * function (`fun`) to each pair of corresponding elements. The elements are promoted to a common type before applying + * the binary function. 
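+ * The usual broadcast rules apply here: a scalar or length-1 input is expanded to the extent of the other
+ * argument before the function is applied.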
+ * + * Example + * ======= + * ``` + * vec a = {1.0f, 2.0f, 3.0f}; + * vec b = {4, 5, 6}; + * vec c = zip_common([](float x, float y){ return x + y; }, a, b); // returns [5.0f, 7.0f, 9.0f] + * ``` */ template KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, const R& right) { diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h index 4fe7f35..5ee2fae 100644 --- a/include/kernel_float/constant.h +++ b/include/kernel_float/constant.h @@ -1,7 +1,8 @@ #ifndef KERNEL_FLOAT_CONSTANT #define KERNEL_FLOAT_CONSTANT -#include "broadcast.h" +#include "base.h" +#include "conversion.h" namespace kernel_float { @@ -60,6 +61,29 @@ struct cast, R, m> { }; } // namespace ops +#define KERNEL_FLOAT_CONSTANT_DEFINE_OP(OP) \ + template \ + R operator OP(const constant& left, const R& right) { \ + using T = vector_value_type; \ + return operator OP(T(left.get()), right); \ + } \ + \ + template \ + L operator OP(const L& left, const constant& right) { \ + using T = vector_value_type; \ + return operator OP(left, T(right.get())); \ + } \ + \ + template> \ + constant operator OP(const constant& left, const constant& right) { \ + return constant(operator OP(T(left.get()), T(right.get()))); \ + } + +KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) + } // namespace kernel_float #endif diff --git a/include/kernel_float/broadcast.h b/include/kernel_float/conversion.h similarity index 68% rename from include/kernel_float/broadcast.h rename to include/kernel_float/conversion.h index 5aa9438..ce5553b 100644 --- a/include/kernel_float/broadcast.h +++ b/include/kernel_float/conversion.h @@ -5,11 +5,67 @@ #include "unops.h" namespace kernel_float { + +enum struct RoundingMode { ANY, DOWN, UP, NEAREST, TOWARD_ZERO }; + +namespace ops { +template +struct cast; + +template +struct cast { + KERNEL_FLOAT_INLINE R operator()(T input) noexcept { + return R(input); + } +}; + +template +struct cast { + KERNEL_FLOAT_INLINE T operator()(T input) noexcept { + return input; + } +}; + +template +struct cast { + KERNEL_FLOAT_INLINE T operator()(T input) noexcept { + return input; + } +}; +} // namespace ops + +/** + * Cast the elements of the given vector `input` to a different type `R`. + * + * This function casts each element of the input vector to a different data type specified by + * template parameter `R`. + * + * Optionally, the rounding mode can be set using the `Mode` template parameter. The default mode is `ANY`, which + * uses the fastest rounding mode available. 
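+ * The supported modes are listed in `RoundingMode` above: `ANY`, `DOWN`, `UP`, `NEAREST`, and `TOWARD_ZERO`.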
+ * + * Example + * ======= + * ``` + * vec input {1.2f, 2.7f, 3.5f, 4.9f}; + * auto casted = cast(input); // [1, 2, 3, 4] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> cast(const V& input) { + using F = ops::cast, R, Mode>; + return map(F {}, input); +} + namespace detail { -template +template struct broadcast_extent_helper; +template +struct broadcast_extent_helper { + using type = E; +}; + template struct broadcast_extent_helper, extent> { using type = extent; @@ -30,13 +86,17 @@ struct broadcast_extent_helper, extent<1>> { using type = extent<1>; }; +template +struct broadcast_extent_helper: + broadcast_extent_helper::type, C, Rest...> {}; + } // namespace detail -template -using broadcast_extent = typename detail::broadcast_extent_helper::type; +template +using broadcast_extent = typename detail::broadcast_extent_helper::type; -template -using broadcast_vector_extent_type = broadcast_extent, vector_extent_type>; +template +using broadcast_vector_extent_type = broadcast_extent...>; template static constexpr bool is_broadcastable = is_same, To>; @@ -79,6 +139,16 @@ struct broadcast_impl, extent<1>> { /** * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` * is 1 or `N`. + * + * Example + * ======= + * ``` + * vec a = {1.0f}; + * vec x = broadcast<5>(a); // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f] + * + * vec b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + * vec y = broadcast<5>(b); // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f] + * ``` */ template KERNEL_FLOAT_INLINE vector, extent> @@ -98,57 +168,6 @@ broadcast_like(const V& input, const R& other) { return broadcast(input, vector_extent_type {}); } -/** - * Returns a vector containing `N` copies of `value`. - */ -template -KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { - vector_storage input = {value}; - return detail::broadcast_impl, extent>::call(input); -} - -/** - * Returns a vector containing `N` copies of `T(0)`. - */ -template -KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { - vector_storage input = {T {}}; - return detail::broadcast_impl, extent>::call(input); -} - -/** - * Returns a vector containing `N` copies of `T(1)`. - */ -template -KERNEL_FLOAT_INLINE vector> ones(extent = {}) { - vector_storage input = {T {1}}; - return detail::broadcast_impl, extent>::call(input); -} - -/** - * Returns a vector filled with `value` having the same type and size as input vector `V`. - */ -template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { - return fill(value, E {}); -} - -/** - * Returns a vector filled with zeros having the same type and size as input vector `V`. - */ -template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { - return zeros(E {}); -} - -/** - * Returns a vector filled with ones having the same type and size as input vector `V`. - */ -template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector ones_like(const V& = {}) { - return ones(E {}); -} - namespace detail { template struct convert_helper { @@ -196,13 +215,114 @@ KERNEL_FLOAT_INLINE vector_storage convert_storage(const V& input, extent< /** * Cast the values of the given input vector to type `R` and then broadcast the result to the given size `N`. * - * This function is essentially a `cast` followed by a `broadcast`. 
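+ * Unlike `convert_storage`, which returns a plain `vector_storage`, this function wraps the result in a `vector`.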
+ * Example + * ======= + * ``` + * int a = 5; + * vec x = convert(a); // returns [5.0f, 5.0f, 5.0f] + * + * float b = 5.0f; + * vec x = convert(b); // returns [5.0f, 5.0f, 5.0f] + * + * vec c = {1, 2, 3}; + * vec x = convert(c); // returns [1.0f, 2.0f, 3.0f] + * ``` */ template KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_size = {}) { return convert_storage(input); } +/** + * Returns a vector containing `N` copies of `value`. + * + * Example + * ======= + * ``` + * vec a = fill<3>(42); // return [42, 42, 42] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { + vector_storage input = {value}; + return detail::broadcast_impl, extent>::call(input); +} + +/** + * Returns a vector containing `N` copies of `T(0)`. + * + * Example + * ======= + * ``` + * vec a = zeros(); // return [0, 0, 0] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { + vector_storage input = {T {}}; + return detail::broadcast_impl, extent>::call(input); +} + +/** + * Returns a vector containing `N` copies of `T(1)`. + * + * Example + * ======= + * ``` + * vec a = ones(); // return [1, 1, 1] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> ones(extent = {}) { + vector_storage input = {T {1}}; + return detail::broadcast_impl, extent>::call(input); +} + +/** + * Returns a vector filled with `value` having the same type and size as input vector `V`. + * + * Example + * ======= + * ``` + * vec a = {1, 2, 3}; + * vec b = fill_like(a, 42); // return [42, 42, 42] + * ``` + */ +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { + return fill(value, E {}); +} + +/** + * Returns a vector filled with zeros having the same type and size as input vector `V`. + * + * Example + * ======= + * ``` + * vec a = {1, 2, 3}; + * vec b = zeros_like(a); // return [0, 0, 0] + * ``` + */ +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { + return zeros(E {}); +} + +/** + * Returns a vector filled with ones having the same type and size as input vector `V`. + * + * Example + * ======= + * ``` + * vec a = {1, 2, 3}; + * vec b = ones_like(a); // return [1, 1, 1] + * ``` + */ +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector ones_like(const V& = {}) { + return ones(E {}); +} + } // namespace kernel_float #endif diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 3fb0ffd..5531090 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -8,6 +8,8 @@ namespace kernel_float { /** * Apply the function fun for each element from input. 
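+ * This is typically used for side effects, as in the example below.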
* + * Example + * ======= * ``` * for_each(range(), [&](auto i) { * printf("element: %d\n", i); @@ -42,11 +44,13 @@ struct range_helper { } // namespace detail /** - * Generate vector consisting of the numbers 0...N-1 of type T + * Generate vector consisting of the numbers `0...N-1` of type `T` * + * Example + * ======= * ``` * // Returns [0, 1, 2] - * vector vec = range(); + * vec vec = range(); * ``` */ template @@ -55,12 +59,13 @@ KERNEL_FLOAT_INLINE vector> range() { } /** - * Takes a vector of size ``N`` and element type ``T`` and returns a new vector consisting of the numbers ``0...N-1`` - * of type ``T`` + * Takes a vector `vec` and returns a new vector consisting of the numbers ``0...N-1`` of type ``T`` * + * Example + * ======= * ``` - * // Returns [0.0f, 1.0f, 2.0f] - * vector vec = range(); + * auto input = vec(5.0f, 10.0f, -1.0f); + * auto indices = range_like(input); // returns [0.0f, 1.0f, 2.0f] * ``` */ template @@ -69,15 +74,27 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { } /** - * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1`` of type ``size_t`` + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1``. The data type used + * for the indices is given by the first template argument, which is `size_t` by default. This function is useful when + * needing to iterate over the indices of a vector. * + * Example + * ======= * ``` - * // Returns [0, 1, 2] - * vector vec = enumerate(float3(6, 4, 2)); + * // Returns [0, 1, 2] of type size_t + * vec a = each_index(float3(6, 4, 2)); + * + * // Returns [0, 1, 2] of type int. + * vec b = each_index(float3(6, 4, 2)); + * + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * for (auto index: each_index(input)) { + * printf("%d] %f\n", index, input[index]); + * } * ``` */ template -KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { +KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { return detail::range_helper>::call(); } @@ -119,6 +136,16 @@ static constexpr size_t flatten_size = detail::flatten_helper::size; template using flatten_type = vector, extent>>; +/** + * Flattens the elements of this vector. For example, this turns a `vec, 3>` into a `vec`. 
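+ * In general, a vector of vectors of `T` becomes a single flat vector of `T` whose length is the product of
+ * the two sizes.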
+ * + * Example + * ======= + * ``` + * vec input = {{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}; + * vec result = flatten(input); // returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f] + * ``` + */ template KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { vector_storage, flatten_size> output; diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index 3602e15..1c136a6 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -3,7 +3,7 @@ /* #include "binops.h" -#include "broadcast.h" +#include "conversion.h" #include "iterate.h" namespace kernel_float { diff --git a/include/kernel_float/prelude.h b/include/kernel_float/prelude.h index feaa7f4..9f9c1cf 100644 --- a/include/kernel_float/prelude.h +++ b/include/kernel_float/prelude.h @@ -84,6 +84,14 @@ KERNEL_FLOAT_INLINE constexpr kconstant kconst(T value) { return value; } +static constexpr kconstant operator""_c(long double v) { + return static_cast(v); +} + +static constexpr kconstant operator""_c(unsigned long long int v) { + return static_cast(v); +} + } // namespace prelude } // namespace kernel_float diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h index fc93a94..7144c4f 100644 --- a/include/kernel_float/triops.h +++ b/include/kernel_float/triops.h @@ -1,7 +1,7 @@ #ifndef KERNEL_FLOAT_TRIOPS_H #define KERNEL_FLOAT_TRIOPS_H -#include "broadcast.h" +#include "conversion.h" #include "unops.h" namespace kernel_float { @@ -36,7 +36,7 @@ template< typename L, typename R, typename T = promoted_vector_value_type, - typename E = broadcast_extent, broadcast_vector_extent_type>> + typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { using F = ops::conditional; @@ -64,7 +64,7 @@ template< typename C, typename L, typename T = vector_value_type, - typename E = broadcast_extent, vector_extent_type>> + typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { vector> false_values = T {}; return where(cond, true_values, false_values); @@ -114,7 +114,7 @@ template< typename B, typename C, typename T = promoted_vector_value_type, - typename E = broadcast_extent, broadcast_vector_extent_type>> + typename E = broadcast_vector_extent> KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { using F = ops::fma; diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h index eb1d305..18619d6 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -25,6 +25,16 @@ struct apply_impl { template using map_type = vector>, vector_extent_type>; +/** + * Apply the function `F` to each element from the vector `input` and return the results as a new vector. 
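+ * The result has the same extent as `input`, and its element type is the result type of `F`.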
+ * + * Examples + * ======== + * ``` + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] + * ``` + */ template KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { using Input = vector_value_type; @@ -34,26 +44,26 @@ KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { into_vector_storage(input)); } -#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - namespace ops { \ - template \ - struct NAME { \ - KERNEL_FLOAT_INLINE T operator()(T input) { \ - return T(EXPR); \ - } \ - }; \ - } \ - template \ - KERNEL_FLOAT_INLINE into_vector_type NAME(const V& input) { \ - using F = ops::NAME>; \ - return map(F {}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + namespace ops { \ + template \ + struct NAME { \ + KERNEL_FLOAT_INLINE T operator()(T input) { \ + return T(EXPR); \ + } \ + }; \ + } \ + template \ + KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ + using F = ops::NAME>; \ + return map(F {}, input); \ } -#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ - KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - template \ - KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ - return NAME(vec); \ +#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ + KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + template \ + KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ + return NAME(vec); \ } KERNEL_FLOAT_DEFINE_UNARY_OP(negate, -, -input) @@ -132,39 +142,6 @@ KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_cos, cos, __cosf) KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_sin, sin, __sinf) KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_tan, tan, __tanf) -enum struct RoundingMode { ANY, DOWN, UP, NEAREST, TOWARD_ZERO }; - -namespace ops { -template -struct cast; - -template -struct cast { - KERNEL_FLOAT_INLINE R operator()(T input) noexcept { - return R(input); - } -}; - -template -struct cast { - KERNEL_FLOAT_INLINE T operator()(T input) noexcept { - return input; - } -}; - -template -struct cast { - KERNEL_FLOAT_INLINE T operator()(T input) noexcept { - return input; - } -}; -} // namespace ops - -template -KERNEL_FLOAT_INLINE vector> cast(const V& input) { - using F = ops::cast, R, Mode>; - return map(F {}, input); -} } // namespace kernel_float #endif //KERNEL_FLOAT_UNOPS_H diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 8e8ce1c..8b045b2 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -2,7 +2,7 @@ #define KERNEL_FLOAT_VECTOR_H #include "base.h" -#include "broadcast.h" +#include "conversion.h" #include "iterate.h" #include "macros.h" #include "reduce.h" @@ -10,8 +10,18 @@ namespace kernel_float { +/** + * Container that stores ``N`` values of type ``T``. + * + * It is not recommended to use this class directly, but instead, use the type `vec` which is an alias for + * `vector, vector_storage>`. + * + * @tparam T The type of the values stored within the vector. + * @tparam E The size of this vector. Should be of type `extent`. + * @tparam S The object's storage class. 
Should be the type `vector_storage` + */ template -struct vector: S { +struct vector: public S { using value_type = T; using extent_type = E; using storage_type = S; @@ -30,11 +40,12 @@ struct vector: S { // For all other arguments, we convert it using `convert_storage` according to broadcast rules template, T>, int> = 0> - KERNEL_FLOAT_INLINE vector(U&& input) : storage_type(convert_storage(input)) {} + KERNEL_FLOAT_INLINE vector(U&& input) : + storage_type(convert_storage(input, extent_type {})) {} template, T>, int> = 0> KERNEL_FLOAT_INLINE explicit vector(U&& input) : - storage_type(convert_storage(input)) {} + storage_type(convert_storage(input, extent_type {})) {} // List of `N` (where N >= 2), simply pass forward to the storage template< @@ -45,6 +56,9 @@ struct vector: S { KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... rest) : storage_type {a, b, rest...} {} + /** + * Returns the number of elements in this vector. + */ KERNEL_FLOAT_INLINE static constexpr size_t size() { return E::size; @@ -60,165 +74,196 @@ struct vector: S { return *this; } + /** + * Returns a pointer to the underlying storage data. + */ + KERNEL_FLOAT_INLINE + T* data() { + return storage().data(); + } + + /** + * Returns a pointer to the underlying storage data. + */ + KERNEL_FLOAT_INLINE + const T* data() const { + return storage().data(); + } + KERNEL_FLOAT_INLINE const T* cdata() const { return this->data(); } + /** + * Returns a reference to the item at index `i`. + */ KERNEL_FLOAT_INLINE - T* begin() { - return this->data(); + T& at(size_t i) { + return *(this->data() + i); } + /** + * Returns a constant reference to the item at index `i`. + */ KERNEL_FLOAT_INLINE - const T* begin() const { - return this->data(); + const T& at(size_t i) const { + return *(this->data() + i); } + /** + * Returns a reference to the item at index `i`. + */ KERNEL_FLOAT_INLINE - const T* cbegin() const { - return this->data(); + T& operator[](size_t i) { + return at(i); } + /** + * Returns a constant reference to the item at index `i`. + */ KERNEL_FLOAT_INLINE - T* end() { - return this->data() + size(); + const T& operator[](size_t i) const { + return at(i); } KERNEL_FLOAT_INLINE - const T* end() const { - return this->data() + size(); + T& operator()(size_t i) { + return at(i); } KERNEL_FLOAT_INLINE - const T* cend() const { - return this->data() + size(); + const T& operator()(size_t i) const { + return at(i); } + /** + * Returns a pointer to the first element. + */ KERNEL_FLOAT_INLINE - T& at(size_t x) { - return *(this->data() + x); + T* begin() { + return this->data(); } + /** + * Returns a pointer to the first element. + */ KERNEL_FLOAT_INLINE - const T& at(size_t x) const { - return *(this->data() + x); + const T* begin() const { + return this->data(); } + /** + * Returns a pointer to the first element. + */ KERNEL_FLOAT_INLINE - T get(size_t x) const { - return at(x); + const T* cbegin() const { + return this->data(); } + /** + * Returns a pointer to one past the last element. + */ KERNEL_FLOAT_INLINE - void set(size_t x, T value) { - at(x) = std::move(value); + T* end() { + return this->data() + size(); } + /** + * Returns a pointer to one past the last element. + */ KERNEL_FLOAT_INLINE - T& operator[](size_t x) { - return at(x); + const T* end() const { + return this->data() + size(); } + /** + * Returns a pointer to one past the last element. 
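+ * Together with `cbegin`, this allows range-style iteration over a `const` vector.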
+ */ KERNEL_FLOAT_INLINE - const T& operator[](size_t x) const { - return at(x); + const T* cend() const { + return this->data() + size(); } + /** + * Copy the element at index `i`. + */ KERNEL_FLOAT_INLINE - T& operator()(size_t x) { + T get(size_t x) const { return at(x); } + /** + * Set the element at index `i`. + */ KERNEL_FLOAT_INLINE - const T& operator()(size_t x) const { - return at(x); + void set(size_t x, T value) { + at(x) = std::move(value); } + /** + * Cast the elements of this vector to type `R` and returns a new vector. + */ template KERNEL_FLOAT_INLINE vector cast() const { return kernel_float::cast(*this); } - template - KERNEL_FLOAT_INLINE vector> broadcast(extent new_size = {}) const { + /** + * Broadcast this vector into a new size `(Ns...)`. + */ + template + KERNEL_FLOAT_INLINE vector> broadcast(extent new_size = {}) const { return kernel_float::broadcast(*this, new_size); } + /** + * Apply the given function `F` to each element of this vector and returns a new vector with the results. + */ template KERNEL_FLOAT_INLINE vector, E> map(F fun) const { return kernel_float::map(fun, *this); } + /** + * Reduce the elements of the given vector input into a single value using the function `F`. + * + * This function should be a binary function that takes two elements and returns one element. The order in which + * the elements are reduced is not specified and depends on the reduction function and the vector type. + */ template KERNEL_FLOAT_INLINE T reduce(F fun) const { return kernel_float::reduce(fun, *this); } + /** + * Flattens the elements of this vector. For example, this turns a `vec, 3>` into a `vec`. + */ KERNEL_FLOAT_INLINE flatten_type flatten() const { return kernel_float::flatten(*this); } + /** + * Apply the given function `F` to each element of this vector. + */ template KERNEL_FLOAT_INLINE void for_each(F fun) const { return kernel_float::for_each(*this, std::move(fun)); } }; -#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ - template<> \ - struct into_vector_traits<::T2> { \ - using value_type = T; \ - using extent_type = extent<2>; \ - \ - KERNEL_FLOAT_INLINE \ - static vector_storage call(::T2 v) { \ - return {v.x, v.y}; \ - } \ - }; \ - \ - template<> \ - struct into_vector_traits<::T3> { \ - using value_type = T; \ - using extent_type = extent<3>; \ - \ - KERNEL_FLOAT_INLINE \ - static vector_storage call(::T3 v) { \ - return {v.x, v.y, v.z}; \ - } \ - }; \ - \ - template<> \ - struct into_vector_traits<::T4> { \ - using value_type = T; \ - using extent_type = extent<4>; \ - \ - KERNEL_FLOAT_INLINE \ - static vector_storage call(::T4 v) { \ - return {v.x, v.y, v.z, v.w}; \ - } \ - }; - +/** + * Convert the given `input` into a vector. This function can perform one of the following actions: + * + * - For vectors `vec`, it simply returns the original vector. + * - For primitive types `T` (e.g., `int`, `float`, `double`), it returns a `vec`. + * - For array-like types (e.g., `int2`, `std::array`, `T[N]`), it returns `vec`. 
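As a sketch of that last case (the restored template brackets and the `vec` alias are assumptions about the surrounding definitions):

```
// A CUDA built-in like int2 is adopted via its into_vector_traits
// specialization, which copies .x/.y into the vector storage.
int2 raw = {3, 4};
vec<int, 2> v = into_vector(raw);  // [3, 4]
```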
+ */ template KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { return into_vector_traits::call(std::forward(input)); } -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) - template using scalar = vector>; @@ -236,6 +281,19 @@ template using vec7 = vec; template using vec8 = vec; // clang-format on +/** + * Create a vector from a variable number of input values. + * + * The resulting vector type is determined by promoting the types of the input values into a common type. + * The number of input values determines the dimension of the resulting vector. + * + * Example + * ======= + * ``` + * auto v1 = make_vec(1.0f, 2.0f, 3.0f); // Creates a vec [1.0f, 2.0f, 3.0f] + * auto v2 = make_vec(1, 2, 3, 4); // Creates a vec [1, 2, 3, 4] + * ``` + */ template KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... args) { using T = promote_t; diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index e6f915b..07f068e 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-08-14 15:47:42.801950 -// git hash: d13ee37ff80691e77dab5f71cf27600dbdad6f2f +// date: 2023-08-15 12:29:08.022922 +// git hash: 9b71242d6c1cd9c4f8f6309824fe0a774bf9719d //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -472,8 +472,11 @@ static constexpr size_t compute_max_alignment(size_t total_size, size_t min_alig template using vector_storage = aligned_array; +template +struct extent; + template -struct extent { +struct extent { static constexpr size_t value = N; static constexpr size_t size = N; }; @@ -512,8 +515,54 @@ struct into_vector_traits> { } }; -template -struct vector_traits; +#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ + template<> \ + struct into_vector_traits<::T2> { \ + using value_type = T; \ + using extent_type = extent<2>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T2 v) { \ + return {v.x, v.y}; \ + } \ + }; \ + \ + template<> \ + struct into_vector_traits<::T3> { \ + using value_type = T; \ + using extent_type = extent<3>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T3 v) { \ + return {v.x, v.y, v.z}; \ + } \ + }; \ + \ + template<> \ + struct into_vector_traits<::T4> { \ + using value_type = T; \ + using extent_type = extent<4>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T4 v) { \ + return {v.x, v.y, v.z, v.w}; \ + } \ + }; + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) + +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) +KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) template> struct vector; @@ -529,6 +578,9 @@ struct into_vector_traits> { } }; +template +struct vector_traits; + template struct vector_traits> { using value_type = T; @@ -833,6 +885,8 @@ namespace kernel_float { /** * Apply the function fun for each element from input. 
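A readable sketch of what the definition below amounts to (template parameter spellings are assumptions, the body mirrors the implementation):

```
template<typename V, typename F>
KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) {
    auto storage = into_vector_storage(input);

    // Plain unrolled loop: fun is invoked once per element, in order.
#pragma unroll
    for (size_t i = 0; i < vector_extent<V>; i++) {
        fun(storage.data()[i]);
    }
}
```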
* + * Example + * ======= * ``` * for_each(range(), [&](auto i) { * printf("element: %d\n", i); @@ -867,11 +921,13 @@ struct range_helper { } // namespace detail /** - * Generate vector consisting of the numbers 0...N-1 of type T + * Generate vector consisting of the numbers `0...N-1` of type `T` * + * Example + * ======= * ``` * // Returns [0, 1, 2] - * vector vec = range(); + * vec vec = range(); * ``` */ template @@ -880,12 +936,13 @@ KERNEL_FLOAT_INLINE vector> range() { } /** - * Takes a vector of size ``N`` and element type ``T`` and returns a new vector consisting of the numbers ``0...N-1`` - * of type ``T`` + * Takes a vector `vec` and returns a new vector consisting of the numbers ``0...N-1`` of type ``T`` * + * Example + * ======= * ``` - * // Returns [0.0f, 1.0f, 2.0f] - * vector vec = range(); + * auto input = vec(5.0f, 10.0f, -1.0f); + * auto indices = range_like(input); // returns [0.0f, 1.0f, 2.0f] * ``` */ template @@ -894,15 +951,27 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { } /** - * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1`` of type ``size_t`` + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1``. The data type used + * for the indices is given by the first template argument, which is `size_t` by default. This function is useful when + * needing to iterate over the indices of a vector. * + * Example + * ======= * ``` - * // Returns [0, 1, 2] - * vector vec = enumerate(float3(6, 4, 2)); + * // Returns [0, 1, 2] of type size_t + * vec a = each_index(float3(6, 4, 2)); + * + * // Returns [0, 1, 2] of type int. + * vec b = each_index(float3(6, 4, 2)); + * + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * for (auto index: each_index(input)) { + * printf("%d] %f\n", index, input[index]); + * } * ``` */ template -KERNEL_FLOAT_INLINE vector> enumerate(const V& = {}) { +KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { return detail::range_helper>::call(); } @@ -944,6 +1013,16 @@ static constexpr size_t flatten_size = detail::flatten_helper::size; template using flatten_type = vector, extent>>; +/** + * Flattens the elements of this vector. For example, this turns a `vec, 3>` into a `vec`. + * + * Example + * ======= + * ``` + * vec input = {{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}; + * vec result = flatten(input); // returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f] + * ``` + */ template KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { vector_storage, flatten_size> output; @@ -980,6 +1059,16 @@ struct apply_impl { template using map_type = vector>, vector_extent_type>; +/** + * Apply the function `F` to each element from the vector `input` and return the results as a new vector. 
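Besides lambdas, any callable type works as `fun`; a small sketch with a hypothetical user-defined functor (name invented for illustration):

```
// Hypothetical functor; the operator() result type determines the
// element type of the vector returned by map().
struct halve {
    KERNEL_FLOAT_INLINE float operator()(float x) const {
        return 0.5f * x;
    }
};

vec<float, 4> input = {1.0f, 2.0f, 3.0f, 4.0f};
vec<float, 4> halved = map(halve {}, input);  // [0.5f, 1.0f, 1.5f, 2.0f]
```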
+ * + * Examples + * ======== + * ``` + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] + * ``` + */ template KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { using Input = vector_value_type; @@ -989,26 +1078,26 @@ KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { into_vector_storage(input)); } -#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - namespace ops { \ - template \ - struct NAME { \ - KERNEL_FLOAT_INLINE T operator()(T input) { \ - return T(EXPR); \ - } \ - }; \ - } \ - template \ - KERNEL_FLOAT_INLINE into_vector_type NAME(const V& input) { \ - using F = ops::NAME>; \ - return map(F {}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + namespace ops { \ + template \ + struct NAME { \ + KERNEL_FLOAT_INLINE T operator()(T input) { \ + return T(EXPR); \ + } \ + }; \ + } \ + template \ + KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ + using F = ops::NAME>; \ + return map(F {}, input); \ } -#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ - KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - template \ - KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ - return NAME(vec); \ +#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ + KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + template \ + KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ + return NAME(vec); \ } KERNEL_FLOAT_DEFINE_UNARY_OP(negate, -, -input) @@ -1087,6 +1176,17 @@ KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_cos, cos, __cosf) KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_sin, sin, __sinf) KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_tan, tan, __tanf) +} // namespace kernel_float + +#endif //KERNEL_FLOAT_UNOPS_H +#ifndef KERNEL_FLOAT_CAST_H +#define KERNEL_FLOAT_CAST_H + + + + +namespace kernel_float { + enum struct RoundingMode { ANY, DOWN, UP, NEAREST, TOWARD_ZERO }; namespace ops { @@ -1115,26 +1215,38 @@ struct cast { }; } // namespace ops +/** + * Cast the elements of the given vector `input` to a different type `R`. + * + * This function casts each element of the input vector to a different data type specified by + * template parameter `R`. + * + * Optionally, the rounding mode can be set using the `Mode` template parameter. The default mode is `ANY`, which + * uses the fastest rounding mode available. 
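A sketch of selecting a mode explicitly, assuming the template signature `cast<R, Mode>(input)` implied by the `ops::cast` definitions above:

```
vec<float, 4> x = {0.5f, 1.5f, -2.5f, 3.9f};

// Default mode ANY lets the library pick the cheapest conversion.
vec<int, 4> a = cast<int>(x);

// An explicit mode is honored only by specializations for specific
// type pairs; the generic ops::cast fallback simply performs R(input).
vec<int, 4> b = cast<int, RoundingMode::DOWN>(x);
```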
+ * + * Example + * ======= + * ``` + * vec input {1.2f, 2.7f, 3.5f, 4.9f}; + * auto casted = cast(input); // [1, 2, 3, 4] + * ``` + */ template KERNEL_FLOAT_INLINE vector> cast(const V& input) { using F = ops::cast, R, Mode>; return map(F {}, input); } -} // namespace kernel_float - -#endif //KERNEL_FLOAT_UNOPS_H -#ifndef KERNEL_FLOAT_CAST_H -#define KERNEL_FLOAT_CAST_H - - - -namespace kernel_float { namespace detail { -template +template struct broadcast_extent_helper; +template +struct broadcast_extent_helper { + using type = E; +}; + template struct broadcast_extent_helper, extent> { using type = extent; @@ -1155,13 +1267,17 @@ struct broadcast_extent_helper, extent<1>> { using type = extent<1>; }; +template +struct broadcast_extent_helper: + broadcast_extent_helper::type, C, Rest...> {}; + } // namespace detail -template -using broadcast_extent = typename detail::broadcast_extent_helper::type; +template +using broadcast_extent = typename detail::broadcast_extent_helper::type; -template -using broadcast_vector_extent_type = broadcast_extent, vector_extent_type>; +template +using broadcast_vector_extent_type = broadcast_extent...>; template static constexpr bool is_broadcastable = is_same, To>; @@ -1204,6 +1320,16 @@ struct broadcast_impl, extent<1>> { /** * Takes the given vector `input` and extends its size to a length of `N`. This is only valid if the size of `input` * is 1 or `N`. + * + * Example + * ======= + * ``` + * vec a = {1.0f}; + * vec x = broadcast<5>(a); // Returns [1.0f, 1.0f, 1.0f, 1.0f, 1.0f] + * + * vec b = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + * vec y = broadcast<5>(b); // Returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f] + * ``` */ template KERNEL_FLOAT_INLINE vector, extent> @@ -1223,57 +1349,6 @@ broadcast_like(const V& input, const R& other) { return broadcast(input, vector_extent_type {}); } -/** - * Returns a vector containing `N` copies of `value`. - */ -template -KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { - vector_storage input = {value}; - return detail::broadcast_impl, extent>::call(input); -} - -/** - * Returns a vector containing `N` copies of `T(0)`. - */ -template -KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { - vector_storage input = {T {}}; - return detail::broadcast_impl, extent>::call(input); -} - -/** - * Returns a vector containing `N` copies of `T(1)`. - */ -template -KERNEL_FLOAT_INLINE vector> ones(extent = {}) { - vector_storage input = {T {1}}; - return detail::broadcast_impl, extent>::call(input); -} - -/** - * Returns a vector filled with `value` having the same type and size as input vector `V`. - */ -template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { - return fill(value, E {}); -} - -/** - * Returns a vector filled with zeros having the same type and size as input vector `V`. - */ -template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { - return zeros(E {}); -} - -/** - * Returns a vector filled with ones having the same type and size as input vector `V`. - */ -template, typename E = vector_extent_type> -KERNEL_FLOAT_INLINE vector ones_like(const V& = {}) { - return ones(E {}); -} - namespace detail { template struct convert_helper { @@ -1321,13 +1396,114 @@ KERNEL_FLOAT_INLINE vector_storage convert_storage(const V& input, extent< /** * Cast the values of the given input vector to type `R` and then broadcast the result to the given size `N`. * - * This function is essentially a `cast` followed by a `broadcast`. 
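In other words, a rough sketch of the equivalence (restored template brackets are assumptions):

```
// These two produce the same result: cast to the target element type,
// then broadcast the result to the requested extent.
vec<float, 3> a = convert<float, 3>(5);
vec<float, 3> b = broadcast<3>(cast<float>(5));
```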
+ * Example + * ======= + * ``` + * int a = 5; + * vec x = convert(a); // returns [5.0f, 5.0f, 5.0f] + * + * float b = 5.0f; + * vec x = convert(b); // returns [5.0f, 5.0f, 5.0f] + * + * vec c = {1, 2, 3}; + * vec x = convert(c); // returns [1.0f, 2.0f, 3.0f] + * ``` */ template KERNEL_FLOAT_INLINE vector> convert(const V& input, extent new_size = {}) { return convert_storage(input); } +/** + * Returns a vector containing `N` copies of `value`. + * + * Example + * ======= + * ``` + * vec a = fill<3>(42); // return [42, 42, 42] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> fill(T value = {}, extent = {}) { + vector_storage input = {value}; + return detail::broadcast_impl, extent>::call(input); +} + +/** + * Returns a vector containing `N` copies of `T(0)`. + * + * Example + * ======= + * ``` + * vec a = zeros(); // return [0, 0, 0] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> zeros(extent = {}) { + vector_storage input = {T {}}; + return detail::broadcast_impl, extent>::call(input); +} + +/** + * Returns a vector containing `N` copies of `T(1)`. + * + * Example + * ======= + * ``` + * vec a = ones(); // return [1, 1, 1] + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> ones(extent = {}) { + vector_storage input = {T {1}}; + return detail::broadcast_impl, extent>::call(input); +} + +/** + * Returns a vector filled with `value` having the same type and size as input vector `V`. + * + * Example + * ======= + * ``` + * vec a = {1, 2, 3}; + * vec b = fill_like(a, 42); // return [42, 42, 42] + * ``` + */ +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector fill_like(const V&, T value) { + return fill(value, E {}); +} + +/** + * Returns a vector filled with zeros having the same type and size as input vector `V`. + * + * Example + * ======= + * ``` + * vec a = {1, 2, 3}; + * vec b = zeros_like(a); // return [0, 0, 0] + * ``` + */ +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector zeros_like(const V& = {}) { + return zeros(E {}); +} + +/** + * Returns a vector filled with ones having the same type and size as input vector `V`. + * + * Example + * ======= + * ``` + * vec a = {1, 2, 3}; + * vec b = ones_like(a); // return [1, 1, 1] + * ``` + */ +template, typename E = vector_extent_type> +KERNEL_FLOAT_INLINE vector ones_like(const V& = {}) { + return ones(E {}); +} + } // namespace kernel_float #endif @@ -1347,6 +1523,14 @@ using zip_type = vector< /** * Combines the elements from the two inputs (`left` and `right`) element-wise, applying a provided binary * function (`fun`) to each pair of corresponding elements. + * + * Example + * ======= + * ``` + * vec make_negative = {true, false, true}; + * vec input = {1, 2, 3}; + * vec output = zip([](bool b, int n){ return b ? -n : +n; }, make_negative, input); // returns [-1, 2, -3] + * ``` */ template KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) { @@ -1367,8 +1551,17 @@ using zip_common_type = vector< broadcast_vector_extent_type>; /** - * Similar to `zip`, except `zip_common` promotes the element types of the inputs to a common type before applying the - * binary function. + * Combines the elements from the two inputs (`left` and `right`) element-wise, applying a provided binary + * function (`fun`) to each pair of corresponding elements. The elements are promoted to a common type before applying + * the binary function. 
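Since both operands pass through the broadcast rules, one side may be a plain scalar; a sketch:

```
vec<float, 3> a = {1.0f, 2.0f, 3.0f};

// The scalar is treated as extent 1 and broadcast to extent 3
// before the binary function is applied.
vec<float, 3> b = zip_common([](float x, float y) { return x * y; }, a, 2.0f);
// b == [2.0f, 4.0f, 6.0f]
```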
+ * + * Example + * ======= + * ``` + * vec a = {1.0f, 2.0f, 3.0f}; + * vec b = {4, 5, 6}; + * vec c = zip_common([](float x, float y){ return x + y; }, a, b); // returns [5.0f, 7.0f, 9.0f] + * ``` */ template KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, const R& right) { @@ -1604,6 +1797,7 @@ KERNEL_FLOAT_INLINE vector> cross(const L& left, const R& right) { + namespace kernel_float { template @@ -1661,6 +1855,29 @@ struct cast, R, m> { }; } // namespace ops +#define KERNEL_FLOAT_CONSTANT_DEFINE_OP(OP) \ + template \ + R operator OP(const constant& left, const R& right) { \ + using T = vector_value_type; \ + return operator OP(T(left.get()), right); \ + } \ + \ + template \ + L operator OP(const L& left, const constant& right) { \ + using T = vector_value_type; \ + return operator OP(left, T(right.get())); \ + } \ + \ + template> \ + constant operator OP(const constant& left, const constant& right) { \ + return constant(operator OP(T(left.get()), T(right.get()))); \ + } + +KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) + } // namespace kernel_float #endif @@ -2213,7 +2430,7 @@ template< typename L, typename R, typename T = promoted_vector_value_type, - typename E = broadcast_extent, broadcast_vector_extent_type>> + typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { using F = ops::conditional; @@ -2241,7 +2458,7 @@ template< typename C, typename L, typename T = vector_value_type, - typename E = broadcast_extent, vector_extent_type>> + typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { vector> false_values = T {}; return where(cond, true_values, false_values); @@ -2291,7 +2508,7 @@ template< typename B, typename C, typename T = promoted_vector_value_type, - typename E = broadcast_extent, broadcast_vector_extent_type>> + typename E = broadcast_vector_extent> KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { using F = ops::fma; @@ -2320,8 +2537,18 @@ KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { namespace kernel_float { +/** + * Container that stores ``N`` values of type ``T``. + * + * It is not recommended to use this class directly, but instead, use the type `vec` which is an alias for + * `vector, vector_storage>`. + * + * @tparam T The type of the values stored within the vector. + * @tparam E The size of this vector. Should be of type `extent`. + * @tparam S The object's storage class. Should be the type `vector_storage` + */ template -struct vector: S { +struct vector: public S { using value_type = T; using extent_type = E; using storage_type = S; @@ -2340,11 +2567,12 @@ struct vector: S { // For all other arguments, we convert it using `convert_storage` according to broadcast rules template, T>, int> = 0> - KERNEL_FLOAT_INLINE vector(U&& input) : storage_type(convert_storage(input)) {} + KERNEL_FLOAT_INLINE vector(U&& input) : + storage_type(convert_storage(input, extent_type {})) {} template, T>, int> = 0> KERNEL_FLOAT_INLINE explicit vector(U&& input) : - storage_type(convert_storage(input)) {} + storage_type(convert_storage(input, extent_type {})) {} // List of `N` (where N >= 2), simply pass forward to the storage template< @@ -2355,6 +2583,9 @@ struct vector: S { KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... 
rest) : storage_type {a, b, rest...} {} + /** + * Returns the number of elements in this vector. + */ KERNEL_FLOAT_INLINE static constexpr size_t size() { return E::size; @@ -2370,165 +2601,196 @@ struct vector: S { return *this; } + /** + * Returns a pointer to the underlying storage data. + */ + KERNEL_FLOAT_INLINE + T* data() { + return storage().data(); + } + + /** + * Returns a pointer to the underlying storage data. + */ + KERNEL_FLOAT_INLINE + const T* data() const { + return storage().data(); + } + KERNEL_FLOAT_INLINE const T* cdata() const { return this->data(); } + /** + * Returns a reference to the item at index `i`. + */ KERNEL_FLOAT_INLINE - T* begin() { - return this->data(); + T& at(size_t i) { + return *(this->data() + i); } + /** + * Returns a constant reference to the item at index `i`. + */ KERNEL_FLOAT_INLINE - const T* begin() const { - return this->data(); + const T& at(size_t i) const { + return *(this->data() + i); } + /** + * Returns a reference to the item at index `i`. + */ KERNEL_FLOAT_INLINE - const T* cbegin() const { - return this->data(); + T& operator[](size_t i) { + return at(i); } + /** + * Returns a constant reference to the item at index `i`. + */ KERNEL_FLOAT_INLINE - T* end() { - return this->data() + size(); + const T& operator[](size_t i) const { + return at(i); } KERNEL_FLOAT_INLINE - const T* end() const { - return this->data() + size(); + T& operator()(size_t i) { + return at(i); } KERNEL_FLOAT_INLINE - const T* cend() const { - return this->data() + size(); + const T& operator()(size_t i) const { + return at(i); } + /** + * Returns a pointer to the first element. + */ KERNEL_FLOAT_INLINE - T& at(size_t x) { - return *(this->data() + x); + T* begin() { + return this->data(); } + /** + * Returns a pointer to the first element. + */ KERNEL_FLOAT_INLINE - const T& at(size_t x) const { - return *(this->data() + x); + const T* begin() const { + return this->data(); } + /** + * Returns a pointer to the first element. + */ KERNEL_FLOAT_INLINE - T get(size_t x) const { - return at(x); + const T* cbegin() const { + return this->data(); } + /** + * Returns a pointer to one past the last element. + */ KERNEL_FLOAT_INLINE - void set(size_t x, T value) { - at(x) = std::move(value); + T* end() { + return this->data() + size(); } + /** + * Returns a pointer to one past the last element. + */ KERNEL_FLOAT_INLINE - T& operator[](size_t x) { - return at(x); + const T* end() const { + return this->data() + size(); } + /** + * Returns a pointer to one past the last element. + */ KERNEL_FLOAT_INLINE - const T& operator[](size_t x) const { - return at(x); + const T* cend() const { + return this->data() + size(); } + /** + * Copy the element at index `i`. + */ KERNEL_FLOAT_INLINE - T& operator()(size_t x) { + T get(size_t x) const { return at(x); } + /** + * Set the element at index `i`. + */ KERNEL_FLOAT_INLINE - const T& operator()(size_t x) const { - return at(x); + void set(size_t x, T value) { + at(x) = std::move(value); } + /** + * Cast the elements of this vector to type `R` and returns a new vector. + */ template KERNEL_FLOAT_INLINE vector cast() const { return kernel_float::cast(*this); } - template - KERNEL_FLOAT_INLINE vector> broadcast(extent new_size = {}) const { + /** + * Broadcast this vector into a new size `(Ns...)`. 
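A small usage sketch of the member form (the `vec` alias is assumed):

```
vec<float, 1> s = {3.0f};
vec<float, 4> v = s.broadcast<4>();  // [3.0f, 3.0f, 3.0f, 3.0f]
```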
+ */ + template + KERNEL_FLOAT_INLINE vector> broadcast(extent new_size = {}) const { return kernel_float::broadcast(*this, new_size); } + /** + * Apply the given function `F` to each element of this vector and returns a new vector with the results. + */ template KERNEL_FLOAT_INLINE vector, E> map(F fun) const { return kernel_float::map(fun, *this); } + /** + * Reduce the elements of the given vector input into a single value using the function `F`. + * + * This function should be a binary function that takes two elements and returns one element. The order in which + * the elements are reduced is not specified and depends on the reduction function and the vector type. + */ template KERNEL_FLOAT_INLINE T reduce(F fun) const { return kernel_float::reduce(fun, *this); } + /** + * Flattens the elements of this vector. For example, this turns a `vec, 3>` into a `vec`. + */ KERNEL_FLOAT_INLINE flatten_type flatten() const { return kernel_float::flatten(*this); } + /** + * Apply the given function `F` to each element of this vector. + */ template KERNEL_FLOAT_INLINE void for_each(F fun) const { return kernel_float::for_each(*this, std::move(fun)); } }; -#define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ - template<> \ - struct into_vector_traits<::T2> { \ - using value_type = T; \ - using extent_type = extent<2>; \ - \ - KERNEL_FLOAT_INLINE \ - static vector_storage call(::T2 v) { \ - return {v.x, v.y}; \ - } \ - }; \ - \ - template<> \ - struct into_vector_traits<::T3> { \ - using value_type = T; \ - using extent_type = extent<3>; \ - \ - KERNEL_FLOAT_INLINE \ - static vector_storage call(::T3 v) { \ - return {v.x, v.y, v.z}; \ - } \ - }; \ - \ - template<> \ - struct into_vector_traits<::T4> { \ - using value_type = T; \ - using extent_type = extent<4>; \ - \ - KERNEL_FLOAT_INLINE \ - static vector_storage call(::T4 v) { \ - return {v.x, v.y, v.z, v.w}; \ - } \ - }; - +/** + * Convert the given `input` into a vector. This function can perform one of the following actions: + * + * - For vectors `vec`, it simply returns the original vector. + * - For primitive types `T` (e.g., `int`, `float`, `double`), it returns a `vec`. + * - For array-like types (e.g., `int2`, `std::array`, `T[N]`), it returns `vec`. + */ template KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { return into_vector_traits::call(std::forward(input)); } -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(char, char1, char2, char3, char4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(short, short1, short2, short3, short4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(int, int1, int2, int3, int4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long, long1, long2, long3, long4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(long long, longlong1, longlong2, longlong3, longlong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned char, uchar1, uchar2, uchar3, uchar4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned short, ushort1, ushort2, ushort3, ushort4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned int, uint1, uint2, uint3, uint4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long, ulong1, ulong2, ulong3, ulong4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulonglong3, ulonglong4) - -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) -KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) - template using scalar = vector>; @@ -2546,6 +2808,19 @@ template using vec7 = vec; template using vec8 = vec; // clang-format on +/** + * Create a vector from a variable number of input values. 
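For example, with mixed argument types the element type becomes the promoted common type, as explained next (a sketch):

```
// int and float promote to float; three arguments give extent 3.
auto v = make_vec(1, 2.5f, 3);  // vec<float, 3>: [1.0f, 2.5f, 3.0f]
```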
+ * + * The resulting vector type is determined by promoting the types of the input values into a common type. + * The number of input values determines the dimension of the resulting vector. + * + * Example + * ======= + * ``` + * auto v1 = make_vec(1.0f, 2.0f, 3.0f); // Creates a vec [1.0f, 2.0f, 3.0f] + * auto v2 = make_vec(1, 2, 3, 4); // Creates a vec [1, 2, 3, 4] + * ``` + */ template KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... args) { using T = promote_t; @@ -3209,6 +3484,14 @@ KERNEL_FLOAT_INLINE constexpr kconstant kconst(T value) { return value; } +static constexpr kconstant operator""_c(long double v) { + return static_cast(v); +} + +static constexpr kconstant operator""_c(unsigned long long int v) { + return static_cast(v); +} + } // namespace prelude } // namespace kernel_float From 35d55328fd0cd6644f23137ea17dd4dd470a870d Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 15 Aug 2023 13:40:48 +0200 Subject: [PATCH 26/50] Add functions `concat` and `select` --- docs/build_api.py | 4 +- include/kernel_float/iterate.h | 124 ++++++++++++++++++++++++++- include/kernel_float/triops.h | 2 +- include/kernel_float/vector.h | 18 ++++ single_include/kernel_float.h | 148 +++++++++++++++++++++++++++++++-- 5 files changed, 287 insertions(+), 9 deletions(-) diff --git a/docs/build_api.py b/docs/build_api.py index eba749c..fde6fbb 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -83,10 +83,12 @@ def build_index_page(groups): "cast", "broadcast", "convert", + "make_vec", + "concat", + "select", "for_each", ], "Generation": [ - "make_vec", "range", "range_like", "each_index", diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 5531090..8df9980 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -104,8 +104,8 @@ struct flatten_helper { using value_type = typename flatten_helper::value_type; static constexpr size_t size = N * flatten_helper::size; - KERNEL_FLOAT_INLINE - static void call(const V& input, value_type* output) { + template + KERNEL_FLOAT_INLINE static void call(const V& input, U* output) { vector_storage storage = into_vector_storage(input); #pragma unroll @@ -124,6 +124,11 @@ struct flatten_helper { static void call(const T& input, T* output) { *output = input; } + + template + KERNEL_FLOAT_INLINE static void call(const T& input, U* output) { + *output = ops::cast {}(input); + } }; } // namespace detail @@ -152,6 +157,121 @@ KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { detail::flatten_helper::call(input, output.data()); return output; } + +namespace detail { +template +struct concat_helper {}; + +template +struct concat_helper { + using value_type = typename promote_type< + typename flatten_helper::value_type, + typename concat_helper::value_type>::type; + static constexpr size_t size = flatten_helper::size + concat_helper::size; + + template + KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... rest) { + flatten_helper::call(input, output); + concat_helper::call(output + flatten_helper::size, rest...); + } +}; + +template +struct concat_helper { + using value_type = typename promote_type< + typename flatten_helper::value_type, + typename concat_helper::value_type>::type; + static constexpr size_t size = flatten_helper::size + concat_helper::size; + + template + KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... 
rest) { + flatten_helper::call(input, output); + concat_helper::call(output + flatten_helper::size, rest...); + } +}; +} // namespace detail + +template +using concat_value_type = promote_t::value_type>; + +template +static constexpr size_t concat_size = detail::concat_helper::size; + +template +using concat_type = vector, extent>>; + +/** + * Concatenates the provided input values into a single one-dimensional vector. + * + * This function works in three steps: + * - All input values are converted into vectors using the `into_vector` operation. + * - The resulting vectors' elements are then promoted into a shared value type. + * - The resultant vectors are finally concatenated together. + * + * For instance, when invoking this function with arguments of types `float, double2, double`: + * - After the first step: `vec, vec, vec` + * - After the second step: `vec, vec, vec` + * - After the third step: `vec` + * + * Example + * ======= + * ``` + * double vec1 = 1.0; + * double3 vec2 = {3.0, 4.0, 5.0); + * double4 vec3 = {6.0, 7.0, 8.0, 9.0}; + * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8, 9] + * + * int num1 = 42; + * float num2 = 3.14159; + * int2 num3 = {-10, 10}; + * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] + * ``` + */ +template +KERNEL_FLOAT_INLINE concat_type concat(const Vs&... inputs) { + vector_storage, concat_size> output; + detail::concat_helper::call(output.data(), inputs...); + return output; +} + +template +using select_type = vector, extent>>; + +/** + * Selects elements from the this vector based on the specified indices. + * + * Example + * ======= + * ``` + * vec input = {0, 10, 20, 30, 40, 50}; + * vec vec1 = select(input, 0, 4, 4, 2); // [0, 40, 40, 20] + * + * vec indices = {0, 4, 4, 2}; + * vec vec2 = select(input, indices); // [0, 40, 40, 20] + * ``` + */ +template +KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... indices) { + using T = vector_value_type; + static constexpr size_t N = vector_extent; + static constexpr size_t M = concat_size; + + vector_storage index_set; + detail::concat_helper::call(index_set.data(), indices...); + + vector_storage inputs = into_vector_storage(input); + vector_storage outputs; + for (size_t i = 0; i < M; i++) { + size_t j = index_set.data()[i]; + + if (j < N) { + outputs.data()[i] = inputs.data()[j]; + } + } + + return outputs; +} + } // namespace kernel_float #endif \ No newline at end of file diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h index 7144c4f..e24971e 100644 --- a/include/kernel_float/triops.h +++ b/include/kernel_float/triops.h @@ -114,7 +114,7 @@ template< typename B, typename C, typename T = promoted_vector_value_type, - typename E = broadcast_vector_extent> + typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { using F = ops::fma; diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 8b045b2..2685534 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -201,6 +201,24 @@ struct vector: public S { at(x) = std::move(value); } + /** + * Selects elements from the this vector based on the specified indices. 
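One caveat, following from the free-function implementation above: an index `j` with `j >= N` fails the `j < N` guard and is skipped, so the corresponding output element is left unwritten rather than read out of bounds. A sketch:

```
vec<int, 6> input = {0, 10, 20, 30, 40, 50};

// Index 9 is out of range for extent 6; out[1] stays indeterminate.
vec<int, 2> out = select(input, 0, 9);
```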
+ * + * Example + * ======= + * ``` + * vec input = {0, 10, 20, 30, 40, 50}; + * vec vec1 = select(input, 0, 4, 4, 2); // [0, 40, 40, 20] + * + * vec indices = {0, 4, 4, 2}; + * vec vec2 = select(input, indices); // [0, 40, 40, 20] + * ``` + */ + template + KERNEL_FLOAT_INLINE select_type select(const Is&... indices) { + return kernel_float::select(*this, indices...); + } + /** * Cast the elements of this vector to type `R` and returns a new vector. */ diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 07f068e..4ab2de1 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-08-15 12:29:08.022922 -// git hash: 9b71242d6c1cd9c4f8f6309824fe0a774bf9719d +// date: 2023-08-15 13:37:05.439788 +// git hash: e1568c097af1d63ac20eab4032da36b68d7fb431 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -981,8 +981,8 @@ struct flatten_helper { using value_type = typename flatten_helper::value_type; static constexpr size_t size = N * flatten_helper::size; - KERNEL_FLOAT_INLINE - static void call(const V& input, value_type* output) { + template + KERNEL_FLOAT_INLINE static void call(const V& input, U* output) { vector_storage storage = into_vector_storage(input); #pragma unroll @@ -1001,6 +1001,11 @@ struct flatten_helper { static void call(const T& input, T* output) { *output = input; } + + template + KERNEL_FLOAT_INLINE static void call(const T& input, U* output) { + *output = ops::cast {}(input); + } }; } // namespace detail @@ -1029,6 +1034,121 @@ KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { detail::flatten_helper::call(input, output.data()); return output; } + +namespace detail { +template +struct concat_helper {}; + +template +struct concat_helper { + using value_type = typename promote_type< + typename flatten_helper::value_type, + typename concat_helper::value_type>::type; + static constexpr size_t size = flatten_helper::size + concat_helper::size; + + template + KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... rest) { + flatten_helper::call(input, output); + concat_helper::call(output + flatten_helper::size, rest...); + } +}; + +template +struct concat_helper { + using value_type = typename promote_type< + typename flatten_helper::value_type, + typename concat_helper::value_type>::type; + static constexpr size_t size = flatten_helper::size + concat_helper::size; + + template + KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... rest) { + flatten_helper::call(input, output); + concat_helper::call(output + flatten_helper::size, rest...); + } +}; +} // namespace detail + +template +using concat_value_type = promote_t::value_type>; + +template +static constexpr size_t concat_size = detail::concat_helper::size; + +template +using concat_type = vector, extent>>; + +/** + * Concatenates the provided input values into a single one-dimensional vector. + * + * This function works in three steps: + * - All input values are converted into vectors using the `into_vector` operation. + * - The resulting vectors' elements are then promoted into a shared value type. + * - The resultant vectors are finally concatenated together. 
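Both the promoted element type and the total length are compile-time constants, so they can be checked statically; a sketch (assumes `<type_traits>` is available):

```
static_assert(concat_size<float, double2, double> == 4, "1 + 2 + 1 elements");
static_assert(
    std::is_same<concat_value_type<float, double2, double>, double>::value,
    "float and double promote to double");
```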
+ * + * For instance, when invoking this function with arguments of types `float, double2, double`: + * - After the first step: `vec, vec, vec` + * - After the second step: `vec, vec, vec` + * - After the third step: `vec` + * + * Example + * ======= + * ``` + * double vec1 = 1.0; + * double3 vec2 = {3.0, 4.0, 5.0); + * double4 vec3 = {6.0, 7.0, 8.0, 9.0}; + * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8, 9] + * + * int num1 = 42; + * float num2 = 3.14159; + * int2 num3 = {-10, 10}; + * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] + * ``` + */ +template +KERNEL_FLOAT_INLINE concat_type concat(const Vs&... inputs) { + vector_storage, concat_size> output; + detail::concat_helper::call(output.data(), inputs...); + return output; +} + +template +using select_type = vector, extent>>; + +/** + * Selects elements from the this vector based on the specified indices. + * + * Example + * ======= + * ``` + * vec input = {0, 10, 20, 30, 40, 50}; + * vec vec1 = select(input, 0, 4, 4, 2); // [0, 40, 40, 20] + * + * vec indices = {0, 4, 4, 2}; + * vec vec2 = select(input, indices); // [0, 40, 40, 20] + * ``` + */ +template +KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... indices) { + using T = vector_value_type; + static constexpr size_t N = vector_extent; + static constexpr size_t M = concat_size; + + vector_storage index_set; + detail::concat_helper::call(index_set.data(), indices...); + + vector_storage inputs = into_vector_storage(input); + vector_storage outputs; + for (size_t i = 0; i < M; i++) { + size_t j = index_set.data()[i]; + + if (j < N) { + outputs.data()[i] = inputs.data()[j]; + } + } + + return outputs; +} + } // namespace kernel_float #endif @@ -2508,7 +2628,7 @@ template< typename B, typename C, typename T = promoted_vector_value_type, - typename E = broadcast_vector_extent> + typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { using F = ops::fma; @@ -2728,6 +2848,24 @@ struct vector: public S { at(x) = std::move(value); } + /** + * Selects elements from the this vector based on the specified indices. + * + * Example + * ======= + * ``` + * vec input = {0, 10, 20, 30, 40, 50}; + * vec vec1 = select(input, 0, 4, 4, 2); // [0, 40, 40, 20] + * + * vec indices = {0, 4, 4, 2}; + * vec vec2 = select(input, indices); // [0, 40, 40, 20] + * ``` + */ + template + KERNEL_FLOAT_INLINE select_type select(const Is&... indices) { + return kernel_float::select(*this, indices...); + } + /** * Cast the elements of this vector to type `R` and returns a new vector. 
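A short sketch of the member form:

```
vec<float, 4> f = {1.2f, 2.7f, 3.5f, 4.9f};
vec<int, 4> i = f.cast<int>();  // [1, 2, 3, 4] with the default truncating conversion
```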
*/ From 2acc2625f5edfd033b9cd2bc8c827c7d91058d78 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 15 Aug 2023 14:44:52 +0200 Subject: [PATCH 27/50] Change how is implemented --- include/kernel_float/iterate.h | 69 ++- include/kernel_float/meta.h | 15 + include/kernel_float/vector.h | 2 +- single_include/kernel_float.h | 838 +++++++++++++++++---------------- 4 files changed, 504 insertions(+), 420 deletions(-) diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index 8df9980..7b46db2 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -2,6 +2,7 @@ #define KERNEL_FLOAT_ITERATE_H #include "base.h" +#include "conversion.h" namespace kernel_float { @@ -105,12 +106,12 @@ struct flatten_helper { static constexpr size_t size = N * flatten_helper::size; template - KERNEL_FLOAT_INLINE static void call(const V& input, U* output) { + KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { vector_storage storage = into_vector_storage(input); #pragma unroll for (size_t i = 0; i < N; i++) { - flatten_helper::call(storage.data()[i], output + flatten_helper::size * i); + flatten_helper::call(output + flatten_helper::size * i, storage.data()[i]); } } }; @@ -121,12 +122,12 @@ struct flatten_helper { static constexpr size_t size = 1; KERNEL_FLOAT_INLINE - static void call(const T& input, T* output) { + static void call(T* output, const T& input) { *output = input; } template - KERNEL_FLOAT_INLINE static void call(const T& input, U* output) { + KERNEL_FLOAT_INLINE static void call(U* output, const T& input) { *output = ops::cast {}(input); } }; @@ -154,40 +155,66 @@ using flatten_type = vector, extent>>; template KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { vector_storage, flatten_size> output; - detail::flatten_helper::call(input, output.data()); + detail::flatten_helper::call(output.data(), input); return output; } namespace detail { +template> +struct concat_base_helper { + static constexpr size_t size = vector_extent; + + KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { + vector_storage storage = into_vector_storage(input); + + for (size_t i = 0; i < size; i++) { + output[i] = ops::cast {}(storage.data()[i]); + } + } +}; + +template +struct concat_base_helper { + static constexpr size_t size = 1; + + KERNEL_FLOAT_INLINE static void call(U* output, const T& input) { + *output = ops::cast {}(input); + } +}; + +template +struct concat_base_helper { + static constexpr size_t size = 1; + + KERNEL_FLOAT_INLINE static void call(T* output, const T& input) { + *output = input; + } +}; + template struct concat_helper {}; template struct concat_helper { - using value_type = typename promote_type< - typename flatten_helper::value_type, - typename concat_helper::value_type>::type; - static constexpr size_t size = flatten_helper::size + concat_helper::size; + using value_type = + typename promote_type, typename concat_helper::value_type>:: + type; + static constexpr size_t size = concat_base_helper::size + concat_helper::size; template KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... 
rest) { - flatten_helper::call(input, output); - concat_helper::call(output + flatten_helper::size, rest...); + concat_base_helper::call(output, input); + concat_helper::call(output + concat_base_helper::size, rest...); } }; -template -struct concat_helper { - using value_type = typename promote_type< - typename flatten_helper::value_type, - typename concat_helper::value_type>::type; - static constexpr size_t size = flatten_helper::size + concat_helper::size; +template<> +struct concat_helper<> { + using value_type = void; + static constexpr size_t size = 1; template - KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... rest) { - flatten_helper::call(input, output); - concat_helper::call(output + flatten_helper::size, rest...); - } + KERNEL_FLOAT_INLINE static void call(U* output) {} }; } // namespace detail diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h index 1bfb8a7..dcaac78 100644 --- a/include/kernel_float/meta.h +++ b/include/kernel_float/meta.h @@ -84,6 +84,21 @@ struct promote_type { using type = T; }; +template +struct promote_type { + using type = T; +}; + +template +struct promote_type { + using type = T; +}; + +template<> +struct promote_type { + using type = void; +}; + #define KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, U) \ template<> \ struct promote_type { \ diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 2685534..7447b7e 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -11,7 +11,7 @@ namespace kernel_float { /** - * Container that stores ``N`` values of type ``T``. + * Container that stores ``N`` elements of type ``T``. * * It is not recommended to use this class directly, but instead, use the type `vec` which is an alias for * `vector, vector_storage>`. diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 4ab2de1..e129d70 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-08-15 13:37:05.439788 -// git hash: e1568c097af1d63ac20eab4032da36b68d7fb431 +// date: 2023-08-15 14:44:32.635916 +// git hash: 35d55328fd0cd6644f23137ea17dd4dd470a870d //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -133,6 +133,21 @@ struct promote_type { using type = T; }; +template +struct promote_type { + using type = T; +}; + +template +struct promote_type { + using type = T; +}; + +template<> +struct promote_type { + using type = void; +}; + #define KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(T, U) \ template<> \ struct promote_type { \ @@ -875,44 +890,23 @@ struct promote_type> { } // namespace kernel_float #endif -#ifndef KERNEL_FLOAT_ITERATE_H -#define KERNEL_FLOAT_ITERATE_H +#ifndef KERNEL_FLOAT_UNOPS_H +#define KERNEL_FLOAT_UNOPS_H namespace kernel_float { - -/** - * Apply the function fun for each element from input. 
- * - * Example - * ======= - * ``` - * for_each(range(), [&](auto i) { - * printf("element: %d\n", i); - * }); - * ``` - */ -template -void for_each(V&& input, F fun) { - auto storage = into_vector_storage(input); - -#pragma unroll - for (size_t i = 0; i < vector_extent; i++) { - fun(storage.data()[i]); - } -} - namespace detail { -template -struct range_helper { - KERNEL_FLOAT_INLINE - static vector_storage call() { - vector_storage result; + +template +struct apply_impl { + KERNEL_FLOAT_INLINE static vector_storage + call(F fun, const vector_storage&... inputs) { + vector_storage result; #pragma unroll for (size_t i = 0; i < N; i++) { - result.data()[i] = T(i); + result.data()[i] = fun(inputs.data()[i]...); } return result; @@ -920,392 +914,136 @@ struct range_helper { }; } // namespace detail -/** - * Generate vector consisting of the numbers `0...N-1` of type `T` - * - * Example - * ======= - * ``` - * // Returns [0, 1, 2] - * vec vec = range(); - * ``` - */ -template -KERNEL_FLOAT_INLINE vector> range() { - return detail::range_helper::call(); -} +template +using map_type = vector>, vector_extent_type>; /** - * Takes a vector `vec` and returns a new vector consisting of the numbers ``0...N-1`` of type ``T`` + * Apply the function `F` to each element from the vector `input` and return the results as a new vector. * - * Example - * ======= + * Examples + * ======== * ``` - * auto input = vec(5.0f, 10.0f, -1.0f); - * auto indices = range_like(input); // returns [0.0f, 1.0f, 2.0f] + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] * ``` */ -template -KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return detail::range_helper, vector_extent>::call(); +template +KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { + using Input = vector_value_type; + using Output = result_t; + return detail::apply_impl, Output, Input>::call( + fun, + into_vector_storage(input)); } -/** - * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1``. The data type used - * for the indices is given by the first template argument, which is `size_t` by default. This function is useful when - * needing to iterate over the indices of a vector. - * - * Example - * ======= - * ``` - * // Returns [0, 1, 2] of type size_t - * vec a = each_index(float3(6, 4, 2)); - * - * // Returns [0, 1, 2] of type int. 
- * vec b = each_index(float3(6, 4, 2)); - * - * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; - * for (auto index: each_index(input)) { - * printf("%d] %f\n", index, input[index]); - * } - * ``` - */ -template -KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return detail::range_helper>::call(); -} +#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + namespace ops { \ + template \ + struct NAME { \ + KERNEL_FLOAT_INLINE T operator()(T input) { \ + return T(EXPR); \ + } \ + }; \ + } \ + template \ + KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ + using F = ops::NAME>; \ + return map(F {}, input); \ + } -namespace detail { -template, size_t N = vector_extent> -struct flatten_helper { - using value_type = typename flatten_helper::value_type; - static constexpr size_t size = N * flatten_helper::size; +#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ + KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ + template \ + KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ + return NAME(vec); \ + } - template - KERNEL_FLOAT_INLINE static void call(const V& input, U* output) { - vector_storage storage = into_vector_storage(input); +KERNEL_FLOAT_DEFINE_UNARY_OP(negate, -, -input) +KERNEL_FLOAT_DEFINE_UNARY_OP(bit_not, ~, ~input) +KERNEL_FLOAT_DEFINE_UNARY_OP(logical_not, !, !bool(input)) -#pragma unroll - for (size_t i = 0; i < N; i++) { - flatten_helper::call(storage.data()[i], output + flatten_helper::size * i); - } - } -}; +#define KERNEL_FLOAT_DEFINE_UNARY_FUN(NAME) KERNEL_FLOAT_DEFINE_UNARY(NAME, ::NAME(input)) -template -struct flatten_helper { - using value_type = T; - static constexpr size_t size = 1; +KERNEL_FLOAT_DEFINE_UNARY_FUN(acos) +KERNEL_FLOAT_DEFINE_UNARY_FUN(abs) +KERNEL_FLOAT_DEFINE_UNARY_FUN(acosh) +KERNEL_FLOAT_DEFINE_UNARY_FUN(asin) +KERNEL_FLOAT_DEFINE_UNARY_FUN(asinh) +KERNEL_FLOAT_DEFINE_UNARY_FUN(atan) +KERNEL_FLOAT_DEFINE_UNARY_FUN(atanh) +KERNEL_FLOAT_DEFINE_UNARY_FUN(cbrt) +KERNEL_FLOAT_DEFINE_UNARY_FUN(ceil) +KERNEL_FLOAT_DEFINE_UNARY_FUN(cos) +KERNEL_FLOAT_DEFINE_UNARY_FUN(cosh) +KERNEL_FLOAT_DEFINE_UNARY_FUN(cospi) +KERNEL_FLOAT_DEFINE_UNARY_FUN(erf) +KERNEL_FLOAT_DEFINE_UNARY_FUN(erfc) +KERNEL_FLOAT_DEFINE_UNARY_FUN(erfcinv) +KERNEL_FLOAT_DEFINE_UNARY_FUN(erfcx) +KERNEL_FLOAT_DEFINE_UNARY_FUN(erfinv) +KERNEL_FLOAT_DEFINE_UNARY_FUN(exp) +KERNEL_FLOAT_DEFINE_UNARY_FUN(exp10) +KERNEL_FLOAT_DEFINE_UNARY_FUN(exp2) +KERNEL_FLOAT_DEFINE_UNARY_FUN(expm1) +KERNEL_FLOAT_DEFINE_UNARY_FUN(fabs) +KERNEL_FLOAT_DEFINE_UNARY_FUN(floor) +KERNEL_FLOAT_DEFINE_UNARY_FUN(ilogb) +KERNEL_FLOAT_DEFINE_UNARY_FUN(lgamma) +KERNEL_FLOAT_DEFINE_UNARY_FUN(log) +KERNEL_FLOAT_DEFINE_UNARY_FUN(log10) +KERNEL_FLOAT_DEFINE_UNARY_FUN(logb) +KERNEL_FLOAT_DEFINE_UNARY_FUN(nearbyint) +KERNEL_FLOAT_DEFINE_UNARY_FUN(normcdf) +KERNEL_FLOAT_DEFINE_UNARY_FUN(rcbrt) +KERNEL_FLOAT_DEFINE_UNARY_FUN(sin) +KERNEL_FLOAT_DEFINE_UNARY_FUN(sinh) +KERNEL_FLOAT_DEFINE_UNARY_FUN(sqrt) +KERNEL_FLOAT_DEFINE_UNARY_FUN(tan) +KERNEL_FLOAT_DEFINE_UNARY_FUN(tanh) +KERNEL_FLOAT_DEFINE_UNARY_FUN(tgamma) +KERNEL_FLOAT_DEFINE_UNARY_FUN(trunc) +KERNEL_FLOAT_DEFINE_UNARY_FUN(y0) +KERNEL_FLOAT_DEFINE_UNARY_FUN(y1) +KERNEL_FLOAT_DEFINE_UNARY_FUN(yn) +KERNEL_FLOAT_DEFINE_UNARY_FUN(rint) +KERNEL_FLOAT_DEFINE_UNARY_FUN(rsqrt) +KERNEL_FLOAT_DEFINE_UNARY_FUN(round) +KERNEL_FLOAT_DEFINE_UNARY_FUN(signbit) +KERNEL_FLOAT_DEFINE_UNARY_FUN(isinf) +KERNEL_FLOAT_DEFINE_UNARY_FUN(isnan) - KERNEL_FLOAT_INLINE - static void call(const T& input, T* output) { - *output = input; +#if KERNEL_FLOAT_IS_DEVICE +#define 
KERNEL_FLOAT_DEFINE_UNARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_UNARY(FUN_NAME, ops::OP_NAME {}(input)) \ + namespace ops { \ + template<> \ + struct OP_NAME { \ + KERNEL_FLOAT_INLINE float operator()(float input) { \ + return FLOAT_FUN(input); \ + } \ + }; \ } +#else +#define KERNEL_FLOAT_DEFINE_UNARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ + KERNEL_FLOAT_DEFINE_UNARY(FUN_NAME, ops::OP_NAME {}(input)) +#endif - template - KERNEL_FLOAT_INLINE static void call(const T& input, U* output) { - *output = ops::cast {}(input); - } -}; -} // namespace detail +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_exp, exp, __expf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_log, log, __logf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_cos, cos, __cosf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_sin, sin, __sinf) +KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_tan, tan, __tanf) -template -using flatten_value_type = typename detail::flatten_helper::value_type; +} // namespace kernel_float -template -static constexpr size_t flatten_size = detail::flatten_helper::size; +#endif //KERNEL_FLOAT_UNOPS_H +#ifndef KERNEL_FLOAT_CAST_H +#define KERNEL_FLOAT_CAST_H -template -using flatten_type = vector, extent>>; -/** - * Flattens the elements of this vector. For example, this turns a `vec, 3>` into a `vec`. - * - * Example - * ======= - * ``` - * vec input = {{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}; - * vec result = flatten(input); // returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f] - * ``` - */ -template -KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { - vector_storage, flatten_size> output; - detail::flatten_helper::call(input, output.data()); - return output; -} -namespace detail { -template -struct concat_helper {}; -template -struct concat_helper { - using value_type = typename promote_type< - typename flatten_helper::value_type, - typename concat_helper::value_type>::type; - static constexpr size_t size = flatten_helper::size + concat_helper::size; - - template - KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... rest) { - flatten_helper::call(input, output); - concat_helper::call(output + flatten_helper::size, rest...); - } -}; - -template -struct concat_helper { - using value_type = typename promote_type< - typename flatten_helper::value_type, - typename concat_helper::value_type>::type; - static constexpr size_t size = flatten_helper::size + concat_helper::size; - - template - KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... rest) { - flatten_helper::call(input, output); - concat_helper::call(output + flatten_helper::size, rest...); - } -}; -} // namespace detail - -template -using concat_value_type = promote_t::value_type>; - -template -static constexpr size_t concat_size = detail::concat_helper::size; - -template -using concat_type = vector, extent>>; - -/** - * Concatenates the provided input values into a single one-dimensional vector. - * - * This function works in three steps: - * - All input values are converted into vectors using the `into_vector` operation. - * - The resulting vectors' elements are then promoted into a shared value type. - * - The resultant vectors are finally concatenated together. 
- * - * For instance, when invoking this function with arguments of types `float, double2, double`: - * - After the first step: `vec, vec, vec` - * - After the second step: `vec, vec, vec` - * - After the third step: `vec` - * - * Example - * ======= - * ``` - * double vec1 = 1.0; - * double3 vec2 = {3.0, 4.0, 5.0); - * double4 vec3 = {6.0, 7.0, 8.0, 9.0}; - * vec concatenated = concat(vec1, vec2, vec3); // contains [1, 2, 3, 4, 5, 6, 7, 8, 9] - * - * int num1 = 42; - * float num2 = 3.14159; - * int2 num3 = {-10, 10}; - * vec concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10] - * ``` - */ -template -KERNEL_FLOAT_INLINE concat_type concat(const Vs&... inputs) { - vector_storage, concat_size> output; - detail::concat_helper::call(output.data(), inputs...); - return output; -} - -template -using select_type = vector, extent>>; - -/** - * Selects elements from the this vector based on the specified indices. - * - * Example - * ======= - * ``` - * vec input = {0, 10, 20, 30, 40, 50}; - * vec vec1 = select(input, 0, 4, 4, 2); // [0, 40, 40, 20] - * - * vec indices = {0, 4, 4, 2}; - * vec vec2 = select(input, indices); // [0, 40, 40, 20] - * ``` - */ -template -KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... indices) { - using T = vector_value_type; - static constexpr size_t N = vector_extent; - static constexpr size_t M = concat_size; - - vector_storage index_set; - detail::concat_helper::call(index_set.data(), indices...); - - vector_storage inputs = into_vector_storage(input); - vector_storage outputs; - for (size_t i = 0; i < M; i++) { - size_t j = index_set.data()[i]; - - if (j < N) { - outputs.data()[i] = inputs.data()[j]; - } - } - - return outputs; -} - -} // namespace kernel_float - -#endif -#ifndef KERNEL_FLOAT_UNOPS_H -#define KERNEL_FLOAT_UNOPS_H - - - -namespace kernel_float { -namespace detail { - -template -struct apply_impl { - KERNEL_FLOAT_INLINE static vector_storage - call(F fun, const vector_storage&... inputs) { - vector_storage result; - -#pragma unroll - for (size_t i = 0; i < N; i++) { - result.data()[i] = fun(inputs.data()[i]...); - } - - return result; - } -}; -} // namespace detail - -template -using map_type = vector>, vector_extent_type>; - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. 
- * - * Examples - * ======== - * ``` - * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; - * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] - * ``` - */ -template -KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { - using Input = vector_value_type; - using Output = result_t; - return detail::apply_impl, Output, Input>::call( - fun, - into_vector_storage(input)); -} - -#define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - namespace ops { \ - template \ - struct NAME { \ - KERNEL_FLOAT_INLINE T operator()(T input) { \ - return T(EXPR); \ - } \ - }; \ - } \ - template \ - KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ - using F = ops::NAME>; \ - return map(F {}, input); \ - } - -#define KERNEL_FLOAT_DEFINE_UNARY_OP(NAME, OP, EXPR) \ - KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ - template \ - KERNEL_FLOAT_INLINE vector operator OP(const vector& vec) { \ - return NAME(vec); \ - } - -KERNEL_FLOAT_DEFINE_UNARY_OP(negate, -, -input) -KERNEL_FLOAT_DEFINE_UNARY_OP(bit_not, ~, ~input) -KERNEL_FLOAT_DEFINE_UNARY_OP(logical_not, !, !bool(input)) - -#define KERNEL_FLOAT_DEFINE_UNARY_FUN(NAME) KERNEL_FLOAT_DEFINE_UNARY(NAME, ::NAME(input)) - -KERNEL_FLOAT_DEFINE_UNARY_FUN(acos) -KERNEL_FLOAT_DEFINE_UNARY_FUN(abs) -KERNEL_FLOAT_DEFINE_UNARY_FUN(acosh) -KERNEL_FLOAT_DEFINE_UNARY_FUN(asin) -KERNEL_FLOAT_DEFINE_UNARY_FUN(asinh) -KERNEL_FLOAT_DEFINE_UNARY_FUN(atan) -KERNEL_FLOAT_DEFINE_UNARY_FUN(atanh) -KERNEL_FLOAT_DEFINE_UNARY_FUN(cbrt) -KERNEL_FLOAT_DEFINE_UNARY_FUN(ceil) -KERNEL_FLOAT_DEFINE_UNARY_FUN(cos) -KERNEL_FLOAT_DEFINE_UNARY_FUN(cosh) -KERNEL_FLOAT_DEFINE_UNARY_FUN(cospi) -KERNEL_FLOAT_DEFINE_UNARY_FUN(erf) -KERNEL_FLOAT_DEFINE_UNARY_FUN(erfc) -KERNEL_FLOAT_DEFINE_UNARY_FUN(erfcinv) -KERNEL_FLOAT_DEFINE_UNARY_FUN(erfcx) -KERNEL_FLOAT_DEFINE_UNARY_FUN(erfinv) -KERNEL_FLOAT_DEFINE_UNARY_FUN(exp) -KERNEL_FLOAT_DEFINE_UNARY_FUN(exp10) -KERNEL_FLOAT_DEFINE_UNARY_FUN(exp2) -KERNEL_FLOAT_DEFINE_UNARY_FUN(expm1) -KERNEL_FLOAT_DEFINE_UNARY_FUN(fabs) -KERNEL_FLOAT_DEFINE_UNARY_FUN(floor) -KERNEL_FLOAT_DEFINE_UNARY_FUN(ilogb) -KERNEL_FLOAT_DEFINE_UNARY_FUN(lgamma) -KERNEL_FLOAT_DEFINE_UNARY_FUN(log) -KERNEL_FLOAT_DEFINE_UNARY_FUN(log10) -KERNEL_FLOAT_DEFINE_UNARY_FUN(logb) -KERNEL_FLOAT_DEFINE_UNARY_FUN(nearbyint) -KERNEL_FLOAT_DEFINE_UNARY_FUN(normcdf) -KERNEL_FLOAT_DEFINE_UNARY_FUN(rcbrt) -KERNEL_FLOAT_DEFINE_UNARY_FUN(sin) -KERNEL_FLOAT_DEFINE_UNARY_FUN(sinh) -KERNEL_FLOAT_DEFINE_UNARY_FUN(sqrt) -KERNEL_FLOAT_DEFINE_UNARY_FUN(tan) -KERNEL_FLOAT_DEFINE_UNARY_FUN(tanh) -KERNEL_FLOAT_DEFINE_UNARY_FUN(tgamma) -KERNEL_FLOAT_DEFINE_UNARY_FUN(trunc) -KERNEL_FLOAT_DEFINE_UNARY_FUN(y0) -KERNEL_FLOAT_DEFINE_UNARY_FUN(y1) -KERNEL_FLOAT_DEFINE_UNARY_FUN(yn) -KERNEL_FLOAT_DEFINE_UNARY_FUN(rint) -KERNEL_FLOAT_DEFINE_UNARY_FUN(rsqrt) -KERNEL_FLOAT_DEFINE_UNARY_FUN(round) -KERNEL_FLOAT_DEFINE_UNARY_FUN(signbit) -KERNEL_FLOAT_DEFINE_UNARY_FUN(isinf) -KERNEL_FLOAT_DEFINE_UNARY_FUN(isnan) - -#if KERNEL_FLOAT_IS_DEVICE -#define KERNEL_FLOAT_DEFINE_UNARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ - KERNEL_FLOAT_DEFINE_UNARY(FUN_NAME, ops::OP_NAME {}(input)) \ - namespace ops { \ - template<> \ - struct OP_NAME { \ - KERNEL_FLOAT_INLINE float operator()(float input) { \ - return FLOAT_FUN(input); \ - } \ - }; \ - } -#else -#define KERNEL_FLOAT_DEFINE_UNARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ - KERNEL_FLOAT_DEFINE_UNARY(FUN_NAME, ops::OP_NAME {}(input)) -#endif - -KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_exp, exp, __expf) -KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_log, log, __logf) 
-KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_cos, cos, __cosf) -KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_sin, sin, __sinf) -KERNEL_FLOAT_DEFINE_UNARY_FAST(fast_tan, tan, __tanf) - -} // namespace kernel_float - -#endif //KERNEL_FLOAT_UNOPS_H -#ifndef KERNEL_FLOAT_CAST_H -#define KERNEL_FLOAT_CAST_H - - - - -namespace kernel_float { +namespace kernel_float { enum struct RoundingMode { ANY, DOWN, UP, NEAREST, TOWARD_ZERO }; @@ -2000,6 +1738,310 @@ KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) } // namespace kernel_float +#endif +#ifndef KERNEL_FLOAT_ITERATE_H +#define KERNEL_FLOAT_ITERATE_H + + + + +namespace kernel_float { + +/** + * Apply the function fun for each element from input. + * + * Example + * ======= + * ``` + * for_each(range(), [&](auto i) { + * printf("element: %d\n", i); + * }); + * ``` + */ +template +void for_each(V&& input, F fun) { + auto storage = into_vector_storage(input); + +#pragma unroll + for (size_t i = 0; i < vector_extent; i++) { + fun(storage.data()[i]); + } +} + +namespace detail { +template +struct range_helper { + KERNEL_FLOAT_INLINE + static vector_storage call() { + vector_storage result; + +#pragma unroll + for (size_t i = 0; i < N; i++) { + result.data()[i] = T(i); + } + + return result; + } +}; +} // namespace detail + +/** + * Generate vector consisting of the numbers `0...N-1` of type `T` + * + * Example + * ======= + * ``` + * // Returns [0, 1, 2] + * vec vec = range(); + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> range() { + return detail::range_helper::call(); +} + +/** + * Takes a vector `vec` and returns a new vector consisting of the numbers ``0...N-1`` of type ``T`` + * + * Example + * ======= + * ``` + * auto input = vec(5.0f, 10.0f, -1.0f); + * auto indices = range_like(input); // returns [0.0f, 1.0f, 2.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { + return detail::range_helper, vector_extent>::call(); +} + +/** + * Takes a vector of size ``N`` and returns a new vector consisting of the numbers ``0...N-1``. The data type used + * for the indices is given by the first template argument, which is `size_t` by default. This function is useful when + * needing to iterate over the indices of a vector. + * + * Example + * ======= + * ``` + * // Returns [0, 1, 2] of type size_t + * vec a = each_index(float3(6, 4, 2)); + * + * // Returns [0, 1, 2] of type int. 
+ * vec b = each_index(float3(6, 4, 2)); + * + * vec input = {1.0f, 2.0f, 3.0f, 4.0f}; + * for (auto index: each_index(input)) { + * printf("%d] %f\n", index, input[index]); + * } + * ``` + */ +template +KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { + return detail::range_helper>::call(); +} + +namespace detail { +template, size_t N = vector_extent> +struct flatten_helper { + using value_type = typename flatten_helper::value_type; + static constexpr size_t size = N * flatten_helper::size; + + template + KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { + vector_storage storage = into_vector_storage(input); + +#pragma unroll + for (size_t i = 0; i < N; i++) { + flatten_helper::call(output + flatten_helper::size * i, storage.data()[i]); + } + } +}; + +template +struct flatten_helper { + using value_type = T; + static constexpr size_t size = 1; + + KERNEL_FLOAT_INLINE + static void call(T* output, const T& input) { + *output = input; + } + + template + KERNEL_FLOAT_INLINE static void call(U* output, const T& input) { + *output = ops::cast {}(input); + } +}; +} // namespace detail + +template +using flatten_value_type = typename detail::flatten_helper::value_type; + +template +static constexpr size_t flatten_size = detail::flatten_helper::size; + +template +using flatten_type = vector, extent>>; + +/** + * Flattens the elements of this vector. For example, this turns a `vec, 3>` into a `vec`. + * + * Example + * ======= + * ``` + * vec input = {{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}; + * vec result = flatten(input); // returns [1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f] + * ``` + */ +template +KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { + vector_storage, flatten_size> output; + detail::flatten_helper::call(output.data(), input); + return output; +} + +namespace detail { +template> +struct concat_base_helper { + static constexpr size_t size = vector_extent; + + KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { + vector_storage storage = into_vector_storage(input); + + for (size_t i = 0; i < size; i++) { + output[i] = ops::cast {}(storage.data()[i]); + } + } +}; + +template +struct concat_base_helper { + static constexpr size_t size = 1; + + KERNEL_FLOAT_INLINE static void call(U* output, const T& input) { + *output = ops::cast {}(input); + } +}; + +template +struct concat_base_helper { + static constexpr size_t size = 1; + + KERNEL_FLOAT_INLINE static void call(T* output, const T& input) { + *output = input; + } +}; + +template +struct concat_helper {}; + +template +struct concat_helper { + using value_type = + typename promote_type, typename concat_helper::value_type>:: + type; + static constexpr size_t size = concat_base_helper::size + concat_helper::size; + + template + KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... rest) { + concat_base_helper::call(output, input); + concat_helper::call(output + concat_base_helper::size, rest...); + } +}; + +template<> +struct concat_helper<> { + using value_type = void; + static constexpr size_t size = 1; + + template + KERNEL_FLOAT_INLINE static void call(U* output) {} +}; +} // namespace detail + +template +using concat_value_type = promote_t::value_type>; + +template +static constexpr size_t concat_size = detail::concat_helper::size; + +template +using concat_type = vector, extent>>; + +/** + * Concatenates the provided input values into a single one-dimensional vector. 
+ *
+ * This function works in three steps:
+ * - All input values are converted into vectors using the `into_vector` operation.
+ * - The resulting vectors' elements are then promoted into a shared value type.
+ * - The resultant vectors are finally concatenated together.
+ *
+ * For instance, when invoking this function with arguments of types `float, double2, double`:
+ * - After the first step: `vec<float, 1>, vec<double, 2>, vec<double, 1>`
+ * - After the second step: `vec<double, 1>, vec<double, 2>, vec<double, 1>`
+ * - After the third step: `vec<double, 4>`
+ *
+ * Example
+ * =======
+ * ```
+ * double vec1 = 1.0;
+ * double3 vec2 = {3.0, 4.0, 5.0};
+ * double4 vec3 = {6.0, 7.0, 8.0, 9.0};
+ * vec<double, 8> concatenated = concat(vec1, vec2, vec3); // contains [1, 3, 4, 5, 6, 7, 8, 9]
+ *
+ * int num1 = 42;
+ * float num2 = 3.14159;
+ * int2 num3 = {-10, 10};
+ * vec<float, 4> concatenated = concat(num1, num2, num3); // contains [42, 3.14159, -10, 10]
+ * ```
+ */
+template<typename... Vs>
+KERNEL_FLOAT_INLINE concat_type<Vs...> concat(const Vs&... inputs) {
+    vector_storage<concat_value_type<Vs...>, concat_size<Vs...>> output;
+    detail::concat_helper<Vs...>::call(output.data(), inputs...);
+    return output;
+}
+
+template<typename V, typename... Is>
+using select_type = vector<vector_value_type<V>, extent<concat_size<Is...>>>;
+
+/**
+ * Selects elements from this vector based on the specified indices.
+ *
+ * Example
+ * =======
+ * ```
+ * vec<int, 6> input = {0, 10, 20, 30, 40, 50};
+ * vec<int, 4> vec1 = select(input, 0, 4, 4, 2); // [0, 40, 40, 20]
+ *
+ * vec<int, 4> indices = {0, 4, 4, 2};
+ * vec<int, 4> vec2 = select(input, indices); // [0, 40, 40, 20]
+ * ```
+ */
+template<typename V, typename... Is>
+KERNEL_FLOAT_INLINE select_type<V, Is...> select(const V& input, const Is&... indices) {
+    using T = vector_value_type<V>;
+    static constexpr size_t N = vector_extent<V>;
+    static constexpr size_t M = concat_size<Is...>;
+
+    vector_storage<size_t, M> index_set;
+    detail::concat_helper<Is...>::call(index_set.data(), indices...);
+
+    vector_storage<T, N> inputs = into_vector_storage(input);
+    vector_storage<T, M> outputs;
+    for (size_t i = 0; i < M; i++) {
+        size_t j = index_set.data()[i];
+
+        if (j < N) {
+            outputs.data()[i] = inputs.data()[j];
+        }
+    }
+
+    return outputs;
+}
+
+} // namespace kernel_float
+
 #endif
 #ifndef KERNEL_FLOAT_MEMORY_H
 #define KERNEL_FLOAT_MEMORY_H
@@ -2658,7 +2700,7 @@ KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) {
 namespace kernel_float {
 
 /**
- * Container that stores ``N`` values of type ``T``.
+ * Container that stores ``N`` elements of type ``T``.
  *
  * It is not recommended to use this class directly, but instead, use the type `vec<T, N>` which is an alias for
  * `vector<T, extent<N>, vector_storage<T, N>>`.
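
The iterate-style helpers introduced above (`range`, `each_index`, `flatten`, `concat`, `select`) are designed to compose. The snippet below is a rough illustration of how they fit together; it assumes the public `kernel_float.h` header and a `kf` namespace alias, and the helper names `reverse4` and `sum_nested` are hypothetical, not part of the library:

```
#include "kernel_float.h"
namespace kf = kernel_float;

// Hypothetical helper: select() with explicit indices gathers elements,
// so the indices 3, 2, 1, 0 reverse a four-element vector.
__host__ __device__ kf::vec<float, 4> reverse4(const kf::vec<float, 4>& v) {
    return kf::select(v, 3, 2, 1, 0);
}

// Hypothetical helper: flatten() turns a vec<vec<float, 2>, 2> into a
// vec<float, 4>, and each_index() yields the indices 0...3 to loop over.
__host__ __device__ float sum_nested(const kf::vec<kf::vec<float, 2>, 2>& nested) {
    kf::vec<float, 4> flat = kf::flatten(nested);
    float total = 0.0f;
    for (auto i : kf::each_index(flat)) {
        total += flat[i];
    }
    return total;
}
```
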
From fdbb671df1fc1fb26f84f634b4b7ed40e5e4db3f Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 16 Aug 2023 11:21:45 +0200 Subject: [PATCH 28/50] Add missing INLINEs for literal operators --- include/kernel_float/prelude.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/kernel_float/prelude.h b/include/kernel_float/prelude.h index 9f9c1cf..637ed9c 100644 --- a/include/kernel_float/prelude.h +++ b/include/kernel_float/prelude.h @@ -84,10 +84,12 @@ KERNEL_FLOAT_INLINE constexpr kconstant kconst(T value) { return value; } +KERNEL_FLOAT_INLINE static constexpr kconstant operator""_c(long double v) { return static_cast(v); } +KERNEL_FLOAT_INLINE static constexpr kconstant operator""_c(unsigned long long int v) { return static_cast(v); } From 67e7c5ee325e0b0f1225c4e9b239bf8dd1bf1c9f Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 16 Aug 2023 11:22:09 +0200 Subject: [PATCH 29/50] Remove `constant` operator overloads since they are ambiguous --- include/kernel_float/constant.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h index 5ee2fae..095ec7c 100644 --- a/include/kernel_float/constant.h +++ b/include/kernel_float/constant.h @@ -79,10 +79,10 @@ struct cast, R, m> { return constant(operator OP(T(left.get()), T(right.get()))); \ } -KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) -KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) -KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) -KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) +//KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) +//KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) +//KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) +//KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) } // namespace kernel_float From b236a521d5decdd59b17361febbb7ee39803b715 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 16 Aug 2023 11:22:23 +0200 Subject: [PATCH 30/50] Add vector conversion for `T[N]` --- include/kernel_float/base.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h index 20a4735..b72bc42 100644 --- a/include/kernel_float/base.h +++ b/include/kernel_float/base.h @@ -183,6 +183,24 @@ struct into_vector_traits { } }; +template +struct into_vector_traits { + using value_type = T; + using extent_type = extent; + + KERNEL_FLOAT_INLINE + static vector_storage call(const T (&input)[N]) { + return call(input, make_index_sequence()); + } + + private: + template + KERNEL_FLOAT_INLINE static vector_storage + call(const T (&input)[N], index_sequence) { + return {input[Is]...}; + } +}; + template struct into_vector_traits: into_vector_traits {}; From 2551fb2d3f9e13b7e5e88fd4a852f7395e287305 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 16 Aug 2023 12:44:20 +0200 Subject: [PATCH 31/50] Add `into_vector` to documentation --- docs/build_api.py | 1 + include/kernel_float/vector.h | 3 ++- single_include/kernel_float.h | 35 ++++++++++++++++++++++++++++------- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/docs/build_api.py b/docs/build_api.py index fde6fbb..ee15b1e 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -84,6 +84,7 @@ def build_index_page(groups): "broadcast", "convert", "make_vec", + "into_vector", "concat", "select", "for_each", diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 7447b7e..50e3860 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -275,7 +275,8 @@ struct vector: public S { * * - For vectors `vec`, it simply returns the original vector. 
* - For primitive types `T` (e.g., `int`, `float`, `double`), it returns a `vec`. - * - For array-like types (e.g., `int2`, `std::array`, `T[N]`), it returns `vec`. + * - For array-like types (e.g., `std::array`, `T[N]`), it returns `vec`. + * - For vector-like types (e.g., `int2`, `dim3`), it returns `vec`. */ template KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index e129d70..c410f9a 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-08-15 14:44:32.635916 -// git hash: 35d55328fd0cd6644f23137ea17dd4dd470a870d +// date: 2023-08-16 12:43:52.493856 +// git hash: b236a521d5decdd59b17361febbb7ee39803b715 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -507,6 +507,24 @@ struct into_vector_traits { } }; +template +struct into_vector_traits { + using value_type = T; + using extent_type = extent; + + KERNEL_FLOAT_INLINE + static vector_storage call(const T (&input)[N]) { + return call(input, make_index_sequence()); + } + + private: + template + KERNEL_FLOAT_INLINE static vector_storage + call(const T (&input)[N], index_sequence) { + return {input[Is]...}; + } +}; + template struct into_vector_traits: into_vector_traits {}; @@ -1731,10 +1749,10 @@ struct cast, R, m> { return constant(operator OP(T(left.get()), T(right.get()))); \ } -KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) -KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) -KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) -KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) +//KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) +//KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) +//KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) +//KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) } // namespace kernel_float @@ -2964,7 +2982,8 @@ struct vector: public S { * * - For vectors `vec`, it simply returns the original vector. * - For primitive types `T` (e.g., `int`, `float`, `double`), it returns a `vec`. - * - For array-like types (e.g., `int2`, `std::array`, `T[N]`), it returns `vec`. + * - For array-like types (e.g., `std::array`, `T[N]`), it returns `vec`. + * - For vector-like types (e.g., `int2`, `dim3`), it returns `vec`. */ template KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { @@ -3664,10 +3683,12 @@ KERNEL_FLOAT_INLINE constexpr kconstant kconst(T value) { return value; } +KERNEL_FLOAT_INLINE static constexpr kconstant operator""_c(long double v) { return static_cast(v); } +KERNEL_FLOAT_INLINE static constexpr kconstant operator""_c(unsigned long long int v) { return static_cast(v); } From df42b93bfd36d8d9f1a397218cd91ebe1c13325f Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 16 Aug 2023 12:53:28 +0200 Subject: [PATCH 32/50] Add template deduction guides --- include/kernel_float/vector.h | 12 ++++++++++++ single_include/kernel_float.h | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 50e3860..81b7ea5 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -319,6 +319,18 @@ KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... return vector_storage {T {args}...}; }; +#if defined(__cpp_deduction_guides) +// Deduction guide for `vector` +template +vector(Args&&... 
args) -> vector, extent>; + +// Deduction guides for aliases are only supported from C++20 +#if __cpp_deduction_guides >= 201907L +template +vec(Args&&... args) -> vec, sizeof...(Args)>; +#endif +#endif + } // namespace kernel_float #endif diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index c410f9a..8a64bcc 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-08-16 12:43:52.493856 -// git hash: b236a521d5decdd59b17361febbb7ee39803b715 +// date: 2023-08-16 12:52:02.575852 +// git hash: 2551fb2d3f9e13b7e5e88fd4a852f7395e287305 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -3026,6 +3026,18 @@ KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... return vector_storage {T {args}...}; }; +#if defined(__cpp_deduction_guides) +// Deduction guide for `vector` +template +vector(Args&&... args) -> vector, extent>; + +// Deduction guides for aliases are only supported from C++20 +#if __cpp_deduction_guides >= 201907L +template +vec(Args&&... args) -> vec, sizeof...(Args)>; +#endif +#endif + } // namespace kernel_float #endif From 8ab491f94309bc6bcb81642d3d1020906bbac606 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 24 Aug 2023 20:19:15 +0200 Subject: [PATCH 33/50] Fix several issues in `complex.h` --- include/kernel_float/complex.h | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/include/kernel_float/complex.h b/include/kernel_float/complex.h index 37dbdfc..aa133e3 100644 --- a/include/kernel_float/complex.h +++ b/include/kernel_float/complex.h @@ -20,26 +20,26 @@ struct complex_type: complex_type_storage { KERNEL_FLOAT_INLINE complex_type(complex_type that) : base_type(that.real(), that.imag()) {} KERNEL_FLOAT_INLINE - complex_type(T real = {}, T imag = {}) : base_type(real, im) {} + complex_type(T real = {}, T imag = {}) : base_type(real, imag) {} KERNEL_FLOAT_INLINE T real() const { - return re; + return this->re; } KERNEL_FLOAT_INLINE T imag() const { - return im; + return this->im; } KERNEL_FLOAT_INLINE T norm() const { - return re * re + im * im; + return real() * real() + imag() * imag(); } KERNEL_FLOAT_INLINE complex_type conj() const { - return {re, -im}; + return {real(), -imag()}; } }; @@ -80,23 +80,17 @@ KERNEL_FLOAT_INLINE complex_type operator-(complex_type v) { template KERNEL_FLOAT_INLINE complex_type operator-(complex_type a, complex_type b) { - return { - a.real() - b.real(), a.imag() - b.imag() - } + return {a.real() - b.real(), a.imag() - b.imag()}; } template KERNEL_FLOAT_INLINE complex_type operator-(T a, complex_type b) { - return { - a - b.real(), -b.imag() - } + return {a - b.real(), -b.imag()}; } template KERNEL_FLOAT_INLINE complex_type operator-(complex_type a, T b) { - return { - a.real() - b, a.imag() - } + return {a.real() - b, a.imag()}; } template @@ -111,9 +105,7 @@ KERNEL_FLOAT_INLINE complex_type& operator-=(complex_type& a, T b) { template KERNEL_FLOAT_INLINE complex_type operator*(complex_type a, complex_type b) { - return { - a.real() * b.real() - a.imag() * b.imag(), a.real() * b.imag() + a.imag() * b.real() - } + return {a.real() * b.real() - a.imag() * b.imag(), a.real() * b.imag() + a.imag() * b.real()}; } template @@ -133,10 +125,7 @@ KERNEL_FLOAT_INLINE complex_type& operator*=(complex_type& a, T b) 
{ template KERNEL_FLOAT_INLINE complex_type operator*(T a, complex_type b) { - return { - a * b.real(), - a * b.imag(), - }; + return {a * b.real(), a * b.imag()}; } template From be214a21c6321511a551dc493a45e56abe348848 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 24 Aug 2023 20:20:24 +0200 Subject: [PATCH 34/50] Fallback to using fp32 for fp16 operations that are not supported (e.g., `tan`, `expm1`) --- include/kernel_float/bf16.h | 36 +++++++++++- include/kernel_float/fp16.h | 33 ++++++++++- single_include/kernel_float.h | 104 +++++++++++++++++++++++++--------- 3 files changed, 140 insertions(+), 33 deletions(-) diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 2b6fa3b..20f0a9e 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -116,6 +116,22 @@ struct reduce_helper= 2)>> { }; } // namespace detail +#define KERNEL_FLOAT_BF16_UNARY_FORWARD(NAME) \ + namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) { \ + return __nv_bfloat16(ops::NAME {}(float(input))); \ + } \ + }; \ + } + +KERNEL_FLOAT_BF16_UNARY_FORWARD(tan) +KERNEL_FLOAT_BF16_UNARY_FORWARD(asin) +KERNEL_FLOAT_BF16_UNARY_FORWARD(acos) +KERNEL_FLOAT_BF16_UNARY_FORWARD(atan) +KERNEL_FLOAT_BF16_UNARY_FORWARD(expm1) + #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \ namespace ops { \ @@ -128,13 +144,16 @@ struct reduce_helper= 2)>> { } \ namespace detail { \ template<> \ - struct map_bfloat16x2> { \ + struct map_halfx2> { \ KERNEL_FLOAT_INLINE static __nv_bfloat162 \ call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ return FUN2(input); \ } \ }; \ } +#else +#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) KERNEL_FLOAT_BF16_UNARY_FORWARD(NAME) +#endif KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2) KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2) @@ -156,6 +175,7 @@ KERNEL_FLOAT_BF16_UNARY_FUN(fast_log, ::hlog, ::h2log) KERNEL_FLOAT_BF16_UNARY_FUN(fast_cos, ::hcos, ::h2cos) KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) +#if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ namespace ops { \ template<> \ @@ -175,6 +195,18 @@ KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) } \ }; \ } +#else +#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__nv_bfloat16> { \ + KERNEL_FLOAT_INLINE __nv_bfloat16 \ + operator()(__nv_bfloat16 left, __nv_bfloat16 right) const { \ + return __nv_bfloat16(ops::NAME {}(float(left), float(right))); \ + } \ + }; \ + } +#endif KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2) KERNEL_FLOAT_BF16_BINARY_FUN(subtract, __hsub, __hsub2) @@ -192,8 +224,6 @@ KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2) KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2) KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2) -#endif - #define KERNEL_FLOAT_BF16_CAST(T, TO_HALF, FROM_HALF) \ namespace ops { \ template<> \ diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index f80978b..aa9675f 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -112,6 +112,22 @@ struct reduce_helper= 2)>> { }; // namespace detail +#define KERNEL_FLOAT_FP16_UNARY_FORWARD(NAME) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half input) { \ + return __half(ops::NAME {}(float(input))); \ + } \ + }; \ + } + +KERNEL_FLOAT_FP16_UNARY_FORWARD(tan) 
+KERNEL_FLOAT_FP16_UNARY_FORWARD(asin) +KERNEL_FLOAT_FP16_UNARY_FORWARD(acos) +KERNEL_FLOAT_FP16_UNARY_FORWARD(atan) +KERNEL_FLOAT_FP16_UNARY_FORWARD(expm1) + #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ namespace ops { \ @@ -130,6 +146,9 @@ struct reduce_helper= 2)>> { } \ }; \ } +#else +#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) KERNEL_FLOAT_FP16_UNARY_FORWARD(NAME) +#endif KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2) KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2) @@ -151,6 +170,7 @@ KERNEL_FLOAT_FP16_UNARY_FUN(fast_log, ::hlog, ::h2log) KERNEL_FLOAT_FP16_UNARY_FUN(fast_cos, ::hcos, ::h2cos) KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) +#if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ namespace ops { \ template<> \ @@ -168,6 +188,17 @@ KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) } \ }; \ } +#else +#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const { \ + return __half(ops::NAME {}(float(left), float(right))); \ + } \ + }; \ + } +#endif KERNEL_FLOAT_FP16_BINARY_FUN(add, __hadd, __hadd2) KERNEL_FLOAT_FP16_BINARY_FUN(subtract, __hsub, __hsub2) @@ -185,8 +216,6 @@ KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2) KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2) KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hgt2) -#endif - #define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \ namespace ops { \ template<> \ diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 8a64bcc..2cd1450 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-08-16 12:52:02.575852 -// git hash: 2551fb2d3f9e13b7e5e88fd4a852f7395e287305 +// date: 2023-08-24 20:18:55.064697 +// git hash: df42b93bfd36d8d9f1a397218cd91ebe1c13325f //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -670,26 +670,26 @@ struct complex_type: complex_type_storage { KERNEL_FLOAT_INLINE complex_type(complex_type that) : base_type(that.real(), that.imag()) {} KERNEL_FLOAT_INLINE - complex_type(T real = {}, T imag = {}) : base_type(real, im) {} + complex_type(T real = {}, T imag = {}) : base_type(real, imag) {} KERNEL_FLOAT_INLINE T real() const { - return re; + return this->re; } KERNEL_FLOAT_INLINE T imag() const { - return im; + return this->im; } KERNEL_FLOAT_INLINE T norm() const { - return re * re + im * im; + return real() * real() + imag() * imag(); } KERNEL_FLOAT_INLINE complex_type conj() const { - return {re, -im}; + return {real(), -imag()}; } }; @@ -730,23 +730,17 @@ KERNEL_FLOAT_INLINE complex_type operator-(complex_type v) { template KERNEL_FLOAT_INLINE complex_type operator-(complex_type a, complex_type b) { - return { - a.real() - b.real(), a.imag() - b.imag() - } + return {a.real() - b.real(), a.imag() - b.imag()}; } template KERNEL_FLOAT_INLINE complex_type operator-(T a, complex_type b) { - return { - a - b.real(), -b.imag() - } + return {a - b.real(), -b.imag()}; } template KERNEL_FLOAT_INLINE complex_type operator-(complex_type a, T b) { - return { - a.real() - b, a.imag() - } + return {a.real() - b, a.imag()}; } template @@ -761,9 +755,7 @@ KERNEL_FLOAT_INLINE complex_type& operator-=(complex_type& a, T b) { template KERNEL_FLOAT_INLINE complex_type operator*(complex_type a, complex_type b) { - return { - a.real() * b.real() - a.imag() * b.imag(), a.real() * b.imag() + a.imag() * b.real() - } + return {a.real() * b.real() - a.imag() * b.imag(), a.real() * b.imag() + a.imag() * b.real()}; } template @@ -783,10 +775,7 @@ KERNEL_FLOAT_INLINE complex_type& operator*=(complex_type& a, T b) { template KERNEL_FLOAT_INLINE complex_type operator*(T a, complex_type b) { - return { - a * b.real(), - a * b.imag(), - }; + return {a * b.real(), a * b.imag()}; } template @@ -3155,6 +3144,22 @@ struct reduce_helper= 2)>> { }; // namespace detail +#define KERNEL_FLOAT_FP16_UNARY_FORWARD(NAME) \ + namespace ops { \ + template<> \ + struct NAME<__half> { \ + KERNEL_FLOAT_INLINE __half operator()(__half input) { \ + return __half(ops::NAME {}(float(input))); \ + } \ + }; \ + } + +KERNEL_FLOAT_FP16_UNARY_FORWARD(tan) +KERNEL_FLOAT_FP16_UNARY_FORWARD(asin) +KERNEL_FLOAT_FP16_UNARY_FORWARD(acos) +KERNEL_FLOAT_FP16_UNARY_FORWARD(atan) +KERNEL_FLOAT_FP16_UNARY_FORWARD(expm1) + #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) \ namespace ops { \ @@ -3173,6 +3178,9 @@ struct reduce_helper= 2)>> { } \ }; \ } +#else +#define KERNEL_FLOAT_FP16_UNARY_FUN(NAME, FUN1, FUN2) KERNEL_FLOAT_FP16_UNARY_FORWARD(NAME) +#endif KERNEL_FLOAT_FP16_UNARY_FUN(abs, ::__habs, ::__habs2) KERNEL_FLOAT_FP16_UNARY_FUN(negate, ::__hneg, ::__hneg2) @@ -3194,6 +3202,7 @@ KERNEL_FLOAT_FP16_UNARY_FUN(fast_log, ::hlog, ::h2log) KERNEL_FLOAT_FP16_UNARY_FUN(fast_cos, ::hcos, ::h2cos) KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) +#if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ namespace ops { \ template<> \ @@ -3211,6 +3220,17 @@ KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin) } \ }; \ } +#else +#define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2) \ 
+    namespace ops {                                                                       \
+    template<>                                                                            \
+    struct NAME<__half> {                                                                 \
+        KERNEL_FLOAT_INLINE __half operator()(__half left, __half right) const {          \
+            return __half(ops::NAME<float> {}(float(left), float(right)));                \
+        }                                                                                 \
+    };                                                                                    \
+    }
+#endif
 
 KERNEL_FLOAT_FP16_BINARY_FUN(add, __hadd, __hadd2)
 KERNEL_FLOAT_FP16_BINARY_FUN(subtract, __hsub, __hsub2)
@@ -3228,8 +3248,6 @@ KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2)
 KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2)
 KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hgt2)
 
-#endif
-
 #define KERNEL_FLOAT_FP16_CAST(T, TO_HALF, FROM_HALF) \
     namespace ops { \
     template<> \
@@ -3429,6 +3447,22 @@ struct reduce_helper= 2)>> {
 };
 } // namespace detail
 
+#define KERNEL_FLOAT_BF16_UNARY_FORWARD(NAME)                                          \
+    namespace ops {                                                                    \
+    template<>                                                                         \
+    struct NAME<__nv_bfloat16> {                                                       \
+        KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) {            \
+            return __nv_bfloat16(ops::NAME<float> {}(float(input)));                   \
+        }                                                                              \
+    };                                                                                 \
+    }
+
+KERNEL_FLOAT_BF16_UNARY_FORWARD(tan)
+KERNEL_FLOAT_BF16_UNARY_FORWARD(asin)
+KERNEL_FLOAT_BF16_UNARY_FORWARD(acos)
+KERNEL_FLOAT_BF16_UNARY_FORWARD(atan)
+KERNEL_FLOAT_BF16_UNARY_FORWARD(expm1)
+
 #if KERNEL_FLOAT_IS_DEVICE
 #define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) \
     namespace ops { \
@@ -3441,13 +3475,16 @@ struct reduce_helper= 2)>> {
     } \
     namespace detail { \
     template<> \
-    struct map_bfloat16x2<ops::NAME<__nv_bfloat16>> { \
+    struct map_halfx2<ops::NAME<__nv_bfloat16>> { \
         KERNEL_FLOAT_INLINE static __nv_bfloat162 \
         call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \
             return FUN2(input); \
         } \
     }; \
     }
+#else
+#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2) KERNEL_FLOAT_BF16_UNARY_FORWARD(NAME)
+#endif
 
 KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2)
 KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2)
@@ -3469,6 +3506,7 @@ KERNEL_FLOAT_BF16_UNARY_FUN(fast_log, ::hlog, ::h2log)
 KERNEL_FLOAT_BF16_UNARY_FUN(fast_cos, ::hcos, ::h2cos)
 KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
 
+#if KERNEL_FLOAT_IS_DEVICE
 #define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2) \
     namespace ops { \
     template<> \
@@ -3488,6 +3526,18 @@ KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
     } \
     }; \
     }
+#else
+#define KERNEL_FLOAT_BF16_BINARY_FUN(NAME, FUN1, FUN2)                                 \
+    namespace ops {                                                                    \
+    template<>                                                                         \
+    struct NAME<__nv_bfloat16> {                                                       \
+        KERNEL_FLOAT_INLINE __nv_bfloat16                                              \
+        operator()(__nv_bfloat16 left, __nv_bfloat16 right) const {                    \
+            return __nv_bfloat16(ops::NAME<float> {}(float(left), float(right)));      \
+        }                                                                              \
+    };                                                                                 \
+    }
+#endif
 
 KERNEL_FLOAT_BF16_BINARY_FUN(add, __hadd, __hadd2)
 KERNEL_FLOAT_BF16_BINARY_FUN(subtract, __hsub, __hsub2)
@@ -3505,8 +3555,6 @@ KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2)
 KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2)
 KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2)
 
-#endif
-
 #define KERNEL_FLOAT_BF16_CAST(T, TO_HALF, FROM_HALF) \
     namespace ops { \
     template<> \

From 903d677fac309fc2247dbe4da29632f4b2df2c77 Mon Sep 17 00:00:00 2001
From: stijn
Date: Mon, 28 Aug 2023 14:21:21 +0200
Subject: [PATCH 35/50] Promote `half` + `bfloat16` to `float`

---
 include/kernel_float/bf16.h | 13 ++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h
index 20f0a9e..3065855 100644
--- a/include/kernel_float/bf16.h
+++ b/include/kernel_float/bf16.h
@@ -320,8 +320,20 @@ struct dot_helper<__nv_bfloat16, N> {
 namespace kernel_float {
 KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input));
+
+template<>
+struct promote_type<__nv_bfloat16, __half> {
+    using type = float;
+};
+
+template<>
+struct promote_type<__half, __nv_bfloat16> {
+    using type = float;
+};
+
-}
+} // namespace kernel_float
+
 #endif  // KERNEL_FLOAT_FP16_AVAILABLE
 
 #endif
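
With the two `promote_type` specializations above, arithmetic that mixes `__half` and `__nv_bfloat16` operands now promotes to `float` instead of failing to find a common value type. A minimal sketch of the intended effect (illustrative only; it assumes the single `kernel_float.h` header, device-side fp16/bf16 support, and that mixed-type element-wise operators resolve through the promotion rules):

```
#include "kernel_float.h"
namespace kf = kernel_float;

// Mixed-precision element-wise add: the value types __half and __nv_bfloat16
// share the promoted type float, so the result is a vec<float, 2>.
__device__ kf::vec<float, 2> mixed_add(kf::vec<__half, 2> a, kf::vec<__nv_bfloat16, 2> b) {
    return a + b;
}
```
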

From 31ffbb7ca20f9c4a1c43b37e06c99600a8f15b91 Mon Sep 17 00:00:00 2001
From: stijn
Date: Mon, 28 Aug 2023 14:23:27 +0200
Subject: [PATCH 36/50] Promote `constant<L>` + `constant<R>` to
 `constant<promote_type<L, R>>`

---
 include/kernel_float/constant.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h
index 095ec7c..11c76a8 100644
--- a/include/kernel_float/constant.h
+++ b/include/kernel_float/constant.h
@@ -8,6 +8,9 @@ namespace kernel_float {
 
 template<typename T>
 struct constant {
+    template<typename R>
+    KERNEL_FLOAT_INLINE explicit constexpr constant(const constant<R>& that) : value_(that.get()) {}
+
     KERNEL_FLOAT_INLINE
     constexpr constant(T value = {}) : value_(value) {}
 
@@ -32,7 +35,7 @@ KERNEL_FLOAT_INLINE constexpr constant<T> make_constant(T value) {
 
 template<typename L, typename R>
 struct promote_type<constant<L>, constant<R>> {
-    using type = typename promote_type<L, R>::type;
+    using type = constant<typename promote_type<L, R>::type>;
 };
 
 template

From 1212e8f8dca2ea08efe841bfa4e4193a95b58f0a Mon Sep 17 00:00:00 2001
From: stijn
Date: Mon, 28 Aug 2023 14:30:37 +0200
Subject: [PATCH 37/50] Add deduction guides for type aliases in prelude

---
 include/kernel_float/constant.h |  6 ++++++
 include/kernel_float/prelude.h  | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h
index 11c76a8..19ecfd8 100644
--- a/include/kernel_float/constant.h
+++ b/include/kernel_float/constant.h
@@ -28,6 +28,12 @@ struct constant {
     T value_;
 };
 
+// Deduction guide for `constant`
+#if defined(__cpp_deduction_guides)
+template<typename T>
+constant(T&&) -> constant<decay_t<T>>;
+#endif
+
 template<typename T>
 KERNEL_FLOAT_INLINE constexpr constant<T> make_constant(T value) {
     return value;
 }
diff --git a/include/kernel_float/prelude.h b/include/kernel_float/prelude.h
index 637ed9c..2bc06a2 100644
--- a/include/kernel_float/prelude.h
+++ b/include/kernel_float/prelude.h
@@ -1,7 +1,9 @@
 #ifndef KERNEL_FLOAT_PRELUDE_H
 #define KERNEL_FLOAT_PRELUDE_H
 
+#include "bf16.h"
 #include "constant.h"
+#include "fp16.h"
 #include "vector.h"
 
 namespace kernel_float {
@@ -94,6 +96,18 @@ static constexpr kconstant<long long> operator""_c(unsigned long long int v) {
     return static_cast<long long>(v);
 }
 
+// Deduction guides for aliases are only supported from C++20
+#if defined(__cpp_deduction_guides) && __cpp_deduction_guides >= 201907L
+template<typename T>
+kscalar(T&&) -> kscalar<decay_t<T>>;
+
+template<typename... Args>
+kvec(Args&&...) -> kvec<promote_t<Args...>, sizeof...(Args)>;
+
+template<typename T>
+kconstant(T&&) -> kconstant<decay_t<T>>;
+#endif
+
 } // namespace prelude
 } // namespace kernel_float

From cc846b6677c99135bbcb63d8c57bbbdd8ce3ba60 Mon Sep 17 00:00:00 2001
From: stijn
Date: Mon, 28 Aug 2023 14:30:47 +0200
Subject: [PATCH 38/50] Update single include

---
 single_include/kernel_float.h | 40 ++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
index 2cd1450..7ea6bf2 100644
--- a/single_include/kernel_float.h
+++ b/single_include/kernel_float.h
@@ -1,7 +1,7 @@
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
-// date: 2023-08-24 20:18:55.064697 -// git hash: df42b93bfd36d8d9f1a397218cd91ebe1c13325f +// date: 2023-08-28 14:29:52.760763 +// git hash: 31ffbb7ca20f9c4a1c43b37e06c99600a8f15b91 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -1667,6 +1667,9 @@ namespace kernel_float { template struct constant { + template + KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) : value_(that.get()) {} + KERNEL_FLOAT_INLINE constexpr constant(T value = {}) : value_(value) {} @@ -1684,6 +1687,12 @@ struct constant { T value_; }; +// Deduction guide for `constant` +#if defined(__cpp_deduction_guides) +template +constant(T&&) -> constant>; +#endif + template KERNEL_FLOAT_INLINE constexpr constant make_constant(T value) { return value; @@ -1691,7 +1700,7 @@ KERNEL_FLOAT_INLINE constexpr constant make_constant(T value) { template struct promote_type, constant> { - using type = typename promote_type::type; + using type = constant::type>; }; template @@ -3651,8 +3660,19 @@ struct dot_helper<__nv_bfloat16, N> { namespace kernel_float { KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)); + +template<> +struct promote_type<__nv_bfloat16, __half> { + using type = float; +} + +template<> +struct promote_type<__half, __nv_bfloat16> { + using type = float; } +} // namespace kernel_float + #endif // KERNEL_FLOAT_FP16_AVAILABLE #endif @@ -3663,6 +3683,8 @@ KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)) + + namespace kernel_float { namespace prelude { namespace kf = ::kernel_float; @@ -3753,6 +3775,18 @@ static constexpr kconstant operator""_c(unsigned long long int v) return static_cast(v); } +// Deduction guides for aliases are only supported from C++20 +#if defined(__cpp_deduction_guides) && __cpp_deduction_guides >= 201907L +template +kscalar(T&&) -> kscalar>; + +template +kvec(Args&&...) 
-> kvec, sizeof...(Args)>; + +template +kconstant(T&&) -> kconstant>; +#endif + } // namespace prelude } // namespace kernel_float From 64f21903e8049e4a46c53897a167f31174e1a231 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 18 Sep 2023 17:40:25 +0200 Subject: [PATCH 39/50] Rewrite test framework --- tests/basic.cu | 47 ------ tests/basics.cu | 148 +++++++++++++++++++ tests/binops.cu | 210 +++++++++++++-------------- tests/broadcast.cu | 67 --------- tests/cast.cu | 156 -------------------- tests/common.h | 353 +++++++++++++++++++++++++-------------------- tests/promotion.cu | 116 +++++++++++++++ tests/reduce.cu | 59 -------- tests/swizzle.cu | 43 ------ tests/triops.cu | 29 ---- tests/unops.cu | 105 ++++++-------- 11 files changed, 600 insertions(+), 733 deletions(-) delete mode 100644 tests/basic.cu create mode 100644 tests/basics.cu delete mode 100644 tests/broadcast.cu delete mode 100644 tests/cast.cu create mode 100644 tests/promotion.cu delete mode 100644 tests/reduce.cu delete mode 100644 tests/swizzle.cu delete mode 100644 tests/triops.cu diff --git a/tests/basic.cu b/tests/basic.cu deleted file mode 100644 index ee49d28..0000000 --- a/tests/basic.cu +++ /dev/null @@ -1,47 +0,0 @@ -#include "common.h" -#include "kernel_float.h" - -namespace kf = kernel_float; - -template> -struct basic_test; - -template -struct basic_test> { - __host__ __device__ void operator()(generator gen) { - T items[N] = {gen.next(Is)...}; - kf::vec a = {items[Is]...}; - - // check if getters work - ASSERT(equals(a.get(Is), items[Is]) && ...); - ASSERT(equals(a[Is], items[Is]) && ...); - - // check if setter works - T new_items[N] = {gen.next(Is)...}; - (a.set(Is, new_items[Is]), ...); - ASSERT(equals(a.get(Is), new_items[Is]) && ...); - - // check if setter works - T more_new_items[N] = {gen.next(Is)...}; - ((a[Is] = more_new_items[Is]), ...); - ASSERT(equals(a.get(Is), more_new_items[Is]) && ...); - - // check default constructor - kf::vec b; - ASSERT(equals(b.get(Is), T {}) && ...); - - // check broadcast constructor - T value = gen(); - kf::vec c {value}; - ASSERT(equals(c.get(Is), value) && ...); - - // check make_vec - kf::vec d = kf::make_vec(items[Is]...); - ASSERT(equals(d.get(Is), items[Is]) && ...); - } -}; - -TEST_CASE("basic") { - run_on_host_and_device(); - run_on_device(); -} diff --git a/tests/basics.cu b/tests/basics.cu new file mode 100644 index 0000000..f5c9061 --- /dev/null +++ b/tests/basics.cu @@ -0,0 +1,148 @@ +#include "common.h" + +struct basics_tests { + template + __host__ __device__ void operator()(generator gen, std::index_sequence) { + // default constructor + { + kf::vec x; + ASSERT(equals(x[I], T()) && ...); + } + + // filled with one + { + kf::vec x = {T((gen.next(I), 1))...}; + ASSERT(equals(x[I], T(1)) && ...); + } + + // filled with steps + { + kf::vec x = {T(I)...}; + ASSERT(equals(x[I], T(I)) && ...); + } + + // broadcast constructor + { + T init = gen.next(); + kf::vec x {init}; + ASSERT(equals(x[I], init) && ...); + } + + // Getters + T items[N] = {gen.next(I)...}; + kf::vec a = {items[I]...}; + + ASSERT(equals(a[I], items[I]) && ...); + ASSERT(equals(a.get(I), items[I]) && ...); + ASSERT(equals(a.at(I), items[I]) && ...); + ASSERT(equals(a(I), items[I]) && ...); + + // Data, begin, end + ASSERT(a.size() == N); + ASSERT(&a[0] == a.data()); + ASSERT(&a[0] == a.begin()); + ASSERT(&a[0] + N == a.end()); + ASSERT(&a[0] == a.cdata()); + ASSERT(&a[0] == a.cbegin()); + ASSERT(&a[0] + N == a.cend()); + + // setters + T new_items[N] = {gen.next(I)...}; + (a.set(I, new_items[I]), 
...);
+        ASSERT(equals(a[I], new_items[I]) && ...);
+    }
+};
+
+REGISTER_TEST_CASE("basics", basics_tests, int, float)
+
+struct creation_tests {
+    __host__ __device__ void operator()(generator<int> gen) {
+        using kernel_float::into_vector;
+        using kernel_float::make_vec;
+
+        // into_vector on scalar
+        {
+            kf::vec<int, 1> a = into_vector(int(5));
+            ASSERT(a[0] == 5);
+        }
+
+        // into_vector on CUDA vector types
+        {
+            kf::vec<int, 1> a = into_vector(make_int1(5));
+            kf::vec<int, 2> b = into_vector(make_int2(5, 4));
+            kf::vec<int, 3> c = into_vector(make_int3(5, 4, -1));
+            kf::vec<int, 4> d = into_vector(make_int4(5, 4, -1, 0));
+
+            ASSERT(a[0] == 5);
+            ASSERT(b[0] == 5 && b[1] == 4);
+            ASSERT(c[0] == 5 && c[1] == 4 && c[2] == -1);
+            ASSERT(d[0] == 5 && d[1] == 4 && d[2] == -1 && d[3] == 0);
+        }
+
+        // into_vector on C-style array
+        {
+            int items[3] = {1, 2, 3};
+            kf::vec<int, 3> a = into_vector(items);
+            ASSERT(a[0] == 1 && a[1] == 2 && a[2] == 3);
+        }
+
+        // into_vector on kf array
+        {
+            kf::vec<int, 3> items = {1, 2, 3};
+            kf::vec<int, 3> a = into_vector(items);
+            ASSERT(a[0] == 1 && a[1] == 2 && a[2] == 3);
+        }
+
+        // make_vec
+        {
+            kf::vec<int, 3> a = make_vec(true, short(2), int(3));
+            ASSERT(a[0] == 1 && a[1] == 2 && a[2] == 3);
+        }
+    }
+
+    __host__ __device__ void operator()(generator<float> gen) {
+        using kernel_float::into_vector;
+        using kernel_float::make_vec;
+
+        // into_vector on scalar
+        {
+            kf::vec<float, 1> a = into_vector(float(5.0f));
+            ASSERT(a[0] == 5.0f);
+        }
+
+        // into_vector on CUDA vector types
+        {
+            kf::vec<float, 1> a = into_vector(make_float1(5.0f));
+            kf::vec<float, 2> b = into_vector(make_float2(5.0f, 4.0f));
+            kf::vec<float, 3> c = into_vector(make_float3(5.0f, 4.0f, -1.0f));
+            kf::vec<float, 4> d = into_vector(make_float4(5.0f, 4.0f, -1.0f, 0.0f));
+
+            ASSERT(a[0] == 5.0f);
+            ASSERT(b[0] == 5.0f && b[1] == 4.0f);
+            ASSERT(c[0] == 5.0f && c[1] == 4.0f && c[2] == -1.0f);
+            ASSERT(d[0] == 5.0f && d[1] == 4.0f && d[2] == -1.0f && d[3] == 0.0f);
+        }
+
+        // into_vector on C-style array
+        {
+            float items[3] = {1.0f, 2.0f, 3.0f};
+            kf::vec<float, 3> a = into_vector(items);
+            ASSERT(a[0] == 1.0f && a[1] == 2.0f && a[2] == 3.0f);
+        }
+
+        // into_vector on kf array
+        {
+            kf::vec<float, 3> items = {1.0f, 2.0f, 3.0f};
+            kf::vec<float, 3> a = into_vector(items);
+            ASSERT(a[0] == 1.0f && a[1] == 2.0f && a[2] == 3.0f);
+        }
+
+        // make_vec
+        {
+            kf::vec<float, 3> a = make_vec(true, int(2), 3.0f);
+            ASSERT(a[0] == 1.0f && a[1] == 2.0f && a[2] == 3.0f);
+        }
+    }
+};
+
+REGISTER_TEST_CASE("into_vec and make_vec", creation_tests, int, float)
\ No newline at end of file
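
The rewritten framework registers one functor per test case; `REGISTER_TEST_CASE` instantiates it for each listed element type, and `REGISTER_TEST_CASE_GPU` additionally covers device-only types. A minimal sketch of adding a new case under this scheme (illustrative; `scale_tests` is a made-up functor, and the functor signature is copied from the usages in the diffs below):

```
#include "common.h"

// Hypothetical test functor: element-wise doubling via operator+.
struct scale_tests {
    template<typename T, size_t N, size_t... I>
    __host__ __device__ void operator()(generator<T> gen, std::index_sequence<I...>) {
        T x[N] = {gen.next(I)...};
        kf::vec<T, N> a = {x[I]...};
        kf::vec<T, N> b = a + a;
        ASSERT(equals(T(x[I] + x[I]), b[I]) && ...);
    }
};

REGISTER_TEST_CASE("scaling", scale_tests, int, float, double)
```
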
diff --git a/tests/binops.cu b/tests/binops.cu
index 8409b71..e7e51ee 100644
--- a/tests/binops.cu
+++ b/tests/binops.cu
@@ -1,149 +1,137 @@
 #include "common.h"
-#include "kernel_float.h"
 
-namespace kf = kernel_float;
+struct binops_tests {
+    template<typename T, size_t N, size_t... I>
+    __host__ __device__ void operator()(generator<T> gen, std::index_sequence<I...>) {
+        T x[N] = {gen.next(I)...};
+        T y[N] = {gen.next(I)...};
 
-template>
-struct arithmetic_test;
-
-template
-struct arithmetic_test> {
-    __host__ __device__ void operator()(generator gen) {
-        kf::vec a {gen.next(Is)...}, b {gen.next(Is)...}, c;
+        kf::vec<T, N> a = {x[I]...};
+        kf::vec<T, N> b = {y[I]...};
+        kf::vec<T, N> c;
 
-        // binary operator
+        // Arithmetic
         c = a + b;
-        ASSERT(equals(c.get(Is), a.get(Is) + b.get(Is)) && ...);
+        ASSERT(equals(T(x[I] + y[I]), c[I]) && ...);
 
         c = a - b;
-        ASSERT(equals(c.get(Is), a.get(Is) - b.get(Is)) && ...);
+        ASSERT(equals(T(x[I] - y[I]), c[I]) && ...);
 
         c = a * b;
-        ASSERT(equals(c.get(Is), a.get(Is) * b.get(Is)) && ...);
+        ASSERT(equals(T(x[I] * y[I]), c[I]) && ...);
 
-        c = a / b;
-        ASSERT(equals(c.get(Is), a.get(Is) / b.get(Is)) && ...);
+        // Results in division by zero
+        // c = a / b;
+        // ASSERT(equals(T(x[I] / y[I]), c[I]) && ...);
 
-        // assignment operator
-        c = a;
-        c += b;
-        ASSERT(equals(c.get(Is), a.get(Is) + b.get(Is)) && ...);
+        // Results in division by zero
+        // c = a % b;
+        // ASSERT(equals(T(x[I] % y[I]), c[I]) && ...);
 
-        c = a;
-        c -= b;
-        ASSERT(equals(c.get(Is), a.get(Is) - b.get(Is)) && ...);
-
-        c = a;
-        c *= b;
+        // Comparison
+        c = a < b;
+        ASSERT(equals(T(x[I] < y[I]), c[I]) && ...);
 
-        c = a;
-        c /= b;
-        ASSERT(equals(c.get(Is), a.get(Is) / b.get(Is)) && ...);
-    }
-};
+        c = a > b;
+        ASSERT(equals(T(x[I] > y[I]), c[I]) && ...);
 
-template>
-struct minmax_test;
+        c = a <= b;
+        ASSERT(equals(T(x[I] <= y[I]), c[I]) && ...);
 
-template
-struct minmax_test> {
-    __host__ __device__ void operator()(generator gen) {
-        kf::vec a {gen.next(Is)...}, b {gen.next(Is)...}, c;
+        c = a >= b;
+        ASSERT(equals(T(x[I] >= y[I]), c[I]) && ...);
 
-        c = kf::min(a, b);
-        ASSERT(equals(c.get(Is), a.get(Is) < b.get(Is) ? a.get(Is) : b.get(Is)) && ...);
+        c = a == b;
+        ASSERT(equals(T(x[I] == y[I]), c[I]) && ...);
 
-        c = kf::max(a, b);
-        ASSERT(equals(c.get(Is), a.get(Is) > b.get(Is) ? a.get(Is) : b.get(Is)) && ...);
-    }
-};
+        c = a != b;
+        ASSERT(equals(T(x[I] != y[I]), c[I]) && ...);
 
-template
-struct minmax_test> {
-    __host__ __device__ void operator()(generator gen) {
-        kf::vec a {gen.next(Is)...}, b {gen.next(Is)...}, c;
+        // Assignment
+        c = a;
+        c += b;
+        ASSERT(equals(T(x[I] + y[I]), c[I]) && ...);
 
-        c = kf::min(a, b);
-        ASSERT(equals(c.get(Is), fminf(a.get(Is), b.get(Is))) && ...);
+        c = a;
+        c -= b;
+        ASSERT(equals(T(x[I] - y[I]), c[I]) && ...);
 
-        c = kf::max(a, b);
-        ASSERT(equals(c.get(Is), fmaxf(a.get(Is), b.get(Is))) && ...);
+        c = a;
+        c *= b;
+        ASSERT(equals(T(x[I] * y[I]), c[I]) && ...);
     }
 };
 
-template
-struct minmax_test> {
-    __host__ __device__ void operator()(generator gen) {
-        kf::vec a {gen.next(Is)...}, b {gen.next(Is)...}, c;
-
-        c = kf::min(a, b);
-        ASSERT(equals(c.get(Is), fmin(a.get(Is), b.get(Is))) && ...);
-
-        c = kf::max(a, b);
-        ASSERT(equals(c.get(Is), fmax(a.get(Is), b.get(Is))) && ...);
-    }
-};
+REGISTER_TEST_CASE("binary operators", binops_tests, bool, int, float, double)
+REGISTER_TEST_CASE_GPU("binary operators", binops_tests, __half, __nv_bfloat16)
 
-template>
-struct relational_test;
+struct binops_float_tests {
+    template<typename T, size_t N, size_t... I>
+    __host__ __device__ void operator()(generator<T> gen, std::index_sequence<I...>) {
+        T x[N] = {gen.next(I)...};
+        T y[N] = {gen.next(I)...};
 
-template
-struct relational_test> {
-    __host__ __device__ void operator()(generator gen) {
-        kf::vec a {gen.next(Is)...};
-        kf::vec b {gen.next(Is)...};
+        kf::vec<T, N> a = {x[I]...};
+        kf::vec<T, N> b = {y[I]...};
         kf::vec<T, N> c;
 
-        c = a == b;
-        ASSERT(equals(c.get(Is), T(a.get(Is) == b.get(Is))) && ...);
-
-        c = a != b;
-        ASSERT(equals(c.get(Is), T(a.get(Is) != b.get(Is))) && ...);
-
-        c = a < b;
-        ASSERT(equals(c.get(Is), T(a.get(Is) < b.get(Is))) && ...);
-
-        c = a <= b;
-        ASSERT(equals(c.get(Is), T(a.get(Is) <= b.get(Is))) && ...);
+        c = a / b;
+        ASSERT(equals(T(x[I] / y[I]), c[I]) && ...);
 
-        c = a > b;
-        ASSERT(equals(c.get(Is), T(a.get(Is) > b.get(Is))) && ...);
+        // remainder is not supported for fp16
+        if constexpr (is_none_of<T, __half, __nv_bfloat16>) {
+            c = a % b;
+            ASSERT(equals(T(fmod(x[I], y[I])), c[I]) && ...);
+        }
+    }
+};
 
-        c = a >= b;
-        ASSERT(equals(c.get(Is), T(a.get(Is) >= b.get(Is))) && ...);
+REGISTER_TEST_CASE("binary float operators", binops_float_tests, float, double)
+REGISTER_TEST_CASE_GPU("binary float operators", 
binops_float_tests, __half, __nv_bfloat16) + +struct minmax_tests { + template + __host__ __device__ void operator()(generator gen, std::index_sequence) { + T x[N] = {gen.next(I)...}; + T y[N] = {gen.next(I)...}; + + kf::vec a = {x[I]...}; + kf::vec b = {y[I]...}; + + kf::vec lo = min(a, b); + kf::vec hi = max(a, b); + + if constexpr (is_one_of) { + ASSERT(equals(fmin(a[I], b[I]), lo[I]) && ...); + ASSERT(equals(fmax(a[I], b[I]), hi[I]) && ...); + } else if constexpr (is_one_of) { + ASSERT(equals(fminf(a[I], b[I]), lo[I]) && ...); + ASSERT(equals(fmaxf(a[I], b[I]), hi[I]) && ...); + } else if constexpr (is_one_of) { + ASSERT(equals(__hmin(a[I], b[I]), lo[I]) && ...); + ASSERT(equals(__hmax(a[I], b[I]), hi[I]) && ...); + } else { + ASSERT(equals(x[I] < y[I] ? x[I] : y[I], lo[I]) && ...); + ASSERT(equals(x[I] < y[I] ? y[I] : x[I], hi[I]) && ...); + } } }; -template> -struct bitwise_test; +REGISTER_TEST_CASE("min/max functions", minmax_tests, bool, int, float, double) +REGISTER_TEST_CASE_GPU("min/max functions", minmax_tests, __half, __nv_bfloat16) -template -struct bitwise_test> { +struct cross_test { + template __host__ __device__ void operator()(generator gen) { - kf::vec a = {gen.next(Is)...}; - kf::vec b = {gen.next(Is)...}; - - kf::vec c = a | b; - ASSERT(equals(c.get(Is), T(a.get(Is) | b.get(Is))) && ...); - - c = a & b; - ASSERT(equals(c.get(Is), T(a.get(Is) & b.get(Is))) && ...); + kf::vec a = {1, 2, 3}; + kf::vec b = {4, 5, 6}; + kf::vec c = cross(a, b); - c = a ^ b; - ASSERT(equals(c.get(Is), T(a.get(Is) ^ b.get(Is))) && ...); + ASSERT(c[0] == T(-3)); + ASSERT(c[1] == T(6)); + ASSERT(c[2] == T(-3)); } }; -TEST_CASE("binary operators") { - run_on_host_and_device(); - run_on_device(); - - run_on_host_and_device(); - run_on_device(); - - run_on_host_and_device(); - run_on_device(); - - run_on_host_and_device(); -} +REGISTER_TEST_CASE("cross product", cross_test, float, double) +REGISTER_TEST_CASE_GPU("cross product", cross_test, __half, __nv_bfloat16) \ No newline at end of file diff --git a/tests/broadcast.cu b/tests/broadcast.cu deleted file mode 100644 index aece5f7..0000000 --- a/tests/broadcast.cu +++ /dev/null @@ -1,67 +0,0 @@ -#include "common.h" -#include "kernel_float.h" - -namespace kf = kernel_float; - -template< - typename T, - size_t N, - typename = std::make_index_sequence, - typename = std::make_index_sequence> -struct broadcast_test; - -template -struct broadcast_test, std::index_sequence> { - __host__ __device__ void operator()(generator gen) { /* - { - kf::tensor> x = gen.next(); - T y = gen.next(); - kf::tensor> z = x + y; - } - - { - kf::tensor> x = {gen.next(Is)...}; - T y = gen.next(); - kf::tensor> z = x + y; - } - - { - kf::tensor> x = {gen.next(IIs)...}; - T y = gen.next(); - kf::tensor> z = x + y; - } - - { - kf::tensor> x = gen.next(); - kf::tensor> y = {gen.next(Is)...}; - kf::tensor> z = x + y; - } - - { - kf::tensor> x = {gen.next(Is)...}; - kf::tensor> y = {gen.next(Is)...}; - kf::tensor> z = x - y; - } - - { - kf::tensor> x = gen.next(); - kf::tensor> y = {gen.next(IIs)...}; - kf::tensor> z = x * y; - } - - { - kf::tensor> x = {gen.next(Is)...}; - kf::tensor> y = {gen.next(Is)...}; - kf::tensor> z = x / y; - } - - { - kf::tensor> x; - kf::tensor> y = x; - }*/ - } -}; - -TEST_CASE("broadcast operators") { - run_on_host_and_device(); -} diff --git a/tests/cast.cu b/tests/cast.cu deleted file mode 100644 index 9f13e7c..0000000 --- a/tests/cast.cu +++ /dev/null @@ -1,156 +0,0 @@ -#include "common.h" -#include "kernel_float.h" - -namespace kf = kernel_float; 
- -template> -struct cast_test; - -template -struct cast_test> { - __host__ __device__ void operator()(generator gen) { - kf::vec a {gen.next(Is)...}; - kf::vec b = kf::cast(a); - - ASSERT(equals(B(a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_test> { - __host__ __device__ void operator()(generator gen) { - kf::vec a {gen.next(Is)...}; - kf::vec<__half, N> b = kf::cast<__half>(a); - - for (size_t i = 0; i < N; i++) { - printf("%d/%d] %f %d\n", int(i), int(N), (double)(b.get(i)), int(a[i])); - } - - ASSERT(equals(__half(a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_test<__half, long, N, std::index_sequence> { - __host__ __device__ void operator()(generator<__half> gen) { - kf::vec<__half, N> a {gen.next(Is)...}; - kf::vec b = kf::cast(a); - ASSERT(equals((long)(long long)a.get(Is), b.get(Is)) && ...); - } -}; - -template -struct cast_test> { - __host__ __device__ void operator()(generator gen) { - kf::vec a {gen.next(Is)...}; - kf::vec<__half, N> b = kf::cast<__half>(a); - ASSERT(equals(__half((long long)a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_test> { - __host__ __device__ void operator()(generator gen) { - kf::vec a {gen.next(Is)...}; - kf::vec<__half, N> b = kf::cast<__half>(a); - ASSERT(equals((__half)(unsigned long long)(a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_test<__half, char, N, std::index_sequence> { - __host__ __device__ void operator()(generator<__half> gen) { - kf::vec<__half, N> a {gen.next(Is)...}; - kf::vec b = kf::cast(a); - ASSERT(equals((char)(int)(a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_test<__nv_bfloat16, long, N, std::index_sequence> { - __host__ __device__ void operator()(generator<__nv_bfloat16> gen) { - kf::vec<__nv_bfloat16, N> a {gen.next(Is)...}; - kf::vec b = kf::cast(a); - ASSERT(equals((long)(long long)a.get(Is), b.get(Is)) && ...); - } -}; - -template -struct cast_test> { - __host__ __device__ void operator()(generator gen) { - kf::vec a {gen.next(Is)...}; - kf::vec<__nv_bfloat16, N> b = kf::cast<__nv_bfloat16>(a); - ASSERT(equals(__nv_bfloat16((long long)a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_test> { - __host__ __device__ void operator()(generator gen) { - kf::vec a {gen.next(Is)...}; - kf::vec<__nv_bfloat16, N> b = kf::cast<__nv_bfloat16>(a); - ASSERT(equals((__nv_bfloat16)(unsigned long long)(a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_test<__nv_bfloat16, char, N, std::index_sequence> { - __host__ __device__ void operator()(generator<__nv_bfloat16> gen) { - kf::vec<__nv_bfloat16, N> a {gen.next(Is)...}; - kf::vec b = kf::cast(a); - ASSERT(equals((char)(int)(a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_test<__nv_bfloat16, __half, N, std::index_sequence> { - __host__ __device__ void operator()(generator<__nv_bfloat16> gen) { - kf::vec<__nv_bfloat16, N> a {gen.next(Is)...}; - kf::vec<__half, N> b = kf::cast<__half>(a); - ASSERT(equals((__half)(float)(a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_test<__half, __nv_bfloat16, N, std::index_sequence> { - __host__ __device__ void operator()(generator<__half> gen) { - kf::vec<__half, N> a {gen.next(Is)...}; - kf::vec<__nv_bfloat16, N> b = kf::cast<__nv_bfloat16>(a); - ASSERT(equals((__nv_bfloat16)(float)(a.get(Is)), b.get(Is)) && ...); - } -}; - -template -struct cast_to { - template - using type = cast_test; -}; - -TEST_CASE("cast operators") { - auto types = type_sequence< - bool, - char, - short, - int, - unsigned int, 
- long, - unsigned long, - long long, - float, - double, - __half, - __nv_bfloat16> {}; - - run_on_host_and_device::template type>(types); - run_on_host_and_device::template type>(types); - run_on_host_and_device::template type>(types); - run_on_host_and_device::template type>(types); - run_on_host_and_device::template type>(types); - run_on_host_and_device::template type>(types); - run_on_host_and_device::template type>(types); - run_on_host_and_device::template type>(types); - run_on_host_and_device::template type>(types); - run_on_host_and_device::template type>(types); - - //bool, char, short, int, long long, __half, float, double -} diff --git a/tests/common.h b/tests/common.h index 40c70ce..fb60624 100644 --- a/tests/common.h +++ b/tests/common.h @@ -4,19 +4,14 @@ #include #include -#include -#include #include "catch2/catch_all.hpp" #include "kernel_float.h" -#define ASSERT(expr) check_assertions((expr), #expr, __FILE__, __LINE__); - -static __host__ __device__ int -check_assertions(bool result, const char* expr, const char* file, int line) { - if (result) - return 0; +namespace kf = kernel_float; +namespace detail { +static __host__ __device__ void __assertion_failed(const char* expr, const char* file, int line) { #ifndef __CUDA_ARCH__ std::string msg = "assertion failed: " + std::string(expr) + " (" + file + ":" + std::to_string(line) + ")"; @@ -28,249 +23,287 @@ check_assertions(bool result, const char* expr, const char* file, int line) { ; #endif } +} // namespace detail -template -__host__ __device__ void ignore(Ts...) {} +#define ASSERT(...) \ + do { \ + bool __result = (__VA_ARGS__); \ + if (!__result) { \ + ::detail::__assertion_failed(#__VA_ARGS__, __FILE__, __LINE__); \ + } \ + } while (0) +namespace detail { template struct equals_helper { - __host__ __device__ static bool call(T left, T right) { + static __host__ __device__ bool call(const T& left, const T& right) { return left == right; } }; template<> struct equals_helper { - __host__ __device__ static bool call(double left, double right) { - return (isnan(left) && isnan(right)) || (isinf(left) && isinf(right)) || left == right; + static __host__ __device__ bool call(const double& left, const double& right) { + return (isnan(left) && isnan(right)) || (isinf(left) && isinf(right)) || (left == right); + } +}; + +template<> +struct equals_helper { + static __host__ __device__ bool call(const float& left, const float& right) { + return (isnan(left) && isnan(right)) || (isinf(left) && isinf(right)) || (left == right); } }; template<> -struct equals_helper: equals_helper {}; +struct equals_helper<__half> { + static __host__ __device__ bool call(const __half& left, const __half& right) { + return equals_helper::call(float(left), float(right)); + } +}; template<> -struct equals_helper<__half>: equals_helper {}; +struct equals_helper<__nv_bfloat16> { + static __host__ __device__ bool call(const __nv_bfloat16& left, const __nv_bfloat16& right) { + return equals_helper::call(float(left), float(right)); + } +}; + +} // namespace detail template -__host__ __device__ bool equals(T left, T right) { - return equals_helper::call(left, right); +__host__ __device__ bool equals(const T& left, const T& right) { + return detail::equals_helper::call(left, right); } -template -struct type_sequence {}; - -template -struct size_sequence {}; +namespace detail { +template +struct is_one_of_helper; template -struct type_name {}; -#define DEFINE_TYPE_NAME(T) \ - template<> \ - struct type_name { \ - static constexpr const char* value = #T; \ - }; 
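// As an aside, a minimal sketch of how the `is_one_of` trait introduced in this
// hunk is meant to be used for compile-time dispatch in the tests
// (`dispatch_sqrt` is a hypothetical helper name, not part of the patch):
template<typename T>
__device__ T dispatch_sqrt(T x) {
    // select the 16-bit intrinsic only for the two half-precision types
    if constexpr (is_one_of<T, __half, __nv_bfloat16>) {
        return hsqrt(x);
    } else {
        return sqrt(x);
    }
}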
+struct is_one_of_helper: std::false_type {};

-DEFINE_TYPE_NAME(bool)
-DEFINE_TYPE_NAME(char)
-DEFINE_TYPE_NAME(short)
-DEFINE_TYPE_NAME(int)
-DEFINE_TYPE_NAME(unsigned int)
-DEFINE_TYPE_NAME(long)
-DEFINE_TYPE_NAME(unsigned long)
-DEFINE_TYPE_NAME(long long)
-DEFINE_TYPE_NAME(__half)
-DEFINE_TYPE_NAME(__nv_bfloat16)
-DEFINE_TYPE_NAME(float)
-DEFINE_TYPE_NAME(double)
+template
+struct is_one_of_helper: std::true_type {};
+
+template
+struct is_one_of_helper: is_one_of_helper {};
+} // namespace detail
+
+template
+static constexpr bool is_one_of = detail::is_one_of_helper::value;
+template
+static constexpr bool is_none_of = !detail::is_one_of_helper::value;
+
+namespace detail {
 template
-struct generate_value;
+struct generator_value;

 template<>
-struct generate_value {
-    __host__ __device__ static bool call(uint64_t value) {
-        return bool(value & 0x1);
+struct generator_value {
+    static __host__ __device__ bool call(uint64_t bits) {
+        return bool(bits % 2);
     }
 };

 template
-struct generate_value<
-    T,
-    typename std::enable_if::value && !std::is_same::value>::type> {
-    __host__ __device__ static T call(uint64_t value) {
-        return T(value);
+struct generator_value && !std::is_same_v>> {
+    static constexpr T min_value = std::numeric_limits::min();
+    static constexpr T max_value = std::numeric_limits::max();
+
+    static __host__ __device__ T call(uint64_t bits) {
+        if ((bits & 0xf) == 0xa) {
+            return T(0);
+        } else if ((bits & 0xf) == 0xb) {
+            return min_value;
+        } else if ((bits & 0xf) == 0xc) {
+            return max_value;
+        } else {
+            return T(bits);
+        }
     }
 };

 template
-struct generate_value::value>::type> {
-    __host__ __device__ static T call(uint64_t value) {
-        if ((value & 0xf) == 0) {
+struct generator_value>> {
+    static constexpr T max_value = std::numeric_limits::max();
+
+    __host__ __device__ static T call(uint64_t bits) {
+        if ((bits & 0xf) == 0) {
             return T(0) / T(0); // nan
-        } else if ((value & 0xf) == 1) {
             return T(1) / T(0); // inf
-        } else if ((value & 0xf) == 2) {
-            return -T(0) / T(0); // +inf
-        } else if ((value & 0xf) == 3) {
-            return 0;
+        } else if ((bits & 0xf) == 2) {
+            return -T(1) / T(0); // -inf
+        } else if ((bits & 0xf) == 3) {
+            return T(0);
         } else {
-            return T(value) / T(UINT64_MAX);
+            return (T(bits) / T(max_value)) * (bits % 2 ?
T(-1) : T(+1)); } } }; template<> -struct generate_value<__half> { +struct generator_value<__half> { __host__ __device__ static __half call(uint64_t seed) { - return __half(generate_value::call(seed)); + return __half(generator_value::call(seed)); } }; template<> -struct generate_value<__nv_bfloat16> { +struct generator_value<__nv_bfloat16> { __host__ __device__ static __nv_bfloat16 call(uint64_t seed) { - return __nv_bfloat16(generate_value::call(seed)); + return __nv_bfloat16(generator_value::call(seed)); } }; +} // namespace detail -template +template struct generator { __host__ __device__ generator(uint64_t seed = 6364136223846793005ULL) : seed_(seed) { next(); } - __host__ __device__ T next(uint64_t ignore = 0) { + template + __host__ __device__ T next(R ignore = {}) { seed_ = 6364136223846793005ULL * seed_ + 1442695040888963407ULL; - return generate_value::call(seed_); - } - - __host__ __device__ T operator()() { - return next(); + return detail::generator_value::call(seed_); } private: uint64_t seed_; }; -template class F, typename T> -void run_sizes(size_sequence<>) { - // empty -} +template +struct type_name { + static constexpr const char* value = "???"; +}; -template class F, typename T, size_t N, size_t... Is, typename... Args> -void run_sizes(size_sequence, Args... args) { - //SECTION("size=" + std::to_string(N)) - { - INFO("N=" << N); - F {}(args...); - } +#define DEFINE_TYPE_NAME(T) \ + template<> \ + struct type_name { \ + static constexpr const char* value = #T; \ + }; - run_sizes(size_sequence {}, args...); -} +DEFINE_TYPE_NAME(bool) +DEFINE_TYPE_NAME(signed char) +DEFINE_TYPE_NAME(char) +DEFINE_TYPE_NAME(short) +DEFINE_TYPE_NAME(int) +DEFINE_TYPE_NAME(long) +DEFINE_TYPE_NAME(long long) +DEFINE_TYPE_NAME(unsigned char) +DEFINE_TYPE_NAME(unsigned short) +DEFINE_TYPE_NAME(unsigned int) +DEFINE_TYPE_NAME(unsigned long) +DEFINE_TYPE_NAME(unsigned long long) +DEFINE_TYPE_NAME(__half) +DEFINE_TYPE_NAME(__nv_bfloat16) +DEFINE_TYPE_NAME(float) +DEFINE_TYPE_NAME(double) -template< - template - class F, - typename T, - typename... Ts, - size_t... Is, - typename... Args> -void run_combinations(type_sequence, size_sequence, Args... args) { - //SECTION(std::string("type=") + type_name::value) - { - INFO("T=" << type_name::value); - run_sizes(size_sequence {}); - } +template +struct type_sequence {}; - run_combinations(type_sequence {}, size_sequence {}, args...); +template +struct size_sequence {}; + +using default_size_sequence = size_sequence<1, 2, 3, 4, 5, 6, 7, 8>; + +namespace detail { +template +void iterate_sizes(F runner, size_sequence) { + runner.template run(); + iterate_sizes(runner, size_sequence {}); } -template class F, typename... Ts, size_t... Is, typename... Args> -void run_combinations(type_sequence<>, size_sequence, Args... args) {} +template +void iterate_sizes(F, size_sequence<>) {} -template class F, typename T, size_t N> +template struct host_runner { - template - void operator()(Args... 
args) { - for (size_t i = 0; i < 5; i++) { - INFO("seed=" << i); - F {}(generator(i), args...); - } - } -}; + F fun; + + host_runner(F fun) : fun(fun) {} -template class F> -struct host_runner_helper { template - using type = host_runner; + void run() { + for (int seed = 0; seed < 5; seed++) { + INFO("T=" << type_name::value); + INFO("N=" << N); + INFO("seed=" << seed); + + if constexpr (std::is_invocable_v) { + fun(); + } else if constexpr (std::is_invocable_v>) { + fun(generator(seed)); + } else { + fun(generator(seed), std::make_index_sequence {}); + } + } + } }; -template class F, typename... Ts, size_t... Is> -void run_on_host(type_sequence, size_sequence) { - run_combinations::template type>( - type_sequence {}, - size_sequence {}); -} - -template class F, typename... Ts> -void run_on_host(type_sequence = {}) { - run_on_host(type_sequence {}, size_sequence<1, 2, 3, 4, 7, 8> {}); -} - template __global__ void kernel(F fun, Args... args) { fun(args...); } -template class F, typename T, size_t N> +template struct device_runner { - template - void operator()(Args... args) { - static bool gpu_enabled = true; - if (!gpu_enabled) { - return; - } + F fun; + + device_runner(F fun) : fun(fun) {} - cudaError_t code = cudaSetDevice(0); - if (code != cudaSuccess) { - gpu_enabled = false; - WARN("skipping device code"); - return; + template + void run() { + if (cudaSetDevice(0) != cudaSuccess) { + FAIL("failed to initialize CUDA device, does this machine have a GPU?"); } - //SECTION("environment=GPU") - { - for (size_t i = 0; i < 5; i++) { - INFO("seed=" << i); - CHECK(cudaDeviceSynchronize() == cudaSuccess); - kernel<<<1, 1>>>(F {}, generator(i), args...); - CHECK(cudaDeviceSynchronize() == cudaSuccess); + for (int seed = 0; seed < 5; seed++) { + INFO("T=" << type_name::value); + INFO("N=" << N); + INFO("seed=" << seed); + + CHECK(cudaDeviceSynchronize() == cudaSuccess); + + if constexpr (std::is_invocable_v) { + kernel<<<1, 1>>>(fun); + } else if constexpr (std::is_invocable_v>) { + kernel<<<1, 1>>>(fun, generator(seed)); + } else { + kernel<<<1, 1>>>(fun, generator(seed), std::make_index_sequence {}); } + + CHECK(cudaDeviceSynchronize() == cudaSuccess); } } }; +} // namespace detail -template class F> -struct device_runner_helper { - template - using type = device_runner; -}; - -template class F, typename... Ts, size_t... Is> -void run_on_device(type_sequence, size_sequence) { - run_combinations::template type>( - type_sequence {}, - size_sequence {}); +template +void run_tests_host(F fun, type_sequence, size_sequence) { + detail::iterate_sizes(detail::host_runner(fun), size_sequence {}); } -template class F, typename... Ts> -void run_on_device(type_sequence = {}) { - run_on_device(type_sequence {}, size_sequence<1, 2, 3, 4, 7, 8> {}); +template +void run_tests_device(F fun, type_sequence, size_sequence) { + detail::iterate_sizes(detail::device_runner(fun), size_sequence {}); } -template class F, typename... Ts> -void run_on_host_and_device(type_sequence = {}) { - run_on_host(type_sequence {}); - run_on_device(type_sequence {}); -} +#define REGISTER_TEST_CASE_CPU(NAME, F, ...) \ + TEMPLATE_TEST_CASE(NAME " - CPU", "", __VA_ARGS__) { \ + run_tests_host(F {}, type_sequence {}, default_size_sequence {}); \ + } + +#define REGISTER_TEST_CASE_GPU(NAME, F, ...) \ + TEMPLATE_TEST_CASE(NAME " - GPU", "[GPU]", __VA_ARGS__) { \ + run_tests_device(F {}, type_sequence {}, default_size_sequence {}); \ + } + +#undef REGISTER_TEST_CASE +#define REGISTER_TEST_CASE(NAME, F, ...) 
\ + REGISTER_TEST_CASE_CPU(NAME, F, __VA_ARGS__) \ + REGISTER_TEST_CASE_GPU(NAME, F, __VA_ARGS__) diff --git a/tests/promotion.cu b/tests/promotion.cu new file mode 100644 index 0000000..beb7b67 --- /dev/null +++ b/tests/promotion.cu @@ -0,0 +1,116 @@ +#include "common.h" + +// Check if combining type `A` and `B` results in `C` +#define CHECK_PROMOTION(A, B, C) CHECK(std::is_same, C>::value); + +TEST_CASE("type promotion") { + CHECK_PROMOTION(int, int, int); + CHECK_PROMOTION(int, float, float); + CHECK_PROMOTION(int, double, double); + // CHECK_PROMOTION(int, unsigned int, int); + CHECK_PROMOTION(int, bool, int); + CHECK_PROMOTION(int, __half, __half); + CHECK_PROMOTION(int, __nv_bfloat16, __nv_bfloat16); + // CHECK_PROMOTION(int, char, int); + CHECK_PROMOTION(int, signed char, int); + // CHECK_PROMOTION(int, unsigned char, int); + + CHECK_PROMOTION(float, int, float); + CHECK_PROMOTION(float, float, float); + CHECK_PROMOTION(float, double, double); + CHECK_PROMOTION(float, unsigned int, float); + CHECK_PROMOTION(float, bool, float); + CHECK_PROMOTION(float, __half, float); + CHECK_PROMOTION(float, __nv_bfloat16, float); + CHECK_PROMOTION(float, char, float); + CHECK_PROMOTION(float, signed char, float); + CHECK_PROMOTION(float, unsigned char, float); + + CHECK_PROMOTION(double, int, double); + CHECK_PROMOTION(double, float, double); + CHECK_PROMOTION(double, double, double); + CHECK_PROMOTION(double, unsigned int, double); + CHECK_PROMOTION(double, bool, double); + CHECK_PROMOTION(double, __half, double); + CHECK_PROMOTION(double, __nv_bfloat16, double); + CHECK_PROMOTION(double, char, double); + CHECK_PROMOTION(double, signed char, double); + CHECK_PROMOTION(double, unsigned char, double); + + // CHECK_PROMOTION(unsigned int, int, unsigned int); + CHECK_PROMOTION(unsigned int, float, float); + CHECK_PROMOTION(unsigned int, double, double); + CHECK_PROMOTION(unsigned int, unsigned int, unsigned int); + CHECK_PROMOTION(unsigned int, bool, unsigned int); + CHECK_PROMOTION(unsigned int, __half, __half); + CHECK_PROMOTION(unsigned int, __nv_bfloat16, __nv_bfloat16); + // CHECK_PROMOTION(unsigned int, char, unsigned int); + // CHECK_PROMOTION(unsigned int, signed char, unsigned int); + CHECK_PROMOTION(unsigned int, unsigned char, unsigned int); + + CHECK_PROMOTION(bool, int, int); + CHECK_PROMOTION(bool, float, float); + CHECK_PROMOTION(bool, double, double); + CHECK_PROMOTION(bool, unsigned int, unsigned int); + CHECK_PROMOTION(bool, bool, bool); + CHECK_PROMOTION(bool, __half, __half); + CHECK_PROMOTION(bool, __nv_bfloat16, __nv_bfloat16); + CHECK_PROMOTION(bool, char, char); + CHECK_PROMOTION(bool, signed char, signed char); + CHECK_PROMOTION(bool, unsigned char, unsigned char); + + CHECK_PROMOTION(__half, int, __half); + CHECK_PROMOTION(__half, float, float); + CHECK_PROMOTION(__half, double, double); + CHECK_PROMOTION(__half, unsigned int, __half); + CHECK_PROMOTION(__half, bool, __half); + CHECK_PROMOTION(__half, __half, __half); + CHECK_PROMOTION(__half, __nv_bfloat16, float); + CHECK_PROMOTION(__half, char, __half); + CHECK_PROMOTION(__half, signed char, __half); + CHECK_PROMOTION(__half, unsigned char, __half); + + CHECK_PROMOTION(__nv_bfloat16, int, __nv_bfloat16); + CHECK_PROMOTION(__nv_bfloat16, float, float); + CHECK_PROMOTION(__nv_bfloat16, double, double); + CHECK_PROMOTION(__nv_bfloat16, unsigned int, __nv_bfloat16); + CHECK_PROMOTION(__nv_bfloat16, bool, __nv_bfloat16); + CHECK_PROMOTION(__nv_bfloat16, __half, float); + CHECK_PROMOTION(__nv_bfloat16, __nv_bfloat16, 
__nv_bfloat16); + CHECK_PROMOTION(__nv_bfloat16, char, __nv_bfloat16); + CHECK_PROMOTION(__nv_bfloat16, signed char, __nv_bfloat16); + CHECK_PROMOTION(__nv_bfloat16, unsigned char, __nv_bfloat16); + + // CHECK_PROMOTION(char, int, char); + CHECK_PROMOTION(char, float, float); + CHECK_PROMOTION(char, double, double); + // CHECK_PROMOTION(char, unsigned int, char); + CHECK_PROMOTION(char, bool, char); + CHECK_PROMOTION(char, __half, __half); + CHECK_PROMOTION(char, __nv_bfloat16, __nv_bfloat16); + CHECK_PROMOTION(char, char, char); + // CHECK_PROMOTION(char, signed char, char); + // CHECK_PROMOTION(char, unsigned char, char); + + CHECK_PROMOTION(signed char, int, int); + CHECK_PROMOTION(signed char, float, float); + CHECK_PROMOTION(signed char, double, double); + // CHECK_PROMOTION(signed char, unsigned int, signed char); + CHECK_PROMOTION(signed char, bool, signed char); + CHECK_PROMOTION(signed char, __half, __half); + CHECK_PROMOTION(signed char, __nv_bfloat16, __nv_bfloat16); + // CHECK_PROMOTION(signed char, char, signed char); + CHECK_PROMOTION(signed char, signed char, signed char); + // CHECK_PROMOTION(signed char, unsigned char, signed char); + + // CHECK_PROMOTION(unsigned char, int, unsigned char); + CHECK_PROMOTION(unsigned char, float, float); + CHECK_PROMOTION(unsigned char, double, double); + CHECK_PROMOTION(unsigned char, unsigned int, unsigned int); + CHECK_PROMOTION(unsigned char, bool, unsigned char); + CHECK_PROMOTION(unsigned char, __half, __half); + CHECK_PROMOTION(unsigned char, __nv_bfloat16, __nv_bfloat16); + // CHECK_PROMOTION(unsigned char, char, unsigned char); + // CHECK_PROMOTION(unsigned char, signed char, unsigned char); + CHECK_PROMOTION(unsigned char, unsigned char, unsigned char); +} \ No newline at end of file diff --git a/tests/reduce.cu b/tests/reduce.cu deleted file mode 100644 index 4190f0e..0000000 --- a/tests/reduce.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include "common.h" -#include "kernel_float.h" - -namespace kf = kernel_float; - -__host__ __device__ bool is_close(double a, double b) { - return (isnan(a) && isnan(b)) || (isinf(a) && isinf(b)) || fabs(a - b) < 0.0001; -} - -__host__ __device__ bool is_close(__half a, __half b) { - return is_close(double(a), double(b)); -} - -__host__ __device__ bool is_close(long long a, long long b) { - return a == b; -} - -__host__ __device__ bool is_close(int a, int b) { - return a == b; -} - -template> -struct reduction_test; - -template -struct reduction_test> { - __host__ __device__ void operator()(generator gen) { - kf::vec v {gen.next(Is)...}; - - bool b = (bool(v.get(Is)) && ...); - ASSERT(kf::all(v) == b); - - b = (bool(v.get(Is)) || ...); - ASSERT(kf::any(v) == b); - - T sum = v.get(0); - for (int i = 1; i < N; i++) { - sum = sum + v.get(i); - } - ASSERT(is_close(kf::sum(v), sum)); - - T minimum = v.get(0); - for (int i = 1; i < N; i++) { - minimum = kf::ops::min {}(minimum, v.get(i)); - } - ASSERT(is_close(kf::min(v), minimum)); - - T maximum = v.get(0); - for (int i = 1; i < N; i++) { - maximum = kf::ops::max {}(maximum, v.get(i)); - } - ASSERT(is_close(kf::max(v), maximum)); - } -}; - -TEST_CASE("reduction operations") { - run_on_host_and_device(); - run_on_device(); -} diff --git a/tests/swizzle.cu b/tests/swizzle.cu deleted file mode 100644 index 0fc7e46..0000000 --- a/tests/swizzle.cu +++ /dev/null @@ -1,43 +0,0 @@ -#include "common.h" -#include "kernel_float.h" -/* -namespace kf = kernel_float; - -template> -struct swizzle_test; - -template -struct swizzle_test> { - __host__ __device__ void 
operator()(generator gen) {
-        T items[N] = {gen.next(Is)...};
-        kf::vec a = {items[Is]...};
-
-        ASSERT(equals(items[0], kf::first(a)));
-        ASSERT(equals(items[N - 1], kf::last(a)));
-
-        kf::vec b = kf::reversed(a);
-        ASSERT(equals(b[Is], items[N - Is - 1]) && ...);
-
-        b = kf::rotate_left<1>(a);
-        ASSERT(equals(b[Is], items[(Is + 1) % N]) && ...);
-
-        b = kf::rotate_right<1>(a);
-        ASSERT(equals(b[Is], items[(Is + N - 1) % N]) && ...);
-
-        b = kf::rotate_left<2>(a);
-        ASSERT(equals(b[Is], items[(Is + 2) % N]) && ...);
-
-        b = kf::rotate_right<2>(a);
-        ASSERT(equals(b[Is], items[(Is + N - 2) % N]) && ...);
-
-        kf::vec c = kf::concat(a, T {}, a);
-        ASSERT(equals(c[Is], items[Is]) && ...);
-        ASSERT(equals(c[N], T {}));
-        ASSERT(equals(c[N + 1 + Is], items[Is]) && ...);
-    }
-};
-
-TEST_CASE("swizzle") {
-    run_on_host_and_device();
-}
-*/
\ No newline at end of file
diff --git a/tests/triops.cu b/tests/triops.cu
deleted file mode 100644
index 1268802..0000000
--- a/tests/triops.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "common.h"
-#include "kernel_float.h"
-
-namespace kf = kernel_float;
-
-template>
-struct where_test;
-
-template
-struct where_test> {
-    __host__ __device__ void operator()(generator gen) {
-        kf::vec cond = {gen.next(Is)...};
-        kf::vec left = {gen.next(Is)...};
-        kf::vec right = {gen.next(Is)...};
-
-        auto result = kf::where(cond, left, right);
-        ASSERT(equals(result[Is], cond[Is] ? left[Is] : right[Is]) && ...);
-
-        result = kf::where(cond, left);
-        ASSERT(equals(result[Is], cond[Is] ? left[Is] : T {0}) && ...);
-
-        result = kf::where(cond);
-        ASSERT(equals(result[Is], cond[Is] ? T {1} : T {0}) && ...);
-    }
-};
-
-TEST_CASE("conditional") {
-    run_on_host_and_device();
-}
diff --git a/tests/unops.cu b/tests/unops.cu
index 2075c19..e50d8d3 100644
--- a/tests/unops.cu
+++ b/tests/unops.cu
@@ -1,90 +1,73 @@
 #include "common.h"
-#include "kernel_float.h"

-namespace kf = kernel_float;
-
-template>
-struct int_test;
-
-template
-struct int_test> {
-    __host__ __device__ void operator()(generator gen) {
-        kf::vec a {gen.next(Is)...};
+struct unops_tests {
+    template
+    __host__ __device__ void operator()(generator gen, std::index_sequence) {
+        T items[N] = {gen.next(I)...};
+        kf::vec a = {items[I]...};
         kf::vec b;

         b = -a;
-        ASSERT((b.get(Is) == -(a.get(Is))) && ...);
+        ASSERT(equals(b[I], T(-items[I])) && ...);

         b = ~a;
-        ASSERT((b.get(Is) == ~(a.get(Is))) && ...);
+        ASSERT(equals(b[I], T(~items[I])) && ...);

         b = !a;
-        ASSERT((b.get(Is) == !(a.get(Is))) && ...);
+        ASSERT(equals(b[I], T(!items[I])) && ...);
     }
 };

-template>
-struct float_test;
+REGISTER_TEST_CASE("unary operators", unops_tests, bool, int)

-template
-struct float_test> {
-    __host__ __device__ void operator()(generator gen) {
-        kf::vec a {gen.next(Is)...};
+struct unops_float_tests {
+    template
+    __host__ __device__ void operator()(generator gen, std::index_sequence) {
+        double items[N] = {gen.next(I)...};
+        kf::vec a = {T(items[I])...};
         kf::vec b;

         b = -a;
-        ASSERT(equals(-a.get(Is), b.get(Is)) && ...);
+        ASSERT(equals(b[I], T(-items[I])) && ...);

-        // just some examples
-        b = kf::cos(a);
-        ASSERT(equals(cos(a.get(Is)), b.get(Is)) && ...);
+        b = !a;
+        ASSERT(equals(b[I], T(!items[I])) && ...);

-        b = kf::floor(a);
-        ASSERT(equals(floor(a.get(Is)), b.get(Is)) && ...);
+        // Ideally, we would test all unary operators, but that would be a lot of work and not that useful since
+        // all operators are generated by the same macro.
Instead, we only check a few of them + if constexpr (is_one_of) { + b = sqrt(a); + ASSERT(equals(b[I], hsqrt(T(items[I]))) && ...); - b = kf::abs(a); - ASSERT(equals(abs(a.get(Is)), b.get(Is)) && ...); + b = sin(a); + ASSERT(equals(b[I], hsin(T(items[I]))) && ...); - b = kf::sqrt(a); - ASSERT(equals(sqrt(a.get(Is)), b.get(Is)) && ...); - } -}; + b = cos(a); + ASSERT(equals(b[I], hcos(T(items[I]))) && ...); -template -struct float_test<__half, N, std::index_sequence> { - template - __host__ __device__ void operator()(generator gen) { - kf::vec a {gen.next(Is)...}; - kf::vec b; + b = log(a); + ASSERT(equals(b[I], hlog(T(items[I]))) && ...); - b = -a; - ASSERT(equals(__hneg(a.get(Is)), b.get(Is)) && ...); + b = exp(a); + ASSERT(equals(b[I], hexp(T(items[I]))) && ...); + } else { + b = sqrt(a); + ASSERT(equals(b[I], sqrt(T(items[I]))) && ...); - // just some examples - b = kf::cos(a); - ASSERT(equals(hcos(a.get(Is)), b.get(Is)) && ...); + b = sin(a); + ASSERT(equals(b[I], sin(T(items[I]))) && ...); - b = kf::floor(a); - ASSERT(equals(hfloor(a.get(Is)), b.get(Is)) && ...); + b = cos(a); + ASSERT(equals(b[I], cos(T(items[I]))) && ...); - b = kf::abs(a); - ASSERT(equals(__habs(a.get(Is)), b.get(Is)) && ...); + b = log(a); + ASSERT(equals(b[I], log(T(items[I]))) && ...); - b = kf::sqrt(a); - ASSERT(equals(hsqrt(a.get(Is)), b.get(Is)) && ...); + b = exp(a); + ASSERT(equals(b[I], exp(T(items[I]))) && ...); + } } }; -template -struct float_test<__nv_bfloat16, N, std::index_sequence> { - __host__ __device__ void operator()(generator<__nv_bfloat16> gen) { - float_test<__half, N> {}(gen); - } -}; - -TEST_CASE("unary operators") { - run_on_host_and_device(); - - run_on_host_and_device(); - run_on_device(); -} +REGISTER_TEST_CASE("unary float operators", unops_float_tests, float, double) +REGISTER_TEST_CASE_GPU("unary float operators", unops_float_tests, __half, __nv_bfloat16) \ No newline at end of file From 90372b24d662adf7c86965cfa40f60788d0d0608 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 18 Sep 2023 17:41:23 +0200 Subject: [PATCH 40/50] Small bug fixes --- include/kernel_float/base.h | 11 +++++++++++ include/kernel_float/bf16.h | 6 +++--- include/kernel_float/vector.h | 4 ++-- single_include/kernel_float.h | 25 ++++++++++++++++++------- 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h index b72bc42..50566be 100644 --- a/include/kernel_float/base.h +++ b/include/kernel_float/base.h @@ -226,6 +226,17 @@ struct into_vector_traits> { #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ template<> \ + struct into_vector_traits<::T1> { \ + using value_type = T; \ + using extent_type = extent<1>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T1 v) { \ + return {v.x}; \ + } \ + }; \ + \ + template<> \ struct into_vector_traits<::T2> { \ using value_type = T; \ using extent_type = extent<2>; \ diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 3065855..3d60d55 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -144,7 +144,7 @@ KERNEL_FLOAT_BF16_UNARY_FORWARD(expm1) } \ namespace detail { \ template<> \ - struct map_halfx2> { \ + struct map_bfloat16x2> { \ KERNEL_FLOAT_INLINE static __nv_bfloat162 \ call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ return FUN2(input); \ @@ -324,12 +324,12 @@ KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)) template<> struct promote_type<__nv_bfloat16, __half> { using type = float; -} +}; 
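// This fix (and its mirror directly below) only adds the missing semicolons;
// the promotion rule itself is unchanged. A minimal compile-time check of the
// resulting behavior, sketched here with the public `promote_t` alias and
// <type_traits> (assumed available in this translation unit):
static_assert(
    std::is_same<kernel_float::promote_t<__half, __nv_bfloat16>, float>::value,
    "mixing __half and __nv_bfloat16 promotes to float");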
template<> struct promote_type<__half, __nv_bfloat16> { using type = float; -} +}; } // namespace kernel_float diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 81b7ea5..f294b48 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -54,7 +54,7 @@ struct vector: public S { typename... Rest, typename = enabled_t> KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... rest) : - storage_type {a, b, rest...} {} + storage_type {T(a), T(b), T(rest)...} {} /** * Returns the number of elements in this vector. @@ -316,7 +316,7 @@ template using vec8 = vec; template KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... args) { using T = promote_t; - return vector_storage {T {args}...}; + return vector_storage {T(args)...}; }; #if defined(__cpp_deduction_guides) diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 7ea6bf2..47786d3 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-08-28 14:29:52.760763 -// git hash: 31ffbb7ca20f9c4a1c43b37e06c99600a8f15b91 +// date: 2023-09-18 17:41:12.641561 +// git hash: 64f21903e8049e4a46c53897a167f31174e1a231 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -550,6 +550,17 @@ struct into_vector_traits> { #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ template<> \ + struct into_vector_traits<::T1> { \ + using value_type = T; \ + using extent_type = extent<1>; \ + \ + KERNEL_FLOAT_INLINE \ + static vector_storage call(::T1 v) { \ + return {v.x}; \ + } \ + }; \ + \ + template<> \ struct into_vector_traits<::T2> { \ using value_type = T; \ using extent_type = extent<2>; \ @@ -2759,7 +2770,7 @@ struct vector: public S { typename... Rest, typename = enabled_t> KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... rest) : - storage_type {a, b, rest...} {} + storage_type {T(a), T(b), T(rest)...} {} /** * Returns the number of elements in this vector. @@ -3021,7 +3032,7 @@ template using vec8 = vec; template KERNEL_FLOAT_INLINE vec, sizeof...(Args)> make_vec(Args&&... 
args) { using T = promote_t; - return vector_storage {T {args}...}; + return vector_storage {T(args)...}; }; #if defined(__cpp_deduction_guides) @@ -3484,7 +3495,7 @@ KERNEL_FLOAT_BF16_UNARY_FORWARD(expm1) } \ namespace detail { \ template<> \ - struct map_halfx2> { \ + struct map_bfloat16x2> { \ KERNEL_FLOAT_INLINE static __nv_bfloat162 \ call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) { \ return FUN2(input); \ @@ -3664,12 +3675,12 @@ KERNEL_FLOAT_BF16_CAST(__half, __float2bfloat16(input), __bfloat162float(input)) template<> struct promote_type<__nv_bfloat16, __half> { using type = float; -} +}; template<> struct promote_type<__half, __nv_bfloat16> { using type = float; -} +}; } // namespace kernel_float From ebd0967645097b22c5485beba16c2391595231d1 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 19 Sep 2023 16:41:14 +0200 Subject: [PATCH 41/50] Rename several helper structs from `X_helper` to `X_impl` --- include/kernel_float/bf16.h | 62 +++++++++++++++++++------------ include/kernel_float/binops.h | 16 ++++---- include/kernel_float/conversion.h | 21 +++++++---- include/kernel_float/fp16.h | 60 ++++++++++++++++++------------ include/kernel_float/iterate.h | 53 +++++++++++++------------- include/kernel_float/meta.h | 36 +++++++++--------- include/kernel_float/reduce.h | 28 +++++++------- include/kernel_float/triops.h | 12 +++--- include/kernel_float/vector.h | 6 +-- 9 files changed, 165 insertions(+), 129 deletions(-) diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 3d60d55..d3e3a5d 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -94,7 +94,7 @@ struct apply_impl { }; template -struct reduce_helper= 2)>> { +struct reduce_impl= 2)>> { KERNEL_FLOAT_INLINE static __nv_bfloat16 call(F fun, const vector_storage<__nv_bfloat16, N>& input) { __nv_bfloat162 accum = {input.data()[0], input.data()[1]}; @@ -276,38 +276,54 @@ using bfloat16 = __nv_bfloat16; #if KERNEL_FLOAT_IS_DEVICE namespace detail { +template<> +struct dot_impl<__nv_bfloat16, 0> { + KERNEL_FLOAT_INLINE + static __nv_bfloat16 call( + const vector_storage<__nv_bfloat16, 0>& left, + const vector_storage<__nv_bfloat16, 0>& right) { + return __nv_bfloat16(0); + } +}; + +template<> +struct dot_impl<__nv_bfloat16, 1> { + KERNEL_FLOAT_INLINE + static __nv_bfloat16 call( + const vector_storage<__nv_bfloat16, 1>& left, + const vector_storage<__nv_bfloat16, 1>& right) { + return __hmul(left.data()[0], right.data()[0]); + } +}; + template -struct dot_helper<__nv_bfloat16, N> { +struct dot_impl<__nv_bfloat16, N> { + static_assert(N >= 2, "internal error"); + KERNEL_FLOAT_INLINE static __nv_bfloat16 call( const vector_storage<__nv_bfloat16, N>& left, const vector_storage<__nv_bfloat16, N>& right) { - if (N == 0) { - return __nv_bfloat16(0); - } else if (N == 1) { - return __hmul(left.data()[0], right.data()[0]); - } else { - __nv_bfloat162 first_a = {left.data()[0], left.data()[1]}; - __nv_bfloat162 first_b = {right.data()[0], right.data()[1]}; - __nv_bfloat162 accum = __hmul2(first_a, first_b); + __nv_bfloat162 first_a = {left.data()[0], left.data()[1]}; + __nv_bfloat162 first_b = {right.data()[0], right.data()[1]}; + __nv_bfloat162 accum = __hmul2(first_a, first_b); #pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; - __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __nv_bfloat16 result = __hadd(accum.x, accum.y); + for (size_t i = 2; i + 2 <= N; i += 2) { + 
__nv_bfloat162 a = {left.data()[i], left.data()[i + 1]};
-            __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]};
-            accum = __hfma2(a, b, accum);
-        }
-
-        __nv_bfloat16 result = __hadd(accum.x, accum.y);
-
-        if (N % 2 != 0) {
-            __nv_bfloat16 a = left.data()[N - 1];
-            __nv_bfloat16 b = right.data()[N - 1];
-            result = __hfma(a, b, result);
-        }
-
-        return result;
+        for (size_t i = 2; i + 2 <= N; i += 2) {
+            __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]};
+            __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]};
+            accum = __hfma2(a, b, accum);
+        }
+
+        __nv_bfloat16 result = __hadd(accum.x, accum.y);
+
+        if (N % 2 != 0) {
+            __nv_bfloat16 a = left.data()[N - 1];
+            __nv_bfloat16 b = right.data()[N - 1];
+            result = __hfma(a, b, result);
         }
+
+        return result;
     }
 };
 } // namespace detail
diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h
index 378df5a..89d21c7 100644
--- a/include/kernel_float/binops.h
+++ b/include/kernel_float/binops.h
@@ -49,9 +49,9 @@ using zip_common_type = vector<
 * Example
 * =======
 * ```
- * vec a = {1.0f, 2.0f, 3.0f};
+ * vec a = {1.0f, 2.0f, 3.0f};
 * vec b = {4, 5, 6};
- * vec c = zip_common([](float x, float y){ return x + y; }, a, b); // returns [5.0f, 7.0f, 9.0f]
+ * vec c = zip_common([](float x, float y){ return x + y; }, a, b); // returns [5.0f, 7.0f, 9.0f]
 * ```
 */
template
@@ -62,9 +62,9 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co
    return detail::apply_impl::call(
        fun,
-        detail::convert_helper, vector_extent_type, T, E>::call(
+        detail::convert_impl, vector_extent_type, T, E>::call(
            into_vector_storage(left)),
-        detail::convert_helper, vector_extent_type, T, E>::call(
+        detail::convert_impl, vector_extent_type, T, E>::call(
            into_vector_storage(right)));
}

@@ -139,7 +139,7 @@ static constexpr bool is_vector_assign_allowed =
        typename T, \
        typename E, \
        typename R, \
-        typename = enabled_t>> \
+        typename = enable_if_t>> \
    KERNEL_FLOAT_INLINE vector& operator OP(vector& lhs, const R& rhs) { \
        using F = ops::NAME; \
        lhs = zip_common(F {}, lhs, rhs); \

@@ -249,7 +249,7 @@ struct bit_xor {
namespace detail {
template
-struct cross_helper {
+struct cross_impl {
    KERNEL_FLOAT_INLINE
    static vector> call(const vector_storage& av, const vector_storage& bv) {

@@ -275,9 +275,9 @@ template<
    typename R,
    typename T = promoted_vector_value_type,
    typename =
-        enabled_t> && is_vector_broadcastable>>>
+        enable_if_t> && is_vector_broadcastable>>>
KERNEL_FLOAT_INLINE vector> cross(const L& left, const R& right) {
-    return detail::cross_helper::call(convert_storage(left), convert_storage(right));
+    return detail::cross_impl::call(convert_storage(left), convert_storage(right));
}

} // namespace kernel_float
diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h
index ce5553b..508881b 100644
--- a/include/kernel_float/conversion.h
+++ b/include/kernel_float/conversion.h
@@ -99,7 +99,7 @@ template
using broadcast_vector_extent_type = broadcast_extent...>;

template
-static constexpr bool is_broadcastable = is_same, To>;
+static constexpr bool is_broadcastable = is_same_type, To>;

template
static constexpr bool is_vector_broadcastable = is_broadcastable, To>;

@@ -169,8 +169,12 @@ broadcast_like(const V& input, const R& other) {
}

namespace detail {
+/**
+ * Convert vector of element type `T` and extent type `E` to vector of element type `T2` and extent type `E2`.
+ * Specializations exist for the cases where `T==T2` and/or `E==E2`.
+ */ template -struct convert_helper { +struct convert_impl { KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { using F = ops::cast; @@ -180,24 +184,27 @@ struct convert_helper { } }; +// T == T2, E == E2 template -struct convert_helper { +struct convert_impl { KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { return input; } }; +// T == T2, E != E2 template -struct convert_helper { +struct convert_impl { KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { return detail::broadcast_impl::call(input); } }; +// T != T2, E == E2 template -struct convert_helper { +struct convert_impl { KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { using F = ops::cast; @@ -208,8 +215,8 @@ struct convert_helper { template KERNEL_FLOAT_INLINE vector_storage convert_storage(const V& input, extent new_size = {}) { - return detail::convert_helper, vector_extent_type, R, extent, M>:: - call(into_vector_storage(input)); + return detail::convert_impl, vector_extent_type, R, extent, M>::call( + into_vector_storage(input)); } /** diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index aa9675f..ca9b9fb 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -90,7 +90,7 @@ struct apply_impl { }; template -struct reduce_helper= 2)>> { +struct reduce_impl= 2)>> { KERNEL_FLOAT_INLINE static __half call(F fun, const vector_storage<__half, N>& input) { __half2 accum = {input.data()[0], input.data()[1]}; @@ -256,37 +256,51 @@ using half = __half; #if KERNEL_FLOAT_IS_DEVICE namespace detail { +template<> +struct dot_impl<__half, 0> { + KERNEL_FLOAT_INLINE + static __half + call(const vector_storage<__half, 0>& left, const vector_storage<__half, 0>& right) { + return __half(0); + } +}; + +template<> +struct dot_impl<__half, 1> { + KERNEL_FLOAT_INLINE + static __half + call(const vector_storage<__half, 1>& left, const vector_storage<__half, 1>& right) { + return __hmul(left.data()[0], right.data()[0]); + } +}; + template -struct dot_helper<__half, N> { +struct dot_impl<__half, N> { + static_assert(N >= 2, "internal error"); + KERNEL_FLOAT_INLINE static __half call(const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { - if (N == 0) { - return __half(0); - } else if (N == 1) { - return __hmul(left.data()[0], right.data()[0]); - } else { - __half2 first_a = {left.data()[0], left.data()[1]}; - __half2 first_b = {right.data()[0], right.data()[1]}; - __half2 accum = __hmul2(first_a, first_b); + __half2 first_a = {left.data()[0], left.data()[1]}; + __half2 first_b = {right.data()[0], right.data()[1]}; + __half2 accum = __hmul2(first_a, first_b); #pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __half2 a = {left.data()[i], left.data()[i + 1]}; - __half2 b = {right.data()[i], right.data()[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __half result = __hadd(accum.x, accum.y); + for (size_t i = 2; i + 2 <= N; i += 2) { + __half2 a = {left.data()[i], left.data()[i + 1]}; + __half2 b = {right.data()[i], right.data()[i + 1]}; + accum = __hfma2(a, b, accum); + } - if (N % 2 != 0) { - __half a = left.data()[N - 1]; - __half b = right.data()[N - 1]; - result = __hfma(a, b, result); - } + __half result = __hadd(accum.x, accum.y); - return result; + if (N % 2 != 0) { + __half a = left.data()[N - 1]; + __half b = right.data()[N - 1]; + result = __hfma(a, b, result); } + + return result; } }; } // namespace detail diff --git a/include/kernel_float/iterate.h 
b/include/kernel_float/iterate.h index 7b46db2..68c1645 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -29,7 +29,7 @@ void for_each(V&& input, F fun) { namespace detail { template -struct range_helper { +struct range_impl { KERNEL_FLOAT_INLINE static vector_storage call() { vector_storage result; @@ -56,7 +56,7 @@ struct range_helper { */ template KERNEL_FLOAT_INLINE vector> range() { - return detail::range_helper::call(); + return detail::range_impl::call(); } /** @@ -71,7 +71,7 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return detail::range_helper, vector_extent>::call(); + return detail::range_impl, vector_extent>::call(); } /** @@ -96,14 +96,14 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return detail::range_helper>::call(); + return detail::range_impl>::call(); } namespace detail { template, size_t N = vector_extent> -struct flatten_helper { - using value_type = typename flatten_helper::value_type; - static constexpr size_t size = N * flatten_helper::size; +struct flatten_impl { + using value_type = typename flatten_impl::value_type; + static constexpr size_t size = N * flatten_impl::size; template KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { @@ -111,13 +111,13 @@ struct flatten_helper { #pragma unroll for (size_t i = 0; i < N; i++) { - flatten_helper::call(output + flatten_helper::size * i, storage.data()[i]); + flatten_impl::call(output + flatten_impl::size * i, storage.data()[i]); } } }; template -struct flatten_helper { +struct flatten_impl { using value_type = T; static constexpr size_t size = 1; @@ -134,10 +134,10 @@ struct flatten_helper { } // namespace detail template -using flatten_value_type = typename detail::flatten_helper::value_type; +using flatten_value_type = typename detail::flatten_impl::value_type; template -static constexpr size_t flatten_size = detail::flatten_helper::size; +static constexpr size_t flatten_size = detail::flatten_impl::size; template using flatten_type = vector, extent>>; @@ -155,13 +155,13 @@ using flatten_type = vector, extent>>; template KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { vector_storage, flatten_size> output; - detail::flatten_helper::call(output.data(), input); + detail::flatten_impl::call(output.data(), input); return output; } namespace detail { template> -struct concat_base_helper { +struct concat_base_impl { static constexpr size_t size = vector_extent; KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { @@ -174,7 +174,7 @@ struct concat_base_helper { }; template -struct concat_base_helper { +struct concat_base_impl { static constexpr size_t size = 1; KERNEL_FLOAT_INLINE static void call(U* output, const T& input) { @@ -183,7 +183,7 @@ struct concat_base_helper { }; template -struct concat_base_helper { +struct concat_base_impl { static constexpr size_t size = 1; KERNEL_FLOAT_INLINE static void call(T* output, const T& input) { @@ -192,24 +192,23 @@ struct concat_base_helper { }; template -struct concat_helper {}; +struct concat_impl {}; template -struct concat_helper { +struct concat_impl { using value_type = - typename promote_type, typename concat_helper::value_type>:: - type; - static constexpr size_t size = concat_base_helper::size + concat_helper::size; + typename promote_type, typename concat_impl::value_type>::type; + static constexpr size_t size = 
concat_base_impl::size + concat_impl::size; template KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... rest) { - concat_base_helper::call(output, input); - concat_helper::call(output + concat_base_helper::size, rest...); + concat_base_impl::call(output, input); + concat_impl::call(output + concat_base_impl::size, rest...); } }; template<> -struct concat_helper<> { +struct concat_impl<> { using value_type = void; static constexpr size_t size = 1; @@ -219,10 +218,10 @@ struct concat_helper<> { } // namespace detail template -using concat_value_type = promote_t::value_type>; +using concat_value_type = promote_t::value_type>; template -static constexpr size_t concat_size = detail::concat_helper::size; +static constexpr size_t concat_size = detail::concat_impl::size; template using concat_type = vector, extent>>; @@ -257,7 +256,7 @@ using concat_type = vector, extent>> template KERNEL_FLOAT_INLINE concat_type concat(const Vs&... inputs) { vector_storage, concat_size> output; - detail::concat_helper::call(output.data(), inputs...); + detail::concat_impl::call(output.data(), inputs...); return output; } @@ -284,7 +283,7 @@ KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... in static constexpr size_t M = concat_size; vector_storage index_set; - detail::concat_helper::call(index_set.data(), indices...); + detail::concat_impl::call(index_set.data(), indices...); vector_storage inputs = into_vector_storage(input); vector_storage outputs; diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h index dcaac78..9c133a3 100644 --- a/include/kernel_float/meta.h +++ b/include/kernel_float/meta.h @@ -12,13 +12,13 @@ struct index_sequence { namespace detail { template -struct make_index_sequence_helper {}; +struct make_index_sequence_impl {}; // Benchmarks show that it is much faster to predefine all possible index sequences instead of doing something // recursive with variadic templates. #define KERNEL_FLOAT_INDEX_SEQ(N, ...) 
\ template<> \ - struct make_index_sequence_helper { \ + struct make_index_sequence_impl { \ using type = index_sequence<__VA_ARGS__>; \ }; @@ -44,37 +44,37 @@ KERNEL_FLOAT_INDEX_SEQ(17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, } // namespace detail template -using make_index_sequence = typename detail::make_index_sequence_helper::type; +using make_index_sequence = typename detail::make_index_sequence_impl::type; namespace detail { template -struct decay_helper { +struct decay_impl { using type = T; }; template -struct decay_helper { +struct decay_impl { using type = T; }; template -struct decay_helper { +struct decay_impl { using type = T; }; template -struct decay_helper { +struct decay_impl { using type = T; }; template -struct decay_helper { +struct decay_impl { using type = T; }; } // namespace detail template -using decay_t = typename detail::decay_helper::type; +using decay_t = typename detail::decay_impl::type; template struct promote_type; @@ -217,34 +217,34 @@ using promote_t = typename detail::multi_promote_type...>::type; namespace detail { template -struct is_same_helper { +struct is_same_type_impl { static constexpr bool value = false; }; template -struct is_same_helper { +struct is_same_type_impl { static constexpr bool value = true; }; } // namespace detail template -static constexpr bool is_same = detail::is_same_helper::value; +static constexpr bool is_same_type = detail::is_same_type_impl::value; namespace detail { template -struct is_implicit_convertible_helper { +struct is_implicit_convertible_impl { static constexpr bool value = false; }; template -struct is_implicit_convertible_helper::type> { +struct is_implicit_convertible_impl::type> { static constexpr bool value = true; }; } // namespace detail template static constexpr bool is_implicit_convertible = - detail::is_implicit_convertible_helper, decay_t>::value; + detail::is_implicit_convertible_impl, decay_t>::value; namespace detail { template @@ -259,16 +259,16 @@ using result_t = decltype((detail::declval())(detail::declval()...)); namespace detail { template -struct enabled_helper {}; +struct enable_if_impl {}; template -struct enabled_helper { +struct enable_if_impl { using type = T; }; } // namespace detail template -using enabled_t = typename detail::enabled_helper::type; +using enable_if_t = typename detail::enable_if_impl::type; } // namespace kernel_float diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 8b8da51..424e641 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -6,7 +6,7 @@ namespace kernel_float { namespace detail { template -struct reduce_helper { +struct reduce_impl { KERNEL_FLOAT_INLINE static T call(F fun, const vector_storage& input) { return call(fun, input, make_index_sequence {}); } @@ -29,7 +29,7 @@ struct reduce_helper { * Reduce the elements of the given vector ``input`` into a single value using * the function ``fun``. This function should be a binary function that takes * two elements and returns one element. The order in which the elements - * are reduced is not specified and depends on the reduction function and + * are reduced is not specified and depends on both the reduction function and * the vector type. 
*
 * Example
@@ -41,7 +41,7 @@ struct reduce_helper {
 */
template
KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) {
-    return detail::reduce_helper, vector_value_type>::call(
+    return detail::reduce_impl, vector_value_type>::call(
        fun,
        into_vector_storage(input));
}

@@ -142,7 +142,7 @@ KERNEL_FLOAT_INLINE T count(const V& input) {
namespace detail {
template
-struct dot_helper {
+struct dot_impl {
    KERNEL_FLOAT_INLINE
    static T call(const vector_storage& left, const vector_storage& right) {
        return sum(zip(ops::multiply {}, left, right));

@@ -164,22 +164,22 @@ template>
KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) {
    using E = broadcast_vector_extent_type;
-    return detail::dot_helper::call(
+    return detail::dot_impl::call(
        convert_storage(left, E {}),
        convert_storage(right, E {}));
}

namespace detail {
template
-struct magnitude_helper {
+struct magnitude_impl {
    KERNEL_FLOAT_INLINE
    static T call(const vector_storage& input) {
-        return ops::sqrt {}(detail::dot_helper::call(input, input));
+        return ops::sqrt {}(detail::dot_impl::call(input, input));
    }
};

template
-struct magnitude_helper {
+struct magnitude_impl {
    KERNEL_FLOAT_INLINE
    static T call(const vector_storage& input) {
        return T {};
    }
};

template
-struct magnitude_helper {
+struct magnitude_impl {
    KERNEL_FLOAT_INLINE
    static T call(const vector_storage& input) {
        return ops::abs {}(input);
    }
};

template
-struct magnitude_helper {
+struct magnitude_impl {
    KERNEL_FLOAT_INLINE
    static T call(const vector_storage& input) {
        return ops::hypot {}(input.data()[0], input.data()[1]);
    }
};

// The 3-argument overload of hypot is only available from C++17
#ifdef __cpp_lib_hypot
template<>
-struct magnitude_helper {
+struct magnitude_impl {
    KERNEL_FLOAT_INLINE
    static float call(const vector_storage& input) {
        return std::hypot(input.data()[0], input.data()[1], input.data()[2]);
    }
};

template<>
-struct magnitude_helper {
+struct magnitude_impl {
    KERNEL_FLOAT_INLINE
    static float call(const vector_storage& input) {
        return std::hypot(input.data()[0], input.data()[1], input.data()[2]);
    }
};
#endif

/**
 * Compute the magnitude of the given input vector. This calculates the square root of the sum of squares, also
- * known as the Euclidian norm of the vector.
+ * known as the Euclidean norm, of a vector.
* * Example * ======= @@ -236,7 +236,7 @@ struct magnitude_helper { */ template> KERNEL_FLOAT_INLINE T mag(const V& input) { - return detail::magnitude_helper>::call(into_vector_storage(input)); + return detail::magnitude_impl>::call(into_vector_storage(input)); } } // namespace kernel_float diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h index e24971e..a2faabd 100644 --- a/include/kernel_float/triops.h +++ b/include/kernel_float/triops.h @@ -42,11 +42,11 @@ KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, cons return detail::apply_impl::call( F {}, - detail::convert_helper, vector_extent_type, bool, E>::call( + detail::convert_impl, vector_extent_type, bool, E>::call( into_vector_storage(cond)), - detail::convert_helper, vector_extent_type, T, E>::call( + detail::convert_impl, vector_extent_type, T, E>::call( into_vector_storage(true_values)), - detail::convert_helper, vector_extent_type, T, E>::call( + detail::convert_impl, vector_extent_type, T, E>::call( into_vector_storage(false_values))); } @@ -120,11 +120,11 @@ KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { return detail::apply_impl::call( F {}, - detail::convert_helper, vector_extent_type, T, E>::call( + detail::convert_impl, vector_extent_type, T, E>::call( into_vector_storage(a)), - detail::convert_helper, vector_extent_type, T, E>::call( + detail::convert_impl, vector_extent_type, T, E>::call( into_vector_storage(b)), - detail::convert_helper, vector_extent_type, T, E>::call( + detail::convert_impl, vector_extent_type, T, E>::call( into_vector_storage(c))); } diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index f294b48..1782dbc 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -39,11 +39,11 @@ struct vector: public S { storage_type(detail::broadcast_impl, E>::call(input)) {} // For all other arguments, we convert it using `convert_storage` according to broadcast rules - template, T>, int> = 0> + template, T>, int> = 0> KERNEL_FLOAT_INLINE vector(U&& input) : storage_type(convert_storage(input, extent_type {})) {} - template, T>, int> = 0> + template, T>, int> = 0> KERNEL_FLOAT_INLINE explicit vector(U&& input) : storage_type(convert_storage(input, extent_type {})) {} @@ -52,7 +52,7 @@ struct vector: public S { typename A, typename B, typename... Rest, - typename = enabled_t> + typename = enable_if_t> KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... 
rest) :
        storage_type {T(a), T(b), T(rest)...} {}

From 7acff4cc2414483de4162d0b47453422f6ebe215 Mon Sep 17 00:00:00 2001
From: stijn 
Date: Tue, 19 Sep 2023 17:22:04 +0200
Subject: [PATCH 42/50] Use raw pointers in `apply_impl` and `reduce_impl`

---
 include/kernel_float/bf16.h       | 72 +++++++++++--------------------
 include/kernel_float/binops.h     | 24 ++++++++---
 include/kernel_float/conversion.h |  9 ++--
 include/kernel_float/fp16.h       | 47 +++++++++-----------
 include/kernel_float/reduce.h     | 11 +++--
 include/kernel_float/triops.h     | 30 +++++++++----
 include/kernel_float/unops.h      | 18 ++++----
 tests/binops.cu                   |  4 +-
 8 files changed, 108 insertions(+), 107 deletions(-)

diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h
index d3e3a5d..3eb587e 100644
--- a/include/kernel_float/bf16.h
+++ b/include/kernel_float/bf16.h
@@ -49,66 +49,55 @@ struct zip_bfloat16x2 {
template
struct apply_impl {
-    KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N>
-    call(F fun, const vector_storage<__nv_bfloat16, N>& input) {
-        vector_storage<__nv_bfloat16, N> result;
-
+    KERNEL_FLOAT_INLINE static void call(F fun, __nv_bfloat16* result, const __nv_bfloat16* input) {
#pragma unroll
-        for (size_t i = 0; i + 2 <= N; i += 2) {
-            __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]};
+        for (size_t i = 0; 2 * i + 1 < N; i++) {
+            __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]};
            __nv_bfloat162 b = map_bfloat16x2::call(fun, a);
-            result.data()[i + 0] = b.x;
-            result.data()[i + 1] = b.y;
+            result[2 * i + 0] = b.x;
+            result[2 * i + 1] = b.y;
        }

        if (N % 2 != 0) {
-            result.data()[N - 1] = fun(input.data()[N - 1]);
+            result[N - 1] = fun(input[N - 1]);
        }
-
-        return result;
    }
};

template
struct apply_impl {
-    KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N> call(
-        F fun,
-        const vector_storage<__nv_bfloat16, N>& left,
-        const vector_storage<__nv_bfloat16, N>& right) {
-        vector_storage<__nv_bfloat16, N> result;
+    KERNEL_FLOAT_INLINE static void
+    call(F fun, __nv_bfloat16* result, const __nv_bfloat16* left, const __nv_bfloat16* right) {
#pragma unroll
-        for (size_t i = 0; i + 2 <= N; i += 2) {
-            __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]};
-            __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]};
+        for (size_t i = 0; 2 * i + 1 < N; i++) {
+            __nv_bfloat162 a = {left[2 * i], left[2 * i + 1]};
+            __nv_bfloat162 b = {right[2 * i], right[2 * i + 1]};
            __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b);
-            result.data()[i + 0] = c.x;
-            result.data()[i + 1] = c.y;
+            result[2 * i + 0] = c.x;
+            result[2 * i + 1] = c.y;
        }

        if (N % 2 != 0) {
-            result.data()[N - 1] = fun(left.data()[N - 1], right.data()[N - 1]);
+            result[N - 1] = fun(left[N - 1], right[N - 1]);
        }
-
-        return result;
    }
};

template
struct reduce_impl= 2)>> {
-    KERNEL_FLOAT_INLINE static __nv_bfloat16
-    call(F fun, const vector_storage<__nv_bfloat16, N>& input) {
-        __nv_bfloat162 accum = {input.data()[0], input.data()[1]};
+    KERNEL_FLOAT_INLINE static __nv_bfloat16 call(F fun, const __nv_bfloat16* input) {
+        __nv_bfloat162 accum = {input[0], input[1]};
#pragma unroll
-        for (size_t i = 2; i + 2 <= N; i += 2) {
-            __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]};
+        for (size_t i = 1; 2 * i + 1 < N; i++) { // start at 1: accum already holds input[0] and input[1]
+            __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]};
            accum = zip_bfloat16x2::call(fun, accum, a);
        }

        __nv_bfloat16 result = fun(accum.x, accum.y);

        if (N % 2 != 0) {
-            result = fun(result, input.data()[N - 1]);
+            result = fun(result, input[N - 1]);
        }

        return result;
@@ -126,6 +115,7 @@ struct reduce_impl= 2)>> {
    }; \
    }
+//
These operations are not implemented in half precision, so they are forwarded to single precision
 KERNEL_FLOAT_BF16_UNARY_FORWARD(tan)
 KERNEL_FLOAT_BF16_UNARY_FORWARD(asin)
 KERNEL_FLOAT_BF16_UNARY_FORWARD(acos)
@@ -243,32 +233,22 @@ KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2)
 KERNEL_FLOAT_BF16_CAST(double, __double2bfloat16(input), double(__bfloat162float(input)));
 KERNEL_FLOAT_BF16_CAST(float, __float2bfloat16(input), __bfloat162float(input));

+// clang-format off
 // there are no official char casts. Instead, cast to int and then to char
 KERNEL_FLOAT_BF16_CAST(char, __int2bfloat16_rn(input), (char)__bfloat162int_rz(input));
-KERNEL_FLOAT_BF16_CAST(
-    signed char,
-    __int2bfloat16_rn(input),
-    (signed char)__bfloat162int_rz(input));
-KERNEL_FLOAT_BF16_CAST(
-    unsigned char,
-    __int2bfloat16_rn(input),
-    (unsigned char)__bfloat162int_rz(input));
+KERNEL_FLOAT_BF16_CAST(signed char, __int2bfloat16_rn(input), (signed char)__bfloat162int_rz(input));
+KERNEL_FLOAT_BF16_CAST(unsigned char, __int2bfloat16_rn(input), (unsigned char)__bfloat162int_rz(input));

 KERNEL_FLOAT_BF16_CAST(signed short, __bfloat162short_rz(input), __short2bfloat16_rn(input));
 KERNEL_FLOAT_BF16_CAST(signed int, __bfloat162int_rz(input), __int2bfloat16_rn(input));
-KERNEL_FLOAT_BF16_CAST(
-    signed long,
-    __ll2bfloat16_rn(input),
-    (signed long)(__bfloat162ll_rz(input)));
+KERNEL_FLOAT_BF16_CAST(signed long, __ll2bfloat16_rn(input), (signed long)(__bfloat162ll_rz(input)));
 KERNEL_FLOAT_BF16_CAST(signed long long, __ll2bfloat16_rn(input), __bfloat162ll_rz(input));

 KERNEL_FLOAT_BF16_CAST(unsigned short, __bfloat162ushort_rz(input), __ushort2bfloat16_rn(input));
 KERNEL_FLOAT_BF16_CAST(unsigned int, __bfloat162uint_rz(input), __uint2bfloat16_rn(input));
-KERNEL_FLOAT_BF16_CAST(
-    unsigned long,
-    __ull2bfloat16_rn(input),
-    (unsigned long)(__bfloat162ull_rz(input)));
+KERNEL_FLOAT_BF16_CAST(unsigned long, __ull2bfloat16_rn(input), (unsigned long)(__bfloat162ull_rz(input)));
 KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input));
+// clang-format on

 using bfloat16 = __nv_bfloat16;
 //KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16)
diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h
index 89d21c7..3231106 100644
--- a/include/kernel_float/binops.h
+++ b/include/kernel_float/binops.h
@@ -29,11 +29,16 @@ KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right)
    using B = vector_value_type;
    using O = result_t;
    using E = broadcast_vector_extent_type;
+    vector_storage result;

-    return detail::apply_impl::call(
+    detail::apply_impl::call(
        fun,
-        detail::broadcast_impl, E>::call(into_vector_storage(left)),
-        detail::broadcast_impl, E>::call(into_vector_storage(right)));
+        result.data(),
+        detail::broadcast_impl, E>::call(into_vector_storage(left)).data(),
+        detail::broadcast_impl, E>::call(into_vector_storage(right))
+            .data());
+
+    return result;
}

template
@@ -60,12 +65,19 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co
    using O = result_t;
    using E = broadcast_vector_extent_type;

-    return detail::apply_impl::call(
+    vector_storage result;
+
+    detail::apply_impl::call(
        fun,
+        result.data(),
        detail::convert_impl, vector_extent_type, T, E>::call(
-            into_vector_storage(left)),
+            into_vector_storage(left))
+            .data(),
        detail::convert_impl, vector_extent_type, T, E>::call(
-            into_vector_storage(right)));
+            into_vector_storage(right))
+            .data());
+
+    return result;
}

#define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR) \
diff --git
a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h
index 508881b..2e6a454 100644
--- a/include/kernel_float/conversion.h
+++ b/include/kernel_float/conversion.h
@@ -178,8 +178,8 @@ struct convert_impl {
     KERNEL_FLOAT_INLINE
     static vector_storage call(vector_storage input) {
         using F = ops::cast;
-        vector_storage intermediate =
-            detail::apply_impl::call(F {}, input);
+        vector_storage intermediate;
+        detail::apply_impl::call(F {}, intermediate.data(), input.data());
         return detail::broadcast_impl::call(intermediate);
     }
 };
@@ -208,7 +208,10 @@ struct convert_impl {
     KERNEL_FLOAT_INLINE
     static vector_storage call(vector_storage input) {
         using F = ops::cast;
-        return detail::apply_impl::call(F {}, input);
+
+        vector_storage result;
+        detail::apply_impl::call(F {}, result.data(), input.data());
+        return result;
     }
 };
 }  // namespace detail

diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h
index ca9b9fb..8e05376 100644
--- a/include/kernel_float/fp16.h
+++ b/include/kernel_float/fp16.h
@@ -47,63 +47,55 @@ struct zip_halfx2 {

 template
 struct apply_impl {
-    KERNEL_FLOAT_INLINE static vector_storage<__half, N>
-    call(F fun, const vector_storage<__half, N>& input) {
-        vector_storage<__half, N> result;
-
+    KERNEL_FLOAT_INLINE static void call(F fun, __half* result, const __half* input) {
 #pragma unroll
-        for (size_t i = 0; i + 2 <= N; i += 2) {
-            __half2 a = {input.data()[i], input.data()[i + 1]};
+        for (size_t i = 0; 2 * i + 1 < N; i++) {
+            __half2 a = {input[2 * i], input[2 * i + 1]};
             __half2 b = map_halfx2::call(fun, a);
-            result.data()[i + 0] = b.x;
-            result.data()[i + 1] = b.y;
+            result[2 * i + 0] = b.x;
+            result[2 * i + 1] = b.y;
         }

         if (N % 2 != 0) {
-            result.data()[N - 1] = fun(input.data()[N - 1]);
+            result[N - 1] = fun(input[N - 1]);
         }
-
-        return result;
     }
 };

 template
 struct apply_impl {
-    KERNEL_FLOAT_INLINE static vector_storage<__half, N>
-    call(F fun, const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) {
-        vector_storage<__half, N> result;
+    KERNEL_FLOAT_INLINE static void
+    call(F fun, __half* result, const __half* left, const __half* right) {
 #pragma unroll
-        for (size_t i = 0; i + 2 <= N; i += 2) {
-            __half2 a = {left.data()[i], left.data()[i + 1]};
-            __half2 b = {right.data()[i], right.data()[i + 1]};
+        for (size_t i = 0; 2 * i + 1 < N; i++) {
+            __half2 a = {left[2 * i], left[2 * i + 1]};
+            __half2 b = {right[2 * i], right[2 * i + 1]};
             __half2 c = zip_halfx2::call(fun, a, b);
-            result.data()[i + 0] = c.x;
-            result.data()[i + 1] = c.y;
+            result[2 * i + 0] = c.x;
+            result[2 * i + 1] = c.y;
         }

         if (N % 2 != 0) {
-            result.data()[N - 1] = fun(left.data()[N - 1], right.data()[N - 1]);
+            result[N - 1] = fun(left[N - 1], right[N - 1]);
         }
-
-        return result;
     }
 };

 template
 struct reduce_impl= 2)>> {
-    KERNEL_FLOAT_INLINE static __half call(F fun, const vector_storage<__half, N>& input) {
-        __half2 accum = {input.data()[0], input.data()[1]};
+    KERNEL_FLOAT_INLINE static __half call(F fun, const __half* input) {
+        __half2 accum = {input[0], input[1]};
 #pragma unroll
-        for (size_t i = 2; i + 2 <= N; i += 2) {
-            __half2 a = {input.data()[i], input.data()[i + 1]};
+        for (size_t i = 0; 2 * i + 1 < N; i++) {
+            __half2 a = {input[2 * i], input[2 * i + 1]};
             accum = zip_halfx2::call(fun, accum, a);
         }

         __half result = fun(accum.x, accum.y);

         if (N % 2 != 0) {
-            result = fun(result, input.data()[N - 1]);
+            result = fun(result, input[N - 1]);
         }

         return result;
@@ -122,6 +114,7 @@ struct reduce_impl= 2)>> {
     };                                                               \
     }
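To make the pairing idiom above concrete: lanes (2*i, 2*i+1) go through a single __half2 intrinsic, and a lone trailing lane for odd N falls back to the scalar functor. A host-side model of the same control flow, illustrative only (process_pair and process_scalar are hypothetical stand-ins for one __half2 call and one scalar call):

    for (size_t i = 0; 2 * i + 1 < N; i++) {
        process_pair(input[2 * i], input[2 * i + 1]);  // one __half2 operation per pair
    }
    if (N % 2 != 0) {
        process_scalar(input[N - 1]);  // lone tail lane when N is odd
    }
    // For N = 5 this visits pairs (0, 1) and (2, 3), then lane 4 separately.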
+// These operations are not implemented in half precision, so they are forwarded to single precision
 KERNEL_FLOAT_FP16_UNARY_FORWARD(tan)
 KERNEL_FLOAT_FP16_UNARY_FORWARD(asin)
 KERNEL_FLOAT_FP16_UNARY_FORWARD(acos)

diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h
index 424e641..57db624 100644
--- a/include/kernel_float/reduce.h
+++ b/include/kernel_float/reduce.h
@@ -7,18 +7,17 @@ namespace kernel_float {
 namespace detail {
 template
 struct reduce_impl {
-    KERNEL_FLOAT_INLINE static T call(F fun, const vector_storage& input) {
+    KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
         return call(fun, input, make_index_sequence {});
     }

   private:
     template
-    KERNEL_FLOAT_INLINE static T
-    call(F fun, const vector_storage& input, index_sequence<0, Is...>) {
-        T result = input.data()[0];
+    KERNEL_FLOAT_INLINE static T call(F fun, const T* input, index_sequence<0, Is...>) {
+        T result = input[0];
 #pragma unroll
         for (size_t i = 1; i < N; i++) {
-            result = fun(result, input.data()[i]);
+            result = fun(result, input[i]);
         }
         return result;
     }
@@ -43,7 +42,7 @@ template
 KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) {
     return detail::reduce_impl, vector_value_type>::call(
         fun,
-        into_vector_storage(input));
+        into_vector_storage(input).data());
 }

 /**
diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h
index a2faabd..afee449 100644
--- a/include/kernel_float/triops.h
+++ b/include/kernel_float/triops.h
@@ -39,15 +39,22 @@ template<
     typename E = broadcast_vector_extent_type>
 KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) {
     using F = ops::conditional;
+    vector_storage result;

-    return detail::apply_impl::call(
+    detail::apply_impl::call(
         F {},
+        result.data(),
         detail::convert_impl, vector_extent_type, bool, E>::call(
-            into_vector_storage(cond)),
+            into_vector_storage(cond))
+            .data(),
         detail::convert_impl, vector_extent_type, T, E>::call(
-            into_vector_storage(true_values)),
+            into_vector_storage(true_values))
+            .data(),
         detail::convert_impl, vector_extent_type, T, E>::call(
-            into_vector_storage(false_values)));
+            into_vector_storage(false_values))
+            .data());
+
+    return result;
 }

 /**
@@ -117,15 +124,22 @@ template<
     typename E = broadcast_vector_extent_type>
 KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) {
     using F = ops::fma;
+    vector_storage result;

-    return detail::apply_impl::call(
+    detail::apply_impl::call(
         F {},
+        result.data(),
         detail::convert_impl, vector_extent_type, T, E>::call(
-            into_vector_storage(a)),
+            into_vector_storage(a))
+            .data(),
         detail::convert_impl, vector_extent_type, T, E>::call(
-            into_vector_storage(b)),
+            into_vector_storage(b))
+            .data(),
         detail::convert_impl, vector_extent_type, T, E>::call(
-            into_vector_storage(c)));
+            into_vector_storage(c))
+            .data());
+
+    return result;
 }

 }  // namespace kernel_float
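Both ternary entry points above follow the same recipe: convert every operand to the broadcast element type and extent, then run the functor through raw pointers. A short usage sketch, assuming the `kf` alias plus the element-wise operators and scalar broadcasting that binops.h provides:

    kf::vec<float, 4> x = {1.0f, -2.0f, 3.0f, -4.0f};
    kf::vec<bool, 4> mask = {false, true, false, true};
    // Lane-wise selection: y[i] = mask[i] ? (-x[i]) : x[i]
    kf::vec<float, 4> y = kf::where(mask, x * -1.0f, x);  // [1.0f, 2.0f, 3.0f, 4.0f]
    // Lane-wise fused multiply-add: z[i] = x[i] * x[i] + y[i]
    kf::vec<float, 4> z = kf::fma(x, x, y);               // [2.0f, 6.0f, 12.0f, 20.0f]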
diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h
index 18619d6..bca3796 100644
--- a/include/kernel_float/unops.h
+++ b/include/kernel_float/unops.h
@@ -8,16 +8,11 @@ namespace detail {

 template
 struct apply_impl {
-    KERNEL_FLOAT_INLINE static vector_storage
-    call(F fun, const vector_storage&... inputs) {
-        vector_storage result;
-
+    KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {
 #pragma unroll
         for (size_t i = 0; i < N; i++) {
-            result.data()[i] = fun(inputs.data()[i]...);
+            result[i] = fun(inputs[i]...);
         }
-
-        return result;
     }
 };
 }  // namespace detail
@@ -39,9 +34,14 @@ template
 KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) {
     using Input = vector_value_type;
     using Output = result_t;
-    return detail::apply_impl, Output, Input>::call(
+    vector_storage> result;
+
+    detail::apply_impl, Output, Input>::call(
         fun,
-        into_vector_storage(input));
+        result.data(),
+        into_vector_storage(input).data());
+
+    return result;
 }

 #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \
diff --git a/tests/binops.cu b/tests/binops.cu
index e7e51ee..114889f 100644
--- a/tests/binops.cu
+++ b/tests/binops.cu
@@ -80,8 +80,8 @@ struct binops_float_tests {

         // remainder is not supported for fp16
         if constexpr (is_none_of) {
-            c = a % b;
-            ASSERT(equals(T(fmod(x[I], y[I])), c[I]) && ...);
+            // c = a % b;
+            // ASSERT(equals(T(fmod(x[I], y[I])), c[I]) && ...);
         }
     }
 };

From da0a46b533ef9d25638748eb951284f14e7c48bb Mon Sep 17 00:00:00 2001
From: stijn
Date: Tue, 19 Sep 2023 20:34:25 +0200
Subject: [PATCH 43/50] Add tests for reductions

---
 include/kernel_float/binops.h |  37 ++-
 include/kernel_float/fp16.h   |   1 -
 include/kernel_float/reduce.h |  19 +-
 single_include/kernel_float.h | 566 +++++++++++++++++++---------------
 tests/common.h                |  54 +++-
 tests/reduce.cu               | 170 ++++++++++
 6 files changed, 589 insertions(+), 258 deletions(-)
 create mode 100644 tests/reduce.cu

diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h
index 3231106..705562b 100644
--- a/include/kernel_float/binops.h
+++ b/include/kernel_float/binops.h
@@ -172,15 +172,46 @@ KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(bit_xor, ^=)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(min)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(max)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(copysign)
-KERNEL_FLOAT_DEFINE_BINARY_FUN(hypot)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(modf)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(nextafter)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(pow)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(remainder)

-#if KERNEL_FLOAT_CUDA_DEVICE
-KERNEL_FLOAT_DEFINE_BINARY_FUN(rhypot)
+KERNEL_FLOAT_DEFINE_BINARY(hypot, (ops::sqrt()(left * left + right * right)))
+KERNEL_FLOAT_DEFINE_BINARY(rhypot, (T(1) / ops::hypot()(left, right)))
+
+namespace ops {
+template<>
+struct hypot {
+    KERNEL_FLOAT_INLINE double operator()(double left, double right) {
+        return ::hypot(left, right);
+    };
+};
+
+template<>
+struct hypot {
+    KERNEL_FLOAT_INLINE float operator()(float left, float right) {
+        return ::hypotf(left, right);
+    };
+};
+
+// rhypot is only supported on the GPU
+#if KERNEL_FLOAT_IS_DEVICE
+template<>
+struct rhypot {
+    KERNEL_FLOAT_INLINE double operator()(double left, double right) {
+        return ::rhypot(left, right);
+    };
+};
+
+template<>
+struct rhypot {
+    KERNEL_FLOAT_INLINE float operator()(float left, float right) {
+        return ::rhypotf(left, right);
+    };
+};
 #endif
+};  // namespace ops

 #if KERNEL_FLOAT_IS_DEVICE
 #define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \
diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h
index 8e05376..e939341 100644
--- a/include/kernel_float/fp16.h
+++ b/include/kernel_float/fp16.h
@@ -199,7 +199,6 @@ KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2)
 KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div)
 KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2)
 KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2)
-
 KERNEL_FLOAT_FP16_BINARY_FUN(fast_div, __hdiv, __h2div)

 KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2)
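The hypot and rhypot fallbacks introduced above follow directly from their definitions, hypot(a, b) = sqrt(a*a + b*b) and rhypot(a, b) = 1 / hypot(a, b), with the float and double specializations routing to ::hypotf and ::hypot instead. A worked check on two Pythagorean triples, assuming KERNEL_FLOAT_DEFINE_BINARY also emits the vector-level entry points, as its other uses suggest:

    kf::vec<float, 2> a = {3.0f, 5.0f};
    kf::vec<float, 2> b = {4.0f, 12.0f};
    kf::vec<float, 2> h = kf::hypot(a, b);  // [5.0f, 13.0f]
    // On device builds, rhypot gives the reciprocal directly:
    // kf::rhypot(a, b) == [1.0f / 5.0f, 1.0f / 13.0f]

diff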
--git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 57db624..8cc5362 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -144,7 +144,14 @@ template struct dot_impl { KERNEL_FLOAT_INLINE static T call(const vector_storage& left, const vector_storage& right) { - return sum(zip(ops::multiply {}, left, right)); + vector_storage intermediate; + detail::apply_impl, N, T, T, T>::call( + ops::multiply(), + intermediate.data(), + left.data(), + right.data()); + + return detail::reduce_impl, N, T>::call(ops::add(), intermediate.data()); } }; } // namespace detail @@ -197,17 +204,17 @@ template struct magnitude_impl { KERNEL_FLOAT_INLINE static T call(const vector_storage& input) { - return ops::hypot {}(input.data()[0], input.data()[1]); + return ops::hypot()(input.data()[0], input.data()[1]); } }; -// The 3-argument overload of hypot is only available from C++17 -#ifdef __cpp_lib_hypot +// The 3-argument overload of hypot is only available on host from C++17 +#if defined(__cpp_lib_hypot) && KERNEL_FLOAT_IS_HOST template<> struct magnitude_impl { KERNEL_FLOAT_INLINE static float call(const vector_storage& input) { - return std::hypot(input.data()[0], input.data()[1], input.data()[2]); + return ::hypot(input.data()[0], input.data()[1], input.data()[2]); } }; @@ -215,7 +222,7 @@ template<> struct magnitude_impl { KERNEL_FLOAT_INLINE static float call(const vector_storage& input) { - return std::hypot(input.data()[0], input.data()[1], input.data()[2]); + return ::hypot(input.data()[0], input.data()[1], input.data()[2]); } }; #endif diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 47786d3..b371866 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-09-18 17:41:12.641561 -// git hash: 64f21903e8049e4a46c53897a167f31174e1a231 +// date: 2023-09-19 20:34:16.094065 +// git hash: 7acff4cc2414483de4162d0b47453422f6ebe215 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -61,13 +61,13 @@ struct index_sequence { namespace detail { template -struct make_index_sequence_helper {}; +struct make_index_sequence_impl {}; // Benchmarks show that it is much faster to predefine all possible index sequences instead of doing something // recursive with variadic templates. #define KERNEL_FLOAT_INDEX_SEQ(N, ...) 
\ template<> \ - struct make_index_sequence_helper { \ + struct make_index_sequence_impl { \ using type = index_sequence<__VA_ARGS__>; \ }; @@ -93,37 +93,37 @@ KERNEL_FLOAT_INDEX_SEQ(17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, } // namespace detail template -using make_index_sequence = typename detail::make_index_sequence_helper::type; +using make_index_sequence = typename detail::make_index_sequence_impl::type; namespace detail { template -struct decay_helper { +struct decay_impl { using type = T; }; template -struct decay_helper { +struct decay_impl { using type = T; }; template -struct decay_helper { +struct decay_impl { using type = T; }; template -struct decay_helper { +struct decay_impl { using type = T; }; template -struct decay_helper { +struct decay_impl { using type = T; }; } // namespace detail template -using decay_t = typename detail::decay_helper::type; +using decay_t = typename detail::decay_impl::type; template struct promote_type; @@ -266,34 +266,34 @@ using promote_t = typename detail::multi_promote_type...>::type; namespace detail { template -struct is_same_helper { +struct is_same_type_impl { static constexpr bool value = false; }; template -struct is_same_helper { +struct is_same_type_impl { static constexpr bool value = true; }; } // namespace detail template -static constexpr bool is_same = detail::is_same_helper::value; +static constexpr bool is_same_type = detail::is_same_type_impl::value; namespace detail { template -struct is_implicit_convertible_helper { +struct is_implicit_convertible_impl { static constexpr bool value = false; }; template -struct is_implicit_convertible_helper::type> { +struct is_implicit_convertible_impl::type> { static constexpr bool value = true; }; } // namespace detail template static constexpr bool is_implicit_convertible = - detail::is_implicit_convertible_helper, decay_t>::value; + detail::is_implicit_convertible_impl, decay_t>::value; namespace detail { template @@ -308,16 +308,16 @@ using result_t = decltype((detail::declval())(detail::declval()...)); namespace detail { template -struct enabled_helper {}; +struct enable_if_impl {}; template -struct enabled_helper { +struct enable_if_impl { using type = T; }; } // namespace detail template -using enabled_t = typename detail::enabled_helper::type; +using enable_if_t = typename detail::enable_if_impl::type; } // namespace kernel_float @@ -918,16 +918,11 @@ namespace detail { template struct apply_impl { - KERNEL_FLOAT_INLINE static vector_storage - call(F fun, const vector_storage&... inputs) { - vector_storage result; - + KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... 
inputs) { #pragma unroll for (size_t i = 0; i < N; i++) { - result.data()[i] = fun(inputs.data()[i]...); + result[i] = fun(inputs[i]...); } - - return result; } }; } // namespace detail @@ -949,9 +944,14 @@ template KERNEL_FLOAT_INLINE map_type map(F fun, const V& input) { using Input = vector_value_type; using Output = result_t; - return detail::apply_impl, Output, Input>::call( + vector_storage> result; + + detail::apply_impl, Output, Input>::call( fun, - into_vector_storage(input)); + result.data(), + into_vector_storage(input).data()); + + return result; } #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ @@ -1156,7 +1156,7 @@ template using broadcast_vector_extent_type = broadcast_extent...>; template -static constexpr bool is_broadcastable = is_same, To>; +static constexpr bool is_broadcastable = is_same_type, To>; template static constexpr bool is_vector_broadcastable = is_broadcastable, To>; @@ -1226,47 +1226,57 @@ broadcast_like(const V& input, const R& other) { } namespace detail { +/** + * Convert vector of element type `T` and extent type `E` to vector of element type `T2` and extent type `E2`. + * Specialization exist for the cases where `T==T2` and/or `E==E2`. + */ template -struct convert_helper { +struct convert_impl { KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { using F = ops::cast; - vector_storage intermediate = - detail::apply_impl::call(F {}, input); + vector_storage intermediate; + detail::apply_impl::call(F {}, intermediate.data(), input.data()); return detail::broadcast_impl::call(intermediate); } }; +// T == T2, E == E2 template -struct convert_helper { +struct convert_impl { KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { return input; } }; +// T == T2, E != E2 template -struct convert_helper { +struct convert_impl { KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { return detail::broadcast_impl::call(input); } }; +// T != T2, E == E2 template -struct convert_helper { +struct convert_impl { KERNEL_FLOAT_INLINE static vector_storage call(vector_storage input) { using F = ops::cast; - return detail::apply_impl::call(F {}, input); + + vector_storage result; + detail::apply_impl::call(F {}, result.data(), input.data()); + return result; } }; } // namespace detail template KERNEL_FLOAT_INLINE vector_storage convert_storage(const V& input, extent new_size = {}) { - return detail::convert_helper, vector_extent_type, R, extent, M>:: - call(into_vector_storage(input)); + return detail::convert_impl, vector_extent_type, R, extent, M>::call( + into_vector_storage(input)); } /** @@ -1414,11 +1424,16 @@ KERNEL_FLOAT_INLINE zip_type zip(F fun, const L& left, const R& right) using B = vector_value_type; using O = result_t; using E = broadcast_vector_extent_type; + vector_storage result; - return detail::apply_impl::call( + detail::apply_impl::call( fun, - detail::broadcast_impl, E>::call(into_vector_storage(left)), - detail::broadcast_impl, E>::call(into_vector_storage(right))); + result.data(), + detail::broadcast_impl, E>::call(into_vector_storage(left)).data(), + detail::broadcast_impl, E>::call(into_vector_storage(right)) + .data()); + + return result; } template @@ -1434,9 +1449,9 @@ using zip_common_type = vector< * Example * ======= * ``` - * vec a = {1.0f, 2.0f, 3.0f}; + * vec a = {1.0f, 2.0f, 3.0f}; * vec b = {4, 5, 6}; - * vec c = zip_common([](float x, float y){ return x + y; }, a, b); // returns [5.0f, 7.0f, 9.0f] + * vec c = zip_common([](float x, float y){ return x + y; }, a, b); // returns [5.0f, 
7.0f, 9.0f] * ``` */ template @@ -1445,12 +1460,19 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co using O = result_t; using E = broadcast_vector_extent_type; - return detail::apply_impl::call( + vector_storage result; + + detail::apply_impl::call( fun, - detail::convert_helper, vector_extent_type, T, E>::call( - into_vector_storage(left)), - detail::convert_helper, vector_extent_type, T, E>::call( - into_vector_storage(right))); + result.data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(left)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(right)) + .data()); + + return result; } #define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR) \ @@ -1524,7 +1546,7 @@ static constexpr bool is_vector_assign_allowed = typename T, \ typename E, \ typename R, \ - typename = enabled_t>> \ + typename = enable_if_t>> \ KERNEL_FLOAT_INLINE vector& operator OP(vector& lhs, const R& rhs) { \ using F = ops::NAME; \ lhs = zip_common(F {}, lhs, rhs); \ @@ -1545,15 +1567,46 @@ KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(bit_xor, ^=) KERNEL_FLOAT_DEFINE_BINARY_FUN(min) KERNEL_FLOAT_DEFINE_BINARY_FUN(max) KERNEL_FLOAT_DEFINE_BINARY_FUN(copysign) -KERNEL_FLOAT_DEFINE_BINARY_FUN(hypot) KERNEL_FLOAT_DEFINE_BINARY_FUN(modf) KERNEL_FLOAT_DEFINE_BINARY_FUN(nextafter) KERNEL_FLOAT_DEFINE_BINARY_FUN(pow) KERNEL_FLOAT_DEFINE_BINARY_FUN(remainder) -#if KERNEL_FLOAT_CUDA_DEVICE -KERNEL_FLOAT_DEFINE_BINARY_FUN(rhypot) +KERNEL_FLOAT_DEFINE_BINARY(hypot, (ops::sqrt()(left * left + right * right))) +KERNEL_FLOAT_DEFINE_BINARY(rhypot, (T(1) / ops::hypot()(left, right))) + +namespace ops { +template<> +struct hypot { + KERNEL_FLOAT_INLINE double operator()(double left, double right) { + return ::hypot(left, right); + }; +}; + +template<> +struct hypot { + KERNEL_FLOAT_INLINE float operator()(float left, float right) { + return ::hypotf(left, right); + }; +}; + +// rhypot is only support on the GPU +#if KERNEL_FLOAT_IS_DEVICE +template<> +struct rhypot { + KERNEL_FLOAT_INLINE double operator()(double left, double right) { + return ::rhypot(left, right); + }; +}; + +template<> +struct rhypot { + KERNEL_FLOAT_INLINE float operator()(float left, float right) { + return ::rhypotf(left, right); + }; +}; #endif +}; // namespace ops #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \ @@ -1634,7 +1687,7 @@ struct bit_xor { namespace detail { template -struct cross_helper { +struct cross_impl { KERNEL_FLOAT_INLINE static vector> call(const vector_storage& av, const vector_storage& bv) { @@ -1660,9 +1713,9 @@ template< typename R, typename T = promoted_vector_value_type, typename = - enabled_t> && is_vector_broadcastable>>> + enable_if_t> && is_vector_broadcastable>>> KERNEL_FLOAT_INLINE vector> cross(const L& left, const R& right) { - return detail::cross_helper::call(convert_storage(left), convert_storage(right)); + return detail::cross_impl::call(convert_storage(left), convert_storage(right)); } } // namespace kernel_float @@ -1797,7 +1850,7 @@ void for_each(V&& input, F fun) { namespace detail { template -struct range_helper { +struct range_impl { KERNEL_FLOAT_INLINE static vector_storage call() { vector_storage result; @@ -1824,7 +1877,7 @@ struct range_helper { */ template KERNEL_FLOAT_INLINE vector> range() { - return detail::range_helper::call(); + return detail::range_impl::call(); } /** @@ -1839,7 +1892,7 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type 
range_like(const V& = {}) { - return detail::range_helper, vector_extent>::call(); + return detail::range_impl, vector_extent>::call(); } /** @@ -1864,14 +1917,14 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return detail::range_helper>::call(); + return detail::range_impl>::call(); } namespace detail { template, size_t N = vector_extent> -struct flatten_helper { - using value_type = typename flatten_helper::value_type; - static constexpr size_t size = N * flatten_helper::size; +struct flatten_impl { + using value_type = typename flatten_impl::value_type; + static constexpr size_t size = N * flatten_impl::size; template KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { @@ -1879,13 +1932,13 @@ struct flatten_helper { #pragma unroll for (size_t i = 0; i < N; i++) { - flatten_helper::call(output + flatten_helper::size * i, storage.data()[i]); + flatten_impl::call(output + flatten_impl::size * i, storage.data()[i]); } } }; template -struct flatten_helper { +struct flatten_impl { using value_type = T; static constexpr size_t size = 1; @@ -1902,10 +1955,10 @@ struct flatten_helper { } // namespace detail template -using flatten_value_type = typename detail::flatten_helper::value_type; +using flatten_value_type = typename detail::flatten_impl::value_type; template -static constexpr size_t flatten_size = detail::flatten_helper::size; +static constexpr size_t flatten_size = detail::flatten_impl::size; template using flatten_type = vector, extent>>; @@ -1923,13 +1976,13 @@ using flatten_type = vector, extent>>; template KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { vector_storage, flatten_size> output; - detail::flatten_helper::call(output.data(), input); + detail::flatten_impl::call(output.data(), input); return output; } namespace detail { template> -struct concat_base_helper { +struct concat_base_impl { static constexpr size_t size = vector_extent; KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { @@ -1942,7 +1995,7 @@ struct concat_base_helper { }; template -struct concat_base_helper { +struct concat_base_impl { static constexpr size_t size = 1; KERNEL_FLOAT_INLINE static void call(U* output, const T& input) { @@ -1951,7 +2004,7 @@ struct concat_base_helper { }; template -struct concat_base_helper { +struct concat_base_impl { static constexpr size_t size = 1; KERNEL_FLOAT_INLINE static void call(T* output, const T& input) { @@ -1960,24 +2013,23 @@ struct concat_base_helper { }; template -struct concat_helper {}; +struct concat_impl {}; template -struct concat_helper { +struct concat_impl { using value_type = - typename promote_type, typename concat_helper::value_type>:: - type; - static constexpr size_t size = concat_base_helper::size + concat_helper::size; + typename promote_type, typename concat_impl::value_type>::type; + static constexpr size_t size = concat_base_impl::size + concat_impl::size; template KERNEL_FLOAT_INLINE static void call(U* output, const V& input, const Vs&... 
rest) { - concat_base_helper::call(output, input); - concat_helper::call(output + concat_base_helper::size, rest...); + concat_base_impl::call(output, input); + concat_impl::call(output + concat_base_impl::size, rest...); } }; template<> -struct concat_helper<> { +struct concat_impl<> { using value_type = void; static constexpr size_t size = 1; @@ -1987,10 +2039,10 @@ struct concat_helper<> { } // namespace detail template -using concat_value_type = promote_t::value_type>; +using concat_value_type = promote_t::value_type>; template -static constexpr size_t concat_size = detail::concat_helper::size; +static constexpr size_t concat_size = detail::concat_impl::size; template using concat_type = vector, extent>>; @@ -2025,7 +2077,7 @@ using concat_type = vector, extent>> template KERNEL_FLOAT_INLINE concat_type concat(const Vs&... inputs) { vector_storage, concat_size> output; - detail::concat_helper::call(output.data(), inputs...); + detail::concat_impl::call(output.data(), inputs...); return output; } @@ -2052,7 +2104,7 @@ KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... in static constexpr size_t M = concat_size; vector_storage index_set; - detail::concat_helper::call(index_set.data(), indices...); + detail::concat_impl::call(index_set.data(), indices...); vector_storage inputs = into_vector_storage(input); vector_storage outputs; @@ -2346,19 +2398,18 @@ namespace kernel_float { namespace kernel_float { namespace detail { template -struct reduce_helper { - KERNEL_FLOAT_INLINE static T call(F fun, const vector_storage& input) { +struct reduce_impl { + KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { return call(fun, input, make_index_sequence {}); } private: template - KERNEL_FLOAT_INLINE static T - call(F fun, const vector_storage& input, index_sequence<0, Is...>) { - T result = input.data()[0]; + KERNEL_FLOAT_INLINE static T call(F fun, const T* input, index_sequence<0, Is...>) { + T result = input[0]; #pragma unroll for (size_t i = 1; i < N; i++) { - result = fun(result, input.data()[i]); + result = fun(result, input[i]); } return result; } @@ -2369,7 +2420,7 @@ struct reduce_helper { * Reduce the elements of the given vector ``input`` into a single value using * the function ``fun``. This function should be a binary function that takes * two elements and returns one element. The order in which the elements - * are reduced is not specified and depends on the reduction function and + * are reduced is not specified and depends on both the reduction function and * the vector type. 
* * Example @@ -2381,9 +2432,9 @@ struct reduce_helper { */ template KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { - return detail::reduce_helper, vector_value_type>::call( + return detail::reduce_impl, vector_value_type>::call( fun, - into_vector_storage(input)); + into_vector_storage(input).data()); } /** @@ -2482,10 +2533,17 @@ KERNEL_FLOAT_INLINE T count(const V& input) { namespace detail { template -struct dot_helper { +struct dot_impl { KERNEL_FLOAT_INLINE static T call(const vector_storage& left, const vector_storage& right) { - return sum(zip(ops::multiply {}, left, right)); + vector_storage intermediate; + detail::apply_impl, N, T, T, T>::call( + ops::multiply(), + intermediate.data(), + left.data(), + right.data()); + + return detail::reduce_impl, N, T>::call(ops::add(), intermediate.data()); } }; } // namespace detail @@ -2504,22 +2562,22 @@ struct dot_helper { template> KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { using E = broadcast_vector_extent_type; - return detail::dot_helper::call( + return detail::dot_impl::call( convert_storage(left, E {}), convert_storage(right, E {})); } namespace detail { template -struct magnitude_helper { +struct magnitude_impl { KERNEL_FLOAT_INLINE static T call(const vector_storage& input) { - return ops::sqrt {}(detail::dot_helper::call(input, input)); + return ops::sqrt {}(detail::dot_impl::call(input, input)); } }; template -struct magnitude_helper { +struct magnitude_impl { KERNEL_FLOAT_INLINE static T call(const vector_storage& input) { return T {}; @@ -2527,7 +2585,7 @@ struct magnitude_helper { }; template -struct magnitude_helper { +struct magnitude_impl { KERNEL_FLOAT_INLINE static T call(const vector_storage& input) { return ops::abs {}(input); @@ -2535,28 +2593,28 @@ struct magnitude_helper { }; template -struct magnitude_helper { +struct magnitude_impl { KERNEL_FLOAT_INLINE static T call(const vector_storage& input) { - return ops::hypot {}(input.data()[0], input.data()[1]); + return ops::hypot()(input.data()[0], input.data()[1]); } }; -// The 3-argument overload of hypot is only available from C++17 -#ifdef __cpp_lib_hypot +// The 3-argument overload of hypot is only available on host from C++17 +#if defined(__cpp_lib_hypot) && KERNEL_FLOAT_IS_HOST template<> -struct magnitude_helper { +struct magnitude_impl { KERNEL_FLOAT_INLINE static float call(const vector_storage& input) { - return std::hypot(input.data()[0], input.data()[1], input.data()[2]); + return ::hypot(input.data()[0], input.data()[1], input.data()[2]); } }; template<> -struct magnitude_helper { +struct magnitude_impl { KERNEL_FLOAT_INLINE static float call(const vector_storage& input) { - return std::hypot(input.data()[0], input.data()[1], input.data()[2]); + return ::hypot(input.data()[0], input.data()[1], input.data()[2]); } }; #endif @@ -2565,7 +2623,7 @@ struct magnitude_helper { /** * Compute the magnitude of the given input vector. This calculates the square root of the sum of squares, also - * known as the Euclidian norm of the vector. + * known as the Euclidian norm, of a vector. 
* * Example * ======= @@ -2576,7 +2634,7 @@ struct magnitude_helper { */ template> KERNEL_FLOAT_INLINE T mag(const V& input) { - return detail::magnitude_helper>::call(into_vector_storage(input)); + return detail::magnitude_impl>::call(into_vector_storage(input)); } } // namespace kernel_float @@ -2622,15 +2680,22 @@ template< typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { using F = ops::conditional; + vector_storage result; - return detail::apply_impl::call( + detail::apply_impl::call( F {}, - detail::convert_helper, vector_extent_type, bool, E>::call( - into_vector_storage(cond)), - detail::convert_helper, vector_extent_type, T, E>::call( - into_vector_storage(true_values)), - detail::convert_helper, vector_extent_type, T, E>::call( - into_vector_storage(false_values))); + result.data(), + detail::convert_impl, vector_extent_type, bool, E>::call( + into_vector_storage(cond)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(true_values)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(false_values)) + .data()); + + return result; } /** @@ -2700,15 +2765,22 @@ template< typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { using F = ops::fma; + vector_storage result; - return detail::apply_impl::call( + detail::apply_impl::call( F {}, - detail::convert_helper, vector_extent_type, T, E>::call( - into_vector_storage(a)), - detail::convert_helper, vector_extent_type, T, E>::call( - into_vector_storage(b)), - detail::convert_helper, vector_extent_type, T, E>::call( - into_vector_storage(c))); + result.data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(a)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(b)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(c)) + .data()); + + return result; } } // namespace kernel_float @@ -2755,11 +2827,11 @@ struct vector: public S { storage_type(detail::broadcast_impl, E>::call(input)) {} // For all other arguments, we convert it using `convert_storage` according to broadcast rules - template, T>, int> = 0> + template, T>, int> = 0> KERNEL_FLOAT_INLINE vector(U&& input) : storage_type(convert_storage(input, extent_type {})) {} - template, T>, int> = 0> + template, T>, int> = 0> KERNEL_FLOAT_INLINE explicit vector(U&& input) : storage_type(convert_storage(input, extent_type {})) {} @@ -2768,7 +2840,7 @@ struct vector: public S { typename A, typename B, typename... Rest, - typename = enabled_t> + typename = enable_if_t> KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... 
rest) : storage_type {T(a), T(b), T(rest)...} {} @@ -3099,63 +3171,55 @@ struct zip_halfx2 { template struct apply_impl { - KERNEL_FLOAT_INLINE static vector_storage<__half, N> - call(F fun, const vector_storage<__half, N>& input) { - vector_storage<__half, N> result; - + KERNEL_FLOAT_INLINE static void call(F fun, __half* result, const __half* input) { #pragma unroll - for (size_t i = 0; i + 2 <= N; i += 2) { - __half2 a = {input.data()[i], input.data()[i + 1]}; + for (size_t i = 0; 2 * i + 1 < N; i++) { + __half2 a = {input[2 * i], input[2 * i + 1]}; __half2 b = map_halfx2::call(fun, a); - result.data()[i + 0] = b.x; - result.data()[i + 1] = b.y; + result[2 * i + 0] = b.x; + result[2 * i + 1] = b.y; } if (N % 2 != 0) { - result.data()[N - 1] = fun(input.data()[N - 1]); + result[N - 1] = fun(input[N - 1]); } - - return result; } }; template struct apply_impl { - KERNEL_FLOAT_INLINE static vector_storage<__half, N> - call(F fun, const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { - vector_storage<__half, N> result; + KERNEL_FLOAT_INLINE static void + call(F fun, __half* result, const __half* left, const __half* right) { #pragma unroll - for (size_t i = 0; i + 2 <= N; i += 2) { - __half2 a = {left.data()[i], left.data()[i + 1]}; - __half2 b = {right.data()[i], right.data()[i + 1]}; + for (size_t i = 0; 2 * i + 1 < N; i++) { + __half2 a = {left[2 * i], left[2 * i + 1]}; + __half2 b = {right[2 * i], right[2 * i + 1]}; __half2 c = zip_halfx2::call(fun, a, b); - result.data()[i + 0] = c.x; - result.data()[i + 1] = c.y; + result[2 * i + 0] = c.x; + result[2 * i + 1] = c.y; } if (N % 2 != 0) { - result.data()[N - 1] = fun(left.data()[N - 1], right.data()[N - 1]); + result[N - 1] = fun(left[N - 1], right[N - 1]); } - - return result; } }; template -struct reduce_helper= 2)>> { - KERNEL_FLOAT_INLINE static __half call(F fun, const vector_storage<__half, N>& input) { - __half2 accum = {input.data()[0], input.data()[1]}; +struct reduce_impl= 2)>> { + KERNEL_FLOAT_INLINE static __half call(F fun, const __half* input) { + __half2 accum = {input[0], input[1]}; #pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __half2 a = {input.data()[i], input.data()[i + 1]}; + for (size_t i = 0; 2 * i + 1 < N; i++) { + __half2 a = {input[2 * i], input[2 * i + 1]}; accum = zip_halfx2::call(fun, accum, a); } __half result = fun(accum.x, accum.y); if (N % 2 != 0) { - result = fun(result, input.data()[N - 1]); + result = fun(result, input[N - 1]); } return result; @@ -3174,6 +3238,7 @@ struct reduce_helper= 2)>> { }; \ } +// There operations are not implemented in half precision, so they are forward to single precision KERNEL_FLOAT_FP16_UNARY_FORWARD(tan) KERNEL_FLOAT_FP16_UNARY_FORWARD(asin) KERNEL_FLOAT_FP16_UNARY_FORWARD(acos) @@ -3258,7 +3323,6 @@ KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2) KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div) KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2) KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2) - KERNEL_FLOAT_FP16_BINARY_FUN(fast_div, __hdiv, __h2div) KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2) @@ -3308,37 +3372,51 @@ using half = __half; #if KERNEL_FLOAT_IS_DEVICE namespace detail { +template<> +struct dot_impl<__half, 0> { + KERNEL_FLOAT_INLINE + static __half + call(const vector_storage<__half, 0>& left, const vector_storage<__half, 0>& right) { + return __half(0); + } +}; + +template<> +struct dot_impl<__half, 1> { + KERNEL_FLOAT_INLINE + static __half + call(const vector_storage<__half, 1>& left, const 
vector_storage<__half, 1>& right) { + return __hmul(left.data()[0], right.data()[0]); + } +}; + template -struct dot_helper<__half, N> { +struct dot_impl<__half, N> { + static_assert(N >= 2, "internal error"); + KERNEL_FLOAT_INLINE static __half call(const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { - if (N == 0) { - return __half(0); - } else if (N == 1) { - return __hmul(left.data()[0], right.data()[0]); - } else { - __half2 first_a = {left.data()[0], left.data()[1]}; - __half2 first_b = {right.data()[0], right.data()[1]}; - __half2 accum = __hmul2(first_a, first_b); + __half2 first_a = {left.data()[0], left.data()[1]}; + __half2 first_b = {right.data()[0], right.data()[1]}; + __half2 accum = __hmul2(first_a, first_b); #pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __half2 a = {left.data()[i], left.data()[i + 1]}; - __half2 b = {right.data()[i], right.data()[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __half result = __hadd(accum.x, accum.y); + for (size_t i = 2; i + 2 <= N; i += 2) { + __half2 a = {left.data()[i], left.data()[i + 1]}; + __half2 b = {right.data()[i], right.data()[i + 1]}; + accum = __hfma2(a, b, accum); + } - if (N % 2 != 0) { - __half a = left.data()[N - 1]; - __half b = right.data()[N - 1]; - result = __hfma(a, b, result); - } + __half result = __hadd(accum.x, accum.y); - return result; + if (N % 2 != 0) { + __half a = left.data()[N - 1]; + __half b = right.data()[N - 1]; + result = __hfma(a, b, result); } + + return result; } }; } // namespace detail @@ -3400,66 +3478,55 @@ struct zip_bfloat16x2 { template struct apply_impl { - KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N> - call(F fun, const vector_storage<__nv_bfloat16, N>& input) { - vector_storage<__nv_bfloat16, N> result; - + KERNEL_FLOAT_INLINE static void call(F fun, __nv_bfloat16* result, const __nv_bfloat16* input) { #pragma unroll - for (size_t i = 0; i + 2 <= N; i += 2) { - __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]}; + for (size_t i = 0; 2 * i + 1 < N; i++) { + __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]}; __nv_bfloat162 b = map_bfloat16x2::call(fun, a); - result.data()[i + 0] = b.x; - result.data()[i + 1] = b.y; + result[2 * i + 0] = b.x; + result[2 * i + 1] = b.y; } if (N % 2 != 0) { - result.data()[N - 1] = fun(input.data()[N - 1]); + result[N - 1] = fun(input[N - 1]); } - - return result; } }; template struct apply_impl { - KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N> call( - F fun, - const vector_storage<__nv_bfloat16, N>& left, - const vector_storage<__nv_bfloat16, N>& right) { - vector_storage<__nv_bfloat16, N> result; + KERNEL_FLOAT_INLINE static void + call(F fun, __nv_bfloat16* result, const __nv_bfloat16* left, const __nv_bfloat16* right) { #pragma unroll - for (size_t i = 0; i + 2 <= N; i += 2) { - __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; - __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; + for (size_t i = 0; 2 * i + 1 < N; i++) { + __nv_bfloat162 a = {left[2 * i], left[2 * i + 1]}; + __nv_bfloat162 b = {right[2 * i], right[2 * i + 1]}; __nv_bfloat162 c = zip_bfloat16x2::call(fun, a, b); - result.data()[i + 0] = c.x; - result.data()[i + 1] = c.y; + result[2 * i + 0] = c.x; + result[2 * i + 1] = c.y; } if (N % 2 != 0) { - result.data()[N - 1] = fun(left.data()[N - 1], right.data()[N - 1]); + result[N - 1] = fun(left[N - 1], right[N - 1]); } - - return result; } }; template -struct reduce_helper= 2)>> { - KERNEL_FLOAT_INLINE static __nv_bfloat16 - call(F fun, const 
vector_storage<__nv_bfloat16, N>& input) { - __nv_bfloat162 accum = {input.data()[0], input.data()[1]}; +struct reduce_impl= 2)>> { + KERNEL_FLOAT_INLINE static __nv_bfloat16 call(F fun, const __nv_bfloat16* input) { + __nv_bfloat162 accum = {input[0], input[1]}; #pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]}; + for (size_t i = 0; 2 * i + 1 < N; i++) { + __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]}; accum = zip_bfloat16x2::call(fun, accum, a); } __nv_bfloat16 result = fun(accum.x, accum.y); if (N % 2 != 0) { - result = fun(result, input.data()[N - 1]); + result = fun(result, input[N - 1]); } return result; @@ -3477,6 +3544,7 @@ struct reduce_helper= 2)>> { }; \ } +// There operations are not implemented in half precision, so they are forward to single precision KERNEL_FLOAT_BF16_UNARY_FORWARD(tan) KERNEL_FLOAT_BF16_UNARY_FORWARD(asin) KERNEL_FLOAT_BF16_UNARY_FORWARD(acos) @@ -3594,32 +3662,22 @@ KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2) KERNEL_FLOAT_BF16_CAST(double, __double2bfloat16(input), double(__bfloat162float(input))); KERNEL_FLOAT_BF16_CAST(float, __float2bfloat16(input), __bfloat162float(input)); +// clang-format off // there are no official char casts. Instead, cast to int and then to char KERNEL_FLOAT_BF16_CAST(char, __int2bfloat16_rn(input), (char)__bfloat162int_rz(input)); -KERNEL_FLOAT_BF16_CAST( - signed char, - __int2bfloat16_rn(input), - (signed char)__bfloat162int_rz(input)); -KERNEL_FLOAT_BF16_CAST( - unsigned char, - __int2bfloat16_rn(input), - (unsigned char)__bfloat162int_rz(input)); +KERNEL_FLOAT_BF16_CAST(signed char, __int2bfloat16_rn(input), (signed char)__bfloat162int_rz(input)); +KERNEL_FLOAT_BF16_CAST(unsigned char, __int2bfloat16_rn(input), (unsigned char)__bfloat162int_rz(input)); KERNEL_FLOAT_BF16_CAST(signed short, __bfloat162short_rz(input), __short2bfloat16_rn(input)); KERNEL_FLOAT_BF16_CAST(signed int, __bfloat162int_rz(input), __int2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST( - signed long, - __ll2bfloat16_rn(input), - (signed long)(__bfloat162ll_rz(input))); +KERNEL_FLOAT_BF16_CAST(signed long, __ll2bfloat16_rn(input), (signed long)(__bfloat162ll_rz(input))); KERNEL_FLOAT_BF16_CAST(signed long long, __ll2bfloat16_rn(input), __bfloat162ll_rz(input)); KERNEL_FLOAT_BF16_CAST(unsigned short, __bfloat162ushort_rz(input), __ushort2bfloat16_rn(input)); KERNEL_FLOAT_BF16_CAST(unsigned int, __bfloat162uint_rz(input), __uint2bfloat16_rn(input)); -KERNEL_FLOAT_BF16_CAST( - unsigned long, - __ull2bfloat16_rn(input), - (unsigned long)(__bfloat162ull_rz(input))); +KERNEL_FLOAT_BF16_CAST(unsigned long, __ull2bfloat16_rn(input), (unsigned long)(__bfloat162ull_rz(input))); KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input)); +// clang-format on using bfloat16 = __nv_bfloat16; //KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) @@ -3627,38 +3685,54 @@ using bfloat16 = __nv_bfloat16; #if KERNEL_FLOAT_IS_DEVICE namespace detail { +template<> +struct dot_impl<__nv_bfloat16, 0> { + KERNEL_FLOAT_INLINE + static __nv_bfloat16 call( + const vector_storage<__nv_bfloat16, 0>& left, + const vector_storage<__nv_bfloat16, 0>& right) { + return __nv_bfloat16(0); + } +}; + +template<> +struct dot_impl<__nv_bfloat16, 1> { + KERNEL_FLOAT_INLINE + static __nv_bfloat16 call( + const vector_storage<__nv_bfloat16, 1>& left, + const vector_storage<__nv_bfloat16, 1>& right) { + return __hmul(left.data()[0], right.data()[0]); + } +}; + template -struct 
dot_helper<__nv_bfloat16, N> { +struct dot_impl<__nv_bfloat16, N> { + static_assert(N >= 2, "internal error"); + KERNEL_FLOAT_INLINE static __nv_bfloat16 call( const vector_storage<__nv_bfloat16, N>& left, const vector_storage<__nv_bfloat16, N>& right) { - if (N == 0) { - return __nv_bfloat16(0); - } else if (N == 1) { - return __hmul(left.data()[0], right.data()[0]); - } else { - __nv_bfloat162 first_a = {left.data()[0], left.data()[1]}; - __nv_bfloat162 first_b = {right.data()[0], right.data()[1]}; - __nv_bfloat162 accum = __hmul2(first_a, first_b); + __nv_bfloat162 first_a = {left.data()[0], left.data()[1]}; + __nv_bfloat162 first_b = {right.data()[0], right.data()[1]}; + __nv_bfloat162 accum = __hmul2(first_a, first_b); #pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; - __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __nv_bfloat16 result = __hadd(accum.x, accum.y); + for (size_t i = 2; i + 2 <= N; i += 2) { + __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; + __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; + accum = __hfma2(a, b, accum); + } - if (N % 2 != 0) { - __nv_bfloat16 a = left.data()[N - 1]; - __nv_bfloat16 b = right.data()[N - 1]; - result = __hfma(a, b, result); - } + __nv_bfloat16 result = __hadd(accum.x, accum.y); - return result; + if (N % 2 != 0) { + __nv_bfloat16 a = left.data()[N - 1]; + __nv_bfloat16 b = right.data()[N - 1]; + result = __hfma(a, b, result); } + + return result; } }; } // namespace detail diff --git a/tests/common.h b/tests/common.h index fb60624..d3fa1b8 100644 --- a/tests/common.h +++ b/tests/common.h @@ -33,6 +33,13 @@ static __host__ __device__ void __assertion_failed(const char* expr, const char* } \ } while (0) +#define ASSERT_EQ(A, B) ASSERT(equals(A, B)) +#define ASSERT_APPROX(A, B) ASSERT(approx(A, B)) + +#define ASSERT_ALL(E) ASSERT((E) && ...) 
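This family of fold macros (the two variants follow just below) assumes a template parameter pack such as `size_t... I` is in scope at the expansion site, so that `(E) && ...` folds over every index. A minimal sketch, assuming the `kf` alias used throughout the tests:

    template<typename T, size_t... I>
    __host__ __device__ void check_zero(kf::vec<T, sizeof...(I)> v) {
        ASSERT_ALL(equals(v[I], T(0)));  // expands to ASSERT((equals(v[I], T(0))) && ...)
    }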
+#define ASSERT_EQ_ALL(A, B) ASSERT_ALL(equals(A, B)) +#define ASSERT_APPROX_ALL(A, B) ASSERT_ALL(approx(A, B)) + namespace detail { template struct equals_helper { @@ -44,14 +51,14 @@ struct equals_helper { template<> struct equals_helper { static __host__ __device__ bool call(const double& left, const double& right) { - return (isnan(left) && isnan(right)) || (isinf(left) && isinf(right)) || (left == right); + return (isnan(left) && isnan(right)) || (left == right); } }; template<> struct equals_helper { static __host__ __device__ bool call(const float& left, const float& right) { - return (isnan(left) && isnan(right)) || (isinf(left) && isinf(right)) || (left == right); + return (isnan(left) && isnan(right)) || (left == right); } }; @@ -76,6 +83,49 @@ __host__ __device__ bool equals(const T& left, const T& right) { return detail::equals_helper::call(left, right); } +namespace detail { +template +struct approx_helper { + static __host__ __device__ bool call(const T& left, const T& right) { + return equals_helper::call(left, right); + } +}; + +template<> +struct approx_helper { + static __host__ __device__ bool call(double left, double right, double threshold = 1e-8) { + return equals_helper::call(left, right) + || ::fabs(left - right) < threshold * ::fabs(left); + } +}; + +template<> +struct approx_helper { + static __host__ __device__ bool call(float left, float right) { + return approx_helper::call(double(left), double(right), 1e-4); + } +}; + +template<> +struct approx_helper<__half> { + static __host__ __device__ bool call(__half left, __half right) { + return approx_helper::call(double(left), double(right), 0.01); + } +}; + +template<> +struct approx_helper<__nv_bfloat16> { + static __host__ __device__ bool call(__nv_bfloat16 left, __nv_bfloat16 right) { + return approx_helper::call(double(left), double(right), 0.05); + } +}; +} // namespace detail + +template +__host__ __device__ bool approx(const T& left, const T& right) { + return detail::approx_helper::call(left, right); +} + namespace detail { template struct is_one_of_helper; diff --git a/tests/reduce.cu b/tests/reduce.cu new file mode 100644 index 0000000..73a6752 --- /dev/null +++ b/tests/reduce.cu @@ -0,0 +1,170 @@ +#include "common.h" + +struct reduction_tests { + template + __host__ __device__ void operator()(generator gen) { + // TODO: these tests do not consider special numbers: NaN, -Inf, +Inf, and -0.0 + + { + kf::vec a; + ASSERT_APPROX(kf::min(a), T(0)); + ASSERT_APPROX(kf::max(a), T(0)); + ASSERT_APPROX(kf::sum(a), T(0)); + ASSERT_APPROX(kf::product(a), T(0)); + ASSERT_EQ(kf::all(a), false); + ASSERT_EQ(kf::any(a), false); + ASSERT_EQ(kf::count(a), 0); + + a = {T(1)}; + ASSERT_APPROX(kf::min(a), T(1)); + ASSERT_APPROX(kf::max(a), T(1)); + ASSERT_APPROX(kf::sum(a), T(1)); + ASSERT_APPROX(kf::product(a), T(1)); + ASSERT_EQ(kf::all(a), true); + ASSERT_EQ(kf::any(a), true); + ASSERT_EQ(kf::count(a), 1); + + a = {T(5)}; + ASSERT_APPROX(kf::min(a), T(5)); + ASSERT_APPROX(kf::max(a), T(5)); + ASSERT_APPROX(kf::sum(a), T(5)); + ASSERT_APPROX(kf::product(a), T(5)); + ASSERT_EQ(kf::all(a), true); + ASSERT_EQ(kf::any(a), true); + ASSERT_EQ(kf::count(a), 1); + } + + { + kf::vec a = {T(0), T(0)}; + ASSERT_APPROX(kf::min(a), T(0)); + ASSERT_APPROX(kf::max(a), T(0)); + ASSERT_APPROX(kf::sum(a), T(0)); + ASSERT_APPROX(kf::product(a), T(0)); + ASSERT_EQ(kf::all(a), false); + ASSERT_EQ(kf::any(a), false); + ASSERT_EQ(kf::count(a), 0); + + a = {T(5), T(0)}; + ASSERT_APPROX(kf::min(a), T(0)); + ASSERT_APPROX(kf::max(a), T(5)); + 
ASSERT_APPROX(kf::sum(a), T(5)); + ASSERT_APPROX(kf::product(a), T(0)); + ASSERT_EQ(kf::all(a), false); + ASSERT_EQ(kf::any(a), true); + ASSERT_EQ(kf::count(a), 1); + + a = {T(5), T(-3)}; + ASSERT_APPROX(kf::min(a), T(-3)); + ASSERT_APPROX(kf::max(a), T(5)); + ASSERT_APPROX(kf::sum(a), T(2)); + ASSERT_APPROX(kf::product(a), T(-15)); + ASSERT_EQ(kf::all(a), true); + ASSERT_EQ(kf::any(a), true); + ASSERT_EQ(kf::count(a), 2); + } + + { + kf::vec a; + ASSERT_APPROX(kf::min(a), T(0)); + ASSERT_APPROX(kf::max(a), T(0)); + ASSERT_APPROX(kf::sum(a), T(0)); + ASSERT_APPROX(kf::product(a), T(0)); + ASSERT_EQ(kf::all(a), false); + ASSERT_EQ(kf::any(a), false); + ASSERT_EQ(kf::count(a), 0); + + a = {T(5), T(0), T(-1)}; + ASSERT_APPROX(kf::min(a), T(-1)); + ASSERT_APPROX(kf::max(a), T(5)); + ASSERT_APPROX(kf::sum(a), T(4)); + ASSERT_APPROX(kf::product(a), T(0)); + ASSERT_EQ(kf::all(a), false); + ASSERT_EQ(kf::any(a), true); + ASSERT_EQ(kf::count(a), 2); + + a = {T(5), T(-3), T(1)}; + ASSERT_APPROX(kf::min(a), T(-3)); + ASSERT_APPROX(kf::max(a), T(5)); + ASSERT_APPROX(kf::sum(a), T(3)); + ASSERT_APPROX(kf::product(a), T(-15)); + ASSERT_EQ(kf::all(a), true); + ASSERT_EQ(kf::any(a), true); + ASSERT_EQ(kf::count(a), 3); + } + + { + kf::vec a; + ASSERT_APPROX(kf::min(a), T(0)); + ASSERT_APPROX(kf::max(a), T(0)); + ASSERT_APPROX(kf::sum(a), T(0)); + ASSERT_APPROX(kf::product(a), T(0)); + ASSERT_EQ(kf::all(a), false); + ASSERT_EQ(kf::any(a), false); + ASSERT_EQ(kf::count(a), 0); + + a = {T(5), T(0), T(-1), T(0)}; + ASSERT_APPROX(kf::min(a), T(-1)); + ASSERT_APPROX(kf::max(a), T(5)); + ASSERT_APPROX(kf::sum(a), T(4)); + ASSERT_APPROX(kf::product(a), T(0)); + ASSERT_EQ(kf::all(a), false); + ASSERT_EQ(kf::any(a), true); + ASSERT_EQ(kf::count(a), 2); + + a = {T(5), T(-3), T(1), T(-2)}; + ASSERT_APPROX(kf::min(a), T(-3)); + ASSERT_APPROX(kf::max(a), T(5)); + ASSERT_APPROX(kf::sum(a), T(1)); + ASSERT_APPROX(kf::product(a), T(30)); + ASSERT_EQ(kf::all(a), true); + ASSERT_EQ(kf::any(a), true); + ASSERT_EQ(kf::count(a), 4); + } + } +}; + +REGISTER_TEST_CASE("reductions", reduction_tests, int, float, double) +REGISTER_TEST_CASE_GPU("reductions", reduction_tests, __half, __nv_bfloat16) + +struct dot_mag_tests { + template + __host__ __device__ void operator()(generator gen) { + { + kf::vec a = {-1}; + kf::vec b = {2}; + ASSERT_APPROX(kf::dot(a, b), T(-2)); + ASSERT_APPROX(kf::mag(a), T(1)); + } + + { + kf::vec a = {3, -4}; + kf::vec b = {2, 1}; + ASSERT_APPROX(kf::dot(a, b), T(2)); + ASSERT_APPROX(kf::mag(a), T(5)); + } + + { + kf::vec a = {2, -3, 6}; + kf::vec b = {2, -1, 3}; + ASSERT_APPROX(kf::dot(a, b), T(25)); + ASSERT_APPROX(kf::mag(a), T(7)); + } + + { + kf::vec a = {2, -4, 5, 6}; + kf::vec b = {2, 1, -3, 1}; + ASSERT_APPROX(kf::dot(a, b), T(-9)); + ASSERT_APPROX(kf::mag(a), T(9)); + } + + { + kf::vec a = {1, -3, 4, 5, 7}; + kf::vec b = {2, 0, 1, -1, 2}; + ASSERT_APPROX(kf::dot(a, b), T(15)); + ASSERT_APPROX(kf::mag(a), T(10)); + } + } +}; + +REGISTER_TEST_CASE("dot product/magnitude", dot_mag_tests, float, double) +REGISTER_TEST_CASE_GPU("dot product/magnitude", dot_mag_tests, __half, __nv_bfloat16) From 3f3edaa91d22b082f0b217cd3a541e48f409acfc Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 19 Sep 2023 20:45:45 +0200 Subject: [PATCH 44/50] Rewrite `magnitude_impl` and `dot_impl` to take direct pointers instead of `vector_storage` --- include/kernel_float/bf16.h | 28 +++++------ include/kernel_float/fp16.h | 23 ++++----- include/kernel_float/reduce.h | 34 +++++++------ single_include/kernel_float.h | 89 
+++++++++++++++-------------------- 4 files changed, 76 insertions(+), 98 deletions(-) diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 3eb587e..608fddd 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -259,9 +259,7 @@ namespace detail { template<> struct dot_impl<__nv_bfloat16, 0> { KERNEL_FLOAT_INLINE - static __nv_bfloat16 call( - const vector_storage<__nv_bfloat16, 0>& left, - const vector_storage<__nv_bfloat16, 0>& right) { + static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { return __nv_bfloat16(0); } }; @@ -269,10 +267,8 @@ struct dot_impl<__nv_bfloat16, 0> { template<> struct dot_impl<__nv_bfloat16, 1> { KERNEL_FLOAT_INLINE - static __nv_bfloat16 call( - const vector_storage<__nv_bfloat16, 1>& left, - const vector_storage<__nv_bfloat16, 1>& right) { - return __hmul(left.data()[0], right.data()[0]); + static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { + return __hmul(left[0], right[0]); } }; @@ -281,25 +277,23 @@ struct dot_impl<__nv_bfloat16, N> { static_assert(N >= 2, "internal error"); KERNEL_FLOAT_INLINE - static __nv_bfloat16 call( - const vector_storage<__nv_bfloat16, N>& left, - const vector_storage<__nv_bfloat16, N>& right) { - __nv_bfloat162 first_a = {left.data()[0], left.data()[1]}; - __nv_bfloat162 first_b = {right.data()[0], right.data()[1]}; + static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { + __nv_bfloat162 first_a = {left[0], left[1]}; + __nv_bfloat162 first_b = {right[0], right[1]}; __nv_bfloat162 accum = __hmul2(first_a, first_b); #pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; - __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; + for (size_t i = 2; i + 1 < N; i += 2) { + __nv_bfloat162 a = {left[i], left[i + 1]}; + __nv_bfloat162 b = {right[i], right[i + 1]}; accum = __hfma2(a, b, accum); } __nv_bfloat16 result = __hadd(accum.x, accum.y); if (N % 2 != 0) { - __nv_bfloat16 a = left.data()[N - 1]; - __nv_bfloat16 b = right.data()[N - 1]; + __nv_bfloat16 a = left[N - 1]; + __nv_bfloat16 b = right[N - 1]; result = __hfma(a, b, result); } diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index e939341..abceb05 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -251,8 +251,7 @@ namespace detail { template<> struct dot_impl<__half, 0> { KERNEL_FLOAT_INLINE - static __half - call(const vector_storage<__half, 0>& left, const vector_storage<__half, 0>& right) { + static __half call(const __half* left, const __half* right) { return __half(0); } }; @@ -260,9 +259,8 @@ struct dot_impl<__half, 0> { template<> struct dot_impl<__half, 1> { KERNEL_FLOAT_INLINE - static __half - call(const vector_storage<__half, 1>& left, const vector_storage<__half, 1>& right) { - return __hmul(left.data()[0], right.data()[0]); + static __half call(const __half* left, const __half* right) { + return __hmul(left[0], right[0]); } }; @@ -271,24 +269,23 @@ struct dot_impl<__half, N> { static_assert(N >= 2, "internal error"); KERNEL_FLOAT_INLINE - static __half - call(const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { - __half2 first_a = {left.data()[0], left.data()[1]}; - __half2 first_b = {right.data()[0], right.data()[1]}; + static __half call(const __half* left, const __half* right) { + __half2 first_a = {left[0], left[1]}; + __half2 first_b = {right[0], right[1]}; __half2 accum = 
__hmul2(first_a, first_b); #pragma unroll for (size_t i = 2; i + 2 <= N; i += 2) { - __half2 a = {left.data()[i], left.data()[i + 1]}; - __half2 b = {right.data()[i], right.data()[i + 1]}; + __half2 a = {left[i], left[i + 1]}; + __half2 b = {right[i], right[i + 1]}; accum = __hfma2(a, b, accum); } __half result = __hadd(accum.x, accum.y); if (N % 2 != 0) { - __half a = left.data()[N - 1]; - __half b = right.data()[N - 1]; + __half a = left[N - 1]; + __half b = right[N - 1]; result = __hfma(a, b, result); } diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 8cc5362..dfa52c3 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -143,13 +143,13 @@ namespace detail { template struct dot_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& left, const vector_storage& right) { + static T call(const T* left, const T* right) { vector_storage intermediate; detail::apply_impl, N, T, T, T>::call( ops::multiply(), intermediate.data(), - left.data(), - right.data()); + left, + right); return detail::reduce_impl, N, T>::call(ops::add(), intermediate.data()); } @@ -171,15 +171,15 @@ template> KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { using E = broadcast_vector_extent_type; return detail::dot_impl::call( - convert_storage(left, E {}), - convert_storage(right, E {})); + convert_storage(left, E {}).data(), + convert_storage(right, E {}).data()); } namespace detail { template struct magnitude_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& input) { + static T call(const T* input) { return ops::sqrt {}(detail::dot_impl::call(input, input)); } }; @@ -187,7 +187,7 @@ struct magnitude_impl { template struct magnitude_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& input) { + static T call(const T* input) { return T {}; } }; @@ -195,16 +195,16 @@ struct magnitude_impl { template struct magnitude_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& input) { - return ops::abs {}(input); + static T call(const T* input) { + return ops::abs {}(input[0]); } }; template struct magnitude_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& input) { - return ops::hypot()(input.data()[0], input.data()[1]); + static T call(const T* input) { + return ops::hypot()(input[0], input[1]); } }; @@ -212,17 +212,15 @@ struct magnitude_impl { #if defined(__cpp_lib_hypot) && KERNEL_FLOAT_IS_HOST template<> struct magnitude_impl { - KERNEL_FLOAT_INLINE - static float call(const vector_storage& input) { - return ::hypot(input.data()[0], input.data()[1], input.data()[2]); + static float call(const float* input) { + return ::hypot(input[0], input[1], input[2]); } }; template<> struct magnitude_impl { - KERNEL_FLOAT_INLINE - static float call(const vector_storage& input) { - return ::hypot(input.data()[0], input.data()[1], input.data()[2]); + static double call(const double* input) { + return ::hypot(input[0], input[1], input[2]); } }; #endif @@ -242,7 +240,7 @@ struct magnitude_impl { */ template> KERNEL_FLOAT_INLINE T mag(const V& input) { - return detail::magnitude_impl>::call(into_vector_storage(input)); + return detail::magnitude_impl>::call(into_vector_storage(input).data()); } } // namespace kernel_float diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index b371866..2052d96 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file 
has been auto-generated, do not modify its contents! -// date: 2023-09-19 20:34:16.094065 -// git hash: 7acff4cc2414483de4162d0b47453422f6ebe215 +// date: 2023-09-19 20:45:16.880746 +// git hash: da0a46b533ef9d25638748eb951284f14e7c48bb //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -2535,13 +2535,13 @@ namespace detail { template struct dot_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& left, const vector_storage& right) { + static T call(const T* left, const T* right) { vector_storage intermediate; detail::apply_impl, N, T, T, T>::call( ops::multiply(), intermediate.data(), - left.data(), - right.data()); + left, + right); return detail::reduce_impl, N, T>::call(ops::add(), intermediate.data()); } @@ -2563,15 +2563,15 @@ template> KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { using E = broadcast_vector_extent_type; return detail::dot_impl::call( - convert_storage(left, E {}), - convert_storage(right, E {})); + convert_storage(left, E {}).data(), + convert_storage(right, E {}).data()); } namespace detail { template struct magnitude_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& input) { + static T call(const T* input) { return ops::sqrt {}(detail::dot_impl::call(input, input)); } }; @@ -2579,7 +2579,7 @@ struct magnitude_impl { template struct magnitude_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& input) { + static T call(const T* input) { return T {}; } }; @@ -2587,16 +2587,16 @@ struct magnitude_impl { template struct magnitude_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& input) { - return ops::abs {}(input); + static T call(const T* input) { + return ops::abs {}(input[0]); } }; template struct magnitude_impl { KERNEL_FLOAT_INLINE - static T call(const vector_storage& input) { - return ops::hypot()(input.data()[0], input.data()[1]); + static T call(const T* input) { + return ops::hypot()(input[0], input[1]); } }; @@ -2604,17 +2604,15 @@ struct magnitude_impl { #if defined(__cpp_lib_hypot) && KERNEL_FLOAT_IS_HOST template<> struct magnitude_impl { - KERNEL_FLOAT_INLINE - static float call(const vector_storage& input) { - return ::hypot(input.data()[0], input.data()[1], input.data()[2]); + static float call(const float* input) { + return ::hypot(input[0], input[1], input[2]); } }; template<> struct magnitude_impl { - KERNEL_FLOAT_INLINE - static float call(const vector_storage& input) { - return ::hypot(input.data()[0], input.data()[1], input.data()[2]); + static double call(const double* input) { + return ::hypot(input[0], input[1], input[2]); } }; #endif @@ -2634,7 +2632,7 @@ struct magnitude_impl { */ template> KERNEL_FLOAT_INLINE T mag(const V& input) { - return detail::magnitude_impl>::call(into_vector_storage(input)); + return detail::magnitude_impl>::call(into_vector_storage(input).data()); } } // namespace kernel_float @@ -3375,8 +3373,7 @@ namespace detail { template<> struct dot_impl<__half, 0> { KERNEL_FLOAT_INLINE - static __half - call(const vector_storage<__half, 0>& left, const vector_storage<__half, 0>& right) { + static __half call(const __half* left, const __half* right) { return __half(0); } }; @@ -3384,9 +3381,8 @@ struct dot_impl<__half, 0> { template<> struct dot_impl<__half, 1> { KERNEL_FLOAT_INLINE - static __half - call(const vector_storage<__half, 1>& left, const vector_storage<__half, 1>& right) { - return __hmul(left.data()[0], right.data()[0]); + static __half call(const __half* left, const __half* right) 
{ + return __hmul(left[0], right[0]); } }; @@ -3395,24 +3391,23 @@ struct dot_impl<__half, N> { static_assert(N >= 2, "internal error"); KERNEL_FLOAT_INLINE - static __half - call(const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) { - __half2 first_a = {left.data()[0], left.data()[1]}; - __half2 first_b = {right.data()[0], right.data()[1]}; + static __half call(const __half* left, const __half* right) { + __half2 first_a = {left[0], left[1]}; + __half2 first_b = {right[0], right[1]}; __half2 accum = __hmul2(first_a, first_b); #pragma unroll for (size_t i = 2; i + 2 <= N; i += 2) { - __half2 a = {left.data()[i], left.data()[i + 1]}; - __half2 b = {right.data()[i], right.data()[i + 1]}; + __half2 a = {left[i], left[i + 1]}; + __half2 b = {right[i], right[i + 1]}; accum = __hfma2(a, b, accum); } __half result = __hadd(accum.x, accum.y); if (N % 2 != 0) { - __half a = left.data()[N - 1]; - __half b = right.data()[N - 1]; + __half a = left[N - 1]; + __half b = right[N - 1]; result = __hfma(a, b, result); } @@ -3688,9 +3683,7 @@ namespace detail { template<> struct dot_impl<__nv_bfloat16, 0> { KERNEL_FLOAT_INLINE - static __nv_bfloat16 call( - const vector_storage<__nv_bfloat16, 0>& left, - const vector_storage<__nv_bfloat16, 0>& right) { + static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { return __nv_bfloat16(0); } }; @@ -3698,10 +3691,8 @@ struct dot_impl<__nv_bfloat16, 0> { template<> struct dot_impl<__nv_bfloat16, 1> { KERNEL_FLOAT_INLINE - static __nv_bfloat16 call( - const vector_storage<__nv_bfloat16, 1>& left, - const vector_storage<__nv_bfloat16, 1>& right) { - return __hmul(left.data()[0], right.data()[0]); + static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { + return __hmul(left[0], right[0]); } }; @@ -3710,25 +3701,23 @@ struct dot_impl<__nv_bfloat16, N> { static_assert(N >= 2, "internal error"); KERNEL_FLOAT_INLINE - static __nv_bfloat16 call( - const vector_storage<__nv_bfloat16, N>& left, - const vector_storage<__nv_bfloat16, N>& right) { - __nv_bfloat162 first_a = {left.data()[0], left.data()[1]}; - __nv_bfloat162 first_b = {right.data()[0], right.data()[1]}; + static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { + __nv_bfloat162 first_a = {left[0], left[1]}; + __nv_bfloat162 first_b = {right[0], right[1]}; __nv_bfloat162 accum = __hmul2(first_a, first_b); #pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]}; - __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]}; + for (size_t i = 2; i + 1 < N; i += 2) { + __nv_bfloat162 a = {left[i], left[i + 1]}; + __nv_bfloat162 b = {right[i], right[i + 1]}; accum = __hfma2(a, b, accum); } __nv_bfloat16 result = __hadd(accum.x, accum.y); if (N % 2 != 0) { - __nv_bfloat16 a = left.data()[N - 1]; - __nv_bfloat16 b = right.data()[N - 1]; + __nv_bfloat16 a = left[N - 1]; + __nv_bfloat16 b = right[N - 1]; result = __hfma(a, b, result); } From 07af0ad9ff5c16595790d579577244bc482f0999 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 20 Sep 2023 09:42:53 +0200 Subject: [PATCH 45/50] Rename `into_vector_traits` to `into_vector_impl` --- include/kernel_float/base.h | 30 +++++++++++++++--------------- include/kernel_float/bf16.h | 2 +- include/kernel_float/fp16.h | 2 +- include/kernel_float/vector.h | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h index 50566be..b3edb20 100644 --- 
a/include/kernel_float/base.h +++ b/include/kernel_float/base.h @@ -173,7 +173,7 @@ struct extent { }; template -struct into_vector_traits { +struct into_vector_impl { using value_type = T; using extent_type = extent<1>; @@ -184,7 +184,7 @@ struct into_vector_traits { }; template -struct into_vector_traits { +struct into_vector_impl { using value_type = T; using extent_type = extent; @@ -202,19 +202,19 @@ struct into_vector_traits { }; template -struct into_vector_traits: into_vector_traits {}; +struct into_vector_impl: into_vector_impl {}; template -struct into_vector_traits: into_vector_traits {}; +struct into_vector_impl: into_vector_impl {}; template -struct into_vector_traits: into_vector_traits {}; +struct into_vector_impl: into_vector_impl {}; template -struct into_vector_traits: into_vector_traits {}; +struct into_vector_impl: into_vector_impl {}; template -struct into_vector_traits> { +struct into_vector_impl> { using value_type = T; using extent_type = extent; @@ -226,7 +226,7 @@ struct into_vector_traits> { #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ template<> \ - struct into_vector_traits<::T1> { \ + struct into_vector_impl<::T1> { \ using value_type = T; \ using extent_type = extent<1>; \ \ @@ -237,7 +237,7 @@ struct into_vector_traits> { }; \ \ template<> \ - struct into_vector_traits<::T2> { \ + struct into_vector_impl<::T2> { \ using value_type = T; \ using extent_type = extent<2>; \ \ @@ -248,7 +248,7 @@ struct into_vector_traits> { }; \ \ template<> \ - struct into_vector_traits<::T3> { \ + struct into_vector_impl<::T3> { \ using value_type = T; \ using extent_type = extent<3>; \ \ @@ -259,7 +259,7 @@ struct into_vector_traits> { }; \ \ template<> \ - struct into_vector_traits<::T4> { \ + struct into_vector_impl<::T4> { \ using value_type = T; \ using extent_type = extent<4>; \ \ @@ -288,7 +288,7 @@ template> struct vector; template -struct into_vector_traits> { +struct into_vector_impl> { using value_type = T; using extent_type = E; @@ -310,10 +310,10 @@ struct vector_traits> { }; template -using vector_value_type = typename into_vector_traits::value_type; +using vector_value_type = typename into_vector_impl::value_type; template -using vector_extent_type = typename into_vector_traits::extent_type; +using vector_extent_type = typename into_vector_impl::extent_type; template static constexpr size_t vector_extent = vector_extent_type::value; @@ -329,7 +329,7 @@ using promoted_vector_value_type = promote_t...>; template KERNEL_FLOAT_INLINE vector_storage_type into_vector_storage(V&& input) { - return into_vector_traits::call(std::forward(input)); + return into_vector_impl::call(std::forward(input)); } } // namespace kernel_float diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 608fddd..9580a69 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -16,7 +16,7 @@ KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) template<> -struct into_vector_traits<__nv_bfloat162> { +struct into_vector_impl<__nv_bfloat162> { using value_type = __nv_bfloat16; using extent_type = extent<2>; diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index abceb05..41330bb 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -14,7 +14,7 @@ KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) template<> -struct into_vector_traits<__half2> { +struct into_vector_impl<__half2> { using 
value_type = __half; using extent_type = extent<2>; diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index 1782dbc..642541b 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -280,7 +280,7 @@ struct vector: public S { */ template KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { - return into_vector_traits::call(std::forward(input)); + return into_vector_impl::call(std::forward(input)); } template From c0939d07cbf203653d5a520ab9ab7a06a7ee5a78 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 21 Sep 2023 09:38:08 +0200 Subject: [PATCH 46/50] Fix incorrect definition of FMA --- include/kernel_float/triops.h | 2 +- tests/constant.cu | 30 ++++++++ tests/triops.cu | 137 ++++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 tests/constant.cu create mode 100644 tests/triops.cu diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h index afee449..44f6db2 100644 --- a/include/kernel_float/triops.h +++ b/include/kernel_float/triops.h @@ -92,7 +92,7 @@ namespace ops { template struct fma { KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) { - return a + b * c; + return a * b + c; } }; diff --git a/tests/constant.cu b/tests/constant.cu new file mode 100644 index 0000000..618994b --- /dev/null +++ b/tests/constant.cu @@ -0,0 +1,30 @@ +#include "common.h" + +struct triops_tests { + template + __host__ __device__ void operator()(generator gen, std::index_sequence) { + T x[N] = {gen.next(I)...}; + T y[N] = {gen.next(I)...}; + T z[N] = {gen.next(I)...}; + + kf::vec a = {x[I]...}; + kf::vec b = {y[I]...}; + kf::vec c = {z[I]...}; + + kf::vec answer = kf::where(a, b, c); + ASSERT_EQ_ALL(answer[I], bool(x[I]) ? y[I] : z[I]); + + answer = kf::where(a, b); + ASSERT_EQ_ALL(answer[I], bool(x[I]) ? 
y[I] : T()); + + answer = kf::where(a); + ASSERT_EQ_ALL(answer[I], T(bool(x[I]))); + + answer = kf::fma(a, b, c); + ASSERT_EQ_ALL(answer[I], x[I] * y[I] + z[I]); + + } +}; + +REGISTER_TEST_CASE("ternary operators", triops_tests, int, float, double) +REGISTER_TEST_CASE_GPU("ternary operators", triops_tests, __half, __nv_bfloat16) diff --git a/tests/triops.cu b/tests/triops.cu new file mode 100644 index 0000000..114889f --- /dev/null +++ b/tests/triops.cu @@ -0,0 +1,137 @@ +#include "common.h" + +struct binops_tests { + template + __host__ __device__ void operator()(generator gen, std::index_sequence) { + T x[N] = {gen.next(I)...}; + T y[N] = {gen.next(I)...}; + + kf::vec a = {x[I]...}; + kf::vec b = {y[I]...}; + kf::vec c; + + // Arithmetic + c = a + b; + ASSERT(equals(T(x[I] + y[I]), c[I]) && ...); + + c = a - b; + ASSERT(equals(T(x[I] - y[I]), c[I]) && ...); + + c = a * b; + ASSERT(equals(T(x[I] * y[I]), c[I]) && ...); + + // Results in division by zero + // c = a / b; + // ASSERT(equals(T(x[I] / y[I]), c[I]) && ...); + + // Results in division by zero + // c = a % b; + // ASSERT(equals(T(x[I] % y[I]), c[I]) && ...); + + // Comparison + c = a < b; + ASSERT(equals(T(x[I] < y[I]), c[I]) && ...); + + c = a > b; + ASSERT(equals(T(x[I] > y[I]), c[I]) && ...); + + c = a <= b; + ASSERT(equals(T(x[I] <= y[I]), c[I]) && ...); + + c = a >= b; + ASSERT(equals(T(x[I] >= y[I]), c[I]) && ...); + + c = a == b; + ASSERT(equals(T(x[I] == y[I]), c[I]) && ...); + + c = a != b; + ASSERT(equals(T(x[I] != y[I]), c[I]) && ...); + + // Assignment + c = a; + c += b; + ASSERT(equals(T(x[I] + y[I]), c[I]) && ...); + + c = a; + c -= b; + ASSERT(equals(T(x[I] - y[I]), c[I]) && ...); + + c = a; + c *= b; + ASSERT(equals(T(x[I] * y[I]), c[I]) && ...); + } +}; + +REGISTER_TEST_CASE("binary operators", binops_tests, bool, int, float, double) +REGISTER_TEST_CASE_GPU("binary operators", binops_tests, __half, __nv_bfloat16) + +struct binops_float_tests { + template + __host__ __device__ void operator()(generator gen, std::index_sequence) { + T x[N] = {gen.next(I)...}; + T y[N] = {gen.next(I)...}; + + kf::vec a = {x[I]...}; + kf::vec b = {y[I]...}; + kf::vec c; + + c = a / b; + ASSERT(equals(T(x[I] / y[I]), c[I]) && ...); + + // remainder is not support for fp16 + if constexpr (is_none_of) { + // c = a % b; + // ASSERT(equals(T(fmod(x[I], y[I])), c[I]) && ...); + } + } +}; + +REGISTER_TEST_CASE("binary float operators", binops_float_tests, float, double) +REGISTER_TEST_CASE_GPU("binary float operators", binops_float_tests, __half, __nv_bfloat16) + +struct minmax_tests { + template + __host__ __device__ void operator()(generator gen, std::index_sequence) { + T x[N] = {gen.next(I)...}; + T y[N] = {gen.next(I)...}; + + kf::vec a = {x[I]...}; + kf::vec b = {y[I]...}; + + kf::vec lo = min(a, b); + kf::vec hi = max(a, b); + + if constexpr (is_one_of) { + ASSERT(equals(fmin(a[I], b[I]), lo[I]) && ...); + ASSERT(equals(fmax(a[I], b[I]), hi[I]) && ...); + } else if constexpr (is_one_of) { + ASSERT(equals(fminf(a[I], b[I]), lo[I]) && ...); + ASSERT(equals(fmaxf(a[I], b[I]), hi[I]) && ...); + } else if constexpr (is_one_of) { + ASSERT(equals(__hmin(a[I], b[I]), lo[I]) && ...); + ASSERT(equals(__hmax(a[I], b[I]), hi[I]) && ...); + } else { + ASSERT(equals(x[I] < y[I] ? x[I] : y[I], lo[I]) && ...); + ASSERT(equals(x[I] < y[I] ? 
y[I] : x[I], hi[I]) && ...); + } + } +}; + +REGISTER_TEST_CASE("min/max functions", minmax_tests, bool, int, float, double) +REGISTER_TEST_CASE_GPU("min/max functions", minmax_tests, __half, __nv_bfloat16) + +struct cross_test { + template + __host__ __device__ void operator()(generator gen) { + kf::vec a = {1, 2, 3}; + kf::vec b = {4, 5, 6}; + kf::vec c = cross(a, b); + + ASSERT(c[0] == T(-3)); + ASSERT(c[1] == T(6)); + ASSERT(c[2] == T(-3)); + } +}; + +REGISTER_TEST_CASE("cross product", cross_test, float, double) +REGISTER_TEST_CASE_GPU("cross product", cross_test, __half, __nv_bfloat16) \ No newline at end of file From 6a7bd2e348fc75b75b7f8713fa232363fe562a1e Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 21 Sep 2023 09:38:30 +0200 Subject: [PATCH 47/50] Add operator overloads for constants --- include/kernel_float/constant.h | 60 +++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h index 19ecfd8..1b98925 100644 --- a/include/kernel_float/constant.h +++ b/include/kernel_float/constant.h @@ -9,7 +9,10 @@ namespace kernel_float { template struct constant { template - KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) : value_(that.get()) {} + KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) { + auto f = ops::cast(); + value_ = f(that.get()); + } KERNEL_FLOAT_INLINE constexpr constant(T value = {}) : value_(value) {} @@ -70,28 +73,43 @@ struct cast, R, m> { }; } // namespace ops -#define KERNEL_FLOAT_CONSTANT_DEFINE_OP(OP) \ - template \ - R operator OP(const constant& left, const R& right) { \ - using T = vector_value_type; \ - return operator OP(T(left.get()), right); \ - } \ - \ - template \ - L operator OP(const L& left, const constant& right) { \ - using T = vector_value_type; \ - return operator OP(left, T(right.get())); \ - } \ - \ - template> \ - constant operator OP(const constant& left, const constant& right) { \ - return constant(operator OP(T(left.get()), T(right.get()))); \ +#define KERNEL_FLOAT_CONSTANT_DEFINE_OP(OP) \ + template \ + KERNEL_FLOAT_INLINE auto operator OP(const constant& left, const R& right) { \ + auto f = ops::cast>(); \ + return f(left.get()) OP right; \ + } \ + \ + template \ + KERNEL_FLOAT_INLINE auto operator OP(const L& left, const constant& right) { \ + auto f = ops::cast>(); \ + return left OP f(right.get()); \ + } \ + \ + template \ + KERNEL_FLOAT_INLINE auto operator OP(const constant& left, const vector& right) { \ + auto f = ops::cast(); \ + return f(left.get()) OP right; \ + } \ + \ + template \ + KERNEL_FLOAT_INLINE auto operator OP(const vector& left, const constant& right) { \ + auto f = ops::cast(); \ + return left OP f(right.get()); \ + } \ + \ + template> \ + KERNEL_FLOAT_INLINE constant operator OP( \ + const constant& left, \ + const constant& right) { \ + return constant(left.get()) OP constant(right.get()); \ } -//KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) -//KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) -//KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) -//KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(%) } // namespace kernel_float From 227f987d3fc10499e680bb68f00e1c579afeda97 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 21 Sep 2023 09:39:38 +0200 Subject: [PATCH 48/50] Add more tests --- single_include/kernel_float.h | 106 ++++++++++++++++----------- 
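
The `constant<T>` operator overloads enabled above are exercised by the new tests in this patch. A minimal host-side sketch of the intended semantics (`float` is used only so the sketch compiles off-device, the `kf` alias is borrowed from the test suite's common.h, and the function name is illustrative, not a library API):

    #include "kernel_float.h"
    namespace kf = kernel_float;

    void constant_semantics_sketch() {
        kf::vec<float, 2> v = {1.0f, 2.0f};

        // The double-typed constant is cast to the vector's value type before
        // the operator fires, so this is equivalent to v * 5.0f and the result
        // stays vec<float, 2> instead of widening to double:
        kf::vec<float, 2> w = v * kf::make_constant(5.0);

        // Scalars combine symmetrically: equivalent to 5.0f + 3.0f.
        float s = kf::make_constant(5.0) + 3.0f;
        (void) w;
        (void) s;
    }
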
tests/basics.cu | 2 +- tests/common.h | 15 ++++ tests/constant.cu | 60 ++++++++------- tests/promotion.cu | 5 +- tests/triops.cu | 134 ++++------------------------------ 6 files changed, 129 insertions(+), 193 deletions(-) diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 2052d96..904309d 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,7 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2023-09-19 20:45:16.880746 -// git hash: da0a46b533ef9d25638748eb951284f14e7c48bb +// date: 2023-09-21 09:37:28.638971 +// git hash: 07af0ad9ff5c16595790d579577244bc482f0999 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -497,7 +497,7 @@ struct extent { }; template -struct into_vector_traits { +struct into_vector_impl { using value_type = T; using extent_type = extent<1>; @@ -508,7 +508,7 @@ struct into_vector_traits { }; template -struct into_vector_traits { +struct into_vector_impl { using value_type = T; using extent_type = extent; @@ -526,19 +526,19 @@ struct into_vector_traits { }; template -struct into_vector_traits: into_vector_traits {}; +struct into_vector_impl: into_vector_impl {}; template -struct into_vector_traits: into_vector_traits {}; +struct into_vector_impl: into_vector_impl {}; template -struct into_vector_traits: into_vector_traits {}; +struct into_vector_impl: into_vector_impl {}; template -struct into_vector_traits: into_vector_traits {}; +struct into_vector_impl: into_vector_impl {}; template -struct into_vector_traits> { +struct into_vector_impl> { using value_type = T; using extent_type = extent; @@ -550,7 +550,7 @@ struct into_vector_traits> { #define KERNEL_FLOAT_DEFINE_VECTOR_TYPE(T, T1, T2, T3, T4) \ template<> \ - struct into_vector_traits<::T1> { \ + struct into_vector_impl<::T1> { \ using value_type = T; \ using extent_type = extent<1>; \ \ @@ -561,7 +561,7 @@ struct into_vector_traits> { }; \ \ template<> \ - struct into_vector_traits<::T2> { \ + struct into_vector_impl<::T2> { \ using value_type = T; \ using extent_type = extent<2>; \ \ @@ -572,7 +572,7 @@ struct into_vector_traits> { }; \ \ template<> \ - struct into_vector_traits<::T3> { \ + struct into_vector_impl<::T3> { \ using value_type = T; \ using extent_type = extent<3>; \ \ @@ -583,7 +583,7 @@ struct into_vector_traits> { }; \ \ template<> \ - struct into_vector_traits<::T4> { \ + struct into_vector_impl<::T4> { \ using value_type = T; \ using extent_type = extent<4>; \ \ @@ -612,7 +612,7 @@ template> struct vector; template -struct into_vector_traits> { +struct into_vector_impl> { using value_type = T; using extent_type = E; @@ -634,10 +634,10 @@ struct vector_traits> { }; template -using vector_value_type = typename into_vector_traits::value_type; +using vector_value_type = typename into_vector_impl::value_type; template -using vector_extent_type = typename into_vector_traits::extent_type; +using vector_extent_type = typename into_vector_impl::extent_type; template static constexpr size_t vector_extent = vector_extent_type::value; @@ -653,7 +653,7 @@ using promoted_vector_value_type = promote_t...>; template KERNEL_FLOAT_INLINE vector_storage_type into_vector_storage(V&& input) { - return into_vector_traits::call(std::forward(input)); + return into_vector_impl::call(std::forward(input)); } } // namespace kernel_float @@ -1732,7 +1732,10 @@ namespace kernel_float 
{ template struct constant { template - KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) : value_(that.get()) {} + KERNEL_FLOAT_INLINE explicit constexpr constant(const constant& that) { + auto f = ops::cast(); + value_ = f(that.get()); + } KERNEL_FLOAT_INLINE constexpr constant(T value = {}) : value_(value) {} @@ -1793,28 +1796,43 @@ struct cast, R, m> { }; } // namespace ops -#define KERNEL_FLOAT_CONSTANT_DEFINE_OP(OP) \ - template \ - R operator OP(const constant& left, const R& right) { \ - using T = vector_value_type; \ - return operator OP(T(left.get()), right); \ - } \ - \ - template \ - L operator OP(const L& left, const constant& right) { \ - using T = vector_value_type; \ - return operator OP(left, T(right.get())); \ - } \ - \ - template> \ - constant operator OP(const constant& left, const constant& right) { \ - return constant(operator OP(T(left.get()), T(right.get()))); \ - } - -//KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) -//KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) -//KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) -//KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) +#define KERNEL_FLOAT_CONSTANT_DEFINE_OP(OP) \ + template \ + KERNEL_FLOAT_INLINE auto operator OP(const constant& left, const R& right) { \ + auto f = ops::cast>(); \ + return f(left.get()) OP right; \ + } \ + \ + template \ + KERNEL_FLOAT_INLINE auto operator OP(const L& left, const constant& right) { \ + auto f = ops::cast>(); \ + return left OP f(right.get()); \ + } \ + \ + template \ + KERNEL_FLOAT_INLINE auto operator OP(const constant& left, const vector& right) { \ + auto f = ops::cast(); \ + return f(left.get()) OP right; \ + } \ + \ + template \ + KERNEL_FLOAT_INLINE auto operator OP(const vector& left, const constant& right) { \ + auto f = ops::cast(); \ + return left OP f(right.get()); \ + } \ + \ + template> \ + KERNEL_FLOAT_INLINE constant operator OP( \ + const constant& left, \ + const constant& right) { \ + return constant(left.get()) OP constant(right.get()); \ + } + +KERNEL_FLOAT_CONSTANT_DEFINE_OP(+) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(-) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(*) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(/) +KERNEL_FLOAT_CONSTANT_DEFINE_OP(%) } // namespace kernel_float @@ -2731,7 +2749,7 @@ namespace ops { template struct fma { KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) { - return a + b * c; + return a * b + c; } }; @@ -3066,7 +3084,7 @@ struct vector: public S { */ template KERNEL_FLOAT_INLINE into_vector_type into_vector(V&& input) { - return into_vector_traits::call(std::forward(input)); + return into_vector_impl::call(std::forward(input)); } template @@ -3136,7 +3154,7 @@ KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) template<> -struct into_vector_traits<__half2> { +struct into_vector_impl<__half2> { using value_type = __half; using extent_type = extent<2>; @@ -3440,7 +3458,7 @@ KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) template<> -struct into_vector_traits<__nv_bfloat162> { +struct into_vector_impl<__nv_bfloat162> { using value_type = __nv_bfloat16; using extent_type = extent<2>; diff --git a/tests/basics.cu b/tests/basics.cu index f5c9061..9d15b56 100644 --- a/tests/basics.cu +++ b/tests/basics.cu @@ -106,7 +106,7 @@ struct creation_tests { // into_vector on scalar { - kf::vec a = into_vector(int(5.0f)); + kf::vec a = into_vector(float(5.0f)); ASSERT(a[0] == 5.0f); } diff --git a/tests/common.h b/tests/common.h index d3fa1b8..dac12bb 100644 --- a/tests/common.h +++ b/tests/common.h 
@@ -76,6 +76,19 @@ struct equals_helper<__nv_bfloat16> { } }; +template +struct equals_helper> { + static __host__ __device__ bool call(const kf::vec& left, const kf::vec& right) { + for (int i = 0; i < N; i++) { + if (!equals_helper::call(left[i], right[i])) { + return false; + } + } + + return true; + } +}; + } // namespace detail template @@ -346,11 +359,13 @@ void run_tests_device(F fun, type_sequence, size_sequence) { #define REGISTER_TEST_CASE_CPU(NAME, F, ...) \ TEMPLATE_TEST_CASE(NAME " - CPU", "", __VA_ARGS__) { \ run_tests_host(F {}, type_sequence {}, default_size_sequence {}); \ + CHECK("done"); \ } #define REGISTER_TEST_CASE_GPU(NAME, F, ...) \ TEMPLATE_TEST_CASE(NAME " - GPU", "[GPU]", __VA_ARGS__) { \ run_tests_device(F {}, type_sequence {}, default_size_sequence {}); \ + CHECK("done"); \ } #undef REGISTER_TEST_CASE diff --git a/tests/constant.cu b/tests/constant.cu index 618994b..a6e011c 100644 --- a/tests/constant.cu +++ b/tests/constant.cu @@ -1,30 +1,40 @@ #include "common.h" -struct triops_tests { - template - __host__ __device__ void operator()(generator gen, std::index_sequence) { - T x[N] = {gen.next(I)...}; - T y[N] = {gen.next(I)...}; - T z[N] = {gen.next(I)...}; - - kf::vec a = {x[I]...}; - kf::vec b = {y[I]...}; - kf::vec c = {z[I]...}; - - kf::vec answer = kf::where(a, b, c); - ASSERT_EQ_ALL(answer[I], bool(x[I]) ? y[I] : z[I]); - - answer = kf::where(a, b); - ASSERT_EQ_ALL(answer[I], bool(x[I]) ? y[I] : T()); - - answer = kf::where(a); - ASSERT_EQ_ALL(answer[I], T(bool(x[I]))); - - answer = kf::fma(a, b, c); - ASSERT_EQ_ALL(answer[I], x[I] * y[I] + z[I]); - +#define ASSERT_TYPE(A, B) ASSERT(std::is_same::value); + +struct constant_tests { + template + __host__ __device__ void operator()(generator gen) { + T value = gen.next(); + kf::vec vector = {gen.next(), gen.next()}; + + ASSERT_EQ(kf::make_constant(5.0) + value, T(5) + value); + ASSERT_EQ(value + kf::make_constant(5.0), value + T(5)); + ASSERT_EQ(kf::make_constant(5.0) + vector, T(5) + vector); + ASSERT_EQ(vector + kf::make_constant(5.0), vector + T(5)); + + ASSERT_EQ(kf::make_constant(5.0) - value, T(5) - value); + ASSERT_EQ(value - kf::make_constant(5.0), value - T(5)); + ASSERT_EQ(kf::make_constant(5.0) - vector, T(5) - vector); + ASSERT_EQ(vector - kf::make_constant(5.0), vector - T(5)); + + ASSERT_EQ(kf::make_constant(5.0) * value, T(5) * value); + ASSERT_EQ(value * kf::make_constant(5.0), value * T(5)); + ASSERT_EQ(kf::make_constant(5.0) * vector, T(5) * vector); + ASSERT_EQ(vector * kf::make_constant(5.0), vector * T(5)); + + // These results in division by zero for integers + // ASSERT_EQ(kf::make_constant(5.0) / value, T(5) / value); + // ASSERT_EQ(value / kf::make_constant(5.0), value / T(5)); + // ASSERT_EQ(kf::make_constant(5.0) / vector, T(5) / vector); + // ASSERT_EQ(vector / kf::make_constant(5.0), vector / T(5)); + // + // ASSERT_EQ(kf::make_constant(5.0) % value, T(5) % value); + // ASSERT_EQ(value % kf::make_constant(5.0), value % T(5)); + // ASSERT_EQ(kf::make_constant(5.0) % vector, T(5) % vector); + // ASSERT_EQ(vector % kf::make_constant(5.0), vector % T(5)); } }; -REGISTER_TEST_CASE("ternary operators", triops_tests, int, float, double) -REGISTER_TEST_CASE_GPU("ternary operators", triops_tests, __half, __nv_bfloat16) +REGISTER_TEST_CASE("constant tests", constant_tests, int, float, double) +REGISTER_TEST_CASE_GPU("constant tests", constant_tests, __half, __nv_bfloat16) diff --git a/tests/promotion.cu b/tests/promotion.cu index beb7b67..3367c3a 100644 --- a/tests/promotion.cu +++ 
b/tests/promotion.cu @@ -1,7 +1,8 @@ #include "common.h" -// Check if combining type `A` and `B` results in `C` -#define CHECK_PROMOTION(A, B, C) CHECK(std::is_same, C>::value); +// Check if combining type `vec` and `vec` results in `vec` +#define CHECK_PROMOTION(A, B, C) \ + CHECK(std::is_same() + kf::vec()), kf::vec>::value); TEST_CASE("type promotion") { CHECK_PROMOTION(int, int, int); diff --git a/tests/triops.cu b/tests/triops.cu index 114889f..4b899b1 100644 --- a/tests/triops.cu +++ b/tests/triops.cu @@ -1,137 +1,29 @@ #include "common.h" -struct binops_tests { +struct triops_tests { template __host__ __device__ void operator()(generator gen, std::index_sequence) { T x[N] = {gen.next(I)...}; T y[N] = {gen.next(I)...}; + T z[N] = {gen.next(I)...}; kf::vec a = {x[I]...}; kf::vec b = {y[I]...}; - kf::vec c; + kf::vec c = {z[I]...}; - // Arithmetic - c = a + b; - ASSERT(equals(T(x[I] + y[I]), c[I]) && ...); + kf::vec answer = kf::where(a, b, c); + ASSERT_EQ_ALL(answer[I], bool(x[I]) ? y[I] : z[I]); - c = a - b; - ASSERT(equals(T(x[I] - y[I]), c[I]) && ...); + answer = kf::where(a, b); + ASSERT_EQ_ALL(answer[I], bool(x[I]) ? y[I] : T()); - c = a * b; - ASSERT(equals(T(x[I] * y[I]), c[I]) && ...); + answer = kf::where(a); + ASSERT_EQ_ALL(answer[I], T(bool(x[I]))); - // Results in division by zero - // c = a / b; - // ASSERT(equals(T(x[I] / y[I]), c[I]) && ...); - - // Results in division by zero - // c = a % b; - // ASSERT(equals(T(x[I] % y[I]), c[I]) && ...); - - // Comparison - c = a < b; - ASSERT(equals(T(x[I] < y[I]), c[I]) && ...); - - c = a > b; - ASSERT(equals(T(x[I] > y[I]), c[I]) && ...); - - c = a <= b; - ASSERT(equals(T(x[I] <= y[I]), c[I]) && ...); - - c = a >= b; - ASSERT(equals(T(x[I] >= y[I]), c[I]) && ...); - - c = a == b; - ASSERT(equals(T(x[I] == y[I]), c[I]) && ...); - - c = a != b; - ASSERT(equals(T(x[I] != y[I]), c[I]) && ...); - - // Assignment - c = a; - c += b; - ASSERT(equals(T(x[I] + y[I]), c[I]) && ...); - - c = a; - c -= b; - ASSERT(equals(T(x[I] - y[I]), c[I]) && ...); - - c = a; - c *= b; - ASSERT(equals(T(x[I] * y[I]), c[I]) && ...); - } -}; - -REGISTER_TEST_CASE("binary operators", binops_tests, bool, int, float, double) -REGISTER_TEST_CASE_GPU("binary operators", binops_tests, __half, __nv_bfloat16) - -struct binops_float_tests { - template - __host__ __device__ void operator()(generator gen, std::index_sequence) { - T x[N] = {gen.next(I)...}; - T y[N] = {gen.next(I)...}; - - kf::vec a = {x[I]...}; - kf::vec b = {y[I]...}; - kf::vec c; - - c = a / b; - ASSERT(equals(T(x[I] / y[I]), c[I]) && ...); - - // remainder is not support for fp16 - if constexpr (is_none_of) { - // c = a % b; - // ASSERT(equals(T(fmod(x[I], y[I])), c[I]) && ...); - } - } -}; - -REGISTER_TEST_CASE("binary float operators", binops_float_tests, float, double) -REGISTER_TEST_CASE_GPU("binary float operators", binops_float_tests, __half, __nv_bfloat16) - -struct minmax_tests { - template - __host__ __device__ void operator()(generator gen, std::index_sequence) { - T x[N] = {gen.next(I)...}; - T y[N] = {gen.next(I)...}; - - kf::vec a = {x[I]...}; - kf::vec b = {y[I]...}; - - kf::vec lo = min(a, b); - kf::vec hi = max(a, b); - - if constexpr (is_one_of) { - ASSERT(equals(fmin(a[I], b[I]), lo[I]) && ...); - ASSERT(equals(fmax(a[I], b[I]), hi[I]) && ...); - } else if constexpr (is_one_of) { - ASSERT(equals(fminf(a[I], b[I]), lo[I]) && ...); - ASSERT(equals(fmaxf(a[I], b[I]), hi[I]) && ...); - } else if constexpr (is_one_of) { - ASSERT(equals(__hmin(a[I], b[I]), lo[I]) && ...); - 
ASSERT(equals(__hmax(a[I], b[I]), hi[I]) && ...); - } else { - ASSERT(equals(x[I] < y[I] ? x[I] : y[I], lo[I]) && ...); - ASSERT(equals(x[I] < y[I] ? y[I] : x[I], hi[I]) && ...); - } - } -}; - -REGISTER_TEST_CASE("min/max functions", minmax_tests, bool, int, float, double) -REGISTER_TEST_CASE_GPU("min/max functions", minmax_tests, __half, __nv_bfloat16) - -struct cross_test { - template - __host__ __device__ void operator()(generator gen) { - kf::vec a = {1, 2, 3}; - kf::vec b = {4, 5, 6}; - kf::vec c = cross(a, b); - - ASSERT(c[0] == T(-3)); - ASSERT(c[1] == T(6)); - ASSERT(c[2] == T(-3)); + answer = kf::fma(a, b, c); + ASSERT_EQ_ALL(answer[I], x[I] * y[I] + z[I]); } }; -REGISTER_TEST_CASE("cross product", cross_test, float, double) -REGISTER_TEST_CASE_GPU("cross product", cross_test, __half, __nv_bfloat16) \ No newline at end of file +REGISTER_TEST_CASE("ternary operators", triops_tests, int, float, double) +REGISTER_TEST_CASE_GPU("ternary operators", triops_tests, __half, __nv_bfloat16) From 4c74866f41784260a61152d59511bde303a3e886 Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 21 Sep 2023 10:00:37 +0200 Subject: [PATCH 49/50] Add license boilerplate to single-header include --- combine.py | 21 ++++++++++++++++++++- single_include/kernel_float.h | 20 ++++++++++++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/combine.py b/combine.py index c22e2ba..5a7c857 100644 --- a/combine.py +++ b/combine.py @@ -2,6 +2,24 @@ import subprocess from datetime import datetime +license_boilerplate = """/* + * Kernel Float: Header-only library for vector types and reduced precision floating-point math. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +""" + directory = "include/kernel_float" contents = dict() @@ -28,7 +46,8 @@ except Exception as e: print(f"warning: {e}") -output = "\n".join([ +output = license_boilerplate +output += "\n".join([ "//" + "=" * 80, "// this file has been auto-generated, do not modify its contents!", f"// date: {date}", diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 904309d..c31082f 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -1,7 +1,23 @@ +/* + * Kernel Float: Header-only library for vector types and reduced precision floating-point math. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2023-09-21 09:37:28.638971 -// git hash: 07af0ad9ff5c16595790d579577244bc482f0999 +// date: 2023-09-21 10:00:11.122069 +// git hash: 227f987d3fc10499e680bb68f00e1c579afeda97 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H From 46d598cbca2b9e15abe91848fdcb417d69f0820a Mon Sep 17 00:00:00 2001 From: stijn Date: Thu, 21 Sep 2023 13:07:03 +0200 Subject: [PATCH 50/50] Add github workflow to run unittests --- .github/workflows/cmake-action.yml | 47 ++++++++++++++++++++++++++++++ .github/workflows/cmake.yml | 28 ++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 .github/workflows/cmake-action.yml create mode 100644 .github/workflows/cmake.yml diff --git a/.github/workflows/cmake-action.yml b/.github/workflows/cmake-action.yml new file mode 100644 index 0000000..fd621db --- /dev/null +++ b/.github/workflows/cmake-action.yml @@ -0,0 +1,47 @@ +name: CMake + +on: + workflow_call: + inputs: + cuda-version: + required: true + type: string + +env: + # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) + BUILD_TYPE: Debug + +jobs: + build: + # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. + # You can convert this to a matrix build if you need cross-platform coverage. + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + runs-on: ubuntu-latest + + steps: + - uses: Jimver/cuda-toolkit@v0.2.11 + id: cuda-toolkit + with: + method: network + sub-packages: '["nvcc"]' + cuda: ${{ inputs.cuda-version }} + + - uses: actions/checkout@v3 + with: + submodules: 'true' + + - name: Configure CMake + # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. + # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type + run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DKERNEL_FLOAT_BUILD_TEST=1 -DKERNEL_FLOAT_BUILD_EXAMPLE=1 + + - name: Build + # Build your program with the given configuration + run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} + + - name: Test + working-directory: ${{github.workspace}}/build + # Execute tests defined by the CMake configuration. + # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail + run: ./tests/kernel_float_tests --durations=yes --success --verbosity=high ~[GPU] + diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml new file mode 100644 index 0000000..136fcd3 --- /dev/null +++ b/.github/workflows/cmake.yml @@ -0,0 +1,28 @@ +name: CMake + +on: + push: + pull_request: + branches: [ "main" ] + +env: + # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) + BUILD_TYPE: Debug + +jobs: + build-cuda: + uses: ./.github/workflows/cmake-action.yml + with: + cuda-version: "12.2.0" + + build-cuda-11-7: + needs: build-cuda + uses: ./.github/workflows/cmake-action.yml + with: + cuda-version: "11.7.0" + + build-cuda-12-0: + needs: build-cuda + uses: ./.github/workflows/cmake-action.yml + with: + cuda-version: "12.0.0"
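
With the CI workflow in place, the whole series can be smoke-tested against the generated single header. A minimal host-side sketch tying the patches together (the `kf` alias and the include path are assumptions mirroring the repository layout; the values reuse the dot/magnitude test vectors above):

    #include <cassert>
    #include "single_include/kernel_float.h"

    namespace kf = kernel_float;

    int main() {
        kf::vec<float, 3> a = {2.0f, -3.0f, 6.0f};
        kf::vec<float, 3> b = {2.0f, -1.0f, 3.0f};

        // The pointer-based rewrite in PATCH 44 is internal only;
        // the public entry points are unchanged:
        assert(kf::dot(a, b) == 25.0f);  // 4 + 3 + 18
        assert(kf::mag(a) == 7.0f);      // sqrt(4 + 9 + 36)

        // PATCH 46: fma(a, b, c) now computes a * b + c elementwise.
        kf::vec<float, 3> c = kf::fma(a, b, kf::vec<float, 3> {1.0f, 1.0f, 1.0f});
        assert(c[0] == 5.0f && c[1] == 4.0f && c[2] == 19.0f);

        // PATCH 47: constants mix with vectors without promoting to double.
        kf::vec<float, 3> d = c * kf::make_constant(2.0);
        assert(d[2] == 38.0f);

        return 0;
    }
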