From 3c73971bb8a8fd67ab0a013703c2ded9001236b1 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 24 Jul 2024 15:36:04 +0200 Subject: [PATCH] Change implementation of `map_impl` to support policies --- docs/guides.rst | 1 + docs/guides/accuracy.md | 49 +++++++++++++++ include/kernel_float/apply.h | 81 ++++++++---------------- include/kernel_float/binops.h | 13 +--- include/kernel_float/macros.h | 2 +- include/kernel_float/unops.h | 15 +++-- single_include/kernel_float.h | 115 ++++++++++++---------------------- 7 files changed, 126 insertions(+), 150 deletions(-) create mode 100644 docs/guides/accuracy.md diff --git a/docs/guides.rst b/docs/guides.rst index fb8e2b7..aec50da 100644 --- a/docs/guides.rst +++ b/docs/guides.rst @@ -5,4 +5,5 @@ Guides guides/introduction.rst guides/promotion.rst + guides/accuracy.rst guides/constant.rst diff --git a/docs/guides/accuracy.md b/docs/guides/accuracy.md new file mode 100644 index 0000000..ddac7fb --- /dev/null +++ b/docs/guides/accuracy.md @@ -0,0 +1,49 @@ +Accuracy level +=== + +Many of the functions in Kernel Float take an additional `Accuracy` option as a template parameter. +This option can be used to increase the performance of certain operations, at the cost of lower accuracy. + +There are four possible values for this parameter: + +* `accurate_policy`: Use the most accurate version of the function available. +* `fast_policy`: Use the "fast math" version (for example, `__sinf` for sin on CUDA devices). Falls back to `accurate_policy` if such a version is not available. +* `approx_policy`: Rough approximation using a polynomial of degree `N`. Falls back to `fast_policy` if no such polynomial exists. +* `default_policy`: Use a global default policy (see the next section). 
+ + +For example, consider this code: + +```C++ + +#include "kernel_float.h" +namespace kf = kernel_float; + + +int main() { + kf::vec<float, 2> input = {1.0f, 2.0f}; + + // Use the default policy + kf::vec<float, 2> A = kf::cos(input); + + // Use the most accurate policy + kf::vec<float, 2> B = kf::cos<kf::accurate_policy>(input); + + // Use the fastest policy + kf::vec<float, 2> C = kf::cos<kf::fast_policy>(input); + + printf("A = %f, %f", A[0], A[1]); + printf("B = %f, %f", B[0], B[1]); + printf("C = %f, %f", C[0], C[1]); + + return EXIT_SUCCESS; +} + +``` + + +Setting `default_policy` +--- +By default, the value for `default_policy` is `accurate_policy`. + +Set the preprocessor option `KERNEL_FLOAT_FAST_MATH=1` to change the default policy to `fast_policy`. diff --git a/include/kernel_float/apply.h b/include/kernel_float/apply.h index b548480..b7610f2 100644 --- a/include/kernel_float/apply.h +++ b/include/kernel_float/apply.h @@ -130,51 +130,50 @@ struct apply_impl { template struct apply_fastmath_impl: apply_impl {}; +} // namespace detail -template -struct map_impl { - static constexpr size_t packet_size = preferred_vector_size::value; - - KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... 
args) { - if constexpr (N / packet_size > 0) { -#pragma unroll - for (size_t i = 0; i < N - N % packet_size; i += packet_size) { - apply_impl::call(fun, output + i, (args + i)...); - } - } +struct accurate_policy { + template + using type = detail::apply_impl; +}; - if constexpr (N % packet_size > 0) { -#pragma unroll - for (size_t i = N - N % packet_size; i < N; i++) { - apply_impl::call(fun, output + i, (args + i)...); - } - } - } +struct fast_policy { + template + using type = detail::apply_fastmath_impl; }; -template -struct fast_map_impl { +#ifdef KERNEL_FLOAT_POLICY +using default_policy = KERNEL_FLOAT_POLICY; +#else +using default_policy = accurate_policy; +#endif + +namespace detail { + +template +struct map_policy_impl { static constexpr size_t packet_size = preferred_vector_size::value; KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { if constexpr (N / packet_size > 0) { #pragma unroll for (size_t i = 0; i < N - N % packet_size; i += packet_size) { - apply_fastmath_impl::call( - fun, - output + i, - (args + i)...); + Policy::template type::call(fun, output + i, (args + i)...); } } if constexpr (N % packet_size > 0) { #pragma unroll for (size_t i = N - N % packet_size; i < N; i++) { - apply_fastmath_impl::call(fun, output + i, (args + i)...); + Policy::template type::call(fun, output + i, (args + i)...); } } } }; + +template +using map_impl = map_policy_impl; + } // namespace detail template @@ -191,41 +190,13 @@ using map_type = * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] * ``` */ -template +template KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... 
args) { using Output = result_t...>; using E = broadcast_vector_extent_type; vector_storage> result; - // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = - detail::fast_math_impl, Output, vector_value_type...>; -#else - using map_impl = detail::map_impl, Output, vector_value_type...>; -#endif - - map_impl::call( - fun, - result.data(), - (detail::broadcast_impl, vector_extent_type, E>::call( - into_vector_storage(args)) - .data())...); - - return result; -} - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. This - * uses fast-math if available for the given function `F`, otherwise this function behaves like `map`. - */ -template -KERNEL_FLOAT_INLINE map_type fast_map(F fun, const Args&... args) { - using Output = result_t...>; - using E = broadcast_vector_extent_type; - vector_storage> result; - - detail::fast_map_impl, Output, vector_value_type...>::call( + detail::map_policy_impl, Output, vector_value_type...>::call( fun, result.data(), (detail::broadcast_impl, vector_extent_type, E>::call( diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 99b810d..eb958f1 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -52,14 +52,7 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co vector_storage> result; -// Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using map_impl = detail::fast_map_impl, O, T, T>; -#else - using map_impl = detail::map_impl, O, T, T>; -#endif - - map_impl::call( + detail::map_impl, O, T, T>::call( fun, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( @@ -304,7 +297,7 @@ struct apply_fastmath_impl, N, T, T, T> { T rhs_rcp[N]; // Fast way to perform division is to multiply by the reciprocal - apply_fastmath_impl, N, T, T, T>::call({}, rhs_rcp, rhs); + apply_fastmath_impl, 
N, T, T>::call({}, rhs_rcp, rhs); apply_fastmath_impl, N, T, T, T>::call({}, result, lhs, rhs_rcp); } }; @@ -326,7 +319,7 @@ fast_divide(const L& left, const R& right) { using E = broadcast_vector_extent_type; vector_storage> result; - detail::fast_map_impl, extent_size, T, T, T>::call( + detail::map_policy_impl, extent_size, T, T, T>::call( ops::divide {}, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( diff --git a/include/kernel_float/macros.h b/include/kernel_float/macros.h index 621cdea..f21dbe5 100644 --- a/include/kernel_float/macros.h +++ b/include/kernel_float/macros.h @@ -64,7 +64,7 @@ #define KERNEL_FLOAT_MAX_ALIGNMENT (32) #ifndef KERNEL_FLOAT_FAST_MATH -#define KERNEL_FLOAT_FAST_MATH (0) +#define KERNEL_FLOAT_POLICY ::kernel_float::fast_policy; #endif #endif //KERNEL_FLOAT_MACROS_H diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h index bae2c3e..fce130e 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -85,10 +85,10 @@ KERNEL_FLOAT_INLINE vector> cast(const V& input) { } #define KERNEL_FLOAT_DEFINE_UNARY_FUN(NAME) \ - template \ + template \ KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ using F = ops::NAME>; \ - return map(F {}, input); \ + return ::kernel_float::map(F {}, input); \ } #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ @@ -193,12 +193,11 @@ KERNEL_FLOAT_DEFINE_UNARY_STRUCT(rcp, 1.0 / input, 1.0f / input) KERNEL_FLOAT_DEFINE_UNARY_FUN(rcp) -#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ - template \ - KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ - const V& input) { \ - using F = ops::NAME>; \ - return fast_map(F {}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ + template \ + KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ + const V& input) { \ + return ::kernel_float::map(ops::NAME> {}, input); \ } KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp) diff --git a/single_include/kernel_float.h 
b/single_include/kernel_float.h index d430283..8d6e279 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2024-07-22 11:31:53.132636 -// git hash: 4278106f2a14629445668d7e3684dbc8faf8b94d +// date: 2024-07-24 15:35:29.178410 +// git hash: 986ca557aa59f869d68fe1e7184c2228517ea52d //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -86,7 +86,7 @@ #define KERNEL_FLOAT_MAX_ALIGNMENT (32) #ifndef KERNEL_FLOAT_FAST_MATH -#define KERNEL_FLOAT_FAST_MATH (0) +#define KERNEL_FLOAT_POLICY ::kernel_float::fast_policy; #endif #endif //KERNEL_FLOAT_MACROS_H @@ -784,51 +784,50 @@ struct apply_impl { template struct apply_fastmath_impl: apply_impl {}; +} // namespace detail -template -struct map_impl { - static constexpr size_t packet_size = preferred_vector_size::value; - - KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { - if constexpr (N / packet_size > 0) { -#pragma unroll - for (size_t i = 0; i < N - N % packet_size; i += packet_size) { - apply_impl::call(fun, output + i, (args + i)...); - } - } +struct accurate_policy { + template + using type = detail::apply_impl; +}; - if constexpr (N % packet_size > 0) { -#pragma unroll - for (size_t i = N - N % packet_size; i < N; i++) { - apply_impl::call(fun, output + i, (args + i)...); - } - } - } +struct fast_policy { + template + using type = detail::apply_fastmath_impl; }; -template -struct fast_map_impl { +#ifdef KERNEL_FLOAT_POLICY +using default_policy = KERNEL_FLOAT_POLICY; +#else +using default_policy = accurate_policy; +#endif + +namespace detail { + +template +struct map_policy_impl { static constexpr size_t packet_size = preferred_vector_size::value; KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... 
args) { if constexpr (N / packet_size > 0) { #pragma unroll for (size_t i = 0; i < N - N % packet_size; i += packet_size) { - apply_fastmath_impl::call( - fun, - output + i, - (args + i)...); + Policy::template type::call(fun, output + i, (args + i)...); } } if constexpr (N % packet_size > 0) { #pragma unroll for (size_t i = N - N % packet_size; i < N; i++) { - apply_fastmath_impl::call(fun, output + i, (args + i)...); + Policy::template type::call(fun, output + i, (args + i)...); } } } }; + +template +using map_impl = map_policy_impl; + } // namespace detail template @@ -845,41 +844,13 @@ using map_type = * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] * ``` */ -template +template KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... args) { using Output = result_t...>; using E = broadcast_vector_extent_type; vector_storage> result; - // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = - detail::fast_math_impl, Output, vector_value_type...>; -#else - using map_impl = detail::map_impl, Output, vector_value_type...>; -#endif - - map_impl::call( - fun, - result.data(), - (detail::broadcast_impl, vector_extent_type, E>::call( - into_vector_storage(args)) - .data())...); - - return result; -} - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. This - * uses fast-math if available for the given function `F`, otherwise this function behaves like `map`. - */ -template -KERNEL_FLOAT_INLINE map_type fast_map(F fun, const Args&... 
args) { - using Output = result_t...>; - using E = broadcast_vector_extent_type; - vector_storage> result; - - detail::fast_map_impl, Output, vector_value_type...>::call( + detail::map_policy_impl, Output, vector_value_type...>::call( fun, result.data(), (detail::broadcast_impl, vector_extent_type, E>::call( @@ -1228,10 +1199,10 @@ KERNEL_FLOAT_INLINE vector> cast(const V& input) { } #define KERNEL_FLOAT_DEFINE_UNARY_FUN(NAME) \ - template \ + template \ KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ using F = ops::NAME>; \ - return map(F {}, input); \ + return ::kernel_float::map(F {}, input); \ } #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ @@ -1336,12 +1307,11 @@ KERNEL_FLOAT_DEFINE_UNARY_STRUCT(rcp, 1.0 / input, 1.0f / input) KERNEL_FLOAT_DEFINE_UNARY_FUN(rcp) -#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ - template \ - KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ - const V& input) { \ - using F = ops::NAME>; \ - return fast_map(F {}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ + template \ + KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ + const V& input) { \ + return ::kernel_float::map(ops::NAME> {}, input); \ } KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp) @@ -1669,14 +1639,7 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co vector_storage> result; -// Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using map_impl = detail::fast_map_impl, O, T, T>; -#else - using map_impl = detail::map_impl, O, T, T>; -#endif - - map_impl::call( + detail::map_impl, O, T, T>::call( fun, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( @@ -1921,7 +1884,7 @@ struct apply_fastmath_impl, N, T, T, T> { T rhs_rcp[N]; // Fast way to perform division is to multiply by the reciprocal - apply_fastmath_impl, N, T, T, T>::call({}, rhs_rcp, rhs); + apply_fastmath_impl, N, T, T>::call({}, rhs_rcp, rhs); 
apply_fastmath_impl, N, T, T, T>::call({}, result, lhs, rhs_rcp); } }; @@ -1943,7 +1906,7 @@ fast_divide(const L& left, const R& right) { using E = broadcast_vector_extent_type; vector_storage> result; - detail::fast_map_impl, extent_size, T, T, T>::call( + detail::map_policy_impl, extent_size, T, T, T>::call( ops::divide {}, result.data(), detail::convert_impl, vector_extent_type, T, E>::call(