diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 7c9ec2d..0c3d04a 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -292,8 +292,7 @@ struct multiply { namespace detail { template struct apply_impl, N, T, T, T> { - KERNEL_FLOAT_INLINE static void - call(ops::divide fun, T* result, const T* lhs, const T* rhs) { + KERNEL_FLOAT_INLINE static void call(ops::divide, T* result, const T* lhs, const T* rhs) { T rhs_rcp[N]; // Fast way to perform division is to multiply by the reciprocal @@ -310,13 +309,33 @@ struct apply_impl, N, T, T, T>: template<> struct apply_impl, 1, float, float, float> { KERNEL_FLOAT_INLINE static void - call(ops::divide fun, float* result, const float* lhs, const float* rhs) { + call(ops::divide, float* result, const float* lhs, const float* rhs) { *result = __fdividef(*lhs, *rhs); } }; #endif } // namespace detail +namespace detail { +// Override `pow` using `log2` and `exp2` +template +struct apply_impl, N, T, T, T> { + KERNEL_FLOAT_INLINE static void call(ops::divide, T* result, const T* lhs, const T* rhs) { + T lhs_log[N]; + T result_log[N]; + + // Fast way to perform power function is using log2 and exp2 + apply_impl, N, T, T>::call({}, lhs_log, lhs); + apply_impl, N, T, T, T>::call({}, result_log, lhs_log, rhs); + apply_impl, N, T, T, T>::call({}, result, result_log); + } +}; + +template +struct apply_impl, N, T, T, T>: + apply_base_impl, N, T, T, T> {}; +} // namespace detail + template> KERNEL_FLOAT_INLINE zip_common_type, T, T> fast_divide(const L& left, const R& right) { diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 1b9126d..31fcb0f 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! -// date: 2024-11-18 13:40:03.668017 -// git hash: ae0e6b16ac2d626e69bb08554044a77671f408ab +// date: 2024-11-18 13:50:24.614671 +// git hash: f89cf98f79e78ab6013063dea4b4b516ce163855 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -1950,8 +1950,7 @@ struct multiply { namespace detail { template struct apply_impl, N, T, T, T> { - KERNEL_FLOAT_INLINE static void - call(ops::divide fun, T* result, const T* lhs, const T* rhs) { + KERNEL_FLOAT_INLINE static void call(ops::divide, T* result, const T* lhs, const T* rhs) { T rhs_rcp[N]; // Fast way to perform division is to multiply by the reciprocal @@ -1968,13 +1967,33 @@ struct apply_impl, N, T, T, T>: template<> struct apply_impl, 1, float, float, float> { KERNEL_FLOAT_INLINE static void - call(ops::divide fun, float* result, const float* lhs, const float* rhs) { + call(ops::divide, float* result, const float* lhs, const float* rhs) { *result = __fdividef(*lhs, *rhs); } }; #endif } // namespace detail +namespace detail { +// Override `pow` using `log2` and `exp2` +template +struct apply_impl, N, T, T, T> { + KERNEL_FLOAT_INLINE static void call(ops::divide, T* result, const T* lhs, const T* rhs) { + T lhs_log[N]; + T result_log[N]; + + // Fast way to perform power function is using log2 and exp2 + apply_impl, N, T, T>::call({}, lhs_log, lhs); + apply_impl, N, T, T, T>::call({}, result_log, lhs_log, rhs); + apply_impl, N, T, T, T>::call({}, result, result_log); + } +}; + +template +struct apply_impl, N, T, T, T>: + apply_base_impl, N, T, T, T> {}; +} // namespace detail + template> KERNEL_FLOAT_INLINE zip_common_type, T, T> fast_divide(const L& left, const R& right) {