diff --git a/doc/changelog.rst b/doc/changelog.rst
index e0492dd25..389e5aa24 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -13,6 +13,10 @@ New
 Changes
 ~~~~~~~
 
+- Enable the use of 512-bit vectors on Zen>=4 CPUs
+  (`#464 `__).
+- Improve support for non-standard SIMD sizes in vector functions
+  (`#464 `__).
 - The parallel compilation feature has been temporarily disabled due to
   several LLVM bugs
   (`#463 `__).
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index 0c0954b21..4fb02b048 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -37,6 +37,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -548,6 +549,167 @@ llvm::Value *llvm_scalarise_ext_math_vector_call(llvm_state &s, const std::vecto
         vfi);
 }
 
+// Helper to invoke an external vector function with arguments args, automatically handling
+// mismatches between the width of the vector function and the width of the arguments.
+//
+// vfi is a vector of vf_info instances listing the available implementations of the vector function
+// (each one supporting a different vector width). attrs is the set of attributes to attach to the
+// invocation(s) of the vector function.
+//
+// This function has several preconditions:
+//
+// - there must be at least 1 arg,
+// - vfi cannot be empty,
+// - all args must be vectors of the same type with a size greater than 1.
+llvm::Value *llvm_invoke_vector_impl(llvm_state &s, const auto &vfi, const auto &attrs, auto *...args)
+{
+    constexpr auto nargs = sizeof...(args);
+    static_assert(nargs > 0u);
+    static_assert((std::same_as<decltype(args), llvm::Value *> && ...));
+
+    assert(((args != nullptr) && ...));
+    assert(!vfi.empty());
+
+    // Build an array with the original arguments.
+    const std::array orig_args{args...};
+
+    // Check that all arguments have the same type.
+    auto *x_t = orig_args[0]->getType();
+    assert(((args->getType() == x_t) && ...));
+
+    // Ensure that the arguments are vectors.
+    auto *vec_t = llvm::dyn_cast<llvm::FixedVectorType>(x_t);
+    assert(vec_t != nullptr);
+
+    // Fetch the vector width.
+    const auto vector_width = boost::numeric_cast<std::uint32_t>(vec_t->getNumElements());
+    assert(vector_width > 1u);
+
+    // Fetch the builder.
+    auto &bld = s.builder();
+
+    // Can we use the faster but less precise vectorised implementations?
+    const auto use_fast_math = bld.getFastMathFlags().approxFunc();
+
+    // Look up a vector implementation with width *greater than or equal to* vector_width.
+    auto vfi_it = std::lower_bound(vfi.begin(), vfi.end(), vector_width,
+                                   [](const auto &vfi_item, std::uint32_t n) { return vfi_item.width < n; });
+
+    if (vfi_it == vfi.end()) {
+        // All vector implementations have a SIMD width *less than* vector_width. We will need
+        // to decompose the vector arguments into smaller vectors, perform the calculations
+        // on the smaller vectors, and reassemble the results into a single large vector.
+
+        // Step back to the widest available vector implementation and fetch its width.
+        --vfi_it;
+        const auto available_vector_width = vfi_it->width;
+        assert(available_vector_width > 0u);
+        assert(vfi_it->nargs == nargs);
+
+        // Fetch the vector type matching the chosen implementation.
+        auto *available_vec_t = make_vector_type(vec_t->getScalarType(), available_vector_width);
+
+        // Fetch the vector function name (either the low-precision or standard version).
+        const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
+        // Compute the number of chunks into which the original vector arguments will be split.
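+        // NOTE: this is a ceiling division, i.e., n_chunks == ceil(vector_width / available_vector_width)
+        // computed with integer arithmetic.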
+        const auto n_chunks = vector_width / available_vector_width
+                              + static_cast<std::uint32_t>(vector_width % available_vector_width != 0u);
+
+        // Prepare the vector of results of the invocations of the vector implementations.
+        std::vector<llvm::Value *> vec_results;
+        vec_results.reserve(n_chunks);
+
+        // Prepare the vector of arguments for the invocations of the vector implementations.
+        std::vector<llvm::Value *> vec_args;
+        vec_args.reserve(nargs);
+
+        // Prepare the mask vector.
+        std::vector<int> mask;
+        mask.reserve(vector_width);
+
+        for (std::uint32_t i = 0; i < n_chunks; ++i) {
+            // Construct the mask vector for the current iteration.
+            mask.clear();
+            const auto chunk_begin = i * available_vector_width;
+            // NOTE: special case for the last iteration.
+            const auto chunk_end = (i == n_chunks - 1u) ? vector_width : (chunk_begin + available_vector_width);
+            for (auto idx = chunk_begin; idx != chunk_end; ++idx) {
+                mask.push_back(boost::numeric_cast<int>(idx));
+            }
+            // Pad the mask if needed (this will happen only at the last iteration).
+            // NOTE: the pad value is the last value in the original (large) vector.
+            mask.insert(mask.end(), available_vector_width - mask.size(), boost::numeric_cast<int>(vector_width - 1u));
+
+            // Build the vector of arguments.
+            vec_args.clear();
+            for (std::size_t arg_idx = 0; arg_idx < nargs; ++arg_idx) {
+                vec_args.push_back(bld.CreateShuffleVector(orig_args[arg_idx], mask));
+            }
+
+            // Invoke the vector implementation and add the result to vec_results.
+            vec_results.push_back(llvm_invoke_external(s, vf_name, available_vec_t, vec_args, attrs));
+        }
+
+        // Reassemble vec_results into a large vector.
+        auto *ret = llvm::concatenateVectors(bld, vec_results);
+
+        // We need one last shuffle to trim the padded values at the end of ret (if any).
+        mask.clear();
+        for (std::uint32_t idx = 0; idx < vector_width; ++idx) {
+            mask.push_back(boost::numeric_cast<int>(idx));
+        }
+        return bld.CreateShuffleVector(ret, mask);
+    } else if (vfi_it->width == vector_width) {
+        // We have a vector implementation with exactly the correct width. Use it.
+        assert(vfi_it->nargs == nargs);
+
+        // Fetch the vector function name (either the low-precision
+        // or standard version).
+        const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
+        // Invoke it.
+        return llvm_invoke_external(s, vf_name, vec_t, {args...}, attrs);
+    } else {
+        // We have a vector implementation with SIMD width *greater than* vector_width. We need
+        // to pad the input arguments, invoke the SIMD implementation, trim the result and return.
+
+        // Fetch the width of the vector implementation.
+        const auto available_vector_width = vfi_it->width;
+        assert(available_vector_width > 0u);
+        assert(vfi_it->nargs == nargs);
+
+        // Fetch the vector type matching the chosen implementation.
+        auto *available_vec_t = make_vector_type(vec_t->getScalarType(), available_vector_width);
+
+        // Fetch the vector function name (either the low-precision or standard version).
+        const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
+        // Prepare the mask vector.
+        std::vector<int> mask;
+        mask.reserve(available_vector_width);
+        for (std::uint32_t idx = 0; idx < vector_width; ++idx) {
+            mask.push_back(boost::numeric_cast<int>(idx));
+        }
+        // Pad the mask with the last value in the original vector.
+        mask.insert(mask.end(), available_vector_width - vector_width, boost::numeric_cast<int>(vector_width - 1u));
+
+        // Prepare the vector of arguments for the invocation of the vector implementation.
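+        // NOTE: shuffling with the padded mask widens each argument from vector_width to
+        // available_vector_width elements (the result of a shuffle has as many elements as the mask).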
+ std::vector vec_args; + vec_args.reserve(nargs); + for (std::size_t arg_idx = 0; arg_idx < nargs; ++arg_idx) { + vec_args.push_back(bld.CreateShuffleVector(orig_args[arg_idx], mask)); + } + + // Invoke the vector implementation. + auto *ret = llvm_invoke_external(s, vf_name, available_vec_t, vec_args, attrs); + + // We need one last shuffle to trim the padded values at the end of ret. + mask.resize(vector_width); + return bld.CreateShuffleVector(ret, mask); + } +} + // Implementation of an LLVM math function built on top of an intrinsic (if possible). // intr_name is the name of the intrinsic (without type information), // f128/real_name are the names of the functions to be used for the @@ -584,9 +746,6 @@ llvm::Value *llvm_math_intr(llvm_state &s, const std::string &intr_name, auto &builder = s.builder(); - // Can we use the faster but less precise vectorised implementations? - const auto use_fast_math = builder.getFastMathFlags().approxFunc(); - if (llvm_stype_can_use_math_intrinsics(s, scal_t)) { // We can use the LLVM intrinsics for the given scalar type. @@ -598,48 +757,23 @@ llvm::Value *llvm_math_intr(llvm_state &s, const std::string &intr_name, const auto &vfi = lookup_vf_info(std::string(s_intr->getName())); if (auto *vec_t = llvm::dyn_cast(x_t)) { - // The inputs are vectors. Check if we have a vector implementation - // with the correct vector width in vfi. + // The inputs are vectors. Fetch their SIMD width. const auto vector_width = boost::numeric_cast(vec_t->getNumElements()); - const auto vfi_it - = std::lower_bound(vfi.begin(), vfi.end(), vector_width, - [](const auto &vfi_item, std::uint32_t n) { return vfi_item.width < n; }); - - if (vfi_it != vfi.end() && vfi_it->width == vector_width) { - // A vector implementation with precisely the correct width is available, use it. - assert(vfi_it->nargs == nargs); - // Fetch the vector function name (either the low-precision - // or standard version). - const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name; - - // NOTE: make sure to use the same attributes as the scalar intrinsic for the vector - // call. This ensures that the vector variant is declared with the same attributes as those that would - // be declared by invoking llvm_add_vfabi_attrs() on the scalar invocation. - return llvm_invoke_external(s, vf_name, vec_t, {args...}, s_intr->getAttributes()); - } - - if (!vfi.empty()) { - // We have *some* vector implementations available (albeit not with the correct - // size). Decompose into scalar calls adding the vfabi info to let the LLVM auto-vectorizer do its - // thing. - return llvm_scalarise_vector_call( - s, {args...}, - [&builder, s_intr](const std::vector &scal_args) { - return builder.CreateCall(s_intr, scal_args); - }, - vfi); + if (vector_width == 1u || vfi.empty()) { + // If the vector width is 1, or we do not have any vector implementation available, + // we let LLVM handle it. + return llvm_invoke_intrinsic(builder, intr_name, {x_t}, {args...}); + } else { + // The vector width is > 1 and we have one or more vector implementations available. Use them. + return llvm_invoke_vector_impl(s, vfi, s_intr->getAttributes(), args...); } - - // No vector implementation available, just let LLVM handle it. - // NOTE: this will lookup and invoke an intrinsic for vector arguments. - return llvm_invoke_intrinsic(builder, intr_name, {x_t}, {args...}); + } else { + // The input is **not** a vector. Invoke the scalar intrinsic attaching vector + // variants if available. 
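+        // NOTE: the vfabi attributes give the LLVM auto-vectorizer the option of replacing
+        // the scalar call with one of the available vector variants.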
+ auto *ret = builder.CreateCall(s_intr, {args...}); + return llvm_add_vfabi_attrs(s, ret, vfi); } - - // The input is **not** a vector. Invoke the scalar intrinsic attaching vector - // variants if available. - auto *ret = builder.CreateCall(s_intr, {args...}); - return llvm_add_vfabi_attrs(s, ret, vfi); } #if defined(HEYOKA_HAVE_REAL128) @@ -730,9 +864,6 @@ llvm::Value *llvm_math_cmath(llvm_state &s, const std::string &base_name, Args * auto &builder = s.builder(); - // Can we use the faster but less precise vectorised implementations? - const auto use_fast_math = builder.getFastMathFlags().approxFunc(); - // Determine the type and scalar type of the arguments. auto *x_t = arg_types[0]; auto *scal_t = x_t->getScalarType(); @@ -752,35 +883,23 @@ llvm::Value *llvm_math_cmath(llvm_state &s, const std::string &base_name, Args * const auto attrs = llvm_ext_math_func_attrs(s); if (auto *vec_t = llvm::dyn_cast(x_t)) { - // The inputs are vectors. Check if we have a vector implementation - // with the correct vector width in vfi. + // The inputs are vectors. Fetch their SIMD width. const auto vector_width = boost::numeric_cast(vec_t->getNumElements()); - const auto vfi_it - = std::lower_bound(vfi.begin(), vfi.end(), vector_width, - [](const auto &vfi_item, std::uint32_t n) { return vfi_item.width < n; }); - - if (vfi_it != vfi.end() && vfi_it->width == vector_width) { - // A vector implementation with precisely the correct width is available, use it. - assert(vfi_it->nargs == nargs); - // Fetch the vector function name (either the low-precision - // or standard version). - const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name; - - return llvm_invoke_external(s, vf_name, vec_t, {args...}, attrs); + if (vector_width == 1u || vfi.empty()) { + // If the vector width is 1, or we do not have any vector implementation available, + // we scalarise the function call. + return llvm_scalarise_ext_math_vector_call(s, {args...}, scal_name, vfi, attrs); + } else { + // The vector width is > 1 and we have one or more vector implementations available. Use them. + return llvm_invoke_vector_impl(s, vfi, attrs, args...); } - - // A vector implementation with the correct width is **not** available: scalarise the - // vector call. - // NOTE: if there are other vector implementations available, these will be made available - // to the autovectorizer via the info contained in vfi. - return llvm_scalarise_ext_math_vector_call(s, {args...}, scal_name, vfi, attrs); + } else { + // The input is **not** a vector. Invoke the scalar function attaching vector + // variants if available. + auto *ret = llvm_invoke_external(s, scal_name, scal_t, {args...}, attrs); + return llvm_add_vfabi_attrs(s, ret, vfi); } - - // The input is **not** a vector. Invoke the scalar function attaching vector - // variants if available. - auto *ret = llvm_invoke_external(s, scal_name, scal_t, {args...}, attrs); - return llvm_add_vfabi_attrs(s, ret, vfi); } #if defined(HEYOKA_HAVE_REAL) diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp index e3e3cd85c..58962ab07 100644 --- a/src/llvm_state.cpp +++ b/src/llvm_state.cpp @@ -144,6 +144,10 @@ static_assert(alignof(__float128) == alignof(mppp::real128)); // NOLINTNEXTLINE(cert-err58-cpp) const std::regex ppc_regex_pattern("pwr([1-9]*)"); +// Regex to check for AMD Zen processors. 
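+// The relevant CPU names have the form "znverN", where N is the Zen major version.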
+// NOLINTNEXTLINE(cert-err58-cpp)
+const std::regex zen_regex_pattern("znver([1-9]*)");
+
 // Helper function to detect specific features
 // on the host machine via LLVM's machinery.
 target_features get_target_features_impl()
 {
@@ -158,10 +162,15 @@ target_features get_target_features_impl()
         throw std::invalid_argument("Error creating the target machine");
     }
 
+    // Init the return value.
     target_features retval;
 
+    // Fetch the target name.
    const auto target_name = std::string{(*tm)->getTarget().getName()};
 
+    // Flag to signal if we are on a Zen>=4 CPU.
+    bool zen4_or_later = false;
+
     if (boost::starts_with(target_name, "x86")) {
         const auto t_features = (*tm)->getTargetFeatureString();
 
@@ -180,6 +189,22 @@ target_features get_target_features_impl()
         if (boost::algorithm::contains(t_features, "+sse2")) {
             retval.sse2 = true;
         }
+
+        // Check if we are on Zen version 4 or later.
+        const auto target_cpu = std::string{(*tm)->getTargetCPU()};
+        std::cmatch m;
+        if (std::regex_match(target_cpu.c_str(), m, zen_regex_pattern)) {
+            if (m.size() == 2u) {
+                // The CPU name matches and contains a subgroup.
+                // Extract the N from "znverN".
+                std::uint32_t zen_version{};
+                const auto ret = std::from_chars(m[1].first, m[1].second, zen_version);
+
+                if (ret.ec == std::errc{} && zen_version >= 4u) {
+                    zen4_or_later = true;
+                }
+            }
+        }
     }
 
     if (boost::starts_with(target_name, "aarch64")) {
@@ -198,7 +223,7 @@ target_features get_target_features_impl()
                 // The CPU name matches and contains a subgroup.
                 // Extract the N from "pwrN".
                 std::uint32_t pwr_idx{};
-                auto ret = std::from_chars(m[1].first, m[1].second, pwr_idx);
+                const auto ret = std::from_chars(m[1].first, m[1].second, pwr_idx);
 
                 // NOTE: it looks like VSX3 is supported from Power9,
                 // VSX from Power7.
@@ -217,12 +242,14 @@ target_features get_target_features_impl()
     }
 
     // Compute the recommended SIMD sizes.
-    if (retval.avx512f || retval.avx2 || retval.avx) {
-        // NOTE: keep the recommended SIMD size to
-        // 4/8 also for AVX512 due to perf issues in early
-        // implementations. Revisit this in the future, possibly
-        // making it conditional on the specific CPU model
-        // in use.
+    if (zen4_or_later) {
+        // NOTE: on zen>=4, it is convenient to use 512-bit vectors.
+        retval.simd_size_flt = 16;
+        retval.simd_size_dbl = 8;
+    } else if (retval.avx512f || retval.avx2 || retval.avx) {
+        // NOTE: on Intel processors with avx512, it is difficult
+        // to establish whether or not 512-bit vectors are worth it.
+        // Let us be prudent about this.
         retval.simd_size_flt = 8;
         retval.simd_size_dbl = 4;
     } else if (retval.sse2 || retval.aarch64 || retval.vsx || retval.vsx3) {
@@ -407,10 +434,25 @@ void optimise_module(llvm::Module &M, llvm::TargetMachine &tm, unsigned opt_leve
         f.setAttributes(attrs.addFnAttributes(ctx, new_attrs));
     }
 
-    // Force usage of AVX512 registers, if requested.
-    if (force_avx512 && get_target_features().avx512f) {
+    // AVX512 setup.
+    const auto &tf = get_target_features();
+    if (tf.avx512f) {
+        // NOTE: we enable 512-bit vectors if either forced by the
+        // user or if simd_size_dbl is 8 (which means that 512-bit vectors
+        // are a performance win).
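+        // (At the moment simd_size_dbl is set to 8 only on Zen>=4 CPUs,
+        // see get_target_features_impl().)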
+ if (force_avx512 || tf.simd_size_dbl == 8u) { + for (auto &f : M) { + f.addFnAttr("prefer-vector-width", "512"); + } + } + + // NOTE: explicitly disable scatter/gather when auto-vectorising + // as they can currently result in slowdowns: + // + // https://github.com/llvm/llvm-project/issues/91370 for (auto &f : M) { - f.addFnAttr("prefer-vector-width", "512"); + f.addFnAttr("no-gather"); + f.addFnAttr("no-scatter"); } } diff --git a/test/sin.cpp b/test/sin.cpp index de0eb13e8..c5016508a 100644 --- a/test/sin.cpp +++ b/test/sin.cpp @@ -339,50 +339,6 @@ TEST_CASE("vfabi double") REQUIRE(count == 11u); } - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; - - add_cfunc(s3, "cfunc", {sin(a)}, {a}, kw::batch_size = 3u); - add_cfunc(s3, "cfuncs", {sin(a)}, {a}, kw::batch_size = 3u, kw::strided = true); - - s3.compile(); - - auto *cf3_ptr = reinterpret_cast( - s3.jit_lookup("cfunc")); - - std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; - - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - - REQUIRE(outs3[0] == approximately(std::sin(1.))); - REQUIRE(outs3[1] == approximately(std::sin(2.))); - REQUIRE(outs3[2] == approximately(std::sin(3.))); - - ir = s3.get_ir(); - - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } - - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } - -#if LLVM_VERSION_MAJOR >= 16 - - if (tf.aarch64) { - REQUIRE(count == 3u); - } - -#endif - #endif } } @@ -516,52 +472,41 @@ TEST_CASE("vfabi float") REQUIRE(count == 19u); } - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; - - add_cfunc(s3, "cfunc", {sin(a)}, {a}, kw::batch_size = 5u); - add_cfunc(s3, "cfuncs", {sin(a)}, {a}, kw::strided = true, kw::batch_size = 5u); +#endif + } +} - s3.compile(); +// This is a test to check the machinery to invoke vector functions +// on vectors with nonstandard SIMD sizes. 
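+// The batch sizes used below (3, 17, 20, 23) do not match any native SIMD width,
+// which exercises the splitting/padding logic in llvm_invoke_vector_impl().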
+TEST_CASE("nonstandard batch sizes") +{ + auto [x, y] = make_vars("x", "y"); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto ex = sin(x) + cos(x); - std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; + std::vector in, out; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + for (auto batch_size : {3u, 17u, 20u, 23u}) { + for (auto cm : {false, true}) { + for (auto opt_level : {0u, 1u, 2u, 3u}) { + llvm_state s{kw::opt_level = opt_level}; - REQUIRE(outs3[0] == approximately(std::sin(1.f))); - REQUIRE(outs3[1] == approximately(std::sin(2.f))); - REQUIRE(outs3[2] == approximately(std::sin(3.f))); - REQUIRE(outs3[3] == approximately(std::sin(4.f))); - REQUIRE(outs3[4] == approximately(std::sin(5.f))); + add_cfunc(s, "cf", {ex}, {x, y}, kw::batch_size = batch_size, kw::compact_mode = cm); - ir = s3.get_ir(); + s.compile(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cf")); - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + in.resize(2u * batch_size, .3); + out.clear(); + out.resize(batch_size); -#if LLVM_VERSION_MAJOR >= 16 + cf_ptr(out.data(), in.data(), nullptr, nullptr); - if (tf.aarch64) { - REQUIRE(count == 3u); + std::ranges::for_each(out, + [](auto val) { REQUIRE(val == approximately(std::sin(.3) + std::cos(.3))); }); + } } - -#endif - -#endif } } diff --git a/test/sinh.cpp b/test/sinh.cpp index 993cb04c5..7f5619497 100644 --- a/test/sinh.cpp +++ b/test/sinh.cpp @@ -326,47 +326,6 @@ TEST_CASE("vfabi double") // 5-argument version. REQUIRE(count == 11u); } - - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; - - add_cfunc(s3, "cfunc", {sinh(a)}, {a}, kw::batch_size = 3u); - add_cfunc(s3, "cfuncs", {sinh(a)}, {a}, kw::batch_size = 3u, kw::strided = true); - - s3.compile(); - - auto *cf3_ptr = reinterpret_cast( - s3.jit_lookup("cfunc")); - - std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; - - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - - REQUIRE(outs3[0] == approximately(std::sinh(1.))); - REQUIRE(outs3[1] == approximately(std::sinh(2.))); - REQUIRE(outs3[2] == approximately(std::sinh(3.))); - - ir = s3.get_ir(); - - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } - - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } - - if (tf.aarch64) { - REQUIRE(count == 3u); - } - #endif } } @@ -498,49 +457,41 @@ TEST_CASE("vfabi float") // 9-argument version. REQUIRE(count == 19u); } +#endif + } +} - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; +// This is a test to check the machinery to invoke vector functions +// on vectors with nonstandard SIMD sizes. 
+TEST_CASE("nonstandard batch sizes") +{ + auto [x, y] = make_vars("x", "y"); - add_cfunc(s3, "cfunc", {sinh(a)}, {a}, kw::batch_size = 5u); - add_cfunc(s3, "cfuncs", {sinh(a)}, {a}, kw::batch_size = 5u, kw::strided = true); + auto ex = sinh(x) + cosh(x); - s3.compile(); + std::vector in, out; - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + for (auto batch_size : {3u, 17u, 20u, 23u}) { + for (auto cm : {false, true}) { + for (auto opt_level : {0u, 1u, 2u, 3u}) { + llvm_state s{kw::opt_level = opt_level}; - std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; + add_cfunc(s, "cf", {ex}, {x, y}, kw::batch_size = batch_size, kw::compact_mode = cm); - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + s.compile(); - REQUIRE(outs3[0] == approximately(std::sinh(1.f))); - REQUIRE(outs3[1] == approximately(std::sinh(2.f))); - REQUIRE(outs3[2] == approximately(std::sinh(3.f))); - REQUIRE(outs3[3] == approximately(std::sinh(4.f))); - REQUIRE(outs3[4] == approximately(std::sinh(5.f))); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cf")); - ir = s3.get_ir(); + in.resize(2u * batch_size, .3); + out.clear(); + out.resize(batch_size); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + cf_ptr(out.data(), in.data(), nullptr, nullptr); - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } - - if (tf.aarch64) { - REQUIRE(count == 3u); + std::ranges::for_each(out, + [](auto val) { REQUIRE(val == approximately(std::sinh(.3) + std::cosh(.3))); }); + } } - -#endif } }