diff --git a/doc/changelog.rst b/doc/changelog.rst
index e0492dd25..389e5aa24 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -13,6 +13,10 @@ New
 Changes
 ~~~~~~~
 
+- Enable the use of 512-bit vectors on Zen>=4 CPUs
+  (`#464 `__).
+- Improve support for non-standard SIMD sizes in vector functions
+  (`#464 `__).
 - The parallel compilation feature has been temporarily disabled due to
   several LLVM bugs
   (`#463 `__).
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index 0c0954b21..4fb02b048 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -37,6 +37,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -548,6 +549,167 @@ llvm::Value *llvm_scalarise_ext_math_vector_call(llvm_state &s, const std::vecto
         vfi);
 }
 
+// Helper to invoke an external vector function with arguments args, automatically handling
+// mismatches between the width of the vector function and the width of the arguments.
+//
+// vfi is a vector of vf_info instances listing the available implementations of the vector function
+// (each one supporting a different vector width). attrs is the set of attributes to attach to the
+// invocation(s) of the vector function.
+//
+// This function has several preconditions:
+//
+// - there must be at least 1 arg,
+// - vfi cannot be empty,
+// - all args must be vectors of the same type with a size greater than 1.
+llvm::Value *llvm_invoke_vector_impl(llvm_state &s, const auto &vfi, const auto &attrs, auto *...args)
+{
+    constexpr auto nargs = sizeof...(args);
+    static_assert(nargs > 0u);
+    static_assert((std::same_as<decltype(args), llvm::Value *> && ...));
+
+    assert(((args != nullptr) && ...));
+    assert(!vfi.empty());
+
+    // Build an array with the original arguments.
+    const std::array orig_args{args...};
+
+    // Check that all arguments have the same type.
+    auto *x_t = orig_args[0]->getType();
+    assert(((args->getType() == x_t) && ...));
+
+    // Ensure that the arguments are vectors.
+    auto *vec_t = llvm::dyn_cast<llvm::FixedVectorType>(x_t);
+    assert(vec_t != nullptr);
+
+    // Fetch the vector width.
+    const auto vector_width = boost::numeric_cast<std::uint32_t>(vec_t->getNumElements());
+    assert(vector_width > 1u);
+
+    // Fetch the builder.
+    auto &bld = s.builder();
+
+    // Can we use the faster but less precise vectorised implementations?
+    const auto use_fast_math = bld.getFastMathFlags().approxFunc();
+
+    // Look up a vector implementation with width *greater than or equal to* vector_width.
+    auto vfi_it = std::lower_bound(vfi.begin(), vfi.end(), vector_width,
+                                   [](const auto &vfi_item, std::uint32_t n) { return vfi_item.width < n; });
+
+    if (vfi_it == vfi.end()) {
+        // All vector implementations have a SIMD width *less than* vector_width. We will need
+        // to decompose the vector arguments into smaller vectors, perform the calculations
+        // on the smaller vectors, and reassemble the results into a single large vector.
+
+        // Step back to the widest available vector implementation and fetch its width.
+        --vfi_it;
+        const auto available_vector_width = vfi_it->width;
+        assert(available_vector_width > 0u);
+        assert(vfi_it->nargs == nargs);
+
+        // Fetch the vector type matching the chosen implementation.
+        auto *available_vec_t = make_vector_type(vec_t->getScalarType(), available_vector_width);
+
+        // Fetch the vector function name (either the low-precision or standard version).
+        const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
+        // Compute the number of chunks into which the original vector arguments will be split.
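+        // NOTE: this is a ceiling division, i.e., n_chunks == ceil(vector_width / available_vector_width)
+        // computed with integer arithmetic.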
+        const auto n_chunks = vector_width / available_vector_width
+                              + static_cast<std::uint32_t>(vector_width % available_vector_width != 0u);
+
+        // Prepare the vector of results of the invocations of the vector implementations.
+        std::vector<llvm::Value *> vec_results;
+        vec_results.reserve(n_chunks);
+
+        // Prepare the vector of arguments for the invocations of the vector implementations.
+        std::vector<llvm::Value *> vec_args;
+        vec_args.reserve(nargs);
+
+        // Prepare the mask vector.
+        std::vector<int> mask;
+        mask.reserve(vector_width);
+
+        for (std::uint32_t i = 0; i < n_chunks; ++i) {
+            // Construct the mask vector for the current iteration.
+            mask.clear();
+            const auto chunk_begin = i * available_vector_width;
+            // NOTE: special case for the last iteration.
+            const auto chunk_end = (i == n_chunks - 1u) ? vector_width : (chunk_begin + available_vector_width);
+            for (auto idx = chunk_begin; idx != chunk_end; ++idx) {
+                mask.push_back(boost::numeric_cast<int>(idx));
+            }
+            // Pad the mask if needed (this will happen only at the last iteration).
+            // NOTE: the pad value is the last value in the original (large) vector.
+            mask.insert(mask.end(), available_vector_width - mask.size(), boost::numeric_cast<int>(vector_width - 1u));
+
+            // Build the vector of arguments.
+            vec_args.clear();
+            for (std::size_t arg_idx = 0; arg_idx < nargs; ++arg_idx) {
+                vec_args.push_back(bld.CreateShuffleVector(orig_args[arg_idx], mask));
+            }
+
+            // Invoke the vector implementation and add the result to vec_results.
+            vec_results.push_back(llvm_invoke_external(s, vf_name, available_vec_t, vec_args, attrs));
+        }
+
+        // Reassemble vec_results into a large vector.
+        auto *ret = llvm::concatenateVectors(bld, vec_results);
+
+        // We need one last shuffle to trim the padded values at the end of ret (if any).
+        mask.clear();
+        for (std::uint32_t idx = 0; idx < vector_width; ++idx) {
+            mask.push_back(boost::numeric_cast<int>(idx));
+        }
+        return bld.CreateShuffleVector(ret, mask);
+    } else if (vfi_it->width == vector_width) {
+        // We have a vector implementation with exactly the correct width. Use it.
+        assert(vfi_it->nargs == nargs);
+
+        // Fetch the vector function name (either the low-precision
+        // or standard version).
+        const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
+        // Invoke it.
+        return llvm_invoke_external(s, vf_name, vec_t, {args...}, attrs);
+    } else {
+        // We have a vector implementation with SIMD width *greater than* vector_width. We need
+        // to pad the input arguments, invoke the SIMD implementation, trim the result and return.
+
+        // Fetch the width of the vector implementation.
+        const auto available_vector_width = vfi_it->width;
+        assert(available_vector_width > 0u);
+        assert(vfi_it->nargs == nargs);
+
+        // Fetch the vector type matching the chosen implementation.
+        auto *available_vec_t = make_vector_type(vec_t->getScalarType(), available_vector_width);
+
+        // Fetch the vector function name (either the low-precision or standard version).
+        const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
+        // Prepare the mask vector.
+        std::vector<int> mask;
+        mask.reserve(available_vector_width);
+        for (std::uint32_t idx = 0; idx < vector_width; ++idx) {
+            mask.push_back(boost::numeric_cast<int>(idx));
+        }
+        // Pad the mask with the last value in the original vector.
+        mask.insert(mask.end(), available_vector_width - vector_width, boost::numeric_cast<int>(vector_width - 1u));
+
+        // Prepare the vector of arguments for the invocation of the vector implementation.
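+        // NOTE: shuffling with the padded mask widens each argument from vector_width to
+        // available_vector_width elements (the result of a shuffle has as many elements as the mask).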
+ std::vector vec_args; + vec_args.reserve(nargs); + for (std::size_t arg_idx = 0; arg_idx < nargs; ++arg_idx) { + vec_args.push_back(bld.CreateShuffleVector(orig_args[arg_idx], mask)); + } + + // Invoke the vector implementation. + auto *ret = llvm_invoke_external(s, vf_name, available_vec_t, vec_args, attrs); + + // We need one last shuffle to trim the padded values at the end of ret. + mask.resize(vector_width); + return bld.CreateShuffleVector(ret, mask); + } +} + // Implementation of an LLVM math function built on top of an intrinsic (if possible). // intr_name is the name of the intrinsic (without type information), // f128/real_name are the names of the functions to be used for the @@ -584,9 +746,6 @@ llvm::Value *llvm_math_intr(llvm_state &s, const std::string &intr_name, auto &builder = s.builder(); - // Can we use the faster but less precise vectorised implementations? - const auto use_fast_math = builder.getFastMathFlags().approxFunc(); - if (llvm_stype_can_use_math_intrinsics(s, scal_t)) { // We can use the LLVM intrinsics for the given scalar type. @@ -598,48 +757,23 @@ llvm::Value *llvm_math_intr(llvm_state &s, const std::string &intr_name, const auto &vfi = lookup_vf_info(std::string(s_intr->getName())); if (auto *vec_t = llvm::dyn_cast(x_t)) { - // The inputs are vectors. Check if we have a vector implementation - // with the correct vector width in vfi. + // The inputs are vectors. Fetch their SIMD width. const auto vector_width = boost::numeric_cast(vec_t->getNumElements()); - const auto vfi_it - = std::lower_bound(vfi.begin(), vfi.end(), vector_width, - [](const auto &vfi_item, std::uint32_t n) { return vfi_item.width < n; }); - - if (vfi_it != vfi.end() && vfi_it->width == vector_width) { - // A vector implementation with precisely the correct width is available, use it. - assert(vfi_it->nargs == nargs); - // Fetch the vector function name (either the low-precision - // or standard version). - const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name; - - // NOTE: make sure to use the same attributes as the scalar intrinsic for the vector - // call. This ensures that the vector variant is declared with the same attributes as those that would - // be declared by invoking llvm_add_vfabi_attrs() on the scalar invocation. - return llvm_invoke_external(s, vf_name, vec_t, {args...}, s_intr->getAttributes()); - } - - if (!vfi.empty()) { - // We have *some* vector implementations available (albeit not with the correct - // size). Decompose into scalar calls adding the vfabi info to let the LLVM auto-vectorizer do its - // thing. - return llvm_scalarise_vector_call( - s, {args...}, - [&builder, s_intr](const std::vector &scal_args) { - return builder.CreateCall(s_intr, scal_args); - }, - vfi); + if (vector_width == 1u || vfi.empty()) { + // If the vector width is 1, or we do not have any vector implementation available, + // we let LLVM handle it. + return llvm_invoke_intrinsic(builder, intr_name, {x_t}, {args...}); + } else { + // The vector width is > 1 and we have one or more vector implementations available. Use them. + return llvm_invoke_vector_impl(s, vfi, s_intr->getAttributes(), args...); } - - // No vector implementation available, just let LLVM handle it. - // NOTE: this will lookup and invoke an intrinsic for vector arguments. - return llvm_invoke_intrinsic(builder, intr_name, {x_t}, {args...}); + } else { + // The input is **not** a vector. Invoke the scalar intrinsic attaching vector + // variants if available. 
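+        // NOTE: the vfabi attributes give the LLVM auto-vectorizer the option of replacing
+        // the scalar call with one of the available vector variants.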
+ auto *ret = builder.CreateCall(s_intr, {args...}); + return llvm_add_vfabi_attrs(s, ret, vfi); } - - // The input is **not** a vector. Invoke the scalar intrinsic attaching vector - // variants if available. - auto *ret = builder.CreateCall(s_intr, {args...}); - return llvm_add_vfabi_attrs(s, ret, vfi); } #if defined(HEYOKA_HAVE_REAL128) @@ -730,9 +864,6 @@ llvm::Value *llvm_math_cmath(llvm_state &s, const std::string &base_name, Args * auto &builder = s.builder(); - // Can we use the faster but less precise vectorised implementations? - const auto use_fast_math = builder.getFastMathFlags().approxFunc(); - // Determine the type and scalar type of the arguments. auto *x_t = arg_types[0]; auto *scal_t = x_t->getScalarType(); @@ -752,35 +883,23 @@ llvm::Value *llvm_math_cmath(llvm_state &s, const std::string &base_name, Args * const auto attrs = llvm_ext_math_func_attrs(s); if (auto *vec_t = llvm::dyn_cast(x_t)) { - // The inputs are vectors. Check if we have a vector implementation - // with the correct vector width in vfi. + // The inputs are vectors. Fetch their SIMD width. const auto vector_width = boost::numeric_cast(vec_t->getNumElements()); - const auto vfi_it - = std::lower_bound(vfi.begin(), vfi.end(), vector_width, - [](const auto &vfi_item, std::uint32_t n) { return vfi_item.width < n; }); - - if (vfi_it != vfi.end() && vfi_it->width == vector_width) { - // A vector implementation with precisely the correct width is available, use it. - assert(vfi_it->nargs == nargs); - // Fetch the vector function name (either the low-precision - // or standard version). - const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name; - - return llvm_invoke_external(s, vf_name, vec_t, {args...}, attrs); + if (vector_width == 1u || vfi.empty()) { + // If the vector width is 1, or we do not have any vector implementation available, + // we scalarise the function call. + return llvm_scalarise_ext_math_vector_call(s, {args...}, scal_name, vfi, attrs); + } else { + // The vector width is > 1 and we have one or more vector implementations available. Use them. + return llvm_invoke_vector_impl(s, vfi, attrs, args...); } - - // A vector implementation with the correct width is **not** available: scalarise the - // vector call. - // NOTE: if there are other vector implementations available, these will be made available - // to the autovectorizer via the info contained in vfi. - return llvm_scalarise_ext_math_vector_call(s, {args...}, scal_name, vfi, attrs); + } else { + // The input is **not** a vector. Invoke the scalar function attaching vector + // variants if available. + auto *ret = llvm_invoke_external(s, scal_name, scal_t, {args...}, attrs); + return llvm_add_vfabi_attrs(s, ret, vfi); } - - // The input is **not** a vector. Invoke the scalar function attaching vector - // variants if available. - auto *ret = llvm_invoke_external(s, scal_name, scal_t, {args...}, attrs); - return llvm_add_vfabi_attrs(s, ret, vfi); } #if defined(HEYOKA_HAVE_REAL) diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp index e3e3cd85c..58962ab07 100644 --- a/src/llvm_state.cpp +++ b/src/llvm_state.cpp @@ -144,6 +144,10 @@ static_assert(alignof(__float128) == alignof(mppp::real128)); // NOLINTNEXTLINE(cert-err58-cpp) const std::regex ppc_regex_pattern("pwr([1-9]*)"); +// Regex to check for AMD Zen processors. 
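+// The relevant CPU names have the form "znverN", where N is the Zen major version.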
+// NOLINTNEXTLINE(cert-err58-cpp)
+const std::regex zen_regex_pattern("znver([1-9]*)");
+
 // Helper function to detect specific features
 // on the host machine via LLVM's machinery.
 target_features get_target_features_impl()
 {
@@ -158,10 +162,15 @@ target_features get_target_features_impl()
         throw std::invalid_argument("Error creating the target machine");
     }
 
+    // Init the return value.
     target_features retval;
 
+    // Fetch the target name.
    const auto target_name = std::string{(*tm)->getTarget().getName()};
 
+    // Flag to signal if we are on a Zen>=4 CPU.
+    bool zen4_or_later = false;
+
     if (boost::starts_with(target_name, "x86")) {
         const auto t_features = (*tm)->getTargetFeatureString();
 
@@ -180,6 +189,22 @@ target_features get_target_features_impl()
         if (boost::algorithm::contains(t_features, "+sse2")) {
             retval.sse2 = true;
         }
+
+        // Check if we are on Zen version 4 or later.
+        const auto target_cpu = std::string{(*tm)->getTargetCPU()};
+        std::cmatch m;
+        if (std::regex_match(target_cpu.c_str(), m, zen_regex_pattern)) {
+            if (m.size() == 2u) {
+                // The CPU name matches and contains a subgroup.
+                // Extract the N from "znverN".
+                std::uint32_t zen_version{};
+                const auto ret = std::from_chars(m[1].first, m[1].second, zen_version);
+
+                if (ret.ec == std::errc{} && zen_version >= 4u) {
+                    zen4_or_later = true;
+                }
+            }
+        }
     }
 
     if (boost::starts_with(target_name, "aarch64")) {
@@ -198,7 +223,7 @@ target_features get_target_features_impl()
                 // The CPU name matches and contains a subgroup.
                 // Extract the N from "pwrN".
                 std::uint32_t pwr_idx{};
-                auto ret = std::from_chars(m[1].first, m[1].second, pwr_idx);
+                const auto ret = std::from_chars(m[1].first, m[1].second, pwr_idx);
 
                 // NOTE: it looks like VSX3 is supported from Power9,
                 // VSX from Power7.
@@ -217,12 +242,14 @@ target_features get_target_features_impl()
     }
 
     // Compute the recommended SIMD sizes.
-    if (retval.avx512f || retval.avx2 || retval.avx) {
-        // NOTE: keep the recommended SIMD size to
-        // 4/8 also for AVX512 due to perf issues in early
-        // implementations. Revisit this in the future, possibly
-        // making it conditional on the specific CPU model
-        // in use.
+    if (zen4_or_later) {
+        // NOTE: on zen>=4, it is convenient to use 512-bit vectors.
+        retval.simd_size_flt = 16;
+        retval.simd_size_dbl = 8;
+    } else if (retval.avx512f || retval.avx2 || retval.avx) {
+        // NOTE: on Intel processors with avx512, it is difficult
+        // to establish whether or not 512-bit vectors are worth it.
+        // Let us be prudent about this.
         retval.simd_size_flt = 8;
         retval.simd_size_dbl = 4;
     } else if (retval.sse2 || retval.aarch64 || retval.vsx || retval.vsx3) {
@@ -407,10 +434,25 @@ void optimise_module(llvm::Module &M, llvm::TargetMachine &tm, unsigned opt_leve
         f.setAttributes(attrs.addFnAttributes(ctx, new_attrs));
     }
 
-    // Force usage of AVX512 registers, if requested.
-    if (force_avx512 && get_target_features().avx512f) {
+    // AVX512 setup.
+    const auto &tf = get_target_features();
+    if (tf.avx512f) {
+        // NOTE: we enable 512-bit vectors if either forced by the
+        // user or if simd_size_dbl is 8 (which means that 512-bit vectors
+        // are a performance win).
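+        // (At the moment simd_size_dbl is set to 8 only on Zen>=4 CPUs,
+        // see get_target_features_impl().)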
+ if (force_avx512 || tf.simd_size_dbl == 8u) { + for (auto &f : M) { + f.addFnAttr("prefer-vector-width", "512"); + } + } + + // NOTE: explicitly disable scatter/gather when auto-vectorising + // as they can currently result in slowdowns: + // + // https://github.com/llvm/llvm-project/issues/91370 for (auto &f : M) { - f.addFnAttr("prefer-vector-width", "512"); + f.addFnAttr("no-gather"); + f.addFnAttr("no-scatter"); } } diff --git a/test/sin.cpp b/test/sin.cpp index de0eb13e8..c5016508a 100644 --- a/test/sin.cpp +++ b/test/sin.cpp @@ -339,50 +339,6 @@ TEST_CASE("vfabi double") REQUIRE(count == 11u); } - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; - - add_cfunc(s3, "cfunc", {sin(a)}, {a}, kw::batch_size = 3u); - add_cfunc(s3, "cfuncs", {sin(a)}, {a}, kw::batch_size = 3u, kw::strided = true); - - s3.compile(); - - auto *cf3_ptr = reinterpret_cast( - s3.jit_lookup("cfunc")); - - std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; - - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - - REQUIRE(outs3[0] == approximately(std::sin(1.))); - REQUIRE(outs3[1] == approximately(std::sin(2.))); - REQUIRE(outs3[2] == approximately(std::sin(3.))); - - ir = s3.get_ir(); - - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } - - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } - -#if LLVM_VERSION_MAJOR >= 16 - - if (tf.aarch64) { - REQUIRE(count == 3u); - } - -#endif - #endif } } @@ -516,52 +472,41 @@ TEST_CASE("vfabi float") REQUIRE(count == 19u); } - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; - - add_cfunc(s3, "cfunc", {sin(a)}, {a}, kw::batch_size = 5u); - add_cfunc(s3, "cfuncs", {sin(a)}, {a}, kw::strided = true, kw::batch_size = 5u); +#endif + } +} - s3.compile(); +// This is a test to check the machinery to invoke vector functions +// on vectors with nonstandard SIMD sizes. 
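+// The batch sizes used below (3, 17, 20, 23) do not match any native SIMD width,
+// which exercises the splitting/padding logic in llvm_invoke_vector_impl().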
+TEST_CASE("nonstandard batch sizes") +{ + auto [x, y] = make_vars("x", "y"); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto ex = sin(x) + cos(x); - std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; + std::vector in, out; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + for (auto batch_size : {3u, 17u, 20u, 23u}) { + for (auto cm : {false, true}) { + for (auto opt_level : {0u, 1u, 2u, 3u}) { + llvm_state s{kw::opt_level = opt_level}; - REQUIRE(outs3[0] == approximately(std::sin(1.f))); - REQUIRE(outs3[1] == approximately(std::sin(2.f))); - REQUIRE(outs3[2] == approximately(std::sin(3.f))); - REQUIRE(outs3[3] == approximately(std::sin(4.f))); - REQUIRE(outs3[4] == approximately(std::sin(5.f))); + add_cfunc(s, "cf", {ex}, {x, y}, kw::batch_size = batch_size, kw::compact_mode = cm); - ir = s3.get_ir(); + s.compile(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cf")); - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + in.resize(2u * batch_size, .3); + out.clear(); + out.resize(batch_size); -#if LLVM_VERSION_MAJOR >= 16 + cf_ptr(out.data(), in.data(), nullptr, nullptr); - if (tf.aarch64) { - REQUIRE(count == 3u); + std::ranges::for_each(out, + [](auto val) { REQUIRE(val == approximately(std::sin(.3) + std::cos(.3))); }); + } } - -#endif - -#endif } } diff --git a/test/sinh.cpp b/test/sinh.cpp index 993cb04c5..7f5619497 100644 --- a/test/sinh.cpp +++ b/test/sinh.cpp @@ -326,47 +326,6 @@ TEST_CASE("vfabi double") // 5-argument version. REQUIRE(count == 11u); } - - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; - - add_cfunc(s3, "cfunc", {sinh(a)}, {a}, kw::batch_size = 3u); - add_cfunc(s3, "cfuncs", {sinh(a)}, {a}, kw::batch_size = 3u, kw::strided = true); - - s3.compile(); - - auto *cf3_ptr = reinterpret_cast( - s3.jit_lookup("cfunc")); - - std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; - - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - - REQUIRE(outs3[0] == approximately(std::sinh(1.))); - REQUIRE(outs3[1] == approximately(std::sinh(2.))); - REQUIRE(outs3[2] == approximately(std::sinh(3.))); - - ir = s3.get_ir(); - - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } - - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } - - if (tf.aarch64) { - REQUIRE(count == 3u); - } - #endif } } @@ -498,49 +457,41 @@ TEST_CASE("vfabi float") // 9-argument version. REQUIRE(count == 19u); } +#endif + } +} - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; +// This is a test to check the machinery to invoke vector functions +// on vectors with nonstandard SIMD sizes. 
+TEST_CASE("nonstandard batch sizes") +{ + auto [x, y] = make_vars("x", "y"); - add_cfunc(s3, "cfunc", {sinh(a)}, {a}, kw::batch_size = 5u); - add_cfunc(s3, "cfuncs", {sinh(a)}, {a}, kw::batch_size = 5u, kw::strided = true); + auto ex = sinh(x) + cosh(x); - s3.compile(); + std::vector in, out; - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + for (auto batch_size : {3u, 17u, 20u, 23u}) { + for (auto cm : {false, true}) { + for (auto opt_level : {0u, 1u, 2u, 3u}) { + llvm_state s{kw::opt_level = opt_level}; - std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; + add_cfunc(s, "cf", {ex}, {x, y}, kw::batch_size = batch_size, kw::compact_mode = cm); - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + s.compile(); - REQUIRE(outs3[0] == approximately(std::sinh(1.f))); - REQUIRE(outs3[1] == approximately(std::sinh(2.f))); - REQUIRE(outs3[2] == approximately(std::sinh(3.f))); - REQUIRE(outs3[3] == approximately(std::sinh(4.f))); - REQUIRE(outs3[4] == approximately(std::sinh(5.f))); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cf")); - ir = s3.get_ir(); + in.resize(2u * batch_size, .3); + out.clear(); + out.resize(batch_size); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + cf_ptr(out.data(), in.data(), nullptr, nullptr); - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } - - if (tf.aarch64) { - REQUIRE(count == 3u); + std::ranges::for_each(out, + [](auto val) { REQUIRE(val == approximately(std::sinh(.3) + std::cosh(.3))); }); + } } - -#endif } }