diff --git a/.github/workflows/gha_ci.yml b/.github/workflows/gha_ci.yml index 81cba6d3e..ee5e0f980 100644 --- a/.github/workflows/gha_ci.yml +++ b/.github/workflows/gha_ci.yml @@ -98,15 +98,3 @@ jobs: - uses: actions/checkout@v4 - name: Build run: bash tools/gha_llvm15_conda_asan.sh - conda_llvm14_asan: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Build - run: bash tools/gha_llvm14_conda_asan.sh - conda_llvm13_asan: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Build - run: bash tools/gha_llvm13_conda_asan.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index be252abd3..86717c759 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -project(heyoka VERSION 5.1.0 LANGUAGES CXX C) +project(heyoka VERSION 6.0.0 LANGUAGES CXX C) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/yacma") @@ -158,8 +158,8 @@ endif() include(GNUInstallDirs) find_package(LLVM REQUIRED CONFIG) -if(${LLVM_VERSION_MAJOR} LESS 13 OR ${LLVM_VERSION_MAJOR} GREATER 18) - message(FATAL_ERROR "LLVM >= 13 and <= 18 is required.") +if(${LLVM_VERSION_MAJOR} LESS 15 OR ${LLVM_VERSION_MAJOR} GREATER 18) + message(FATAL_ERROR "LLVM >= 15 and <= 18 is required.") endif() # List of source files. @@ -334,7 +334,7 @@ if(HEYOKA_WITH_SLEEF) endif() # Setup the heyoka ABI version number. -set(HEYOKA_ABI_VERSION 29) +set(HEYOKA_ABI_VERSION 30) if(HEYOKA_BUILD_STATIC_LIBRARY) # Setup of the heyoka static library. diff --git a/doc/install.rst b/doc/install.rst index 64d638c87..7beed7bf4 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -14,7 +14,7 @@ and several CPU architectures (x86-64, 64-bit ARM and 64-bit PowerPC). heyoka has the following **mandatory** dependencies: -* the `LLVM `__ compiler infrastructure library (version >=13 and <=18), +* the `LLVM `__ compiler infrastructure library (version >=15 and <=18), * the `Boost `__ C++ libraries (version >=1.69), * the `{fmt} `__ library (version >=9 and <=11), * the `spdlog `__ library, diff --git a/include/heyoka/detail/fwd_decl.hpp b/include/heyoka/detail/fwd_decl.hpp index b723469ce..0bdb24d61 100644 --- a/include/heyoka/detail/fwd_decl.hpp +++ b/include/heyoka/detail/fwd_decl.hpp @@ -26,6 +26,7 @@ class HEYOKA_DLL_PUBLIC func; class HEYOKA_DLL_PUBLIC param; class HEYOKA_DLL_PUBLIC llvm_state; +class HEYOKA_DLL_PUBLIC llvm_multi_state; template class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive; diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp index c314c2105..1c021ecfa 100644 --- a/include/heyoka/llvm_state.hpp +++ b/include/heyoka/llvm_state.hpp @@ -12,14 +12,15 @@ #include #include +#include #include -#include #include #include #include #include #include #include +#include #include @@ -78,6 +79,7 @@ HEYOKA_DLL_PUBLIC const target_features &get_target_features(); } // namespace detail HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_state &); +HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &); template inline std::uint32_t recommended_simd_size() @@ -132,6 +134,8 @@ HEYOKA_BEGIN_NAMESPACE class HEYOKA_DLL_PUBLIC llvm_state { friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_state &); + friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &); + friend class HEYOKA_DLL_PUBLIC llvm_multi_state; struct jit; @@ -337,15 +341,65 @@ namespace detail // The value contained in the 
in-memory cache. struct llvm_mc_value { - std::string opt_bc, opt_ir, obj; + std::vector opt_bc, opt_ir, obj; + + std::size_t total_size() const; }; // Cache lookup and insertion. -std::optional llvm_state_mem_cache_lookup(const std::string &, unsigned); -void llvm_state_mem_cache_try_insert(std::string, unsigned, llvm_mc_value); +std::optional llvm_state_mem_cache_lookup(const std::vector &, unsigned); +void llvm_state_mem_cache_try_insert(std::vector, unsigned, llvm_mc_value); } // namespace detail +class HEYOKA_DLL_PUBLIC llvm_multi_state +{ + friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &); + + struct impl; + + std::unique_ptr m_impl; + + HEYOKA_DLL_LOCAL void compile_impl(); + HEYOKA_DLL_LOCAL void add_obj_triggers(); + + // Check functions. + HEYOKA_DLL_LOCAL void check_compiled(const char *) const; + HEYOKA_DLL_LOCAL void check_uncompiled(const char *) const; + + friend class boost::serialization::access; + void save(boost::archive::binary_oarchive &, unsigned) const; + void load(boost::archive::binary_iarchive &, unsigned); + BOOST_SERIALIZATION_SPLIT_MEMBER() + +public: + llvm_multi_state(); + explicit llvm_multi_state(std::vector); + llvm_multi_state(const llvm_multi_state &); + llvm_multi_state(llvm_multi_state &&) noexcept; + llvm_multi_state &operator=(const llvm_multi_state &); + llvm_multi_state &operator=(llvm_multi_state &&) noexcept; + ~llvm_multi_state(); + + [[nodiscard]] bool is_compiled() const noexcept; + + [[nodiscard]] unsigned get_n_modules() const noexcept; + + [[nodiscard]] bool fast_math() const noexcept; + [[nodiscard]] bool force_avx512() const noexcept; + [[nodiscard]] unsigned get_opt_level() const noexcept; + [[nodiscard]] bool get_slp_vectorize() const noexcept; + [[nodiscard]] code_model get_code_model() const noexcept; + + [[nodiscard]] std::vector get_ir() const; + [[nodiscard]] std::vector get_bc() const; + [[nodiscard]] const std::vector &get_object_code() const; + + void compile(); + + std::uintptr_t jit_lookup(const std::string &); +}; + HEYOKA_END_NAMESPACE // Archive version changelog: diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index f32c4c6fa..279a7609e 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -271,6 +272,74 @@ llvm::AttributeList llvm_ext_math_func_attrs(llvm_state &s) return f->getAttributes(); } +// Add a pointer to the llvm.used global variable of a module: +// +// https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable +// +// If the llvm.used variable does not exist yet, create it. +// +// NOTE: this has quadratic complexity when appending ptr to an existing +// array. It should not be a problem for the type of use we do as we expect +// just a few entries in this array, but something to keep in mind. +void llvm_append_used(llvm_state &s, llvm::Constant *ptr) +{ + assert(ptr != nullptr); + assert(ptr->getType()->isPointerTy()); + + auto &md = s.module(); + auto &ctx = s.context(); + + // Fetch the pointer type. + auto *ptr_type = llvm::PointerType::getUnqual(ctx); + + if (auto *orig_used = md.getGlobalVariable("llvm.used")) { + // The llvm.used variable exists already. + + // Fetch the original initializer. + assert(orig_used->hasInitializer()); + auto *orig_init = llvm::cast(orig_used->getInitializer()); + + // Construct a new initializer with the original values + // plus the new pointer. 
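+ //
+ // As an illustration (a sketch, assuming opaque pointers, one pre-existing
+ // entry @g and a newly appended pointer to @f), the rebuilt variable would
+ // look roughly like:
+ //
+ //   @llvm.used = appending global [2 x ptr] [ptr @g, ptr @f], section "llvm.metadata"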
+ std::vector arr_values; + arr_values.reserve( + boost::safe_numerics::safe(orig_init->getType()->getNumElements()) + 1); + for (decltype(orig_init->getType()->getNumElements()) i = 0; i < orig_init->getType()->getNumElements(); ++i) { + auto *orig_el = orig_init->getAggregateElement(boost::numeric_cast(i)); + assert(orig_el->getType()->isPointerTy()); + + // NOTE: if ptr was already in the llvm.used vector, just bail + // out early. + if (orig_el->isElementWiseEqual(ptr)) { + return; + } + + arr_values.push_back(orig_el); + } + arr_values.push_back(ptr); + + // Create the new array. + auto *used_array_type = llvm::ArrayType::get(ptr_type, boost::numeric_cast(arr_values.size())); + auto *used_arr = llvm::ConstantArray::get(used_array_type, arr_values); + + // Remove the original one. + orig_used->eraseFromParent(); + + // Add the new global variable. + auto *g_used_arr = new llvm::GlobalVariable(md, used_arr->getType(), true, + llvm::GlobalVariable::AppendingLinkage, used_arr, "llvm.used"); + g_used_arr->setSection("llvm.metadata"); + } else { + // The llvm.used variable does not exist yet, create it. + auto *used_array_type = llvm::ArrayType::get(ptr_type, 1); + std::vector arr_values{ptr}; + auto *used_arr = llvm::ConstantArray::get(used_array_type, arr_values); + auto *g_used_arr = new llvm::GlobalVariable(md, used_arr->getType(), true, + llvm::GlobalVariable::AppendingLinkage, used_arr, "llvm.used"); + g_used_arr->setSection("llvm.metadata"); + } +} + // Attach the vfabi attributes to "call", which must be a call to a function with scalar arguments. // The necessary vfabi information is stored in vfi. The function returns "call". // The attributes of the scalar function will be attached to the vector variants. @@ -312,17 +381,8 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const = (use_fast_math && !el.lp_vf_abi_attr.empty()) ? el.lp_vf_abi_attr : el.vf_abi_attr; vf_abi_strs.push_back(vf_abi_attr); } -#if LLVM_VERSION_MAJOR >= 14 call->addFnAttr(llvm::Attribute::get(context, "vector-function-abi-variant", fmt::format("{}", fmt::join(vf_abi_strs, ",")))); -#else - { - auto attrs = call->getAttributes(); - attrs = attrs.addAttribute(context, llvm::AttributeList::FunctionIndex, "vector-function-abi-variant", - fmt::format("{}", fmt::join(vf_abi_strs, ","))); - call->setAttributes(attrs); - } -#endif // Now we need to: // - add the declarations of the vector variants to the module, @@ -374,33 +434,9 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const assert(vf_ptr->getAttributes() == f->getAttributes()); } - // Create the name of the dummy function to ensure the variant is not optimised out. - // - // NOTE: another way of doing this involves the llvm.used global variable - need - // to learn about the metadata API apparently. - // - // https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable - // https://godbolt.org/z/1neaG4bYj - const auto dummy_name = fmt::format("heyoka.dummy_vector_call.{}", el_name); - - if (auto *dummy_ptr = md.getFunction(dummy_name); dummy_ptr == nullptr) { - // The dummy function has not been defined yet, do it. - auto *dummy = llvm_func_create(vec_ft, llvm::Function::ExternalLinkage, dummy_name, &md); - - builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", dummy)); - - // The dummy function just forwards its arguments to the variant. 
- std::vector dummy_args; - for (auto *dummy_arg = dummy->args().begin(); dummy_arg != dummy->args().end(); ++dummy_arg) { - dummy_args.emplace_back(dummy_arg); - } - - builder.CreateRet(builder.CreateCall(vf_ptr, dummy_args)); - } else { - // The declaration of the dummy function is already there. - // Check that the signatures match. - assert(dummy_ptr->getFunctionType() == vec_ft); - } + // Ensure that the variant is not optimised out because it is not + // explicitly used in the code. + detail::llvm_append_used(s, vf_ptr); } // Restore the original insertion block. diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp index 7b303bc6f..a5af287bf 100644 --- a/src/llvm_state.cpp +++ b/src/llvm_state.cpp @@ -14,30 +14,33 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include -#include +#include #include #include +#include #include #include -#include +#include +#include #include #include #include @@ -60,63 +63,19 @@ #include #include #include +#include #include #include +#include +#include +#include #include #include +#include #include -#include #include #include #include -#include - -#if LLVM_VERSION_MAJOR < 14 - -// NOTE: this header was moved in LLVM 14. -#include - -#else - -#include - -#endif - -// NOTE: new pass manager API. -// NOTE: this is available since LLVM 13, but in that -// version it seems like auto-vectorization with -// vector-function-abi-variant is not working -// properly with the new pass manager. Hence, we -// enable it from LLVM 14. -#if LLVM_VERSION_MAJOR >= 14 - -#define HEYOKA_USE_NEW_LLVM_PASS_MANAGER - -#endif - -#if defined(HEYOKA_USE_NEW_LLVM_PASS_MANAGER) - -#include -#include -#include -#include - -#if LLVM_VERSION_MAJOR >= 14 - -// NOTE: this header is available since LLVM 14. -#include - -#endif - -#else - -#include -#include -#include -#include -#include - -#endif #if LLVM_VERSION_MAJOR >= 17 @@ -151,26 +110,6 @@ #include #include -// NOTE: logging here lhames' instructions on how to set up LLJIT -// for parallel compilation of multiple modules. -// -// auto J = LLJITBuilder() -// .setNumCompileThreads() -// .create(); -// if (!J) { /* bail on error */ } -// (*J)->getIRTransformLayer().setTransform( -// [](ThreadSafeModule TSM, MaterializationResponsibility &R) -> Expected { -// TSM.withModuleDo([](Module &M) { -// /* Apply your IR optimizations here */ -// }); -// return std::move(TSM); -// }); -// -// Note that the optimisation passes in this approach are moved into the -// transform layer. References: -// https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328 -// https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892 - HEYOKA_BEGIN_NAMESPACE namespace detail @@ -207,12 +146,12 @@ const std::regex ppc_regex_pattern("pwr([1-9]*)"); target_features get_target_features_impl() { auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost(); - if (!jtmb) { + if (!jtmb) [[unlikely]] { throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system"); } auto tm = jtmb->createTargetMachine(); - if (!tm) { + if (!tm) [[unlikely]] { throw std::invalid_argument("Error creating the target machine"); } @@ -307,6 +246,342 @@ void init_native_target() }); } +// Helper to create a builder for target machines. +llvm::orc::JITTargetMachineBuilder create_jit_tmb(unsigned opt_level, code_model c_model) +{ + // NOTE: codegen opt level changed in LLVM 18. 
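+ // (In LLVM 18 the llvm::CodeGenOpt::Level enum was replaced by the
+ // llvm::CodeGenOptLevel enum class, hence the alias below.)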
+#if LLVM_VERSION_MAJOR < 18 + + using cg_opt_level = llvm::CodeGenOpt::Level; + +#else + + using cg_opt_level = llvm::CodeGenOptLevel; + +#endif + + // Try creating the target machine builder. + auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost(); + // LCOV_EXCL_START + if (!jtmb) [[unlikely]] { + throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system"); + } + // LCOV_EXCL_STOP + + // Set the codegen optimisation level. + switch (opt_level) { + case 0u: + jtmb->setCodeGenOptLevel(cg_opt_level::None); + break; + case 1u: + jtmb->setCodeGenOptLevel(cg_opt_level::Less); + break; + case 2u: + jtmb->setCodeGenOptLevel(cg_opt_level::Default); + break; + default: + assert(opt_level == 3u); + jtmb->setCodeGenOptLevel(cg_opt_level::Aggressive); + } + + // NOTE: not all code models are supported on all archs. We make an effort + // here to prevent unsupported code models to be requested, as that will + // result in the termination of the program. + constexpr code_model supported_code_models[] = { +#if defined(HEYOKA_ARCH_X86) + code_model::small, code_model::kernel, code_model::medium, code_model::large +#elif defined(HEYOKA_ARCH_ARM) + code_model::tiny, code_model::small, code_model::large +#elif defined(HEYOKA_ARCH_PPC) + code_model::small, code_model::medium, code_model::large +#else + // NOTE: by default we assume only small and large are supported. + code_model::small, code_model::large +#endif + }; + + if (std::ranges::find(supported_code_models, c_model) == std::ranges::end(supported_code_models)) [[unlikely]] { + throw std::invalid_argument( + fmt::format("The code model '{}' is not supported on the current architecture", c_model)); + } + + // LCOV_EXCL_START + +#if LLVM_VERSION_MAJOR >= 17 + + // NOTE: the code model setup is working only on LLVM>=19 (or at least + // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug: + // + // https://github.com/llvm/llvm-project/issues/88115 + // + // Additionally, there are indications from our CI that attempting to set + // the code model before LLVM 17 might just be buggy, as we see widespread + // ASAN failures all over the place. Thus, let us not do anything with the code + // model setting before LLVM 17. + + // Setup the code model. + switch (c_model) { + case code_model::tiny: + jtmb->setCodeModel(llvm::CodeModel::Tiny); + break; + case code_model::small: + jtmb->setCodeModel(llvm::CodeModel::Small); + break; + case code_model::kernel: + jtmb->setCodeModel(llvm::CodeModel::Kernel); + break; + case code_model::medium: + jtmb->setCodeModel(llvm::CodeModel::Medium); + break; + case code_model::large: + jtmb->setCodeModel(llvm::CodeModel::Large); + break; + default: + // NOTE: we should never end up here. + assert(false); + ; + } + +#endif + + // LCOV_EXCL_STOP + + return std::move(*jtmb); +} + +// Helper to optimise the input module M. Implemented here for re-use. +// NOTE: this may end up being invoked concurrently from multiple threads. +// If that is the case, we make sure before invocation to construct a different +// TargetMachine per thread, so that we are sure no data races are possible. +void optimise_module(llvm::Module &M, llvm::TargetMachine &tm, unsigned opt_level, bool force_avx512, + bool slp_vectorize) +{ + // NOTE: don't run any optimisation pass at O0. + if (opt_level == 0u) { + return; + } + + // NOTE: the logic here largely mimics (with a lot of simplifications) + // the implementation of the 'opt' tool. 
See: + // https://github.com/llvm/llvm-project/blob/release/10.x/llvm/tools/opt/opt.cpp + + // For every function in the module, setup its attributes + // so that the codegen uses all the features available on + // the host CPU. + const auto cpu = tm.getTargetCPU().str(); + const auto features = tm.getTargetFeatureString().str(); + + // Fetch the module's context. + auto &ctx = M.getContext(); + + for (auto &f : M) { + auto attrs = f.getAttributes(); + + llvm::AttrBuilder new_attrs(ctx); + + if (!cpu.empty() && !f.hasFnAttribute("target-cpu")) { + new_attrs.addAttribute("target-cpu", cpu); + } + + if (!features.empty()) { + auto old_features = f.getFnAttribute("target-features").getValueAsString(); + + if (old_features.empty()) { + new_attrs.addAttribute("target-features", features); + } else { + llvm::SmallString<256> appended(old_features); + appended.push_back(','); + appended.append(features); + new_attrs.addAttribute("target-features", appended); + } + } + + // Let new_attrs override attrs. + f.setAttributes(attrs.addFnAttributes(ctx, new_attrs)); + } + + // Force usage of AVX512 registers, if requested. + if (force_avx512 && get_target_features().avx512f) { + for (auto &f : M) { + f.addFnAttr("prefer-vector-width", "512"); + } + } + + // NOTE: adapted from here: + // https://llvm.org/docs/NewPassManager.html + + // Create the analysis managers. + llvm::LoopAnalysisManager LAM; + llvm::FunctionAnalysisManager FAM; + llvm::CGSCCAnalysisManager CGAM; + llvm::ModuleAnalysisManager MAM; + + // NOTE: in the new pass manager, this seems to be the way to + // set the target library info bits. See: + // https://github.com/llvm/llvm-project/blob/b7fd30eac3183993806cc218b6deb39eb625c083/llvm/tools/opt/NewPMDriver.cpp#L408 + // Not sure if this matters, but we did it in the old pass manager + // and opt does it too. + llvm::TargetLibraryInfoImpl TLII(tm.getTargetTriple()); + FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); }); + + // Create the new pass manager builder, passing the supplied target machine. + // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off + // by default. Not sure why it is off, the LLVM docs imply this + // is on by default at nonzero optimisation levels for clang and opt. + // NOTE: the reason for this inconsistency is that opt uses PB.parsePassPipeline() + // (instead of PB.buildPerModuleDefaultPipeline()) to set up the optimisation + // pipeline. Indeed, if we replace PB.buildPerModuleDefaultPipeline(ol) with + // PB.parsePassPipeline(MPM, "default") (which corresponds to invoking + // "opt -passes='default'"), we do NOT need to set SLP vectorization on + // here to get the SLP vectorizer. Not sure if we should consider switching to this + // alternative way of setting up the optimisation pipeline in the future. + llvm::PipelineTuningOptions pto; + pto.SLPVectorization = slp_vectorize; + llvm::PassBuilder PB(&tm, pto); + + // Register all the basic analyses with the managers. + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + // Construct the optimisation level. + llvm::OptimizationLevel ol{}; + + switch (opt_level) { + case 1u: + ol = llvm::OptimizationLevel::O1; + break; + case 2u: + ol = llvm::OptimizationLevel::O2; + break; + default: + assert(opt_level == 3u); + ol = llvm::OptimizationLevel::O3; + } + + // Create the module pass manager. 
+ auto MPM = PB.buildPerModuleDefaultPipeline(ol); + + // Optimize the IR. + MPM.run(M, MAM); +} + +// Helper to add a module to an lljt, throwing on error. +void add_module_to_lljit(llvm::orc::LLJIT &lljit, std::unique_ptr m, llvm::orc::ThreadSafeContext ctx) +{ + auto err = lljit.addIRModule(llvm::orc::ThreadSafeModule(std::move(m), std::move(ctx))); + + // LCOV_EXCL_START + if (err) { + std::string err_report; + llvm::raw_string_ostream ostr(err_report); + + ostr << err; + + throw std::invalid_argument( + fmt::format("The function for adding a module to the jit failed. The full error message:\n{}", ostr.str())); + } + // LCOV_EXCL_STOP +} + +// Helper to fetch the bitcode from a module. +std::string bc_from_module(llvm::Module &m) +{ + std::string out; + llvm::raw_string_ostream ostr(out); + + llvm::WriteBitcodeToFile(m, ostr); + + return std::move(ostr.str()); +} + +// Helper to fetch the textual IR from a module. +std::string ir_from_module(llvm::Module &m) +{ + std::string out; + llvm::raw_string_ostream ostr(out); + + m.print(ostr, nullptr); + + return std::move(ostr.str()); +} + +// An implementation of llvm::MemoryBuffer offering a view over a std::string. +class string_view_mem_buffer final : public llvm::MemoryBuffer +{ +public: + explicit string_view_mem_buffer(const std::string &s) + { + // NOTE: the important bit here is from the LLVM docs: + // + // """ + // In addition to basic access to the characters in the file, this interface + // guarantees you can read one character past the end of the file, and that + // this character will read as '\0'. + // """ + // + // This is exactly the guarantee given by std::string: + // + // https://en.cppreference.com/w/cpp/string/basic_string/data + // + // Not sure about the third parameter to this function though, it does not + // seem to have any influence apart from debug checking: + // + // https://llvm.org/doxygen/MemoryBuffer_8cpp_source.html + this->init(s.data(), s.data() + s.size(), true); + } + // LCOV_EXCL_START + llvm::MemoryBuffer::BufferKind getBufferKind() const final + { + // Hopefully std::string is not memory-mapped... + return llvm::MemoryBuffer::BufferKind::MemoryBuffer_Malloc; + } + // LCOV_EXCL_STOP +}; + +// Helper to add an object file to the jit, throwing in case of errors. +void add_obj_to_lljit(llvm::orc::LLJIT &lljit, const std::string &obj) +{ + // NOTE: an empty obj can happen when we are copying a compiled + // llvm_multi_state. In such case, the object files of the individual + // states have all be empty-inited. We then need to avoid adding + // obj to the jit because that will result in an error. + if (obj.empty()) { + return; + } + + // Add the object file. + auto err = lljit.addObjectFile(std::make_unique(obj)); + + // LCOV_EXCL_START + if (err) { + std::string err_report; + llvm::raw_string_ostream ostr(err_report); + + ostr << err; + + throw std::invalid_argument(fmt::format( + "The function for adding an object file to an lljit failed. The full error message:\n{}", ostr.str())); + } + // LCOV_EXCL_STOP +} + +// Helper to verify a module, throwing if verification fails. 
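+// NOTE: llvm::verifyModule() returns true if the module is broken and, when a
+// stream is supplied, writes the diagnostic messages to it.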
+void verify_module(const llvm::Module &m) +{ + std::string out; + llvm::raw_string_ostream ostr(out); + + if (llvm::verifyModule(m, &ostr)) { + // LCOV_EXCL_START + throw std::runtime_error(fmt::format("The verification of the module '{}' produced an error:\n{}", + m.getModuleIdentifier(), ostr.str())); + // LCOV_EXCL_STOP + } +} + } // namespace // Helper function to fetch a const ref to a global object @@ -371,116 +646,26 @@ struct llvm_state::jit { std::unique_ptr m_ctx; std::optional m_object_file; + // NOTE: make sure to coordinate changes in this constructor with multi_jit. explicit jit(unsigned opt_level, code_model c_model) { - // NOTE: we assume here the opt level has already been clamped - // from the outside. + // NOTE: we assume here that the input arguments have + // been validated already. assert(opt_level <= 3u); + assert(c_model >= code_model::tiny && c_model <= code_model::large); // Ensure the native target is inited. detail::init_native_target(); - // NOTE: codegen opt level changed in LLVM 18. -#if LLVM_VERSION_MAJOR < 18 + // Create the target machine builder. + auto jtmb = detail::create_jit_tmb(opt_level, c_model); - using cg_opt_level = llvm::CodeGenOpt::Level; - -#else - - using cg_opt_level = llvm::CodeGenOptLevel; - -#endif - - // Create the target machine builder. - auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost(); - // LCOV_EXCL_START - if (!jtmb) { - throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system"); - } - // LCOV_EXCL_STOP - // Set the codegen optimisation level. - switch (opt_level) { - case 0u: - jtmb->setCodeGenOptLevel(cg_opt_level::None); - break; - case 1u: - jtmb->setCodeGenOptLevel(cg_opt_level::Less); - break; - case 2u: - jtmb->setCodeGenOptLevel(cg_opt_level::Default); - break; - default: - assert(opt_level == 3u); - jtmb->setCodeGenOptLevel(cg_opt_level::Aggressive); - } - - // NOTE: not all code models are supported on all archs. We make an effort - // here to prevent unsupported code models to be requested, as that will - // result in the termination of the program. - constexpr code_model supported_code_models[] = { -#if defined(HEYOKA_ARCH_X86) - code_model::small, code_model::kernel, code_model::medium, code_model::large -#elif defined(HEYOKA_ARCH_ARM) - code_model::tiny, code_model::small, code_model::large -#elif defined(HEYOKA_ARCH_PPC) - code_model::small, code_model::medium, code_model::large -#else - // NOTE: by default we assume only small and large are supported. - code_model::small, code_model::large -#endif - }; - - if (std::ranges::find(supported_code_models, c_model) == std::ranges::end(supported_code_models)) [[unlikely]] { - throw std::invalid_argument( - fmt::format("The code model '{}' is not supported on the current architecture", c_model)); - } - - // LCOV_EXCL_START - -#if LLVM_VERSION_MAJOR >= 17 - // NOTE: the code model setup is working only on LLVM>=19 (or at least - // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug: - // - // https://github.com/llvm/llvm-project/issues/88115 - // - // Additionally, there are indications from our CI that attempting to set - // the code model before LLVM 17 might just be buggy, as we see widespread - // ASAN failures all over the place. Thus, let us not do anything with the code - // model setting before LLVM 17. - - // Setup the code model. 
- switch (c_model) { - case code_model::tiny: - jtmb->setCodeModel(llvm::CodeModel::Tiny); - break; - case code_model::small: - jtmb->setCodeModel(llvm::CodeModel::Small); - break; - case code_model::kernel: - jtmb->setCodeModel(llvm::CodeModel::Kernel); - break; - case code_model::medium: - jtmb->setCodeModel(llvm::CodeModel::Medium); - break; - case code_model::large: - jtmb->setCodeModel(llvm::CodeModel::Large); - break; - default: - // NOTE: we should never end up here. - assert(false); - ; - } - -#endif - - // LCOV_EXCL_STOP - - // Create the jit builder. - llvm::orc::LLJITBuilder lljit_builder; - // NOTE: other settable properties may - // be of interest: - // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html - lljit_builder.setJITTargetMachineBuilder(*jtmb); + // Create the jit builder. + llvm::orc::LLJITBuilder lljit_builder; + // NOTE: other settable properties may + // be of interest: + // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html + lljit_builder.setJITTargetMachineBuilder(jtmb); // Create the jit. auto lljit = lljit_builder.create(); @@ -531,7 +716,7 @@ struct llvm_state::jit { // Keep a target machine around to fetch various // properties of the host CPU. - auto tm = jtmb->createTargetMachine(); + auto tm = jtmb.createTargetMachine(); // LCOV_EXCL_START if (!tm) { throw std::invalid_argument("Error creating the target machine"); @@ -586,10 +771,6 @@ struct llvm_state::jit { { return m_tm->getTargetFeatureString().str(); } - [[nodiscard]] llvm::TargetIRAnalysis get_target_ir_analysis() const - { - return m_tm->getTargetIRAnalysis(); - } [[nodiscard]] const llvm::Triple &get_target_triple() const { return m_lljit->getTargetTriple(); @@ -597,19 +778,7 @@ struct llvm_state::jit { void add_module(std::unique_ptr m) const { - auto err = m_lljit->addIRModule(llvm::orc::ThreadSafeModule(std::move(m), *m_ctx)); - - // LCOV_EXCL_START - if (err) { - std::string err_report; - llvm::raw_string_ostream ostr(err_report); - - ostr << err; - - throw std::invalid_argument(fmt::format( - "The function for adding a module to the jit failed. The full error message:\n{}", ostr.str())); - } - // LCOV_EXCL_STOP + detail::add_module_to_lljit(*m_lljit, std::move(m), *m_ctx); } // Symbol lookup. @@ -645,24 +814,12 @@ namespace detail namespace { -// Helper to load object code into a jit. +// Helper to load object code into the jit of an llvm_state. template void llvm_state_add_obj_to_jit(Jit &j, std::string obj) { - llvm::SmallVector buffer(obj.begin(), obj.end()); - auto err = j.m_lljit->addObjectFile(std::make_unique(std::move(buffer))); - - // LCOV_EXCL_START - if (err) { - std::string err_report; - llvm::raw_string_ostream ostr(err_report); - - ostr << err; - - throw std::invalid_argument(fmt::format( - "The function for adding a compiled module to the jit failed. The full error message:\n{}", ostr.str())); - } - // LCOV_EXCL_STOP + // Add the object code to the lljit. + add_obj_to_lljit(*j.m_lljit, obj); // Add the object code also to the // m_object_file member. @@ -677,11 +834,10 @@ void llvm_state_add_obj_to_jit(Jit &j, std::string obj) // contained in the bitcode) because apparently llvm::parseBitcodeFile() discards the module // name when parsing. // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) -auto llvm_state_bc_to_module(const std::string &module_name, const std::string &bc, llvm::LLVMContext &ctx) +auto bc_to_module(const std::string &module_name, const std::string &bc, llvm::LLVMContext &ctx) { - // Create the corresponding memory buffer. 
- auto mb = llvm::MemoryBuffer::getMemBuffer(bc); - assert(mb); + // Create the corresponding memory buffer view on bc. + auto mb = std::make_unique(bc); // Parse the bitcode. auto ret = llvm::parseBitcodeFile(mb->getMemBufferRef(), ctx); @@ -790,7 +946,7 @@ llvm_state::llvm_state(const llvm_state &other) // module and builder. The IR/bitcode snapshots // are left in their default-constructed (empty) // state. - m_module = detail::llvm_state_bc_to_module(m_module_name, other.get_bc(), context()); + m_module = detail::bc_to_module(m_module_name, other.get_bc(), context()); // Create a new builder for the module. m_builder = std::make_unique(context()); @@ -837,23 +993,12 @@ llvm_state &llvm_state::operator=(llvm_state &&other) noexcept return *this; } -llvm_state::~llvm_state() -{ - // Sanity checks in debug mode. - if (m_jitter) { - if (is_compiled()) { - assert(m_jitter->m_object_file); - assert(!m_builder); - } else { - assert(!m_jitter->m_object_file); - assert(m_builder); - assert(m_ir_snapshot.empty()); - assert(m_bc_snapshot.empty()); - } - } - - assert(m_opt_level <= 3u); -} +// NOTE: we used to have debug sanity checks here. However, in certain rare corner cases, +// an invalid llvm_state could end up being destroyed, thus triggering assertion errors +// in debug mode (this could happen for instance when resetting an llvm_state to the +// def-cted state after an exception had been thrown during compilation). Thus, just +// do not run the debug checks. +llvm_state::~llvm_state() = default; template void llvm_state::save_impl(Archive &ar, unsigned) const @@ -997,7 +1142,7 @@ void llvm_state::load_impl(Archive &ar, unsigned version) m_bc_snapshot.clear(); // Create the module from the bitcode. - m_module = detail::llvm_state_bc_to_module(m_module_name, bc_snapshot, context()); + m_module = detail::bc_to_module(m_module_name, bc_snapshot, context()); // Create a new builder for the module. m_builder = std::make_unique(context()); @@ -1142,189 +1287,15 @@ void llvm_state::verify_function(const std::string &name) void llvm_state::optimise() { - check_uncompiled(__func__); - - // NOTE: don't run any optimisation pass at O0. - if (m_opt_level == 0u) { - return; - } - - // NOTE: the logic here largely mimics (with a lot of simplifications) - // the implementation of the 'opt' tool. See: - // https://github.com/llvm/llvm-project/blob/release/10.x/llvm/tools/opt/opt.cpp - - // For every function in the module, setup its attributes - // so that the codegen uses all the features available on - // the host CPU. - const auto cpu = m_jitter->get_target_cpu(); - const auto features = m_jitter->get_target_features(); - - auto &ctx = context(); - - for (auto &f : module()) { - auto attrs = f.getAttributes(); - - llvm::AttrBuilder -#if LLVM_VERSION_MAJOR < 14 - new_attrs -#else - new_attrs(ctx) -#endif - ; - - if (!cpu.empty() && !f.hasFnAttribute("target-cpu")) { - new_attrs.addAttribute("target-cpu", cpu); - } - - if (!features.empty()) { - auto old_features = f.getFnAttribute("target-features").getValueAsString(); - - if (old_features.empty()) { - new_attrs.addAttribute("target-features", features); - } else { - llvm::SmallString<256> appended(old_features); - appended.push_back(','); - appended.append(features); - new_attrs.addAttribute("target-features", appended); - } - } - - // Let new_attrs override attrs. 
-#if LLVM_VERSION_MAJOR < 14 - f.setAttributes(attrs.addAttributes(ctx, llvm::AttributeList::FunctionIndex, new_attrs)); -#else - f.setAttributes(attrs.addFnAttributes(ctx, new_attrs)); -#endif - } - - // Force usage of AVX512 registers, if requested. - if (m_force_avx512 && detail::get_target_features().avx512f) { - for (auto &f : module()) { - f.addFnAttr("prefer-vector-width", "512"); - } - } - -#if defined(HEYOKA_USE_NEW_LLVM_PASS_MANAGER) - - // NOTE: adapted from here: - // https://llvm.org/docs/NewPassManager.html - - // Optimisation level for the module pass manager. - // NOTE: the OptimizationLevel class has changed location - // since LLVM 14. -#if LLVM_VERSION_MAJOR >= 14 - using olevel = llvm::OptimizationLevel; -#else - using olevel = llvm::PassBuilder::OptimizationLevel; -#endif - - // Create the analysis managers. - llvm::LoopAnalysisManager LAM; - llvm::FunctionAnalysisManager FAM; - llvm::CGSCCAnalysisManager CGAM; - llvm::ModuleAnalysisManager MAM; - - // NOTE: in the new pass manager, this seems to be the way to - // set the target library info bits. See: - // https://github.com/llvm/llvm-project/blob/b7fd30eac3183993806cc218b6deb39eb625c083/llvm/tools/opt/NewPMDriver.cpp#L408 - // Not sure if this matters, but we did it in the old pass manager - // and opt does it too. - llvm::TargetLibraryInfoImpl TLII(m_jitter->get_target_triple()); - FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); }); - - // Create the new pass manager builder, passing - // the native target machine from the JIT class. - // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off - // by default. Not sure why it is off, the LLVM docs imply this - // is on by default at nonzero optimisation levels for clang and opt. - // NOTE: the reason for this inconsistency is that opt uses PB.parsePassPipeline() - // (instead of PB.buildPerModuleDefaultPipeline()) to set up the optimisation - // pipeline. Indeed, if we replace PB.buildPerModuleDefaultPipeline(ol) with - // PB.parsePassPipeline(MPM, "default") (which corresponds to invoking - // "opt -passes='default'"), we do NOT need to set SLP vectorization on - // here to get the SLP vectorizer. Not sure if we should consider switching to this - // alternative way of setting up the optimisation pipeline in the future. - llvm::PipelineTuningOptions pto; - pto.SLPVectorization = m_slp_vectorize; - llvm::PassBuilder PB(m_jitter->m_tm.get(), pto); - - // Register all the basic analyses with the managers. - PB.registerModuleAnalyses(MAM); - PB.registerCGSCCAnalyses(CGAM); - PB.registerFunctionAnalyses(FAM); - PB.registerLoopAnalyses(LAM); - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - - // Construct the optimisation level. - olevel ol{}; - - switch (m_opt_level) { - case 1u: - ol = olevel::O1; - break; - case 2u: - ol = olevel::O2; - break; - default: - assert(m_opt_level == 3u); - ol = olevel::O3; - } - - // Create the module pass manager. - auto MPM = PB.buildPerModuleDefaultPipeline(ol); - - // Optimize the IR. - MPM.run(*m_module, MAM); - -#else - - // Init the module pass manager. - auto module_pm = std::make_unique(); - // These are passes which set up target-specific info - // that are used by successive optimisation passes. 
- auto tliwp = std::make_unique( - llvm::TargetLibraryInfoImpl(m_jitter->get_target_triple())); - module_pm->add(tliwp.release()); - module_pm->add(llvm::createTargetTransformInfoWrapperPass(m_jitter->get_target_ir_analysis())); - - // NOTE: not sure what this does, presumably some target-specifc - // configuration. - module_pm->add(static_cast(*m_jitter->m_tm).createPassConfig(*module_pm)); - - // Init the function pass manager. - auto f_pm = std::make_unique(m_module.get()); - f_pm->add(llvm::createTargetTransformInfoWrapperPass(m_jitter->get_target_ir_analysis())); - - // We use the helper class PassManagerBuilder to populate the module - // pass manager with standard options. - llvm::PassManagerBuilder pm_builder; - // See here for the defaults: - // https://llvm.org/doxygen/PassManagerBuilder_8cpp_source.html - pm_builder.OptLevel = m_opt_level; - // Enable function inlining. - pm_builder.Inliner = llvm::createFunctionInliningPass(m_opt_level, 0, false); - // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off - // by default. Not sure why it is off, the LLVM docs imply this - // is on by default at nonzero optimisation levels for clang and opt. - pm_builder.SLPVectorize = m_slp_vectorize; - - m_jitter->m_tm->adjustPassManager(pm_builder); - - // Populate both the function pass manager and the module pass manager. - pm_builder.populateFunctionPassManager(*f_pm); - pm_builder.populateModulePassManager(*module_pm); - - // Run the function pass manager on all functions in the module. - f_pm->doInitialization(); - for (auto &f : *m_module) { - f_pm->run(f); - } - f_pm->doFinalization(); - - // Run the module passes. - module_pm->run(*m_module); - -#endif + // NOTE: we used to fetch the target triple from the lljit object, + // but recently we switched to asking the target triple directly + // from the target machine. Assert equality between the two for a while, + // just in case. + assert(m_jitter->m_lljit->getTargetTriple() == m_jitter->m_tm->getTargetTriple()); + // NOTE: the target triple is also available in the module. + assert(m_jitter->m_lljit->getTargetTriple().str() == module().getTargetTriple()); + + detail::optimise_module(module(), *m_jitter->m_tm, m_opt_level, m_force_avx512, m_slp_vectorize); } namespace detail @@ -1350,6 +1321,7 @@ void llvm_state::add_obj_trigger() auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false); assert(ft != nullptr); auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, detail::obj_trigger_name, &module()); + assert(f != nullptr); bld.SetInsertPoint(llvm::BasicBlock::Create(context(), "entry", f)); bld.CreateRetVoid(); @@ -1377,9 +1349,38 @@ void llvm_state::compile_impl() // Trigger object code materialisation via lookup. jit_lookup(detail::obj_trigger_name); + assert(m_jitter->m_object_file); } +namespace detail +{ + +namespace +{ + +// Combine opt_level, force_avx512, slp_vectorize and c_model into a single flag. +// NOTE: here we need: +// +// - 2 bits for opt_level, +// - 1 bit for force_avx512 and slp_vectorize each, +// - 3 bits for c_model, +// +// for a total of 7 bits. 
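+//
+// The resulting layout, for reference:
+//
+//   bits 0-1: opt_level     (0-3)
+//   bit  2  : force_avx512
+//   bit  3  : slp_vectorize
+//   bits 4-6: c_model       (0-7)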
+unsigned assemble_comp_flag(unsigned opt_level, bool force_avx512, bool slp_vectorize, code_model c_model) +{ + assert(opt_level <= 3u); + assert(static_cast(c_model) <= 7u); + static_assert(std::numeric_limits::digits >= 7u); + + return opt_level + (static_cast(force_avx512) << 2) + (static_cast(slp_vectorize) << 3) + + (static_cast(c_model) << 4); +} + +} // namespace + +} // namespace detail + // NOTE: we need to emphasise in the docs that compilation // triggers an optimisation pass. void llvm_state::compile() @@ -1392,17 +1393,7 @@ void llvm_state::compile() auto *logger = detail::get_logger(); // Run a verification on the module before compiling. - { - std::string out; - llvm::raw_string_ostream ostr(out); - - if (llvm::verifyModule(*m_module, &ostr)) { - // LCOV_EXCL_START - throw std::runtime_error( - fmt::format("The verification of the module '{}' produced an error:\n{}", m_module_name, ostr.str())); - // LCOV_EXCL_STOP - } - } + detail::verify_module(*m_module); logger->trace("module verification runtime: {}", sw); @@ -1413,39 +1404,42 @@ void llvm_state::compile() // to fix the module and re-attempt compilation without having // altered the module and without having already added the trigger // function. + // NOTE: this function does its own cleanup, no need to + // start the try catch block yet. add_obj_trigger(); try { // Fetch the bitcode *before* optimisation. auto orig_bc = get_bc(); + std::vector obc; + obc.push_back(std::move(orig_bc)); - // Combine m_opt_level, m_force_avx512, m_slp_vectorize and m_c_model into a single value, - // as they all affect codegen. - // NOTE: here we need: - // - 2 bits for m_opt_level, - // - 1 bit for m_force_avx512 and m_slp_vectorize each, - // - 3 bits for m_c_model, - // for a total of 7 bits. - assert(m_opt_level <= 3u); - assert(static_cast(m_c_model) <= 7u); - static_assert(std::numeric_limits::digits >= 7u); - const auto olevel = m_opt_level + (static_cast(m_force_avx512) << 2) - + (static_cast(m_slp_vectorize) << 3) + (static_cast(m_c_model) << 4); - - if (auto cached_data = detail::llvm_state_mem_cache_lookup(orig_bc, olevel)) { + // Assemble the compilation flag. + const auto comp_flag = detail::assemble_comp_flag(m_opt_level, m_force_avx512, m_slp_vectorize, m_c_model); + + // Lookup in the cache. + if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) { // Cache hit. - // Assign the snapshots. - m_ir_snapshot = std::move(cached_data->opt_ir); - m_bc_snapshot = std::move(cached_data->opt_bc); + // Assign the optimised snapshots. + assert(cached_data->opt_ir.size() == 1u); + assert(cached_data->opt_bc.size() == 1u); + assert(cached_data->obj.size() == 1u); + m_ir_snapshot = std::move(cached_data->opt_ir[0]); + m_bc_snapshot = std::move(cached_data->opt_bc[0]); // Clear out module and builder. m_module.reset(); m_builder.reset(); // Assign the object file. - detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj)); + detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj[0])); + + // Look up the trigger. + jit_lookup(detail::obj_trigger_name); } else { + // Cache miss. + sw.reset(); // Run the optimisation pass. @@ -1460,10 +1454,11 @@ void llvm_state::compile() logger->trace("materialisation runtime: {}", sw); - // Try to insert orig_bc into the cache. - detail::llvm_state_mem_cache_try_insert(std::move(orig_bc), olevel, - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - {m_bc_snapshot, m_ir_snapshot, *m_jitter->m_object_file}); + // Try to insert obc into the cache. 
+ detail::llvm_state_mem_cache_try_insert( + std::move(obc), comp_flag, + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + {.opt_bc = {m_bc_snapshot}, .opt_ir = {m_ir_snapshot}, .obj = {*m_jitter->m_object_file}}); } // LCOV_EXCL_START } catch (...) { @@ -1493,11 +1488,7 @@ std::uintptr_t llvm_state::jit_lookup(const std::string &name) throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in the compiled module", name)); } -#if LLVM_VERSION_MAJOR >= 15 return static_cast((*sym).getValue()); -#else - return static_cast((*sym).getAddress()); -#endif } std::string llvm_state::get_ir() const @@ -1505,12 +1496,7 @@ std::string llvm_state::get_ir() const if (m_module) { // The module has not been compiled yet, // get the IR from it. - std::string out; - llvm::raw_string_ostream ostr(out); - - m_module->print(ostr, nullptr); - - return std::move(ostr.str()); + return detail::ir_from_module(*m_module); } else { // The module has been compiled. // Return the IR snapshot that @@ -1524,12 +1510,7 @@ std::string llvm_state::get_bc() const if (m_module) { // The module has not been compiled yet, // get the bitcode from it. - std::string out; - llvm::raw_string_ostream ostr(out); - - llvm::WriteBitcodeToFile(*m_module, ostr); - - return std::move(ostr.str()); + return detail::bc_from_module(*m_module); } else { // The module has been compiled. // Return the bitcode snapshot that @@ -1610,4 +1591,812 @@ std::ostream &operator<<(std::ostream &os, const llvm_state &s) return os << oss.str(); } +namespace detail +{ + +namespace +{ + +// NOTE: this is a class similar in spirit to llvm_state, but set up for parallel +// compilation of multiple modules. +struct multi_jit { + // NOTE: this is the total number of modules, including + // the master module. + const unsigned m_n_modules = 0; + // NOTE: enumerate the LLVM members here in the same order + // as llvm_state, as this is important to ensure proper + // destruction order. + std::unique_ptr m_lljit; + std::unique_ptr m_ctx; + std::unique_ptr m_module; + std::unique_ptr m_builder; + // Object files. + // NOTE: these may be modified concurrently during compilation, + // protect with mutex. + std::mutex m_object_files_mutex; + std::vector m_object_files; + // IR and bc optimised snapshots. + // NOTE: these may be modified concurrently during compilation, + // protect with mutex. + std::mutex m_ir_bc_mutex; + std::vector m_ir_snapshots; + std::vector m_bc_snapshots; + + explicit multi_jit(unsigned, unsigned, code_model, bool, bool); + multi_jit(const multi_jit &) = delete; + multi_jit(multi_jit &&) noexcept = delete; + llvm_multi_state &operator=(const multi_jit &) = delete; + llvm_multi_state &operator=(multi_jit &&) noexcept = delete; + ~multi_jit() = default; + + // Helper to fetch the context from its thread-safe counterpart. + [[nodiscard]] llvm::LLVMContext &context() const noexcept + { + return *m_ctx->getContext(); + } +}; + +#if 0 + +// A task dispatcher class built on top of TBB's task group. +class tbb_task_dispatcher : public llvm::orc::TaskDispatcher +{ + oneapi::tbb::task_group m_tg; + +public: + void dispatch(std::unique_ptr T) override + { + m_tg.run([T = std::move(T)]() { T->run(); }); + } + void shutdown() override + { + m_tg.wait(); + } + ~tbb_task_dispatcher() noexcept + { + m_tg.wait(); + } +}; + +#endif + +// Reserved identifier for the master module in an llvm_multi_state. 
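+// NOTE: user-supplied module names are checked against this identifier in the
+// llvm_multi_state constructor, so that no state can shadow the master module.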
+constexpr auto master_module_name = "heyoka.master"; + +// NOTE: this largely replicates the logic from the constructors of llvm_state and llvm_state::jit. +// NOTE: make sure to coordinate changes in this constructor with llvm_state::jit. +multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, bool force_avx512, bool slp_vectorize) + : m_n_modules(n_modules) +{ + assert(n_modules >= 2u); + + // NOTE: we assume here that the input arguments have + // been validated already. + assert(opt_level <= 3u); + assert(c_model >= code_model::tiny && c_model <= code_model::large); + + // Ensure the native target is inited. + init_native_target(); + + // Create the target machine builder. + auto jtmb = create_jit_tmb(opt_level, c_model); + + // Create the jit builder. + llvm::orc::LLJITBuilder lljit_builder; + // NOTE: other settable properties may + // be of interest: + // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html + lljit_builder.setJITTargetMachineBuilder(jtmb); + +#if 0 + // Create a task dispatcher. + auto tdisp = std::make_unique(); + + // Create an ExecutorProcessControl. + auto epc = llvm::orc::SelfExecutorProcessControl::Create(nullptr, std::move(tdisp)); + // LCOV_EXCL_START + if (!epc) { + auto err = epc.takeError(); + + std::string err_report; + llvm::raw_string_ostream ostr(err_report); + + ostr << err; + + throw std::invalid_argument( + fmt::format("Could not create a SelfExecutorProcessControl. The full error message is:\n{}", ostr.str())); + } + // LCOV_EXCL_STOP + + // Set it in the lljit builder. + lljit_builder.setExecutorProcessControl(std::move(*epc)); +#else + + // Set the number of compilation threads. + lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency()); + +#endif + + // Create the jit. + auto lljit = lljit_builder.create(); + // LCOV_EXCL_START + if (!lljit) { + auto err = lljit.takeError(); + + std::string err_report; + llvm::raw_string_ostream ostr(err_report); + + ostr << err; + + throw std::invalid_argument( + fmt::format("Could not create an LLJIT object. The full error message is:\n{}", ostr.str())); + } + // LCOV_EXCL_STOP + m_lljit = std::move(*lljit); + + // Setup the machinery to store the modules' binary code + // when it is generated. + m_lljit->getObjTransformLayer().setTransform([this](std::unique_ptr obj_buffer) { + assert(obj_buffer); + + // Lock down for access to m_object_files. + std::lock_guard lock{m_object_files_mutex}; + + assert(m_object_files.size() <= m_n_modules); + + // NOTE: this callback will be invoked the first time a jit lookup is performed, + // even if the object code was manually injected. In such a case, m_object_files + // has already been set up properly and we just sanity check in debug mode that + // one object file matches the content of obj_buffer. + if (m_object_files.size() < m_n_modules) { + // Add obj_buffer. + m_object_files.push_back(std::string(obj_buffer->getBufferStart(), obj_buffer->getBufferEnd())); + } else { + // Check that at least one buffer in m_object_files is exactly + // identical to obj_buffer. + assert(std::ranges::any_of(m_object_files, [&obj_buffer](const auto &cur) { + return obj_buffer->getBufferSize() == cur.size() + && std::equal(obj_buffer->getBufferStart(), obj_buffer->getBufferEnd(), cur.begin()); + ; + })); + } + + return llvm::Expected>(std::move(obj_buffer)); + }); + + // Setup the machinery to run the optimisation passes on the modules. 
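+ // NOTE: with setNumCompileThreads() > 1 this transform may be invoked concurrently
+ // from several compile threads (one module at a time), hence the per-thread target
+ // machine created below and the mutex protecting the IR/bitcode snapshots.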
+ m_lljit->getIRTransformLayer().setTransform( + [this, opt_level, force_avx512, slp_vectorize, c_model](llvm::orc::ThreadSafeModule TSM, + llvm::orc::MaterializationResponsibility &) { + // See here for an explanation of what withModuleDo() entails: + // + // https://groups.google.com/g/llvm-dev/c/QauU4L_bHac + // + // In our case, the locking/thread safety aspect is not important as we are not sharing + // contexts between threads. More references from discord: + // + // https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328 + // https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892 + TSM.withModuleDo([this, opt_level, force_avx512, slp_vectorize, c_model](llvm::Module &M) { + // NOTE: don't run any optimisation on the master module. + if (M.getModuleIdentifier() != master_module_name) { + // NOTE: running the optimisation passes requires mutable access to a target + // machine. Thus, we create a new target machine per thread in order to avoid likely data races + // with a shared target machine. + + // Fetch a target machine builder. + auto jtmb = detail::create_jit_tmb(opt_level, c_model); + + // Try creating the target machine. + auto tm = jtmb.createTargetMachine(); + // LCOV_EXCL_START + if (!tm) [[unlikely]] { + throw std::invalid_argument("Error creating the target machine"); + } + // LCOV_EXCL_STOP + + // NOTE: we used to fetch the target triple from the lljit object, + // but recently we switched to asking the target triple directly + // from the target machine. Assert equality between the two for a while, + // just in case. + // NOTE: lljit.getTargetTriple() just returns a const ref to an internal + // object, it should be ok with concurrent invocation. + assert(m_lljit->getTargetTriple() == (*tm)->getTargetTriple()); + // NOTE: the target triple is also available in the module. + assert(m_lljit->getTargetTriple().str() == M.getTargetTriple()); + + // Optimise the module. + detail::optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize); + } else { + ; + } + + // Store the optimised bitcode/IR for this module. + auto bc_snap = detail::bc_from_module(M); + auto ir_snap = detail::ir_from_module(M); + + // NOTE: protect for multi-threaded access. + std::lock_guard lock{m_ir_bc_mutex}; + + m_bc_snapshots.push_back(std::move(bc_snap)); + m_ir_snapshots.push_back(std::move(ir_snap)); + }); + + return llvm::Expected(std::move(TSM)); + }); + + // Setup the jit so that it can look up symbols from the current process. + auto dlsg + = llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(m_lljit->getDataLayout().getGlobalPrefix()); + // LCOV_EXCL_START + if (!dlsg) { + throw std::invalid_argument("Could not create the dynamic library search generator"); + } + // LCOV_EXCL_STOP + m_lljit->getMainJITDylib().addGenerator(std::move(*dlsg)); + + // Create the master context. + m_ctx = std::make_unique(std::make_unique()); + + // Create the master module. + m_module = std::make_unique(master_module_name, context()); + // Setup the data layout and the target triple. + m_module->setDataLayout(m_lljit->getDataLayout()); + m_module->setTargetTriple(m_lljit->getTargetTriple().str()); + + // Create a new builder for the master module. + // NOTE: no need to mess around with fast math flags for this builder. 
+ m_builder = std::make_unique(context()); +} + +} // namespace + +} // namespace detail + +struct llvm_multi_state::impl { + std::vector m_states; + std::unique_ptr m_jit; +}; + +llvm_multi_state::llvm_multi_state() = default; + +llvm_multi_state::llvm_multi_state(std::vector states_) +{ + // Fetch a const ref, as we want to make extra sure we do not modify + // states_ until we move it to construct the impl. + const auto &states = states_; + + // We need at least 1 state. + if (states.empty()) [[unlikely]] { + throw std::invalid_argument("At least 1 llvm_state object is needed to construct an llvm_multi_state"); + } + + // All states must be uncompiled. + if (std::ranges::any_of(states, &llvm_state::is_compiled)) [[unlikely]] { + throw std::invalid_argument("An llvm_multi_state can be constructed only from uncompiled llvm_state objects"); + } + + // Module names must not collide with master_module_name. + if (std::ranges::any_of(states, [](const auto &s) { return s.module_name() == detail::master_module_name; })) + [[unlikely]] { + throw std::invalid_argument( + fmt::format("An invalid llvm_state was passed to the constructor of an llvm_multi_state: the module name " + "'{}' is reserved for internal use by llvm_multi_state", + detail::master_module_name)); + } + + // Settings in all states must be consistent. + auto states_differ = [](const llvm_state &s1, const llvm_state &s2) { + if (s1.get_opt_level() != s2.get_opt_level()) { + return true; + } + + if (s1.fast_math() != s2.fast_math()) { + return true; + } + + if (s1.force_avx512() != s2.force_avx512()) { + return true; + } + + if (s1.get_slp_vectorize() != s2.get_slp_vectorize()) { + return true; + } + + if (s1.get_code_model() != s2.get_code_model()) { + return true; + } + + // NOTE: bit of paranoia here. + assert(s1.m_jitter->m_lljit->getDataLayout() == s2.m_jitter->m_lljit->getDataLayout()); + assert(s1.m_jitter->get_target_triple() == s2.m_jitter->get_target_triple()); + assert(s1.m_jitter->get_target_cpu() == s2.m_jitter->get_target_cpu()); + assert(s1.m_jitter->get_target_features() == s2.m_jitter->get_target_features()); + + return false; + }; + + if (std::ranges::adjacent_find(states, states_differ) != states.end()) [[unlikely]] { + throw std::invalid_argument( + "Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"); + } + + // Fetch settings from the first state. + const auto opt_level = states[0].get_opt_level(); + const auto c_model = states[0].get_code_model(); + const auto force_avx512 = states[0].force_avx512(); + const auto slp_vectorize = states[0].get_slp_vectorize(); + + // Create the multi_jit. + auto jit = std::make_unique(boost::safe_numerics::safe(states.size()) + 1, opt_level, + c_model, force_avx512, slp_vectorize); + + // Build and assign the implementation. + impl imp{.m_states = std::move(states_), .m_jit = std::move(jit)}; + m_impl = std::make_unique(std::move(imp)); +} + +llvm_multi_state::llvm_multi_state(const llvm_multi_state &other) +{ + // NOTE: start off by creating a new jit and copying the states. + // This will work regardless of whether other is compiled or not. + // No need to do any validation on the states are they are coming + // from a llvm_multi_state and they have been checked already. 
+ impl imp{.m_states = other.m_impl->m_states, + .m_jit = std::make_unique(other.m_impl->m_jit->m_n_modules, other.get_opt_level(), + other.get_code_model(), other.force_avx512(), + other.get_slp_vectorize())}; + m_impl = std::make_unique(std::move(imp)); + + if (other.is_compiled()) { + // 'other' was compiled. + + // Reset builder and module. + m_impl->m_jit->m_module.reset(); + m_impl->m_jit->m_builder.reset(); + + // Copy over the snapshots and the object files, + m_impl->m_jit->m_object_files = other.m_impl->m_jit->m_object_files; + m_impl->m_jit->m_ir_snapshots = other.m_impl->m_jit->m_ir_snapshots; + m_impl->m_jit->m_bc_snapshots = other.m_impl->m_jit->m_bc_snapshots; + + // Add the files to the jit. + for (const auto &obj : m_impl->m_jit->m_object_files) { + detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj); + } + } else { + // If 'other' was not compiled, we do not need to do anything - the + // copy construction of the states takes care of everything. I.e., this + // is basically the same as construction from a list of states. + // NOTE: regarding the master module: this is always created empty + // and it remains empty until compilation, thus we do not need to care + // about it if other is uncompiled - the new empty master module constructed + // with the jit is ok. + assert(other.m_impl->m_jit->m_object_files.empty()); + assert(other.m_impl->m_jit->m_ir_snapshots.empty()); + assert(other.m_impl->m_jit->m_bc_snapshots.empty()); + } +} + +llvm_multi_state::llvm_multi_state(llvm_multi_state &&) noexcept = default; + +llvm_multi_state &llvm_multi_state::operator=(const llvm_multi_state &other) +{ + if (this != &other) { + *this = llvm_multi_state(other); + } + + return *this; +} + +llvm_multi_state &llvm_multi_state::operator=(llvm_multi_state &&) noexcept = default; + +llvm_multi_state::~llvm_multi_state() = default; + +void llvm_multi_state::save(boost::archive::binary_oarchive &ar, unsigned) const +{ + // Start by establishing if the state is compiled. + const auto cmp = is_compiled(); + ar << cmp; + + // Store the states. + ar << m_impl->m_states; + + // Store the object files and the snapshots. These may be empty. + ar << m_impl->m_jit->m_object_files; + ar << m_impl->m_jit->m_ir_snapshots; + ar << m_impl->m_jit->m_bc_snapshots; + + // NOTE: no need to explicitly store the bitcode of the master + // module: if this is compiled, the master module is in the snapshots. + // Otherwise, the master module is empty and there's no need to + // store anything. +} + +void llvm_multi_state::load(boost::archive::binary_iarchive &ar, unsigned) +{ + try { + // Load the compiled status flag from the archive. + // NOLINTNEXTLINE(misc-const-correctness) + bool cmp{}; + ar >> cmp; + + // Load the states. + ar >> m_impl->m_states; + + // Reset the jit with a new one. + m_impl->m_jit = std::make_unique( + boost::safe_numerics::safe(m_impl->m_states.size()) + 1, get_opt_level(), get_code_model(), + force_avx512(), get_slp_vectorize()); + + // Load the object files and the snapshots. + ar >> m_impl->m_jit->m_object_files; + ar >> m_impl->m_jit->m_ir_snapshots; + ar >> m_impl->m_jit->m_bc_snapshots; + + if (cmp) { + // If the stored state was compiled, we need to reset + // master builder and module. Otherwise, the empty default-constructed + // master module is ok (the master module remains empty until compilation + // is triggered). + m_impl->m_jit->m_module.reset(); + m_impl->m_jit->m_builder.reset(); + + // We also need to add all the object files to the jit. 
+ for (const auto &obj : m_impl->m_jit->m_object_files) { + detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj); + } + } + + // Debug checks. + assert((m_impl->m_jit->m_object_files.empty() && !cmp) + || m_impl->m_jit->m_object_files.size() == m_impl->m_jit->m_n_modules); + assert((m_impl->m_jit->m_object_files.empty() && !cmp) + || m_impl->m_jit->m_ir_snapshots.size() == m_impl->m_jit->m_n_modules); + assert((m_impl->m_jit->m_object_files.empty() && !cmp) + || m_impl->m_jit->m_bc_snapshots.size() == m_impl->m_jit->m_n_modules); + + // LCOV_EXCL_START + } catch (...) { + m_impl.reset(); + + throw; + } + // LCOV_EXCL_STOP +} + +void llvm_multi_state::add_obj_triggers() +{ + // NOTE: the idea here is that we add one trigger function per module, and then + // we invoke all the trigger functions from a trigger function in the master module. + // Like this, we ensure materialisation of all modules when we lookup the + // master trigger. + + // Implement the per-module triggers. + for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) { + // Fetch builder/module/context for the current state. + auto &bld = m_impl->m_states[i].builder(); + auto &md = m_impl->m_states[i].module(); + auto &ctx = m_impl->m_states[i].context(); + + // The function name. + const auto fname = fmt::format("{}_{}", detail::obj_trigger_name, i); + + auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false); + assert(ft != nullptr); + auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, fname.c_str(), &md); + assert(f != nullptr); + + bld.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", f)); + bld.CreateRetVoid(); + } + + // Fetch the master builder/module/context. + auto &bld = *m_impl->m_jit->m_builder; + auto &md = *m_impl->m_jit->m_module; + auto &ctx = m_impl->m_jit->context(); + + // Add the prototypes of all per-module trigger functions to the master module. + std::vector callees; + callees.reserve(m_impl->m_states.size()); + for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) { + // The function name. + const auto fname = fmt::format("{}_{}", detail::obj_trigger_name, i); + + auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false); + assert(ft != nullptr); + auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, fname.c_str(), &md); + assert(f != nullptr); + + callees.push_back(f); + } + + // Create the master trigger function. + auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false); + assert(ft != nullptr); + auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, detail::obj_trigger_name, &md); + assert(f != nullptr); + + bld.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", f)); + + // Invoke all the triggers. + for (auto *tf : callees) { + bld.CreateCall(tf, {}); + } + + // Return. 
+ bld.CreateRetVoid(); +} + +void llvm_multi_state::check_compiled(const char *f) const +{ + if (m_impl->m_jit->m_module) [[unlikely]] { + throw std::invalid_argument( + fmt::format("The function '{}' can be invoked only after the llvm_multi_state has been compiled", f)); + } +} + +void llvm_multi_state::check_uncompiled(const char *f) const +{ + if (!m_impl->m_jit->m_module) [[unlikely]] { + throw std::invalid_argument( + fmt::format("The function '{}' can be invoked only if the llvm_multi_state has not been compiled yet", f)); + } +} + +unsigned llvm_multi_state::get_n_modules() const noexcept +{ + return m_impl->m_jit->m_n_modules; +} + +unsigned llvm_multi_state::get_opt_level() const noexcept +{ + return m_impl->m_states[0].get_opt_level(); +} + +bool llvm_multi_state::fast_math() const noexcept +{ + return m_impl->m_states[0].fast_math(); +} + +bool llvm_multi_state::force_avx512() const noexcept +{ + return m_impl->m_states[0].force_avx512(); +} + +bool llvm_multi_state::get_slp_vectorize() const noexcept +{ + return m_impl->m_states[0].get_slp_vectorize(); +} + +code_model llvm_multi_state::get_code_model() const noexcept +{ + return m_impl->m_states[0].get_code_model(); +} + +bool llvm_multi_state::is_compiled() const noexcept +{ + return !m_impl->m_jit->m_module; +} + +std::vector llvm_multi_state::get_ir() const +{ + if (is_compiled()) { + return m_impl->m_jit->m_ir_snapshots; + } else { + std::vector retval; + retval.reserve(m_impl->m_jit->m_n_modules); + + for (const auto &s : m_impl->m_states) { + retval.push_back(s.get_ir()); + } + + // Add the IR from the master module. + retval.push_back(detail::ir_from_module(*m_impl->m_jit->m_module)); + + return retval; + } +} + +std::vector llvm_multi_state::get_bc() const +{ + if (is_compiled()) { + return m_impl->m_jit->m_bc_snapshots; + } else { + std::vector retval; + retval.reserve(m_impl->m_jit->m_n_modules); + + for (const auto &s : m_impl->m_states) { + retval.push_back(s.get_bc()); + } + + // Add the bitcode from the master module. + retval.push_back(detail::bc_from_module(*m_impl->m_jit->m_module)); + + return retval; + } +} + +const std::vector &llvm_multi_state::get_object_code() const +{ + check_compiled(__func__); + + return m_impl->m_jit->m_object_files; +} + +// NOTE: this function is NOT exception-safe, proper cleanup +// needs to be done externally if needed. +void llvm_multi_state::compile_impl() +{ + // Add all the modules from the states. + for (auto &s : m_impl->m_states) { + detail::add_module_to_lljit(*m_impl->m_jit->m_lljit, std::move(s.m_module), *s.m_jitter->m_ctx); + + // Clear out the builder. + s.m_builder.reset(); + + // NOTE: need to manually construct the object file, as this would + // normally be done by the invocation of s.compile() (which we do not do). + s.m_jitter->m_object_file.emplace(); + } + + // Add the master module. + detail::add_module_to_lljit(*m_impl->m_jit->m_lljit, std::move(m_impl->m_jit->m_module), *m_impl->m_jit->m_ctx); + + // Clear out the master builder. + m_impl->m_jit->m_builder.reset(); + + // Trigger optimisation and object code materialisation via lookup. + jit_lookup(detail::obj_trigger_name); + + // Sanity checks. + assert(m_impl->m_jit->m_bc_snapshots.size() == m_impl->m_jit->m_n_modules); + assert(m_impl->m_jit->m_ir_snapshots.size() == m_impl->m_jit->m_n_modules); + assert(m_impl->m_jit->m_object_files.size() == m_impl->m_jit->m_n_modules); +} + +void llvm_multi_state::compile() +{ + check_uncompiled(__func__); + + // Log runtime in trace mode. 
+ spdlog::stopwatch sw;
+
+ auto *logger = detail::get_logger();
+
+ // Verify the modules before compiling.
+ // NOTE: probably this can be parallelised if needed.
+ for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) {
+ detail::verify_module(*m_impl->m_states[i].m_module);
+ }
+
+ logger->trace("llvm_multi_state module verification runtime: {}", sw);
+
+ try {
+ // Add the object materialisation trigger functions.
+ // NOTE: contrary to llvm_state::add_obj_trigger(), add_obj_triggers()
+ // does not implement any automatic cleanup in case of errors. Thus, we fold
+ // it into the try/catch block in order to avoid leaving the
+ // llvm_multi_state in a half-baked state.
+ add_obj_triggers();
+
+ // Fetch the bitcode *before* optimisation.
+ std::vector obc;
+ obc.reserve(boost::safe_numerics::safe(m_impl->m_states.size()) + 1u);
+ for (const auto &s : m_impl->m_states) {
+ obc.push_back(s.get_bc());
+ }
+ // Add the master bitcode.
+ obc.push_back(detail::bc_from_module(*m_impl->m_jit->m_module));
+
+ // Assemble the compilation flag.
+ const auto comp_flag
+ = detail::assemble_comp_flag(get_opt_level(), force_avx512(), get_slp_vectorize(), get_code_model());
+
+ // Lookup in the cache.
+ if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) {
+ // Cache hit.
+
+ // Assign the optimised snapshots.
+ assert(cached_data->opt_ir.size() == m_impl->m_jit->m_n_modules);
+ assert(cached_data->opt_bc.size() == m_impl->m_jit->m_n_modules);
+ assert(cached_data->obj.size() == m_impl->m_jit->m_n_modules);
+ assert(m_impl->m_jit->m_ir_snapshots.empty());
+ assert(m_impl->m_jit->m_bc_snapshots.empty());
+ m_impl->m_jit->m_ir_snapshots = std::move(cached_data->opt_ir);
+ m_impl->m_jit->m_bc_snapshots = std::move(cached_data->opt_bc);
+
+ // NOTE: here it is important that we replicate the logic happening
+ // in llvm_state::compile(): clear out module/builder, construct
+ // the object file. The snapshots can be left empty.
+ for (auto &s : m_impl->m_states) {
+ s.m_module.reset();
+ s.m_builder.reset();
+ s.m_jitter->m_object_file.emplace();
+ }
+
+ // Clear out master module and builder.
+ m_impl->m_jit->m_module.reset();
+ m_impl->m_jit->m_builder.reset();
+
+ // Add and assign the object files.
+ for (const auto &obj : cached_data->obj) {
+ detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj);
+ }
+
+ // Assign the compiled objects.
+ assert(m_impl->m_jit->m_object_files.empty());
+ m_impl->m_jit->m_object_files = std::move(cached_data->obj);
+
+ // Lookup the trigger.
+ jit_lookup(detail::obj_trigger_name);
+ } else {
+ // Cache miss.
+
+ sw.reset();
+
+ // Run the compilation.
+ compile_impl();
+
+ logger->trace("optimisation + materialisation runtime: {}", sw);
+
+ // NOTE: at this point, m_ir_snapshots, m_bc_snapshots and m_object_files
+ // have all been constructed in random order because of multithreading.
+ // Sort them so that we provide deterministic behaviour. Probably
+ // not strictly needed, but let's try to avoid nondeterminism.
+ // All of this can be parallelised if needed.
+ std::ranges::sort(m_impl->m_jit->m_ir_snapshots);
+ std::ranges::sort(m_impl->m_jit->m_bc_snapshots);
+ std::ranges::sort(m_impl->m_jit->m_object_files);
+
+ // Try to insert obc into the cache.
+ detail::llvm_state_mem_cache_try_insert(std::move(obc), comp_flag,
+ {.opt_bc = m_impl->m_jit->m_bc_snapshots,
+ .opt_ir = m_impl->m_jit->m_ir_snapshots,
+ .obj = m_impl->m_jit->m_object_files});
+ // LCOV_EXCL_START
+ }
+ } catch (...)
{ + // Reset to a def-cted state in case of error, + // as it looks like there's no way of recovering. + m_impl.reset(); + + throw; + } + // LCOV_EXCL_STOP +} + +std::uintptr_t llvm_multi_state::jit_lookup(const std::string &name) +{ + check_compiled(__func__); + + auto sym = m_impl->m_jit->m_lljit->lookup(name); + if (!sym) { + throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in an llvm_multi_state", name)); + } + + return static_cast((*sym).getValue()); +} + +std::ostream &operator<<(std::ostream &os, const llvm_multi_state &s) +{ + std::ostringstream oss; + oss << std::boolalpha; + + oss << "N of modules : " << s.get_n_modules() << '\n'; + oss << "Compiled : " << s.is_compiled() << '\n'; + oss << "Fast math : " << s.fast_math() << '\n'; + oss << "Force AVX512 : " << s.force_avx512() << '\n'; + oss << "SLP vectorization : " << s.get_slp_vectorize() << '\n'; + oss << "Code model : " << s.get_code_model() << '\n'; + oss << "Optimisation level: " << s.get_opt_level() << '\n'; + oss << "Data layout : " << s.m_impl->m_states[0].m_jitter->m_lljit->getDataLayout().getStringRepresentation() + << '\n'; + oss << "Target triple : " << s.m_impl->m_states[0].m_jitter->get_target_triple().str() << '\n'; + oss << "Target CPU : " << s.m_impl->m_states[0].m_jitter->get_target_cpu() << '\n'; + oss << "Target features : " << s.m_impl->m_states[0].m_jitter->get_target_features() << '\n'; + + return os << oss.str(); +} + HEYOKA_END_NAMESPACE diff --git a/src/llvm_state_mem_cache.cpp b/src/llvm_state_mem_cache.cpp index e3135ae25..88ba8ea0f 100644 --- a/src/llvm_state_mem_cache.cpp +++ b/src/llvm_state_mem_cache.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -28,7 +28,8 @@ #include // This in-memory cache maps the bitcode -// of an LLVM module and an optimisation level to: +// of one or more LLVM modules and an integer flag +// (representing several compilation settings) to: // // - the optimised version of the bitcode, // - the textual IR corresponding @@ -43,6 +44,26 @@ HEYOKA_BEGIN_NAMESPACE namespace detail { +// Helper to compute the total size in bytes +// of the data contained in an llvm_mc_value. +// Will throw on overflow. +std::size_t llvm_mc_value::total_size() const +{ + assert(!opt_bc.empty()); + assert(opt_bc.size() == opt_ir.size()); + assert(opt_bc.size() == obj.size()); + + boost::safe_numerics::safe ret = 0; + + for (decltype(opt_bc.size()) i = 0; i < opt_bc.size(); ++i) { + ret += opt_bc[i].size(); + ret += opt_ir[i].size(); + ret += obj[i].size(); + } + + return ret; +} + namespace { @@ -56,16 +77,33 @@ HEYOKA_CONSTINIT std::mutex mem_cache_mutex; // Definition of the data structures for the cache. -using lru_queue_t = std::list>; +using lru_queue_t = std::list, unsigned>>; using lru_key_t = lru_queue_t::iterator; +// Implementation of hashing for std::pair, unsigned> and +// its heterogeneous counterpart. +template +auto cache_key_hasher(const T &k) noexcept +{ + assert(!k.first.empty()); + + // Combine the bitcodes. + auto seed = std::hash{}(k.first[0]); + for (decltype(k.first.size()) i = 1; i < k.first.size(); ++i) { + boost::hash_combine(seed, k.first[i]); + } + + // Combine with the compilation flag. 
+ boost::hash_combine(seed, static_cast(k.second)); + + return seed; +} + struct lru_hasher { std::size_t operator()(const lru_key_t &k) const noexcept { - auto seed = std::hash{}(k->first); - boost::hash_combine(seed, k->second); - return seed; + return cache_key_hasher(*k); } }; @@ -96,16 +134,16 @@ HEYOKA_CONSTINIT std::uint64_t mem_cache_limit = 2147483648ull; // Machinery for heterogeneous lookup into the cache. // NOTE: this function MUST be invoked while holding the global lock. -auto llvm_state_mem_cache_hl(const std::string &bc, unsigned opt_level) +auto llvm_state_mem_cache_hl(const std::vector &bc, unsigned comp_flag) { - using compat_key_t = std::pair; + // NOTE: the heterogeneous version of the key replaces std::vector + // with a const reference. + using compat_key_t = std::pair &, unsigned>; struct compat_hasher { std::size_t operator()(const compat_key_t &k) const noexcept { - auto seed = std::hash{}(k.first); - boost::hash_combine(seed, k.second); - return seed; + return cache_key_hasher(k); } }; @@ -120,7 +158,7 @@ auto llvm_state_mem_cache_hl(const std::string &bc, unsigned opt_level) } }; - return lru_map.find(std::make_pair(std::cref(bc), opt_level), compat_hasher{}, compat_cmp{}); + return lru_map.find(std::make_pair(std::cref(bc), comp_flag), compat_hasher{}, compat_cmp{}); } // Debug function to run sanity checks on the cache. @@ -131,15 +169,13 @@ void llvm_state_mem_cache_sanity_checks() // Check that the computed size of the cache is consistent with mem_cache_size. assert(std::accumulate(lru_map.begin(), lru_map.end(), boost::safe_numerics::safe(0), - [](const auto &a, const auto &p) { - return a + p.second.opt_bc.size() + p.second.opt_ir.size() + p.second.obj.size(); - }) + [](const auto &a, const auto &p) { return a + p.second.total_size(); }) == mem_cache_size); } } // namespace -std::optional llvm_state_mem_cache_lookup(const std::string &bc, unsigned opt_level) +std::optional llvm_state_mem_cache_lookup(const std::vector &bc, unsigned comp_flag) { // Lock down. const std::lock_guard lock(mem_cache_mutex); @@ -147,7 +183,7 @@ std::optional llvm_state_mem_cache_lookup(const std::string &bc, // Sanity checks. llvm_state_mem_cache_sanity_checks(); - if (const auto it = llvm_state_mem_cache_hl(bc, opt_level); it == lru_map.end()) { + if (const auto it = llvm_state_mem_cache_hl(bc, comp_flag); it == lru_map.end()) { // Cache miss. return {}; } else { @@ -163,7 +199,7 @@ std::optional llvm_state_mem_cache_lookup(const std::string &bc, } } -void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc_value val) +void llvm_state_mem_cache_try_insert(std::vector bc, unsigned comp_flag, llvm_mc_value val) { // Lock down. const std::lock_guard lock(mem_cache_mutex); @@ -174,7 +210,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc // Do a first lookup to check if bc is already in the cache. // This could happen, e.g., if two threads are compiling the same // code concurrently. - if (const auto it = llvm_state_mem_cache_hl(bc, opt_level); it != lru_map.end()) { + if (const auto it = llvm_state_mem_cache_hl(bc, comp_flag); it != lru_map.end()) { assert(val.opt_bc == it->second.opt_bc); assert(val.opt_ir == it->second.opt_ir); assert(val.obj == it->second.obj); @@ -183,8 +219,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc } // Compute the new cache size. 
- auto new_cache_size = static_cast(boost::safe_numerics::safe(mem_cache_size) - + val.opt_bc.size() + val.opt_ir.size() + val.obj.size()); + auto new_cache_size = boost::safe_numerics::safe(mem_cache_size) + val.total_size(); // Remove items from the cache if we are exceeding // the limit. @@ -195,8 +230,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc const auto &cur_val = cur_it->second; // NOTE: no possibility of overflow here, as cur_size is guaranteed // not to be greater than mem_cache_size. - const auto cur_size - = static_cast(cur_val.opt_bc.size()) + cur_val.opt_ir.size() + cur_val.obj.size(); + const auto cur_size = cur_val.total_size(); // NOTE: the next 4 lines cannot throw, which ensures that the // cache cannot be left in an inconsistent state. @@ -222,7 +256,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc // Add the new item to the front of the queue. // NOTE: if this throws, we have not modified lru_map yet, // no cleanup needed. - lru_queue.emplace_front(std::move(bc), opt_level); + lru_queue.emplace_front(std::move(bc), comp_flag); // Add the new item to the map. try { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 09e360cab..f6c4ccd29 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -159,6 +159,7 @@ ADD_HEYOKA_TESTCASE(cfunc) ADD_HEYOKA_TESTCASE(cfunc_multieval) ADD_HEYOKA_TESTCASE(dfun) ADD_HEYOKA_TESTCASE(var_ode_sys) +ADD_HEYOKA_TESTCASE(llvm_multi_state) if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR) ADD_HEYOKA_TESTCASE(event_detection_mp) diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp new file mode 100644 index 000000000..652ab5b56 --- /dev/null +++ b/test/llvm_multi_state.cpp @@ -0,0 +1,578 @@ +// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "catch.hpp" + +using namespace heyoka; + +TEST_CASE("basic") +{ + using Catch::Matchers::Message; + + // Default construction. + { + REQUIRE_NOTHROW(llvm_multi_state{}); + } + + // No states in input. + REQUIRE_THROWS_MATCHES(llvm_multi_state{{}}, std::invalid_argument, + Message("At least 1 llvm_state object is needed to construct an llvm_multi_state")); + + // Inconsistent settings. 
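// Aside: a minimal, self-contained sketch (not taken from the heyoka sources, and not
// part of this patch) of the caching scheme implemented in llvm_state_mem_cache.cpp
// above: the key is the vector of bitcodes of all the modules plus the integer
// compilation flag, the key hash is obtained by combining the hashes of the individual
// bitcodes and the flag, and entries are evicted in LRU order once a total byte-size
// budget is exceeded. The real cache stores the optimised bitcode, IR and object files
// per entry and supports heterogeneous lookup; the simplified names and the
// single-string payload below are illustrative assumptions.

#include <cstddef>
#include <functional>
#include <list>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace sketch
{

using cache_key = std::pair<std::vector<std::string>, unsigned>;

struct cache_key_hash {
    std::size_t operator()(const cache_key &k) const
    {
        // Combine the hashes of the individual bitcodes, then the compilation flag
        // (boost::hash_combine plays this role in the real implementation).
        std::size_t seed = 0;
        const auto combine = [&seed](std::size_t h) { seed ^= h + 0x9e3779b9u + (seed << 6) + (seed >> 2); };
        for (const auto &bc : k.first) {
            combine(std::hash<std::string>{}(bc));
        }
        combine(std::hash<unsigned>{}(k.second));
        return seed;
    }
};

class lru_bc_cache
{
    // Most-recently-used keys at the front of the queue.
    std::list<cache_key> m_queue;
    // Key -> (position in the queue, cached payload).
    std::unordered_map<cache_key, std::pair<std::list<cache_key>::iterator, std::string>, cache_key_hash> m_map;
    std::size_t m_size = 0, m_limit;

public:
    explicit lru_bc_cache(std::size_t limit) : m_limit(limit) {}

    std::optional<std::string> lookup(const cache_key &k)
    {
        const auto it = m_map.find(k);
        if (it == m_map.end()) {
            return {};
        }
        // Cache hit: move the entry to the front of the queue.
        m_queue.splice(m_queue.begin(), m_queue, it->second.first);
        return it->second.second;
    }

    void try_insert(cache_key k, std::string payload)
    {
        if (m_map.find(k) != m_map.end()) {
            // Already present (e.g., two threads compiled the same code concurrently).
            return;
        }
        const auto payload_size = payload.size();
        // Evict least-recently-used entries until the new payload fits in the budget.
        while (!m_queue.empty() && m_size + payload_size > m_limit) {
            const auto vit = m_map.find(m_queue.back());
            m_size -= vit->second.second.size();
            m_map.erase(vit);
            m_queue.pop_back();
        }
        m_queue.push_front(std::move(k));
        m_map.emplace(m_queue.front(), std::make_pair(m_queue.begin(), std::move(payload)));
        m_size += payload_size;
    }
};

} // namespace sketch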
+ REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{kw::opt_level = 1u}, llvm_state{kw::opt_level = 2u}}}), std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{kw::fast_math = true}, llvm_state{}}}), std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{}, llvm_state{kw::force_avx512 = true}}}), std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{}, llvm_state{}, llvm_state{kw::slp_vectorize = true}}}), std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{}, llvm_state{kw::code_model = code_model::large}, llvm_state{}}}), + std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + + { + // Construction from compiled modules. + llvm_state s; + s.compile(); + + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{s, llvm_state{}}}), std::invalid_argument, + Message("An llvm_multi_state can be constructed only from uncompiled llvm_state objects")); + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{}, s}}), std::invalid_argument, + Message("An llvm_multi_state can be constructed only from uncompiled llvm_state objects")); + } + + { + // Invalid module name. + llvm_state s{kw::mname = "heyoka.master"}; + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{s, llvm_state{}}}), std::invalid_argument, + Message("An invalid llvm_state was passed to the constructor of an llvm_multi_state: the module name " + "'heyoka.master' is reserved for internal use by llvm_multi_state")); + } + + // Test the property getters. + { + llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true, + kw::code_model = code_model::large}; + + llvm_multi_state ms{{s, s, s, s}}; + + REQUIRE(ms.get_opt_level() == 1u); + REQUIRE(ms.fast_math()); + REQUIRE(ms.force_avx512()); + REQUIRE(ms.get_slp_vectorize()); + REQUIRE(ms.get_code_model() == code_model::large); + REQUIRE(ms.get_n_modules() == 5u); + REQUIRE(!ms.is_compiled()); + + ms.compile(); + + REQUIRE(ms.is_compiled()); + REQUIRE(ms.get_opt_level() == 1u); + REQUIRE(ms.fast_math()); + REQUIRE(ms.force_avx512()); + REQUIRE(ms.get_slp_vectorize()); + REQUIRE(ms.get_code_model() == code_model::large); + REQUIRE(ms.get_n_modules() == 5u); + + REQUIRE_THROWS_MATCHES( + ms.compile(), std::invalid_argument, + Message("The function 'compile' can be invoked only if the llvm_multi_state has not been compiled yet")); + } + + // Move construction/assignment. 
+ { + llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true, + kw::code_model = code_model::large}; + + llvm_multi_state ms{{s, s, s, s}}; + + auto ms2 = std::move(ms); + + REQUIRE(ms2.get_opt_level() == 1u); + REQUIRE(ms2.fast_math()); + REQUIRE(ms2.force_avx512()); + REQUIRE(ms2.get_slp_vectorize()); + REQUIRE(ms2.get_code_model() == code_model::large); + REQUIRE(ms2.get_n_modules() == 5u); + REQUIRE(!ms2.is_compiled()); + + ms2.compile(); + + llvm_multi_state ms3; + ms3 = std::move(ms2); + + REQUIRE(ms3.is_compiled()); + REQUIRE(ms3.get_opt_level() == 1u); + REQUIRE(ms3.fast_math()); + REQUIRE(ms3.force_avx512()); + REQUIRE(ms3.get_slp_vectorize()); + REQUIRE(ms3.get_code_model() == code_model::large); + REQUIRE(ms3.get_n_modules() == 5u); + } +} + +TEST_CASE("copy semantics") +{ + using Catch::Matchers::Message; + + // NOTE: in order to properly test this, we have to disable the cache. + llvm_state::clear_memcache(); + llvm_state::set_memcache_limit(0); + + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2; + + add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true); + add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true); + + llvm_multi_state ms{{s1, s2}}; + + auto ms_copy = ms; + + REQUIRE(ms_copy.get_bc() == ms.get_bc()); + REQUIRE(ms_copy.get_ir() == ms.get_ir()); + REQUIRE(ms_copy.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy.fast_math() == ms.fast_math()); + REQUIRE(ms_copy.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE_THROWS_MATCHES( + ms_copy.get_object_code(), std::invalid_argument, + Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled")); + REQUIRE_THROWS_MATCHES( + ms_copy.jit_lookup("foo"), std::invalid_argument, + Message("The function 'jit_lookup' can be invoked only after the llvm_multi_state has been compiled")); + + ms.compile(); + ms_copy.compile(); + + REQUIRE(ms_copy.get_bc() == ms.get_bc()); + REQUIRE(ms_copy.get_ir() == ms.get_ir()); + REQUIRE(ms_copy.get_object_code() == ms.get_object_code()); + REQUIRE(ms_copy.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy.fast_math() == ms.fast_math()); + REQUIRE(ms_copy.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE_NOTHROW(ms_copy.jit_lookup("f1")); + REQUIRE_NOTHROW(ms_copy.jit_lookup("f2")); + + { + auto *cf1_ptr = reinterpret_cast( + ms_copy.jit_lookup("f1")); + auto *cf2_ptr = reinterpret_cast( + ms_copy.jit_lookup("f2")); + + REQUIRE_THROWS_MATCHES(ms_copy.jit_lookup("f3"), std::invalid_argument, + Message("Could not find the symbol 'f3' in an llvm_multi_state")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. 
/ 3.); + } + + auto ms_copy2 = ms; + + REQUIRE(ms_copy2.get_bc() == ms.get_bc()); + REQUIRE(ms_copy2.get_ir() == ms.get_ir()); + REQUIRE(ms_copy2.get_object_code() == ms.get_object_code()); + REQUIRE(ms_copy2.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy2.fast_math() == ms.fast_math()); + REQUIRE(ms_copy2.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy2.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy2.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy2.get_code_model() == ms.get_code_model()); + REQUIRE_NOTHROW(ms_copy2.jit_lookup("f1")); + REQUIRE_NOTHROW(ms_copy2.jit_lookup("f2")); + + { + auto *cf1_ptr = reinterpret_cast( + ms_copy2.jit_lookup("f1")); + auto *cf2_ptr = reinterpret_cast( + ms_copy2.jit_lookup("f2")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. / 3.); + } + + // Test also copy assignment. + llvm_multi_state ms_copy3; + ms_copy3 = ms_copy2; + + REQUIRE(ms_copy3.get_bc() == ms.get_bc()); + REQUIRE(ms_copy3.get_ir() == ms.get_ir()); + REQUIRE(ms_copy3.get_object_code() == ms.get_object_code()); + REQUIRE(ms_copy3.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy3.fast_math() == ms.fast_math()); + REQUIRE(ms_copy3.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy3.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy3.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy3.get_code_model() == ms.get_code_model()); + REQUIRE_NOTHROW(ms_copy3.jit_lookup("f1")); + REQUIRE_NOTHROW(ms_copy3.jit_lookup("f2")); + + { + auto *cf1_ptr = reinterpret_cast( + ms_copy3.jit_lookup("f1")); + auto *cf2_ptr = reinterpret_cast( + ms_copy3.jit_lookup("f2")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. / 3.); + } + + // Restore the cache. + llvm_state::set_memcache_limit(100'000'000ull); +} + +TEST_CASE("s11n") +{ + using Catch::Matchers::Message; + + // NOTE: in order to properly test this, we have to disable the cache. + llvm_state::clear_memcache(); + llvm_state::set_memcache_limit(0); + + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2; + + add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true); + add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true); + + // Uncompiled. 
+ llvm_multi_state ms{{s1, s2}}; + + std::stringstream ss; + + { + boost::archive::binary_oarchive oa(ss); + oa << ms; + } + + llvm_multi_state ms_copy{{llvm_state{}}}; + + { + boost::archive::binary_iarchive ia(ss); + ia >> ms_copy; + } + + REQUIRE(ms_copy.get_bc() == ms.get_bc()); + REQUIRE(ms_copy.get_ir() == ms.get_ir()); + REQUIRE(ms_copy.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy.fast_math() == ms.fast_math()); + REQUIRE(ms_copy.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE_THROWS_MATCHES( + ms_copy.get_object_code(), std::invalid_argument, + Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled")); + REQUIRE_THROWS_MATCHES( + ms_copy.jit_lookup("foo"), std::invalid_argument, + Message("The function 'jit_lookup' can be invoked only after the llvm_multi_state has been compiled")); + + // Compiled. + ms.compile(); + + ss.str(""); + + { + boost::archive::binary_oarchive oa(ss); + oa << ms; + } + + { + boost::archive::binary_iarchive ia(ss); + ia >> ms_copy; + } + + REQUIRE(ms_copy.get_bc() == ms.get_bc()); + REQUIRE(ms_copy.get_ir() == ms.get_ir()); + REQUIRE(ms_copy.get_object_code() == ms.get_object_code()); + REQUIRE(ms_copy.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy.fast_math() == ms.fast_math()); + REQUIRE(ms_copy.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE_NOTHROW(ms_copy.jit_lookup("f1")); + REQUIRE_NOTHROW(ms_copy.jit_lookup("f2")); + + { + auto *cf1_ptr = reinterpret_cast( + ms_copy.jit_lookup("f1")); + auto *cf2_ptr = reinterpret_cast( + ms_copy.jit_lookup("f2")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. / 3.); + } + + // Restore the cache. + llvm_state::set_memcache_limit(100'000'000ull); +} + +TEST_CASE("cfunc") +{ + using Catch::Matchers::Message; + + // Basic test. + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2; + + add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true); + add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true); + + const auto orig_ir1 = s1.get_ir(); + const auto orig_ir2 = s2.get_ir(); + + const auto orig_bc1 = s1.get_bc(); + const auto orig_bc2 = s2.get_bc(); + + llvm_multi_state ms{{s1, s2}}; + + REQUIRE(ms.get_ir().size() == 3u); + REQUIRE(ms.get_bc().size() == 3u); + REQUIRE_THROWS_MATCHES( + ms.get_object_code(), std::invalid_argument, + Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled")); + + REQUIRE(orig_ir1 == ms.get_ir()[0]); + REQUIRE(orig_ir2 == ms.get_ir()[1]); + + REQUIRE(orig_bc1 == ms.get_bc()[0]); + REQUIRE(orig_bc2 == ms.get_bc()[1]); + + ms.compile(); + s1.compile(); + s2.compile(); + + REQUIRE(ms.get_ir().size() == 3u); + REQUIRE(ms.get_bc().size() == 3u); + + auto *cf1_ptr + = reinterpret_cast(ms.jit_lookup("f1")); + auto *cf2_ptr + = reinterpret_cast(ms.jit_lookup("f2")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. 
/ 3.); +} + +TEST_CASE("stream op") +{ + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2; + + add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true); + add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true); + + const auto orig_ir1 = s1.get_ir(); + const auto orig_ir2 = s2.get_ir(); + + const auto orig_bc1 = s1.get_bc(); + const auto orig_bc2 = s2.get_bc(); + + llvm_multi_state ms{{s1, s2}}; + + std::ostringstream oss; + oss << ms; + + REQUIRE(!oss.str().empty()); +} + +// A test to check that, post compilation, snapshots and object files +// are ordered deterministically. +TEST_CASE("post compile ordering") +{ + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2, s3, s4; + + add_cfunc(s1, "f1", {x * y}, {x, y}); + add_cfunc(s2, "f2", {x / y}, {x, y}); + add_cfunc(s3, "f3", {x + y}, {x, y}); + add_cfunc(s4, "f4", {x - y}, {x, y}); + + llvm_state::clear_memcache(); + + llvm_multi_state ms{{s1, s2, s3, s4}}; + ms.compile(); + + const auto orig_obj = ms.get_object_code(); + const auto orig_ir = ms.get_ir(); + const auto orig_bc = ms.get_bc(); + + for (auto i = 0; i < 20; ++i) { + llvm_state::clear_memcache(); + + llvm_multi_state ms2{{s1, s2, s3, s4}}; + ms2.compile(); + + REQUIRE(ms2.get_object_code() == orig_obj); + REQUIRE(ms2.get_ir() == orig_ir); + REQUIRE(ms2.get_bc() == orig_bc); + } +} + +TEST_CASE("memcache testing") +{ + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2, s3, s4; + + add_cfunc(s1, "f1", {x * y}, {x, y}); + add_cfunc(s2, "f2", {x / y}, {x, y}); + add_cfunc(s3, "f3", {x + y}, {x, y}); + add_cfunc(s4, "f4", {x - y}, {x, y}); + + llvm_state::clear_memcache(); + + llvm_multi_state ms{{s1, s2, s3, s4}}; + ms.compile(); + + const auto cur_cache_size = llvm_state::get_memcache_size(); + + llvm_multi_state ms2{{s1, s2, s3, s4}}; + ms2.compile(); + + REQUIRE(cur_cache_size == llvm_state::get_memcache_size()); + + auto *cf1_ptr + = reinterpret_cast(ms.jit_lookup("f1")); + auto *cf2_ptr + = reinterpret_cast(ms.jit_lookup("f2")); + auto *cf3_ptr + = reinterpret_cast(ms.jit_lookup("f3")); + auto *cf4_ptr + = reinterpret_cast(ms.jit_lookup("f4")); + + const double ins[] = {2., 3.}; + double outs[4] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + cf3_ptr(outs + 2, ins, nullptr, nullptr); + cf4_ptr(outs + 3, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. / 3.); + REQUIRE(outs[2] == 5); + REQUIRE(outs[3] == -1); +} + +// Tests to check vectorisation via the vector-function-abi-variant machinery. +TEST_CASE("vfabi double") +{ + for (auto fast_math : {false, true}) { + llvm_state s1{kw::slp_vectorize = true, kw::fast_math = fast_math}; + llvm_state s2{kw::slp_vectorize = true, kw::fast_math = fast_math}; + + auto [a, b] = make_vars("a", "b"); + + add_cfunc(s1, "cfunc", {erf(a), erf(b)}, {a, b}); + add_cfunc(s2, "cfuncs", {erf(a), erf(b)}, {a, b}, kw::strided = true); + + llvm_multi_state ms{{s1, s2}}; + + ms.compile(); + + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 + + for (auto ir : ms.get_ir()) { + using string_find_iterator = boost::find_iterator; + + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } + + // NOTE: in the master module or in the "cfunc" module, we don't + // expect any @erf: the master module contains only the trigger, + // the "cfunc" module should have vectorised everything and + // there should be no more references to the scalar @erf. + if (count == 0u) { + continue; + } + + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } + +#endif + } +} diff --git a/tools/gha_llvm13_conda_asan.sh b/tools/gha_llvm13_conda_asan.sh deleted file mode 100644 index cba8e3692..000000000 --- a/tools/gha_llvm13_conda_asan.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -# Echo each command -set -x - -# Exit on error. -set -e - -# Core deps. -sudo apt-get install wget - -# Install conda+deps. -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniconda.sh -export deps_dir=$HOME/local -export PATH="$HOME/miniconda/bin:$PATH" -bash miniconda.sh -b -p $HOME/miniconda -mamba create -y -p $deps_dir c-compiler cxx-compiler cmake ninja 'llvmdev=13.*' \ - tbb-devel tbb libboost-devel 'mppp=1.*' sleef xtensor xtensor-blas blas \ - blas-devel fmt spdlog -source activate $deps_dir - -# Create the build dir and cd into it. -mkdir build -cd build - -# Clear the compilation flags set up by conda. -unset CXXFLAGS -unset CFLAGS - -# Configure. -cmake ../ -G Ninja \ - -DCMAKE_PREFIX_PATH=$deps_dir \ - -DCMAKE_BUILD_TYPE=Debug \ - -DHEYOKA_BUILD_TESTS=yes \ - -DHEYOKA_BUILD_TUTORIALS=ON \ - -DHEYOKA_WITH_MPPP=yes \ - -DHEYOKA_WITH_SLEEF=yes \ - -DCMAKE_CXX_FLAGS="-fsanitize=address" \ - -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" - -# Build. -ninja -v - -# Run the tests. -ctest -VV -j4 - -set +e -set +x diff --git a/tools/gha_llvm14_conda_asan.sh b/tools/gha_llvm14_conda_asan.sh deleted file mode 100644 index cc54de5ae..000000000 --- a/tools/gha_llvm14_conda_asan.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -# Echo each command -set -x - -# Exit on error. -set -e - -# Core deps. -sudo apt-get install wget - -# Install conda+deps. -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniconda.sh -export deps_dir=$HOME/local -export PATH="$HOME/miniconda/bin:$PATH" -bash miniconda.sh -b -p $HOME/miniconda -mamba create -y -p $deps_dir c-compiler cxx-compiler cmake ninja 'llvmdev=14.*' \ - tbb-devel tbb libboost-devel 'mppp=1.*' sleef xtensor xtensor-blas blas \ - blas-devel fmt spdlog -source activate $deps_dir - -# Create the build dir and cd into it. -mkdir build -cd build - -# Clear the compilation flags set up by conda. -unset CXXFLAGS -unset CFLAGS - -# Configure. -cmake ../ -G Ninja \ - -DCMAKE_PREFIX_PATH=$deps_dir \ - -DCMAKE_BUILD_TYPE=Debug \ - -DHEYOKA_BUILD_TESTS=yes \ - -DHEYOKA_BUILD_TUTORIALS=ON \ - -DHEYOKA_WITH_MPPP=yes \ - -DHEYOKA_WITH_SLEEF=yes \ - -DCMAKE_CXX_FLAGS="-fsanitize=address" \ - -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" - -# Build. -ninja -v - -# Run the tests. -ctest -VV -j4 - -set +e -set +x
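// Aside: a minimal usage sketch of the new llvm_multi_state class, distilled from the
// tests added in test/llvm_multi_state.cpp above. It is not part of the patch; the
// header names and the <double> template argument of add_cfunc are assumptions (the
// template arguments are elided in this diff), and the cfunc pointer signature
// (outputs, inputs, pars, time) mirrors the casts used in the tests.

#include <iostream>

#include <heyoka/expression.hpp>
#include <heyoka/llvm_state.hpp>

int main()
{
    using namespace heyoka;

    auto [x, y] = make_vars("x", "y");

    // One compiled function per llvm_state.
    llvm_state s1, s2;
    add_cfunc<double>(s1, "f1", {x * y}, {x, y});
    add_cfunc<double>(s2, "f2", {x / y}, {x, y});

    // Merge the (still uncompiled) states. All settings (opt level, fast math,
    // code model, ...) must be consistent across the states, otherwise the
    // constructor throws.
    llvm_multi_state ms{{s1, s2}};

    // Two input modules plus the internal master module:
    // ms.get_n_modules() == 3u.

    // Compile everything in one go: the whole set of modules shares a single
    // entry in the in-memory cache, keyed on the bitcodes of all the modules.
    ms.compile();

    // Look up and invoke the compiled functions.
    using cfunc_ptr_t = void (*)(double *, const double *, const double *, const double *);
    auto *f1 = reinterpret_cast<cfunc_ptr_t>(ms.jit_lookup("f1"));
    auto *f2 = reinterpret_cast<cfunc_ptr_t>(ms.jit_lookup("f2"));

    const double ins[] = {2., 3.};
    double outs[2] = {};
    f1(outs, ins, nullptr, nullptr);
    f2(outs + 1, ins, nullptr, nullptr);

    std::cout << outs[0] << ' ' << outs[1] << '\n'; // prints "6 0.666667"
}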