From 82f8228a05b840b254fa7fe02350c294038ac408 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 28 Jul 2024 20:02:51 +0200
Subject: [PATCH 01/41] Drop support for LLVM 13, require LLVM 14.

---
 CMakeLists.txt              |   4 +-
 doc/install.rst             |   2 +-
 src/detail/llvm_helpers.cpp |   9 ---
 src/llvm_state.cpp          | 134 +++---------------------------------
 4 files changed, 14 insertions(+), 135 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 15905e716..a8e4a9d91 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -158,8 +158,8 @@ endif()
 include(GNUInstallDirs)
 find_package(LLVM REQUIRED CONFIG)
 
-if(${LLVM_VERSION_MAJOR} LESS 13 OR ${LLVM_VERSION_MAJOR} GREATER 18)
-  message(FATAL_ERROR "LLVM >= 13 and <= 18 is required.")
+if(${LLVM_VERSION_MAJOR} LESS 14 OR ${LLVM_VERSION_MAJOR} GREATER 18)
+  message(FATAL_ERROR "LLVM >= 14 and <= 18 is required.")
 endif()
 
 # List of source files.
diff --git a/doc/install.rst b/doc/install.rst
index 9b922c73e..276843abc 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -14,7 +14,7 @@ and several CPU architectures (x86-64, 64-bit ARM and 64-bit PowerPC).
 
 heyoka has the following **mandatory** dependencies:
 
-* the `LLVM <https://llvm.org/>`__ compiler infrastructure library (version >=13 and <=18),
+* the `LLVM <https://llvm.org/>`__ compiler infrastructure library (version >=14 and <=18),
 * the `Boost <https://www.boost.org/>`__ C++ libraries (version >=1.69),
 * the `{fmt} <https://fmt.dev/latest/index.html>`__ library (version >=9 and <=10),
 * the `spdlog <https://github.com/gabime/spdlog>`__ library,
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index f289480f2..affdee4e0 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -311,17 +311,8 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
                 = (use_fast_math && !el.lp_vf_abi_attr.empty()) ? el.lp_vf_abi_attr : el.vf_abi_attr;
             vf_abi_strs.push_back(vf_abi_attr);
         }
-#if LLVM_VERSION_MAJOR >= 14
         call->addFnAttr(llvm::Attribute::get(context, "vector-function-abi-variant",
                                              fmt::format("{}", fmt::join(vf_abi_strs, ","))));
-#else
-        {
-            auto attrs = call->getAttributes();
-            attrs = attrs.addAttribute(context, llvm::AttributeList::FunctionIndex, "vector-function-abi-variant",
-                                       fmt::format("{}", fmt::join(vf_abi_strs, ",")));
-            call->setAttributes(attrs);
-        }
-#endif
 
         // Now we need to:
         // - add the declarations of the vector variants to the module,
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index e08e2681c..82c1c0700 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -37,6 +37,8 @@
 
 #include <llvm/ADT/SmallString.h>
 #include <llvm/ADT/SmallVector.h>
+#include <llvm/Analysis/CGSCCPassManager.h>
+#include <llvm/Analysis/LoopAnalysisManager.h>
 #include <llvm/Analysis/TargetLibraryInfo.h>
 #include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/Bitcode/BitcodeReader.h>
@@ -59,8 +61,12 @@
 #include <llvm/IR/LLVMContext.h>
 #include <llvm/IR/Module.h>
 #include <llvm/IR/Operator.h>
+#include <llvm/IR/PassManager.h>
 #include <llvm/IR/Value.h>
 #include <llvm/IR/Verifier.h>
+#include <llvm/MC/TargetRegistry.h>
+#include <llvm/Passes/OptimizationLevel.h>
+#include <llvm/Passes/PassBuilder.h>
 #include <llvm/Support/Casting.h>
 #include <llvm/Support/CodeGen.h>
 #include <llvm/Support/MemoryBuffer.h>
@@ -70,53 +76,6 @@
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Transforms/Vectorize/LoadStoreVectorizer.h>
 
-#if LLVM_VERSION_MAJOR < 14
-
-// NOTE: this header was moved in LLVM 14.
-#include <llvm/Support/TargetRegistry.h>
-
-#else
-
-#include <llvm/MC/TargetRegistry.h>
-
-#endif
-
-// NOTE: new pass manager API.
-// NOTE: this is available since LLVM 13, but in that
-// version it seems like auto-vectorization with
-// vector-function-abi-variant is not working
-// properly with the new pass manager. Hence, we
-// enable it from LLVM 14.
-#if LLVM_VERSION_MAJOR >= 14
-
-#define HEYOKA_USE_NEW_LLVM_PASS_MANAGER
-
-#endif
-
-#if defined(HEYOKA_USE_NEW_LLVM_PASS_MANAGER)
-
-#include <llvm/Analysis/CGSCCPassManager.h>
-#include <llvm/Analysis/LoopAnalysisManager.h>
-#include <llvm/IR/PassManager.h>
-#include <llvm/Passes/PassBuilder.h>
-
-#if LLVM_VERSION_MAJOR >= 14
-
-// NOTE: this header is available since LLVM 14.
-#include <llvm/Passes/OptimizationLevel.h>
-
-#endif
-
-#else
-
-#include <llvm/CodeGen/TargetPassConfig.h>
-#include <llvm/IR/LegacyPassManager.h>
-#include <llvm/Pass.h>
-#include <llvm/Transforms/IPO.h>
-#include <llvm/Transforms/IPO/PassManagerBuilder.h>
-
-#endif
-
 #if LLVM_VERSION_MAJOR >= 17
 
 // NOTE: this header was moved in LLVM 17.
@@ -1055,13 +1014,7 @@ void llvm_state::optimise()
     for (auto &f : module()) {
         auto attrs = f.getAttributes();
 
-        llvm::AttrBuilder
-#if LLVM_VERSION_MAJOR < 14
-            new_attrs
-#else
-            new_attrs(ctx)
-#endif
-            ;
+        llvm::AttrBuilder new_attrs(ctx);
 
         if (!cpu.empty() && !f.hasFnAttribute("target-cpu")) {
             new_attrs.addAttribute("target-cpu", cpu);
@@ -1081,11 +1034,7 @@ void llvm_state::optimise()
         }
 
         // Let new_attrs override attrs.
-#if LLVM_VERSION_MAJOR < 14
-        f.setAttributes(attrs.addAttributes(ctx, llvm::AttributeList::FunctionIndex, new_attrs));
-#else
         f.setAttributes(attrs.addFnAttributes(ctx, new_attrs));
-#endif
     }
 
     // Force usage of AVX512 registers, if requested.
@@ -1095,20 +1044,9 @@ void llvm_state::optimise()
         }
     }
 
-#if defined(HEYOKA_USE_NEW_LLVM_PASS_MANAGER)
-
     // NOTE: adapted from here:
     // https://llvm.org/docs/NewPassManager.html
 
-    // Optimisation level for the module pass manager.
-    // NOTE: the OptimizationLevel class has changed location
-    // since LLVM 14.
-#if LLVM_VERSION_MAJOR >= 14
-    using olevel = llvm::OptimizationLevel;
-#else
-    using olevel = llvm::PassBuilder::OptimizationLevel;
-#endif
-
     // Create the analysis managers.
     llvm::LoopAnalysisManager LAM;
     llvm::FunctionAnalysisManager FAM;
@@ -1147,18 +1085,18 @@ void llvm_state::optimise()
     PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
     // Construct the optimisation level.
-    olevel ol{};
+    llvm::OptimizationLevel ol{};
 
     switch (m_opt_level) {
         case 1u:
-            ol = olevel::O1;
+            ol = llvm::OptimizationLevel::O1;
             break;
         case 2u:
-            ol = olevel::O2;
+            ol = llvm::OptimizationLevel::O2;
             break;
         default:
             assert(m_opt_level == 3u);
-            ol = olevel::O3;
+            ol = llvm::OptimizationLevel::O3;
     }
 
     // Create the module pass manager.
@@ -1166,56 +1104,6 @@ void llvm_state::optimise()
 
     // Optimize the IR.
     MPM.run(*m_module, MAM);
-
-#else
-
-    // Init the module pass manager.
-    auto module_pm = std::make_unique<llvm::legacy::PassManager>();
-    // These are passes which set up target-specific info
-    // that are used by successive optimisation passes.
-    auto tliwp = std::make_unique<llvm::TargetLibraryInfoWrapperPass>(
-        llvm::TargetLibraryInfoImpl(m_jitter->get_target_triple()));
-    module_pm->add(tliwp.release());
-    module_pm->add(llvm::createTargetTransformInfoWrapperPass(m_jitter->get_target_ir_analysis()));
-
-    // NOTE: not sure what this does, presumably some target-specifc
-    // configuration.
-    module_pm->add(static_cast<llvm::LLVMTargetMachine &>(*m_jitter->m_tm).createPassConfig(*module_pm));
-
-    // Init the function pass manager.
-    auto f_pm = std::make_unique<llvm::legacy::FunctionPassManager>(m_module.get());
-    f_pm->add(llvm::createTargetTransformInfoWrapperPass(m_jitter->get_target_ir_analysis()));
-
-    // We use the helper class PassManagerBuilder to populate the module
-    // pass manager with standard options.
-    llvm::PassManagerBuilder pm_builder;
-    // See here for the defaults:
-    // https://llvm.org/doxygen/PassManagerBuilder_8cpp_source.html
-    pm_builder.OptLevel = m_opt_level;
-    // Enable function inlining.
-    pm_builder.Inliner = llvm::createFunctionInliningPass(m_opt_level, 0, false);
-    // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off
-    // by default. Not sure why it is off, the LLVM docs imply this
-    // is on by default at nonzero optimisation levels for clang and opt.
-    pm_builder.SLPVectorize = m_slp_vectorize;
-
-    m_jitter->m_tm->adjustPassManager(pm_builder);
-
-    // Populate both the function pass manager and the module pass manager.
-    pm_builder.populateFunctionPassManager(*f_pm);
-    pm_builder.populateModulePassManager(*module_pm);
-
-    // Run the function pass manager on all functions in the module.
-    f_pm->doInitialization();
-    for (auto &f : *m_module) {
-        f_pm->run(f);
-    }
-    f_pm->doFinalization();
-
-    // Run the module passes.
-    module_pm->run(*m_module);
-
-#endif
 }
 
 namespace detail

From b36fa7a7aeeea0d3d06202b55541c22bff084d8c Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Tue, 30 Jul 2024 11:48:54 +0200
Subject: [PATCH 02/41] Abstract out in separate function the module
 optimisation pass.

---
 src/llvm_state.cpp | 227 +++++++++++++++++++++++----------------------
 1 file changed, 117 insertions(+), 110 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 82c1c0700..966f0e6fa 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -265,6 +265,122 @@ void init_native_target()
     });
 }
 
+// Helper to optimise the input module M. Implemented here for re-use.
+template <typename Jit>
+void optimise_module(llvm::Module &M, Jit &jit, unsigned opt_level, bool force_avx512, bool slp_vectorize)
+{
+    // NOTE: don't run any optimisation pass at O0.
+    if (opt_level == 0u) {
+        return;
+    }
+
+    // NOTE: the logic here largely mimics (with a lot of simplifications)
+    // the implementation of the 'opt' tool. See:
+    // https://github.com/llvm/llvm-project/blob/release/10.x/llvm/tools/opt/opt.cpp
+
+    // For every function in the module, setup its attributes
+    // so that the codegen uses all the features available on
+    // the host CPU.
+    const auto cpu = jit.get_target_cpu();
+    const auto features = jit.get_target_features();
+
+    auto &ctx = M.getContext();
+
+    for (auto &f : M) {
+        auto attrs = f.getAttributes();
+
+        llvm::AttrBuilder new_attrs(ctx);
+
+        if (!cpu.empty() && !f.hasFnAttribute("target-cpu")) {
+            new_attrs.addAttribute("target-cpu", cpu);
+        }
+
+        if (!features.empty()) {
+            auto old_features = f.getFnAttribute("target-features").getValueAsString();
+
+            if (old_features.empty()) {
+                new_attrs.addAttribute("target-features", features);
+            } else {
+                llvm::SmallString<256> appended(old_features);
+                appended.push_back(',');
+                appended.append(features);
+                new_attrs.addAttribute("target-features", appended);
+            }
+        }
+
+        // Let new_attrs override attrs.
+        f.setAttributes(attrs.addFnAttributes(ctx, new_attrs));
+    }
+
+    // Force usage of AVX512 registers, if requested.
+    if (force_avx512 && get_target_features().avx512f) {
+        for (auto &f : M) {
+            f.addFnAttr("prefer-vector-width", "512");
+        }
+    }
+
+    // NOTE: adapted from here:
+    // https://llvm.org/docs/NewPassManager.html
+
+    // Create the analysis managers.
+    llvm::LoopAnalysisManager LAM;
+    llvm::FunctionAnalysisManager FAM;
+    llvm::CGSCCAnalysisManager CGAM;
+    llvm::ModuleAnalysisManager MAM;
+
+    // NOTE: in the new pass manager, this seems to be the way to
+    // set the target library info bits. See:
+    // https://github.com/llvm/llvm-project/blob/b7fd30eac3183993806cc218b6deb39eb625c083/llvm/tools/opt/NewPMDriver.cpp#L408
+    // Not sure if this matters, but we did it in the old pass manager
+    // and opt does it too.
+    llvm::TargetLibraryInfoImpl TLII(jit.get_target_triple());
+    FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); });
+
+    // Create the new pass manager builder, passing
+    // the native target machine from the JIT class.
+    // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off
+    // by default. Not sure why it is off, the LLVM docs imply this
+    // is on by default at nonzero optimisation levels for clang and opt.
+    // NOTE: the reason for this inconsistency is that opt uses PB.parsePassPipeline()
+    // (instead of PB.buildPerModuleDefaultPipeline()) to set up the optimisation
+    // pipeline. Indeed, if we replace PB.buildPerModuleDefaultPipeline(ol) with
+    // PB.parsePassPipeline(MPM, "default<O3>") (which corresponds to invoking
+    // "opt -passes='default<O3>'"), we do NOT need to set SLP vectorization on
+    // here to get the SLP vectorizer. Not sure if we should consider switching to this
+    // alternative way of setting up the optimisation pipeline in the future.
+    llvm::PipelineTuningOptions pto;
+    pto.SLPVectorization = slp_vectorize;
+    llvm::PassBuilder PB(jit.m_tm.get(), pto);
+
+    // Register all the basic analyses with the managers.
+    PB.registerModuleAnalyses(MAM);
+    PB.registerCGSCCAnalyses(CGAM);
+    PB.registerFunctionAnalyses(FAM);
+    PB.registerLoopAnalyses(LAM);
+    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+    // Construct the optimisation level.
+    llvm::OptimizationLevel ol{};
+
+    switch (opt_level) {
+        case 1u:
+            ol = llvm::OptimizationLevel::O1;
+            break;
+        case 2u:
+            ol = llvm::OptimizationLevel::O2;
+            break;
+        default:
+            assert(opt_level == 3u);
+            ol = llvm::OptimizationLevel::O3;
+    }
+
+    // Create the module pass manager.
+    auto MPM = PB.buildPerModuleDefaultPipeline(ol);
+
+    // Optimize the IR.
+    MPM.run(M, MAM);
+}
+
 } // namespace
 
 // Helper function to fetch a const ref to a global object
@@ -994,116 +1110,7 @@ void llvm_state::optimise()
 {
     check_uncompiled(__func__);
 
-    // NOTE: don't run any optimisation pass at O0.
-    if (m_opt_level == 0u) {
-        return;
-    }
-
-    // NOTE: the logic here largely mimics (with a lot of simplifications)
-    // the implementation of the 'opt' tool. See:
-    // https://github.com/llvm/llvm-project/blob/release/10.x/llvm/tools/opt/opt.cpp
-
-    // For every function in the module, setup its attributes
-    // so that the codegen uses all the features available on
-    // the host CPU.
-    const auto cpu = m_jitter->get_target_cpu();
-    const auto features = m_jitter->get_target_features();
-
-    auto &ctx = context();
-
-    for (auto &f : module()) {
-        auto attrs = f.getAttributes();
-
-        llvm::AttrBuilder new_attrs(ctx);
-
-        if (!cpu.empty() && !f.hasFnAttribute("target-cpu")) {
-            new_attrs.addAttribute("target-cpu", cpu);
-        }
-
-        if (!features.empty()) {
-            auto old_features = f.getFnAttribute("target-features").getValueAsString();
-
-            if (old_features.empty()) {
-                new_attrs.addAttribute("target-features", features);
-            } else {
-                llvm::SmallString<256> appended(old_features);
-                appended.push_back(',');
-                appended.append(features);
-                new_attrs.addAttribute("target-features", appended);
-            }
-        }
-
-        // Let new_attrs override attrs.
-        f.setAttributes(attrs.addFnAttributes(ctx, new_attrs));
-    }
-
-    // Force usage of AVX512 registers, if requested.
-    if (m_force_avx512 && detail::get_target_features().avx512f) {
-        for (auto &f : module()) {
-            f.addFnAttr("prefer-vector-width", "512");
-        }
-    }
-
-    // NOTE: adapted from here:
-    // https://llvm.org/docs/NewPassManager.html
-
-    // Create the analysis managers.
-    llvm::LoopAnalysisManager LAM;
-    llvm::FunctionAnalysisManager FAM;
-    llvm::CGSCCAnalysisManager CGAM;
-    llvm::ModuleAnalysisManager MAM;
-
-    // NOTE: in the new pass manager, this seems to be the way to
-    // set the target library info bits. See:
-    // https://github.com/llvm/llvm-project/blob/b7fd30eac3183993806cc218b6deb39eb625c083/llvm/tools/opt/NewPMDriver.cpp#L408
-    // Not sure if this matters, but we did it in the old pass manager
-    // and opt does it too.
-    llvm::TargetLibraryInfoImpl TLII(m_jitter->get_target_triple());
-    FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); });
-
-    // Create the new pass manager builder, passing
-    // the native target machine from the JIT class.
-    // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off
-    // by default. Not sure why it is off, the LLVM docs imply this
-    // is on by default at nonzero optimisation levels for clang and opt.
-    // NOTE: the reason for this inconsistency is that opt uses PB.parsePassPipeline()
-    // (instead of PB.buildPerModuleDefaultPipeline()) to set up the optimisation
-    // pipeline. Indeed, if we replace PB.buildPerModuleDefaultPipeline(ol) with
-    // PB.parsePassPipeline(MPM, "default<O3>") (which corresponds to invoking
-    // "opt -passes='default<O3>'"), we do NOT need to set SLP vectorization on
-    // here to get the SLP vectorizer. Not sure if we should consider switching to this
-    // alternative way of setting up the optimisation pipeline in the future.
-    llvm::PipelineTuningOptions pto;
-    pto.SLPVectorization = m_slp_vectorize;
-    llvm::PassBuilder PB(m_jitter->m_tm.get(), pto);
-
-    // Register all the basic analyses with the managers.
-    PB.registerModuleAnalyses(MAM);
-    PB.registerCGSCCAnalyses(CGAM);
-    PB.registerFunctionAnalyses(FAM);
-    PB.registerLoopAnalyses(LAM);
-    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
-
-    // Construct the optimisation level.
-    llvm::OptimizationLevel ol{};
-
-    switch (m_opt_level) {
-        case 1u:
-            ol = llvm::OptimizationLevel::O1;
-            break;
-        case 2u:
-            ol = llvm::OptimizationLevel::O2;
-            break;
-        default:
-            assert(m_opt_level == 3u);
-            ol = llvm::OptimizationLevel::O3;
-    }
-
-    // Create the module pass manager.
-    auto MPM = PB.buildPerModuleDefaultPipeline(ol);
-
-    // Optimize the IR.
-    MPM.run(*m_module, MAM);
+    detail::optimise_module(module(), *m_jitter, m_opt_level, m_force_avx512, m_slp_vectorize);
 }
 
 namespace detail

From 843578f2c63364b5b7283d80420deff9b78a9d97 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Tue, 30 Jul 2024 12:31:31 +0200
Subject: [PATCH 03/41] Some initial code for the multi state support.

---
 include/heyoka/detail/fwd_decl.hpp |  1 +
 include/heyoka/llvm_state.hpp      | 18 +++++++-
 src/llvm_state.cpp                 | 71 +++++++++++++++++++++++++++++-
 3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/include/heyoka/detail/fwd_decl.hpp b/include/heyoka/detail/fwd_decl.hpp
index b723469ce..0bdb24d61 100644
--- a/include/heyoka/detail/fwd_decl.hpp
+++ b/include/heyoka/detail/fwd_decl.hpp
@@ -26,6 +26,7 @@ class HEYOKA_DLL_PUBLIC func;
 class HEYOKA_DLL_PUBLIC param;
 
 class HEYOKA_DLL_PUBLIC llvm_state;
+class HEYOKA_DLL_PUBLIC llvm_multi_state;
 
 template <typename>
 class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive;
diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 1d8fb1ca8..eed6918e7 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -13,13 +13,13 @@
 
 #include <concepts>
 #include <cstdint>
-#include <initializer_list>
 #include <memory>
 #include <ostream>
 #include <string>
 #include <tuple>
 #include <type_traits>
 #include <utility>
+#include <vector>
 
 #include <boost/numeric/conversion/cast.hpp>
 
@@ -111,6 +111,7 @@ HEYOKA_DLL_PUBLIC std::uint32_t recommended_simd_size<mppp::real>();
 class HEYOKA_DLL_PUBLIC llvm_state
 {
     friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_state &);
+    friend class HEYOKA_DLL_PUBLIC llvm_multi_state;
 
     struct jit;
 
@@ -305,6 +306,21 @@ void llvm_state_mem_cache_try_insert(std::string, unsigned, llvm_mc_value);
 
 } // namespace detail
 
+class HEYOKA_DLL_PUBLIC llvm_multi_state
+{
+    struct impl;
+
+    std::unique_ptr<impl> m_impl;
+
+public:
+    explicit llvm_multi_state(std::vector<llvm_state>);
+    llvm_multi_state(const llvm_multi_state &);
+    llvm_multi_state(llvm_multi_state &&) noexcept;
+    llvm_multi_state &operator=(const llvm_multi_state &);
+    llvm_multi_state &operator=(llvm_multi_state &&) noexcept;
+    ~llvm_multi_state();
+};
+
 HEYOKA_END_NAMESPACE
 
 // Archive version changelog:
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 966f0e6fa..9f84254cb 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -14,7 +14,6 @@
 #include <cstddef>
 #include <cstdint>
 #include <fstream>
-#include <initializer_list>
 #include <ios>
 #include <memory>
 #include <mutex>
@@ -29,6 +28,7 @@
 #include <type_traits>
 #include <utility>
 #include <variant>
+#include <vector>
 
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/numeric/conversion/cast.hpp>
@@ -1387,4 +1387,73 @@ std::ostream &operator<<(std::ostream &os, const llvm_state &s)
     return os << oss.str();
 }
 
+struct llvm_multi_state::impl {
+    std::vector<llvm_state> m_states;
+};
+
+llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
+{
+    // All states must be uncompiled.
+    if (std::ranges::any_of(states, &llvm_state::is_compiled)) [[unlikely]] {
+        throw std::invalid_argument("An llvm_multi_state can be constructed only from non-compiled llvm_state objects");
+    }
+
+    // Need at least 2 states.
+    if (states.size() < 2u) [[unlikely]] {
+        throw std::invalid_argument(
+            fmt::format("At least 2 llvm_state objects are needed to construct an llvm_multi_state, but instead the "
+                        "number of llvm_state objects provided on construction is {}",
+                        states.size()));
+    }
+
+    // Settings in all states must be consistent.
+    auto cmp_states = [](const llvm_state &s1, const llvm_state &s2) {
+        if (s1.get_opt_level() != s2.get_opt_level()) {
+            return false;
+        }
+
+        if (s1.fast_math() != s2.fast_math()) {
+            return false;
+        }
+
+        if (s1.force_avx512() != s2.force_avx512()) {
+            return false;
+        }
+
+        if (s1.get_slp_vectorize() != s2.get_slp_vectorize()) {
+            return false;
+        }
+
+        return true;
+    };
+
+    for (decltype(states.size()) i = 1; i < states.size(); ++i) {
+        if (!cmp_states(states[i], states[i - 1u])) [[unlikely]] {
+            throw std::invalid_argument(
+                "Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state");
+        }
+    }
+
+    impl imp{.m_states = std::move(states)};
+
+    m_impl = std::make_unique<impl>(std::move(imp));
+}
+
+llvm_multi_state::llvm_multi_state(const llvm_multi_state &other) : m_impl(std::make_unique<impl>(*other.m_impl)) {}
+
+llvm_multi_state::llvm_multi_state(llvm_multi_state &&) noexcept = default;
+
+llvm_multi_state &llvm_multi_state::operator=(const llvm_multi_state &other)
+{
+    if (this != &other) {
+        *this = llvm_multi_state(other);
+    }
+
+    return *this;
+}
+
+llvm_multi_state &llvm_multi_state::operator=(llvm_multi_state &&) noexcept = default;
+
+llvm_multi_state::~llvm_multi_state() = default;
+
 HEYOKA_END_NAMESPACE

From bfcb7e63af3190c981e3a8484e469e9512ef92a1 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Tue, 30 Jul 2024 21:08:40 +0200
Subject: [PATCH 04/41] Remove unused function.

---
 src/llvm_state.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 1f57e7c60..1c16f152a 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -661,10 +661,6 @@ struct llvm_state::jit {
     {
         return m_tm->getTargetFeatureString().str();
     }
-    [[nodiscard]] llvm::TargetIRAnalysis get_target_ir_analysis() const
-    {
-        return m_tm->getTargetIRAnalysis();
-    }
     [[nodiscard]] const llvm::Triple &get_target_triple() const
     {
         return m_lljit->getTargetTriple();

From 904925982cfb6bade1f1192c9541021491b43912 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 31 Jul 2024 13:49:54 +0200
Subject: [PATCH 05/41] Internal refactors.

---
 src/llvm_state.cpp | 218 ++++++++++++++++++++++++---------------------
 1 file changed, 115 insertions(+), 103 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 1c16f152a..2c92a4c9e 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -70,12 +70,12 @@
 #include <llvm/Passes/PassBuilder.h>
 #include <llvm/Support/Casting.h>
 #include <llvm/Support/CodeGen.h>
+#include <llvm/Support/Error.h>
 #include <llvm/Support/MemoryBuffer.h>
 #include <llvm/Support/SmallVectorMemoryBuffer.h>
 #include <llvm/Support/TargetSelect.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Target/TargetMachine.h>
-#include <llvm/Transforms/Vectorize/LoadStoreVectorizer.h>
 
 #if LLVM_VERSION_MAJOR >= 17
 
@@ -166,12 +166,12 @@ const std::regex ppc_regex_pattern("pwr([1-9]*)");
 target_features get_target_features_impl()
 {
     auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost();
-    if (!jtmb) {
+    if (!jtmb) [[unlikely]] {
         throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system");
     }
 
     auto tm = jtmb->createTargetMachine();
-    if (!tm) {
+    if (!tm) [[unlikely]] {
         throw std::invalid_argument("Error creating the target machine");
     }
 
@@ -266,9 +266,113 @@ void init_native_target()
     });
 }
 
+// Helper to create a builder for target machines.
+llvm::orc::JITTargetMachineBuilder create_jit_tmb(unsigned opt_level, code_model c_model)
+{
+    // NOTE: codegen opt level changed in LLVM 18.
+#if LLVM_VERSION_MAJOR < 18
+
+    using cg_opt_level = llvm::CodeGenOpt::Level;
+
+#else
+
+    using cg_opt_level = llvm::CodeGenOptLevel;
+
+#endif
+
+    // Try creating the target machine builder.
+    auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost();
+    // LCOV_EXCL_START
+    if (!jtmb) [[unlikely]] {
+        throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system");
+    }
+    // LCOV_EXCL_STOP
+
+    // Set the codegen optimisation level.
+    switch (opt_level) {
+        case 0u:
+            jtmb->setCodeGenOptLevel(cg_opt_level::None);
+            break;
+        case 1u:
+            jtmb->setCodeGenOptLevel(cg_opt_level::Less);
+            break;
+        case 2u:
+            jtmb->setCodeGenOptLevel(cg_opt_level::Default);
+            break;
+        default:
+            assert(opt_level == 3u);
+            jtmb->setCodeGenOptLevel(cg_opt_level::Aggressive);
+    }
+
+    // NOTE: not all code models are supported on all archs. We make an effort
+    // here to prevent unsupported code models to be requested, as that will
+    // result in the termination of the program.
+    constexpr code_model supported_code_models[] = {
+#if defined(HEYOKA_ARCH_X86)
+        code_model::small, code_model::kernel, code_model::medium, code_model::large
+#elif defined(HEYOKA_ARCH_ARM)
+        code_model::tiny, code_model::small, code_model::large
+#elif defined(HEYOKA_ARCH_PPC)
+        code_model::small, code_model::medium, code_model::large
+#else
+        // NOTE: by default we assume only small and large are supported.
+        code_model::small, code_model::large
+#endif
+    };
+
+    if (std::ranges::find(supported_code_models, c_model) == std::ranges::end(supported_code_models)) [[unlikely]] {
+        throw std::invalid_argument(
+            fmt::format("The code model '{}' is not supported on the current architecture", c_model));
+    }
+
+    // LCOV_EXCL_START
+
+#if LLVM_VERSION_MAJOR >= 17
+
+    // NOTE: the code model setup is working only on LLVM>=19 (or at least
+    // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug:
+    //
+    // https://github.com/llvm/llvm-project/issues/88115
+    //
+    // Additionally, there are indications from our CI that attempting to set
+    // the code model before LLVM 17 might just be buggy, as we see widespread
+    // ASAN failures all over the place. Thus, let us not do anything with the code
+    // model setting before LLVM 17.
+
+    // Setup the code model.
+    switch (c_model) {
+        case code_model::tiny:
+            jtmb->setCodeModel(llvm::CodeModel::Tiny);
+            break;
+        case code_model::small:
+            jtmb->setCodeModel(llvm::CodeModel::Small);
+            break;
+        case code_model::kernel:
+            jtmb->setCodeModel(llvm::CodeModel::Kernel);
+            break;
+        case code_model::medium:
+            jtmb->setCodeModel(llvm::CodeModel::Medium);
+            break;
+        case code_model::large:
+            jtmb->setCodeModel(llvm::CodeModel::Large);
+            break;
+        default:
+            // NOTE: we should never end up here.
+            assert(false);
+            ;
+    }
+
+#endif
+
+    //  LCOV_EXCL_STOP
+
+    return std::move(*jtmb);
+}
+
 // Helper to optimise the input module M. Implemented here for re-use.
 template <typename Jit>
-void optimise_module(llvm::Module &M, Jit &jit, unsigned opt_level, bool force_avx512, bool slp_vectorize)
+void optimise_module(llvm::Module &M, const Jit &jit, llvm::TargetMachine &tm, unsigned opt_level, bool force_avx512,
+                     bool slp_vectorize)
 {
     // NOTE: don't run any optimisation pass at O0.
     if (opt_level == 0u) {
@@ -337,8 +441,7 @@ void optimise_module(llvm::Module &M, Jit &jit, unsigned opt_level, bool force_a
     llvm::TargetLibraryInfoImpl TLII(jit.get_target_triple());
     FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); });
 
-    // Create the new pass manager builder, passing
-    // the native target machine from the JIT class.
+    // Create the new pass manager builder, passing the supplied target machine.
     // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off
     // by default. Not sure why it is off, the LLVM docs imply this
     // is on by default at nonzero optimisation levels for clang and opt.
@@ -351,7 +454,7 @@ void optimise_module(llvm::Module &M, Jit &jit, unsigned opt_level, bool force_a
     // alternative way of setting up the optimisation pipeline in the future.
     llvm::PipelineTuningOptions pto;
     pto.SLPVectorization = slp_vectorize;
-    llvm::PassBuilder PB(jit.m_tm.get(), pto);
+    llvm::PassBuilder PB(&tm, pto);
 
     // Register all the basic analyses with the managers.
     PB.registerModuleAnalyses(MAM);
@@ -455,107 +558,15 @@ struct llvm_state::jit {
         // Ensure the native target is inited.
         detail::init_native_target();
 
-        // NOTE: codegen opt level changed in LLVM 18.
-#if LLVM_VERSION_MAJOR < 18
-
-        using cg_opt_level = llvm::CodeGenOpt::Level;
-
-#else
-
-        using cg_opt_level = llvm::CodeGenOptLevel;
-
-#endif
-
         // Create the target machine builder.
-        auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost();
-        // LCOV_EXCL_START
-        if (!jtmb) {
-            throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system");
-        }
-        // LCOV_EXCL_STOP
-        // Set the codegen optimisation level.
-        switch (opt_level) {
-            case 0u:
-                jtmb->setCodeGenOptLevel(cg_opt_level::None);
-                break;
-            case 1u:
-                jtmb->setCodeGenOptLevel(cg_opt_level::Less);
-                break;
-            case 2u:
-                jtmb->setCodeGenOptLevel(cg_opt_level::Default);
-                break;
-            default:
-                assert(opt_level == 3u);
-                jtmb->setCodeGenOptLevel(cg_opt_level::Aggressive);
-        }
-
-        // NOTE: not all code models are supported on all archs. We make an effort
-        // here to prevent unsupported code models to be requested, as that will
-        // result in the termination of the program.
-        constexpr code_model supported_code_models[] = {
-#if defined(HEYOKA_ARCH_X86)
-            code_model::small, code_model::kernel, code_model::medium, code_model::large
-#elif defined(HEYOKA_ARCH_ARM)
-            code_model::tiny, code_model::small, code_model::large
-#elif defined(HEYOKA_ARCH_PPC)
-            code_model::small, code_model::medium, code_model::large
-#else
-            // NOTE: by default we assume only small and large are supported.
-            code_model::small, code_model::large
-#endif
-        };
-
-        if (std::ranges::find(supported_code_models, c_model) == std::ranges::end(supported_code_models)) [[unlikely]] {
-            throw std::invalid_argument(
-                fmt::format("The code model '{}' is not supported on the current architecture", c_model));
-        }
-
-        // LCOV_EXCL_START
-
-#if LLVM_VERSION_MAJOR >= 17
-        // NOTE: the code model setup is working only on LLVM>=19 (or at least
-        // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug:
-        //
-        // https://github.com/llvm/llvm-project/issues/88115
-        //
-        // Additionally, there are indications from our CI that attempting to set
-        // the code model before LLVM 17 might just be buggy, as we see widespread
-        // ASAN failures all over the place. Thus, let us not do anything with the code
-        // model setting before LLVM 17.
-
-        // Setup the code model.
-        switch (c_model) {
-            case code_model::tiny:
-                jtmb->setCodeModel(llvm::CodeModel::Tiny);
-                break;
-            case code_model::small:
-                jtmb->setCodeModel(llvm::CodeModel::Small);
-                break;
-            case code_model::kernel:
-                jtmb->setCodeModel(llvm::CodeModel::Kernel);
-                break;
-            case code_model::medium:
-                jtmb->setCodeModel(llvm::CodeModel::Medium);
-                break;
-            case code_model::large:
-                jtmb->setCodeModel(llvm::CodeModel::Large);
-                break;
-            default:
-                // NOTE: we should never end up here.
-                assert(false);
-                ;
-        }
-
-#endif
-
-        //  LCOV_EXCL_STOP
+        auto jtmb = detail::create_jit_tmb(opt_level, c_model);
 
         // Create the jit builder.
         llvm::orc::LLJITBuilder lljit_builder;
         // NOTE: other settable properties may
         // be of interest:
         // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html
-        lljit_builder.setJITTargetMachineBuilder(*jtmb);
+        lljit_builder.setJITTargetMachineBuilder(jtmb);
 
         // Create the jit.
         auto lljit = lljit_builder.create();
@@ -606,7 +617,7 @@ struct llvm_state::jit {
 
         // Keep a target machine around to fetch various
         // properties of the host CPU.
-        auto tm = jtmb->createTargetMachine();
+        auto tm = jtmb.createTargetMachine();
         // LCOV_EXCL_START
         if (!tm) {
             throw std::invalid_argument("Error creating the target machine");
@@ -1215,7 +1226,7 @@ void llvm_state::optimise()
 {
     check_uncompiled(__func__);
 
-    detail::optimise_module(module(), *m_jitter, m_opt_level, m_force_avx512, m_slp_vectorize);
+    detail::optimise_module(module(), *m_jitter, *m_jitter->m_tm, m_opt_level, m_force_avx512, m_slp_vectorize);
 }
 
 namespace detail
@@ -1241,6 +1252,7 @@ void llvm_state::add_obj_trigger()
     auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
     assert(ft != nullptr);
     auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, detail::obj_trigger_name, &module());
+    assert(f != nullptr);
 
     bld.SetInsertPoint(llvm::BasicBlock::Create(context(), "entry", f));
     bld.CreateRetVoid();

From 297cbd337ad45aefc8a6926ee367fd9faa31874c Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 31 Jul 2024 14:44:27 +0200
Subject: [PATCH 06/41] Another cleanup bit: drop check_uncompiled() from
 llvm_state::optimise().

---
 src/llvm_state.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 2c92a4c9e..c93df881a 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1224,8 +1224,6 @@ void llvm_state::verify_function(const std::string &name)
 
 void llvm_state::optimise()
 {
-    check_uncompiled(__func__);
-
     detail::optimise_module(module(), *m_jitter, *m_jitter->m_tm, m_opt_level, m_force_avx512, m_slp_vectorize);
 }
 

From d1bcab591854295c8c0c3c4e274d1b6982f46b42 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 1 Aug 2024 10:16:07 +0200
Subject: [PATCH 07/41] Again: remove the <variant> include.

---
 src/llvm_state.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index c93df881a..be0f522e1 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -28,7 +28,6 @@
 #include <tuple>
 #include <type_traits>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include <boost/algorithm/string/predicate.hpp>

From b2f71efa3b939571c45568470974aafbffc14368 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 1 Aug 2024 12:00:41 +0200
Subject: [PATCH 08/41] First chunk of not-yet-functional llvm_multi_state
 code, pushing for CI checks.

---
 include/heyoka/llvm_state.hpp |   7 +
 src/llvm_state.cpp            | 434 +++++++++++++++++++++++++++++++---
 test/CMakeLists.txt           |   1 +
 test/llvm_multi_state.cpp     |  21 ++
 4 files changed, 433 insertions(+), 30 deletions(-)
 create mode 100644 test/llvm_multi_state.cpp

diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 28b18021e..2bf169838 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -353,6 +353,9 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
 
     std::unique_ptr<impl> m_impl;
 
+    HEYOKA_DLL_LOCAL void compile_impl();
+    HEYOKA_DLL_LOCAL void add_obj_triggers();
+
 public:
     explicit llvm_multi_state(std::vector<llvm_state>);
     llvm_multi_state(const llvm_multi_state &);
@@ -360,6 +363,10 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
     llvm_multi_state &operator=(const llvm_multi_state &);
     llvm_multi_state &operator=(llvm_multi_state &&) noexcept;
     ~llvm_multi_state();
+
+    void compile();
+
+    std::uintptr_t jit_lookup(const std::string &);
 };
 
 HEYOKA_END_NAMESPACE
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index be0f522e1..659711721 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -25,6 +25,7 @@
 #include <stdexcept>
 #include <string>
 #include <system_error>
+#include <thread>
 #include <tuple>
 #include <type_traits>
 #include <utility>
@@ -369,8 +370,10 @@ llvm::orc::JITTargetMachineBuilder create_jit_tmb(unsigned opt_level, code_model
 }
 
 // Helper to optimise the input module M. Implemented here for re-use.
-template <typename Jit>
-void optimise_module(llvm::Module &M, const Jit &jit, llvm::TargetMachine &tm, unsigned opt_level, bool force_avx512,
+// NOTE: this may end up being invoked concurrently from multiple threads.
+// If that is the case, we make sure before invocation to construct a different
+// TargetMachine per thread, so that we are sure no data races are possible.
+void optimise_module(llvm::Module &M, llvm::TargetMachine &tm, unsigned opt_level, bool force_avx512,
                      bool slp_vectorize)
 {
     // NOTE: don't run any optimisation pass at O0.
@@ -385,9 +388,10 @@ void optimise_module(llvm::Module &M, const Jit &jit, llvm::TargetMachine &tm, u
     // For every function in the module, setup its attributes
     // so that the codegen uses all the features available on
     // the host CPU.
-    const auto cpu = jit.get_target_cpu();
-    const auto features = jit.get_target_features();
+    const auto cpu = tm.getTargetCPU().str();
+    const auto features = tm.getTargetFeatureString().str();
 
+    // Fetch the module's context.
     auto &ctx = M.getContext();
 
     for (auto &f : M) {
@@ -437,7 +441,7 @@ void optimise_module(llvm::Module &M, const Jit &jit, llvm::TargetMachine &tm, u
     // https://github.com/llvm/llvm-project/blob/b7fd30eac3183993806cc218b6deb39eb625c083/llvm/tools/opt/NewPMDriver.cpp#L408
     // Not sure if this matters, but we did it in the old pass manager
     // and opt does it too.
-    llvm::TargetLibraryInfoImpl TLII(jit.get_target_triple());
+    llvm::TargetLibraryInfoImpl TLII(tm.getTargetTriple());
     FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); });
 
     // Create the new pass manager builder, passing the supplied target machine.
@@ -548,11 +552,13 @@ struct llvm_state::jit {
     std::unique_ptr<llvm::orc::ThreadSafeContext> m_ctx;
     std::optional<std::string> m_object_file;
 
+    // NOTE: make sure to coordinate changes in this constructor with multi_jit.
     explicit jit(unsigned opt_level, code_model c_model)
     {
-        // NOTE: we assume here the opt level has already been clamped
-        // from the outside.
+        // NOTE: we assume here that the input arguments have
+        // been validated already.
         assert(opt_level <= 3u);
+        assert(c_model >= code_model::tiny && c_model <= code_model::large);
 
         // Ensure the native target is inited.
         detail::init_native_target();
@@ -1223,7 +1229,13 @@ void llvm_state::verify_function(const std::string &name)
 
 void llvm_state::optimise()
 {
-    detail::optimise_module(module(), *m_jitter, *m_jitter->m_tm, m_opt_level, m_force_avx512, m_slp_vectorize);
+    // NOTE: we used to fetch the target triple from the lljit object,
+    // but recently we switched to asking the target triple directly
+    // from the target machine. Assert equality between the two for a while,
+    // just in case.
+    assert(m_jitter->m_lljit->getTargetTriple() == m_jitter->m_tm->getTargetTriple());
+
+    detail::optimise_module(module(), *m_jitter->m_tm, m_opt_level, m_force_avx512, m_slp_vectorize);
 }
 
 namespace detail
@@ -1510,59 +1522,274 @@ std::ostream &operator<<(std::ostream &os, const llvm_state &s)
     return os << oss.str();
 }
 
+namespace detail
+{
+
+namespace
+{
+
+// NOTE: this is a class similar in spirit to llvm_state, but set up for parallel
+// compilation of multiple modules.
+struct multi_jit {
+    // NOTE: enumerate the LLVM members here in the same order
+    // as llvm_state, as this is important to ensure proper
+    // destruction order.
+    std::unique_ptr<llvm::orc::LLJIT> m_lljit;
+    std::unique_ptr<llvm::orc::ThreadSafeContext> m_ctx;
+    std::unique_ptr<llvm::Module> m_module;
+    std::unique_ptr<ir_builder> m_builder;
+
+    explicit multi_jit(unsigned, code_model);
+    multi_jit(const multi_jit &) = delete;
+    multi_jit(multi_jit &&) noexcept = delete;
+    llvm_multi_state &operator=(const multi_jit &) = delete;
+    llvm_multi_state &operator=(multi_jit &&) noexcept = delete;
+    ~multi_jit() = default;
+
+    [[nodiscard]] llvm::LLVMContext &context() const noexcept
+    {
+        return *m_ctx->getContext();
+    }
+};
+
+#if 0
+
+// A task dispatcher class built on top of TBB's task group.
+class tbb_task_dispatcher : public llvm::orc::TaskDispatcher
+{
+    oneapi::tbb::task_group m_tg;
+
+public:
+    void dispatch(std::unique_ptr<llvm::orc::Task> T) override
+    {
+        m_tg.run([T = std::move(T)]() { T->run(); });
+    }
+    void shutdown() override
+    {
+        m_tg.wait();
+    }
+    ~tbb_task_dispatcher() noexcept
+    {
+        m_tg.wait();
+    }
+};
+
+#endif
+
+// NOTE: this largely replicates the logic from the constructors of llvm_state and llvm_state::jit.
+// NOTE: make sure to coordinate changes in this constructor with llvm_state::jit.
+multi_jit::multi_jit(unsigned opt_level, code_model c_model)
+{
+    // NOTE: we assume here that the input arguments have
+    // been validated already.
+    assert(opt_level <= 3u);
+    assert(c_model >= code_model::tiny && c_model <= code_model::large);
+
+    // Ensure the native target is inited.
+    init_native_target();
+
+    // Create the target machine builder.
+    auto jtmb = create_jit_tmb(opt_level, c_model);
+
+    // Create the jit builder.
+    llvm::orc::LLJITBuilder lljit_builder;
+    // NOTE: other settable properties may
+    // be of interest:
+    // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html
+    lljit_builder.setJITTargetMachineBuilder(jtmb);
+
+#if 0
+    // Create a task dispatcher.
+    auto tdisp = std::make_unique<tbb_task_dispatcher>();
+
+    // Create an ExecutorProcessControl.
+    auto epc = llvm::orc::SelfExecutorProcessControl::Create(nullptr, std::move(tdisp));
+    // LCOV_EXCL_START
+    if (!epc) {
+        auto err = epc.takeError();
+
+        std::string err_report;
+        llvm::raw_string_ostream ostr(err_report);
+
+        ostr << err;
+
+        throw std::invalid_argument(
+            fmt::format("Could not create a SelfExecutorProcessControl. The full error message is:\n{}", ostr.str()));
+    }
+    // LCOV_EXCL_STOP
+
+    // Set it in the lljit builder.
+    lljit_builder.setExecutorProcessControl(std::move(*epc));
+#else
+
+    // Set the number of compilation threads.
+    lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency());
+
+#endif
+
+    // Create the jit.
+    auto lljit = lljit_builder.create();
+    // LCOV_EXCL_START
+    if (!lljit) {
+        auto err = lljit.takeError();
+
+        std::string err_report;
+        llvm::raw_string_ostream ostr(err_report);
+
+        ostr << err;
+
+        throw std::invalid_argument(
+            fmt::format("Could not create an LLJIT object. The full error message is:\n{}", ostr.str()));
+    }
+    // LCOV_EXCL_STOP
+    m_lljit = std::move(*lljit);
+
+    // Setup the jit so that it can look up symbols from the current process.
+    auto dlsg
+        = llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(m_lljit->getDataLayout().getGlobalPrefix());
+    // LCOV_EXCL_START
+    if (!dlsg) {
+        throw std::invalid_argument("Could not create the dynamic library search generator");
+    }
+    // LCOV_EXCL_STOP
+    m_lljit->getMainJITDylib().addGenerator(std::move(*dlsg));
+
+    // Create the master context.
+    m_ctx = std::make_unique<llvm::orc::ThreadSafeContext>(std::make_unique<llvm::LLVMContext>());
+
+    // Create the master module.
+    m_module = std::make_unique<llvm::Module>("master", context());
+    // Setup the data layout and the target triple.
+    m_module->setDataLayout(m_lljit->getDataLayout());
+    m_module->setTargetTriple(m_lljit->getTargetTriple().str());
+
+    // Create a new builder for the master module.
+    // NOTE: no need to mess around with fast math flags for this builder.
+    m_builder = std::make_unique<ir_builder>(context());
+}
+
+} // namespace
+
+} // namespace detail
+
 struct llvm_multi_state::impl {
     std::vector<llvm_state> m_states;
+    std::unique_ptr<detail::multi_jit> m_jit;
 };
 
 llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
 {
-    // All states must be uncompiled.
-    if (std::ranges::any_of(states, &llvm_state::is_compiled)) [[unlikely]] {
-        throw std::invalid_argument("An llvm_multi_state can be constructed only from non-compiled llvm_state objects");
+    // We need at least 1 state.
+    if (states.empty()) [[unlikely]] {
+        throw std::invalid_argument("At least 1 llvm_state object is needed to construct an llvm_multi_state");
     }
 
-    // Need at least 2 states.
-    if (states.size() < 2u) [[unlikely]] {
-        throw std::invalid_argument(
-            fmt::format("At least 2 llvm_state objects are needed to construct an llvm_multi_state, but instead the "
-                        "number of llvm_state objects provided on construction is {}",
-                        states.size()));
+    // All states must be uncompiled.
+    if (std::ranges::any_of(states, &llvm_state::is_compiled)) [[unlikely]] {
+        throw std::invalid_argument("An llvm_multi_state can be constructed only from uncompiled llvm_state objects");
     }
 
     // Settings in all states must be consistent.
-    auto cmp_states = [](const llvm_state &s1, const llvm_state &s2) {
+    auto states_differ = [](const llvm_state &s1, const llvm_state &s2) {
         if (s1.get_opt_level() != s2.get_opt_level()) {
-            return false;
+            return true;
         }
 
         if (s1.fast_math() != s2.fast_math()) {
-            return false;
+            return true;
         }
 
         if (s1.force_avx512() != s2.force_avx512()) {
-            return false;
+            return true;
         }
 
         if (s1.get_slp_vectorize() != s2.get_slp_vectorize()) {
-            return false;
+            return true;
+        }
+
+        if (s1.get_code_model() != s2.get_code_model()) {
+            return true;
         }
 
-        return true;
+        return false;
     };
 
-    for (decltype(states.size()) i = 1; i < states.size(); ++i) {
-        if (!cmp_states(states[i], states[i - 1u])) [[unlikely]] {
-            throw std::invalid_argument(
-                "Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state");
-        }
+    if (std::ranges::adjacent_find(states, states_differ) != states.end()) [[unlikely]] {
+        throw std::invalid_argument(
+            "Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state");
+    }
+
+    // Fetch settings from the first state.
+    const auto opt_level = states[0].get_opt_level();
+    const auto c_model = states[0].get_code_model();
+    const auto force_avx512 = states[0].force_avx512();
+    const auto slp_vectorize = states[0].get_slp_vectorize();
+
+    // Rename all states.
+    for (decltype(states.size()) i = 0; i < states.size(); ++i) {
+        const auto new_mname = fmt::format("module_{}", i);
+        states[i].m_module->setModuleIdentifier(new_mname.c_str());
     }
 
-    impl imp{.m_states = std::move(states)};
+    // Create the multi_jit.
+    auto jit = std::make_unique<detail::multi_jit>(opt_level, c_model);
+
+    // In the master jit, setup the machinery to run the optimisation passes on the modules.
+    jit->m_lljit->getIRTransformLayer().setTransform(
+        [&lljit = *jit->m_lljit, opt_level, force_avx512, slp_vectorize,
+         c_model](llvm::orc::ThreadSafeModule TSM, llvm::orc::MaterializationResponsibility &) {
+            // See here for an explanation of what withModuleDo() entails:
+            //
+            // https://groups.google.com/g/llvm-dev/c/QauU4L_bHac
+            //
+            // In our case, the locking/thread safety aspect is not important as we are not sharing
+            // contexts between threads.
+            TSM.withModuleDo([&lljit, opt_level, force_avx512, slp_vectorize, c_model](llvm::Module &M) {
+                // NOTE: don't run any optimisation on the master module.
+                if (M.getModuleIdentifier() == "master") {
+                    return;
+                }
+
+                // NOTE: running the optimisation passes requires mutable access to a target
+                // machine. Thus, we create a new target machine per thread in order to avoid likely data races
+                // with a shared target machine.
+
+                // Fetch a target machine builder.
+                auto jtmb = detail::create_jit_tmb(opt_level, c_model);
 
+                // Try creating the target machine.
+                auto tm = jtmb.createTargetMachine();
+                // LCOV_EXCL_START
+                if (!tm) [[unlikely]] {
+                    throw std::invalid_argument("Error creating the target machine");
+                }
+                // LCOV_EXCL_STOP
+
+                // NOTE: we used to fetch the target triple from the lljit object,
+                // but recently we switched to asking the target triple directly
+                // from the target machine. Assert equality between the two for a while,
+                // just in case.
+                // NOTE: lljit.getTargetTriple() just returns a const ref to an internal
+                // object, it should be ok with concurrent invocation.
+                static_cast<void>(lljit);
+                assert(lljit.getTargetTriple() == (*tm)->getTargetTriple());
+
+                // Optimise the module.
+                detail::optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize);
+            });
+
+            return llvm::Expected<llvm::orc::ThreadSafeModule>(std::move(TSM));
+        });
+
+    // Build and assign the implementation.
+    impl imp{.m_states = std::move(states), .m_jit = std::move(jit)};
     m_impl = std::make_unique<impl>(std::move(imp));
 }
 
-llvm_multi_state::llvm_multi_state(const llvm_multi_state &other) : m_impl(std::make_unique<impl>(*other.m_impl)) {}
+llvm_multi_state::llvm_multi_state(const llvm_multi_state &)
+{
+    // TODO implement.
+}
 
 llvm_multi_state::llvm_multi_state(llvm_multi_state &&) noexcept = default;
 
@@ -1579,4 +1806,151 @@ llvm_multi_state &llvm_multi_state::operator=(llvm_multi_state &&) noexcept = de
 
 llvm_multi_state::~llvm_multi_state() = default;
 
+void llvm_multi_state::add_obj_triggers()
+{
+    // NOTE: the idea here is that we add one trigger function per module, and then
+    // we invoke all the trigger functions from a trigger function in the master module.
+    // Like this, we should ensure materialisation of all modules when we lookup the
+    // master trigger.
+
+    // Implement the per-module triggers.
+    for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) {
+        // Fetch builder/module/context for the current state.
+        auto &bld = m_impl->m_states[i].builder();
+        auto &md = m_impl->m_states[i].module();
+        auto &ctx = m_impl->m_states[i].context();
+
+        // The function name.
+        const auto fname = fmt::format("{}_{}", detail::obj_trigger_name, i);
+
+        auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
+        assert(ft != nullptr);
+        auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, fname.c_str(), &md);
+        assert(f != nullptr);
+
+        bld.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", f));
+        bld.CreateRetVoid();
+    }
+
+    // Fetch the master builder/module/context.
+    auto &bld = *m_impl->m_jit->m_builder;
+    auto &md = *m_impl->m_jit->m_module;
+    auto &ctx = m_impl->m_jit->context();
+
+    // Add the prototypes of all per-module trigger functions to the master module.
+    std::vector<llvm::Function *> callees;
+    callees.reserve(m_impl->m_states.size());
+    for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) {
+        // The function name.
+        const auto fname = fmt::format("{}_{}", detail::obj_trigger_name, i);
+
+        auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
+        assert(ft != nullptr);
+        auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, fname.c_str(), &md);
+        assert(f != nullptr);
+
+        callees.push_back(f);
+    }
+
+    // Create the master trigger function.
+    auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
+    assert(ft != nullptr);
+    auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, detail::obj_trigger_name, &md);
+    assert(f != nullptr);
+
+    bld.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", f));
+
+    // Invoke all the triggers.
+    for (auto *tf : callees) {
+        bld.CreateCall(tf, {});
+    }
+
+    // Return.
+    bld.CreateRetVoid();
+}
+
+void llvm_multi_state::compile_impl()
+{
+    for (auto &s : m_impl->m_states) {
+        // Add the module to the jit (this will clear out m_module).
+        auto err = m_impl->m_jit->m_lljit->addIRModule(
+            llvm::orc::ThreadSafeModule(std::move(s.m_module), *s.m_jitter->m_ctx));
+
+        // LCOV_EXCL_START
+        if (err) {
+            std::string err_report;
+            llvm::raw_string_ostream ostr(err_report);
+
+            ostr << err;
+
+            throw std::invalid_argument(fmt::format(
+                "The function for adding a module to the jit failed. The full error message:\n{}", ostr.str()));
+        }
+        // LCOV_EXCL_STOP
+
+        // Clear out the builder, which won't be usable any more.
+        s.m_builder.reset();
+    }
+
+    // TODO error check.
+    auto err = m_impl->m_jit->m_lljit->addIRModule(
+        llvm::orc::ThreadSafeModule(std::move(m_impl->m_jit->m_module), *m_impl->m_jit->m_ctx));
+
+    // Trigger object code materialisation via lookup.
+    jit_lookup(detail::obj_trigger_name);
+
+    // TODO restore?
+    // assert(m_jitter->m_object_file);
+}
+
+void llvm_multi_state::compile()
+{
+    // TODO check uncompiled
+
+    // Log runtime in trace mode.
+    spdlog::stopwatch sw;
+
+    auto *logger = detail::get_logger();
+
+    // Verify the modules before compiling.
+    // NOTE: probably this can be parallelised if needed.
+    for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) {
+        std::string out;
+        llvm::raw_string_ostream ostr(out);
+
+        if (llvm::verifyModule(*m_impl->m_states[i].m_module, &ostr)) [[unlikely]] {
+            // LCOV_EXCL_START
+            throw std::runtime_error(
+                fmt::format("The verification of the module at index {} in an llvm_multi_state produced an error:\n{}",
+                            i, ostr.str()));
+            // LCOV_EXCL_STOP
+        }
+    }
+
+    logger->trace("llvm_multi_state module verification runtime: {}", sw);
+
+    // Add the object materialisation trigger functions.
+    add_obj_triggers();
+
+    // Run the compilation.
+    compile_impl();
+}
+
+std::uintptr_t llvm_multi_state::jit_lookup(const std::string &name)
+{
+    // TODO restore.
+    // check_compiled(__func__);
+
+    auto sym = m_impl->m_jit->m_lljit->lookup(name);
+    if (!sym) {
+        throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in the compiled module", name));
+    }
+
+#if LLVM_VERSION_MAJOR >= 15
+    return static_cast<std::uintptr_t>((*sym).getValue());
+#else
+    return static_cast<std::uintptr_t>((*sym).getAddress());
+#endif
+}
+
 HEYOKA_END_NAMESPACE
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 09e360cab..f6c4ccd29 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -159,6 +159,7 @@ ADD_HEYOKA_TESTCASE(cfunc)
 ADD_HEYOKA_TESTCASE(cfunc_multieval)
 ADD_HEYOKA_TESTCASE(dfun)
 ADD_HEYOKA_TESTCASE(var_ode_sys)
+ADD_HEYOKA_TESTCASE(llvm_multi_state)
 
 if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR)
   ADD_HEYOKA_TESTCASE(event_detection_mp)
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
new file mode 100644
index 000000000..a84ff1783
--- /dev/null
+++ b/test/llvm_multi_state.cpp
@@ -0,0 +1,21 @@
+// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <heyoka/llvm_state.hpp>
+
+#include "catch.hpp"
+
+using namespace heyoka;
+
+TEST_CASE("basic")
+{
+#if 0
+    llvm_multi_state ms({llvm_state{}, llvm_state{}, llvm_state{}, llvm_state{}, llvm_state{}, llvm_state{}});
+    ms.compile();
+#endif
+}

From 2f29bdbe281e2f81452a17d65d8ddc661d121109 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 1 Aug 2024 14:48:49 +0200
Subject: [PATCH 09/41] Allow the LLVM cache machinery to cache multiple
 modules at a time.

---
 include/heyoka/llvm_state.hpp |  9 ++--
 src/llvm_state.cpp            | 26 +++++++----
 src/llvm_state_mem_cache.cpp  | 82 +++++++++++++++++++++++++----------
 3 files changed, 81 insertions(+), 36 deletions(-)

diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 2bf169838..6cf3dae84 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -12,6 +12,7 @@
 #include <heyoka/config.hpp>
 
 #include <concepts>
+#include <cstddef>
 #include <cstdint>
 #include <memory>
 #include <ostream>
@@ -338,12 +339,14 @@ namespace detail
 
 // The value contained in the in-memory cache.
 struct llvm_mc_value {
-    std::string opt_bc, opt_ir, obj;
+    std::vector<std::string> opt_bc, opt_ir, obj;
+
+    std::size_t total_size() const;
 };
 
 // Cache lookup and insertion.
-std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::string &, unsigned);
-void llvm_state_mem_cache_try_insert(std::string, unsigned, llvm_mc_value);
+std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::vector<std::string> &, unsigned);
+void llvm_state_mem_cache_try_insert(std::vector<std::string>, unsigned, llvm_mc_value);
 
 } // namespace detail
 
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 659711721..f096fd0c7 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1325,11 +1325,15 @@ void llvm_state::compile()
     // to fix the module and re-attempt compilation without having
     // altered the module and without having already added the trigger
     // function.
+    // NOTE: this function does its own cleanup, no need to
+    // start the try catch block yet.
     add_obj_trigger();
 
     try {
         // Fetch the bitcode *before* optimisation.
         auto orig_bc = get_bc();
+        std::vector<std::string> obc;
+        obc.push_back(std::move(orig_bc));
 
         // Combine m_opt_level, m_force_avx512, m_slp_vectorize and m_c_model into a single value,
         // as they all affect codegen.
@@ -1341,22 +1345,26 @@ void llvm_state::compile()
         assert(m_opt_level <= 3u);
         assert(static_cast<unsigned>(m_c_model) <= 7u);
         static_assert(std::numeric_limits<unsigned>::digits >= 7u);
-        const auto olevel = m_opt_level + (static_cast<unsigned>(m_force_avx512) << 2)
-                            + (static_cast<unsigned>(m_slp_vectorize) << 3) + (static_cast<unsigned>(m_c_model) << 4);
+        const auto comp_flag = m_opt_level + (static_cast<unsigned>(m_force_avx512) << 2)
+                               + (static_cast<unsigned>(m_slp_vectorize) << 3)
+                               + (static_cast<unsigned>(m_c_model) << 4);
 
-        if (auto cached_data = detail::llvm_state_mem_cache_lookup(orig_bc, olevel)) {
+        if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) {
             // Cache hit.
 
             // Assign the snapshots.
-            m_ir_snapshot = std::move(cached_data->opt_ir);
-            m_bc_snapshot = std::move(cached_data->opt_bc);
+            assert(cached_data->opt_ir.size() == 1u);
+            assert(cached_data->opt_bc.size() == 1u);
+            assert(cached_data->obj.size() == 1u);
+            m_ir_snapshot = std::move(cached_data->opt_ir[0]);
+            m_bc_snapshot = std::move(cached_data->opt_bc[0]);
 
             // Clear out module and builder.
             m_module.reset();
             m_builder.reset();
 
             // Assign the object file.
-            detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj));
+            detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj[0]));
         } else {
             sw.reset();
 
@@ -1372,10 +1380,10 @@ void llvm_state::compile()
 
             logger->trace("materialisation runtime: {}", sw);
 
-            // Try to insert orig_bc into the cache.
-            detail::llvm_state_mem_cache_try_insert(std::move(orig_bc), olevel,
+            // Try to insert obc into the cache.
+            detail::llvm_state_mem_cache_try_insert(std::move(obc), comp_flag,
                                                     // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
-                                                    {m_bc_snapshot, m_ir_snapshot, *m_jitter->m_object_file});
+                                                    {{m_bc_snapshot}, {m_ir_snapshot}, {*m_jitter->m_object_file}});
         }
         // LCOV_EXCL_START
     } catch (...) {
diff --git a/src/llvm_state_mem_cache.cpp b/src/llvm_state_mem_cache.cpp
index e3135ae25..88ba8ea0f 100644
--- a/src/llvm_state_mem_cache.cpp
+++ b/src/llvm_state_mem_cache.cpp
@@ -17,7 +17,7 @@
 #include <optional>
 #include <string>
 #include <utility>
-#include <variant>
+#include <vector>
 
 #include <boost/container_hash/hash.hpp>
 #include <boost/numeric/conversion/cast.hpp>
@@ -28,7 +28,8 @@
 #include <heyoka/llvm_state.hpp>
 
 // This in-memory cache maps the bitcode
-// of an LLVM module and an optimisation level to:
+// of one or more LLVM modules and an integer flag
+// (representing several compilation settings) to:
 //
 // - the optimised version of the bitcode,
 // - the textual IR corresponding
@@ -43,6 +44,26 @@ HEYOKA_BEGIN_NAMESPACE
 namespace detail
 {
 
+// Helper to compute the total size in bytes
+// of the data contained in an llvm_mc_value.
+// Will throw on overflow.
+std::size_t llvm_mc_value::total_size() const
+{
+    assert(!opt_bc.empty());
+    assert(opt_bc.size() == opt_ir.size());
+    assert(opt_bc.size() == obj.size());
+
+    boost::safe_numerics::safe<std::size_t> ret = 0;
+
+    for (decltype(opt_bc.size()) i = 0; i < opt_bc.size(); ++i) {
+        ret += opt_bc[i].size();
+        ret += opt_ir[i].size();
+        ret += obj[i].size();
+    }
+
+    return ret;
+}
+
 namespace
 {
 
@@ -56,16 +77,33 @@ HEYOKA_CONSTINIT
 std::mutex mem_cache_mutex;
 
 // Definition of the data structures for the cache.
-using lru_queue_t = std::list<std::pair<std::string, unsigned>>;
+using lru_queue_t = std::list<std::pair<std::vector<std::string>, unsigned>>;
 
 using lru_key_t = lru_queue_t::iterator;
 
+// Implementation of hashing for std::pair<std::vector<std::string>, unsigned> and
+// its heterogeneous counterpart.
+template <typename T>
+auto cache_key_hasher(const T &k) noexcept
+{
+    assert(!k.first.empty());
+
+    // Combine the bitcodes.
+    auto seed = std::hash<std::string>{}(k.first[0]);
+    for (decltype(k.first.size()) i = 1; i < k.first.size(); ++i) {
+        boost::hash_combine(seed, k.first[i]);
+    }
+
+    // Combine with the compilation flag.
+    boost::hash_combine(seed, static_cast<std::size_t>(k.second));
+
+    return seed;
+}
+
 struct lru_hasher {
     std::size_t operator()(const lru_key_t &k) const noexcept
     {
-        auto seed = std::hash<std::string>{}(k->first);
-        boost::hash_combine(seed, k->second);
-        return seed;
+        return cache_key_hasher(*k);
     }
 };
 
@@ -96,16 +134,16 @@ HEYOKA_CONSTINIT std::uint64_t mem_cache_limit = 2147483648ull;
 
 // Machinery for heterogeneous lookup into the cache.
 // NOTE: this function MUST be invoked while holding the global lock.
-auto llvm_state_mem_cache_hl(const std::string &bc, unsigned opt_level)
+auto llvm_state_mem_cache_hl(const std::vector<std::string> &bc, unsigned comp_flag)
 {
-    using compat_key_t = std::pair<const std::string &, unsigned>;
+    // NOTE: the heterogeneous version of the key replaces std::vector<std::string>
+    // with a const reference.
+    using compat_key_t = std::pair<const std::vector<std::string> &, unsigned>;
 
     struct compat_hasher {
         std::size_t operator()(const compat_key_t &k) const noexcept
         {
-            auto seed = std::hash<std::string>{}(k.first);
-            boost::hash_combine(seed, k.second);
-            return seed;
+            return cache_key_hasher(k);
         }
     };
 
@@ -120,7 +158,7 @@ auto llvm_state_mem_cache_hl(const std::string &bc, unsigned opt_level)
         }
     };
 
-    return lru_map.find(std::make_pair(std::cref(bc), opt_level), compat_hasher{}, compat_cmp{});
+    return lru_map.find(std::make_pair(std::cref(bc), comp_flag), compat_hasher{}, compat_cmp{});
 }
 
 // Debug function to run sanity checks on the cache.
@@ -131,15 +169,13 @@ void llvm_state_mem_cache_sanity_checks()
 
     // Check that the computed size of the cache is consistent with mem_cache_size.
     assert(std::accumulate(lru_map.begin(), lru_map.end(), boost::safe_numerics::safe<std::size_t>(0),
-                           [](const auto &a, const auto &p) {
-                               return a + p.second.opt_bc.size() + p.second.opt_ir.size() + p.second.obj.size();
-                           })
+                           [](const auto &a, const auto &p) { return a + p.second.total_size(); })
            == mem_cache_size);
 }
 
 } // namespace
 
-std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::string &bc, unsigned opt_level)
+std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::vector<std::string> &bc, unsigned comp_flag)
 {
     // Lock down.
     const std::lock_guard lock(mem_cache_mutex);
@@ -147,7 +183,7 @@ std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::string &bc,
     // Sanity checks.
     llvm_state_mem_cache_sanity_checks();
 
-    if (const auto it = llvm_state_mem_cache_hl(bc, opt_level); it == lru_map.end()) {
+    if (const auto it = llvm_state_mem_cache_hl(bc, comp_flag); it == lru_map.end()) {
         // Cache miss.
         return {};
     } else {
@@ -163,7 +199,7 @@ std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::string &bc,
     }
 }
 
-void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc_value val)
+void llvm_state_mem_cache_try_insert(std::vector<std::string> bc, unsigned comp_flag, llvm_mc_value val)
 {
     // Lock down.
     const std::lock_guard lock(mem_cache_mutex);
@@ -174,7 +210,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc
     // Do a first lookup to check if bc is already in the cache.
     // This could happen, e.g., if two threads are compiling the same
     // code concurrently.
-    if (const auto it = llvm_state_mem_cache_hl(bc, opt_level); it != lru_map.end()) {
+    if (const auto it = llvm_state_mem_cache_hl(bc, comp_flag); it != lru_map.end()) {
         assert(val.opt_bc == it->second.opt_bc);
         assert(val.opt_ir == it->second.opt_ir);
         assert(val.obj == it->second.obj);
@@ -183,8 +219,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc
     }
 
     // Compute the new cache size.
-    auto new_cache_size = static_cast<std::size_t>(boost::safe_numerics::safe<std::size_t>(mem_cache_size)
-                                                   + val.opt_bc.size() + val.opt_ir.size() + val.obj.size());
+    auto new_cache_size = boost::safe_numerics::safe<std::size_t>(mem_cache_size) + val.total_size();
 
     // Remove items from the cache if we are exceeding
     // the limit.
@@ -195,8 +230,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc
         const auto &cur_val = cur_it->second;
         // NOTE: no possibility of overflow here, as cur_size is guaranteed
         // not to be greater than mem_cache_size.
-        const auto cur_size
-            = static_cast<std::size_t>(cur_val.opt_bc.size()) + cur_val.opt_ir.size() + cur_val.obj.size();
+        const auto cur_size = cur_val.total_size();
 
         // NOTE: the next 4 lines cannot throw, which ensures that the
         // cache cannot be left in an inconsistent state.
@@ -222,7 +256,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc
     // Add the new item to the front of the queue.
     // NOTE: if this throws, we have not modified lru_map yet,
     // no cleanup needed.
-    lru_queue.emplace_front(std::move(bc), opt_level);
+    lru_queue.emplace_front(std::move(bc), comp_flag);
 
     // Add the new item to the map.
     try {

From e3a44d468656c423d1c0f50a9212c604e2acb85c Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 1 Aug 2024 14:50:05 +0200
Subject: [PATCH 10/41] Remove the LLVM 13 build from workflows/gha_ci.yml

---
 .github/workflows/gha_ci.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/gha_ci.yml b/.github/workflows/gha_ci.yml
index 81cba6d3e..6a61654ab 100644
--- a/.github/workflows/gha_ci.yml
+++ b/.github/workflows/gha_ci.yml
@@ -104,9 +104,3 @@ jobs:
       - uses: actions/checkout@v4
       - name: Build
         run: bash tools/gha_llvm14_conda_asan.sh
-  conda_llvm13_asan:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: Build
-        run: bash tools/gha_llvm13_conda_asan.sh

From 164bd189facee2d01b33325f96a05f946e6af0eb Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 1 Aug 2024 20:01:50 +0200
Subject: [PATCH 11/41] Initial work on object files. [skip ci]

---
 src/llvm_state.cpp | 46 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index f096fd0c7..96420ba2d 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -33,6 +33,7 @@
 
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/numeric/conversion/cast.hpp>
+#include <boost/safe_numerics/safe_integer.hpp>
 
 #include <fmt/format.h>
 
@@ -1539,6 +1540,9 @@ namespace
 // NOTE: this is a class similar in spirit to llvm_state, but set up for parallel
 // compilation of multiple modules.
 struct multi_jit {
+    // NOTE: this is the total number of modules, including
+    // the master module.
+    const unsigned m_n_modules = 0;
     // NOTE: enumerate the LLVM members here in the same order
     // as llvm_state, as this is important to ensure proper
     // destruction order.
@@ -1546,8 +1550,11 @@ struct multi_jit {
     std::unique_ptr<llvm::orc::ThreadSafeContext> m_ctx;
     std::unique_ptr<llvm::Module> m_module;
     std::unique_ptr<ir_builder> m_builder;
+    // Object files.
+    std::mutex m_object_files_mutex;
+    std::vector<std::string> m_object_files;
 
-    explicit multi_jit(unsigned, code_model);
+    explicit multi_jit(unsigned, unsigned, code_model);
     multi_jit(const multi_jit &) = delete;
     multi_jit(multi_jit &&) noexcept = delete;
     llvm_multi_state &operator=(const multi_jit &) = delete;
@@ -1586,8 +1593,10 @@ class tbb_task_dispatcher : public llvm::orc::TaskDispatcher
 
 // NOTE: this largely replicates the logic from the constructors of llvm_state and llvm_state::jit.
 // NOTE: make sure to coordinate changes in this constructor with llvm_state::jit.
-multi_jit::multi_jit(unsigned opt_level, code_model c_model)
+multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model) : m_n_modules(n_modules)
 {
+    assert(n_modules >= 2u);
+
     // NOTE: we assume here that the input arguments have
     // been validated already.
     assert(opt_level <= 3u);
@@ -1652,6 +1661,36 @@ multi_jit::multi_jit(unsigned opt_level, code_model c_model)
     // LCOV_EXCL_STOP
     m_lljit = std::move(*lljit);
 
+    // Setup the machinery to store the modules' binary code
+    // when it is generated.
+    m_lljit->getObjTransformLayer().setTransform([this](std::unique_ptr<llvm::MemoryBuffer> obj_buffer) {
+        assert(obj_buffer);
+
+        // Lock down for access to m_object_files.
+        std::lock_guard lock{m_object_files_mutex};
+
+        assert(m_object_files.size() <= m_n_modules);
+
+        // NOTE: this callback will be invoked the first time a jit lookup is performed,
+        // even if the object code was manually injected. In such a case, m_object_files
+        // has already been set up properly and we just sanity check in debug mode that
+        // one object file matches the content of obj_buffer.
+        if (m_object_files.size() < m_n_modules) {
+            // Add obj_buffer.
+            m_object_files.push_back(std::string(obj_buffer->getBufferStart(), obj_buffer->getBufferEnd()));
+        } else {
+            // Check that at least one buffer in m_object_files is exactly
+            // identical to obj_buffer.
+            assert(std::ranges::any_of(m_object_files, [&obj_buffer](const auto &cur) {
+                return obj_buffer->getBufferSize() == cur.size()
+                       && std::equal(obj_buffer->getBufferStart(), obj_buffer->getBufferEnd(), cur.begin());
+                ;
+            }));
+        }
+
+        return llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>(std::move(obj_buffer));
+    });
+
     // Setup the jit so that it can look up symbols from the current process.
     auto dlsg
         = llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(m_lljit->getDataLayout().getGlobalPrefix());
@@ -1740,7 +1779,8 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
     }
 
     // Create the multi_jit.
-    auto jit = std::make_unique<detail::multi_jit>(opt_level, c_model);
+    auto jit = std::make_unique<detail::multi_jit>(boost::safe_numerics::safe<unsigned>(states.size()) + 1, opt_level,
+                                                   c_model);
 
     // In the master jit, setup the machinery to run the optimisation passes on the modules.
     jit->m_lljit->getIRTransformLayer().setTransform(

From e9e127cc4bca76d5ab359029d3ba7f6d21ec5c8e Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 2 Aug 2024 18:20:02 +0200
Subject: [PATCH 12/41] Avoid pointless copy of the object file.

---
 src/llvm_state.cpp | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 96420ba2d..89c55ea72 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -38,7 +38,6 @@
 #include <fmt/format.h>
 
 #include <llvm/ADT/SmallString.h>
-#include <llvm/ADT/SmallVector.h>
 #include <llvm/Analysis/CGSCCPassManager.h>
 #include <llvm/Analysis/LoopAnalysisManager.h>
 #include <llvm/Analysis/TargetLibraryInfo.h>
@@ -73,7 +72,6 @@
 #include <llvm/Support/CodeGen.h>
 #include <llvm/Support/Error.h>
 #include <llvm/Support/MemoryBuffer.h>
-#include <llvm/Support/SmallVectorMemoryBuffer.h>
 #include <llvm/Support/TargetSelect.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Target/TargetMachine.h>
@@ -733,12 +731,43 @@ namespace detail
 namespace
 {
 
+// An implementation of llvm::MemoryBuffer offering a view over a std::string.
+class string_view_mem_buffer final : public llvm::MemoryBuffer
+{
+public:
+    explicit string_view_mem_buffer(const std::string &s)
+    {
+        // NOTE: the important bit here is from the LLVM docs:
+        //
+        // """
+        // In addition to basic access to the characters in the file, this interface
+        // guarantees you can read one character past the end of the file, and that
+        // this character will read as '\0'.
+        // """
+        //
+        // This is exactly the guarantee given by std::string:
+        //
+        // https://en.cppreference.com/w/cpp/string/basic_string/data
+        //
+        // Not sure about the third parameter to this function though, it does not
+        // seem to have any influence apart from debug checking:
+        //
+        // https://llvm.org/doxygen/MemoryBuffer_8cpp_source.html
+        this->init(s.data(), s.data() + s.size(), true);
+    }
+    llvm::MemoryBuffer::BufferKind getBufferKind() const final
+    {
+        // Hopefully std::string is not memory-mapped...
+        return llvm::MemoryBuffer::BufferKind::MemoryBuffer_Malloc;
+    }
+};
+
 // Helper to load object code into a jit.
 template <typename Jit>
 void llvm_state_add_obj_to_jit(Jit &j, std::string obj)
 {
-    llvm::SmallVector<char, 0> buffer(obj.begin(), obj.end());
-    auto err = j.m_lljit->addObjectFile(std::make_unique<llvm::SmallVectorMemoryBuffer>(std::move(buffer)));
+    // Add the object file to the jit.
+    auto err = j.m_lljit->addObjectFile(std::make_unique<string_view_mem_buffer>(obj));
 
     // LCOV_EXCL_START
     if (err) {

From f147ae8d559135936a2ec776dfcb7cb2260d1bb4 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 2 Aug 2024 18:38:39 +0200
Subject: [PATCH 13/41] Don't run debug checks when destroying an llvm_state.

---
 src/llvm_state.cpp | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 89c55ea72..275fda902 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -954,23 +954,12 @@ llvm_state &llvm_state::operator=(llvm_state &&other) noexcept
     return *this;
 }
 
-llvm_state::~llvm_state()
-{
-    // Sanity checks in debug mode.
-    if (m_jitter) {
-        if (is_compiled()) {
-            assert(m_jitter->m_object_file);
-            assert(!m_builder);
-        } else {
-            assert(!m_jitter->m_object_file);
-            assert(m_builder);
-            assert(m_ir_snapshot.empty());
-            assert(m_bc_snapshot.empty());
-        }
-    }
-
-    assert(m_opt_level <= 3u);
-}
+// NOTE: we used to have debug sanity checks here. However, in certain rare corner cases,
+// an invalid llvm_state could end up being destroyed, thus triggering assertion errors
+// in debug mode (this could happen for instance when resetting an llvm_state to the
+// def-cted state after an exception had been thrown during compilation). Thus, just
+// do not run the debug checks.
+llvm_state::~llvm_state() = default;
 
 template <typename Archive>
 void llvm_state::save_impl(Archive &ar, unsigned) const

From 16b27d86dfbbc6fc31e501c6d24035f4c6cd67f3 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 2 Aug 2024 18:41:38 +0200
Subject: [PATCH 14/41] Internal refactor.

---
 src/llvm_state.cpp | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 275fda902..362532dad 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1311,6 +1311,34 @@ void llvm_state::compile_impl()
     assert(m_jitter->m_object_file);
 }
 
+namespace detail
+{
+
+namespace
+{
+
+// Combine opt_level, force_avx512, slp_vectorize and c_model into a single flag.
+// NOTE: here we need:
+//
+// - 2 bits for opt_level,
+// - 1 bit for force_avx512 and slp_vectorize each,
+// - 3 bits for c_model,
+//
+// for a total of 7 bits.
+unsigned assemble_comp_flag(unsigned opt_level, bool force_avx512, bool slp_vectorize, code_model c_model)
+{
+    assert(opt_level <= 3u);
+    assert(static_cast<unsigned>(c_model) <= 7u);
+    static_assert(std::numeric_limits<unsigned>::digits >= 7u);
+
+    return opt_level + (static_cast<unsigned>(force_avx512) << 2) + (static_cast<unsigned>(slp_vectorize) << 3)
+           + (static_cast<unsigned>(c_model) << 4);
+}
+
+} // namespace
+
+} // namespace detail
+
 // NOTE: we need to emphasise in the docs that compilation
 // triggers an optimisation pass.
 void llvm_state::compile()
@@ -1354,19 +1382,8 @@ void llvm_state::compile()
         std::vector<std::string> obc;
         obc.push_back(std::move(orig_bc));
 
-        // Combine m_opt_level, m_force_avx512, m_slp_vectorize and m_c_model into a single value,
-        // as they all affect codegen.
-        // NOTE: here we need:
-        // - 2 bits for m_opt_level,
-        // - 1 bit for m_force_avx512 and m_slp_vectorize each,
-        // - 3 bits for m_c_model,
-        // for a total of 7 bits.
-        assert(m_opt_level <= 3u);
-        assert(static_cast<unsigned>(m_c_model) <= 7u);
-        static_assert(std::numeric_limits<unsigned>::digits >= 7u);
-        const auto comp_flag = m_opt_level + (static_cast<unsigned>(m_force_avx512) << 2)
-                               + (static_cast<unsigned>(m_slp_vectorize) << 3)
-                               + (static_cast<unsigned>(m_c_model) << 4);
+        // Assemble the compilation flag.
+        const auto comp_flag = detail::assemble_comp_flag(m_opt_level, m_force_avx512, m_slp_vectorize, m_c_model);
 
         if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) {
             // Cache hit.

From c5048e192cd582678384577063bd7859ddc6cf9f Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 2 Aug 2024 19:41:29 +0200
Subject: [PATCH 15/41] Another internal refactor bit.

---
 src/llvm_state.cpp | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 362532dad..df07f971d 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -487,6 +487,24 @@ void optimise_module(llvm::Module &M, llvm::TargetMachine &tm, unsigned opt_leve
     MPM.run(M, MAM);
 }
 
+// Helper to add a module to an lljit, throwing on error.
+void add_module_to_lljit(llvm::orc::LLJIT &lljit, std::unique_ptr<llvm::Module> m, llvm::orc::ThreadSafeContext ctx)
+{
+    auto err = lljit.addIRModule(llvm::orc::ThreadSafeModule(std::move(m), std::move(ctx)));
+
+    // LCOV_EXCL_START
+    if (err) {
+        std::string err_report;
+        llvm::raw_string_ostream ostr(err_report);
+
+        ostr << err;
+
+        throw std::invalid_argument(
+            fmt::format("The function for adding a module to the jit failed. The full error message:\n{}", ostr.str()));
+    }
+    // LCOV_EXCL_STOP
+}
+
 } // namespace
 
 // Helper function to fetch a const ref to a global object
@@ -683,19 +701,7 @@ struct llvm_state::jit {
 
     void add_module(std::unique_ptr<llvm::Module> m) const
     {
-        auto err = m_lljit->addIRModule(llvm::orc::ThreadSafeModule(std::move(m), *m_ctx));
-
-        // LCOV_EXCL_START
-        if (err) {
-            std::string err_report;
-            llvm::raw_string_ostream ostr(err_report);
-
-            ostr << err;
-
-            throw std::invalid_argument(fmt::format(
-                "The function for adding a module to the jit failed. The full error message:\n{}", ostr.str()));
-        }
-        // LCOV_EXCL_STOP
+        detail::add_module_to_lljit(*m_lljit, std::move(m), *m_ctx);
     }
 
     // Symbol lookup.
@@ -1308,6 +1314,7 @@ void llvm_state::compile_impl()
 
     // Trigger object code materialisation via lookup.
     jit_lookup(detail::obj_trigger_name);
+
     assert(m_jitter->m_object_file);
 }
 

From 621320492712612c1aedd7fff99cb8013156ffa7 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 2 Aug 2024 19:49:59 +0200
Subject: [PATCH 16/41] Another bit of internal refactor.

---
 src/llvm_state.cpp | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index df07f971d..aaa3b9a53 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -505,6 +505,28 @@ void add_module_to_lljit(llvm::orc::LLJIT &lljit, std::unique_ptr<llvm::Module>
     // LCOV_EXCL_STOP
 }
 
+// Helper to fetch the bitcode from a module.
+std::string bc_from_module(llvm::Module &m)
+{
+    std::string out;
+    llvm::raw_string_ostream ostr(out);
+
+    llvm::WriteBitcodeToFile(m, ostr);
+
+    return std::move(ostr.str());
+}
+
+// Helper to fetch the textual IR from a module.
+std::string ir_from_module(llvm::Module &m)
+{
+    std::string out;
+    llvm::raw_string_ostream ostr(out);
+
+    m.print(ostr, nullptr);
+
+    return std::move(ostr.str());
+}
+
 } // namespace
 
 // Helper function to fetch a const ref to a global object
@@ -1468,12 +1490,7 @@ std::string llvm_state::get_ir() const
     if (m_module) {
         // The module has not been compiled yet,
         // get the IR from it.
-        std::string out;
-        llvm::raw_string_ostream ostr(out);
-
-        m_module->print(ostr, nullptr);
-
-        return std::move(ostr.str());
+        return detail::ir_from_module(*m_module);
     } else {
         // The module has been compiled.
         // Return the IR snapshot that
@@ -1487,12 +1504,7 @@ std::string llvm_state::get_bc() const
     if (m_module) {
         // The module has not been compiled yet,
         // get the bitcode from it.
-        std::string out;
-        llvm::raw_string_ostream ostr(out);
-
-        llvm::WriteBitcodeToFile(*m_module, ostr);
-
-        return std::move(ostr.str());
+        return detail::bc_from_module(*m_module);
     } else {
         // The module has been compiled.
         // Return the bitcode snapshot that

From 7f3b220bb7641b6a28665a75145f3188e2482bef Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 2 Aug 2024 20:24:41 +0200
Subject: [PATCH 17/41] Another bit of refactor for internal re-use.

---
 src/llvm_state.cpp | 99 +++++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 46 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index aaa3b9a53..12c7c4163 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -527,6 +527,56 @@ std::string ir_from_module(llvm::Module &m)
     return std::move(ostr.str());
 }
 
+// Helper to add an object file to the jit, throwing in case of errors.
+void add_obj_to_lljit(llvm::orc::LLJIT &lljit, const std::string &obj)
+{
+    // An implementation of llvm::MemoryBuffer offering a view over a std::string.
+    class string_view_mem_buffer final : public llvm::MemoryBuffer
+    {
+    public:
+        explicit string_view_mem_buffer(const std::string &s)
+        {
+            // NOTE: the important bit here is from the LLVM docs:
+            //
+            // """
+            // In addition to basic access to the characters in the file, this interface
+            // guarantees you can read one character past the end of the file, and that
+            // this character will read as '\0'.
+            // """
+            //
+            // This is exactly the guarantee given by std::string:
+            //
+            // https://en.cppreference.com/w/cpp/string/basic_string/data
+            //
+            // Not sure about the third parameter to this function though, it does not
+            // seem to have any influence apart from debug checking:
+            //
+            // https://llvm.org/doxygen/MemoryBuffer_8cpp_source.html
+            this->init(s.data(), s.data() + s.size(), true);
+        }
+        llvm::MemoryBuffer::BufferKind getBufferKind() const final
+        {
+            // Hopefully std::string is not memory-mapped...
+            return llvm::MemoryBuffer::BufferKind::MemoryBuffer_Malloc;
+        }
+    };
+
+    // Add the object file.
+    auto err = lljit.addObjectFile(std::make_unique<string_view_mem_buffer>(obj));
+
+    // LCOV_EXCL_START
+    if (err) {
+        std::string err_report;
+        llvm::raw_string_ostream ostr(err_report);
+
+        ostr << err;
+
+        throw std::invalid_argument(fmt::format(
+            "The function for adding an object file to an lljit failed. The full error message:\n{}", ostr.str()));
+    }
+    // LCOV_EXCL_STOP
+}
+
 } // namespace
 
 // Helper function to fetch a const ref to a global object
@@ -759,55 +809,12 @@ namespace detail
 namespace
 {
 
-// An implementation of llvm::MemoryBuffer offering a view over a std::string.
-class string_view_mem_buffer final : public llvm::MemoryBuffer
-{
-public:
-    explicit string_view_mem_buffer(const std::string &s)
-    {
-        // NOTE: the important bit here is from the LLVM docs:
-        //
-        // """
-        // In addition to basic access to the characters in the file, this interface
-        // guarantees you can read one character past the end of the file, and that
-        // this character will read as '\0'.
-        // """
-        //
-        // This is exactly the guarantee given by std::string:
-        //
-        // https://en.cppreference.com/w/cpp/string/basic_string/data
-        //
-        // Not sure about the third parameter to this function though, it does not
-        // seem to have any influence apart from debug checking:
-        //
-        // https://llvm.org/doxygen/MemoryBuffer_8cpp_source.html
-        this->init(s.data(), s.data() + s.size(), true);
-    }
-    llvm::MemoryBuffer::BufferKind getBufferKind() const final
-    {
-        // Hopefully std::string is not memory-mapped...
-        return llvm::MemoryBuffer::BufferKind::MemoryBuffer_Malloc;
-    }
-};
-
-// Helper to load object code into a jit.
+// Helper to load object code into the jit of an llvm_state.
 template <typename Jit>
 void llvm_state_add_obj_to_jit(Jit &j, std::string obj)
 {
-    // Add the object file to the jit.
-    auto err = j.m_lljit->addObjectFile(std::make_unique<string_view_mem_buffer>(obj));
-
-    // LCOV_EXCL_START
-    if (err) {
-        std::string err_report;
-        llvm::raw_string_ostream ostr(err_report);
-
-        ostr << err;
-
-        throw std::invalid_argument(fmt::format(
-            "The function for adding a compiled module to the jit failed. The full error message:\n{}", ostr.str()));
-    }
-    // LCOV_EXCL_STOP
+    // Add the object code to the lljit.
+    add_obj_to_lljit(*j.m_lljit, obj);
 
     // Add the object code also to the
     // m_object_file member.

From b5b3937bcb5d6606b55a55d196857171ac05190d Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 2 Aug 2024 21:10:32 +0200
Subject: [PATCH 18/41] Minor internal doc bits.

---
 src/llvm_state.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 12c7c4163..512866442 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1421,10 +1421,11 @@ void llvm_state::compile()
         // Assemble the compilation flag.
         const auto comp_flag = detail::assemble_comp_flag(m_opt_level, m_force_avx512, m_slp_vectorize, m_c_model);
 
+        // Lookup in the cache.
         if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) {
             // Cache hit.
 
-            // Assign the snapshots.
+            // Assign the optimised snapshots.
             assert(cached_data->opt_ir.size() == 1u);
             assert(cached_data->opt_bc.size() == 1u);
             assert(cached_data->obj.size() == 1u);
@@ -1438,6 +1439,8 @@ void llvm_state::compile()
             // Assign the object file.
             detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj[0]));
         } else {
+            // Cache miss.
+
             sw.reset();
 
             // Run the optimisation pass.

From bcba26680d37b471e97d9efd19f1334c2068a27e Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 2 Aug 2024 21:52:19 +0200
Subject: [PATCH 19/41] Initial WIP on the implementation.

---
 include/heyoka/llvm_state.hpp |  11 ++
 src/llvm_state.cpp            | 292 ++++++++++++++++++++++++----------
 test/llvm_multi_state.cpp     |   2 -
 3 files changed, 220 insertions(+), 85 deletions(-)

diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 6cf3dae84..a0b7173e5 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -359,7 +359,12 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
     HEYOKA_DLL_LOCAL void compile_impl();
     HEYOKA_DLL_LOCAL void add_obj_triggers();
 
+    // Check functions.
+    HEYOKA_DLL_LOCAL void check_compiled(const char *) const;
+    HEYOKA_DLL_LOCAL void check_uncompiled(const char *) const;
+
 public:
+    llvm_multi_state();
     explicit llvm_multi_state(std::vector<llvm_state>);
     llvm_multi_state(const llvm_multi_state &);
     llvm_multi_state(llvm_multi_state &&) noexcept;
@@ -367,6 +372,12 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
     llvm_multi_state &operator=(llvm_multi_state &&) noexcept;
     ~llvm_multi_state();
 
+    [[nodiscard]] bool fast_math() const noexcept;
+    [[nodiscard]] bool force_avx512() const noexcept;
+    [[nodiscard]] unsigned get_opt_level() const noexcept;
+    [[nodiscard]] bool get_slp_vectorize() const noexcept;
+    [[nodiscard]] code_model get_code_model() const noexcept;
+
     void compile();
 
     std::uintptr_t jit_lookup(const std::string &);
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 512866442..6d9186380 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -20,6 +20,7 @@
 #include <mutex>
 #include <optional>
 #include <ostream>
+#include <ranges>
 #include <regex>
 #include <sstream>
 #include <stdexcept>
@@ -109,26 +110,6 @@
 #include <heyoka/s11n.hpp>
 #include <heyoka/variable.hpp>
 
-// NOTE: logging here lhames' instructions on how to set up LLJIT
-// for parallel compilation of multiple modules.
-//
-//   auto J = LLJITBuilder()
-//              .setNumCompileThreads(<N>)
-//              .create();
-//   if (!J) { /* bail on error */ }
-//   (*J)->getIRTransformLayer().setTransform(
-//     [](ThreadSafeModule TSM, MaterializationResponsibility &R) -> Expected<ThreadSafeModule> {
-//       TSM.withModuleDo([](Module &M) {
-//         /* Apply your IR optimizations here */
-//       });
-//       return std::move(TSM);
-//     });
-//
-// Note that the optimisation passes in this approach are moved into the
-// transform layer. References:
-// https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328
-// https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892
-
 HEYOKA_BEGIN_NAMESPACE
 
 namespace detail
@@ -1456,9 +1437,10 @@ void llvm_state::compile()
             logger->trace("materialisation runtime: {}", sw);
 
             // Try to insert obc into the cache.
-            detail::llvm_state_mem_cache_try_insert(std::move(obc), comp_flag,
-                                                    // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
-                                                    {{m_bc_snapshot}, {m_ir_snapshot}, {*m_jitter->m_object_file}});
+            detail::llvm_state_mem_cache_try_insert(
+                std::move(obc), comp_flag,
+                // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
+                {.opt_bc = {m_bc_snapshot}, .opt_ir = {m_ir_snapshot}, .obj = {*m_jitter->m_object_file}});
         }
         // LCOV_EXCL_START
     } catch (...) {
@@ -1615,8 +1597,16 @@ struct multi_jit {
     std::unique_ptr<llvm::Module> m_module;
     std::unique_ptr<ir_builder> m_builder;
     // Object files.
+    // NOTE: these may be modified concurrently during compilation,
+    // protect with mutex.
     std::mutex m_object_files_mutex;
     std::vector<std::string> m_object_files;
+    // IR and bc optimised snapshots.
+    // NOTE: these may be modified concurrently during compilation,
+    // protect with mutex.
+    std::mutex m_ir_bc_mutex;
+    std::vector<std::string> m_ir_snapshots;
+    std::vector<std::string> m_bc_snapshots;
 
     explicit multi_jit(unsigned, unsigned, code_model);
     multi_jit(const multi_jit &) = delete;
@@ -1625,10 +1615,18 @@ struct multi_jit {
     llvm_multi_state &operator=(multi_jit &&) noexcept = delete;
     ~multi_jit() = default;
 
+    // Helper to fetch the context from its thread-safe counterpart.
     [[nodiscard]] llvm::LLVMContext &context() const noexcept
     {
         return *m_ctx->getContext();
     }
+
+    // Helper to fetch the bitcode of the master module.
+    std::string get_master_bc() const
+    {
+        assert(m_module);
+        return detail::bc_from_module(*m_module);
+    }
 };
 
 #if 0
@@ -1788,6 +1786,8 @@ struct llvm_multi_state::impl {
     std::unique_ptr<detail::multi_jit> m_jit;
 };
 
+llvm_multi_state::llvm_multi_state() = default;
+
 llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
 {
     // We need at least 1 state.
@@ -1848,46 +1848,56 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
 
     // In the master jit, setup the machinery to run the optimisation passes on the modules.
     jit->m_lljit->getIRTransformLayer().setTransform(
-        [&lljit = *jit->m_lljit, opt_level, force_avx512, slp_vectorize,
-         c_model](llvm::orc::ThreadSafeModule TSM, llvm::orc::MaterializationResponsibility &) {
+        [&j = *jit, opt_level, force_avx512, slp_vectorize, c_model](llvm::orc::ThreadSafeModule TSM,
+                                                                     llvm::orc::MaterializationResponsibility &) {
             // See here for an explanation of what withModuleDo() entails:
             //
             // https://groups.google.com/g/llvm-dev/c/QauU4L_bHac
             //
             // In our case, the locking/thread safety aspect is not important as we are not sharing
-            // contexts between threads.
-            TSM.withModuleDo([&lljit, opt_level, force_avx512, slp_vectorize, c_model](llvm::Module &M) {
+            // contexts between threads. More references from discord:
+            //
+            // https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328
+            // https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892
+            TSM.withModuleDo([&j, opt_level, force_avx512, slp_vectorize, c_model](llvm::Module &M) {
                 // NOTE: don't run any optimisation on the master module.
-                if (M.getModuleIdentifier() == "master") {
-                    return;
+                if (M.getModuleIdentifier() != "master") {
+                    // NOTE: running the optimisation passes requires mutable access to a target
+                    // machine. Thus, we create a new target machine per thread in order to avoid likely data races
+                    // with a shared target machine.
+
+                    // Fetch a target machine builder.
+                    auto jtmb = detail::create_jit_tmb(opt_level, c_model);
+
+                    // Try creating the target machine.
+                    auto tm = jtmb.createTargetMachine();
+                    // LCOV_EXCL_START
+                    if (!tm) [[unlikely]] {
+                        throw std::invalid_argument("Error creating the target machine");
+                    }
+                    // LCOV_EXCL_STOP
+
+                    // NOTE: we used to fetch the target triple from the lljit object,
+                    // but recently we switched to asking the target triple directly
+                    // from the target machine. Assert equality between the two for a while,
+                    // just in case.
+                    // NOTE: lljit.getTargetTriple() just returns a const ref to an internal
+                    // object, it should be ok with concurrent invocation.
+                    assert(j.m_lljit->getTargetTriple() == (*tm)->getTargetTriple());
+
+                    // Optimise the module.
+                    detail::optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize);
                 }
 
-                // NOTE: running the optimisation passes requires mutable access to a target
-                // machine. Thus, we create a new target machine per thread in order to avoid likely data races
-                // with a shared target machine.
+                // Store the optimised bitcode/IR for this module.
+                auto bc_snap = detail::bc_from_module(M);
+                auto ir_snap = detail::ir_from_module(M);
 
-                // Fetch a target machine builder.
-                auto jtmb = detail::create_jit_tmb(opt_level, c_model);
+                // NOTE: protect for multi-threaded access.
+                std::lock_guard lock{j.m_ir_bc_mutex};
 
-                // Try creating the target machine.
-                auto tm = jtmb.createTargetMachine();
-                // LCOV_EXCL_START
-                if (!tm) [[unlikely]] {
-                    throw std::invalid_argument("Error creating the target machine");
-                }
-                // LCOV_EXCL_STOP
-
-                // NOTE: we used to fetch the target triple from the lljit object,
-                // but recently we switched to asking the target triple directly
-                // from the target machine. Assert equality between the two for a while,
-                // just in case.
-                // NOTE: lljit.getTargetTriple() just returns a const ref to an internal
-                // object, it should be ok with concurrent invocation.
-                static_cast<void>(lljit);
-                assert(lljit.getTargetTriple() == (*tm)->getTargetTriple());
-
-                // Optimise the module.
-                detail::optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize);
+                j.m_bc_snapshots.push_back(std::move(bc_snap));
+                j.m_ir_snapshots.push_back(std::move(ir_snap));
             });
 
             return llvm::Expected<llvm::orc::ThreadSafeModule>(std::move(TSM));
@@ -1922,7 +1932,7 @@ void llvm_multi_state::add_obj_triggers()
 {
     // NOTE: the idea here is that we add one trigger function per module, and then
     // we invoke all the trigger functions from a trigger function in the master module.
-    // Like this, we should ensure materialisation of all modules when we lookup the
+    // Like this, we ensure materialisation of all modules when we lookup the
     // master trigger.
 
     // Implement the per-module triggers.
@@ -1981,43 +1991,77 @@ void llvm_multi_state::add_obj_triggers()
     bld.CreateRetVoid();
 }
 
-void llvm_multi_state::compile_impl()
+void llvm_multi_state::check_compiled(const char *f) const
 {
-    for (auto &s : m_impl->m_states) {
-        // Add the module to the jit (this will clear out m_module).
-        auto err = m_impl->m_jit->m_lljit->addIRModule(
-            llvm::orc::ThreadSafeModule(std::move(s.m_module), *s.m_jitter->m_ctx));
+    if (m_impl->m_jit->m_module) [[unlikely]] {
+        throw std::invalid_argument(
+            fmt::format("The function '{}' can be invoked only after the llvm_multi_state has been compiled", f));
+    }
+}
 
-        // LCOV_EXCL_START
-        if (err) {
-            std::string err_report;
-            llvm::raw_string_ostream ostr(err_report);
+void llvm_multi_state::check_uncompiled(const char *f) const
+{
+    if (!m_impl->m_jit->m_module) [[unlikely]] {
+        throw std::invalid_argument(
+            fmt::format("The function '{}' can be invoked only if the llvm_multi_state has not been compiled yet", f));
+    }
+}
 
-            ostr << err;
+unsigned llvm_multi_state::get_opt_level() const noexcept
+{
+    return m_impl->m_states[0].get_opt_level();
+}
 
-            throw std::invalid_argument(fmt::format(
-                "The function for adding a module to the jit failed. The full error message:\n{}", ostr.str()));
-        }
-        // LCOV_EXCL_STOP
+bool llvm_multi_state::fast_math() const noexcept
+{
+    return m_impl->m_states[0].fast_math();
+}
+
+bool llvm_multi_state::force_avx512() const noexcept
+{
+    return m_impl->m_states[0].force_avx512();
+}
 
-        // Clear out the builder, which won't be usable any more.
+bool llvm_multi_state::get_slp_vectorize() const noexcept
+{
+    return m_impl->m_states[0].get_slp_vectorize();
+}
+
+code_model llvm_multi_state::get_code_model() const noexcept
+{
+    return m_impl->m_states[0].get_code_model();
+}
+
+// NOTE: this function is NOT exception-safe, proper cleanup
+// needs to be done externally if needed.
+void llvm_multi_state::compile_impl()
+{
+    // Add all the modules from the states.
+    for (auto &s : m_impl->m_states) {
+        detail::add_module_to_lljit(*m_impl->m_jit->m_lljit, std::move(s.m_module), *s.m_jitter->m_ctx);
+
+        // Clear out the builder.
         s.m_builder.reset();
     }
 
-    // TODO error check.
-    auto err = m_impl->m_jit->m_lljit->addIRModule(
-        llvm::orc::ThreadSafeModule(std::move(m_impl->m_jit->m_module), *m_impl->m_jit->m_ctx));
+    // Add the master module.
+    detail::add_module_to_lljit(*m_impl->m_jit->m_lljit, std::move(m_impl->m_jit->m_module), *m_impl->m_jit->m_ctx);
 
-    // Trigger object code materialisation via lookup.
+    // Clear out the master builder.
+    m_impl->m_jit->m_builder.reset();
+
+    // Trigger optimisation and object code materialisation via lookup.
     jit_lookup(detail::obj_trigger_name);
 
-    // TODO restore?
-    // assert(m_jitter->m_object_file);
+    // Sanity checks.
+    assert(m_impl->m_jit->m_bc_snapshots.size() == m_impl->m_jit->m_n_modules);
+    assert(m_impl->m_jit->m_ir_snapshots.size() == m_impl->m_jit->m_n_modules);
+    assert(m_impl->m_jit->m_object_files.size() == m_impl->m_jit->m_n_modules);
 }
 
 void llvm_multi_state::compile()
 {
-    // TODO check uncompiled
+    check_uncompiled(__func__);
 
     // Log runtime in trace mode.
     spdlog::stopwatch sw;
@@ -2041,21 +2085,103 @@ void llvm_multi_state::compile()
 
     logger->trace("llvm_multi_state module verification runtime: {}", sw);
 
-    // Add the object materialisation trigger functions.
-    add_obj_triggers();
+    try {
+        // Add the object materialisation trigger functions.
+        // NOTE: contrary to llvm_state::add_obj_trigger(), add_obj_triggers()
+        // does not implement any automatic cleanup in case of errors. Thus, we fold
+        // it into the try/catch block in order to avoid leaving the
+        // llvm_multi_state in a half-baked state.
+        add_obj_triggers();
+
+        // Fetch the bitcode *before* optimisation.
+        std::vector<std::string> obc;
+        obc.reserve(boost::safe_numerics::safe<decltype(obc.size())>(m_impl->m_states.size()) + 1u);
+        for (const auto &s : m_impl->m_states) {
+            obc.push_back(s.get_bc());
+        }
+        // Add the master bitcode.
+        obc.push_back(m_impl->m_jit->get_master_bc());
+
+        // Assemble the compilation flag.
+        const auto comp_flag
+            = detail::assemble_comp_flag(get_opt_level(), force_avx512(), get_slp_vectorize(), get_code_model());
+
+        // Lookup in the cache.
+        if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) {
+            // Cache hit.
+
+            // Assign the optimised snapshots.
+            assert(cached_data->opt_ir.size() == m_impl->m_jit->m_n_modules);
+            assert(cached_data->opt_bc.size() == m_impl->m_jit->m_n_modules);
+            assert(cached_data->obj.size() == m_impl->m_jit->m_n_modules);
+            assert(m_impl->m_jit->m_ir_snapshots.empty());
+            assert(m_impl->m_jit->m_bc_snapshots.empty());
+            m_impl->m_jit->m_ir_snapshots = std::move(cached_data->opt_ir);
+            m_impl->m_jit->m_bc_snapshots = std::move(cached_data->opt_bc);
+
+            // Clear out modules and builders from the states.
+            // NOTE: probably not strictly needed, but we try to keep the same
+            // behaviour as in compile_impl().
+            for (auto &s : m_impl->m_states) {
+                s.m_module.reset();
+                s.m_builder.reset();
+            }
+
+            // Clear out master module and builder.
+            m_impl->m_jit->m_module.reset();
+            m_impl->m_jit->m_builder.reset();
+
+            // Add and assign the object files.
+            for (const auto &obj : cached_data->obj) {
+                detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj);
+            }
+
+            // Assign the compiled objects.
+            assert(m_impl->m_jit->m_object_files.empty());
+            m_impl->m_jit->m_object_files = std::move(cached_data->obj);
+        } else {
+            // Cache miss.
+
+            sw.reset();
+
+            // Run the compilation.
+            compile_impl();
+
+            logger->trace("optimisation + materialisation runtime: {}", sw);
+
+            // NOTE: at this point, m_ir_snapshots, m_bc_snapshots and m_object_files
+            // have all been constructed in random order because of multithreading.
+            // Sort them so that we provide deterministic behaviour. Probably
+            // not strictly needed, but let's try to avoid nondeterminism.
+            // All of this can be parallelised if needed.
+            std::ranges::sort(m_impl->m_jit->m_ir_snapshots);
+            std::ranges::sort(m_impl->m_jit->m_bc_snapshots);
+            std::ranges::sort(m_impl->m_jit->m_object_files);
+
+            // Try to insert obc into the cache.
+            detail::llvm_state_mem_cache_try_insert(std::move(obc), comp_flag,
+                                                    {.opt_bc = m_impl->m_jit->m_bc_snapshots,
+                                                     .opt_ir = m_impl->m_jit->m_ir_snapshots,
+                                                     .obj = m_impl->m_jit->m_object_files});
+        }
+        // LCOV_EXCL_START
+    } catch (...) {
+        // Reset to a def-cted state in case of error,
+        // as it looks like there's no way of recovering.
+        m_impl.reset();
 
-    // Run the compilation.
-    compile_impl();
+        throw;
+        // LCOV_EXCL_STOP
+    }
 }
 
 std::uintptr_t llvm_multi_state::jit_lookup(const std::string &name)
 {
-    // TODO restore.
-    // check_compiled(__func__);
+    check_compiled(__func__);
 
     auto sym = m_impl->m_jit->m_lljit->lookup(name);
     if (!sym) {
-        throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in the compiled module", name));
+        throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in an llvm_multi_state", name));
     }
 
 #if LLVM_VERSION_MAJOR >= 15
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index a84ff1783..f30fb2d05 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -14,8 +14,6 @@ using namespace heyoka;
 
 TEST_CASE("basic")
 {
-#if 0
     llvm_multi_state ms({llvm_state{}, llvm_state{}, llvm_state{}, llvm_state{}, llvm_state{}, llvm_state{}});
     ms.compile();
-#endif
 }

From f859621ba74b5235b188a9c112be981deda76a06 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 11:25:54 +0200
Subject: [PATCH 20/41] Implement copy, more testing.

---
 include/heyoka/llvm_state.hpp |   6 +
 src/llvm_state.cpp            | 101 ++++++++++++++-
 test/llvm_multi_state.cpp     | 236 +++++++++++++++++++++++++++++++++-
 3 files changed, 337 insertions(+), 6 deletions(-)

diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index a0b7173e5..4e6af2d02 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -372,12 +372,18 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
     llvm_multi_state &operator=(llvm_multi_state &&) noexcept;
     ~llvm_multi_state();
 
+    [[nodiscard]] bool is_compiled() const noexcept;
+
     [[nodiscard]] bool fast_math() const noexcept;
     [[nodiscard]] bool force_avx512() const noexcept;
     [[nodiscard]] unsigned get_opt_level() const noexcept;
     [[nodiscard]] bool get_slp_vectorize() const noexcept;
     [[nodiscard]] code_model get_code_model() const noexcept;
 
+    [[nodiscard]] std::vector<std::string> get_ir() const;
+    [[nodiscard]] std::vector<std::string> get_bc() const;
+    [[nodiscard]] const std::vector<std::string> &get_object_code() const;
+
     void compile();
 
     std::uintptr_t jit_lookup(const std::string &);
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 6d9186380..dfe676503 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -511,6 +511,14 @@ std::string ir_from_module(llvm::Module &m)
 // Helper to add an object file to the jit, throwing in case of errors.
 void add_obj_to_lljit(llvm::orc::LLJIT &lljit, const std::string &obj)
 {
+    // NOTE: an empty obj can happen when we are copying a compiled
+    // llvm_multi_state. In such case, the object files of the individual
+    // states have all been empty-inited. We then need to avoid adding
+    // obj to the jit because that will result in an error.
+    if (obj.empty()) {
+        return;
+    }
+
     // An implementation of llvm::MemoryBuffer offering a view over a std::string.
     class string_view_mem_buffer final : public llvm::MemoryBuffer
     {
@@ -1269,6 +1277,8 @@ void llvm_state::optimise()
     // from the target machine. Assert equality between the two for a while,
     // just in case.
     assert(m_jitter->m_lljit->getTargetTriple() == m_jitter->m_tm->getTargetTriple());
+    // NOTE: the target triple is also available in the module.
+    assert(m_jitter->m_lljit->getTargetTriple().str() == module().getTargetTriple());
 
     detail::optimise_module(module(), *m_jitter->m_tm, m_opt_level, m_force_avx512, m_slp_vectorize);
 }
@@ -1884,6 +1894,8 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
                     // NOTE: lljit.getTargetTriple() just returns a const ref to an internal
                     // object, it should be ok with concurrent invocation.
                     assert(j.m_lljit->getTargetTriple() == (*tm)->getTargetTriple());
+                    // NOTE: the target triple is also available in the module.
+                    assert(j.m_lljit->getTargetTriple().str() == M.getTargetTriple());
 
                     // Optimise the module.
                     detail::optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize);
@@ -1908,9 +1920,33 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
     m_impl = std::make_unique<impl>(std::move(imp));
 }
 
-llvm_multi_state::llvm_multi_state(const llvm_multi_state &)
+llvm_multi_state::llvm_multi_state(const llvm_multi_state &other)
 {
-    // TODO implement.
+    // NOTE: start off by creating a new jit and copying the states.
+    // This will work regardless of whether other is compiled or not.
+    impl imp{.m_states = other.m_impl->m_states,
+             .m_jit = std::make_unique<detail::multi_jit>(other.m_impl->m_jit->m_n_modules, other.get_opt_level(),
+                                                          other.get_code_model())};
+    m_impl = std::make_unique<impl>(std::move(imp));
+
+    if (other.is_compiled()) {
+        // 'other' was compiled. Reset builder and module, copy over the snapshots
+        // and the object files, and add the files to the jit.
+        m_impl->m_jit->m_module.reset();
+        m_impl->m_jit->m_builder.reset();
+
+        m_impl->m_jit->m_object_files = other.m_impl->m_jit->m_object_files;
+        m_impl->m_jit->m_ir_snapshots = other.m_impl->m_jit->m_ir_snapshots;
+        m_impl->m_jit->m_bc_snapshots = other.m_impl->m_jit->m_bc_snapshots;
+
+        for (const auto &obj : m_impl->m_jit->m_object_files) {
+            detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj);
+        }
+    } else {
+        // If 'other' was not compiled, we do not need to do anything - the
+        // copy construction of the states takes care of everything. I.e., this
+        // is basically the same as construction from a list of states.
+    }
 }
 
 llvm_multi_state::llvm_multi_state(llvm_multi_state &&) noexcept = default;
@@ -2032,6 +2068,56 @@ code_model llvm_multi_state::get_code_model() const noexcept
     return m_impl->m_states[0].get_code_model();
 }
 
+bool llvm_multi_state::is_compiled() const noexcept
+{
+    return !m_impl->m_jit->m_module;
+}
+
+std::vector<std::string> llvm_multi_state::get_ir() const
+{
+    if (is_compiled()) {
+        return m_impl->m_jit->m_ir_snapshots;
+    } else {
+        std::vector<std::string> retval;
+        retval.reserve(m_impl->m_jit->m_n_modules);
+
+        for (const auto &s : m_impl->m_states) {
+            retval.push_back(s.get_ir());
+        }
+
+        // Add the IR from the master module.
+        retval.push_back(detail::ir_from_module(*m_impl->m_jit->m_module));
+
+        return retval;
+    }
+}
+
+std::vector<std::string> llvm_multi_state::get_bc() const
+{
+    if (is_compiled()) {
+        return m_impl->m_jit->m_bc_snapshots;
+    } else {
+        std::vector<std::string> retval;
+        retval.reserve(m_impl->m_jit->m_n_modules);
+
+        for (const auto &s : m_impl->m_states) {
+            retval.push_back(s.get_bc());
+        }
+
+        // Add the bitcode from the master module.
+        retval.push_back(detail::bc_from_module(*m_impl->m_jit->m_module));
+
+        return retval;
+    }
+}
+
+const std::vector<std::string> &llvm_multi_state::get_object_code() const
+{
+    check_compiled(__func__);
+
+    return m_impl->m_jit->m_object_files;
+}
+
 // NOTE: this function is NOT exception-safe, proper cleanup
 // needs to be done externally if needed.
 void llvm_multi_state::compile_impl()
@@ -2042,6 +2128,10 @@ void llvm_multi_state::compile_impl()
 
         // Clear out the builder.
         s.m_builder.reset();
+
+        // NOTE: need to manually construct the object file, as this would
+        // normally be done by the invocation of s.compile() (which we do not do).
+        s.m_jitter->m_object_file.emplace();
     }
 
     // Add the master module.
@@ -2119,12 +2209,13 @@ void llvm_multi_state::compile()
             m_impl->m_jit->m_ir_snapshots = std::move(cached_data->opt_ir);
             m_impl->m_jit->m_bc_snapshots = std::move(cached_data->opt_bc);
 
-            // Clear out modules and builders from the states.
-            // NOTE: probably not strictly needed, but we try to keep the same
-            // behaviour as in compile_impl().
+            // NOTE: here it is important that we replicate the logic happening
+            // in llvm_state::compile(): clear out module/builder, construct
+            // the object file.
             for (auto &s : m_impl->m_states) {
                 s.m_module.reset();
                 s.m_builder.reset();
+                s.m_jitter->m_object_file.emplace();
             }
 
             // Clear out master module and builder.
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index f30fb2d05..eac44ecf3 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -6,6 +6,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <stdexcept>
+
+#include <heyoka/expression.hpp>
+#include <heyoka/kw.hpp>
 #include <heyoka/llvm_state.hpp>
 
 #include "catch.hpp"
@@ -14,6 +18,236 @@ using namespace heyoka;
 
 TEST_CASE("basic")
 {
-    llvm_multi_state ms({llvm_state{}, llvm_state{}, llvm_state{}, llvm_state{}, llvm_state{}, llvm_state{}});
+    using Catch::Matchers::Message;
+
+    // Default construction.
+    {
+        REQUIRE_NOTHROW(llvm_multi_state{});
+    }
+
+    // No states in input.
+    REQUIRE_THROWS_MATCHES(llvm_multi_state{{}}, std::invalid_argument,
+                           Message("At least 1 llvm_state object is needed to construct an llvm_multi_state"));
+
+    // Inconsistent settings.
+    REQUIRE_THROWS_MATCHES(
+        (llvm_multi_state{{llvm_state{kw::opt_level = 1u}, llvm_state{kw::opt_level = 2u}}}), std::invalid_argument,
+        Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+
+    REQUIRE_THROWS_MATCHES(
+        (llvm_multi_state{{llvm_state{kw::fast_math = true}, llvm_state{}}}), std::invalid_argument,
+        Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+
+    REQUIRE_THROWS_MATCHES(
+        (llvm_multi_state{{llvm_state{}, llvm_state{kw::force_avx512 = true}}}), std::invalid_argument,
+        Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+
+    REQUIRE_THROWS_MATCHES(
+        (llvm_multi_state{{llvm_state{}, llvm_state{}, llvm_state{kw::slp_vectorize = true}}}), std::invalid_argument,
+        Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+    REQUIRE_THROWS_MATCHES(
+        (llvm_multi_state{{llvm_state{}, llvm_state{kw::code_model = code_model::large}, llvm_state{}}}),
+        std::invalid_argument,
+        Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+
+    {
+        // Construction from compiled modules.
+        llvm_state s;
+        s.compile();
+
+        REQUIRE_THROWS_MATCHES(
+            (llvm_multi_state{{s, llvm_state{}}}), std::invalid_argument,
+            Message("An llvm_multi_state can be constructed only from uncompiled llvm_state objects"));
+        REQUIRE_THROWS_MATCHES(
+            (llvm_multi_state{{llvm_state{}, s}}), std::invalid_argument,
+            Message("An llvm_multi_state can be constructed only from uncompiled llvm_state objects"));
+    }
+
+    // Test the property getters.
+    {
+        llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true,
+                     kw::code_model = code_model::large};
+
+        llvm_multi_state ms{{s, s, s, s}};
+
+        REQUIRE(ms.get_opt_level() == 1u);
+        REQUIRE(ms.fast_math());
+        REQUIRE(ms.force_avx512());
+        REQUIRE(ms.get_slp_vectorize());
+        REQUIRE(ms.get_code_model() == code_model::large);
+
+        REQUIRE(!ms.is_compiled());
+
+        ms.compile();
+
+        REQUIRE(ms.is_compiled());
+
+        REQUIRE(ms.get_opt_level() == 1u);
+        REQUIRE(ms.fast_math());
+        REQUIRE(ms.force_avx512());
+        REQUIRE(ms.get_slp_vectorize());
+        REQUIRE(ms.get_code_model() == code_model::large);
+    }
+}
+
+TEST_CASE("copy semantics")
+{
+    using Catch::Matchers::Message;
+
+    auto [x, y] = make_vars("x", "y");
+
+    llvm_state s1{kw::mname = "module_0"}, s2{kw::mname = "module_1"};
+
+    add_cfunc<double>(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
+    add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
+
+    llvm_multi_state ms{{s1, s2}};
+
+    auto ms_copy = ms;
+
+    REQUIRE(ms_copy.get_bc() == ms.get_bc());
+    REQUIRE(ms_copy.get_ir() == ms.get_ir());
+    REQUIRE(ms_copy.is_compiled() == ms.is_compiled());
+    REQUIRE(ms_copy.fast_math() == ms.fast_math());
+    REQUIRE(ms_copy.force_avx512() == ms.force_avx512());
+    REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
+    REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
+    REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+    REQUIRE_THROWS_MATCHES(
+        ms_copy.get_object_code(), std::invalid_argument,
+        Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled"));
+    REQUIRE_THROWS_MATCHES(
+        ms_copy.jit_lookup("foo"), std::invalid_argument,
+        Message("The function 'jit_lookup' can be invoked only after the llvm_multi_state has been compiled"));
+
+    ms.compile();
+    ms_copy.compile();
+
+    REQUIRE(ms_copy.get_bc() == ms.get_bc());
+    REQUIRE(ms_copy.get_ir() == ms.get_ir());
+    REQUIRE(ms_copy.get_object_code() == ms.get_object_code());
+    REQUIRE(ms_copy.is_compiled() == ms.is_compiled());
+    REQUIRE(ms_copy.fast_math() == ms.fast_math());
+    REQUIRE(ms_copy.force_avx512() == ms.force_avx512());
+    REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
+    REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
+    REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+    REQUIRE_NOTHROW(ms_copy.jit_lookup("f1"));
+    REQUIRE_NOTHROW(ms_copy.jit_lookup("f2"));
+
+    {
+        auto *cf1_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+            ms_copy.jit_lookup("f1"));
+        auto *cf2_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+            ms_copy.jit_lookup("f2"));
+
+        const double ins[] = {2., 3.};
+        double outs[2] = {};
+
+        cf1_ptr(outs, ins, nullptr, nullptr);
+        cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+        REQUIRE(outs[0] == 6);
+        REQUIRE(outs[1] == 2. / 3.);
+    }
+
+    auto ms_copy2 = ms;
+
+    REQUIRE(ms_copy2.get_bc() == ms.get_bc());
+    REQUIRE(ms_copy2.get_ir() == ms.get_ir());
+    REQUIRE(ms_copy2.get_object_code() == ms.get_object_code());
+    REQUIRE(ms_copy2.is_compiled() == ms.is_compiled());
+    REQUIRE(ms_copy2.fast_math() == ms.fast_math());
+    REQUIRE(ms_copy2.force_avx512() == ms.force_avx512());
+    REQUIRE(ms_copy2.get_opt_level() == ms.get_opt_level());
+    REQUIRE(ms_copy2.get_slp_vectorize() == ms.get_slp_vectorize());
+    REQUIRE(ms_copy2.get_code_model() == ms.get_code_model());
+    REQUIRE_NOTHROW(ms_copy2.jit_lookup("f1"));
+    REQUIRE_NOTHROW(ms_copy2.jit_lookup("f2"));
+
+    {
+        auto *cf1_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+            ms_copy2.jit_lookup("f1"));
+        auto *cf2_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+            ms_copy2.jit_lookup("f2"));
+
+        const double ins[] = {2., 3.};
+        double outs[2] = {};
+
+        cf1_ptr(outs, ins, nullptr, nullptr);
+        cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+        REQUIRE(outs[0] == 6);
+        REQUIRE(outs[1] == 2. / 3.);
+    }
+}
+
+TEST_CASE("cfunc")
+{
+    using Catch::Matchers::Message;
+
+    // Basic test.
+    auto [x, y] = make_vars("x", "y");
+
+    llvm_state s1{kw::mname = "module_0"}, s2{kw::mname = "module_1"};
+
+    add_cfunc<double>(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
+    add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
+
+    const auto orig_ir1 = s1.get_ir();
+    const auto orig_ir2 = s2.get_ir();
+
+    const auto orig_bc1 = s1.get_bc();
+    const auto orig_bc2 = s2.get_bc();
+
+    llvm_multi_state ms{{s1, s2}};
+
+    REQUIRE(ms.get_ir().size() == 3u);
+    REQUIRE(ms.get_bc().size() == 3u);
+    REQUIRE_THROWS_MATCHES(
+        ms.get_object_code(), std::invalid_argument,
+        Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled"));
+
+    REQUIRE(orig_ir1 == ms.get_ir()[0]);
+    REQUIRE(orig_ir2 == ms.get_ir()[1]);
+
+    REQUIRE(orig_bc1 == ms.get_bc()[0]);
+    REQUIRE(orig_bc2 == ms.get_bc()[1]);
+
     ms.compile();
+    s1.compile();
+    s2.compile();
+
+    REQUIRE(ms.get_ir().size() == 3u);
+    REQUIRE(ms.get_bc().size() == 3u);
+
+    // Check the first few characters of the optimised ir/bc match.
+    // Cannot check the entire ir/bc because of the difference in trigger name.
+    REQUIRE((s1.get_ir().substr(0, 200) == ms.get_ir()[0].substr(0, 200)
+             || s1.get_ir().substr(0, 200) == ms.get_ir()[1].substr(0, 200)
+             || s1.get_ir().substr(0, 200) == ms.get_ir()[2].substr(0, 200)));
+    REQUIRE((s2.get_ir().substr(0, 200) == ms.get_ir()[0].substr(0, 200)
+             || s2.get_ir().substr(0, 200) == ms.get_ir()[1].substr(0, 200)
+             || s2.get_ir().substr(0, 200) == ms.get_ir()[2].substr(0, 200)));
+
+    REQUIRE((s1.get_bc().substr(0, 200) == ms.get_bc()[0].substr(0, 200)
+             || s1.get_bc().substr(0, 200) == ms.get_bc()[1].substr(0, 200)
+             || s1.get_bc().substr(0, 200) == ms.get_bc()[2].substr(0, 200)));
+    REQUIRE((s2.get_bc().substr(0, 200) == ms.get_bc()[0].substr(0, 200)
+             || s2.get_bc().substr(0, 200) == ms.get_bc()[1].substr(0, 200)
+             || s2.get_bc().substr(0, 200) == ms.get_bc()[2].substr(0, 200)));
+
+    auto *cf1_ptr
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f1"));
+    auto *cf2_ptr
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f2"));
+
+    const double ins[] = {2., 3.};
+    double outs[2] = {};
+
+    cf1_ptr(outs, ins, nullptr, nullptr);
+    cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+    REQUIRE(outs[0] == 6);
+    REQUIRE(outs[1] == 2. / 3.);
 }

From 6e1e0f6f114674db3cf8ee3f2a3f4851f38ef2e3 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 13:26:54 +0200
Subject: [PATCH 21/41] Tentative test fix: compare only the first 100 chars of the ir/bc in the cfunc test.

---
 test/llvm_multi_state.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index eac44ecf3..13e7485fe 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -223,19 +223,19 @@ TEST_CASE("cfunc")
 
     // Check the first few characters of the optimised ir/bc match.
     // Cannot check the entire ir/bc because of the difference in trigger name.
-    REQUIRE((s1.get_ir().substr(0, 200) == ms.get_ir()[0].substr(0, 200)
-             || s1.get_ir().substr(0, 200) == ms.get_ir()[1].substr(0, 200)
-             || s1.get_ir().substr(0, 200) == ms.get_ir()[2].substr(0, 200)));
-    REQUIRE((s2.get_ir().substr(0, 200) == ms.get_ir()[0].substr(0, 200)
-             || s2.get_ir().substr(0, 200) == ms.get_ir()[1].substr(0, 200)
-             || s2.get_ir().substr(0, 200) == ms.get_ir()[2].substr(0, 200)));
-
-    REQUIRE((s1.get_bc().substr(0, 200) == ms.get_bc()[0].substr(0, 200)
-             || s1.get_bc().substr(0, 200) == ms.get_bc()[1].substr(0, 200)
-             || s1.get_bc().substr(0, 200) == ms.get_bc()[2].substr(0, 200)));
-    REQUIRE((s2.get_bc().substr(0, 200) == ms.get_bc()[0].substr(0, 200)
-             || s2.get_bc().substr(0, 200) == ms.get_bc()[1].substr(0, 200)
-             || s2.get_bc().substr(0, 200) == ms.get_bc()[2].substr(0, 200)));
+    REQUIRE((s1.get_ir().substr(0, 100) == ms.get_ir()[0].substr(0, 100)
+             || s1.get_ir().substr(0, 100) == ms.get_ir()[1].substr(0, 100)
+             || s1.get_ir().substr(0, 100) == ms.get_ir()[2].substr(0, 100)));
+    REQUIRE((s2.get_ir().substr(0, 100) == ms.get_ir()[0].substr(0, 100)
+             || s2.get_ir().substr(0, 100) == ms.get_ir()[1].substr(0, 100)
+             || s2.get_ir().substr(0, 100) == ms.get_ir()[2].substr(0, 100)));
+
+    REQUIRE((s1.get_bc().substr(0, 100) == ms.get_bc()[0].substr(0, 100)
+             || s1.get_bc().substr(0, 100) == ms.get_bc()[1].substr(0, 100)
+             || s1.get_bc().substr(0, 100) == ms.get_bc()[2].substr(0, 100)));
+    REQUIRE((s2.get_bc().substr(0, 100) == ms.get_bc()[0].substr(0, 100)
+             || s2.get_bc().substr(0, 100) == ms.get_bc()[1].substr(0, 100)
+             || s2.get_bc().substr(0, 100) == ms.get_bc()[2].substr(0, 100)));
 
     auto *cf1_ptr
         = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f1"));

From c0de5b5dbf84c72c117eebc9aff2a8ffd80a2bf6 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 16:01:03 +0200
Subject: [PATCH 22/41] Avoid another redundant copy: use string_view_mem_buffer in bc_to_module().

---
 src/llvm_state.cpp | 73 +++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index dfe676503..7f529f1c1 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -508,6 +508,37 @@ std::string ir_from_module(llvm::Module &m)
     return std::move(ostr.str());
 }
 
+// An implementation of llvm::MemoryBuffer offering a view over a std::string.
+class string_view_mem_buffer final : public llvm::MemoryBuffer
+{
+public:
+    explicit string_view_mem_buffer(const std::string &s)
+    {
+        // NOTE: the important bit here is from the LLVM docs:
+        //
+        // """
+        // In addition to basic access to the characters in the file, this interface
+        // guarantees you can read one character past the end of the file, and that
+        // this character will read as '\0'.
+        // """
+        //
+        // This is exactly the guarantee given by std::string:
+        //
+        // https://en.cppreference.com/w/cpp/string/basic_string/data
+        //
+        // Not sure about the third parameter to this function though, it does not
+        // seem to have any influence apart from debug checking:
+        //
+        // https://llvm.org/doxygen/MemoryBuffer_8cpp_source.html
+        this->init(s.data(), s.data() + s.size(), true);
+    }
+    llvm::MemoryBuffer::BufferKind getBufferKind() const final
+    {
+        // Hopefully std::string is not memory-mapped...
+        return llvm::MemoryBuffer::BufferKind::MemoryBuffer_Malloc;
+    }
+};
+
 // Helper to add an object file to the jit, throwing in case of errors.
 void add_obj_to_lljit(llvm::orc::LLJIT &lljit, const std::string &obj)
 {
@@ -519,37 +550,6 @@ void add_obj_to_lljit(llvm::orc::LLJIT &lljit, const std::string &obj)
         return;
     }
 
-    // An implementation of llvm::MemoryBuffer offering a view over a std::string.
-    class string_view_mem_buffer final : public llvm::MemoryBuffer
-    {
-    public:
-        explicit string_view_mem_buffer(const std::string &s)
-        {
-            // NOTE: the important bit here is from the LLVM docs:
-            //
-            // """
-            // In addition to basic access to the characters in the file, this interface
-            // guarantees you can read one character past the end of the file, and that
-            // this character will read as '\0'.
-            // """
-            //
-            // This is exactly the guarantee given by std::string:
-            //
-            // https://en.cppreference.com/w/cpp/string/basic_string/data
-            //
-            // Not sure about the third parameter to this function though, it does not
-            // seem to have any influence apart from debug checking:
-            //
-            // https://llvm.org/doxygen/MemoryBuffer_8cpp_source.html
-            this->init(s.data(), s.data() + s.size(), true);
-        }
-        llvm::MemoryBuffer::BufferKind getBufferKind() const final
-        {
-            // Hopefully std::string is not memory-mapped...
-            return llvm::MemoryBuffer::BufferKind::MemoryBuffer_Malloc;
-        }
-    };
-
     // Add the object file.
     auto err = lljit.addObjectFile(std::make_unique<string_view_mem_buffer>(obj));
 
@@ -818,11 +818,10 @@ void llvm_state_add_obj_to_jit(Jit &j, std::string obj)
 // contained in the bitcode) because apparently llvm::parseBitcodeFile() discards the module
 // name when parsing.
 // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-auto llvm_state_bc_to_module(const std::string &module_name, const std::string &bc, llvm::LLVMContext &ctx)
+auto bc_to_module(const std::string &module_name, const std::string &bc, llvm::LLVMContext &ctx)
 {
-    // Create the corresponding memory buffer.
-    auto mb = llvm::MemoryBuffer::getMemBuffer(bc);
-    assert(mb);
+    // Create the corresponding memory buffer view on bc.
+    auto mb = std::make_unique<string_view_mem_buffer>(bc);
 
     // Parse the bitcode.
     auto ret = llvm::parseBitcodeFile(mb->getMemBufferRef(), ctx);
@@ -931,7 +930,7 @@ llvm_state::llvm_state(const llvm_state &other)
         // module and builder. The IR/bitcode snapshots
         // are left in their default-constructed (empty)
         // state.
-        m_module = detail::llvm_state_bc_to_module(m_module_name, other.get_bc(), context());
+        m_module = detail::bc_to_module(m_module_name, other.get_bc(), context());
 
         // Create a new builder for the module.
         m_builder = std::make_unique<ir_builder>(context());
@@ -1127,7 +1126,7 @@ void llvm_state::load_impl(Archive &ar, unsigned version)
             m_bc_snapshot.clear();
 
             // Create the module from the bitcode.
-            m_module = detail::llvm_state_bc_to_module(m_module_name, bc_snapshot, context());
+            m_module = detail::bc_to_module(m_module_name, bc_snapshot, context());
 
             // Create a new builder for the module.
             m_builder = std::make_unique<ir_builder>(context());

From a99b36e6acdc7ff9c28fdb52ba87a35a2f5e7cbe Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 16:19:03 +0200
Subject: [PATCH 23/41] Add serialisation support to llvm_multi_state.

---
 include/heyoka/llvm_state.hpp |  5 ++
 src/llvm_state.cpp            | 78 ++++++++++++++++++++++++++++++
 test/llvm_multi_state.cpp     | 89 +++++++++++++++++++++++++++++++++++
 3 files changed, 172 insertions(+)

diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 4e6af2d02..5c247e071 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -363,6 +363,11 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
     HEYOKA_DLL_LOCAL void check_compiled(const char *) const;
     HEYOKA_DLL_LOCAL void check_uncompiled(const char *) const;
 
+    friend class boost::serialization::access;
+    void save(boost::archive::binary_oarchive &, unsigned) const;
+    void load(boost::archive::binary_iarchive &, unsigned);
+    BOOST_SERIALIZATION_SPLIT_MEMBER()
+
 public:
     llvm_multi_state();
     explicit llvm_multi_state(std::vector<llvm_state>);
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 7f529f1c1..8432e5a9e 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1945,6 +1945,13 @@ llvm_multi_state::llvm_multi_state(const llvm_multi_state &other)
         // If 'other' was not compiled, we do not need to do anything - the
         // copy construction of the states takes care of everything. I.e., this
         // is basically the same as construction from a list of states.
+        // NOTE: regarding the master module: this is always created empty
+        // and it remains empty until compilation, thus we do not need to care
+        // about it if other is uncompiled - the new empty master module constructed
+        // with the jit is ok.
+        assert(other.m_impl->m_jit->m_object_files.empty());
+        assert(other.m_impl->m_jit->m_ir_snapshots.empty());
+        assert(other.m_impl->m_jit->m_bc_snapshots.empty());
     }
 }
 
@@ -1963,6 +1970,77 @@ llvm_multi_state &llvm_multi_state::operator=(llvm_multi_state &&) noexcept = de
 
 llvm_multi_state::~llvm_multi_state() = default;
 
+void llvm_multi_state::save(boost::archive::binary_oarchive &ar, unsigned) const
+{
+    // Start by establishing if the state is compiled.
+    const auto cmp = is_compiled();
+    ar << cmp;
+
+    // Store the states.
+    ar << m_impl->m_states;
+
+    // Store the object files and the snapshots. These may be empty.
+    ar << m_impl->m_jit->m_object_files;
+    ar << m_impl->m_jit->m_ir_snapshots;
+    ar << m_impl->m_jit->m_bc_snapshots;
+
+    // NOTE: no need to explicitly store the bitcode of the master
+    // module: if this is compiled, the master module is in the snapshots.
+    // Otherwise, the master module is empty and there's no need to
+    // store anything.
+}
+
+void llvm_multi_state::load(boost::archive::binary_iarchive &ar, unsigned)
+{
+    try {
+        // Load the compiled status flag from the archive.
+        // NOLINTNEXTLINE(misc-const-correctness)
+        bool cmp{};
+        ar >> cmp;
+
+        // Load the states.
+        ar >> m_impl->m_states;
+
+        // Reset the jit with a new one.
+        m_impl->m_jit = std::make_unique<detail::multi_jit>(
+            boost::safe_numerics::safe<unsigned>(m_impl->m_states.size()) + 1, get_opt_level(), get_code_model());
+
+        // Load the object files and the snapshots.
+        ar >> m_impl->m_jit->m_object_files;
+        ar >> m_impl->m_jit->m_ir_snapshots;
+        ar >> m_impl->m_jit->m_bc_snapshots;
+
+        if (cmp) {
+            // If the stored state was compiled, we need to reset
+            // master builder and module. Otherwise, the empty default-constructed
+            // master module is ok (the master module remains empty until compilation
+            // is triggered).
+            m_impl->m_jit->m_module.reset();
+            m_impl->m_jit->m_builder.reset();
+
+            // We also need to add all the object files to the jit.
+            for (const auto &obj : m_impl->m_jit->m_object_files) {
+                detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj);
+            }
+        }
+
+        // Debug checks.
+        assert(m_impl->m_jit->m_object_files.empty()
+               || m_impl->m_jit->m_object_files.size() == m_impl->m_jit->m_n_modules);
+        assert(m_impl->m_jit->m_ir_snapshots.empty()
+               || m_impl->m_jit->m_ir_snapshots.size() == m_impl->m_jit->m_n_modules);
+        assert(m_impl->m_jit->m_bc_snapshots.empty()
+               || m_impl->m_jit->m_bc_snapshots.size() == m_impl->m_jit->m_n_modules);
+
+        // LCOV_EXCL_START
+    } catch (...) {
+        m_impl.reset();
+
+        throw;
+    }
+    // LCOV_EXCL_STOP
+}
+
 void llvm_multi_state::add_obj_triggers()
 {
     // NOTE: the idea here is that we add one trigger function per module, and then
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index 13e7485fe..fadf228ed 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -6,11 +6,13 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <sstream>
 #include <stdexcept>
 
 #include <heyoka/expression.hpp>
 #include <heyoka/kw.hpp>
 #include <heyoka/llvm_state.hpp>
+#include <heyoka/s11n.hpp>
 
 #include "catch.hpp"
 
@@ -182,6 +184,93 @@ TEST_CASE("copy semantics")
     }
 }
 
+TEST_CASE("s11n")
+{
+    using Catch::Matchers::Message;
+
+    auto [x, y] = make_vars("x", "y");
+
+    llvm_state s1{kw::mname = "module_0"}, s2{kw::mname = "module_1"};
+
+    add_cfunc<double>(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
+    add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
+
+    // Uncompiled.
+    llvm_multi_state ms{{s1, s2}};
+
+    std::stringstream ss;
+
+    {
+        boost::archive::binary_oarchive oa(ss);
+        oa << ms;
+    }
+
+    llvm_multi_state ms_copy{{llvm_state{}}};
+
+    {
+        boost::archive::binary_iarchive ia(ss);
+        ia >> ms_copy;
+    }
+
+    REQUIRE(ms_copy.get_bc() == ms.get_bc());
+    REQUIRE(ms_copy.get_ir() == ms.get_ir());
+    REQUIRE(ms_copy.is_compiled() == ms.is_compiled());
+    REQUIRE(ms_copy.fast_math() == ms.fast_math());
+    REQUIRE(ms_copy.force_avx512() == ms.force_avx512());
+    REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
+    REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
+    REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+    REQUIRE_THROWS_MATCHES(
+        ms_copy.get_object_code(), std::invalid_argument,
+        Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled"));
+    REQUIRE_THROWS_MATCHES(
+        ms_copy.jit_lookup("foo"), std::invalid_argument,
+        Message("The function 'jit_lookup' can be invoked only after the llvm_multi_state has been compiled"));
+
+    // Compiled.
+    ms.compile();
+
+    ss.str("");
+
+    {
+        boost::archive::binary_oarchive oa(ss);
+        oa << ms;
+    }
+
+    {
+        boost::archive::binary_iarchive ia(ss);
+        ia >> ms_copy;
+    }
+
+    REQUIRE(ms_copy.get_bc() == ms.get_bc());
+    REQUIRE(ms_copy.get_ir() == ms.get_ir());
+    REQUIRE(ms_copy.get_object_code() == ms.get_object_code());
+    REQUIRE(ms_copy.is_compiled() == ms.is_compiled());
+    REQUIRE(ms_copy.fast_math() == ms.fast_math());
+    REQUIRE(ms_copy.force_avx512() == ms.force_avx512());
+    REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
+    REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
+    REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+    REQUIRE_NOTHROW(ms_copy.jit_lookup("f1"));
+    REQUIRE_NOTHROW(ms_copy.jit_lookup("f2"));
+
+    {
+        auto *cf1_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+            ms_copy.jit_lookup("f1"));
+        auto *cf2_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+            ms_copy.jit_lookup("f2"));
+
+        const double ins[] = {2., 3.};
+        double outs[2] = {};
+
+        cf1_ptr(outs, ins, nullptr, nullptr);
+        cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+        REQUIRE(outs[0] == 6);
+        REQUIRE(outs[1] == 2. / 3.);
+    }
+}
+
 TEST_CASE("cfunc")
 {
     using Catch::Matchers::Message;

From 85378c27ec7062fb914f5ddd83367a925e67de9c Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 16:21:19 +0200
Subject: [PATCH 24/41] CI debug: print the ir of s1 and of the multi state in the cfunc test.

---
 test/llvm_multi_state.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index fadf228ed..4b93eeb44 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -310,6 +310,11 @@ TEST_CASE("cfunc")
     REQUIRE(ms.get_ir().size() == 3u);
     REQUIRE(ms.get_bc().size() == 3u);
 
+    std::cout << "Orig:\n\n" << s1.get_ir() << "\n\n" << std::endl;
+    std::cout << "cmp0:\n\n" << ms.get_ir()[0] << "\n\n" << std::endl;
+    std::cout << "cmp1:\n\n" << ms.get_ir()[1] << "\n\n" << std::endl;
+    std::cout << "cmp2:\n\n" << ms.get_ir()[2] << "\n\n" << std::endl;
+
     // Check the first few characters of the optimised ir/bc match.
     // Cannot check the entire ir/bc because of the difference in trigger name.
     REQUIRE((s1.get_ir().substr(0, 100) == ms.get_ir()[0].substr(0, 100)

From c380ad4ec3041eec9203ef87d62e9ce3d86011bc Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 16:35:17 +0200
Subject: [PATCH 25/41] Abstract out another helper: verify_module().

---
 src/llvm_state.cpp | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 8432e5a9e..44afb83cc 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -566,6 +566,20 @@ void add_obj_to_lljit(llvm::orc::LLJIT &lljit, const std::string &obj)
     // LCOV_EXCL_STOP
 }
 
+// Helper to verify a module, throwing if verification fails.
+void verify_module(const llvm::Module &m)
+{
+    std::string out;
+    llvm::raw_string_ostream ostr(out);
+
+    if (llvm::verifyModule(m, &ostr)) {
+        // LCOV_EXCL_START
+        throw std::runtime_error(fmt::format("The verification of the module '{}' produced an error:\n{}",
+                                             m.getModuleIdentifier(), ostr.str()));
+        // LCOV_EXCL_STOP
+    }
+}
+
 } // namespace
 
 // Helper function to fetch a const ref to a global object
@@ -1377,17 +1391,7 @@ void llvm_state::compile()
     auto *logger = detail::get_logger();
 
     // Run a verification on the module before compiling.
-    {
-        std::string out;
-        llvm::raw_string_ostream ostr(out);
-
-        if (llvm::verifyModule(*m_module, &ostr)) {
-            // LCOV_EXCL_START
-            throw std::runtime_error(
-                fmt::format("The verification of the module '{}' produced an error:\n{}", m_module_name, ostr.str()));
-            // LCOV_EXCL_STOP
-        }
-    }
+    detail::verify_module(*m_module);
 
     logger->trace("module verification runtime: {}", sw);
 
@@ -2238,16 +2242,7 @@ void llvm_multi_state::compile()
     // Verify the modules before compiling.
     // NOTE: probably this can be parallelised if needed.
     for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) {
-        std::string out;
-        llvm::raw_string_ostream ostr(out);
-
-        if (llvm::verifyModule(*m_impl->m_states[i].m_module, &ostr)) [[unlikely]] {
-            // LCOV_EXCL_START
-            throw std::runtime_error(
-                fmt::format("The verification of the module at index {} in an llvm_multi_state produced an error:\n{}",
-                            i, ostr.str()));
-            // LCOV_EXCL_STOP
-        }
+        detail::verify_module(*m_impl->m_states[i].m_module);
     }
 
     logger->trace("llvm_multi_state module verification runtime: {}", sw);

From 788d387123ee2fd9f42fded88f80bd4bbd313ef5 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 16:40:26 +0200
Subject: [PATCH 26/41] Another attempt at fixing the test: compare only the first 50 chars of the bc, drop the debug output.

---
 test/llvm_multi_state.cpp | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index 4b93eeb44..65d898adc 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -310,11 +310,6 @@ TEST_CASE("cfunc")
     REQUIRE(ms.get_ir().size() == 3u);
     REQUIRE(ms.get_bc().size() == 3u);
 
-    std::cout << "Orig:\n\n" << s1.get_ir() << "\n\n" << std::endl;
-    std::cout << "cmp0:\n\n" << ms.get_ir()[0] << "\n\n" << std::endl;
-    std::cout << "cmp1:\n\n" << ms.get_ir()[1] << "\n\n" << std::endl;
-    std::cout << "cmp2:\n\n" << ms.get_ir()[2] << "\n\n" << std::endl;
-
     // Check the first few characters of the optimised ir/bc match.
     // Cannot check the entire ir/bc because of the difference in trigger name.
     REQUIRE((s1.get_ir().substr(0, 100) == ms.get_ir()[0].substr(0, 100)
@@ -324,12 +319,12 @@ TEST_CASE("cfunc")
              || s2.get_ir().substr(0, 100) == ms.get_ir()[1].substr(0, 100)
              || s2.get_ir().substr(0, 100) == ms.get_ir()[2].substr(0, 100)));
 
-    REQUIRE((s1.get_bc().substr(0, 100) == ms.get_bc()[0].substr(0, 100)
-             || s1.get_bc().substr(0, 100) == ms.get_bc()[1].substr(0, 100)
-             || s1.get_bc().substr(0, 100) == ms.get_bc()[2].substr(0, 100)));
-    REQUIRE((s2.get_bc().substr(0, 100) == ms.get_bc()[0].substr(0, 100)
-             || s2.get_bc().substr(0, 100) == ms.get_bc()[1].substr(0, 100)
-             || s2.get_bc().substr(0, 100) == ms.get_bc()[2].substr(0, 100)));
+    REQUIRE((s1.get_bc().substr(0, 50) == ms.get_bc()[0].substr(0, 50)
+             || s1.get_bc().substr(0, 50) == ms.get_bc()[1].substr(0, 50)
+             || s1.get_bc().substr(0, 50) == ms.get_bc()[2].substr(0, 50)));
+    REQUIRE((s2.get_bc().substr(0, 50) == ms.get_bc()[0].substr(0, 50)
+             || s2.get_bc().substr(0, 50) == ms.get_bc()[1].substr(0, 50)
+             || s2.get_bc().substr(0, 50) == ms.get_bc()[2].substr(0, 50)));
 
     auto *cf1_ptr
         = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f1"));

From c582677367af3487ca1b89d7ba5a7b4e72e56f28 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 17:05:06 +0200
Subject: [PATCH 27/41] Add the get_n_modules() getter and a stream operator for llvm_multi_state.

---
 include/heyoka/llvm_state.hpp |  6 ++++++
 src/llvm_state.cpp            | 26 ++++++++++++++++++++++++++
 test/llvm_multi_state.cpp     | 25 +++++++++++++++++++++++++
 3 files changed, 57 insertions(+)

diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 5c247e071..1c021ecfa 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -79,6 +79,7 @@ HEYOKA_DLL_PUBLIC const target_features &get_target_features();
 } // namespace detail
 
 HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_state &);
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &);
 
 template <typename T>
 inline std::uint32_t recommended_simd_size()
@@ -133,6 +134,7 @@ HEYOKA_BEGIN_NAMESPACE
 class HEYOKA_DLL_PUBLIC llvm_state
 {
     friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_state &);
+    friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &);
     friend class HEYOKA_DLL_PUBLIC llvm_multi_state;
 
     struct jit;
@@ -352,6 +354,8 @@ void llvm_state_mem_cache_try_insert(std::vector<std::string>, unsigned, llvm_mc
 
 class HEYOKA_DLL_PUBLIC llvm_multi_state
 {
+    friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &);
+
     struct impl;
 
     std::unique_ptr<impl> m_impl;
@@ -379,6 +383,8 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
 
     [[nodiscard]] bool is_compiled() const noexcept;
 
+    [[nodiscard]] unsigned get_n_modules() const noexcept;
+
     [[nodiscard]] bool fast_math() const noexcept;
     [[nodiscard]] bool force_avx512() const noexcept;
     [[nodiscard]] unsigned get_opt_level() const noexcept;
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 44afb83cc..5f3c4820c 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -2124,6 +2124,11 @@ void llvm_multi_state::check_uncompiled(const char *f) const
     }
 }
 
+unsigned llvm_multi_state::get_n_modules() const noexcept
+{
+    return m_impl->m_jit->m_n_modules;
+}
+
 unsigned llvm_multi_state::get_opt_level() const noexcept
 {
     return m_impl->m_states[0].get_opt_level();
@@ -2354,4 +2359,25 @@ std::uintptr_t llvm_multi_state::jit_lookup(const std::string &name)
 #endif
 }
 
+std::ostream &operator<<(std::ostream &os, const llvm_multi_state &s)
+{
+    std::ostringstream oss;
+    oss << std::boolalpha;
+
+    oss << "N of modules      : " << s.get_n_modules() << '\n';
+    oss << "Compiled          : " << s.is_compiled() << '\n';
+    oss << "Fast math         : " << s.fast_math() << '\n';
+    oss << "Force AVX512      : " << s.force_avx512() << '\n';
+    oss << "SLP vectorization : " << s.get_slp_vectorize() << '\n';
+    oss << "Code model        : " << s.get_code_model() << '\n';
+    oss << "Optimisation level: " << s.get_opt_level() << '\n';
+    oss << "Data layout       : " << s.m_impl->m_states[0].m_jitter->m_lljit->getDataLayout().getStringRepresentation()
+        << '\n';
+    oss << "Target triple     : " << s.m_impl->m_states[0].m_jitter->get_target_triple().str() << '\n';
+    oss << "Target CPU        : " << s.m_impl->m_states[0].m_jitter->get_target_cpu() << '\n';
+    oss << "Target features   : " << s.m_impl->m_states[0].m_jitter->get_target_features() << '\n';
+
+    return os << oss.str();
+}
+
 HEYOKA_END_NAMESPACE
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index 65d898adc..82613b682 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -77,6 +77,7 @@ TEST_CASE("basic")
         REQUIRE(ms.force_avx512());
         REQUIRE(ms.get_slp_vectorize());
         REQUIRE(ms.get_code_model() == code_model::large);
+        REQUIRE(ms.get_n_modules() == 5u);
 
         REQUIRE(!ms.is_compiled());
 
@@ -89,6 +90,7 @@ TEST_CASE("basic")
         REQUIRE(ms.force_avx512());
         REQUIRE(ms.get_slp_vectorize());
         REQUIRE(ms.get_code_model() == code_model::large);
+        REQUIRE(ms.get_n_modules() == 5u);
     }
 }
 
@@ -340,3 +342,26 @@ TEST_CASE("cfunc")
     REQUIRE(outs[0] == 6);
     REQUIRE(outs[1] == 2. / 3.);
 }
+
+TEST_CASE("stream op")
+{
+    auto [x, y] = make_vars("x", "y");
+
+    llvm_state s1{kw::mname = "module_0"}, s2{kw::mname = "module_1"};
+
+    add_cfunc<double>(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
+    add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
+
+    const auto orig_ir1 = s1.get_ir();
+    const auto orig_ir2 = s2.get_ir();
+
+    const auto orig_bc1 = s1.get_bc();
+    const auto orig_bc2 = s2.get_bc();
+
+    llvm_multi_state ms{{s1, s2}};
+
+    std::ostringstream oss;
+    oss << ms;
+
+    REQUIRE(!oss.str().empty());
+}

From b03accbbb47905d16a39d6ce53b8fe9452e40a7c Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 17:06:41 +0200
Subject: [PATCH 28/41] Coverage fix.

---
 test/llvm_multi_state.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index 82613b682..5d2ea026c 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -145,6 +145,9 @@ TEST_CASE("copy semantics")
         auto *cf2_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
             ms_copy.jit_lookup("f2"));
 
+        REQUIRE_THROWS_MATCHES(ms_copy.jit_lookup("f3"), std::invalid_argument,
+                               Message("Could not find the symbol 'f3' in an llvm_multi_state"));
+
         const double ins[] = {2., 3.};
         double outs[2] = {};
 

From f39fcb01c59e49a059aeda42153fc7acafc824db Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 17:32:42 +0200
Subject: [PATCH 29/41] More testing.

---
 src/llvm_state.cpp        |  6 ++++++
 test/llvm_multi_state.cpp | 12 ++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 5f3c4820c..095469349 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1815,6 +1815,12 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
 
     // Settings in all states must be consistent.
     auto states_differ = [](const llvm_state &s1, const llvm_state &s2) {
+        // NOTE: bit of paranoia here.
+        assert(s1.m_jitter->m_lljit->getDataLayout() == s2.m_jitter->m_lljit->getDataLayout());
+        assert(s1.m_jitter->get_target_triple() == s2.m_jitter->get_target_triple());
+        assert(s1.m_jitter->get_target_cpu() == s2.m_jitter->get_target_cpu());
+        assert(s1.m_jitter->get_target_features() == s2.m_jitter->get_target_features());
+
         if (s1.get_opt_level() != s2.get_opt_level()) {
             return true;
         }
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index 5d2ea026c..f35035672 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -324,12 +324,12 @@ TEST_CASE("cfunc")
              || s2.get_ir().substr(0, 100) == ms.get_ir()[1].substr(0, 100)
              || s2.get_ir().substr(0, 100) == ms.get_ir()[2].substr(0, 100)));
 
-    REQUIRE((s1.get_bc().substr(0, 50) == ms.get_bc()[0].substr(0, 50)
-             || s1.get_bc().substr(0, 50) == ms.get_bc()[1].substr(0, 50)
-             || s1.get_bc().substr(0, 50) == ms.get_bc()[2].substr(0, 50)));
-    REQUIRE((s2.get_bc().substr(0, 50) == ms.get_bc()[0].substr(0, 50)
-             || s2.get_bc().substr(0, 50) == ms.get_bc()[1].substr(0, 50)
-             || s2.get_bc().substr(0, 50) == ms.get_bc()[2].substr(0, 50)));
+    REQUIRE((s1.get_bc().substr(0, 25) == ms.get_bc()[0].substr(0, 25)
+             || s1.get_bc().substr(0, 25) == ms.get_bc()[1].substr(0, 25)
+             || s1.get_bc().substr(0, 25) == ms.get_bc()[2].substr(0, 25)));
+    REQUIRE((s2.get_bc().substr(0, 25) == ms.get_bc()[0].substr(0, 25)
+             || s2.get_bc().substr(0, 25) == ms.get_bc()[1].substr(0, 25)
+             || s2.get_bc().substr(0, 25) == ms.get_bc()[2].substr(0, 25)));
 
     auto *cf1_ptr
         = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f1"));

From 6d2196451638df832aff86786e47556c55564aaa Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 18:44:46 +0200
Subject: [PATCH 30/41] More CI debug.

---
 test/llvm_multi_state.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index f35035672..074805c7a 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -14,6 +14,9 @@
 #include <heyoka/llvm_state.hpp>
 #include <heyoka/s11n.hpp>
 
+#include <fmt/ranges.h>
+#include <ranges>
+
 #include "catch.hpp"
 
 using namespace heyoka;
@@ -324,6 +327,22 @@ TEST_CASE("cfunc")
              || s2.get_ir().substr(0, 100) == ms.get_ir()[1].substr(0, 100)
              || s2.get_ir().substr(0, 100) == ms.get_ir()[2].substr(0, 100)));
 
+    std::cout << "orig:" << std::endl << std::endl;
+    fmt::print("{}\n\n\n",
+               s1.get_bc().substr(0, 25) | std::views::transform([](auto c) { return static_cast<int>(c); }));
+
+    std::cout << "pos0:" << std::endl << std::endl;
+    fmt::print("{}\n\n\n",
+               ms.get_bc()[0].substr(0, 25) | std::views::transform([](auto c) { return static_cast<int>(c); }));
+
+    std::cout << "pos1:" << std::endl << std::endl;
+    fmt::print("{}\n\n\n",
+               ms.get_bc()[1].substr(0, 25) | std::views::transform([](auto c) { return static_cast<int>(c); }));
+
+    std::cout << "pos2:" << std::endl << std::endl;
+    fmt::print("{}\n\n\n",
+               ms.get_bc()[2].substr(0, 25) | std::views::transform([](auto c) { return static_cast<int>(c); }));
+
     REQUIRE((s1.get_bc().substr(0, 25) == ms.get_bc()[0].substr(0, 25)
              || s1.get_bc().substr(0, 25) == ms.get_bc()[1].substr(0, 25)
              || s1.get_bc().substr(0, 25) == ms.get_bc()[2].substr(0, 25)));

From e428f071c381e04e8baad0264eefba10066e3bf5 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 19:10:34 +0200
Subject: [PATCH 31/41] More testing: post-compile ordering, memcache.

---
 test/llvm_multi_state.cpp | 80 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index 074805c7a..6e54a66cc 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -387,3 +387,83 @@ TEST_CASE("stream op")
 
     REQUIRE(!oss.str().empty());
 }
+
+// A test to check that, post compilation, snapshots and object files
+// are ordered deterministically.
+TEST_CASE("post compile ordering")
+{
+    auto [x, y] = make_vars("x", "y");
+
+    llvm_state s1, s2, s3, s4;
+
+    add_cfunc<double>(s1, "f1", {x * y}, {x, y});
+    add_cfunc<double>(s2, "f2", {x / y}, {x, y});
+    add_cfunc<double>(s3, "f3", {x + y}, {x, y});
+    add_cfunc<double>(s4, "f4", {x - y}, {x, y});
+
+    llvm_state::clear_memcache();
+
+    llvm_multi_state ms{{s1, s2, s3, s4}};
+    ms.compile();
+
+    const auto orig_obj = ms.get_object_code();
+    const auto orig_ir = ms.get_ir();
+    const auto orig_bc = ms.get_bc();
+
+    for (auto i = 0; i < 20; ++i) {
+        llvm_state::clear_memcache();
+
+        llvm_multi_state ms2{{s1, s2, s3, s4}};
+        ms2.compile();
+
+        REQUIRE(ms2.get_object_code() == orig_obj);
+        REQUIRE(ms2.get_ir() == orig_ir);
+        REQUIRE(ms2.get_bc() == orig_bc);
+    }
+}
+
+TEST_CASE("memcache testing")
+{
+    auto [x, y] = make_vars("x", "y");
+
+    llvm_state s1, s2, s3, s4;
+
+    add_cfunc<double>(s1, "f1", {x * y}, {x, y});
+    add_cfunc<double>(s2, "f2", {x / y}, {x, y});
+    add_cfunc<double>(s3, "f3", {x + y}, {x, y});
+    add_cfunc<double>(s4, "f4", {x - y}, {x, y});
+
+    llvm_state::clear_memcache();
+
+    llvm_multi_state ms{{s1, s2, s3, s4}};
+    ms.compile();
+
+    const auto cur_cache_size = llvm_state::get_memcache_size();
+
+    llvm_multi_state ms2{{s1, s2, s3, s4}};
+    ms2.compile();
+
+    REQUIRE(cur_cache_size == llvm_state::get_memcache_size());
+
+    auto *cf1_ptr
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f1"));
+    auto *cf2_ptr
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f2"));
+    auto *cf3_ptr
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f3"));
+    auto *cf4_ptr
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f4"));
+
+    const double ins[] = {2., 3.};
+    double outs[4] = {};
+
+    cf1_ptr(outs, ins, nullptr, nullptr);
+    cf2_ptr(outs + 1, ins, nullptr, nullptr);
+    cf3_ptr(outs + 2, ins, nullptr, nullptr);
+    cf4_ptr(outs + 3, ins, nullptr, nullptr);
+
+    REQUIRE(outs[0] == 6);
+    REQUIRE(outs[1] == 2. / 3.);
+    REQUIRE(outs[2] == 5);
+    REQUIRE(outs[3] == -1);
+}

From fa11165485894db192e38fa4d5b46cc0a32a122d Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 19:38:10 +0200
Subject: [PATCH 32/41] Remove the problematic ir/bc checks in the cfunc test.

---
 test/llvm_multi_state.cpp | 35 -----------------------------------
 1 file changed, 35 deletions(-)

diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index 6e54a66cc..c6dfb419f 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -14,9 +14,6 @@
 #include <heyoka/llvm_state.hpp>
 #include <heyoka/s11n.hpp>
 
-#include <fmt/ranges.h>
-#include <ranges>
-
 #include "catch.hpp"
 
 using namespace heyoka;
@@ -318,38 +315,6 @@ TEST_CASE("cfunc")
     REQUIRE(ms.get_ir().size() == 3u);
     REQUIRE(ms.get_bc().size() == 3u);
 
-    // Check the first few characters of the optimised ir/bc match.
-    // Cannot check the entire ir/bc because of the difference in trigger name.
-    REQUIRE((s1.get_ir().substr(0, 100) == ms.get_ir()[0].substr(0, 100)
-             || s1.get_ir().substr(0, 100) == ms.get_ir()[1].substr(0, 100)
-             || s1.get_ir().substr(0, 100) == ms.get_ir()[2].substr(0, 100)));
-    REQUIRE((s2.get_ir().substr(0, 100) == ms.get_ir()[0].substr(0, 100)
-             || s2.get_ir().substr(0, 100) == ms.get_ir()[1].substr(0, 100)
-             || s2.get_ir().substr(0, 100) == ms.get_ir()[2].substr(0, 100)));
-
-    std::cout << "orig:" << std::endl << std::endl;
-    fmt::print("{}\n\n\n",
-               s1.get_bc().substr(0, 25) | std::views::transform([](auto c) { return static_cast<int>(c); }));
-
-    std::cout << "pos0:" << std::endl << std::endl;
-    fmt::print("{}\n\n\n",
-               ms.get_bc()[0].substr(0, 25) | std::views::transform([](auto c) { return static_cast<int>(c); }));
-
-    std::cout << "pos1:" << std::endl << std::endl;
-    fmt::print("{}\n\n\n",
-               ms.get_bc()[1].substr(0, 25) | std::views::transform([](auto c) { return static_cast<int>(c); }));
-
-    std::cout << "pos2:" << std::endl << std::endl;
-    fmt::print("{}\n\n\n",
-               ms.get_bc()[2].substr(0, 25) | std::views::transform([](auto c) { return static_cast<int>(c); }));
-
-    REQUIRE((s1.get_bc().substr(0, 25) == ms.get_bc()[0].substr(0, 25)
-             || s1.get_bc().substr(0, 25) == ms.get_bc()[1].substr(0, 25)
-             || s1.get_bc().substr(0, 25) == ms.get_bc()[2].substr(0, 25)));
-    REQUIRE((s2.get_bc().substr(0, 25) == ms.get_bc()[0].substr(0, 25)
-             || s2.get_bc().substr(0, 25) == ms.get_bc()[1].substr(0, 25)
-             || s2.get_bc().substr(0, 25) == ms.get_bc()[2].substr(0, 25)));
-
     auto *cf1_ptr
         = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f1"));
     auto *cf2_ptr

From 2ed71728c1b4d0ce85cac4230cd3d263e2dcac7f Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 3 Aug 2024 21:53:03 +0200
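Subject: [PATCH 33/41] Fix two horrid bugs that were bizarrely hidden by a
 series of coincidences.

The IR transform that optimises the modules and records the IR/bitcode
snapshots is now installed by the multi_jit constructor, so that it is
also in place for the jits created by the llvm_multi_state copy
constructor and by load(). The llvm_multi_state constructor no longer
renames the modules of the input states, and the master module now uses
the reserved name "heyoka.master", which is rejected as a module name
for the input states.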

---
 src/llvm_state.cpp        | 153 ++++++++++++++++++++------------------
 test/llvm_multi_state.cpp | 105 +++++++++++++++++++++++++-
 2 files changed, 183 insertions(+), 75 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 095469349..fefdb021e 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1621,7 +1621,7 @@ struct multi_jit {
     std::vector<std::string> m_ir_snapshots;
     std::vector<std::string> m_bc_snapshots;
 
-    explicit multi_jit(unsigned, unsigned, code_model);
+    explicit multi_jit(unsigned, unsigned, code_model, bool, bool);
     multi_jit(const multi_jit &) = delete;
     multi_jit(multi_jit &&) noexcept = delete;
     llvm_multi_state &operator=(const multi_jit &) = delete;
@@ -1666,9 +1666,13 @@ class tbb_task_dispatcher : public llvm::orc::TaskDispatcher
 
 #endif
 
+// Reserved identifier for the master module in an llvm_multi_state.
+constexpr auto master_module_name = "heyoka.master";
+
 // NOTE: this largely replicates the logic from the constructors of llvm_state and llvm_state::jit.
 // NOTE: make sure to coordinate changes in this constructor with llvm_state::jit.
-multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model) : m_n_modules(n_modules)
+multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, bool force_avx512, bool slp_vectorize)
+    : m_n_modules(n_modules)
 {
     assert(n_modules >= 2u);
 
@@ -1766,6 +1770,67 @@ multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model)
         return llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>(std::move(obj_buffer));
     });
 
+    // Setup the machinery to run the optimisation passes on the modules.
+    m_lljit->getIRTransformLayer().setTransform(
+        [this, opt_level, force_avx512, slp_vectorize, c_model](llvm::orc::ThreadSafeModule TSM,
+                                                                llvm::orc::MaterializationResponsibility &) {
+            // See here for an explanation of what withModuleDo() entails:
+            //
+            // https://groups.google.com/g/llvm-dev/c/QauU4L_bHac
+            //
+            // In our case, the locking/thread safety aspect is not important as we are not sharing
+            // contexts between threads. More references from discord:
+            //
+            // https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328
+            // https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892
+            TSM.withModuleDo([this, opt_level, force_avx512, slp_vectorize, c_model](llvm::Module &M) {
+                // NOTE: don't run any optimisation on the master module.
+                if (M.getModuleIdentifier() != master_module_name) {
+                    // NOTE: running the optimisation passes requires mutable access to a target
+                    // machine. Thus, we create a new target machine per thread in order to avoid likely data races
+                    // with a shared target machine.
+
+                    // Fetch a target machine builder.
+                    auto jtmb = detail::create_jit_tmb(opt_level, c_model);
+
+                    // Try creating the target machine.
+                    auto tm = jtmb.createTargetMachine();
+                    // LCOV_EXCL_START
+                    if (!tm) [[unlikely]] {
+                        throw std::invalid_argument("Error creating the target machine");
+                    }
+                    // LCOV_EXCL_STOP
+
+                    // NOTE: we used to fetch the target triple from the lljit object,
+                    // but recently we switched to asking the target triple directly
+                    // from the target machine. Assert equality between the two for a while,
+                    // just in case.
+                    // NOTE: lljit.getTargetTriple() just returns a const ref to an internal
+                    // object, it should be ok with concurrent invocation.
+                    assert(m_lljit->getTargetTriple() == (*tm)->getTargetTriple());
+                    // NOTE: the target triple is also available in the module.
+                    assert(m_lljit->getTargetTriple().str() == M.getTargetTriple());
+
+                    // Optimise the module.
+                    detail::optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize);
+                } else {
+                    ;
+                }
+
+                // Store the optimised bitcode/IR for this module.
+                auto bc_snap = detail::bc_from_module(M);
+                auto ir_snap = detail::ir_from_module(M);
+
+                // NOTE: protect for multi-threaded access.
+                std::lock_guard lock{m_ir_bc_mutex};
+
+                m_bc_snapshots.push_back(std::move(bc_snap));
+                m_ir_snapshots.push_back(std::move(ir_snap));
+            });
+
+            return llvm::Expected<llvm::orc::ThreadSafeModule>(std::move(TSM));
+        });
+
     // Setup the jit so that it can look up symbols from the current process.
     auto dlsg
         = llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(m_lljit->getDataLayout().getGlobalPrefix());
@@ -1780,7 +1845,7 @@ multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model)
     m_ctx = std::make_unique<llvm::orc::ThreadSafeContext>(std::make_unique<llvm::LLVMContext>());
 
     // Create the master module.
-    m_module = std::make_unique<llvm::Module>("master", context());
+    m_module = std::make_unique<llvm::Module>(master_module_name, context());
     // Setup the data layout and the target triple.
     m_module->setDataLayout(m_lljit->getDataLayout());
     m_module->setTargetTriple(m_lljit->getTargetTriple().str());
@@ -1813,6 +1878,15 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
         throw std::invalid_argument("An llvm_multi_state can be constructed only from uncompiled llvm_state objects");
     }
 
+    // Module names must not collide with master_module_name.
+    if (std::ranges::any_of(states, [](const auto &s) { return s.module_name() == detail::master_module_name; }))
+        [[unlikely]] {
+        throw std::invalid_argument(
+            fmt::format("An invalid llvm_state was passed to the constructor of an llvm_multi_state: the module name "
+                        "'{}' is reserved for internal use by llvm_multi_state",
+                        detail::master_module_name));
+    }
+
     // Settings in all states must be consistent.
     auto states_differ = [](const llvm_state &s1, const llvm_state &s2) {
         // NOTE: bit of paranoia here.
@@ -1855,74 +1929,9 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
     const auto force_avx512 = states[0].force_avx512();
     const auto slp_vectorize = states[0].get_slp_vectorize();
 
-    // Rename all states.
-    for (decltype(states.size()) i = 0; i < states.size(); ++i) {
-        const auto new_mname = fmt::format("module_{}", i);
-        states[i].m_module->setModuleIdentifier(new_mname.c_str());
-    }
-
     // Create the multi_jit.
     auto jit = std::make_unique<detail::multi_jit>(boost::safe_numerics::safe<unsigned>(states.size()) + 1, opt_level,
-                                                   c_model);
-
-    // In the master jit, setup the machinery to run the optimisation passes on the modules.
-    jit->m_lljit->getIRTransformLayer().setTransform(
-        [&j = *jit, opt_level, force_avx512, slp_vectorize, c_model](llvm::orc::ThreadSafeModule TSM,
-                                                                     llvm::orc::MaterializationResponsibility &) {
-            // See here for an explanation of what withModuleDo() entails:
-            //
-            // https://groups.google.com/g/llvm-dev/c/QauU4L_bHac
-            //
-            // In our case, the locking/thread safety aspect is not important as we are not sharing
-            // contexts between threads. More references from discord:
-            //
-            // https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328
-            // https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892
-            TSM.withModuleDo([&j, opt_level, force_avx512, slp_vectorize, c_model](llvm::Module &M) {
-                // NOTE: don't run any optimisation on the master module.
-                if (M.getModuleIdentifier() != "master") {
-                    // NOTE: running the optimisation passes requires mutable access to a target
-                    // machine. Thus, we create a new target machine per thread in order to avoid likely data races
-                    // with a shared target machine.
-
-                    // Fetch a target machine builder.
-                    auto jtmb = detail::create_jit_tmb(opt_level, c_model);
-
-                    // Try creating the target machine.
-                    auto tm = jtmb.createTargetMachine();
-                    // LCOV_EXCL_START
-                    if (!tm) [[unlikely]] {
-                        throw std::invalid_argument("Error creating the target machine");
-                    }
-                    // LCOV_EXCL_STOP
-
-                    // NOTE: we used to fetch the target triple from the lljit object,
-                    // but recently we switched to asking the target triple directly
-                    // from the target machine. Assert equality between the two for a while,
-                    // just in case.
-                    // NOTE: lljit.getTargetTriple() just returns a const ref to an internal
-                    // object, it should be ok with concurrent invocation.
-                    assert(j.m_lljit->getTargetTriple() == (*tm)->getTargetTriple());
-                    // NOTE: the target triple is also available in the module.
-                    assert(j.m_lljit->getTargetTriple().str() == M.getTargetTriple());
-
-                    // Optimise the module.
-                    detail::optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize);
-                }
-
-                // Store the optimised bitcode/IR for this module.
-                auto bc_snap = detail::bc_from_module(M);
-                auto ir_snap = detail::ir_from_module(M);
-
-                // NOTE: protect for multi-threaded access.
-                std::lock_guard lock{j.m_ir_bc_mutex};
-
-                j.m_bc_snapshots.push_back(std::move(bc_snap));
-                j.m_ir_snapshots.push_back(std::move(ir_snap));
-            });
-
-            return llvm::Expected<llvm::orc::ThreadSafeModule>(std::move(TSM));
-        });
+                                                   c_model, force_avx512, slp_vectorize);
 
     // Build and assign the implementation.
     impl imp{.m_states = std::move(states), .m_jit = std::move(jit)};
@@ -1935,7 +1944,8 @@ llvm_multi_state::llvm_multi_state(const llvm_multi_state &other)
     // This will work regardless of whether other is compiled or not.
     impl imp{.m_states = other.m_impl->m_states,
              .m_jit = std::make_unique<detail::multi_jit>(other.m_impl->m_jit->m_n_modules, other.get_opt_level(),
-                                                          other.get_code_model())};
+                                                          other.get_code_model(), other.force_avx512(),
+                                                          other.get_slp_vectorize())};
     m_impl = std::make_unique<impl>(std::move(imp));
 
     if (other.is_compiled()) {
@@ -2013,7 +2023,8 @@ void llvm_multi_state::load(boost::archive::binary_iarchive &ar, unsigned)
 
         // Reset the jit with a new one.
         m_impl->m_jit = std::make_unique<detail::multi_jit>(
-            boost::safe_numerics::safe<unsigned>(m_impl->m_states.size()) + 1, get_opt_level(), get_code_model());
+            boost::safe_numerics::safe<unsigned>(m_impl->m_states.size()) + 1, get_opt_level(), get_code_model(),
+            force_avx512(), get_slp_vectorize());
 
         // Load the object files and the snapshots.
         ar >> m_impl->m_jit->m_object_files;
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index c6dfb419f..af147a9ff 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -6,12 +6,21 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <cmath>
 #include <sstream>
 #include <stdexcept>
 
+#include <boost/algorithm/string/find_iterator.hpp>
+#include <boost/algorithm/string/finder.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include <llvm/Config/llvm-config.h>
+
+#include <heyoka/config.hpp>
 #include <heyoka/expression.hpp>
 #include <heyoka/kw.hpp>
 #include <heyoka/llvm_state.hpp>
+#include <heyoka/math/erf.hpp>
 #include <heyoka/s11n.hpp>
 
 #include "catch.hpp"
@@ -65,6 +74,15 @@ TEST_CASE("basic")
             Message("An llvm_multi_state can be constructed only from uncompiled llvm_state objects"));
     }
 
+    {
+        // Invalid module name.
+        llvm_state s{kw::mname = "heyoka.master"};
+        REQUIRE_THROWS_MATCHES(
+            (llvm_multi_state{{s, llvm_state{}}}), std::invalid_argument,
+            Message("An invalid llvm_state was passed to the constructor of an llvm_multi_state: the module name "
+                    "'heyoka.master' is reserved for internal use by llvm_multi_state"));
+    }
+
     // Test the property getters.
     {
         llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true,
@@ -98,9 +116,13 @@ TEST_CASE("copy semantics")
 {
     using Catch::Matchers::Message;
 
+    // NOTE: in order to properly test this, we have to disable the cache.
+    llvm_state::clear_memcache();
+    llvm_state::set_memcache_limit(0);
+
     auto [x, y] = make_vars("x", "y");
 
-    llvm_state s1{kw::mname = "module_0"}, s2{kw::mname = "module_1"};
+    llvm_state s1, s2;
 
     add_cfunc<double>(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
     add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
@@ -187,15 +209,22 @@ TEST_CASE("copy semantics")
         REQUIRE(outs[0] == 6);
         REQUIRE(outs[1] == 2. / 3.);
     }
+
+    // Restore the cache.
+    llvm_state::set_memcache_limit(100'000'000ull);
 }
 
 TEST_CASE("s11n")
 {
     using Catch::Matchers::Message;
 
+    // NOTE: in order to properly test this, we have to disable the cache.
+    llvm_state::clear_memcache();
+    llvm_state::set_memcache_limit(0);
+
     auto [x, y] = make_vars("x", "y");
 
-    llvm_state s1{kw::mname = "module_0"}, s2{kw::mname = "module_1"};
+    llvm_state s1, s2;
 
     add_cfunc<double>(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
     add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
@@ -274,6 +303,9 @@ TEST_CASE("s11n")
         REQUIRE(outs[0] == 6);
         REQUIRE(outs[1] == 2. / 3.);
     }
+
+    // Restore the cache.
+    llvm_state::set_memcache_limit(100'000'000ull);
 }
 
 TEST_CASE("cfunc")
@@ -283,7 +315,7 @@ TEST_CASE("cfunc")
     // Basic test.
     auto [x, y] = make_vars("x", "y");
 
-    llvm_state s1{kw::mname = "module_0"}, s2{kw::mname = "module_1"};
+    llvm_state s1, s2;
 
     add_cfunc<double>(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
     add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
@@ -334,7 +366,7 @@ TEST_CASE("stream op")
 {
     auto [x, y] = make_vars("x", "y");
 
-    llvm_state s1{kw::mname = "module_0"}, s2{kw::mname = "module_1"};
+    llvm_state s1, s2;
 
     add_cfunc<double>(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
     add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
@@ -432,3 +464,68 @@ TEST_CASE("memcache testing")
     REQUIRE(outs[2] == 5);
     REQUIRE(outs[3] == -1);
 }
+
+#if 0
+
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
+{
+    for (auto fast_math : {false, true}) {
+        llvm_state s1{kw::slp_vectorize = true, kw::fast_math = fast_math};
+        llvm_state s2{kw::slp_vectorize = true, kw::fast_math = fast_math};
+
+        auto [a, b] = make_vars("a", "b");
+
+        add_cfunc<double>(s1, "cfunc", {erf(a), erf(b)}, {a, b});
+        add_cfunc<double>(s2, "cfuncs", {erf(a), erf(b)}, {a, b}, kw::strided = true);
+
+        llvm_multi_state ms{{s1, s2}};
+
+        ms.compile();
+
+        // NOTE: autovec with external scalar functions seems to work
+        // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+        const auto &tf = detail::get_target_features();
+
+        for (auto ir : ms.get_ir()) {
+            using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+            auto count = 0u;
+            for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erf", boost::is_iequal()));
+                 it != string_find_iterator(); ++it) {
+                ++count;
+            }
+
+            if (count == 0u) {
+                continue;
+            }
+
+            // NOTE: at the moment we have comprehensive coverage of LLVM versions
+            // in the CI only for x86_64.
+            if (tf.sse2) {
+                // NOTE: occurrences of the scalar version:
+                // - 2 calls in the strided cfunc,
+                // - 1 declaration.
+                REQUIRE(count == 3u);
+            }
+
+            if (tf.aarch64) {
+                REQUIRE(count == 3u);
+            }
+
+            // NOTE: currently no auto-vectorization happens on ppc64 due apparently
+            // to the way the target machine is being set up by orc/lljit (it works
+            // fine with the opt tool). When this is resolved, we can test ppc64 too.
+
+            // if (tf.vsx) {
+            //     REQUIRE(count == 3u);
+            // }
+        }
+
+#endif
+    }
+}
+
+#endif

From 9e37b0a4921a534f76c3bab3d5950dd5a85971b3 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 4 Aug 2024 10:02:24 +0200
Subject: [PATCH 34/41] Tweaks, doc additions, coverage fixes.

---
 src/llvm_state.cpp        | 27 ++++++++++-----
 test/llvm_multi_state.cpp | 69 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index fefdb021e..16097f0bd 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -532,11 +532,13 @@ class string_view_mem_buffer final : public llvm::MemoryBuffer
         // https://llvm.org/doxygen/MemoryBuffer_8cpp_source.html
         this->init(s.data(), s.data() + s.size(), true);
     }
+    // LCOV_EXCL_START
     llvm::MemoryBuffer::BufferKind getBufferKind() const final
     {
         // Hopefully std::string is not memory-mapped...
         return llvm::MemoryBuffer::BufferKind::MemoryBuffer_Malloc;
     }
+    // LCOV_EXCL_STOP
 };
 
 // Helper to add an object file to the jit, throwing in case of errors.
@@ -1866,8 +1868,12 @@ struct llvm_multi_state::impl {
 
 llvm_multi_state::llvm_multi_state() = default;
 
-llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
+llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states_)
 {
+    // Fetch a const ref, as we want to make extra sure we do not modify
+    // states_ until we move it to construct the impl.
+    const auto &states = states_;
+
     // We need at least 1 state.
     if (states.empty()) [[unlikely]] {
         throw std::invalid_argument("At least 1 llvm_state object is needed to construct an llvm_multi_state");
@@ -1934,7 +1940,7 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states)
                                                    c_model, force_avx512, slp_vectorize);
 
     // Build and assign the implementation.
-    impl imp{.m_states = std::move(states), .m_jit = std::move(jit)};
+    impl imp{.m_states = std::move(states_), .m_jit = std::move(jit)};
     m_impl = std::make_unique<impl>(std::move(imp));
 }
 
@@ -1942,6 +1948,8 @@ llvm_multi_state::llvm_multi_state(const llvm_multi_state &other)
 {
     // NOTE: start off by creating a new jit and copying the states.
     // This will work regardless of whether other is compiled or not.
+    // No need to do any validation on the states as they are coming
+    // from an llvm_multi_state and they have been checked already.
     impl imp{.m_states = other.m_impl->m_states,
              .m_jit = std::make_unique<detail::multi_jit>(other.m_impl->m_jit->m_n_modules, other.get_opt_level(),
                                                           other.get_code_model(), other.force_avx512(),
@@ -1949,15 +1957,18 @@ llvm_multi_state::llvm_multi_state(const llvm_multi_state &other)
     m_impl = std::make_unique<impl>(std::move(imp));
 
     if (other.is_compiled()) {
-        // 'other' was compiled. Reset builder and module, copy over the snapshots
-        // and the object files, and add the files to the jit.
+        // 'other' was compiled.
+
+        // Reset builder and module.
         m_impl->m_jit->m_module.reset();
         m_impl->m_jit->m_builder.reset();
 
+        // Copy over the snapshots and the object files.
         m_impl->m_jit->m_object_files = other.m_impl->m_jit->m_object_files;
         m_impl->m_jit->m_ir_snapshots = other.m_impl->m_jit->m_ir_snapshots;
         m_impl->m_jit->m_bc_snapshots = other.m_impl->m_jit->m_bc_snapshots;
 
+        // Add the files to the jit.
         for (const auto &obj : m_impl->m_jit->m_object_files) {
             detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj);
         }
@@ -2046,11 +2057,11 @@ void llvm_multi_state::load(boost::archive::binary_iarchive &ar, unsigned)
         }
 
         // Debug checks.
-        assert(m_impl->m_jit->m_object_files.empty()
+        assert((m_impl->m_jit->m_object_files.empty() && !cmp)
                || m_impl->m_jit->m_object_files.size() == m_impl->m_jit->m_n_modules);
-        assert(m_impl->m_jit->m_ir_snapshots.empty()
+        assert((m_impl->m_jit->m_object_files.empty() && !cmp)
                || m_impl->m_jit->m_ir_snapshots.size() == m_impl->m_jit->m_n_modules);
-        assert(m_impl->m_jit->m_bc_snapshots.empty()
+        assert((m_impl->m_jit->m_object_files.empty() && !cmp)
                || m_impl->m_jit->m_bc_snapshots.size() == m_impl->m_jit->m_n_modules);
 
         // LCOV_EXCL_START
@@ -2305,7 +2316,7 @@ void llvm_multi_state::compile()
 
             // NOTE: here it is important that we replicate the logic happening
             // in llvm_state::compile(): clear out module/builder, construct
-            // the object file.
+            // the object file. The snapshots can be left empty.
             for (auto &s : m_impl->m_states) {
                 s.m_module.reset();
                 s.m_builder.reset();
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index af147a9ff..9490a0faf 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -96,19 +96,52 @@ TEST_CASE("basic")
         REQUIRE(ms.get_slp_vectorize());
         REQUIRE(ms.get_code_model() == code_model::large);
         REQUIRE(ms.get_n_modules() == 5u);
-
         REQUIRE(!ms.is_compiled());
 
         ms.compile();
 
         REQUIRE(ms.is_compiled());
-
         REQUIRE(ms.get_opt_level() == 1u);
         REQUIRE(ms.fast_math());
         REQUIRE(ms.force_avx512());
         REQUIRE(ms.get_slp_vectorize());
         REQUIRE(ms.get_code_model() == code_model::large);
         REQUIRE(ms.get_n_modules() == 5u);
+
+        REQUIRE_THROWS_MATCHES(
+            ms.compile(), std::invalid_argument,
+            Message("The function 'compile' can be invoked only if the llvm_multi_state has not been compiled yet"));
+    }
+
+    // Move construction/assignment.
+    {
+        llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true,
+                     kw::code_model = code_model::large};
+
+        llvm_multi_state ms{{s, s, s, s}};
+
+        auto ms2 = std::move(ms);
+
+        REQUIRE(ms2.get_opt_level() == 1u);
+        REQUIRE(ms2.fast_math());
+        REQUIRE(ms2.force_avx512());
+        REQUIRE(ms2.get_slp_vectorize());
+        REQUIRE(ms2.get_code_model() == code_model::large);
+        REQUIRE(ms2.get_n_modules() == 5u);
+        REQUIRE(!ms2.is_compiled());
+
+        ms2.compile();
+
+        llvm_multi_state ms3;
+        ms3 = std::move(ms2);
+
+        REQUIRE(ms3.is_compiled());
+        REQUIRE(ms3.get_opt_level() == 1u);
+        REQUIRE(ms3.fast_math());
+        REQUIRE(ms3.force_avx512());
+        REQUIRE(ms3.get_slp_vectorize());
+        REQUIRE(ms3.get_code_model() == code_model::large);
+        REQUIRE(ms3.get_n_modules() == 5u);
     }
 }
 
@@ -210,6 +243,38 @@ TEST_CASE("copy semantics")
         REQUIRE(outs[1] == 2. / 3.);
     }
 
+    // Test also copy assignment.
+    llvm_multi_state ms_copy3;
+    ms_copy3 = ms_copy2;
+
+    REQUIRE(ms_copy3.get_bc() == ms.get_bc());
+    REQUIRE(ms_copy3.get_ir() == ms.get_ir());
+    REQUIRE(ms_copy3.get_object_code() == ms.get_object_code());
+    REQUIRE(ms_copy3.is_compiled() == ms.is_compiled());
+    REQUIRE(ms_copy3.fast_math() == ms.fast_math());
+    REQUIRE(ms_copy3.force_avx512() == ms.force_avx512());
+    REQUIRE(ms_copy3.get_opt_level() == ms.get_opt_level());
+    REQUIRE(ms_copy3.get_slp_vectorize() == ms.get_slp_vectorize());
+    REQUIRE(ms_copy3.get_code_model() == ms.get_code_model());
+    REQUIRE_NOTHROW(ms_copy3.jit_lookup("f1"));
+    REQUIRE_NOTHROW(ms_copy3.jit_lookup("f2"));
+
+    {
+        auto *cf1_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+            ms_copy3.jit_lookup("f1"));
+        auto *cf2_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+            ms_copy3.jit_lookup("f2"));
+
+        const double ins[] = {2., 3.};
+        double outs[2] = {};
+
+        cf1_ptr(outs, ins, nullptr, nullptr);
+        cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+        REQUIRE(outs[0] == 6);
+        REQUIRE(outs[1] == 2. / 3.);
+    }
+
     // Restore the cache.
     llvm_state::set_memcache_limit(100'000'000ull);
 }

From 31779cd91cfed1b02ea67a8b4a9243f80d46c2ca Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 4 Aug 2024 10:58:07 +0200
Subject: [PATCH 35/41] New approach for preventing the optimisation of the
 vector variants.

---
 src/detail/llvm_helpers.cpp | 99 +++++++++++++++++++++++++++----------
 src/llvm_state.cpp          |  4 +-
 test/llvm_multi_state.cpp   |  4 --
 3 files changed, 74 insertions(+), 33 deletions(-)

diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index bbf7c26c3..7faba2443 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -31,6 +31,7 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/core/demangle.hpp>
 #include <boost/numeric/conversion/cast.hpp>
+#include <boost/safe_numerics/safe_integer.hpp>
 
 #include <fmt/format.h>
 #include <fmt/ranges.h>
@@ -271,6 +272,74 @@ llvm::AttributeList llvm_ext_math_func_attrs(llvm_state &s)
     return f->getAttributes();
 }
 
+// Add a pointer to the llvm.used global variable of a module:
+//
+// https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable
+//
+// If the llvm.used variable does not exist yet, create it.
+//
+// NOTE: this has quadratic complexity. It should not be a problem
+// for the type of use we do as we expect just a few entries in this
+// array, but something to keep in mind.
+void llvm_append_used(llvm_state &s, llvm::Constant *ptr)
+{
+    assert(ptr != nullptr);
+    assert(ptr->getType()->isPointerTy());
+
+    auto &md = s.module();
+    auto &ctx = s.context();
+
+    // Fetch the pointer type.
+    auto *ptr_type = llvm::PointerType::getUnqual(ctx);
+
+    if (auto *orig_used = md.getGlobalVariable("llvm.used")) {
+        // The llvm.used variable exists already.
+
+        // Fetch the original initializer.
+        assert(orig_used->hasInitializer());
+        auto *orig_init = llvm::cast<llvm::ConstantArray>(orig_used->getInitializer());
+
+        // Construct a new initializer with the original values
+        // plus the new pointer.
+        std::vector<llvm::Constant *> arr_values;
+        arr_values.reserve(
+            boost::safe_numerics::safe<decltype(arr_values.size())>(orig_init->getType()->getNumElements()) + 1);
+        for (decltype(orig_init->getType()->getNumElements()) i = 0; i < orig_init->getType()->getNumElements(); ++i) {
+            auto *orig_el = orig_init->getAggregateElement(boost::numeric_cast<unsigned>(i));
+            assert(orig_el->getType()->isPointerTy());
+
+            // NOTE: if ptr was already in the llvm.used vector, just bail
+            // out early.
+            if (orig_el->isElementWiseEqual(ptr)) {
+                return;
+            }
+
+            arr_values.push_back(orig_el);
+        }
+        arr_values.push_back(ptr);
+
+        // Create the new array.
+        auto *used_array_type = llvm::ArrayType::get(ptr_type, boost::numeric_cast<std::uint64_t>(arr_values.size()));
+        auto *used_arr = llvm::ConstantArray::get(used_array_type, arr_values);
+
+        // Remove the original one.
+        orig_used->eraseFromParent();
+
+        // Add the new global variable.
+        auto *g_used_arr = new llvm::GlobalVariable(md, used_arr->getType(), true,
+                                                    llvm::GlobalVariable::AppendingLinkage, used_arr, "llvm.used");
+        g_used_arr->setSection("llvm.metadata");
+    } else {
+        // The llvm.used variable does not exist yet, create it.
+        auto *used_array_type = llvm::ArrayType::get(ptr_type, 1);
+        std::vector<llvm::Constant *> arr_values{ptr};
+        auto *used_arr = llvm::ConstantArray::get(used_array_type, arr_values);
+        auto *g_used_arr = new llvm::GlobalVariable(md, used_arr->getType(), true,
+                                                    llvm::GlobalVariable::AppendingLinkage, used_arr, "llvm.used");
+        g_used_arr->setSection("llvm.metadata");
+    }
+}
+
 // Attach the vfabi attributes to "call", which must be a call to a function with scalar arguments.
 // The necessary vfabi information is stored in vfi. The function returns "call".
 // The attributes of the scalar function will be attached to the vector variants.
@@ -365,33 +434,9 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
                 assert(vf_ptr->getAttributes() == f->getAttributes());
             }
 
-            // Create the name of the dummy function to ensure the variant is not optimised out.
-            //
-            // NOTE: another way of doing this involves the llvm.used global variable - need
-            // to learn about the metadata API apparently.
-            //
-            // https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable
-            // https://godbolt.org/z/1neaG4bYj
-            const auto dummy_name = fmt::format("heyoka.dummy_vector_call.{}", el_name);
-
-            if (auto *dummy_ptr = md.getFunction(dummy_name); dummy_ptr == nullptr) {
-                // The dummy function has not been defined yet, do it.
-                auto *dummy = llvm_func_create(vec_ft, llvm::Function::ExternalLinkage, dummy_name, &md);
-
-                builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", dummy));
-
-                // The dummy function just forwards its arguments to the variant.
-                std::vector<llvm::Value *> dummy_args;
-                for (auto *dummy_arg = dummy->args().begin(); dummy_arg != dummy->args().end(); ++dummy_arg) {
-                    dummy_args.emplace_back(dummy_arg);
-                }
-
-                builder.CreateRet(builder.CreateCall(vf_ptr, dummy_args));
-            } else {
-                // The declaration of the dummy function is already there.
-                // Check that the signatures match.
-                assert(dummy_ptr->getFunctionType() == vec_ft);
-            }
+            // Ensure that the variant is not optimised out because it is not
+            // explicitly used in the code.
+            detail::llvm_append_used(s, vf_ptr);
         }
 
         // Restore the original insertion block.
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 16097f0bd..32898805a 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -2359,16 +2359,16 @@ void llvm_multi_state::compile()
                                                     {.opt_bc = m_impl->m_jit->m_bc_snapshots,
                                                      .opt_ir = m_impl->m_jit->m_ir_snapshots,
                                                      .obj = m_impl->m_jit->m_object_files});
+            // LCOV_EXCL_START
         }
-        // LCOV_EXCL_START
     } catch (...) {
         // Reset to a def-cted state in case of error,
         // as it looks like there's no way of recovering.
         m_impl.reset();
 
         throw;
-        // LCOV_EXCL_STOP
     }
+    // LCOV_EXCL_STOP
 }
 
 std::uintptr_t llvm_multi_state::jit_lookup(const std::string &name)
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index 9490a0faf..cbae268c0 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -530,8 +530,6 @@ TEST_CASE("memcache testing")
     REQUIRE(outs[3] == -1);
 }
 
-#if 0
-
 // Tests to check vectorisation via the vector-function-abi-variant machinery.
 TEST_CASE("vfabi double")
 {
@@ -592,5 +590,3 @@ TEST_CASE("vfabi double")
 #endif
     }
 }
-
-#endif

From 03b459830beecad7e98c36bcb4f7e7f72573e212 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 4 Aug 2024 11:09:21 +0200
Subject: [PATCH 36/41] Test tweaks.

---
 test/llvm_multi_state.cpp | 30 ++++++++----------------------
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index cbae268c0..652ab5b56 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -550,8 +550,6 @@ TEST_CASE("vfabi double")
         // only since LLVM 16.
 #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
 
-        const auto &tf = detail::get_target_features();
-
         for (auto ir : ms.get_ir()) {
             using string_find_iterator = boost::find_iterator<std::string::iterator>;
 
@@ -561,30 +559,18 @@ TEST_CASE("vfabi double")
                 ++count;
             }
 
+            // NOTE: in the master module or in the "cfunc" module, we don't
+            // expect any @erf: the master module contains only the trigger,
+            // the "cfunc" module should have vectorised everything and
+            // there should be no more references to the scalar @erf.
             if (count == 0u) {
                 continue;
             }
 
-            // NOTE: at the moment we have comprehensive coverage of LLVM versions
-            // in the CI only for x86_64.
-            if (tf.sse2) {
-                // NOTE: occurrences of the scalar version:
-                // - 2 calls in the strided cfunc,
-                // - 1 declaration.
-                REQUIRE(count == 3u);
-            }
-
-            if (tf.aarch64) {
-                REQUIRE(count == 3u);
-            }
-
-            // NOTE: currently no auto-vectorization happens on ppc64 due apparently
-            // to the way the target machine is being set up by orc/lljit (it works
-            // fine with the opt tool). When this is resolved, we can test ppc64 too.
-
-            // if (tf.vsx) {
-            //     REQUIRE(count == 3u);
-            // }
+            // NOTE: occurrences of the scalar version:
+            // - 2 calls in the strided cfunc,
+            // - 1 declaration.
+            REQUIRE(count == 3u);
         }
 
 #endif

From 065bb04d300af9f603a357f9444586db86d0d7a8 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 4 Aug 2024 13:35:00 +0200
Subject: [PATCH 37/41] Fix assertions.

---
 src/llvm_state.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 32898805a..0cfd4e4cc 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1895,12 +1895,6 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states_)
 
     // Settings in all states must be consistent.
     auto states_differ = [](const llvm_state &s1, const llvm_state &s2) {
-        // NOTE: bit of paranoia here.
-        assert(s1.m_jitter->m_lljit->getDataLayout() == s2.m_jitter->m_lljit->getDataLayout());
-        assert(s1.m_jitter->get_target_triple() == s2.m_jitter->get_target_triple());
-        assert(s1.m_jitter->get_target_cpu() == s2.m_jitter->get_target_cpu());
-        assert(s1.m_jitter->get_target_features() == s2.m_jitter->get_target_features());
-
         if (s1.get_opt_level() != s2.get_opt_level()) {
             return true;
         }
@@ -1921,6 +1915,12 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states_)
             return true;
         }
 
+        // NOTE: bit of paranoia here.
+        assert(s1.m_jitter->m_lljit->getDataLayout() == s2.m_jitter->m_lljit->getDataLayout());
+        assert(s1.m_jitter->get_target_triple() == s2.m_jitter->get_target_triple());
+        assert(s1.m_jitter->get_target_cpu() == s2.m_jitter->get_target_cpu());
+        assert(s1.m_jitter->get_target_features() == s2.m_jitter->get_target_features());
+
         return false;
     };
 

From 5e4cd3ab6a6e137ea67d9a8905ac096d92b223f7 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 4 Aug 2024 13:55:38 +0200
Subject: [PATCH 38/41] Remove support for LLVM 14.

---
 .github/workflows/gha_ci.yml   |  6 -----
 CMakeLists.txt                 |  4 +--
 doc/install.rst                |  2 +-
 src/llvm_state.cpp             |  8 ------
 tools/gha_llvm13_conda_asan.sh | 48 ----------------------------------
 5 files changed, 3 insertions(+), 65 deletions(-)
 delete mode 100644 tools/gha_llvm13_conda_asan.sh

diff --git a/.github/workflows/gha_ci.yml b/.github/workflows/gha_ci.yml
index 6a61654ab..ee5e0f980 100644
--- a/.github/workflows/gha_ci.yml
+++ b/.github/workflows/gha_ci.yml
@@ -98,9 +98,3 @@ jobs:
       - uses: actions/checkout@v4
       - name: Build
         run: bash tools/gha_llvm15_conda_asan.sh
-  conda_llvm14_asan:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: Build
-        run: bash tools/gha_llvm14_conda_asan.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4de13a5e5..12ade3120 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -158,8 +158,8 @@ endif()
 include(GNUInstallDirs)
 find_package(LLVM REQUIRED CONFIG)
 
-if(${LLVM_VERSION_MAJOR} LESS 14 OR ${LLVM_VERSION_MAJOR} GREATER 18)
-  message(FATAL_ERROR "LLVM >= 14 and <= 18 is required.")
+if(${LLVM_VERSION_MAJOR} LESS 15 OR ${LLVM_VERSION_MAJOR} GREATER 18)
+  message(FATAL_ERROR "LLVM >= 15 and <= 18 is required.")
 endif()
 
 # List of source files.
diff --git a/doc/install.rst b/doc/install.rst
index ff42373b3..7beed7bf4 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -14,7 +14,7 @@ and several CPU architectures (x86-64, 64-bit ARM and 64-bit PowerPC).
 
 heyoka has the following **mandatory** dependencies:
 
-* the `LLVM <https://llvm.org/>`__ compiler infrastructure library (version >=14 and <=18),
+* the `LLVM <https://llvm.org/>`__ compiler infrastructure library (version >=15 and <=18),
 * the `Boost <https://www.boost.org/>`__ C++ libraries (version >=1.69),
 * the `{fmt} <https://fmt.dev/latest/index.html>`__ library (version >=9 and <=11),
 * the `spdlog <https://github.com/gabime/spdlog>`__ library,
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 0cfd4e4cc..f47355db5 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1485,11 +1485,7 @@ std::uintptr_t llvm_state::jit_lookup(const std::string &name)
         throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in the compiled module", name));
     }
 
-#if LLVM_VERSION_MAJOR >= 15
     return static_cast<std::uintptr_t>((*sym).getValue());
-#else
-    return static_cast<std::uintptr_t>((*sym).getAddress());
-#endif
 }
 
 std::string llvm_state::get_ir() const
@@ -2380,11 +2376,7 @@ std::uintptr_t llvm_multi_state::jit_lookup(const std::string &name)
         throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in an llvm_multi_state", name));
     }
 
-#if LLVM_VERSION_MAJOR >= 15
     return static_cast<std::uintptr_t>((*sym).getValue());
-#else
-    return static_cast<std::uintptr_t>((*sym).getAddress());
-#endif
 }
 
 std::ostream &operator<<(std::ostream &os, const llvm_multi_state &s)
diff --git a/tools/gha_llvm13_conda_asan.sh b/tools/gha_llvm13_conda_asan.sh
deleted file mode 100644
index cba8e3692..000000000
--- a/tools/gha_llvm13_conda_asan.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env bash
-
-# Echo each command
-set -x
-
-# Exit on error.
-set -e
-
-# Core deps.
-sudo apt-get install wget
-
-# Install conda+deps.
-wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniconda.sh
-export deps_dir=$HOME/local
-export PATH="$HOME/miniconda/bin:$PATH"
-bash miniconda.sh -b -p $HOME/miniconda
-mamba create -y -p $deps_dir c-compiler cxx-compiler cmake ninja 'llvmdev=13.*' \
-    tbb-devel tbb libboost-devel 'mppp=1.*' sleef xtensor xtensor-blas blas \
-    blas-devel fmt spdlog
-source activate $deps_dir
-
-# Create the build dir and cd into it.
-mkdir build
-cd build
-
-# Clear the compilation flags set up by conda.
-unset CXXFLAGS
-unset CFLAGS
-
-# Configure.
-cmake ../ -G Ninja \
-    -DCMAKE_PREFIX_PATH=$deps_dir \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DHEYOKA_BUILD_TESTS=yes \
-    -DHEYOKA_BUILD_TUTORIALS=ON \
-    -DHEYOKA_WITH_MPPP=yes \
-    -DHEYOKA_WITH_SLEEF=yes \
-    -DCMAKE_CXX_FLAGS="-fsanitize=address" \
-    -DCMAKE_CXX_FLAGS_DEBUG="-g -Og"
-
-# Build.
-ninja -v
-
-# Run the tests.
-ctest -VV -j4
-
-set +e
-set +x

From d63c7495318f5ade1672de1cf98e52ef70994300 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 4 Aug 2024 13:56:31 +0200
Subject: [PATCH 39/41] Bump to version 6.0.0 for development.

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 12ade3120..86717c759 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ if(NOT CMAKE_BUILD_TYPE)
 	FORCE)
 endif()
 
-project(heyoka VERSION 5.1.0 LANGUAGES CXX C)
+project(heyoka VERSION 6.0.0 LANGUAGES CXX C)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/yacma")
 
@@ -334,7 +334,7 @@ if(HEYOKA_WITH_SLEEF)
 endif()
 
 # Setup the heyoka ABI version number.
-set(HEYOKA_ABI_VERSION 29)
+set(HEYOKA_ABI_VERSION 30)
 
 if(HEYOKA_BUILD_STATIC_LIBRARY)
     # Setup of the heyoka static library.

From c5fee899040d06b9307a7c4872f232a1f3494297 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 4 Aug 2024 14:39:53 +0200
Subject: [PATCH 40/41] Some more tweaks.

---
 src/detail/llvm_helpers.cpp |  6 +++---
 src/llvm_state.cpp          | 15 +++++++--------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index 7faba2443..279a7609e 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -278,9 +278,9 @@ llvm::AttributeList llvm_ext_math_func_attrs(llvm_state &s)
 //
 // If the llvm.used variable does not exist yet, create it.
 //
-// NOTE: this has quadratic complexity. It should not be a problem
-// for the type of use we do as we expect just a few entries in this
-// array, but something to keep in mind.
+// NOTE: this has quadratic complexity when appending ptr to an existing
+// array. It should not be a problem for the type of use we do as we expect
+// just a few entries in this array, but something to keep in mind.
 void llvm_append_used(llvm_state &s, llvm::Constant *ptr)
 {
     assert(ptr != nullptr);
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index f47355db5..a5af287bf 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1434,6 +1434,9 @@ void llvm_state::compile()
 
             // Assign the object file.
             detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj[0]));
+
+            // Look up the trigger.
+            jit_lookup(detail::obj_trigger_name);
         } else {
             // Cache miss.
 
@@ -1631,13 +1634,6 @@ struct multi_jit {
     {
         return *m_ctx->getContext();
     }
-
-    // Helper to fetch the bitcode of the master module.
-    std::string get_master_bc() const
-    {
-        assert(m_module);
-        return detail::bc_from_module(*m_module);
-    }
 };
 
 #if 0
@@ -2291,7 +2287,7 @@ void llvm_multi_state::compile()
             obc.push_back(s.get_bc());
         }
         // Add the master bitcode.
-        obc.push_back(m_impl->m_jit->get_master_bc());
+        obc.push_back(detail::bc_from_module(*m_impl->m_jit->m_module));
 
         // Assemble the compilation flag.
         const auto comp_flag
@@ -2331,6 +2327,9 @@ void llvm_multi_state::compile()
             // Assign the compiled objects.
             assert(m_impl->m_jit->m_object_files.empty());
             m_impl->m_jit->m_object_files = std::move(cached_data->obj);
+
+            // Look up the trigger.
+            jit_lookup(detail::obj_trigger_name);
         } else {
             // Cache miss.
 

From a205aabde20aba1702a7f80fbaec06551604b7a2 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 4 Aug 2024 14:41:05 +0200
Subject: [PATCH 41/41] Remove stale file.

---
 tools/gha_llvm14_conda_asan.sh | 48 ----------------------------------
 1 file changed, 48 deletions(-)
 delete mode 100644 tools/gha_llvm14_conda_asan.sh

diff --git a/tools/gha_llvm14_conda_asan.sh b/tools/gha_llvm14_conda_asan.sh
deleted file mode 100644
index cc54de5ae..000000000
--- a/tools/gha_llvm14_conda_asan.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env bash
-
-# Echo each command
-set -x
-
-# Exit on error.
-set -e
-
-# Core deps.
-sudo apt-get install wget
-
-# Install conda+deps.
-wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniconda.sh
-export deps_dir=$HOME/local
-export PATH="$HOME/miniconda/bin:$PATH"
-bash miniconda.sh -b -p $HOME/miniconda
-mamba create -y -p $deps_dir c-compiler cxx-compiler cmake ninja 'llvmdev=14.*' \
-    tbb-devel tbb libboost-devel 'mppp=1.*' sleef xtensor xtensor-blas blas \
-    blas-devel fmt spdlog
-source activate $deps_dir
-
-# Create the build dir and cd into it.
-mkdir build
-cd build
-
-# Clear the compilation flags set up by conda.
-unset CXXFLAGS
-unset CFLAGS
-
-# Configure.
-cmake ../ -G Ninja \
-    -DCMAKE_PREFIX_PATH=$deps_dir \
-    -DCMAKE_BUILD_TYPE=Debug \
-    -DHEYOKA_BUILD_TESTS=yes \
-    -DHEYOKA_BUILD_TUTORIALS=ON \
-    -DHEYOKA_WITH_MPPP=yes \
-    -DHEYOKA_WITH_SLEEF=yes \
-    -DCMAKE_CXX_FLAGS="-fsanitize=address" \
-    -DCMAKE_CXX_FLAGS_DEBUG="-g -Og"
-
-# Build.
-ninja -v
-
-# Run the tests.
-ctest -VV -j4
-
-set +e
-set +x