diff --git a/.github/workflows/gha_ci.yml b/.github/workflows/gha_ci.yml
index 81cba6d3e..ee5e0f980 100644
--- a/.github/workflows/gha_ci.yml
+++ b/.github/workflows/gha_ci.yml
@@ -98,15 +98,3 @@ jobs:
- uses: actions/checkout@v4
- name: Build
run: bash tools/gha_llvm15_conda_asan.sh
- conda_llvm14_asan:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - name: Build
- run: bash tools/gha_llvm14_conda_asan.sh
- conda_llvm13_asan:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - name: Build
- run: bash tools/gha_llvm13_conda_asan.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index be252abd3..86717c759 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ if(NOT CMAKE_BUILD_TYPE)
FORCE)
endif()
-project(heyoka VERSION 5.1.0 LANGUAGES CXX C)
+project(heyoka VERSION 6.0.0 LANGUAGES CXX C)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/yacma")
@@ -158,8 +158,8 @@ endif()
include(GNUInstallDirs)
find_package(LLVM REQUIRED CONFIG)
-if(${LLVM_VERSION_MAJOR} LESS 13 OR ${LLVM_VERSION_MAJOR} GREATER 18)
- message(FATAL_ERROR "LLVM >= 13 and <= 18 is required.")
+if(${LLVM_VERSION_MAJOR} LESS 15 OR ${LLVM_VERSION_MAJOR} GREATER 18)
+ message(FATAL_ERROR "LLVM >= 15 and <= 18 is required.")
endif()
# List of source files.
@@ -334,7 +334,7 @@ if(HEYOKA_WITH_SLEEF)
endif()
# Setup the heyoka ABI version number.
-set(HEYOKA_ABI_VERSION 29)
+set(HEYOKA_ABI_VERSION 30)
if(HEYOKA_BUILD_STATIC_LIBRARY)
# Setup of the heyoka static library.
diff --git a/doc/install.rst b/doc/install.rst
index 64d638c87..7beed7bf4 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -14,7 +14,7 @@ and several CPU architectures (x86-64, 64-bit ARM and 64-bit PowerPC).
heyoka has the following **mandatory** dependencies:
-* the `LLVM `__ compiler infrastructure library (version >=13 and <=18),
+* the `LLVM `__ compiler infrastructure library (version >=15 and <=18),
* the `Boost `__ C++ libraries (version >=1.69),
* the `{fmt} `__ library (version >=9 and <=11),
* the `spdlog `__ library,
diff --git a/include/heyoka/detail/fwd_decl.hpp b/include/heyoka/detail/fwd_decl.hpp
index b723469ce..0bdb24d61 100644
--- a/include/heyoka/detail/fwd_decl.hpp
+++ b/include/heyoka/detail/fwd_decl.hpp
@@ -26,6 +26,7 @@ class HEYOKA_DLL_PUBLIC func;
class HEYOKA_DLL_PUBLIC param;
class HEYOKA_DLL_PUBLIC llvm_state;
+class HEYOKA_DLL_PUBLIC llvm_multi_state;
template
class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive;
diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index c314c2105..1c021ecfa 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -12,14 +12,15 @@
#include
#include
+#include
#include
-#include
#include
#include
#include
#include
#include
#include
+#include
#include
@@ -78,6 +79,7 @@ HEYOKA_DLL_PUBLIC const target_features &get_target_features();
} // namespace detail
HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_state &);
+HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &);
template
inline std::uint32_t recommended_simd_size()
@@ -132,6 +134,8 @@ HEYOKA_BEGIN_NAMESPACE
class HEYOKA_DLL_PUBLIC llvm_state
{
friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_state &);
+ friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &);
+ friend class HEYOKA_DLL_PUBLIC llvm_multi_state;
struct jit;
@@ -337,15 +341,65 @@ namespace detail
// The value contained in the in-memory cache.
struct llvm_mc_value {
- std::string opt_bc, opt_ir, obj;
+ std::vector<std::string> opt_bc, opt_ir, obj;
+
+ std::size_t total_size() const;
};
// Cache lookup and insertion.
-std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::string &, unsigned);
-void llvm_state_mem_cache_try_insert(std::string, unsigned, llvm_mc_value);
+std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::vector<std::string> &, unsigned);
+void llvm_state_mem_cache_try_insert(std::vector<std::string>, unsigned, llvm_mc_value);
} // namespace detail
+class HEYOKA_DLL_PUBLIC llvm_multi_state
+{
+ friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &);
+
+ struct impl;
+
+ std::unique_ptr<impl> m_impl;
+
+ HEYOKA_DLL_LOCAL void compile_impl();
+ HEYOKA_DLL_LOCAL void add_obj_triggers();
+
+ // Check functions.
+ HEYOKA_DLL_LOCAL void check_compiled(const char *) const;
+ HEYOKA_DLL_LOCAL void check_uncompiled(const char *) const;
+
+ friend class boost::serialization::access;
+ void save(boost::archive::binary_oarchive &, unsigned) const;
+ void load(boost::archive::binary_iarchive &, unsigned);
+ BOOST_SERIALIZATION_SPLIT_MEMBER()
+
+public:
+ llvm_multi_state();
+ explicit llvm_multi_state(std::vector<llvm_state>);
+ llvm_multi_state(const llvm_multi_state &);
+ llvm_multi_state(llvm_multi_state &&) noexcept;
+ llvm_multi_state &operator=(const llvm_multi_state &);
+ llvm_multi_state &operator=(llvm_multi_state &&) noexcept;
+ ~llvm_multi_state();
+
+ [[nodiscard]] bool is_compiled() const noexcept;
+
+ [[nodiscard]] unsigned get_n_modules() const noexcept;
+
+ [[nodiscard]] bool fast_math() const noexcept;
+ [[nodiscard]] bool force_avx512() const noexcept;
+ [[nodiscard]] unsigned get_opt_level() const noexcept;
+ [[nodiscard]] bool get_slp_vectorize() const noexcept;
+ [[nodiscard]] code_model get_code_model() const noexcept;
+
+ [[nodiscard]] std::vector<std::string> get_ir() const;
+ [[nodiscard]] std::vector<std::string> get_bc() const;
+ [[nodiscard]] const std::vector<std::string> &get_object_code() const;
+
+ void compile();
+
+ std::uintptr_t jit_lookup(const std::string &);
+};
+
HEYOKA_END_NAMESPACE
// Archive version changelog:
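
For orientation, here is a minimal usage sketch of the llvm_multi_state class declared above. It is not part of the patch: the add_trivial_func() helper and the "f0".."f3" symbol names are purely illustrative, and the sketch assumes the existing public llvm_state accessors (module(), builder(), context()) together with the llvm_multi_state interface shown in this hunk.

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

#include <llvm/IR/BasicBlock.h>
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/IRBuilder.h>

#include <heyoka/llvm_state.hpp>

using namespace heyoka;

// Add a trivial external function called 'name' to the module of s.
void add_trivial_func(llvm_state &s, const std::string &name)
{
    auto &bld = s.builder();
    auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
    auto *f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, name, &s.module());
    bld.SetInsertPoint(llvm::BasicBlock::Create(s.context(), "entry", f));
    bld.CreateRetVoid();
}

int main()
{
    // Build a few independent, uncompiled states.
    std::vector<llvm_state> states;
    for (auto i = 0; i < 4; ++i) {
        llvm_state s;
        add_trivial_func(s, "f" + std::to_string(i));
        states.push_back(std::move(s));
    }

    // Aggregate them. All states must share the same settings (opt level,
    // fast math, code model, ...), otherwise the constructor throws.
    llvm_multi_state ms(std::move(states));

    // Compile all modules (in parallel, internally) and fetch the
    // address of one of the functions.
    ms.compile();
    auto *fptr = reinterpret_cast<void (*)()>(ms.jit_lookup("f0"));
    fptr();
}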
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index f32c4c6fa..279a7609e 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -31,6 +31,7 @@
#include
#include
#include
+#include
#include
#include
@@ -271,6 +272,74 @@ llvm::AttributeList llvm_ext_math_func_attrs(llvm_state &s)
return f->getAttributes();
}
+// Add a pointer to the llvm.used global variable of a module:
+//
+// https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable
+//
+// If the llvm.used variable does not exist yet, create it.
+//
+// NOTE: this has quadratic complexity when appending ptr to an existing
+// array. It should not be a problem for the type of use we do as we expect
+// just a few entries in this array, but something to keep in mind.
+void llvm_append_used(llvm_state &s, llvm::Constant *ptr)
+{
+ assert(ptr != nullptr);
+ assert(ptr->getType()->isPointerTy());
+
+ auto &md = s.module();
+ auto &ctx = s.context();
+
+ // Fetch the pointer type.
+ auto *ptr_type = llvm::PointerType::getUnqual(ctx);
+
+ if (auto *orig_used = md.getGlobalVariable("llvm.used")) {
+ // The llvm.used variable exists already.
+
+ // Fetch the original initializer.
+ assert(orig_used->hasInitializer());
+ auto *orig_init = llvm::cast<llvm::ConstantArray>(orig_used->getInitializer());
+
+ // Construct a new initializer with the original values
+ // plus the new pointer.
+ std::vector<llvm::Constant *> arr_values;
+ arr_values.reserve(
+ boost::safe_numerics::safe(orig_init->getType()->getNumElements()) + 1);
+ for (decltype(orig_init->getType()->getNumElements()) i = 0; i < orig_init->getType()->getNumElements(); ++i) {
+ auto *orig_el = orig_init->getAggregateElement(boost::numeric_cast<unsigned>(i));
+ assert(orig_el->getType()->isPointerTy());
+
+ // NOTE: if ptr was already in the llvm.used vector, just bail
+ // out early.
+ if (orig_el->isElementWiseEqual(ptr)) {
+ return;
+ }
+
+ arr_values.push_back(orig_el);
+ }
+ arr_values.push_back(ptr);
+
+ // Create the new array.
+ auto *used_array_type = llvm::ArrayType::get(ptr_type, boost::numeric_cast<std::uint64_t>(arr_values.size()));
+ auto *used_arr = llvm::ConstantArray::get(used_array_type, arr_values);
+
+ // Remove the original one.
+ orig_used->eraseFromParent();
+
+ // Add the new global variable.
+ auto *g_used_arr = new llvm::GlobalVariable(md, used_arr->getType(), true,
+ llvm::GlobalVariable::AppendingLinkage, used_arr, "llvm.used");
+ g_used_arr->setSection("llvm.metadata");
+ } else {
+ // The llvm.used variable does not exist yet, create it.
+ auto *used_array_type = llvm::ArrayType::get(ptr_type, 1);
+ std::vector<llvm::Constant *> arr_values{ptr};
+ auto *used_arr = llvm::ConstantArray::get(used_array_type, arr_values);
+ auto *g_used_arr = new llvm::GlobalVariable(md, used_arr->getType(), true,
+ llvm::GlobalVariable::AppendingLinkage, used_arr, "llvm.used");
+ g_used_arr->setSection("llvm.metadata");
+ }
+}
+
// Attach the vfabi attributes to "call", which must be a call to a function with scalar arguments.
// The necessary vfabi information is stored in vfi. The function returns "call".
// The attributes of the scalar function will be attached to the vector variants.
@@ -312,17 +381,8 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
= (use_fast_math && !el.lp_vf_abi_attr.empty()) ? el.lp_vf_abi_attr : el.vf_abi_attr;
vf_abi_strs.push_back(vf_abi_attr);
}
-#if LLVM_VERSION_MAJOR >= 14
call->addFnAttr(llvm::Attribute::get(context, "vector-function-abi-variant",
fmt::format("{}", fmt::join(vf_abi_strs, ","))));
-#else
- {
- auto attrs = call->getAttributes();
- attrs = attrs.addAttribute(context, llvm::AttributeList::FunctionIndex, "vector-function-abi-variant",
- fmt::format("{}", fmt::join(vf_abi_strs, ",")));
- call->setAttributes(attrs);
- }
-#endif
// Now we need to:
// - add the declarations of the vector variants to the module,
@@ -374,33 +434,9 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
assert(vf_ptr->getAttributes() == f->getAttributes());
}
- // Create the name of the dummy function to ensure the variant is not optimised out.
- //
- // NOTE: another way of doing this involves the llvm.used global variable - need
- // to learn about the metadata API apparently.
- //
- // https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable
- // https://godbolt.org/z/1neaG4bYj
- const auto dummy_name = fmt::format("heyoka.dummy_vector_call.{}", el_name);
-
- if (auto *dummy_ptr = md.getFunction(dummy_name); dummy_ptr == nullptr) {
- // The dummy function has not been defined yet, do it.
- auto *dummy = llvm_func_create(vec_ft, llvm::Function::ExternalLinkage, dummy_name, &md);
-
- builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", dummy));
-
- // The dummy function just forwards its arguments to the variant.
- std::vector dummy_args;
- for (auto *dummy_arg = dummy->args().begin(); dummy_arg != dummy->args().end(); ++dummy_arg) {
- dummy_args.emplace_back(dummy_arg);
- }
-
- builder.CreateRet(builder.CreateCall(vf_ptr, dummy_args));
- } else {
- // The declaration of the dummy function is already there.
- // Check that the signatures match.
- assert(dummy_ptr->getFunctionType() == vec_ft);
- }
+ // Ensure that the variant is not optimised out because it is not
+ // explicitly used in the code.
+ detail::llvm_append_used(s, vf_ptr);
}
// Restore the original insertion block.
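
As an aside, the llvm.used machinery relied upon above can be shown in isolation. The following standalone sketch uses only the plain LLVM C++ API (no heyoka types; the module and function names are illustrative) and reproduces the "create from scratch" branch of llvm_append_used(): a function with internal linkage and no callers is kept alive through llvm.used.

#include <vector>

#include <llvm/IR/BasicBlock.h>
#include <llvm/IR/Constants.h>
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/GlobalVariable.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/Support/raw_ostream.h>

int main()
{
    llvm::LLVMContext ctx;
    llvm::Module md("llvm_used_example", ctx);
    llvm::IRBuilder<> bld(ctx);

    // A trivial function which nothing in the module references.
    auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
    auto *f = llvm::Function::Create(ft, llvm::Function::InternalLinkage, "keep_me", &md);
    bld.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", f));
    bld.CreateRetVoid();

    // Record f in the llvm.used array so that IR-level dead code
    // elimination (e.g., globaldce) will not remove it.
    auto *ptr_type = llvm::PointerType::getUnqual(ctx);
    auto *used_array_type = llvm::ArrayType::get(ptr_type, 1);
    const std::vector<llvm::Constant *> arr_values{f};
    auto *used_arr = llvm::ConstantArray::get(used_array_type, arr_values);
    auto *g_used_arr = new llvm::GlobalVariable(md, used_arr->getType(), true,
                                                llvm::GlobalVariable::AppendingLinkage, used_arr, "llvm.used");
    g_used_arr->setSection("llvm.metadata");

    // Print the resulting IR, which will contain:
    // @llvm.used = appending global [1 x ptr] [ptr @keep_me], section "llvm.metadata"
    md.print(llvm::outs(), nullptr);
}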
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 7b303bc6f..a5af287bf 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -14,30 +14,33 @@
#include
#include
#include
-#include
#include
#include
#include
#include
#include
#include
+#include
#include
#include
#include
#include
#include
+#include
#include
#include
#include
-#include
+#include
#include
#include
+#include
#include
#include
-#include
+#include
+#include
#include
#include
#include
@@ -60,63 +63,19 @@
#include
#include
#include
+#include
#include
#include
+#include
+#include
+#include
#include
#include
+#include
#include
-#include
#include
#include
#include
-#include
-
-#if LLVM_VERSION_MAJOR < 14
-
-// NOTE: this header was moved in LLVM 14.
-#include
-
-#else
-
-#include
-
-#endif
-
-// NOTE: new pass manager API.
-// NOTE: this is available since LLVM 13, but in that
-// version it seems like auto-vectorization with
-// vector-function-abi-variant is not working
-// properly with the new pass manager. Hence, we
-// enable it from LLVM 14.
-#if LLVM_VERSION_MAJOR >= 14
-
-#define HEYOKA_USE_NEW_LLVM_PASS_MANAGER
-
-#endif
-
-#if defined(HEYOKA_USE_NEW_LLVM_PASS_MANAGER)
-
-#include
-#include
-#include
-#include
-
-#if LLVM_VERSION_MAJOR >= 14
-
-// NOTE: this header is available since LLVM 14.
-#include
-
-#endif
-
-#else
-
-#include
-#include
-#include
-#include
-#include
-
-#endif
#if LLVM_VERSION_MAJOR >= 17
@@ -151,26 +110,6 @@
#include
#include
-// NOTE: logging here lhames' instructions on how to set up LLJIT
-// for parallel compilation of multiple modules.
-//
-// auto J = LLJITBuilder()
-// .setNumCompileThreads()
-// .create();
-// if (!J) { /* bail on error */ }
-// (*J)->getIRTransformLayer().setTransform(
-// [](ThreadSafeModule TSM, MaterializationResponsibility &R) -> Expected {
-// TSM.withModuleDo([](Module &M) {
-// /* Apply your IR optimizations here */
-// });
-// return std::move(TSM);
-// });
-//
-// Note that the optimisation passes in this approach are moved into the
-// transform layer. References:
-// https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328
-// https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892
-
HEYOKA_BEGIN_NAMESPACE
namespace detail
@@ -207,12 +146,12 @@ const std::regex ppc_regex_pattern("pwr([1-9]*)");
target_features get_target_features_impl()
{
auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost();
- if (!jtmb) {
+ if (!jtmb) [[unlikely]] {
throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system");
}
auto tm = jtmb->createTargetMachine();
- if (!tm) {
+ if (!tm) [[unlikely]] {
throw std::invalid_argument("Error creating the target machine");
}
@@ -307,6 +246,342 @@ void init_native_target()
});
}
+// Helper to create a builder for target machines.
+llvm::orc::JITTargetMachineBuilder create_jit_tmb(unsigned opt_level, code_model c_model)
+{
+ // NOTE: codegen opt level changed in LLVM 18.
+#if LLVM_VERSION_MAJOR < 18
+
+ using cg_opt_level = llvm::CodeGenOpt::Level;
+
+#else
+
+ using cg_opt_level = llvm::CodeGenOptLevel;
+
+#endif
+
+ // Try creating the target machine builder.
+ auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost();
+ // LCOV_EXCL_START
+ if (!jtmb) [[unlikely]] {
+ throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system");
+ }
+ // LCOV_EXCL_STOP
+
+ // Set the codegen optimisation level.
+ switch (opt_level) {
+ case 0u:
+ jtmb->setCodeGenOptLevel(cg_opt_level::None);
+ break;
+ case 1u:
+ jtmb->setCodeGenOptLevel(cg_opt_level::Less);
+ break;
+ case 2u:
+ jtmb->setCodeGenOptLevel(cg_opt_level::Default);
+ break;
+ default:
+ assert(opt_level == 3u);
+ jtmb->setCodeGenOptLevel(cg_opt_level::Aggressive);
+ }
+
+ // NOTE: not all code models are supported on all archs. We make an effort
+ // here to prevent unsupported code models from being requested, as that will
+ // result in the termination of the program.
+ constexpr code_model supported_code_models[] = {
+#if defined(HEYOKA_ARCH_X86)
+ code_model::small, code_model::kernel, code_model::medium, code_model::large
+#elif defined(HEYOKA_ARCH_ARM)
+ code_model::tiny, code_model::small, code_model::large
+#elif defined(HEYOKA_ARCH_PPC)
+ code_model::small, code_model::medium, code_model::large
+#else
+ // NOTE: by default we assume only small and large are supported.
+ code_model::small, code_model::large
+#endif
+ };
+
+ if (std::ranges::find(supported_code_models, c_model) == std::ranges::end(supported_code_models)) [[unlikely]] {
+ throw std::invalid_argument(
+ fmt::format("The code model '{}' is not supported on the current architecture", c_model));
+ }
+
+ // LCOV_EXCL_START
+
+#if LLVM_VERSION_MAJOR >= 17
+
+ // NOTE: the code model setup is working only on LLVM>=19 (or at least
+ // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug:
+ //
+ // https://github.com/llvm/llvm-project/issues/88115
+ //
+ // Additionally, there are indications from our CI that attempting to set
+ // the code model before LLVM 17 might just be buggy, as we see widespread
+ // ASAN failures all over the place. Thus, let us not do anything with the code
+ // model setting before LLVM 17.
+
+ // Setup the code model.
+ switch (c_model) {
+ case code_model::tiny:
+ jtmb->setCodeModel(llvm::CodeModel::Tiny);
+ break;
+ case code_model::small:
+ jtmb->setCodeModel(llvm::CodeModel::Small);
+ break;
+ case code_model::kernel:
+ jtmb->setCodeModel(llvm::CodeModel::Kernel);
+ break;
+ case code_model::medium:
+ jtmb->setCodeModel(llvm::CodeModel::Medium);
+ break;
+ case code_model::large:
+ jtmb->setCodeModel(llvm::CodeModel::Large);
+ break;
+ default:
+ // NOTE: we should never end up here.
+ assert(false);
+ ;
+ }
+
+#endif
+
+ // LCOV_EXCL_STOP
+
+ return std::move(*jtmb);
+}
+
+// Helper to optimise the input module M. Implemented here for re-use.
+// NOTE: this may end up being invoked concurrently from multiple threads.
+// If that is the case, we make sure before invocation to construct a different
+// TargetMachine per thread, so that we are sure no data races are possible.
+void optimise_module(llvm::Module &M, llvm::TargetMachine &tm, unsigned opt_level, bool force_avx512,
+ bool slp_vectorize)
+{
+ // NOTE: don't run any optimisation pass at O0.
+ if (opt_level == 0u) {
+ return;
+ }
+
+ // NOTE: the logic here largely mimics (with a lot of simplifications)
+ // the implementation of the 'opt' tool. See:
+ // https://github.com/llvm/llvm-project/blob/release/10.x/llvm/tools/opt/opt.cpp
+
+ // For every function in the module, setup its attributes
+ // so that the codegen uses all the features available on
+ // the host CPU.
+ const auto cpu = tm.getTargetCPU().str();
+ const auto features = tm.getTargetFeatureString().str();
+
+ // Fetch the module's context.
+ auto &ctx = M.getContext();
+
+ for (auto &f : M) {
+ auto attrs = f.getAttributes();
+
+ llvm::AttrBuilder new_attrs(ctx);
+
+ if (!cpu.empty() && !f.hasFnAttribute("target-cpu")) {
+ new_attrs.addAttribute("target-cpu", cpu);
+ }
+
+ if (!features.empty()) {
+ auto old_features = f.getFnAttribute("target-features").getValueAsString();
+
+ if (old_features.empty()) {
+ new_attrs.addAttribute("target-features", features);
+ } else {
+ llvm::SmallString<256> appended(old_features);
+ appended.push_back(',');
+ appended.append(features);
+ new_attrs.addAttribute("target-features", appended);
+ }
+ }
+
+ // Let new_attrs override attrs.
+ f.setAttributes(attrs.addFnAttributes(ctx, new_attrs));
+ }
+
+ // Force usage of AVX512 registers, if requested.
+ if (force_avx512 && get_target_features().avx512f) {
+ for (auto &f : M) {
+ f.addFnAttr("prefer-vector-width", "512");
+ }
+ }
+
+ // NOTE: adapted from here:
+ // https://llvm.org/docs/NewPassManager.html
+
+ // Create the analysis managers.
+ llvm::LoopAnalysisManager LAM;
+ llvm::FunctionAnalysisManager FAM;
+ llvm::CGSCCAnalysisManager CGAM;
+ llvm::ModuleAnalysisManager MAM;
+
+ // NOTE: in the new pass manager, this seems to be the way to
+ // set the target library info bits. See:
+ // https://github.com/llvm/llvm-project/blob/b7fd30eac3183993806cc218b6deb39eb625c083/llvm/tools/opt/NewPMDriver.cpp#L408
+ // Not sure if this matters, but we did it in the old pass manager
+ // and opt does it too.
+ llvm::TargetLibraryInfoImpl TLII(tm.getTargetTriple());
+ FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); });
+
+ // Create the new pass manager builder, passing the supplied target machine.
+ // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off
+ // by default. Not sure why it is off, the LLVM docs imply this
+ // is on by default at nonzero optimisation levels for clang and opt.
+ // NOTE: the reason for this inconsistency is that opt uses PB.parsePassPipeline()
+ // (instead of PB.buildPerModuleDefaultPipeline()) to set up the optimisation
+ // pipeline. Indeed, if we replace PB.buildPerModuleDefaultPipeline(ol) with
+ // PB.parsePassPipeline(MPM, "default") (which corresponds to invoking
+ // "opt -passes='default'"), we do NOT need to set SLP vectorization on
+ // here to get the SLP vectorizer. Not sure if we should consider switching to this
+ // alternative way of setting up the optimisation pipeline in the future.
+ llvm::PipelineTuningOptions pto;
+ pto.SLPVectorization = slp_vectorize;
+ llvm::PassBuilder PB(&tm, pto);
+
+ // Register all the basic analyses with the managers.
+ PB.registerModuleAnalyses(MAM);
+ PB.registerCGSCCAnalyses(CGAM);
+ PB.registerFunctionAnalyses(FAM);
+ PB.registerLoopAnalyses(LAM);
+ PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+ // Construct the optimisation level.
+ llvm::OptimizationLevel ol{};
+
+ switch (opt_level) {
+ case 1u:
+ ol = llvm::OptimizationLevel::O1;
+ break;
+ case 2u:
+ ol = llvm::OptimizationLevel::O2;
+ break;
+ default:
+ assert(opt_level == 3u);
+ ol = llvm::OptimizationLevel::O3;
+ }
+
+ // Create the module pass manager.
+ auto MPM = PB.buildPerModuleDefaultPipeline(ol);
+
+ // Optimize the IR.
+ MPM.run(M, MAM);
+}
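
To make the threading note above concrete, this is roughly how optimise_module() is meant to be driven from code that may run on several threads at once. This is a sketch only (the wrapper name is hypothetical and it relies on the create_jit_tmb() helper defined earlier in this file); the actual call sites are llvm_state::optimise() and the multi_jit IR transform further down.

// Hypothetical wrapper: optimise a module with a freshly-created
// TargetMachine, so that concurrent invocations never share mutable
// TargetMachine state.
void optimise_with_fresh_tm(llvm::Module &M, unsigned opt_level, code_model c_model, bool force_avx512,
                            bool slp_vectorize)
{
    // Build a per-call target machine.
    auto jtmb = create_jit_tmb(opt_level, c_model);
    auto tm = jtmb.createTargetMachine();
    if (!tm) {
        throw std::invalid_argument("Error creating the target machine");
    }

    // Run the optimisation pipeline on M.
    optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize);
}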
+
+// Helper to add a module to an lljt, throwing on error.
+void add_module_to_lljit(llvm::orc::LLJIT &lljit, std::unique_ptr<llvm::Module> m, llvm::orc::ThreadSafeContext ctx)
+{
+ auto err = lljit.addIRModule(llvm::orc::ThreadSafeModule(std::move(m), std::move(ctx)));
+
+ // LCOV_EXCL_START
+ if (err) {
+ std::string err_report;
+ llvm::raw_string_ostream ostr(err_report);
+
+ ostr << err;
+
+ throw std::invalid_argument(
+ fmt::format("The function for adding a module to the jit failed. The full error message:\n{}", ostr.str()));
+ }
+ // LCOV_EXCL_STOP
+}
+
+// Helper to fetch the bitcode from a module.
+std::string bc_from_module(llvm::Module &m)
+{
+ std::string out;
+ llvm::raw_string_ostream ostr(out);
+
+ llvm::WriteBitcodeToFile(m, ostr);
+
+ return std::move(ostr.str());
+}
+
+// Helper to fetch the textual IR from a module.
+std::string ir_from_module(llvm::Module &m)
+{
+ std::string out;
+ llvm::raw_string_ostream ostr(out);
+
+ m.print(ostr, nullptr);
+
+ return std::move(ostr.str());
+}
+
+// An implementation of llvm::MemoryBuffer offering a view over a std::string.
+class string_view_mem_buffer final : public llvm::MemoryBuffer
+{
+public:
+ explicit string_view_mem_buffer(const std::string &s)
+ {
+ // NOTE: the important bit here is from the LLVM docs:
+ //
+ // """
+ // In addition to basic access to the characters in the file, this interface
+ // guarantees you can read one character past the end of the file, and that
+ // this character will read as '\0'.
+ // """
+ //
+ // This is exactly the guarantee given by std::string:
+ //
+ // https://en.cppreference.com/w/cpp/string/basic_string/data
+ //
+ // Not sure about the third parameter to this function though, it does not
+ // seem to have any influence apart from debug checking:
+ //
+ // https://llvm.org/doxygen/MemoryBuffer_8cpp_source.html
+ this->init(s.data(), s.data() + s.size(), true);
+ }
+ // LCOV_EXCL_START
+ llvm::MemoryBuffer::BufferKind getBufferKind() const final
+ {
+ // Hopefully std::string is not memory-mapped...
+ return llvm::MemoryBuffer::BufferKind::MemoryBuffer_Malloc;
+ }
+ // LCOV_EXCL_STOP
+};
+
+// Helper to add an object file to the jit, throwing in case of errors.
+void add_obj_to_lljit(llvm::orc::LLJIT &lljit, const std::string &obj)
+{
+ // NOTE: an empty obj can happen when we are copying a compiled
+ // llvm_multi_state. In such a case, the object files of the individual
+ // states have all been empty-inited. We then need to avoid adding
+ // obj to the jit because that will result in an error.
+ if (obj.empty()) {
+ return;
+ }
+
+ // Add the object file.
+ auto err = lljit.addObjectFile(std::make_unique<string_view_mem_buffer>(obj));
+
+ // LCOV_EXCL_START
+ if (err) {
+ std::string err_report;
+ llvm::raw_string_ostream ostr(err_report);
+
+ ostr << err;
+
+ throw std::invalid_argument(fmt::format(
+ "The function for adding an object file to an lljit failed. The full error message:\n{}", ostr.str()));
+ }
+ // LCOV_EXCL_STOP
+}
+
+// Helper to verify a module, throwing if verification fails.
+void verify_module(const llvm::Module &m)
+{
+ std::string out;
+ llvm::raw_string_ostream ostr(out);
+
+ if (llvm::verifyModule(m, &ostr)) {
+ // LCOV_EXCL_START
+ throw std::runtime_error(fmt::format("The verification of the module '{}' produced an error:\n{}",
+ m.getModuleIdentifier(), ostr.str()));
+ // LCOV_EXCL_STOP
+ }
+}
+
} // namespace
// Helper function to fetch a const ref to a global object
@@ -371,116 +646,26 @@ struct llvm_state::jit {
std::unique_ptr<llvm::orc::ThreadSafeContext> m_ctx;
std::optional<std::string> m_object_file;
+ // NOTE: make sure to coordinate changes in this constructor with multi_jit.
explicit jit(unsigned opt_level, code_model c_model)
{
- // NOTE: we assume here the opt level has already been clamped
- // from the outside.
+ // NOTE: we assume here that the input arguments have
+ // been validated already.
assert(opt_level <= 3u);
+ assert(c_model >= code_model::tiny && c_model <= code_model::large);
// Ensure the native target is inited.
detail::init_native_target();
- // NOTE: codegen opt level changed in LLVM 18.
-#if LLVM_VERSION_MAJOR < 18
+ // Create the target machine builder.
+ auto jtmb = detail::create_jit_tmb(opt_level, c_model);
- using cg_opt_level = llvm::CodeGenOpt::Level;
-
-#else
-
- using cg_opt_level = llvm::CodeGenOptLevel;
-
-#endif
-
- // Create the target machine builder.
- auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost();
- // LCOV_EXCL_START
- if (!jtmb) {
- throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system");
- }
- // LCOV_EXCL_STOP
- // Set the codegen optimisation level.
- switch (opt_level) {
- case 0u:
- jtmb->setCodeGenOptLevel(cg_opt_level::None);
- break;
- case 1u:
- jtmb->setCodeGenOptLevel(cg_opt_level::Less);
- break;
- case 2u:
- jtmb->setCodeGenOptLevel(cg_opt_level::Default);
- break;
- default:
- assert(opt_level == 3u);
- jtmb->setCodeGenOptLevel(cg_opt_level::Aggressive);
- }
-
- // NOTE: not all code models are supported on all archs. We make an effort
- // here to prevent unsupported code models to be requested, as that will
- // result in the termination of the program.
- constexpr code_model supported_code_models[] = {
-#if defined(HEYOKA_ARCH_X86)
- code_model::small, code_model::kernel, code_model::medium, code_model::large
-#elif defined(HEYOKA_ARCH_ARM)
- code_model::tiny, code_model::small, code_model::large
-#elif defined(HEYOKA_ARCH_PPC)
- code_model::small, code_model::medium, code_model::large
-#else
- // NOTE: by default we assume only small and large are supported.
- code_model::small, code_model::large
-#endif
- };
-
- if (std::ranges::find(supported_code_models, c_model) == std::ranges::end(supported_code_models)) [[unlikely]] {
- throw std::invalid_argument(
- fmt::format("The code model '{}' is not supported on the current architecture", c_model));
- }
-
- // LCOV_EXCL_START
-
-#if LLVM_VERSION_MAJOR >= 17
- // NOTE: the code model setup is working only on LLVM>=19 (or at least
- // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug:
- //
- // https://github.com/llvm/llvm-project/issues/88115
- //
- // Additionally, there are indications from our CI that attempting to set
- // the code model before LLVM 17 might just be buggy, as we see widespread
- // ASAN failures all over the place. Thus, let us not do anything with the code
- // model setting before LLVM 17.
-
- // Setup the code model.
- switch (c_model) {
- case code_model::tiny:
- jtmb->setCodeModel(llvm::CodeModel::Tiny);
- break;
- case code_model::small:
- jtmb->setCodeModel(llvm::CodeModel::Small);
- break;
- case code_model::kernel:
- jtmb->setCodeModel(llvm::CodeModel::Kernel);
- break;
- case code_model::medium:
- jtmb->setCodeModel(llvm::CodeModel::Medium);
- break;
- case code_model::large:
- jtmb->setCodeModel(llvm::CodeModel::Large);
- break;
- default:
- // NOTE: we should never end up here.
- assert(false);
- ;
- }
-
-#endif
-
- // LCOV_EXCL_STOP
-
- // Create the jit builder.
- llvm::orc::LLJITBuilder lljit_builder;
- // NOTE: other settable properties may
- // be of interest:
- // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html
- lljit_builder.setJITTargetMachineBuilder(*jtmb);
+ // Create the jit builder.
+ llvm::orc::LLJITBuilder lljit_builder;
+ // NOTE: other settable properties may
+ // be of interest:
+ // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html
+ lljit_builder.setJITTargetMachineBuilder(jtmb);
// Create the jit.
auto lljit = lljit_builder.create();
@@ -531,7 +716,7 @@ struct llvm_state::jit {
// Keep a target machine around to fetch various
// properties of the host CPU.
- auto tm = jtmb->createTargetMachine();
+ auto tm = jtmb.createTargetMachine();
// LCOV_EXCL_START
if (!tm) {
throw std::invalid_argument("Error creating the target machine");
@@ -586,10 +771,6 @@ struct llvm_state::jit {
{
return m_tm->getTargetFeatureString().str();
}
- [[nodiscard]] llvm::TargetIRAnalysis get_target_ir_analysis() const
- {
- return m_tm->getTargetIRAnalysis();
- }
[[nodiscard]] const llvm::Triple &get_target_triple() const
{
return m_lljit->getTargetTriple();
@@ -597,19 +778,7 @@ struct llvm_state::jit {
void add_module(std::unique_ptr m) const
{
- auto err = m_lljit->addIRModule(llvm::orc::ThreadSafeModule(std::move(m), *m_ctx));
-
- // LCOV_EXCL_START
- if (err) {
- std::string err_report;
- llvm::raw_string_ostream ostr(err_report);
-
- ostr << err;
-
- throw std::invalid_argument(fmt::format(
- "The function for adding a module to the jit failed. The full error message:\n{}", ostr.str()));
- }
- // LCOV_EXCL_STOP
+ detail::add_module_to_lljit(*m_lljit, std::move(m), *m_ctx);
}
// Symbol lookup.
@@ -645,24 +814,12 @@ namespace detail
namespace
{
-// Helper to load object code into a jit.
+// Helper to load object code into the jit of an llvm_state.
template <typename Jit>
void llvm_state_add_obj_to_jit(Jit &j, std::string obj)
{
- llvm::SmallVector buffer(obj.begin(), obj.end());
- auto err = j.m_lljit->addObjectFile(std::make_unique(std::move(buffer)));
-
- // LCOV_EXCL_START
- if (err) {
- std::string err_report;
- llvm::raw_string_ostream ostr(err_report);
-
- ostr << err;
-
- throw std::invalid_argument(fmt::format(
- "The function for adding a compiled module to the jit failed. The full error message:\n{}", ostr.str()));
- }
- // LCOV_EXCL_STOP
+ // Add the object code to the lljit.
+ add_obj_to_lljit(*j.m_lljit, obj);
// Add the object code also to the
// m_object_file member.
@@ -677,11 +834,10 @@ void llvm_state_add_obj_to_jit(Jit &j, std::string obj)
// contained in the bitcode) because apparently llvm::parseBitcodeFile() discards the module
// name when parsing.
// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-auto llvm_state_bc_to_module(const std::string &module_name, const std::string &bc, llvm::LLVMContext &ctx)
+auto bc_to_module(const std::string &module_name, const std::string &bc, llvm::LLVMContext &ctx)
{
- // Create the corresponding memory buffer.
- auto mb = llvm::MemoryBuffer::getMemBuffer(bc);
- assert(mb);
+ // Create the corresponding memory buffer view on bc.
+ auto mb = std::make_unique<string_view_mem_buffer>(bc);
// Parse the bitcode.
auto ret = llvm::parseBitcodeFile(mb->getMemBufferRef(), ctx);
@@ -790,7 +946,7 @@ llvm_state::llvm_state(const llvm_state &other)
// module and builder. The IR/bitcode snapshots
// are left in their default-constructed (empty)
// state.
- m_module = detail::llvm_state_bc_to_module(m_module_name, other.get_bc(), context());
+ m_module = detail::bc_to_module(m_module_name, other.get_bc(), context());
// Create a new builder for the module.
m_builder = std::make_unique<ir_builder>(context());
@@ -837,23 +993,12 @@ llvm_state &llvm_state::operator=(llvm_state &&other) noexcept
return *this;
}
-llvm_state::~llvm_state()
-{
- // Sanity checks in debug mode.
- if (m_jitter) {
- if (is_compiled()) {
- assert(m_jitter->m_object_file);
- assert(!m_builder);
- } else {
- assert(!m_jitter->m_object_file);
- assert(m_builder);
- assert(m_ir_snapshot.empty());
- assert(m_bc_snapshot.empty());
- }
- }
-
- assert(m_opt_level <= 3u);
-}
+// NOTE: we used to have debug sanity checks here. However, in certain rare corner cases,
+// an invalid llvm_state could end up being destroyed, thus triggering assertion errors
+// in debug mode (this could happen for instance when resetting an llvm_state to the
+// def-cted state after an exception had been thrown during compilation). Thus, just
+// do not run the debug checks.
+llvm_state::~llvm_state() = default;
template <typename Archive>
void llvm_state::save_impl(Archive &ar, unsigned) const
@@ -997,7 +1142,7 @@ void llvm_state::load_impl(Archive &ar, unsigned version)
m_bc_snapshot.clear();
// Create the module from the bitcode.
- m_module = detail::llvm_state_bc_to_module(m_module_name, bc_snapshot, context());
+ m_module = detail::bc_to_module(m_module_name, bc_snapshot, context());
// Create a new builder for the module.
m_builder = std::make_unique<ir_builder>(context());
@@ -1142,189 +1287,15 @@ void llvm_state::verify_function(const std::string &name)
void llvm_state::optimise()
{
- check_uncompiled(__func__);
-
- // NOTE: don't run any optimisation pass at O0.
- if (m_opt_level == 0u) {
- return;
- }
-
- // NOTE: the logic here largely mimics (with a lot of simplifications)
- // the implementation of the 'opt' tool. See:
- // https://github.com/llvm/llvm-project/blob/release/10.x/llvm/tools/opt/opt.cpp
-
- // For every function in the module, setup its attributes
- // so that the codegen uses all the features available on
- // the host CPU.
- const auto cpu = m_jitter->get_target_cpu();
- const auto features = m_jitter->get_target_features();
-
- auto &ctx = context();
-
- for (auto &f : module()) {
- auto attrs = f.getAttributes();
-
- llvm::AttrBuilder
-#if LLVM_VERSION_MAJOR < 14
- new_attrs
-#else
- new_attrs(ctx)
-#endif
- ;
-
- if (!cpu.empty() && !f.hasFnAttribute("target-cpu")) {
- new_attrs.addAttribute("target-cpu", cpu);
- }
-
- if (!features.empty()) {
- auto old_features = f.getFnAttribute("target-features").getValueAsString();
-
- if (old_features.empty()) {
- new_attrs.addAttribute("target-features", features);
- } else {
- llvm::SmallString<256> appended(old_features);
- appended.push_back(',');
- appended.append(features);
- new_attrs.addAttribute("target-features", appended);
- }
- }
-
- // Let new_attrs override attrs.
-#if LLVM_VERSION_MAJOR < 14
- f.setAttributes(attrs.addAttributes(ctx, llvm::AttributeList::FunctionIndex, new_attrs));
-#else
- f.setAttributes(attrs.addFnAttributes(ctx, new_attrs));
-#endif
- }
-
- // Force usage of AVX512 registers, if requested.
- if (m_force_avx512 && detail::get_target_features().avx512f) {
- for (auto &f : module()) {
- f.addFnAttr("prefer-vector-width", "512");
- }
- }
-
-#if defined(HEYOKA_USE_NEW_LLVM_PASS_MANAGER)
-
- // NOTE: adapted from here:
- // https://llvm.org/docs/NewPassManager.html
-
- // Optimisation level for the module pass manager.
- // NOTE: the OptimizationLevel class has changed location
- // since LLVM 14.
-#if LLVM_VERSION_MAJOR >= 14
- using olevel = llvm::OptimizationLevel;
-#else
- using olevel = llvm::PassBuilder::OptimizationLevel;
-#endif
-
- // Create the analysis managers.
- llvm::LoopAnalysisManager LAM;
- llvm::FunctionAnalysisManager FAM;
- llvm::CGSCCAnalysisManager CGAM;
- llvm::ModuleAnalysisManager MAM;
-
- // NOTE: in the new pass manager, this seems to be the way to
- // set the target library info bits. See:
- // https://github.com/llvm/llvm-project/blob/b7fd30eac3183993806cc218b6deb39eb625c083/llvm/tools/opt/NewPMDriver.cpp#L408
- // Not sure if this matters, but we did it in the old pass manager
- // and opt does it too.
- llvm::TargetLibraryInfoImpl TLII(m_jitter->get_target_triple());
- FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); });
-
- // Create the new pass manager builder, passing
- // the native target machine from the JIT class.
- // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off
- // by default. Not sure why it is off, the LLVM docs imply this
- // is on by default at nonzero optimisation levels for clang and opt.
- // NOTE: the reason for this inconsistency is that opt uses PB.parsePassPipeline()
- // (instead of PB.buildPerModuleDefaultPipeline()) to set up the optimisation
- // pipeline. Indeed, if we replace PB.buildPerModuleDefaultPipeline(ol) with
- // PB.parsePassPipeline(MPM, "default") (which corresponds to invoking
- // "opt -passes='default'"), we do NOT need to set SLP vectorization on
- // here to get the SLP vectorizer. Not sure if we should consider switching to this
- // alternative way of setting up the optimisation pipeline in the future.
- llvm::PipelineTuningOptions pto;
- pto.SLPVectorization = m_slp_vectorize;
- llvm::PassBuilder PB(m_jitter->m_tm.get(), pto);
-
- // Register all the basic analyses with the managers.
- PB.registerModuleAnalyses(MAM);
- PB.registerCGSCCAnalyses(CGAM);
- PB.registerFunctionAnalyses(FAM);
- PB.registerLoopAnalyses(LAM);
- PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
-
- // Construct the optimisation level.
- olevel ol{};
-
- switch (m_opt_level) {
- case 1u:
- ol = olevel::O1;
- break;
- case 2u:
- ol = olevel::O2;
- break;
- default:
- assert(m_opt_level == 3u);
- ol = olevel::O3;
- }
-
- // Create the module pass manager.
- auto MPM = PB.buildPerModuleDefaultPipeline(ol);
-
- // Optimize the IR.
- MPM.run(*m_module, MAM);
-
-#else
-
- // Init the module pass manager.
- auto module_pm = std::make_unique();
- // These are passes which set up target-specific info
- // that are used by successive optimisation passes.
- auto tliwp = std::make_unique(
- llvm::TargetLibraryInfoImpl(m_jitter->get_target_triple()));
- module_pm->add(tliwp.release());
- module_pm->add(llvm::createTargetTransformInfoWrapperPass(m_jitter->get_target_ir_analysis()));
-
- // NOTE: not sure what this does, presumably some target-specifc
- // configuration.
- module_pm->add(static_cast(*m_jitter->m_tm).createPassConfig(*module_pm));
-
- // Init the function pass manager.
- auto f_pm = std::make_unique(m_module.get());
- f_pm->add(llvm::createTargetTransformInfoWrapperPass(m_jitter->get_target_ir_analysis()));
-
- // We use the helper class PassManagerBuilder to populate the module
- // pass manager with standard options.
- llvm::PassManagerBuilder pm_builder;
- // See here for the defaults:
- // https://llvm.org/doxygen/PassManagerBuilder_8cpp_source.html
- pm_builder.OptLevel = m_opt_level;
- // Enable function inlining.
- pm_builder.Inliner = llvm::createFunctionInliningPass(m_opt_level, 0, false);
- // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off
- // by default. Not sure why it is off, the LLVM docs imply this
- // is on by default at nonzero optimisation levels for clang and opt.
- pm_builder.SLPVectorize = m_slp_vectorize;
-
- m_jitter->m_tm->adjustPassManager(pm_builder);
-
- // Populate both the function pass manager and the module pass manager.
- pm_builder.populateFunctionPassManager(*f_pm);
- pm_builder.populateModulePassManager(*module_pm);
-
- // Run the function pass manager on all functions in the module.
- f_pm->doInitialization();
- for (auto &f : *m_module) {
- f_pm->run(f);
- }
- f_pm->doFinalization();
-
- // Run the module passes.
- module_pm->run(*m_module);
-
-#endif
+ // NOTE: we used to fetch the target triple from the lljit object,
+ // but recently we switched to asking the target triple directly
+ // from the target machine. Assert equality between the two for a while,
+ // just in case.
+ assert(m_jitter->m_lljit->getTargetTriple() == m_jitter->m_tm->getTargetTriple());
+ // NOTE: the target triple is also available in the module.
+ assert(m_jitter->m_lljit->getTargetTriple().str() == module().getTargetTriple());
+
+ detail::optimise_module(module(), *m_jitter->m_tm, m_opt_level, m_force_avx512, m_slp_vectorize);
}
namespace detail
@@ -1350,6 +1321,7 @@ void llvm_state::add_obj_trigger()
auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
assert(ft != nullptr);
auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, detail::obj_trigger_name, &module());
+ assert(f != nullptr);
bld.SetInsertPoint(llvm::BasicBlock::Create(context(), "entry", f));
bld.CreateRetVoid();
@@ -1377,9 +1349,38 @@ void llvm_state::compile_impl()
// Trigger object code materialisation via lookup.
jit_lookup(detail::obj_trigger_name);
+
assert(m_jitter->m_object_file);
}
+namespace detail
+{
+
+namespace
+{
+
+// Combine opt_level, force_avx512, slp_vectorize and c_model into a single flag.
+// NOTE: here we need:
+//
+// - 2 bits for opt_level,
+// - 1 bit for force_avx512 and slp_vectorize each,
+// - 3 bits for c_model,
+//
+// for a total of 7 bits.
+unsigned assemble_comp_flag(unsigned opt_level, bool force_avx512, bool slp_vectorize, code_model c_model)
+{
+ assert(opt_level <= 3u);
+ assert(static_cast<unsigned>(c_model) <= 7u);
+ static_assert(std::numeric_limits<unsigned>::digits >= 7u);
+
+ return opt_level + (static_cast<unsigned>(force_avx512) << 2) + (static_cast<unsigned>(slp_vectorize) << 3)
+ + (static_cast<unsigned>(c_model) << 4);
+}
+
+} // namespace
+
+} // namespace detail
+
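
A standalone worked example of the 7-bit packing implemented by assemble_comp_flag(). The code_model enum below is a local stand-in with assumed numeric values; only the bit layout mirrors the code above.

#include <cassert>

// Local stand-in for heyoka's code_model enum (values assumed).
enum class code_model : unsigned { tiny, small, kernel, medium, large };

// Same layout as assemble_comp_flag(): bits 0-1 opt_level, bit 2 force_avx512,
// bit 3 slp_vectorize, bits 4-6 code model.
unsigned pack(unsigned opt_level, bool force_avx512, bool slp_vectorize, code_model c_model)
{
    return opt_level + (static_cast<unsigned>(force_avx512) << 2) + (static_cast<unsigned>(slp_vectorize) << 3)
           + (static_cast<unsigned>(c_model) << 4);
}

int main()
{
    // opt_level 3, AVX512 forced, no SLP vectorisation, large code model:
    // 3 + (1 << 2) + (0 << 3) + (4 << 4) = 3 + 4 + 0 + 64 = 71.
    const auto flag = pack(3, true, false, code_model::large);
    assert(flag == 71u);

    // Each field can be recovered with shifts and masks.
    assert((flag & 0x3u) == 3u);        // opt_level
    assert(((flag >> 2) & 0x1u) == 1u); // force_avx512
    assert(((flag >> 3) & 0x1u) == 0u); // slp_vectorize
    assert(((flag >> 4) & 0x7u) == 4u); // code model
}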
// NOTE: we need to emphasise in the docs that compilation
// triggers an optimisation pass.
void llvm_state::compile()
@@ -1392,17 +1393,7 @@ void llvm_state::compile()
auto *logger = detail::get_logger();
// Run a verification on the module before compiling.
- {
- std::string out;
- llvm::raw_string_ostream ostr(out);
-
- if (llvm::verifyModule(*m_module, &ostr)) {
- // LCOV_EXCL_START
- throw std::runtime_error(
- fmt::format("The verification of the module '{}' produced an error:\n{}", m_module_name, ostr.str()));
- // LCOV_EXCL_STOP
- }
- }
+ detail::verify_module(*m_module);
logger->trace("module verification runtime: {}", sw);
@@ -1413,39 +1404,42 @@ void llvm_state::compile()
// to fix the module and re-attempt compilation without having
// altered the module and without having already added the trigger
// function.
+ // NOTE: this function does its own cleanup, no need to
+ // start the try catch block yet.
add_obj_trigger();
try {
// Fetch the bitcode *before* optimisation.
auto orig_bc = get_bc();
+ std::vector<std::string> obc;
+ obc.push_back(std::move(orig_bc));
- // Combine m_opt_level, m_force_avx512, m_slp_vectorize and m_c_model into a single value,
- // as they all affect codegen.
- // NOTE: here we need:
- // - 2 bits for m_opt_level,
- // - 1 bit for m_force_avx512 and m_slp_vectorize each,
- // - 3 bits for m_c_model,
- // for a total of 7 bits.
- assert(m_opt_level <= 3u);
- assert(static_cast(m_c_model) <= 7u);
- static_assert(std::numeric_limits::digits >= 7u);
- const auto olevel = m_opt_level + (static_cast(m_force_avx512) << 2)
- + (static_cast(m_slp_vectorize) << 3) + (static_cast(m_c_model) << 4);
-
- if (auto cached_data = detail::llvm_state_mem_cache_lookup(orig_bc, olevel)) {
+ // Assemble the compilation flag.
+ const auto comp_flag = detail::assemble_comp_flag(m_opt_level, m_force_avx512, m_slp_vectorize, m_c_model);
+
+ // Lookup in the cache.
+ if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) {
// Cache hit.
- // Assign the snapshots.
- m_ir_snapshot = std::move(cached_data->opt_ir);
- m_bc_snapshot = std::move(cached_data->opt_bc);
+ // Assign the optimised snapshots.
+ assert(cached_data->opt_ir.size() == 1u);
+ assert(cached_data->opt_bc.size() == 1u);
+ assert(cached_data->obj.size() == 1u);
+ m_ir_snapshot = std::move(cached_data->opt_ir[0]);
+ m_bc_snapshot = std::move(cached_data->opt_bc[0]);
// Clear out module and builder.
m_module.reset();
m_builder.reset();
// Assign the object file.
- detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj));
+ detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj[0]));
+
+ // Look up the trigger.
+ jit_lookup(detail::obj_trigger_name);
} else {
+ // Cache miss.
+
sw.reset();
// Run the optimisation pass.
@@ -1460,10 +1454,11 @@ void llvm_state::compile()
logger->trace("materialisation runtime: {}", sw);
- // Try to insert orig_bc into the cache.
- detail::llvm_state_mem_cache_try_insert(std::move(orig_bc), olevel,
- // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
- {m_bc_snapshot, m_ir_snapshot, *m_jitter->m_object_file});
+ // Try to insert obc into the cache.
+ detail::llvm_state_mem_cache_try_insert(
+ std::move(obc), comp_flag,
+ // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
+ {.opt_bc = {m_bc_snapshot}, .opt_ir = {m_ir_snapshot}, .obj = {*m_jitter->m_object_file}});
}
// LCOV_EXCL_START
} catch (...) {
@@ -1493,11 +1488,7 @@ std::uintptr_t llvm_state::jit_lookup(const std::string &name)
throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in the compiled module", name));
}
-#if LLVM_VERSION_MAJOR >= 15
return static_cast((*sym).getValue());
-#else
- return static_cast((*sym).getAddress());
-#endif
}
std::string llvm_state::get_ir() const
@@ -1505,12 +1496,7 @@ std::string llvm_state::get_ir() const
if (m_module) {
// The module has not been compiled yet,
// get the IR from it.
- std::string out;
- llvm::raw_string_ostream ostr(out);
-
- m_module->print(ostr, nullptr);
-
- return std::move(ostr.str());
+ return detail::ir_from_module(*m_module);
} else {
// The module has been compiled.
// Return the IR snapshot that
@@ -1524,12 +1510,7 @@ std::string llvm_state::get_bc() const
if (m_module) {
// The module has not been compiled yet,
// get the bitcode from it.
- std::string out;
- llvm::raw_string_ostream ostr(out);
-
- llvm::WriteBitcodeToFile(*m_module, ostr);
-
- return std::move(ostr.str());
+ return detail::bc_from_module(*m_module);
} else {
// The module has been compiled.
// Return the bitcode snapshot that
@@ -1610,4 +1591,812 @@ std::ostream &operator<<(std::ostream &os, const llvm_state &s)
return os << oss.str();
}
+namespace detail
+{
+
+namespace
+{
+
+// NOTE: this is a class similar in spirit to llvm_state, but set up for parallel
+// compilation of multiple modules.
+struct multi_jit {
+ // NOTE: this is the total number of modules, including
+ // the master module.
+ const unsigned m_n_modules = 0;
+ // NOTE: enumerate the LLVM members here in the same order
+ // as llvm_state, as this is important to ensure proper
+ // destruction order.
+ std::unique_ptr<llvm::orc::LLJIT> m_lljit;
+ std::unique_ptr<llvm::orc::ThreadSafeContext> m_ctx;
+ std::unique_ptr<llvm::Module> m_module;
+ std::unique_ptr<ir_builder> m_builder;
+ // Object files.
+ // NOTE: these may be modified concurrently during compilation,
+ // protect with mutex.
+ std::mutex m_object_files_mutex;
+ std::vector<std::string> m_object_files;
+ // IR and bc optimised snapshots.
+ // NOTE: these may be modified concurrently during compilation,
+ // protect with mutex.
+ std::mutex m_ir_bc_mutex;
+ std::vector<std::string> m_ir_snapshots;
+ std::vector<std::string> m_bc_snapshots;
+
+ explicit multi_jit(unsigned, unsigned, code_model, bool, bool);
+ multi_jit(const multi_jit &) = delete;
+ multi_jit(multi_jit &&) noexcept = delete;
+ multi_jit &operator=(const multi_jit &) = delete;
+ multi_jit &operator=(multi_jit &&) noexcept = delete;
+ ~multi_jit() = default;
+
+ // Helper to fetch the context from its thread-safe counterpart.
+ [[nodiscard]] llvm::LLVMContext &context() const noexcept
+ {
+ return *m_ctx->getContext();
+ }
+};
+
+#if 0
+
+// A task dispatcher class built on top of TBB's task group.
+class tbb_task_dispatcher : public llvm::orc::TaskDispatcher
+{
+ oneapi::tbb::task_group m_tg;
+
+public:
+ void dispatch(std::unique_ptr<llvm::orc::Task> T) override
+ {
+ m_tg.run([T = std::move(T)]() { T->run(); });
+ }
+ void shutdown() override
+ {
+ m_tg.wait();
+ }
+ ~tbb_task_dispatcher() noexcept
+ {
+ m_tg.wait();
+ }
+};
+
+#endif
+
+// Reserved identifier for the master module in an llvm_multi_state.
+constexpr auto master_module_name = "heyoka.master";
+
+// NOTE: this largely replicates the logic from the constructors of llvm_state and llvm_state::jit.
+// NOTE: make sure to coordinate changes in this constructor with llvm_state::jit.
+multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, bool force_avx512, bool slp_vectorize)
+ : m_n_modules(n_modules)
+{
+ assert(n_modules >= 2u);
+
+ // NOTE: we assume here that the input arguments have
+ // been validated already.
+ assert(opt_level <= 3u);
+ assert(c_model >= code_model::tiny && c_model <= code_model::large);
+
+ // Ensure the native target is inited.
+ init_native_target();
+
+ // Create the target machine builder.
+ auto jtmb = create_jit_tmb(opt_level, c_model);
+
+ // Create the jit builder.
+ llvm::orc::LLJITBuilder lljit_builder;
+ // NOTE: other settable properties may
+ // be of interest:
+ // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html
+ lljit_builder.setJITTargetMachineBuilder(jtmb);
+
+#if 0
+ // Create a task dispatcher.
+ auto tdisp = std::make_unique<tbb_task_dispatcher>();
+
+ // Create an ExecutorProcessControl.
+ auto epc = llvm::orc::SelfExecutorProcessControl::Create(nullptr, std::move(tdisp));
+ // LCOV_EXCL_START
+ if (!epc) {
+ auto err = epc.takeError();
+
+ std::string err_report;
+ llvm::raw_string_ostream ostr(err_report);
+
+ ostr << err;
+
+ throw std::invalid_argument(
+ fmt::format("Could not create a SelfExecutorProcessControl. The full error message is:\n{}", ostr.str()));
+ }
+ // LCOV_EXCL_STOP
+
+ // Set it in the lljit builder.
+ lljit_builder.setExecutorProcessControl(std::move(*epc));
+#else
+
+ // Set the number of compilation threads.
+ lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency());
+
+#endif
+
+ // Create the jit.
+ auto lljit = lljit_builder.create();
+ // LCOV_EXCL_START
+ if (!lljit) {
+ auto err = lljit.takeError();
+
+ std::string err_report;
+ llvm::raw_string_ostream ostr(err_report);
+
+ ostr << err;
+
+ throw std::invalid_argument(
+ fmt::format("Could not create an LLJIT object. The full error message is:\n{}", ostr.str()));
+ }
+ // LCOV_EXCL_STOP
+ m_lljit = std::move(*lljit);
+
+ // Setup the machinery to store the modules' binary code
+ // when it is generated.
+ m_lljit->getObjTransformLayer().setTransform([this](std::unique_ptr<llvm::MemoryBuffer> obj_buffer) {
+ assert(obj_buffer);
+
+ // Lock down for access to m_object_files.
+ std::lock_guard lock{m_object_files_mutex};
+
+ assert(m_object_files.size() <= m_n_modules);
+
+ // NOTE: this callback will be invoked the first time a jit lookup is performed,
+ // even if the object code was manually injected. In such a case, m_object_files
+ // has already been set up properly and we just sanity check in debug mode that
+ // one object file matches the content of obj_buffer.
+ if (m_object_files.size() < m_n_modules) {
+ // Add obj_buffer.
+ m_object_files.push_back(std::string(obj_buffer->getBufferStart(), obj_buffer->getBufferEnd()));
+ } else {
+ // Check that at least one buffer in m_object_files is exactly
+ // identical to obj_buffer.
+ assert(std::ranges::any_of(m_object_files, [&obj_buffer](const auto &cur) {
+ return obj_buffer->getBufferSize() == cur.size()
+ && std::equal(obj_buffer->getBufferStart(), obj_buffer->getBufferEnd(), cur.begin());
+ ;
+ }));
+ }
+
+ return llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>>(std::move(obj_buffer));
+ });
+
+ // Setup the machinery to run the optimisation passes on the modules.
+ m_lljit->getIRTransformLayer().setTransform(
+ [this, opt_level, force_avx512, slp_vectorize, c_model](llvm::orc::ThreadSafeModule TSM,
+ llvm::orc::MaterializationResponsibility &) {
+ // See here for an explanation of what withModuleDo() entails:
+ //
+ // https://groups.google.com/g/llvm-dev/c/QauU4L_bHac
+ //
+ // In our case, the locking/thread safety aspect is not important as we are not sharing
+ // contexts between threads. More references from discord:
+ //
+ // https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328
+ // https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892
+ TSM.withModuleDo([this, opt_level, force_avx512, slp_vectorize, c_model](llvm::Module &M) {
+ // NOTE: don't run any optimisation on the master module.
+ if (M.getModuleIdentifier() != master_module_name) {
+ // NOTE: running the optimisation passes requires mutable access to a target
+ // machine. Thus, we create a new target machine per thread in order to avoid likely data races
+ // with a shared target machine.
+
+ // Fetch a target machine builder.
+ auto jtmb = detail::create_jit_tmb(opt_level, c_model);
+
+ // Try creating the target machine.
+ auto tm = jtmb.createTargetMachine();
+ // LCOV_EXCL_START
+ if (!tm) [[unlikely]] {
+ throw std::invalid_argument("Error creating the target machine");
+ }
+ // LCOV_EXCL_STOP
+
+ // NOTE: we used to fetch the target triple from the lljit object,
+ // but recently we switched to asking the target triple directly
+ // from the target machine. Assert equality between the two for a while,
+ // just in case.
+ // NOTE: lljit.getTargetTriple() just returns a const ref to an internal
+ // object, it should be ok with concurrent invocation.
+ assert(m_lljit->getTargetTriple() == (*tm)->getTargetTriple());
+ // NOTE: the target triple is also available in the module.
+ assert(m_lljit->getTargetTriple().str() == M.getTargetTriple());
+
+ // Optimise the module.
+ detail::optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize);
+ } else {
+ ;
+ }
+
+ // Store the optimised bitcode/IR for this module.
+ auto bc_snap = detail::bc_from_module(M);
+ auto ir_snap = detail::ir_from_module(M);
+
+ // NOTE: protect for multi-threaded access.
+ std::lock_guard lock{m_ir_bc_mutex};
+
+ m_bc_snapshots.push_back(std::move(bc_snap));
+ m_ir_snapshots.push_back(std::move(ir_snap));
+ });
+
+ return llvm::Expected<llvm::orc::ThreadSafeModule>(std::move(TSM));
+ });
+
+ // Setup the jit so that it can look up symbols from the current process.
+ auto dlsg
+ = llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(m_lljit->getDataLayout().getGlobalPrefix());
+ // LCOV_EXCL_START
+ if (!dlsg) {
+ throw std::invalid_argument("Could not create the dynamic library search generator");
+ }
+ // LCOV_EXCL_STOP
+ m_lljit->getMainJITDylib().addGenerator(std::move(*dlsg));
+
+ // Create the master context.
+ m_ctx = std::make_unique<llvm::orc::ThreadSafeContext>(std::make_unique<llvm::LLVMContext>());
+
+ // Create the master module.
+ m_module = std::make_unique<llvm::Module>(master_module_name, context());
+ // Setup the data layout and the target triple.
+ m_module->setDataLayout(m_lljit->getDataLayout());
+ m_module->setTargetTriple(m_lljit->getTargetTriple().str());
+
+ // Create a new builder for the master module.
+ // NOTE: no need to mess around with fast math flags for this builder.
+ m_builder = std::make_unique<ir_builder>(context());
+}
+
+} // namespace
+
+} // namespace detail
+
+struct llvm_multi_state::impl {
+ std::vector<llvm_state> m_states;
+ std::unique_ptr<detail::multi_jit> m_jit;
+};
+
+llvm_multi_state::llvm_multi_state() = default;
+
+llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states_)
+{
+ // Fetch a const ref, as we want to make extra sure we do not modify
+ // states_ until we move it to construct the impl.
+ const auto &states = states_;
+
+ // We need at least 1 state.
+ if (states.empty()) [[unlikely]] {
+ throw std::invalid_argument("At least 1 llvm_state object is needed to construct an llvm_multi_state");
+ }
+
+ // All states must be uncompiled.
+ if (std::ranges::any_of(states, &llvm_state::is_compiled)) [[unlikely]] {
+ throw std::invalid_argument("An llvm_multi_state can be constructed only from uncompiled llvm_state objects");
+ }
+
+ // Module names must not collide with master_module_name.
+ if (std::ranges::any_of(states, [](const auto &s) { return s.module_name() == detail::master_module_name; }))
+ [[unlikely]] {
+ throw std::invalid_argument(
+ fmt::format("An invalid llvm_state was passed to the constructor of an llvm_multi_state: the module name "
+ "'{}' is reserved for internal use by llvm_multi_state",
+ detail::master_module_name));
+ }
+
+ // Settings in all states must be consistent.
+ auto states_differ = [](const llvm_state &s1, const llvm_state &s2) {
+ if (s1.get_opt_level() != s2.get_opt_level()) {
+ return true;
+ }
+
+ if (s1.fast_math() != s2.fast_math()) {
+ return true;
+ }
+
+ if (s1.force_avx512() != s2.force_avx512()) {
+ return true;
+ }
+
+ if (s1.get_slp_vectorize() != s2.get_slp_vectorize()) {
+ return true;
+ }
+
+ if (s1.get_code_model() != s2.get_code_model()) {
+ return true;
+ }
+
+ // NOTE: bit of paranoia here.
+ assert(s1.m_jitter->m_lljit->getDataLayout() == s2.m_jitter->m_lljit->getDataLayout());
+ assert(s1.m_jitter->get_target_triple() == s2.m_jitter->get_target_triple());
+ assert(s1.m_jitter->get_target_cpu() == s2.m_jitter->get_target_cpu());
+ assert(s1.m_jitter->get_target_features() == s2.m_jitter->get_target_features());
+
+ return false;
+ };
+
+ if (std::ranges::adjacent_find(states, states_differ) != states.end()) [[unlikely]] {
+ throw std::invalid_argument(
+ "Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state");
+ }
+
+ // Fetch settings from the first state.
+ const auto opt_level = states[0].get_opt_level();
+ const auto c_model = states[0].get_code_model();
+ const auto force_avx512 = states[0].force_avx512();
+ const auto slp_vectorize = states[0].get_slp_vectorize();
+
+ // Create the multi_jit.
+ auto jit = std::make_unique<detail::multi_jit>(boost::safe_numerics::safe<unsigned>(states.size()) + 1, opt_level,
+ c_model, force_avx512, slp_vectorize);
+
+ // Build and assign the implementation.
+ impl imp{.m_states = std::move(states_), .m_jit = std::move(jit)};
+ m_impl = std::make_unique<impl>(std::move(imp));
+}
+
+llvm_multi_state::llvm_multi_state(const llvm_multi_state &other)
+{
+ // NOTE: start off by creating a new jit and copying the states.
+ // This will work regardless of whether other is compiled or not.
+ // No need to do any validation on the states, as they are coming
+ // from an llvm_multi_state and they have already been checked.
+ impl imp{.m_states = other.m_impl->m_states,
+ .m_jit = std::make_unique<detail::multi_jit>(other.m_impl->m_jit->m_n_modules, other.get_opt_level(),
+ other.get_code_model(), other.force_avx512(),
+ other.get_slp_vectorize())};
+ m_impl = std::make_unique<impl>(std::move(imp));
+
+ if (other.is_compiled()) {
+ // 'other' was compiled.
+
+ // Reset builder and module.
+ m_impl->m_jit->m_module.reset();
+ m_impl->m_jit->m_builder.reset();
+
+ // Copy over the snapshots and the object files.
+ m_impl->m_jit->m_object_files = other.m_impl->m_jit->m_object_files;
+ m_impl->m_jit->m_ir_snapshots = other.m_impl->m_jit->m_ir_snapshots;
+ m_impl->m_jit->m_bc_snapshots = other.m_impl->m_jit->m_bc_snapshots;
+
+ // Add the files to the jit.
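+ // NOTE: this puts the copy into the same compiled state as 'other',
+ // so that symbols can be looked up without recompiling.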
+ for (const auto &obj : m_impl->m_jit->m_object_files) {
+ detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj);
+ }
+ } else {
+ // If 'other' was not compiled, we do not need to do anything - the
+ // copy construction of the states takes care of everything. I.e., this
+ // is basically the same as construction from a list of states.
+ // NOTE: regarding the master module: this is always created empty
+ // and it remains empty until compilation, thus we do not need to care
+ // about it if other is uncompiled - the new empty master module constructed
+ // with the jit is ok.
+ assert(other.m_impl->m_jit->m_object_files.empty());
+ assert(other.m_impl->m_jit->m_ir_snapshots.empty());
+ assert(other.m_impl->m_jit->m_bc_snapshots.empty());
+ }
+}
+
+llvm_multi_state::llvm_multi_state(llvm_multi_state &&) noexcept = default;
+
+llvm_multi_state &llvm_multi_state::operator=(const llvm_multi_state &other)
+{
+ if (this != &other) {
+ *this = llvm_multi_state(other);
+ }
+
+ return *this;
+}
+
+llvm_multi_state &llvm_multi_state::operator=(llvm_multi_state &&) noexcept = default;
+
+llvm_multi_state::~llvm_multi_state() = default;
+
+void llvm_multi_state::save(boost::archive::binary_oarchive &ar, unsigned) const
+{
+ // Start by establishing if the state is compiled.
+ const auto cmp = is_compiled();
+ ar << cmp;
+
+ // Store the states.
+ ar << m_impl->m_states;
+
+ // Store the object files and the snapshots. These may be empty.
+ ar << m_impl->m_jit->m_object_files;
+ ar << m_impl->m_jit->m_ir_snapshots;
+ ar << m_impl->m_jit->m_bc_snapshots;
+
+ // NOTE: no need to explicitly store the bitcode of the master
+ // module: if this is compiled, the master module is in the snapshots.
+ // Otherwise, the master module is empty and there's no need to
+ // store anything.
+}
+
+void llvm_multi_state::load(boost::archive::binary_iarchive &ar, unsigned)
+{
+ try {
+ // Load the compiled status flag from the archive.
+ // NOLINTNEXTLINE(misc-const-correctness)
+ bool cmp{};
+ ar >> cmp;
+
+ // Load the states.
+ ar >> m_impl->m_states;
+
+ // Reset the jit with a new one.
+ m_impl->m_jit = std::make_unique<detail::multi_jit>(
+ boost::safe_numerics::safe<unsigned>(m_impl->m_states.size()) + 1, get_opt_level(), get_code_model(),
+ force_avx512(), get_slp_vectorize());
+
+ // Load the object files and the snapshots.
+ ar >> m_impl->m_jit->m_object_files;
+ ar >> m_impl->m_jit->m_ir_snapshots;
+ ar >> m_impl->m_jit->m_bc_snapshots;
+
+ if (cmp) {
+ // If the stored state was compiled, we need to reset
+ // master builder and module. Otherwise, the empty default-constructed
+ // master module is ok (the master module remains empty until compilation
+ // is triggered).
+ m_impl->m_jit->m_module.reset();
+ m_impl->m_jit->m_builder.reset();
+
+ // We also need to add all the object files to the jit.
+ for (const auto &obj : m_impl->m_jit->m_object_files) {
+ detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj);
+ }
+ }
+
+ // Debug checks.
+ assert((m_impl->m_jit->m_object_files.empty() && !cmp)
+ || m_impl->m_jit->m_object_files.size() == m_impl->m_jit->m_n_modules);
+ assert((m_impl->m_jit->m_object_files.empty() && !cmp)
+ || m_impl->m_jit->m_ir_snapshots.size() == m_impl->m_jit->m_n_modules);
+ assert((m_impl->m_jit->m_object_files.empty() && !cmp)
+ || m_impl->m_jit->m_bc_snapshots.size() == m_impl->m_jit->m_n_modules);
+
+ // LCOV_EXCL_START
+ } catch (...) {
+ m_impl.reset();
+
+ throw;
+ }
+ // LCOV_EXCL_STOP
+}
+
+void llvm_multi_state::add_obj_triggers()
+{
+ // NOTE: the idea here is that we add one trigger function per module, and then
+ // we invoke all the trigger functions from a trigger function in the master module.
+ // Like this, we ensure materialisation of all modules when we lookup the
+ // master trigger.
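+ //
+ // Schematically, once this function has run the master module contains
+ //
+ //   void <obj_trigger_name>() {
+ //       <obj_trigger_name>_0();
+ //       <obj_trigger_name>_1();
+ //       ...
+ //   }
+ //
+ // while the i-th state's module defines an empty <obj_trigger_name>_i()
+ // (the actual names are built from detail::obj_trigger_name).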
+
+ // Implement the per-module triggers.
+ for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) {
+ // Fetch builder/module/context for the current state.
+ auto &bld = m_impl->m_states[i].builder();
+ auto &md = m_impl->m_states[i].module();
+ auto &ctx = m_impl->m_states[i].context();
+
+ // The function name.
+ const auto fname = fmt::format("{}_{}", detail::obj_trigger_name, i);
+
+ auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
+ assert(ft != nullptr);
+ auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, fname.c_str(), &md);
+ assert(f != nullptr);
+
+ bld.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", f));
+ bld.CreateRetVoid();
+ }
+
+ // Fetch the master builder/module/context.
+ auto &bld = *m_impl->m_jit->m_builder;
+ auto &md = *m_impl->m_jit->m_module;
+ auto &ctx = m_impl->m_jit->context();
+
+ // Add the prototypes of all per-module trigger functions to the master module.
+ std::vector<llvm::Function *> callees;
+ callees.reserve(m_impl->m_states.size());
+ for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) {
+ // The function name.
+ const auto fname = fmt::format("{}_{}", detail::obj_trigger_name, i);
+
+ auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
+ assert(ft != nullptr);
+ auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, fname.c_str(), &md);
+ assert(f != nullptr);
+
+ callees.push_back(f);
+ }
+
+ // Create the master trigger function.
+ auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false);
+ assert(ft != nullptr);
+ auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, detail::obj_trigger_name, &md);
+ assert(f != nullptr);
+
+ bld.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", f));
+
+ // Invoke all the triggers.
+ for (auto *tf : callees) {
+ bld.CreateCall(tf, {});
+ }
+
+ // Return.
+ bld.CreateRetVoid();
+}
+
+void llvm_multi_state::check_compiled(const char *f) const
+{
+ if (m_impl->m_jit->m_module) [[unlikely]] {
+ throw std::invalid_argument(
+ fmt::format("The function '{}' can be invoked only after the llvm_multi_state has been compiled", f));
+ }
+}
+
+void llvm_multi_state::check_uncompiled(const char *f) const
+{
+ if (!m_impl->m_jit->m_module) [[unlikely]] {
+ throw std::invalid_argument(
+ fmt::format("The function '{}' can be invoked only if the llvm_multi_state has not been compiled yet", f));
+ }
+}
+
+unsigned llvm_multi_state::get_n_modules() const noexcept
+{
+ return m_impl->m_jit->m_n_modules;
+}
+
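+// NOTE: the settings getters below read their values from the first state:
+// the constructor guarantees that these settings are consistent across all the states.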
+unsigned llvm_multi_state::get_opt_level() const noexcept
+{
+ return m_impl->m_states[0].get_opt_level();
+}
+
+bool llvm_multi_state::fast_math() const noexcept
+{
+ return m_impl->m_states[0].fast_math();
+}
+
+bool llvm_multi_state::force_avx512() const noexcept
+{
+ return m_impl->m_states[0].force_avx512();
+}
+
+bool llvm_multi_state::get_slp_vectorize() const noexcept
+{
+ return m_impl->m_states[0].get_slp_vectorize();
+}
+
+code_model llvm_multi_state::get_code_model() const noexcept
+{
+ return m_impl->m_states[0].get_code_model();
+}
+
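+// NOTE: the compiled state is signalled by the absence of the master module,
+// which is always consumed (moved into the jit or reset) by the compilation process.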
+bool llvm_multi_state::is_compiled() const noexcept
+{
+ return !m_impl->m_jit->m_module;
+}
+
+std::vector<std::string> llvm_multi_state::get_ir() const
+{
+ if (is_compiled()) {
+ return m_impl->m_jit->m_ir_snapshots;
+ } else {
+ std::vector<std::string> retval;
+ retval.reserve(m_impl->m_jit->m_n_modules);
+
+ for (const auto &s : m_impl->m_states) {
+ retval.push_back(s.get_ir());
+ }
+
+ // Add the IR from the master module.
+ retval.push_back(detail::ir_from_module(*m_impl->m_jit->m_module));
+
+ return retval;
+ }
+}
+
+std::vector<std::string> llvm_multi_state::get_bc() const
+{
+ if (is_compiled()) {
+ return m_impl->m_jit->m_bc_snapshots;
+ } else {
+ std::vector<std::string> retval;
+ retval.reserve(m_impl->m_jit->m_n_modules);
+
+ for (const auto &s : m_impl->m_states) {
+ retval.push_back(s.get_bc());
+ }
+
+ // Add the bitcode from the master module.
+ retval.push_back(detail::bc_from_module(*m_impl->m_jit->m_module));
+
+ return retval;
+ }
+}
+
+const std::vector<std::string> &llvm_multi_state::get_object_code() const
+{
+ check_compiled(__func__);
+
+ return m_impl->m_jit->m_object_files;
+}
+
+// NOTE: this function is NOT exception-safe; proper cleanup
+// needs to be done externally if needed.
+void llvm_multi_state::compile_impl()
+{
+ // Add all the modules from the states.
+ for (auto &s : m_impl->m_states) {
+ detail::add_module_to_lljit(*m_impl->m_jit->m_lljit, std::move(s.m_module), *s.m_jitter->m_ctx);
+
+ // Clear out the builder.
+ s.m_builder.reset();
+
+ // NOTE: need to manually construct the object file, as this would
+ // normally be done by the invocation of s.compile() (which we do not do).
+ s.m_jitter->m_object_file.emplace();
+ }
+
+ // Add the master module.
+ detail::add_module_to_lljit(*m_impl->m_jit->m_lljit, std::move(m_impl->m_jit->m_module), *m_impl->m_jit->m_ctx);
+
+ // Clear out the master builder.
+ m_impl->m_jit->m_builder.reset();
+
+ // Trigger optimisation and object code materialisation via lookup.
+ jit_lookup(detail::obj_trigger_name);
+
+ // Sanity checks.
+ assert(m_impl->m_jit->m_bc_snapshots.size() == m_impl->m_jit->m_n_modules);
+ assert(m_impl->m_jit->m_ir_snapshots.size() == m_impl->m_jit->m_n_modules);
+ assert(m_impl->m_jit->m_object_files.size() == m_impl->m_jit->m_n_modules);
+}
+
+void llvm_multi_state::compile()
+{
+ check_uncompiled(__func__);
+
+ // Log runtime in trace mode.
+ spdlog::stopwatch sw;
+
+ auto *logger = detail::get_logger();
+
+ // Verify the modules before compiling.
+ // NOTE: probably this can be parallelised if needed.
+ for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) {
+ detail::verify_module(*m_impl->m_states[i].m_module);
+ }
+
+ logger->trace("llvm_multi_state module verification runtime: {}", sw);
+
+ try {
+ // Add the object materialisation trigger functions.
+ // NOTE: contrary to llvm_state::add_obj_trigger(), add_obj_triggers()
+ // does not implement any automatic cleanup in case of errors. Thus, we fold
+ // it into the try/catch block in order to avoid leaving the
+ // llvm_multi_state in a half-baked state.
+ add_obj_triggers();
+
+ // Fetch the bitcode *before* optimisation.
+ std::vector<std::string> obc;
+ obc.reserve(boost::safe_numerics::safe<decltype(obc.size())>(m_impl->m_states.size()) + 1u);
+ for (const auto &s : m_impl->m_states) {
+ obc.push_back(s.get_bc());
+ }
+ // Add the master bitcode.
+ obc.push_back(detail::bc_from_module(*m_impl->m_jit->m_module));
+
+ // Assemble the compilation flag.
+ const auto comp_flag
+ = detail::assemble_comp_flag(get_opt_level(), force_avx512(), get_slp_vectorize(), get_code_model());
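+ // NOTE: comp_flag packs the optimisation level, the AVX512/SLP flags and the
+ // code model into a single integer. Together with the unoptimised bitcode,
+ // it forms the key into the in-memory compilation cache.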
+
+ // Lookup in the cache.
+ if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) {
+ // Cache hit.
+
+ // Assign the optimised snapshots.
+ assert(cached_data->opt_ir.size() == m_impl->m_jit->m_n_modules);
+ assert(cached_data->opt_bc.size() == m_impl->m_jit->m_n_modules);
+ assert(cached_data->obj.size() == m_impl->m_jit->m_n_modules);
+ assert(m_impl->m_jit->m_ir_snapshots.empty());
+ assert(m_impl->m_jit->m_bc_snapshots.empty());
+ m_impl->m_jit->m_ir_snapshots = std::move(cached_data->opt_ir);
+ m_impl->m_jit->m_bc_snapshots = std::move(cached_data->opt_bc);
+
+ // NOTE: here it is important that we replicate the logic happening
+ // in llvm_state::compile(): clear out module/builder, construct
+ // the object file. The snapshots can be left empty.
+ for (auto &s : m_impl->m_states) {
+ s.m_module.reset();
+ s.m_builder.reset();
+ s.m_jitter->m_object_file.emplace();
+ }
+
+ // Clear out master module and builder.
+ m_impl->m_jit->m_module.reset();
+ m_impl->m_jit->m_builder.reset();
+
+ // Add and assign the object files.
+ for (const auto &obj : cached_data->obj) {
+ detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj);
+ }
+
+ // Assign the compiled objects.
+ assert(m_impl->m_jit->m_object_files.empty());
+ m_impl->m_jit->m_object_files = std::move(cached_data->obj);
+
+ // Lookup the trigger.
+ jit_lookup(detail::obj_trigger_name);
+ } else {
+ // Cache miss.
+
+ sw.reset();
+
+ // Run the compilation.
+ compile_impl();
+
+ logger->trace("optimisation + materialisation runtime: {}", sw);
+
+ // NOTE: at this point, m_ir_snapshots, m_bc_snapshots and m_object_files
+ // have all been constructed in random order because of multithreading.
+ // Sort them so that we provide deterministic behaviour. This is probably
+ // not strictly needed, but let's try to avoid nondeterminism.
+ // All of this can be parallelised if needed.
+ std::ranges::sort(m_impl->m_jit->m_ir_snapshots);
+ std::ranges::sort(m_impl->m_jit->m_bc_snapshots);
+ std::ranges::sort(m_impl->m_jit->m_object_files);
+
+ // Try to insert obc into the cache.
+ detail::llvm_state_mem_cache_try_insert(std::move(obc), comp_flag,
+ {.opt_bc = m_impl->m_jit->m_bc_snapshots,
+ .opt_ir = m_impl->m_jit->m_ir_snapshots,
+ .obj = m_impl->m_jit->m_object_files});
+ // LCOV_EXCL_START
+ }
+ } catch (...) {
+ // Reset to a def-cted state in case of error,
+ // as it looks like there's no way of recovering.
+ m_impl.reset();
+
+ throw;
+ }
+ // LCOV_EXCL_STOP
+}
+
+std::uintptr_t llvm_multi_state::jit_lookup(const std::string &name)
+{
+ check_compiled(__func__);
+
+ auto sym = m_impl->m_jit->m_lljit->lookup(name);
+ if (!sym) {
+ throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in an llvm_multi_state", name));
+ }
+
+ return static_cast<std::uintptr_t>((*sym).getValue());
+}
+
+std::ostream &operator<<(std::ostream &os, const llvm_multi_state &s)
+{
+ std::ostringstream oss;
+ oss << std::boolalpha;
+
+ oss << "N of modules : " << s.get_n_modules() << '\n';
+ oss << "Compiled : " << s.is_compiled() << '\n';
+ oss << "Fast math : " << s.fast_math() << '\n';
+ oss << "Force AVX512 : " << s.force_avx512() << '\n';
+ oss << "SLP vectorization : " << s.get_slp_vectorize() << '\n';
+ oss << "Code model : " << s.get_code_model() << '\n';
+ oss << "Optimisation level: " << s.get_opt_level() << '\n';
+ oss << "Data layout : " << s.m_impl->m_states[0].m_jitter->m_lljit->getDataLayout().getStringRepresentation()
+ << '\n';
+ oss << "Target triple : " << s.m_impl->m_states[0].m_jitter->get_target_triple().str() << '\n';
+ oss << "Target CPU : " << s.m_impl->m_states[0].m_jitter->get_target_cpu() << '\n';
+ oss << "Target features : " << s.m_impl->m_states[0].m_jitter->get_target_features() << '\n';
+
+ return os << oss.str();
+}
+
HEYOKA_END_NAMESPACE
diff --git a/src/llvm_state_mem_cache.cpp b/src/llvm_state_mem_cache.cpp
index e3135ae25..88ba8ea0f 100644
--- a/src/llvm_state_mem_cache.cpp
+++ b/src/llvm_state_mem_cache.cpp
@@ -17,7 +17,7 @@
#include
#include
#include
-#include
+#include
#include
#include
@@ -28,7 +28,8 @@
#include
// This in-memory cache maps the bitcode
-// of an LLVM module and an optimisation level to:
+// of one or more LLVM modules and an integer flag
+// (representing several compilation settings) to:
//
// - the optimised version of the bitcode,
// - the textual IR corresponding
@@ -43,6 +44,26 @@ HEYOKA_BEGIN_NAMESPACE
namespace detail
{
+// Helper to compute the total size in bytes
+// of the data contained in an llvm_mc_value.
+// Will throw on overflow.
+std::size_t llvm_mc_value::total_size() const
+{
+ assert(!opt_bc.empty());
+ assert(opt_bc.size() == opt_ir.size());
+ assert(opt_bc.size() == obj.size());
+
+ boost::safe_numerics::safe<std::size_t> ret = 0;
+
+ for (decltype(opt_bc.size()) i = 0; i < opt_bc.size(); ++i) {
+ ret += opt_bc[i].size();
+ ret += opt_ir[i].size();
+ ret += obj[i].size();
+ }
+
+ return ret;
+}
+
namespace
{
@@ -56,16 +77,33 @@ HEYOKA_CONSTINIT
std::mutex mem_cache_mutex;
// Definition of the data structures for the cache.
-using lru_queue_t = std::list<std::pair<std::string, unsigned>>;
+using lru_queue_t = std::list<std::pair<std::vector<std::string>, unsigned>>;
using lru_key_t = lru_queue_t::iterator;
+// Implementation of hashing for std::pair<std::vector<std::string>, unsigned> and
+// its heterogeneous counterpart.
+template <typename T>
+auto cache_key_hasher(const T &k) noexcept
+{
+ assert(!k.first.empty());
+
+ // Combine the bitcodes.
+ auto seed = std::hash<std::string>{}(k.first[0]);
+ for (decltype(k.first.size()) i = 1; i < k.first.size(); ++i) {
+ boost::hash_combine(seed, k.first[i]);
+ }
+
+ // Combine with the compilation flag.
+ boost::hash_combine(seed, static_cast<std::size_t>(k.second));
+
+ return seed;
+}
+
struct lru_hasher {
std::size_t operator()(const lru_key_t &k) const noexcept
{
- auto seed = std::hash<std::string>{}(k->first);
- boost::hash_combine(seed, k->second);
- return seed;
+ return cache_key_hasher(*k);
}
};
@@ -96,16 +134,16 @@ HEYOKA_CONSTINIT std::uint64_t mem_cache_limit = 2147483648ull;
// Machinery for heterogeneous lookup into the cache.
// NOTE: this function MUST be invoked while holding the global lock.
-auto llvm_state_mem_cache_hl(const std::string &bc, unsigned opt_level)
+auto llvm_state_mem_cache_hl(const std::vector<std::string> &bc, unsigned comp_flag)
{
- using compat_key_t = std::pair<const std::string &, unsigned>;
+ // NOTE: the heterogeneous version of the key replaces std::vector<std::string>
+ // with a const reference.
+ using compat_key_t = std::pair<const std::vector<std::string> &, unsigned>;
struct compat_hasher {
std::size_t operator()(const compat_key_t &k) const noexcept
{
- auto seed = std::hash<std::string>{}(k.first);
- boost::hash_combine(seed, k.second);
- return seed;
+ return cache_key_hasher(k);
}
};
@@ -120,7 +158,7 @@ auto llvm_state_mem_cache_hl(const std::string &bc, unsigned opt_level)
}
};
- return lru_map.find(std::make_pair(std::cref(bc), opt_level), compat_hasher{}, compat_cmp{});
+ return lru_map.find(std::make_pair(std::cref(bc), comp_flag), compat_hasher{}, compat_cmp{});
}
// Debug function to run sanity checks on the cache.
@@ -131,15 +169,13 @@ void llvm_state_mem_cache_sanity_checks()
// Check that the computed size of the cache is consistent with mem_cache_size.
assert(std::accumulate(lru_map.begin(), lru_map.end(), boost::safe_numerics::safe<std::uint64_t>(0),
- [](const auto &a, const auto &p) {
- return a + p.second.opt_bc.size() + p.second.opt_ir.size() + p.second.obj.size();
- })
+ [](const auto &a, const auto &p) { return a + p.second.total_size(); })
== mem_cache_size);
}
} // namespace
-std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::string &bc, unsigned opt_level)
+std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::vector<std::string> &bc, unsigned comp_flag)
{
// Lock down.
const std::lock_guard lock(mem_cache_mutex);
@@ -147,7 +183,7 @@ std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::string &bc,
// Sanity checks.
llvm_state_mem_cache_sanity_checks();
- if (const auto it = llvm_state_mem_cache_hl(bc, opt_level); it == lru_map.end()) {
+ if (const auto it = llvm_state_mem_cache_hl(bc, comp_flag); it == lru_map.end()) {
// Cache miss.
return {};
} else {
@@ -163,7 +199,7 @@ std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::string &bc,
}
}
-void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc_value val)
+void llvm_state_mem_cache_try_insert(std::vector<std::string> bc, unsigned comp_flag, llvm_mc_value val)
{
// Lock down.
const std::lock_guard lock(mem_cache_mutex);
@@ -174,7 +210,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc
// Do a first lookup to check if bc is already in the cache.
// This could happen, e.g., if two threads are compiling the same
// code concurrently.
- if (const auto it = llvm_state_mem_cache_hl(bc, opt_level); it != lru_map.end()) {
+ if (const auto it = llvm_state_mem_cache_hl(bc, comp_flag); it != lru_map.end()) {
assert(val.opt_bc == it->second.opt_bc);
assert(val.opt_ir == it->second.opt_ir);
assert(val.obj == it->second.obj);
@@ -183,8 +219,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc
}
// Compute the new cache size.
- auto new_cache_size = static_cast<std::uint64_t>(boost::safe_numerics::safe<std::uint64_t>(mem_cache_size)
- + val.opt_bc.size() + val.opt_ir.size() + val.obj.size());
+ auto new_cache_size = boost::safe_numerics::safe<std::uint64_t>(mem_cache_size) + val.total_size();
// Remove items from the cache if we are exceeding
// the limit.
@@ -195,8 +230,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc
const auto &cur_val = cur_it->second;
// NOTE: no possibility of overflow here, as cur_size is guaranteed
// not to be greater than mem_cache_size.
- const auto cur_size
- = static_cast<std::uint64_t>(cur_val.opt_bc.size()) + cur_val.opt_ir.size() + cur_val.obj.size();
+ const auto cur_size = cur_val.total_size();
// NOTE: the next 4 lines cannot throw, which ensures that the
// cache cannot be left in an inconsistent state.
@@ -222,7 +256,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc
// Add the new item to the front of the queue.
// NOTE: if this throws, we have not modified lru_map yet,
// no cleanup needed.
- lru_queue.emplace_front(std::move(bc), opt_level);
+ lru_queue.emplace_front(std::move(bc), comp_flag);
// Add the new item to the map.
try {
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 09e360cab..f6c4ccd29 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -159,6 +159,7 @@ ADD_HEYOKA_TESTCASE(cfunc)
ADD_HEYOKA_TESTCASE(cfunc_multieval)
ADD_HEYOKA_TESTCASE(dfun)
ADD_HEYOKA_TESTCASE(var_ode_sys)
+ADD_HEYOKA_TESTCASE(llvm_multi_state)
if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR)
ADD_HEYOKA_TESTCASE(event_detection_mp)
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
new file mode 100644
index 000000000..652ab5b56
--- /dev/null
+++ b/test/llvm_multi_state.cpp
@@ -0,0 +1,578 @@
+// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "catch.hpp"
+
+using namespace heyoka;
+
+TEST_CASE("basic")
+{
+ using Catch::Matchers::Message;
+
+ // Default construction.
+ {
+ REQUIRE_NOTHROW(llvm_multi_state{});
+ }
+
+ // No states in input.
+ REQUIRE_THROWS_MATCHES(llvm_multi_state{{}}, std::invalid_argument,
+ Message("At least 1 llvm_state object is needed to construct an llvm_multi_state"));
+
+ // Inconsistent settings.
+ REQUIRE_THROWS_MATCHES(
+ (llvm_multi_state{{llvm_state{kw::opt_level = 1u}, llvm_state{kw::opt_level = 2u}}}), std::invalid_argument,
+ Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+
+ REQUIRE_THROWS_MATCHES(
+ (llvm_multi_state{{llvm_state{kw::fast_math = true}, llvm_state{}}}), std::invalid_argument,
+ Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+
+ REQUIRE_THROWS_MATCHES(
+ (llvm_multi_state{{llvm_state{}, llvm_state{kw::force_avx512 = true}}}), std::invalid_argument,
+ Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+
+ REQUIRE_THROWS_MATCHES(
+ (llvm_multi_state{{llvm_state{}, llvm_state{}, llvm_state{kw::slp_vectorize = true}}}), std::invalid_argument,
+ Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+ REQUIRE_THROWS_MATCHES(
+ (llvm_multi_state{{llvm_state{}, llvm_state{kw::code_model = code_model::large}, llvm_state{}}}),
+ std::invalid_argument,
+ Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"));
+
+ {
+ // Construction from compiled modules.
+ llvm_state s;
+ s.compile();
+
+ REQUIRE_THROWS_MATCHES(
+ (llvm_multi_state{{s, llvm_state{}}}), std::invalid_argument,
+ Message("An llvm_multi_state can be constructed only from uncompiled llvm_state objects"));
+ REQUIRE_THROWS_MATCHES(
+ (llvm_multi_state{{llvm_state{}, s}}), std::invalid_argument,
+ Message("An llvm_multi_state can be constructed only from uncompiled llvm_state objects"));
+ }
+
+ {
+ // Invalid module name.
+ llvm_state s{kw::mname = "heyoka.master"};
+ REQUIRE_THROWS_MATCHES(
+ (llvm_multi_state{{s, llvm_state{}}}), std::invalid_argument,
+ Message("An invalid llvm_state was passed to the constructor of an llvm_multi_state: the module name "
+ "'heyoka.master' is reserved for internal use by llvm_multi_state"));
+ }
+
+ // Test the property getters.
+ {
+ llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true,
+ kw::code_model = code_model::large};
+
+ llvm_multi_state ms{{s, s, s, s}};
+
+ REQUIRE(ms.get_opt_level() == 1u);
+ REQUIRE(ms.fast_math());
+ REQUIRE(ms.force_avx512());
+ REQUIRE(ms.get_slp_vectorize());
+ REQUIRE(ms.get_code_model() == code_model::large);
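+ // NOTE: 4 states + the internal master module.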
+ REQUIRE(ms.get_n_modules() == 5u);
+ REQUIRE(!ms.is_compiled());
+
+ ms.compile();
+
+ REQUIRE(ms.is_compiled());
+ REQUIRE(ms.get_opt_level() == 1u);
+ REQUIRE(ms.fast_math());
+ REQUIRE(ms.force_avx512());
+ REQUIRE(ms.get_slp_vectorize());
+ REQUIRE(ms.get_code_model() == code_model::large);
+ REQUIRE(ms.get_n_modules() == 5u);
+
+ REQUIRE_THROWS_MATCHES(
+ ms.compile(), std::invalid_argument,
+ Message("The function 'compile' can be invoked only if the llvm_multi_state has not been compiled yet"));
+ }
+
+ // Move construction/assignment.
+ {
+ llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true,
+ kw::code_model = code_model::large};
+
+ llvm_multi_state ms{{s, s, s, s}};
+
+ auto ms2 = std::move(ms);
+
+ REQUIRE(ms2.get_opt_level() == 1u);
+ REQUIRE(ms2.fast_math());
+ REQUIRE(ms2.force_avx512());
+ REQUIRE(ms2.get_slp_vectorize());
+ REQUIRE(ms2.get_code_model() == code_model::large);
+ REQUIRE(ms2.get_n_modules() == 5u);
+ REQUIRE(!ms2.is_compiled());
+
+ ms2.compile();
+
+ llvm_multi_state ms3;
+ ms3 = std::move(ms2);
+
+ REQUIRE(ms3.is_compiled());
+ REQUIRE(ms3.get_opt_level() == 1u);
+ REQUIRE(ms3.fast_math());
+ REQUIRE(ms3.force_avx512());
+ REQUIRE(ms3.get_slp_vectorize());
+ REQUIRE(ms3.get_code_model() == code_model::large);
+ REQUIRE(ms3.get_n_modules() == 5u);
+ }
+}
+
+TEST_CASE("copy semantics")
+{
+ using Catch::Matchers::Message;
+
+ // NOTE: in order to properly test this, we have to disable the cache.
+ llvm_state::clear_memcache();
+ llvm_state::set_memcache_limit(0);
+
+ auto [x, y] = make_vars("x", "y");
+
+ llvm_state s1, s2;
+
+ add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
+ add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
+
+ llvm_multi_state ms{{s1, s2}};
+
+ auto ms_copy = ms;
+
+ REQUIRE(ms_copy.get_bc() == ms.get_bc());
+ REQUIRE(ms_copy.get_ir() == ms.get_ir());
+ REQUIRE(ms_copy.is_compiled() == ms.is_compiled());
+ REQUIRE(ms_copy.fast_math() == ms.fast_math());
+ REQUIRE(ms_copy.force_avx512() == ms.force_avx512());
+ REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
+ REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
+ REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+ REQUIRE_THROWS_MATCHES(
+ ms_copy.get_object_code(), std::invalid_argument,
+ Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled"));
+ REQUIRE_THROWS_MATCHES(
+ ms_copy.jit_lookup("foo"), std::invalid_argument,
+ Message("The function 'jit_lookup' can be invoked only after the llvm_multi_state has been compiled"));
+
+ ms.compile();
+ ms_copy.compile();
+
+ REQUIRE(ms_copy.get_bc() == ms.get_bc());
+ REQUIRE(ms_copy.get_ir() == ms.get_ir());
+ REQUIRE(ms_copy.get_object_code() == ms.get_object_code());
+ REQUIRE(ms_copy.is_compiled() == ms.is_compiled());
+ REQUIRE(ms_copy.fast_math() == ms.fast_math());
+ REQUIRE(ms_copy.force_avx512() == ms.force_avx512());
+ REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
+ REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
+ REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+ REQUIRE_NOTHROW(ms_copy.jit_lookup("f1"));
+ REQUIRE_NOTHROW(ms_copy.jit_lookup("f2"));
+
+ {
+ auto *cf1_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+ ms_copy.jit_lookup("f1"));
+ auto *cf2_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+ ms_copy.jit_lookup("f2"));
+
+ REQUIRE_THROWS_MATCHES(ms_copy.jit_lookup("f3"), std::invalid_argument,
+ Message("Could not find the symbol 'f3' in an llvm_multi_state"));
+
+ const double ins[] = {2., 3.};
+ double outs[2] = {};
+
+ cf1_ptr(outs, ins, nullptr, nullptr);
+ cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+ REQUIRE(outs[0] == 6);
+ REQUIRE(outs[1] == 2. / 3.);
+ }
+
+ auto ms_copy2 = ms;
+
+ REQUIRE(ms_copy2.get_bc() == ms.get_bc());
+ REQUIRE(ms_copy2.get_ir() == ms.get_ir());
+ REQUIRE(ms_copy2.get_object_code() == ms.get_object_code());
+ REQUIRE(ms_copy2.is_compiled() == ms.is_compiled());
+ REQUIRE(ms_copy2.fast_math() == ms.fast_math());
+ REQUIRE(ms_copy2.force_avx512() == ms.force_avx512());
+ REQUIRE(ms_copy2.get_opt_level() == ms.get_opt_level());
+ REQUIRE(ms_copy2.get_slp_vectorize() == ms.get_slp_vectorize());
+ REQUIRE(ms_copy2.get_code_model() == ms.get_code_model());
+ REQUIRE_NOTHROW(ms_copy2.jit_lookup("f1"));
+ REQUIRE_NOTHROW(ms_copy2.jit_lookup("f2"));
+
+ {
+ auto *cf1_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+ ms_copy2.jit_lookup("f1"));
+ auto *cf2_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+ ms_copy2.jit_lookup("f2"));
+
+ const double ins[] = {2., 3.};
+ double outs[2] = {};
+
+ cf1_ptr(outs, ins, nullptr, nullptr);
+ cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+ REQUIRE(outs[0] == 6);
+ REQUIRE(outs[1] == 2. / 3.);
+ }
+
+ // Test also copy assignment.
+ llvm_multi_state ms_copy3;
+ ms_copy3 = ms_copy2;
+
+ REQUIRE(ms_copy3.get_bc() == ms.get_bc());
+ REQUIRE(ms_copy3.get_ir() == ms.get_ir());
+ REQUIRE(ms_copy3.get_object_code() == ms.get_object_code());
+ REQUIRE(ms_copy3.is_compiled() == ms.is_compiled());
+ REQUIRE(ms_copy3.fast_math() == ms.fast_math());
+ REQUIRE(ms_copy3.force_avx512() == ms.force_avx512());
+ REQUIRE(ms_copy3.get_opt_level() == ms.get_opt_level());
+ REQUIRE(ms_copy3.get_slp_vectorize() == ms.get_slp_vectorize());
+ REQUIRE(ms_copy3.get_code_model() == ms.get_code_model());
+ REQUIRE_NOTHROW(ms_copy3.jit_lookup("f1"));
+ REQUIRE_NOTHROW(ms_copy3.jit_lookup("f2"));
+
+ {
+ auto *cf1_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+ ms_copy3.jit_lookup("f1"));
+ auto *cf2_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+ ms_copy3.jit_lookup("f2"));
+
+ const double ins[] = {2., 3.};
+ double outs[2] = {};
+
+ cf1_ptr(outs, ins, nullptr, nullptr);
+ cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+ REQUIRE(outs[0] == 6);
+ REQUIRE(outs[1] == 2. / 3.);
+ }
+
+ // Restore the cache.
+ llvm_state::set_memcache_limit(100'000'000ull);
+}
+
+TEST_CASE("s11n")
+{
+ using Catch::Matchers::Message;
+
+ // NOTE: in order to properly test this, we have to disable the cache.
+ llvm_state::clear_memcache();
+ llvm_state::set_memcache_limit(0);
+
+ auto [x, y] = make_vars("x", "y");
+
+ llvm_state s1, s2;
+
+ add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
+ add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
+
+ // Uncompiled.
+ llvm_multi_state ms{{s1, s2}};
+
+ std::stringstream ss;
+
+ {
+ boost::archive::binary_oarchive oa(ss);
+ oa << ms;
+ }
+
+ llvm_multi_state ms_copy{{llvm_state{}}};
+
+ {
+ boost::archive::binary_iarchive ia(ss);
+ ia >> ms_copy;
+ }
+
+ REQUIRE(ms_copy.get_bc() == ms.get_bc());
+ REQUIRE(ms_copy.get_ir() == ms.get_ir());
+ REQUIRE(ms_copy.is_compiled() == ms.is_compiled());
+ REQUIRE(ms_copy.fast_math() == ms.fast_math());
+ REQUIRE(ms_copy.force_avx512() == ms.force_avx512());
+ REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
+ REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
+ REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+ REQUIRE_THROWS_MATCHES(
+ ms_copy.get_object_code(), std::invalid_argument,
+ Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled"));
+ REQUIRE_THROWS_MATCHES(
+ ms_copy.jit_lookup("foo"), std::invalid_argument,
+ Message("The function 'jit_lookup' can be invoked only after the llvm_multi_state has been compiled"));
+
+ // Compiled.
+ ms.compile();
+
+ ss.str("");
+
+ {
+ boost::archive::binary_oarchive oa(ss);
+ oa << ms;
+ }
+
+ {
+ boost::archive::binary_iarchive ia(ss);
+ ia >> ms_copy;
+ }
+
+ REQUIRE(ms_copy.get_bc() == ms.get_bc());
+ REQUIRE(ms_copy.get_ir() == ms.get_ir());
+ REQUIRE(ms_copy.get_object_code() == ms.get_object_code());
+ REQUIRE(ms_copy.is_compiled() == ms.is_compiled());
+ REQUIRE(ms_copy.fast_math() == ms.fast_math());
+ REQUIRE(ms_copy.force_avx512() == ms.force_avx512());
+ REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
+ REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
+ REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+ REQUIRE_NOTHROW(ms_copy.jit_lookup("f1"));
+ REQUIRE_NOTHROW(ms_copy.jit_lookup("f2"));
+
+ {
+ auto *cf1_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+ ms_copy.jit_lookup("f1"));
+ auto *cf2_ptr = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+ ms_copy.jit_lookup("f2"));
+
+ const double ins[] = {2., 3.};
+ double outs[2] = {};
+
+ cf1_ptr(outs, ins, nullptr, nullptr);
+ cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+ REQUIRE(outs[0] == 6);
+ REQUIRE(outs[1] == 2. / 3.);
+ }
+
+ // Restore the cache.
+ llvm_state::set_memcache_limit(100'000'000ull);
+}
+
+TEST_CASE("cfunc")
+{
+ using Catch::Matchers::Message;
+
+ // Basic test.
+ auto [x, y] = make_vars("x", "y");
+
+ llvm_state s1, s2;
+
+ add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
+ add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
+
+ const auto orig_ir1 = s1.get_ir();
+ const auto orig_ir2 = s2.get_ir();
+
+ const auto orig_bc1 = s1.get_bc();
+ const auto orig_bc2 = s2.get_bc();
+
+ llvm_multi_state ms{{s1, s2}};
+
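+ // NOTE: ms holds copies of s1 and s2, so the original states remain usable
+ // on their own. The IR/bitcode count is 3: one module per state plus the
+ // internal master module.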
+ REQUIRE(ms.get_ir().size() == 3u);
+ REQUIRE(ms.get_bc().size() == 3u);
+ REQUIRE_THROWS_MATCHES(
+ ms.get_object_code(), std::invalid_argument,
+ Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled"));
+
+ REQUIRE(orig_ir1 == ms.get_ir()[0]);
+ REQUIRE(orig_ir2 == ms.get_ir()[1]);
+
+ REQUIRE(orig_bc1 == ms.get_bc()[0]);
+ REQUIRE(orig_bc2 == ms.get_bc()[1]);
+
+ ms.compile();
+ s1.compile();
+ s2.compile();
+
+ REQUIRE(ms.get_ir().size() == 3u);
+ REQUIRE(ms.get_bc().size() == 3u);
+
+ auto *cf1_ptr
+ = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f1"));
+ auto *cf2_ptr
+ = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f2"));
+
+ const double ins[] = {2., 3.};
+ double outs[2] = {};
+
+ cf1_ptr(outs, ins, nullptr, nullptr);
+ cf2_ptr(outs + 1, ins, nullptr, nullptr);
+
+ REQUIRE(outs[0] == 6);
+ REQUIRE(outs[1] == 2. / 3.);
+}
+
+TEST_CASE("stream op")
+{
+ auto [x, y] = make_vars("x", "y");
+
+ llvm_state s1, s2;
+
+ add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
+ add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
+
+ const auto orig_ir1 = s1.get_ir();
+ const auto orig_ir2 = s2.get_ir();
+
+ const auto orig_bc1 = s1.get_bc();
+ const auto orig_bc2 = s2.get_bc();
+
+ llvm_multi_state ms{{s1, s2}};
+
+ std::ostringstream oss;
+ oss << ms;
+
+ REQUIRE(!oss.str().empty());
+}
+
+// A test to check that, post compilation, snapshots and object files
+// are ordered deterministically.
+TEST_CASE("post compile ordering")
+{
+ auto [x, y] = make_vars("x", "y");
+
+ llvm_state s1, s2, s3, s4;
+
+ add_cfunc(s1, "f1", {x * y}, {x, y});
+ add_cfunc(s2, "f2", {x / y}, {x, y});
+ add_cfunc(s3, "f3", {x + y}, {x, y});
+ add_cfunc(s4, "f4", {x - y}, {x, y});
+
+ llvm_state::clear_memcache();
+
+ llvm_multi_state ms{{s1, s2, s3, s4}};
+ ms.compile();
+
+ const auto orig_obj = ms.get_object_code();
+ const auto orig_ir = ms.get_ir();
+ const auto orig_bc = ms.get_bc();
+
+ for (auto i = 0; i < 20; ++i) {
+ llvm_state::clear_memcache();
+
+ llvm_multi_state ms2{{s1, s2, s3, s4}};
+ ms2.compile();
+
+ REQUIRE(ms2.get_object_code() == orig_obj);
+ REQUIRE(ms2.get_ir() == orig_ir);
+ REQUIRE(ms2.get_bc() == orig_bc);
+ }
+}
+
+TEST_CASE("memcache testing")
+{
+ auto [x, y] = make_vars("x", "y");
+
+ llvm_state s1, s2, s3, s4;
+
+ add_cfunc(s1, "f1", {x * y}, {x, y});
+ add_cfunc(s2, "f2", {x / y}, {x, y});
+ add_cfunc(s3, "f3", {x + y}, {x, y});
+ add_cfunc(s4, "f4", {x - y}, {x, y});
+
+ llvm_state::clear_memcache();
+
+ llvm_multi_state ms{{s1, s2, s3, s4}};
+ ms.compile();
+
+ const auto cur_cache_size = llvm_state::get_memcache_size();
+
+ llvm_multi_state ms2{{s1, s2, s3, s4}};
+ ms2.compile();
+
+ REQUIRE(cur_cache_size == llvm_state::get_memcache_size());
+
+ auto *cf1_ptr
+ = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f1"));
+ auto *cf2_ptr
+ = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f2"));
+ auto *cf3_ptr
+ = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f3"));
+ auto *cf4_ptr
+ = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(ms.jit_lookup("f4"));
+
+ const double ins[] = {2., 3.};
+ double outs[4] = {};
+
+ cf1_ptr(outs, ins, nullptr, nullptr);
+ cf2_ptr(outs + 1, ins, nullptr, nullptr);
+ cf3_ptr(outs + 2, ins, nullptr, nullptr);
+ cf4_ptr(outs + 3, ins, nullptr, nullptr);
+
+ REQUIRE(outs[0] == 6);
+ REQUIRE(outs[1] == 2. / 3.);
+ REQUIRE(outs[2] == 5);
+ REQUIRE(outs[3] == -1);
+}
+
+// Tests to check vectorisation via the vector-function-abi-variant machinery.
+TEST_CASE("vfabi double")
+{
+ for (auto fast_math : {false, true}) {
+ llvm_state s1{kw::slp_vectorize = true, kw::fast_math = fast_math};
+ llvm_state s2{kw::slp_vectorize = true, kw::fast_math = fast_math};
+
+ auto [a, b] = make_vars("a", "b");
+
+ add_cfunc(s1, "cfunc", {erf(a), erf(b)}, {a, b});
+ add_cfunc(s2, "cfuncs", {erf(a), erf(b)}, {a, b}, kw::strided = true);
+
+ llvm_multi_state ms{{s1, s2}};
+
+ ms.compile();
+
+ // NOTE: autovec with external scalar functions seems to work
+ // only since LLVM 16.
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16
+
+ for (auto ir : ms.get_ir()) {
+ using string_find_iterator = boost::find_iterator<std::string::iterator>;
+
+ auto count = 0u;
+ for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erf", boost::is_iequal()));
+ it != string_find_iterator(); ++it) {
+ ++count;
+ }
+
+ // NOTE: in the master module or in the "cfunc" module, we don't
+ // expect any @erf: the master module contains only the trigger,
+ // the "cfunc" module should have vectorised everything and
+ // there should be no more references to the scalar @erf.
+ if (count == 0u) {
+ continue;
+ }
+
+ // NOTE: occurrences of the scalar version:
+ // - 2 calls in the strided cfunc,
+ // - 1 declaration.
+ REQUIRE(count == 3u);
+ }
+
+#endif
+ }
+}
diff --git a/tools/gha_llvm13_conda_asan.sh b/tools/gha_llvm13_conda_asan.sh
deleted file mode 100644
index cba8e3692..000000000
--- a/tools/gha_llvm13_conda_asan.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env bash
-
-# Echo each command
-set -x
-
-# Exit on error.
-set -e
-
-# Core deps.
-sudo apt-get install wget
-
-# Install conda+deps.
-wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniconda.sh
-export deps_dir=$HOME/local
-export PATH="$HOME/miniconda/bin:$PATH"
-bash miniconda.sh -b -p $HOME/miniconda
-mamba create -y -p $deps_dir c-compiler cxx-compiler cmake ninja 'llvmdev=13.*' \
- tbb-devel tbb libboost-devel 'mppp=1.*' sleef xtensor xtensor-blas blas \
- blas-devel fmt spdlog
-source activate $deps_dir
-
-# Create the build dir and cd into it.
-mkdir build
-cd build
-
-# Clear the compilation flags set up by conda.
-unset CXXFLAGS
-unset CFLAGS
-
-# Configure.
-cmake ../ -G Ninja \
- -DCMAKE_PREFIX_PATH=$deps_dir \
- -DCMAKE_BUILD_TYPE=Debug \
- -DHEYOKA_BUILD_TESTS=yes \
- -DHEYOKA_BUILD_TUTORIALS=ON \
- -DHEYOKA_WITH_MPPP=yes \
- -DHEYOKA_WITH_SLEEF=yes \
- -DCMAKE_CXX_FLAGS="-fsanitize=address" \
- -DCMAKE_CXX_FLAGS_DEBUG="-g -Og"
-
-# Build.
-ninja -v
-
-# Run the tests.
-ctest -VV -j4
-
-set +e
-set +x
diff --git a/tools/gha_llvm14_conda_asan.sh b/tools/gha_llvm14_conda_asan.sh
deleted file mode 100644
index cc54de5ae..000000000
--- a/tools/gha_llvm14_conda_asan.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env bash
-
-# Echo each command
-set -x
-
-# Exit on error.
-set -e
-
-# Core deps.
-sudo apt-get install wget
-
-# Install conda+deps.
-wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniconda.sh
-export deps_dir=$HOME/local
-export PATH="$HOME/miniconda/bin:$PATH"
-bash miniconda.sh -b -p $HOME/miniconda
-mamba create -y -p $deps_dir c-compiler cxx-compiler cmake ninja 'llvmdev=14.*' \
- tbb-devel tbb libboost-devel 'mppp=1.*' sleef xtensor xtensor-blas blas \
- blas-devel fmt spdlog
-source activate $deps_dir
-
-# Create the build dir and cd into it.
-mkdir build
-cd build
-
-# Clear the compilation flags set up by conda.
-unset CXXFLAGS
-unset CFLAGS
-
-# Configure.
-cmake ../ -G Ninja \
- -DCMAKE_PREFIX_PATH=$deps_dir \
- -DCMAKE_BUILD_TYPE=Debug \
- -DHEYOKA_BUILD_TESTS=yes \
- -DHEYOKA_BUILD_TUTORIALS=ON \
- -DHEYOKA_WITH_MPPP=yes \
- -DHEYOKA_WITH_SLEEF=yes \
- -DCMAKE_CXX_FLAGS="-fsanitize=address" \
- -DCMAKE_CXX_FLAGS_DEBUG="-g -Og"
-
-# Build.
-ninja -v
-
-# Run the tests.
-ctest -VV -j4
-
-set +e
-set +x