diff --git a/.github/workflows/gha_ci.yml b/.github/workflows/gha_ci.yml index 81cba6d3e..ee5e0f980 100644 --- a/.github/workflows/gha_ci.yml +++ b/.github/workflows/gha_ci.yml @@ -98,15 +98,3 @@ jobs: - uses: actions/checkout@v4 - name: Build run: bash tools/gha_llvm15_conda_asan.sh - conda_llvm14_asan: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Build - run: bash tools/gha_llvm14_conda_asan.sh - conda_llvm13_asan: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Build - run: bash tools/gha_llvm13_conda_asan.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index be252abd3..86717c759 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -project(heyoka VERSION 5.1.0 LANGUAGES CXX C) +project(heyoka VERSION 6.0.0 LANGUAGES CXX C) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/yacma") @@ -158,8 +158,8 @@ endif() include(GNUInstallDirs) find_package(LLVM REQUIRED CONFIG) -if(${LLVM_VERSION_MAJOR} LESS 13 OR ${LLVM_VERSION_MAJOR} GREATER 18) - message(FATAL_ERROR "LLVM >= 13 and <= 18 is required.") +if(${LLVM_VERSION_MAJOR} LESS 15 OR ${LLVM_VERSION_MAJOR} GREATER 18) + message(FATAL_ERROR "LLVM >= 15 and <= 18 is required.") endif() # List of source files. @@ -334,7 +334,7 @@ if(HEYOKA_WITH_SLEEF) endif() # Setup the heyoka ABI version number. -set(HEYOKA_ABI_VERSION 29) +set(HEYOKA_ABI_VERSION 30) if(HEYOKA_BUILD_STATIC_LIBRARY) # Setup of the heyoka static library. diff --git a/doc/install.rst b/doc/install.rst index 64d638c87..7beed7bf4 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -14,7 +14,7 @@ and several CPU architectures (x86-64, 64-bit ARM and 64-bit PowerPC). heyoka has the following **mandatory** dependencies: -* the `LLVM `__ compiler infrastructure library (version >=13 and <=18), +* the `LLVM `__ compiler infrastructure library (version >=15 and <=18), * the `Boost `__ C++ libraries (version >=1.69), * the `{fmt} `__ library (version >=9 and <=11), * the `spdlog `__ library, diff --git a/include/heyoka/detail/fwd_decl.hpp b/include/heyoka/detail/fwd_decl.hpp index b723469ce..0bdb24d61 100644 --- a/include/heyoka/detail/fwd_decl.hpp +++ b/include/heyoka/detail/fwd_decl.hpp @@ -26,6 +26,7 @@ class HEYOKA_DLL_PUBLIC func; class HEYOKA_DLL_PUBLIC param; class HEYOKA_DLL_PUBLIC llvm_state; +class HEYOKA_DLL_PUBLIC llvm_multi_state; template class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive; diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp index c314c2105..1c021ecfa 100644 --- a/include/heyoka/llvm_state.hpp +++ b/include/heyoka/llvm_state.hpp @@ -12,14 +12,15 @@ #include #include +#include #include -#include #include #include #include #include #include #include +#include #include @@ -78,6 +79,7 @@ HEYOKA_DLL_PUBLIC const target_features &get_target_features(); } // namespace detail HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_state &); +HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &); template inline std::uint32_t recommended_simd_size() @@ -132,6 +134,8 @@ HEYOKA_BEGIN_NAMESPACE class HEYOKA_DLL_PUBLIC llvm_state { friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_state &); + friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &); + friend class HEYOKA_DLL_PUBLIC llvm_multi_state; struct jit; @@ -337,15 +341,65 @@ namespace detail // The value contained in the 
in-memory cache. struct llvm_mc_value { - std::string opt_bc, opt_ir, obj; + std::vector opt_bc, opt_ir, obj; + + std::size_t total_size() const; }; // Cache lookup and insertion. -std::optional llvm_state_mem_cache_lookup(const std::string &, unsigned); -void llvm_state_mem_cache_try_insert(std::string, unsigned, llvm_mc_value); +std::optional llvm_state_mem_cache_lookup(const std::vector &, unsigned); +void llvm_state_mem_cache_try_insert(std::vector, unsigned, llvm_mc_value); } // namespace detail +class HEYOKA_DLL_PUBLIC llvm_multi_state +{ + friend HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const llvm_multi_state &); + + struct impl; + + std::unique_ptr m_impl; + + HEYOKA_DLL_LOCAL void compile_impl(); + HEYOKA_DLL_LOCAL void add_obj_triggers(); + + // Check functions. + HEYOKA_DLL_LOCAL void check_compiled(const char *) const; + HEYOKA_DLL_LOCAL void check_uncompiled(const char *) const; + + friend class boost::serialization::access; + void save(boost::archive::binary_oarchive &, unsigned) const; + void load(boost::archive::binary_iarchive &, unsigned); + BOOST_SERIALIZATION_SPLIT_MEMBER() + +public: + llvm_multi_state(); + explicit llvm_multi_state(std::vector); + llvm_multi_state(const llvm_multi_state &); + llvm_multi_state(llvm_multi_state &&) noexcept; + llvm_multi_state &operator=(const llvm_multi_state &); + llvm_multi_state &operator=(llvm_multi_state &&) noexcept; + ~llvm_multi_state(); + + [[nodiscard]] bool is_compiled() const noexcept; + + [[nodiscard]] unsigned get_n_modules() const noexcept; + + [[nodiscard]] bool fast_math() const noexcept; + [[nodiscard]] bool force_avx512() const noexcept; + [[nodiscard]] unsigned get_opt_level() const noexcept; + [[nodiscard]] bool get_slp_vectorize() const noexcept; + [[nodiscard]] code_model get_code_model() const noexcept; + + [[nodiscard]] std::vector get_ir() const; + [[nodiscard]] std::vector get_bc() const; + [[nodiscard]] const std::vector &get_object_code() const; + + void compile(); + + std::uintptr_t jit_lookup(const std::string &); +}; + HEYOKA_END_NAMESPACE // Archive version changelog: diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index f32c4c6fa..279a7609e 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -271,6 +272,74 @@ llvm::AttributeList llvm_ext_math_func_attrs(llvm_state &s) return f->getAttributes(); } +// Add a pointer to the llvm.used global variable of a module: +// +// https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable +// +// If the llvm.used variable does not exist yet, create it. +// +// NOTE: this has quadratic complexity when appending ptr to an existing +// array. It should not be a problem for the type of use we do as we expect +// just a few entries in this array, but something to keep in mind. +void llvm_append_used(llvm_state &s, llvm::Constant *ptr) +{ + assert(ptr != nullptr); + assert(ptr->getType()->isPointerTy()); + + auto &md = s.module(); + auto &ctx = s.context(); + + // Fetch the pointer type. + auto *ptr_type = llvm::PointerType::getUnqual(ctx); + + if (auto *orig_used = md.getGlobalVariable("llvm.used")) { + // The llvm.used variable exists already. + + // Fetch the original initializer. + assert(orig_used->hasInitializer()); + auto *orig_init = llvm::cast(orig_used->getInitializer()); + + // Construct a new initializer with the original values + // plus the new pointer. 
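+ //
+ // As an illustration (a sketch, assuming opaque pointers, one pre-existing
+ // entry @g and a newly appended pointer to @f), the rebuilt variable would
+ // look roughly like:
+ //
+ //   @llvm.used = appending global [2 x ptr] [ptr @g, ptr @f], section "llvm.metadata"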
+ std::vector arr_values; + arr_values.reserve( + boost::safe_numerics::safe(orig_init->getType()->getNumElements()) + 1); + for (decltype(orig_init->getType()->getNumElements()) i = 0; i < orig_init->getType()->getNumElements(); ++i) { + auto *orig_el = orig_init->getAggregateElement(boost::numeric_cast(i)); + assert(orig_el->getType()->isPointerTy()); + + // NOTE: if ptr was already in the llvm.used vector, just bail + // out early. + if (orig_el->isElementWiseEqual(ptr)) { + return; + } + + arr_values.push_back(orig_el); + } + arr_values.push_back(ptr); + + // Create the new array. + auto *used_array_type = llvm::ArrayType::get(ptr_type, boost::numeric_cast(arr_values.size())); + auto *used_arr = llvm::ConstantArray::get(used_array_type, arr_values); + + // Remove the original one. + orig_used->eraseFromParent(); + + // Add the new global variable. + auto *g_used_arr = new llvm::GlobalVariable(md, used_arr->getType(), true, + llvm::GlobalVariable::AppendingLinkage, used_arr, "llvm.used"); + g_used_arr->setSection("llvm.metadata"); + } else { + // The llvm.used variable does not exist yet, create it. + auto *used_array_type = llvm::ArrayType::get(ptr_type, 1); + std::vector arr_values{ptr}; + auto *used_arr = llvm::ConstantArray::get(used_array_type, arr_values); + auto *g_used_arr = new llvm::GlobalVariable(md, used_arr->getType(), true, + llvm::GlobalVariable::AppendingLinkage, used_arr, "llvm.used"); + g_used_arr->setSection("llvm.metadata"); + } +} + // Attach the vfabi attributes to "call", which must be a call to a function with scalar arguments. // The necessary vfabi information is stored in vfi. The function returns "call". // The attributes of the scalar function will be attached to the vector variants. @@ -312,17 +381,8 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const = (use_fast_math && !el.lp_vf_abi_attr.empty()) ? el.lp_vf_abi_attr : el.vf_abi_attr; vf_abi_strs.push_back(vf_abi_attr); } -#if LLVM_VERSION_MAJOR >= 14 call->addFnAttr(llvm::Attribute::get(context, "vector-function-abi-variant", fmt::format("{}", fmt::join(vf_abi_strs, ",")))); -#else - { - auto attrs = call->getAttributes(); - attrs = attrs.addAttribute(context, llvm::AttributeList::FunctionIndex, "vector-function-abi-variant", - fmt::format("{}", fmt::join(vf_abi_strs, ","))); - call->setAttributes(attrs); - } -#endif // Now we need to: // - add the declarations of the vector variants to the module, @@ -374,33 +434,9 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const assert(vf_ptr->getAttributes() == f->getAttributes()); } - // Create the name of the dummy function to ensure the variant is not optimised out. - // - // NOTE: another way of doing this involves the llvm.used global variable - need - // to learn about the metadata API apparently. - // - // https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable - // https://godbolt.org/z/1neaG4bYj - const auto dummy_name = fmt::format("heyoka.dummy_vector_call.{}", el_name); - - if (auto *dummy_ptr = md.getFunction(dummy_name); dummy_ptr == nullptr) { - // The dummy function has not been defined yet, do it. - auto *dummy = llvm_func_create(vec_ft, llvm::Function::ExternalLinkage, dummy_name, &md); - - builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", dummy)); - - // The dummy function just forwards its arguments to the variant. 
- std::vector dummy_args; - for (auto *dummy_arg = dummy->args().begin(); dummy_arg != dummy->args().end(); ++dummy_arg) { - dummy_args.emplace_back(dummy_arg); - } - - builder.CreateRet(builder.CreateCall(vf_ptr, dummy_args)); - } else { - // The declaration of the dummy function is already there. - // Check that the signatures match. - assert(dummy_ptr->getFunctionType() == vec_ft); - } + // Ensure that the variant is not optimised out because it is not + // explicitly used in the code. + detail::llvm_append_used(s, vf_ptr); } // Restore the original insertion block. diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp index 7b303bc6f..a5af287bf 100644 --- a/src/llvm_state.cpp +++ b/src/llvm_state.cpp @@ -14,30 +14,33 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include -#include +#include #include #include +#include #include #include -#include +#include +#include #include #include #include @@ -60,63 +63,19 @@ #include #include #include +#include #include #include +#include +#include +#include #include #include +#include #include -#include #include #include #include -#include - -#if LLVM_VERSION_MAJOR < 14 - -// NOTE: this header was moved in LLVM 14. -#include - -#else - -#include - -#endif - -// NOTE: new pass manager API. -// NOTE: this is available since LLVM 13, but in that -// version it seems like auto-vectorization with -// vector-function-abi-variant is not working -// properly with the new pass manager. Hence, we -// enable it from LLVM 14. -#if LLVM_VERSION_MAJOR >= 14 - -#define HEYOKA_USE_NEW_LLVM_PASS_MANAGER - -#endif - -#if defined(HEYOKA_USE_NEW_LLVM_PASS_MANAGER) - -#include -#include -#include -#include - -#if LLVM_VERSION_MAJOR >= 14 - -// NOTE: this header is available since LLVM 14. -#include - -#endif - -#else - -#include -#include -#include -#include -#include - -#endif #if LLVM_VERSION_MAJOR >= 17 @@ -151,26 +110,6 @@ #include #include -// NOTE: logging here lhames' instructions on how to set up LLJIT -// for parallel compilation of multiple modules. -// -// auto J = LLJITBuilder() -// .setNumCompileThreads() -// .create(); -// if (!J) { /* bail on error */ } -// (*J)->getIRTransformLayer().setTransform( -// [](ThreadSafeModule TSM, MaterializationResponsibility &R) -> Expected { -// TSM.withModuleDo([](Module &M) { -// /* Apply your IR optimizations here */ -// }); -// return std::move(TSM); -// }); -// -// Note that the optimisation passes in this approach are moved into the -// transform layer. References: -// https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328 -// https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892 - HEYOKA_BEGIN_NAMESPACE namespace detail @@ -207,12 +146,12 @@ const std::regex ppc_regex_pattern("pwr([1-9]*)"); target_features get_target_features_impl() { auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost(); - if (!jtmb) { + if (!jtmb) [[unlikely]] { throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system"); } auto tm = jtmb->createTargetMachine(); - if (!tm) { + if (!tm) [[unlikely]] { throw std::invalid_argument("Error creating the target machine"); } @@ -307,6 +246,342 @@ void init_native_target() }); } +// Helper to create a builder for target machines. +llvm::orc::JITTargetMachineBuilder create_jit_tmb(unsigned opt_level, code_model c_model) +{ + // NOTE: codegen opt level changed in LLVM 18. 
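+ // (In LLVM 18 the llvm::CodeGenOpt::Level enum was replaced by the
+ // llvm::CodeGenOptLevel enum class, hence the alias below.)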
+#if LLVM_VERSION_MAJOR < 18 + + using cg_opt_level = llvm::CodeGenOpt::Level; + +#else + + using cg_opt_level = llvm::CodeGenOptLevel; + +#endif + + // Try creating the target machine builder. + auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost(); + // LCOV_EXCL_START + if (!jtmb) [[unlikely]] { + throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system"); + } + // LCOV_EXCL_STOP + + // Set the codegen optimisation level. + switch (opt_level) { + case 0u: + jtmb->setCodeGenOptLevel(cg_opt_level::None); + break; + case 1u: + jtmb->setCodeGenOptLevel(cg_opt_level::Less); + break; + case 2u: + jtmb->setCodeGenOptLevel(cg_opt_level::Default); + break; + default: + assert(opt_level == 3u); + jtmb->setCodeGenOptLevel(cg_opt_level::Aggressive); + } + + // NOTE: not all code models are supported on all archs. We make an effort + // here to prevent unsupported code models to be requested, as that will + // result in the termination of the program. + constexpr code_model supported_code_models[] = { +#if defined(HEYOKA_ARCH_X86) + code_model::small, code_model::kernel, code_model::medium, code_model::large +#elif defined(HEYOKA_ARCH_ARM) + code_model::tiny, code_model::small, code_model::large +#elif defined(HEYOKA_ARCH_PPC) + code_model::small, code_model::medium, code_model::large +#else + // NOTE: by default we assume only small and large are supported. + code_model::small, code_model::large +#endif + }; + + if (std::ranges::find(supported_code_models, c_model) == std::ranges::end(supported_code_models)) [[unlikely]] { + throw std::invalid_argument( + fmt::format("The code model '{}' is not supported on the current architecture", c_model)); + } + + // LCOV_EXCL_START + +#if LLVM_VERSION_MAJOR >= 17 + + // NOTE: the code model setup is working only on LLVM>=19 (or at least + // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug: + // + // https://github.com/llvm/llvm-project/issues/88115 + // + // Additionally, there are indications from our CI that attempting to set + // the code model before LLVM 17 might just be buggy, as we see widespread + // ASAN failures all over the place. Thus, let us not do anything with the code + // model setting before LLVM 17. + + // Setup the code model. + switch (c_model) { + case code_model::tiny: + jtmb->setCodeModel(llvm::CodeModel::Tiny); + break; + case code_model::small: + jtmb->setCodeModel(llvm::CodeModel::Small); + break; + case code_model::kernel: + jtmb->setCodeModel(llvm::CodeModel::Kernel); + break; + case code_model::medium: + jtmb->setCodeModel(llvm::CodeModel::Medium); + break; + case code_model::large: + jtmb->setCodeModel(llvm::CodeModel::Large); + break; + default: + // NOTE: we should never end up here. + assert(false); + ; + } + +#endif + + // LCOV_EXCL_STOP + + return std::move(*jtmb); +} + +// Helper to optimise the input module M. Implemented here for re-use. +// NOTE: this may end up being invoked concurrently from multiple threads. +// If that is the case, we make sure before invocation to construct a different +// TargetMachine per thread, so that we are sure no data races are possible. +void optimise_module(llvm::Module &M, llvm::TargetMachine &tm, unsigned opt_level, bool force_avx512, + bool slp_vectorize) +{ + // NOTE: don't run any optimisation pass at O0. + if (opt_level == 0u) { + return; + } + + // NOTE: the logic here largely mimics (with a lot of simplifications) + // the implementation of the 'opt' tool. 
See: + // https://github.com/llvm/llvm-project/blob/release/10.x/llvm/tools/opt/opt.cpp + + // For every function in the module, setup its attributes + // so that the codegen uses all the features available on + // the host CPU. + const auto cpu = tm.getTargetCPU().str(); + const auto features = tm.getTargetFeatureString().str(); + + // Fetch the module's context. + auto &ctx = M.getContext(); + + for (auto &f : M) { + auto attrs = f.getAttributes(); + + llvm::AttrBuilder new_attrs(ctx); + + if (!cpu.empty() && !f.hasFnAttribute("target-cpu")) { + new_attrs.addAttribute("target-cpu", cpu); + } + + if (!features.empty()) { + auto old_features = f.getFnAttribute("target-features").getValueAsString(); + + if (old_features.empty()) { + new_attrs.addAttribute("target-features", features); + } else { + llvm::SmallString<256> appended(old_features); + appended.push_back(','); + appended.append(features); + new_attrs.addAttribute("target-features", appended); + } + } + + // Let new_attrs override attrs. + f.setAttributes(attrs.addFnAttributes(ctx, new_attrs)); + } + + // Force usage of AVX512 registers, if requested. + if (force_avx512 && get_target_features().avx512f) { + for (auto &f : M) { + f.addFnAttr("prefer-vector-width", "512"); + } + } + + // NOTE: adapted from here: + // https://llvm.org/docs/NewPassManager.html + + // Create the analysis managers. + llvm::LoopAnalysisManager LAM; + llvm::FunctionAnalysisManager FAM; + llvm::CGSCCAnalysisManager CGAM; + llvm::ModuleAnalysisManager MAM; + + // NOTE: in the new pass manager, this seems to be the way to + // set the target library info bits. See: + // https://github.com/llvm/llvm-project/blob/b7fd30eac3183993806cc218b6deb39eb625c083/llvm/tools/opt/NewPMDriver.cpp#L408 + // Not sure if this matters, but we did it in the old pass manager + // and opt does it too. + llvm::TargetLibraryInfoImpl TLII(tm.getTargetTriple()); + FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); }); + + // Create the new pass manager builder, passing the supplied target machine. + // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off + // by default. Not sure why it is off, the LLVM docs imply this + // is on by default at nonzero optimisation levels for clang and opt. + // NOTE: the reason for this inconsistency is that opt uses PB.parsePassPipeline() + // (instead of PB.buildPerModuleDefaultPipeline()) to set up the optimisation + // pipeline. Indeed, if we replace PB.buildPerModuleDefaultPipeline(ol) with + // PB.parsePassPipeline(MPM, "default") (which corresponds to invoking + // "opt -passes='default'"), we do NOT need to set SLP vectorization on + // here to get the SLP vectorizer. Not sure if we should consider switching to this + // alternative way of setting up the optimisation pipeline in the future. + llvm::PipelineTuningOptions pto; + pto.SLPVectorization = slp_vectorize; + llvm::PassBuilder PB(&tm, pto); + + // Register all the basic analyses with the managers. + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + // Construct the optimisation level. + llvm::OptimizationLevel ol{}; + + switch (opt_level) { + case 1u: + ol = llvm::OptimizationLevel::O1; + break; + case 2u: + ol = llvm::OptimizationLevel::O2; + break; + default: + assert(opt_level == 3u); + ol = llvm::OptimizationLevel::O3; + } + + // Create the module pass manager. 
+ auto MPM = PB.buildPerModuleDefaultPipeline(ol); + + // Optimize the IR. + MPM.run(M, MAM); +} + +// Helper to add a module to an lljt, throwing on error. +void add_module_to_lljit(llvm::orc::LLJIT &lljit, std::unique_ptr m, llvm::orc::ThreadSafeContext ctx) +{ + auto err = lljit.addIRModule(llvm::orc::ThreadSafeModule(std::move(m), std::move(ctx))); + + // LCOV_EXCL_START + if (err) { + std::string err_report; + llvm::raw_string_ostream ostr(err_report); + + ostr << err; + + throw std::invalid_argument( + fmt::format("The function for adding a module to the jit failed. The full error message:\n{}", ostr.str())); + } + // LCOV_EXCL_STOP +} + +// Helper to fetch the bitcode from a module. +std::string bc_from_module(llvm::Module &m) +{ + std::string out; + llvm::raw_string_ostream ostr(out); + + llvm::WriteBitcodeToFile(m, ostr); + + return std::move(ostr.str()); +} + +// Helper to fetch the textual IR from a module. +std::string ir_from_module(llvm::Module &m) +{ + std::string out; + llvm::raw_string_ostream ostr(out); + + m.print(ostr, nullptr); + + return std::move(ostr.str()); +} + +// An implementation of llvm::MemoryBuffer offering a view over a std::string. +class string_view_mem_buffer final : public llvm::MemoryBuffer +{ +public: + explicit string_view_mem_buffer(const std::string &s) + { + // NOTE: the important bit here is from the LLVM docs: + // + // """ + // In addition to basic access to the characters in the file, this interface + // guarantees you can read one character past the end of the file, and that + // this character will read as '\0'. + // """ + // + // This is exactly the guarantee given by std::string: + // + // https://en.cppreference.com/w/cpp/string/basic_string/data + // + // Not sure about the third parameter to this function though, it does not + // seem to have any influence apart from debug checking: + // + // https://llvm.org/doxygen/MemoryBuffer_8cpp_source.html + this->init(s.data(), s.data() + s.size(), true); + } + // LCOV_EXCL_START + llvm::MemoryBuffer::BufferKind getBufferKind() const final + { + // Hopefully std::string is not memory-mapped... + return llvm::MemoryBuffer::BufferKind::MemoryBuffer_Malloc; + } + // LCOV_EXCL_STOP +}; + +// Helper to add an object file to the jit, throwing in case of errors. +void add_obj_to_lljit(llvm::orc::LLJIT &lljit, const std::string &obj) +{ + // NOTE: an empty obj can happen when we are copying a compiled + // llvm_multi_state. In such case, the object files of the individual + // states have all be empty-inited. We then need to avoid adding + // obj to the jit because that will result in an error. + if (obj.empty()) { + return; + } + + // Add the object file. + auto err = lljit.addObjectFile(std::make_unique(obj)); + + // LCOV_EXCL_START + if (err) { + std::string err_report; + llvm::raw_string_ostream ostr(err_report); + + ostr << err; + + throw std::invalid_argument(fmt::format( + "The function for adding an object file to an lljit failed. The full error message:\n{}", ostr.str())); + } + // LCOV_EXCL_STOP +} + +// Helper to verify a module, throwing if verification fails. 
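+// NOTE: llvm::verifyModule() returns true if the module is broken and, when a
+// stream is supplied, writes the diagnostic messages to it.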
+void verify_module(const llvm::Module &m) +{ + std::string out; + llvm::raw_string_ostream ostr(out); + + if (llvm::verifyModule(m, &ostr)) { + // LCOV_EXCL_START + throw std::runtime_error(fmt::format("The verification of the module '{}' produced an error:\n{}", + m.getModuleIdentifier(), ostr.str())); + // LCOV_EXCL_STOP + } +} + } // namespace // Helper function to fetch a const ref to a global object @@ -371,116 +646,26 @@ struct llvm_state::jit { std::unique_ptr m_ctx; std::optional m_object_file; + // NOTE: make sure to coordinate changes in this constructor with multi_jit. explicit jit(unsigned opt_level, code_model c_model) { - // NOTE: we assume here the opt level has already been clamped - // from the outside. + // NOTE: we assume here that the input arguments have + // been validated already. assert(opt_level <= 3u); + assert(c_model >= code_model::tiny && c_model <= code_model::large); // Ensure the native target is inited. detail::init_native_target(); - // NOTE: codegen opt level changed in LLVM 18. -#if LLVM_VERSION_MAJOR < 18 + // Create the target machine builder. + auto jtmb = detail::create_jit_tmb(opt_level, c_model); - using cg_opt_level = llvm::CodeGenOpt::Level; - -#else - - using cg_opt_level = llvm::CodeGenOptLevel; - -#endif - - // Create the target machine builder. - auto jtmb = llvm::orc::JITTargetMachineBuilder::detectHost(); - // LCOV_EXCL_START - if (!jtmb) { - throw std::invalid_argument("Error creating a JITTargetMachineBuilder for the host system"); - } - // LCOV_EXCL_STOP - // Set the codegen optimisation level. - switch (opt_level) { - case 0u: - jtmb->setCodeGenOptLevel(cg_opt_level::None); - break; - case 1u: - jtmb->setCodeGenOptLevel(cg_opt_level::Less); - break; - case 2u: - jtmb->setCodeGenOptLevel(cg_opt_level::Default); - break; - default: - assert(opt_level == 3u); - jtmb->setCodeGenOptLevel(cg_opt_level::Aggressive); - } - - // NOTE: not all code models are supported on all archs. We make an effort - // here to prevent unsupported code models to be requested, as that will - // result in the termination of the program. - constexpr code_model supported_code_models[] = { -#if defined(HEYOKA_ARCH_X86) - code_model::small, code_model::kernel, code_model::medium, code_model::large -#elif defined(HEYOKA_ARCH_ARM) - code_model::tiny, code_model::small, code_model::large -#elif defined(HEYOKA_ARCH_PPC) - code_model::small, code_model::medium, code_model::large -#else - // NOTE: by default we assume only small and large are supported. - code_model::small, code_model::large -#endif - }; - - if (std::ranges::find(supported_code_models, c_model) == std::ranges::end(supported_code_models)) [[unlikely]] { - throw std::invalid_argument( - fmt::format("The code model '{}' is not supported on the current architecture", c_model)); - } - - // LCOV_EXCL_START - -#if LLVM_VERSION_MAJOR >= 17 - // NOTE: the code model setup is working only on LLVM>=19 (or at least - // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug: - // - // https://github.com/llvm/llvm-project/issues/88115 - // - // Additionally, there are indications from our CI that attempting to set - // the code model before LLVM 17 might just be buggy, as we see widespread - // ASAN failures all over the place. Thus, let us not do anything with the code - // model setting before LLVM 17. - - // Setup the code model. 
- switch (c_model) { - case code_model::tiny: - jtmb->setCodeModel(llvm::CodeModel::Tiny); - break; - case code_model::small: - jtmb->setCodeModel(llvm::CodeModel::Small); - break; - case code_model::kernel: - jtmb->setCodeModel(llvm::CodeModel::Kernel); - break; - case code_model::medium: - jtmb->setCodeModel(llvm::CodeModel::Medium); - break; - case code_model::large: - jtmb->setCodeModel(llvm::CodeModel::Large); - break; - default: - // NOTE: we should never end up here. - assert(false); - ; - } - -#endif - - // LCOV_EXCL_STOP - - // Create the jit builder. - llvm::orc::LLJITBuilder lljit_builder; - // NOTE: other settable properties may - // be of interest: - // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html - lljit_builder.setJITTargetMachineBuilder(*jtmb); + // Create the jit builder. + llvm::orc::LLJITBuilder lljit_builder; + // NOTE: other settable properties may + // be of interest: + // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html + lljit_builder.setJITTargetMachineBuilder(jtmb); // Create the jit. auto lljit = lljit_builder.create(); @@ -531,7 +716,7 @@ struct llvm_state::jit { // Keep a target machine around to fetch various // properties of the host CPU. - auto tm = jtmb->createTargetMachine(); + auto tm = jtmb.createTargetMachine(); // LCOV_EXCL_START if (!tm) { throw std::invalid_argument("Error creating the target machine"); @@ -586,10 +771,6 @@ struct llvm_state::jit { { return m_tm->getTargetFeatureString().str(); } - [[nodiscard]] llvm::TargetIRAnalysis get_target_ir_analysis() const - { - return m_tm->getTargetIRAnalysis(); - } [[nodiscard]] const llvm::Triple &get_target_triple() const { return m_lljit->getTargetTriple(); @@ -597,19 +778,7 @@ struct llvm_state::jit { void add_module(std::unique_ptr m) const { - auto err = m_lljit->addIRModule(llvm::orc::ThreadSafeModule(std::move(m), *m_ctx)); - - // LCOV_EXCL_START - if (err) { - std::string err_report; - llvm::raw_string_ostream ostr(err_report); - - ostr << err; - - throw std::invalid_argument(fmt::format( - "The function for adding a module to the jit failed. The full error message:\n{}", ostr.str())); - } - // LCOV_EXCL_STOP + detail::add_module_to_lljit(*m_lljit, std::move(m), *m_ctx); } // Symbol lookup. @@ -645,24 +814,12 @@ namespace detail namespace { -// Helper to load object code into a jit. +// Helper to load object code into the jit of an llvm_state. template void llvm_state_add_obj_to_jit(Jit &j, std::string obj) { - llvm::SmallVector buffer(obj.begin(), obj.end()); - auto err = j.m_lljit->addObjectFile(std::make_unique(std::move(buffer))); - - // LCOV_EXCL_START - if (err) { - std::string err_report; - llvm::raw_string_ostream ostr(err_report); - - ostr << err; - - throw std::invalid_argument(fmt::format( - "The function for adding a compiled module to the jit failed. The full error message:\n{}", ostr.str())); - } - // LCOV_EXCL_STOP + // Add the object code to the lljit. + add_obj_to_lljit(*j.m_lljit, obj); // Add the object code also to the // m_object_file member. @@ -677,11 +834,10 @@ void llvm_state_add_obj_to_jit(Jit &j, std::string obj) // contained in the bitcode) because apparently llvm::parseBitcodeFile() discards the module // name when parsing. // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) -auto llvm_state_bc_to_module(const std::string &module_name, const std::string &bc, llvm::LLVMContext &ctx) +auto bc_to_module(const std::string &module_name, const std::string &bc, llvm::LLVMContext &ctx) { - // Create the corresponding memory buffer. 
- auto mb = llvm::MemoryBuffer::getMemBuffer(bc); - assert(mb); + // Create the corresponding memory buffer view on bc. + auto mb = std::make_unique(bc); // Parse the bitcode. auto ret = llvm::parseBitcodeFile(mb->getMemBufferRef(), ctx); @@ -790,7 +946,7 @@ llvm_state::llvm_state(const llvm_state &other) // module and builder. The IR/bitcode snapshots // are left in their default-constructed (empty) // state. - m_module = detail::llvm_state_bc_to_module(m_module_name, other.get_bc(), context()); + m_module = detail::bc_to_module(m_module_name, other.get_bc(), context()); // Create a new builder for the module. m_builder = std::make_unique(context()); @@ -837,23 +993,12 @@ llvm_state &llvm_state::operator=(llvm_state &&other) noexcept return *this; } -llvm_state::~llvm_state() -{ - // Sanity checks in debug mode. - if (m_jitter) { - if (is_compiled()) { - assert(m_jitter->m_object_file); - assert(!m_builder); - } else { - assert(!m_jitter->m_object_file); - assert(m_builder); - assert(m_ir_snapshot.empty()); - assert(m_bc_snapshot.empty()); - } - } - - assert(m_opt_level <= 3u); -} +// NOTE: we used to have debug sanity checks here. However, in certain rare corner cases, +// an invalid llvm_state could end up being destroyed, thus triggering assertion errors +// in debug mode (this could happen for instance when resetting an llvm_state to the +// def-cted state after an exception had been thrown during compilation). Thus, just +// do not run the debug checks. +llvm_state::~llvm_state() = default; template void llvm_state::save_impl(Archive &ar, unsigned) const @@ -997,7 +1142,7 @@ void llvm_state::load_impl(Archive &ar, unsigned version) m_bc_snapshot.clear(); // Create the module from the bitcode. - m_module = detail::llvm_state_bc_to_module(m_module_name, bc_snapshot, context()); + m_module = detail::bc_to_module(m_module_name, bc_snapshot, context()); // Create a new builder for the module. m_builder = std::make_unique(context()); @@ -1142,189 +1287,15 @@ void llvm_state::verify_function(const std::string &name) void llvm_state::optimise() { - check_uncompiled(__func__); - - // NOTE: don't run any optimisation pass at O0. - if (m_opt_level == 0u) { - return; - } - - // NOTE: the logic here largely mimics (with a lot of simplifications) - // the implementation of the 'opt' tool. See: - // https://github.com/llvm/llvm-project/blob/release/10.x/llvm/tools/opt/opt.cpp - - // For every function in the module, setup its attributes - // so that the codegen uses all the features available on - // the host CPU. - const auto cpu = m_jitter->get_target_cpu(); - const auto features = m_jitter->get_target_features(); - - auto &ctx = context(); - - for (auto &f : module()) { - auto attrs = f.getAttributes(); - - llvm::AttrBuilder -#if LLVM_VERSION_MAJOR < 14 - new_attrs -#else - new_attrs(ctx) -#endif - ; - - if (!cpu.empty() && !f.hasFnAttribute("target-cpu")) { - new_attrs.addAttribute("target-cpu", cpu); - } - - if (!features.empty()) { - auto old_features = f.getFnAttribute("target-features").getValueAsString(); - - if (old_features.empty()) { - new_attrs.addAttribute("target-features", features); - } else { - llvm::SmallString<256> appended(old_features); - appended.push_back(','); - appended.append(features); - new_attrs.addAttribute("target-features", appended); - } - } - - // Let new_attrs override attrs. 
-#if LLVM_VERSION_MAJOR < 14 - f.setAttributes(attrs.addAttributes(ctx, llvm::AttributeList::FunctionIndex, new_attrs)); -#else - f.setAttributes(attrs.addFnAttributes(ctx, new_attrs)); -#endif - } - - // Force usage of AVX512 registers, if requested. - if (m_force_avx512 && detail::get_target_features().avx512f) { - for (auto &f : module()) { - f.addFnAttr("prefer-vector-width", "512"); - } - } - -#if defined(HEYOKA_USE_NEW_LLVM_PASS_MANAGER) - - // NOTE: adapted from here: - // https://llvm.org/docs/NewPassManager.html - - // Optimisation level for the module pass manager. - // NOTE: the OptimizationLevel class has changed location - // since LLVM 14. -#if LLVM_VERSION_MAJOR >= 14 - using olevel = llvm::OptimizationLevel; -#else - using olevel = llvm::PassBuilder::OptimizationLevel; -#endif - - // Create the analysis managers. - llvm::LoopAnalysisManager LAM; - llvm::FunctionAnalysisManager FAM; - llvm::CGSCCAnalysisManager CGAM; - llvm::ModuleAnalysisManager MAM; - - // NOTE: in the new pass manager, this seems to be the way to - // set the target library info bits. See: - // https://github.com/llvm/llvm-project/blob/b7fd30eac3183993806cc218b6deb39eb625c083/llvm/tools/opt/NewPMDriver.cpp#L408 - // Not sure if this matters, but we did it in the old pass manager - // and opt does it too. - llvm::TargetLibraryInfoImpl TLII(m_jitter->get_target_triple()); - FAM.registerPass([&] { return llvm::TargetLibraryAnalysis(TLII); }); - - // Create the new pass manager builder, passing - // the native target machine from the JIT class. - // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off - // by default. Not sure why it is off, the LLVM docs imply this - // is on by default at nonzero optimisation levels for clang and opt. - // NOTE: the reason for this inconsistency is that opt uses PB.parsePassPipeline() - // (instead of PB.buildPerModuleDefaultPipeline()) to set up the optimisation - // pipeline. Indeed, if we replace PB.buildPerModuleDefaultPipeline(ol) with - // PB.parsePassPipeline(MPM, "default") (which corresponds to invoking - // "opt -passes='default'"), we do NOT need to set SLP vectorization on - // here to get the SLP vectorizer. Not sure if we should consider switching to this - // alternative way of setting up the optimisation pipeline in the future. - llvm::PipelineTuningOptions pto; - pto.SLPVectorization = m_slp_vectorize; - llvm::PassBuilder PB(m_jitter->m_tm.get(), pto); - - // Register all the basic analyses with the managers. - PB.registerModuleAnalyses(MAM); - PB.registerCGSCCAnalyses(CGAM); - PB.registerFunctionAnalyses(FAM); - PB.registerLoopAnalyses(LAM); - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - - // Construct the optimisation level. - olevel ol{}; - - switch (m_opt_level) { - case 1u: - ol = olevel::O1; - break; - case 2u: - ol = olevel::O2; - break; - default: - assert(m_opt_level == 3u); - ol = olevel::O3; - } - - // Create the module pass manager. - auto MPM = PB.buildPerModuleDefaultPipeline(ol); - - // Optimize the IR. - MPM.run(*m_module, MAM); - -#else - - // Init the module pass manager. - auto module_pm = std::make_unique(); - // These are passes which set up target-specific info - // that are used by successive optimisation passes. 
- auto tliwp = std::make_unique( - llvm::TargetLibraryInfoImpl(m_jitter->get_target_triple())); - module_pm->add(tliwp.release()); - module_pm->add(llvm::createTargetTransformInfoWrapperPass(m_jitter->get_target_ir_analysis())); - - // NOTE: not sure what this does, presumably some target-specifc - // configuration. - module_pm->add(static_cast(*m_jitter->m_tm).createPassConfig(*module_pm)); - - // Init the function pass manager. - auto f_pm = std::make_unique(m_module.get()); - f_pm->add(llvm::createTargetTransformInfoWrapperPass(m_jitter->get_target_ir_analysis())); - - // We use the helper class PassManagerBuilder to populate the module - // pass manager with standard options. - llvm::PassManagerBuilder pm_builder; - // See here for the defaults: - // https://llvm.org/doxygen/PassManagerBuilder_8cpp_source.html - pm_builder.OptLevel = m_opt_level; - // Enable function inlining. - pm_builder.Inliner = llvm::createFunctionInliningPass(m_opt_level, 0, false); - // NOTE: if requested, we turn manually on the SLP vectoriser here, which is off - // by default. Not sure why it is off, the LLVM docs imply this - // is on by default at nonzero optimisation levels for clang and opt. - pm_builder.SLPVectorize = m_slp_vectorize; - - m_jitter->m_tm->adjustPassManager(pm_builder); - - // Populate both the function pass manager and the module pass manager. - pm_builder.populateFunctionPassManager(*f_pm); - pm_builder.populateModulePassManager(*module_pm); - - // Run the function pass manager on all functions in the module. - f_pm->doInitialization(); - for (auto &f : *m_module) { - f_pm->run(f); - } - f_pm->doFinalization(); - - // Run the module passes. - module_pm->run(*m_module); - -#endif + // NOTE: we used to fetch the target triple from the lljit object, + // but recently we switched to asking the target triple directly + // from the target machine. Assert equality between the two for a while, + // just in case. + assert(m_jitter->m_lljit->getTargetTriple() == m_jitter->m_tm->getTargetTriple()); + // NOTE: the target triple is also available in the module. + assert(m_jitter->m_lljit->getTargetTriple().str() == module().getTargetTriple()); + + detail::optimise_module(module(), *m_jitter->m_tm, m_opt_level, m_force_avx512, m_slp_vectorize); } namespace detail @@ -1350,6 +1321,7 @@ void llvm_state::add_obj_trigger() auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false); assert(ft != nullptr); auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, detail::obj_trigger_name, &module()); + assert(f != nullptr); bld.SetInsertPoint(llvm::BasicBlock::Create(context(), "entry", f)); bld.CreateRetVoid(); @@ -1377,9 +1349,38 @@ void llvm_state::compile_impl() // Trigger object code materialisation via lookup. jit_lookup(detail::obj_trigger_name); + assert(m_jitter->m_object_file); } +namespace detail +{ + +namespace +{ + +// Combine opt_level, force_avx512, slp_vectorize and c_model into a single flag. +// NOTE: here we need: +// +// - 2 bits for opt_level, +// - 1 bit for force_avx512 and slp_vectorize each, +// - 3 bits for c_model, +// +// for a total of 7 bits. 
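+//
+// The resulting layout, for reference:
+//
+//   bits 0-1: opt_level     (0-3)
+//   bit  2  : force_avx512
+//   bit  3  : slp_vectorize
+//   bits 4-6: c_model       (0-7)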
+unsigned assemble_comp_flag(unsigned opt_level, bool force_avx512, bool slp_vectorize, code_model c_model) +{ + assert(opt_level <= 3u); + assert(static_cast(c_model) <= 7u); + static_assert(std::numeric_limits::digits >= 7u); + + return opt_level + (static_cast(force_avx512) << 2) + (static_cast(slp_vectorize) << 3) + + (static_cast(c_model) << 4); +} + +} // namespace + +} // namespace detail + // NOTE: we need to emphasise in the docs that compilation // triggers an optimisation pass. void llvm_state::compile() @@ -1392,17 +1393,7 @@ void llvm_state::compile() auto *logger = detail::get_logger(); // Run a verification on the module before compiling. - { - std::string out; - llvm::raw_string_ostream ostr(out); - - if (llvm::verifyModule(*m_module, &ostr)) { - // LCOV_EXCL_START - throw std::runtime_error( - fmt::format("The verification of the module '{}' produced an error:\n{}", m_module_name, ostr.str())); - // LCOV_EXCL_STOP - } - } + detail::verify_module(*m_module); logger->trace("module verification runtime: {}", sw); @@ -1413,39 +1404,42 @@ void llvm_state::compile() // to fix the module and re-attempt compilation without having // altered the module and without having already added the trigger // function. + // NOTE: this function does its own cleanup, no need to + // start the try catch block yet. add_obj_trigger(); try { // Fetch the bitcode *before* optimisation. auto orig_bc = get_bc(); + std::vector obc; + obc.push_back(std::move(orig_bc)); - // Combine m_opt_level, m_force_avx512, m_slp_vectorize and m_c_model into a single value, - // as they all affect codegen. - // NOTE: here we need: - // - 2 bits for m_opt_level, - // - 1 bit for m_force_avx512 and m_slp_vectorize each, - // - 3 bits for m_c_model, - // for a total of 7 bits. - assert(m_opt_level <= 3u); - assert(static_cast(m_c_model) <= 7u); - static_assert(std::numeric_limits::digits >= 7u); - const auto olevel = m_opt_level + (static_cast(m_force_avx512) << 2) - + (static_cast(m_slp_vectorize) << 3) + (static_cast(m_c_model) << 4); - - if (auto cached_data = detail::llvm_state_mem_cache_lookup(orig_bc, olevel)) { + // Assemble the compilation flag. + const auto comp_flag = detail::assemble_comp_flag(m_opt_level, m_force_avx512, m_slp_vectorize, m_c_model); + + // Lookup in the cache. + if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) { // Cache hit. - // Assign the snapshots. - m_ir_snapshot = std::move(cached_data->opt_ir); - m_bc_snapshot = std::move(cached_data->opt_bc); + // Assign the optimised snapshots. + assert(cached_data->opt_ir.size() == 1u); + assert(cached_data->opt_bc.size() == 1u); + assert(cached_data->obj.size() == 1u); + m_ir_snapshot = std::move(cached_data->opt_ir[0]); + m_bc_snapshot = std::move(cached_data->opt_bc[0]); // Clear out module and builder. m_module.reset(); m_builder.reset(); // Assign the object file. - detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj)); + detail::llvm_state_add_obj_to_jit(*m_jitter, std::move(cached_data->obj[0])); + + // Look up the trigger. + jit_lookup(detail::obj_trigger_name); } else { + // Cache miss. + sw.reset(); // Run the optimisation pass. @@ -1460,10 +1454,11 @@ void llvm_state::compile() logger->trace("materialisation runtime: {}", sw); - // Try to insert orig_bc into the cache. - detail::llvm_state_mem_cache_try_insert(std::move(orig_bc), olevel, - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - {m_bc_snapshot, m_ir_snapshot, *m_jitter->m_object_file}); + // Try to insert obc into the cache. 
+ detail::llvm_state_mem_cache_try_insert( + std::move(obc), comp_flag, + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + {.opt_bc = {m_bc_snapshot}, .opt_ir = {m_ir_snapshot}, .obj = {*m_jitter->m_object_file}}); } // LCOV_EXCL_START } catch (...) { @@ -1493,11 +1488,7 @@ std::uintptr_t llvm_state::jit_lookup(const std::string &name) throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in the compiled module", name)); } -#if LLVM_VERSION_MAJOR >= 15 return static_cast((*sym).getValue()); -#else - return static_cast((*sym).getAddress()); -#endif } std::string llvm_state::get_ir() const @@ -1505,12 +1496,7 @@ std::string llvm_state::get_ir() const if (m_module) { // The module has not been compiled yet, // get the IR from it. - std::string out; - llvm::raw_string_ostream ostr(out); - - m_module->print(ostr, nullptr); - - return std::move(ostr.str()); + return detail::ir_from_module(*m_module); } else { // The module has been compiled. // Return the IR snapshot that @@ -1524,12 +1510,7 @@ std::string llvm_state::get_bc() const if (m_module) { // The module has not been compiled yet, // get the bitcode from it. - std::string out; - llvm::raw_string_ostream ostr(out); - - llvm::WriteBitcodeToFile(*m_module, ostr); - - return std::move(ostr.str()); + return detail::bc_from_module(*m_module); } else { // The module has been compiled. // Return the bitcode snapshot that @@ -1610,4 +1591,812 @@ std::ostream &operator<<(std::ostream &os, const llvm_state &s) return os << oss.str(); } +namespace detail +{ + +namespace +{ + +// NOTE: this is a class similar in spirit to llvm_state, but set up for parallel +// compilation of multiple modules. +struct multi_jit { + // NOTE: this is the total number of modules, including + // the master module. + const unsigned m_n_modules = 0; + // NOTE: enumerate the LLVM members here in the same order + // as llvm_state, as this is important to ensure proper + // destruction order. + std::unique_ptr m_lljit; + std::unique_ptr m_ctx; + std::unique_ptr m_module; + std::unique_ptr m_builder; + // Object files. + // NOTE: these may be modified concurrently during compilation, + // protect with mutex. + std::mutex m_object_files_mutex; + std::vector m_object_files; + // IR and bc optimised snapshots. + // NOTE: these may be modified concurrently during compilation, + // protect with mutex. + std::mutex m_ir_bc_mutex; + std::vector m_ir_snapshots; + std::vector m_bc_snapshots; + + explicit multi_jit(unsigned, unsigned, code_model, bool, bool); + multi_jit(const multi_jit &) = delete; + multi_jit(multi_jit &&) noexcept = delete; + llvm_multi_state &operator=(const multi_jit &) = delete; + llvm_multi_state &operator=(multi_jit &&) noexcept = delete; + ~multi_jit() = default; + + // Helper to fetch the context from its thread-safe counterpart. + [[nodiscard]] llvm::LLVMContext &context() const noexcept + { + return *m_ctx->getContext(); + } +}; + +#if 0 + +// A task dispatcher class built on top of TBB's task group. +class tbb_task_dispatcher : public llvm::orc::TaskDispatcher +{ + oneapi::tbb::task_group m_tg; + +public: + void dispatch(std::unique_ptr T) override + { + m_tg.run([T = std::move(T)]() { T->run(); }); + } + void shutdown() override + { + m_tg.wait(); + } + ~tbb_task_dispatcher() noexcept + { + m_tg.wait(); + } +}; + +#endif + +// Reserved identifier for the master module in an llvm_multi_state. 
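+// NOTE: user-supplied module names are checked against this identifier in the
+// llvm_multi_state constructor, so that no state can shadow the master module.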
+constexpr auto master_module_name = "heyoka.master"; + +// NOTE: this largely replicates the logic from the constructors of llvm_state and llvm_state::jit. +// NOTE: make sure to coordinate changes in this constructor with llvm_state::jit. +multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, bool force_avx512, bool slp_vectorize) + : m_n_modules(n_modules) +{ + assert(n_modules >= 2u); + + // NOTE: we assume here that the input arguments have + // been validated already. + assert(opt_level <= 3u); + assert(c_model >= code_model::tiny && c_model <= code_model::large); + + // Ensure the native target is inited. + init_native_target(); + + // Create the target machine builder. + auto jtmb = create_jit_tmb(opt_level, c_model); + + // Create the jit builder. + llvm::orc::LLJITBuilder lljit_builder; + // NOTE: other settable properties may + // be of interest: + // https://www.llvm.org/doxygen/classllvm_1_1orc_1_1LLJITBuilder.html + lljit_builder.setJITTargetMachineBuilder(jtmb); + +#if 0 + // Create a task dispatcher. + auto tdisp = std::make_unique(); + + // Create an ExecutorProcessControl. + auto epc = llvm::orc::SelfExecutorProcessControl::Create(nullptr, std::move(tdisp)); + // LCOV_EXCL_START + if (!epc) { + auto err = epc.takeError(); + + std::string err_report; + llvm::raw_string_ostream ostr(err_report); + + ostr << err; + + throw std::invalid_argument( + fmt::format("Could not create a SelfExecutorProcessControl. The full error message is:\n{}", ostr.str())); + } + // LCOV_EXCL_STOP + + // Set it in the lljit builder. + lljit_builder.setExecutorProcessControl(std::move(*epc)); +#else + + // Set the number of compilation threads. + lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency()); + +#endif + + // Create the jit. + auto lljit = lljit_builder.create(); + // LCOV_EXCL_START + if (!lljit) { + auto err = lljit.takeError(); + + std::string err_report; + llvm::raw_string_ostream ostr(err_report); + + ostr << err; + + throw std::invalid_argument( + fmt::format("Could not create an LLJIT object. The full error message is:\n{}", ostr.str())); + } + // LCOV_EXCL_STOP + m_lljit = std::move(*lljit); + + // Setup the machinery to store the modules' binary code + // when it is generated. + m_lljit->getObjTransformLayer().setTransform([this](std::unique_ptr obj_buffer) { + assert(obj_buffer); + + // Lock down for access to m_object_files. + std::lock_guard lock{m_object_files_mutex}; + + assert(m_object_files.size() <= m_n_modules); + + // NOTE: this callback will be invoked the first time a jit lookup is performed, + // even if the object code was manually injected. In such a case, m_object_files + // has already been set up properly and we just sanity check in debug mode that + // one object file matches the content of obj_buffer. + if (m_object_files.size() < m_n_modules) { + // Add obj_buffer. + m_object_files.push_back(std::string(obj_buffer->getBufferStart(), obj_buffer->getBufferEnd())); + } else { + // Check that at least one buffer in m_object_files is exactly + // identical to obj_buffer. + assert(std::ranges::any_of(m_object_files, [&obj_buffer](const auto &cur) { + return obj_buffer->getBufferSize() == cur.size() + && std::equal(obj_buffer->getBufferStart(), obj_buffer->getBufferEnd(), cur.begin()); + ; + })); + } + + return llvm::Expected>(std::move(obj_buffer)); + }); + + // Setup the machinery to run the optimisation passes on the modules. 
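+ // NOTE: with setNumCompileThreads() > 1 this transform may be invoked concurrently
+ // from several compile threads (one module at a time), hence the per-thread target
+ // machine created below and the mutex protecting the IR/bitcode snapshots.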
+ m_lljit->getIRTransformLayer().setTransform( + [this, opt_level, force_avx512, slp_vectorize, c_model](llvm::orc::ThreadSafeModule TSM, + llvm::orc::MaterializationResponsibility &) { + // See here for an explanation of what withModuleDo() entails: + // + // https://groups.google.com/g/llvm-dev/c/QauU4L_bHac + // + // In our case, the locking/thread safety aspect is not important as we are not sharing + // contexts between threads. More references from discord: + // + // https://discord.com/channels/636084430946959380/687692371038830597/1252428080648163328 + // https://discord.com/channels/636084430946959380/687692371038830597/1252118666187640892 + TSM.withModuleDo([this, opt_level, force_avx512, slp_vectorize, c_model](llvm::Module &M) { + // NOTE: don't run any optimisation on the master module. + if (M.getModuleIdentifier() != master_module_name) { + // NOTE: running the optimisation passes requires mutable access to a target + // machine. Thus, we create a new target machine per thread in order to avoid likely data races + // with a shared target machine. + + // Fetch a target machine builder. + auto jtmb = detail::create_jit_tmb(opt_level, c_model); + + // Try creating the target machine. + auto tm = jtmb.createTargetMachine(); + // LCOV_EXCL_START + if (!tm) [[unlikely]] { + throw std::invalid_argument("Error creating the target machine"); + } + // LCOV_EXCL_STOP + + // NOTE: we used to fetch the target triple from the lljit object, + // but recently we switched to asking the target triple directly + // from the target machine. Assert equality between the two for a while, + // just in case. + // NOTE: lljit.getTargetTriple() just returns a const ref to an internal + // object, it should be ok with concurrent invocation. + assert(m_lljit->getTargetTriple() == (*tm)->getTargetTriple()); + // NOTE: the target triple is also available in the module. + assert(m_lljit->getTargetTriple().str() == M.getTargetTriple()); + + // Optimise the module. + detail::optimise_module(M, **tm, opt_level, force_avx512, slp_vectorize); + } else { + ; + } + + // Store the optimised bitcode/IR for this module. + auto bc_snap = detail::bc_from_module(M); + auto ir_snap = detail::ir_from_module(M); + + // NOTE: protect for multi-threaded access. + std::lock_guard lock{m_ir_bc_mutex}; + + m_bc_snapshots.push_back(std::move(bc_snap)); + m_ir_snapshots.push_back(std::move(ir_snap)); + }); + + return llvm::Expected(std::move(TSM)); + }); + + // Setup the jit so that it can look up symbols from the current process. + auto dlsg + = llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(m_lljit->getDataLayout().getGlobalPrefix()); + // LCOV_EXCL_START + if (!dlsg) { + throw std::invalid_argument("Could not create the dynamic library search generator"); + } + // LCOV_EXCL_STOP + m_lljit->getMainJITDylib().addGenerator(std::move(*dlsg)); + + // Create the master context. + m_ctx = std::make_unique(std::make_unique()); + + // Create the master module. + m_module = std::make_unique(master_module_name, context()); + // Setup the data layout and the target triple. + m_module->setDataLayout(m_lljit->getDataLayout()); + m_module->setTargetTriple(m_lljit->getTargetTriple().str()); + + // Create a new builder for the master module. + // NOTE: no need to mess around with fast math flags for this builder. 
+ m_builder = std::make_unique(context()); +} + +} // namespace + +} // namespace detail + +struct llvm_multi_state::impl { + std::vector m_states; + std::unique_ptr m_jit; +}; + +llvm_multi_state::llvm_multi_state() = default; + +llvm_multi_state::llvm_multi_state(std::vector states_) +{ + // Fetch a const ref, as we want to make extra sure we do not modify + // states_ until we move it to construct the impl. + const auto &states = states_; + + // We need at least 1 state. + if (states.empty()) [[unlikely]] { + throw std::invalid_argument("At least 1 llvm_state object is needed to construct an llvm_multi_state"); + } + + // All states must be uncompiled. + if (std::ranges::any_of(states, &llvm_state::is_compiled)) [[unlikely]] { + throw std::invalid_argument("An llvm_multi_state can be constructed only from uncompiled llvm_state objects"); + } + + // Module names must not collide with master_module_name. + if (std::ranges::any_of(states, [](const auto &s) { return s.module_name() == detail::master_module_name; })) + [[unlikely]] { + throw std::invalid_argument( + fmt::format("An invalid llvm_state was passed to the constructor of an llvm_multi_state: the module name " + "'{}' is reserved for internal use by llvm_multi_state", + detail::master_module_name)); + } + + // Settings in all states must be consistent. + auto states_differ = [](const llvm_state &s1, const llvm_state &s2) { + if (s1.get_opt_level() != s2.get_opt_level()) { + return true; + } + + if (s1.fast_math() != s2.fast_math()) { + return true; + } + + if (s1.force_avx512() != s2.force_avx512()) { + return true; + } + + if (s1.get_slp_vectorize() != s2.get_slp_vectorize()) { + return true; + } + + if (s1.get_code_model() != s2.get_code_model()) { + return true; + } + + // NOTE: bit of paranoia here. + assert(s1.m_jitter->m_lljit->getDataLayout() == s2.m_jitter->m_lljit->getDataLayout()); + assert(s1.m_jitter->get_target_triple() == s2.m_jitter->get_target_triple()); + assert(s1.m_jitter->get_target_cpu() == s2.m_jitter->get_target_cpu()); + assert(s1.m_jitter->get_target_features() == s2.m_jitter->get_target_features()); + + return false; + }; + + if (std::ranges::adjacent_find(states, states_differ) != states.end()) [[unlikely]] { + throw std::invalid_argument( + "Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state"); + } + + // Fetch settings from the first state. + const auto opt_level = states[0].get_opt_level(); + const auto c_model = states[0].get_code_model(); + const auto force_avx512 = states[0].force_avx512(); + const auto slp_vectorize = states[0].get_slp_vectorize(); + + // Create the multi_jit. + auto jit = std::make_unique(boost::safe_numerics::safe(states.size()) + 1, opt_level, + c_model, force_avx512, slp_vectorize); + + // Build and assign the implementation. + impl imp{.m_states = std::move(states_), .m_jit = std::move(jit)}; + m_impl = std::make_unique(std::move(imp)); +} + +llvm_multi_state::llvm_multi_state(const llvm_multi_state &other) +{ + // NOTE: start off by creating a new jit and copying the states. + // This will work regardless of whether other is compiled or not. + // No need to do any validation on the states are they are coming + // from a llvm_multi_state and they have been checked already. 
+ impl imp{.m_states = other.m_impl->m_states, + .m_jit = std::make_unique(other.m_impl->m_jit->m_n_modules, other.get_opt_level(), + other.get_code_model(), other.force_avx512(), + other.get_slp_vectorize())}; + m_impl = std::make_unique(std::move(imp)); + + if (other.is_compiled()) { + // 'other' was compiled. + + // Reset builder and module. + m_impl->m_jit->m_module.reset(); + m_impl->m_jit->m_builder.reset(); + + // Copy over the snapshots and the object files, + m_impl->m_jit->m_object_files = other.m_impl->m_jit->m_object_files; + m_impl->m_jit->m_ir_snapshots = other.m_impl->m_jit->m_ir_snapshots; + m_impl->m_jit->m_bc_snapshots = other.m_impl->m_jit->m_bc_snapshots; + + // Add the files to the jit. + for (const auto &obj : m_impl->m_jit->m_object_files) { + detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj); + } + } else { + // If 'other' was not compiled, we do not need to do anything - the + // copy construction of the states takes care of everything. I.e., this + // is basically the same as construction from a list of states. + // NOTE: regarding the master module: this is always created empty + // and it remains empty until compilation, thus we do not need to care + // about it if other is uncompiled - the new empty master module constructed + // with the jit is ok. + assert(other.m_impl->m_jit->m_object_files.empty()); + assert(other.m_impl->m_jit->m_ir_snapshots.empty()); + assert(other.m_impl->m_jit->m_bc_snapshots.empty()); + } +} + +llvm_multi_state::llvm_multi_state(llvm_multi_state &&) noexcept = default; + +llvm_multi_state &llvm_multi_state::operator=(const llvm_multi_state &other) +{ + if (this != &other) { + *this = llvm_multi_state(other); + } + + return *this; +} + +llvm_multi_state &llvm_multi_state::operator=(llvm_multi_state &&) noexcept = default; + +llvm_multi_state::~llvm_multi_state() = default; + +void llvm_multi_state::save(boost::archive::binary_oarchive &ar, unsigned) const +{ + // Start by establishing if the state is compiled. + const auto cmp = is_compiled(); + ar << cmp; + + // Store the states. + ar << m_impl->m_states; + + // Store the object files and the snapshots. These may be empty. + ar << m_impl->m_jit->m_object_files; + ar << m_impl->m_jit->m_ir_snapshots; + ar << m_impl->m_jit->m_bc_snapshots; + + // NOTE: no need to explicitly store the bitcode of the master + // module: if this is compiled, the master module is in the snapshots. + // Otherwise, the master module is empty and there's no need to + // store anything. +} + +void llvm_multi_state::load(boost::archive::binary_iarchive &ar, unsigned) +{ + try { + // Load the compiled status flag from the archive. + // NOLINTNEXTLINE(misc-const-correctness) + bool cmp{}; + ar >> cmp; + + // Load the states. + ar >> m_impl->m_states; + + // Reset the jit with a new one. + m_impl->m_jit = std::make_unique( + boost::safe_numerics::safe(m_impl->m_states.size()) + 1, get_opt_level(), get_code_model(), + force_avx512(), get_slp_vectorize()); + + // Load the object files and the snapshots. + ar >> m_impl->m_jit->m_object_files; + ar >> m_impl->m_jit->m_ir_snapshots; + ar >> m_impl->m_jit->m_bc_snapshots; + + if (cmp) { + // If the stored state was compiled, we need to reset + // master builder and module. Otherwise, the empty default-constructed + // master module is ok (the master module remains empty until compilation + // is triggered). + m_impl->m_jit->m_module.reset(); + m_impl->m_jit->m_builder.reset(); + + // We also need to add all the object files to the jit. 
+ for (const auto &obj : m_impl->m_jit->m_object_files) { + detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj); + } + } + + // Debug checks. + assert((m_impl->m_jit->m_object_files.empty() && !cmp) + || m_impl->m_jit->m_object_files.size() == m_impl->m_jit->m_n_modules); + assert((m_impl->m_jit->m_object_files.empty() && !cmp) + || m_impl->m_jit->m_ir_snapshots.size() == m_impl->m_jit->m_n_modules); + assert((m_impl->m_jit->m_object_files.empty() && !cmp) + || m_impl->m_jit->m_bc_snapshots.size() == m_impl->m_jit->m_n_modules); + + // LCOV_EXCL_START + } catch (...) { + m_impl.reset(); + + throw; + } + // LCOV_EXCL_STOP +} + +void llvm_multi_state::add_obj_triggers() +{ + // NOTE: the idea here is that we add one trigger function per module, and then + // we invoke all the trigger functions from a trigger function in the master module. + // Like this, we ensure materialisation of all modules when we lookup the + // master trigger. + + // Implement the per-module triggers. + for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) { + // Fetch builder/module/context for the current state. + auto &bld = m_impl->m_states[i].builder(); + auto &md = m_impl->m_states[i].module(); + auto &ctx = m_impl->m_states[i].context(); + + // The function name. + const auto fname = fmt::format("{}_{}", detail::obj_trigger_name, i); + + auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false); + assert(ft != nullptr); + auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, fname.c_str(), &md); + assert(f != nullptr); + + bld.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", f)); + bld.CreateRetVoid(); + } + + // Fetch the master builder/module/context. + auto &bld = *m_impl->m_jit->m_builder; + auto &md = *m_impl->m_jit->m_module; + auto &ctx = m_impl->m_jit->context(); + + // Add the prototypes of all per-module trigger functions to the master module. + std::vector callees; + callees.reserve(m_impl->m_states.size()); + for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) { + // The function name. + const auto fname = fmt::format("{}_{}", detail::obj_trigger_name, i); + + auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false); + assert(ft != nullptr); + auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, fname.c_str(), &md); + assert(f != nullptr); + + callees.push_back(f); + } + + // Create the master trigger function. + auto *ft = llvm::FunctionType::get(bld.getVoidTy(), {}, false); + assert(ft != nullptr); + auto *f = detail::llvm_func_create(ft, llvm::Function::ExternalLinkage, detail::obj_trigger_name, &md); + assert(f != nullptr); + + bld.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", f)); + + // Invoke all the triggers. + for (auto *tf : callees) { + bld.CreateCall(tf, {}); + } + + // Return. 
+ bld.CreateRetVoid(); +} + +void llvm_multi_state::check_compiled(const char *f) const +{ + if (m_impl->m_jit->m_module) [[unlikely]] { + throw std::invalid_argument( + fmt::format("The function '{}' can be invoked only after the llvm_multi_state has been compiled", f)); + } +} + +void llvm_multi_state::check_uncompiled(const char *f) const +{ + if (!m_impl->m_jit->m_module) [[unlikely]] { + throw std::invalid_argument( + fmt::format("The function '{}' can be invoked only if the llvm_multi_state has not been compiled yet", f)); + } +} + +unsigned llvm_multi_state::get_n_modules() const noexcept +{ + return m_impl->m_jit->m_n_modules; +} + +unsigned llvm_multi_state::get_opt_level() const noexcept +{ + return m_impl->m_states[0].get_opt_level(); +} + +bool llvm_multi_state::fast_math() const noexcept +{ + return m_impl->m_states[0].fast_math(); +} + +bool llvm_multi_state::force_avx512() const noexcept +{ + return m_impl->m_states[0].force_avx512(); +} + +bool llvm_multi_state::get_slp_vectorize() const noexcept +{ + return m_impl->m_states[0].get_slp_vectorize(); +} + +code_model llvm_multi_state::get_code_model() const noexcept +{ + return m_impl->m_states[0].get_code_model(); +} + +bool llvm_multi_state::is_compiled() const noexcept +{ + return !m_impl->m_jit->m_module; +} + +std::vector llvm_multi_state::get_ir() const +{ + if (is_compiled()) { + return m_impl->m_jit->m_ir_snapshots; + } else { + std::vector retval; + retval.reserve(m_impl->m_jit->m_n_modules); + + for (const auto &s : m_impl->m_states) { + retval.push_back(s.get_ir()); + } + + // Add the IR from the master module. + retval.push_back(detail::ir_from_module(*m_impl->m_jit->m_module)); + + return retval; + } +} + +std::vector llvm_multi_state::get_bc() const +{ + if (is_compiled()) { + return m_impl->m_jit->m_bc_snapshots; + } else { + std::vector retval; + retval.reserve(m_impl->m_jit->m_n_modules); + + for (const auto &s : m_impl->m_states) { + retval.push_back(s.get_bc()); + } + + // Add the bitcode from the master module. + retval.push_back(detail::bc_from_module(*m_impl->m_jit->m_module)); + + return retval; + } +} + +const std::vector &llvm_multi_state::get_object_code() const +{ + check_compiled(__func__); + + return m_impl->m_jit->m_object_files; +} + +// NOTE: this function is NOT exception-safe, proper cleanup +// needs to be done externally if needed. +void llvm_multi_state::compile_impl() +{ + // Add all the modules from the states. + for (auto &s : m_impl->m_states) { + detail::add_module_to_lljit(*m_impl->m_jit->m_lljit, std::move(s.m_module), *s.m_jitter->m_ctx); + + // Clear out the builder. + s.m_builder.reset(); + + // NOTE: need to manually construct the object file, as this would + // normally be done by the invocation of s.compile() (which we do not do). + s.m_jitter->m_object_file.emplace(); + } + + // Add the master module. + detail::add_module_to_lljit(*m_impl->m_jit->m_lljit, std::move(m_impl->m_jit->m_module), *m_impl->m_jit->m_ctx); + + // Clear out the master builder. + m_impl->m_jit->m_builder.reset(); + + // Trigger optimisation and object code materialisation via lookup. + jit_lookup(detail::obj_trigger_name); + + // Sanity checks. + assert(m_impl->m_jit->m_bc_snapshots.size() == m_impl->m_jit->m_n_modules); + assert(m_impl->m_jit->m_ir_snapshots.size() == m_impl->m_jit->m_n_modules); + assert(m_impl->m_jit->m_object_files.size() == m_impl->m_jit->m_n_modules); +} + +void llvm_multi_state::compile() +{ + check_uncompiled(__func__); + + // Log runtime in trace mode. 
+ spdlog::stopwatch sw;
+
+ auto *logger = detail::get_logger();
+
+ // Verify the modules before compiling.
+ // NOTE: probably this can be parallelised if needed.
+ for (decltype(m_impl->m_states.size()) i = 0; i < m_impl->m_states.size(); ++i) {
+ detail::verify_module(*m_impl->m_states[i].m_module);
+ }
+
+ logger->trace("llvm_multi_state module verification runtime: {}", sw);
+
+ try {
+ // Add the object materialisation trigger functions.
+ // NOTE: contrary to llvm_state::add_obj_trigger(), add_obj_triggers()
+ // does not implement any automatic cleanup in case of errors. Thus, we fold
+ // it into the try/catch block in order to avoid leaving the
+ // llvm_multi_state in a half-baked state.
+ add_obj_triggers();
+
+ // Fetch the bitcode *before* optimisation.
+ std::vector obc;
+ obc.reserve(boost::safe_numerics::safe(m_impl->m_states.size()) + 1u);
+ for (const auto &s : m_impl->m_states) {
+ obc.push_back(s.get_bc());
+ }
+ // Add the master bitcode.
+ obc.push_back(detail::bc_from_module(*m_impl->m_jit->m_module));
+
+ // Assemble the compilation flag.
+ const auto comp_flag
+ = detail::assemble_comp_flag(get_opt_level(), force_avx512(), get_slp_vectorize(), get_code_model());
+
+ // Lookup in the cache.
+ if (auto cached_data = detail::llvm_state_mem_cache_lookup(obc, comp_flag)) {
+ // Cache hit.
+
+ // Assign the optimised snapshots.
+ assert(cached_data->opt_ir.size() == m_impl->m_jit->m_n_modules);
+ assert(cached_data->opt_bc.size() == m_impl->m_jit->m_n_modules);
+ assert(cached_data->obj.size() == m_impl->m_jit->m_n_modules);
+ assert(m_impl->m_jit->m_ir_snapshots.empty());
+ assert(m_impl->m_jit->m_bc_snapshots.empty());
+ m_impl->m_jit->m_ir_snapshots = std::move(cached_data->opt_ir);
+ m_impl->m_jit->m_bc_snapshots = std::move(cached_data->opt_bc);
+
+ // NOTE: here it is important that we replicate the logic happening
+ // in llvm_state::compile(): clear out module/builder, construct
+ // the object file. The snapshots can be left empty.
+ for (auto &s : m_impl->m_states) {
+ s.m_module.reset();
+ s.m_builder.reset();
+ s.m_jitter->m_object_file.emplace();
+ }
+
+ // Clear out master module and builder.
+ m_impl->m_jit->m_module.reset();
+ m_impl->m_jit->m_builder.reset();
+
+ // Add and assign the object files.
+ for (const auto &obj : cached_data->obj) {
+ detail::add_obj_to_lljit(*m_impl->m_jit->m_lljit, obj);
+ }
+
+ // Assign the compiled objects.
+ assert(m_impl->m_jit->m_object_files.empty());
+ m_impl->m_jit->m_object_files = std::move(cached_data->obj);
+
+ // Lookup the trigger.
+ jit_lookup(detail::obj_trigger_name);
+ } else {
+ // Cache miss.
+
+ sw.reset();
+
+ // Run the compilation.
+ compile_impl();
+
+ logger->trace("optimisation + materialisation runtime: {}", sw);
+
+ // NOTE: at this point, m_ir_snapshots, m_bc_snapshots and m_object_files
+ // have all been constructed in random order because of multithreading.
+ // Sort them so that we provide deterministic behaviour. Probably
+ // not strictly needed, but let's try to avoid nondeterminism.
+ // All of this can be parallelised if needed.
+ std::ranges::sort(m_impl->m_jit->m_ir_snapshots);
+ std::ranges::sort(m_impl->m_jit->m_bc_snapshots);
+ std::ranges::sort(m_impl->m_jit->m_object_files);
+
+ // Try to insert obc into the cache.
+ detail::llvm_state_mem_cache_try_insert(std::move(obc), comp_flag,
+ {.opt_bc = m_impl->m_jit->m_bc_snapshots,
+ .opt_ir = m_impl->m_jit->m_ir_snapshots,
+ .obj = m_impl->m_jit->m_object_files});
+ // LCOV_EXCL_START
+ }
+ } catch (...)
{ + // Reset to a def-cted state in case of error, + // as it looks like there's no way of recovering. + m_impl.reset(); + + throw; + } + // LCOV_EXCL_STOP +} + +std::uintptr_t llvm_multi_state::jit_lookup(const std::string &name) +{ + check_compiled(__func__); + + auto sym = m_impl->m_jit->m_lljit->lookup(name); + if (!sym) { + throw std::invalid_argument(fmt::format("Could not find the symbol '{}' in an llvm_multi_state", name)); + } + + return static_cast((*sym).getValue()); +} + +std::ostream &operator<<(std::ostream &os, const llvm_multi_state &s) +{ + std::ostringstream oss; + oss << std::boolalpha; + + oss << "N of modules : " << s.get_n_modules() << '\n'; + oss << "Compiled : " << s.is_compiled() << '\n'; + oss << "Fast math : " << s.fast_math() << '\n'; + oss << "Force AVX512 : " << s.force_avx512() << '\n'; + oss << "SLP vectorization : " << s.get_slp_vectorize() << '\n'; + oss << "Code model : " << s.get_code_model() << '\n'; + oss << "Optimisation level: " << s.get_opt_level() << '\n'; + oss << "Data layout : " << s.m_impl->m_states[0].m_jitter->m_lljit->getDataLayout().getStringRepresentation() + << '\n'; + oss << "Target triple : " << s.m_impl->m_states[0].m_jitter->get_target_triple().str() << '\n'; + oss << "Target CPU : " << s.m_impl->m_states[0].m_jitter->get_target_cpu() << '\n'; + oss << "Target features : " << s.m_impl->m_states[0].m_jitter->get_target_features() << '\n'; + + return os << oss.str(); +} + HEYOKA_END_NAMESPACE diff --git a/src/llvm_state_mem_cache.cpp b/src/llvm_state_mem_cache.cpp index e3135ae25..88ba8ea0f 100644 --- a/src/llvm_state_mem_cache.cpp +++ b/src/llvm_state_mem_cache.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -28,7 +28,8 @@ #include // This in-memory cache maps the bitcode -// of an LLVM module and an optimisation level to: +// of one or more LLVM modules and an integer flag +// (representing several compilation settings) to: // // - the optimised version of the bitcode, // - the textual IR corresponding @@ -43,6 +44,26 @@ HEYOKA_BEGIN_NAMESPACE namespace detail { +// Helper to compute the total size in bytes +// of the data contained in an llvm_mc_value. +// Will throw on overflow. +std::size_t llvm_mc_value::total_size() const +{ + assert(!opt_bc.empty()); + assert(opt_bc.size() == opt_ir.size()); + assert(opt_bc.size() == obj.size()); + + boost::safe_numerics::safe ret = 0; + + for (decltype(opt_bc.size()) i = 0; i < opt_bc.size(); ++i) { + ret += opt_bc[i].size(); + ret += opt_ir[i].size(); + ret += obj[i].size(); + } + + return ret; +} + namespace { @@ -56,16 +77,33 @@ HEYOKA_CONSTINIT std::mutex mem_cache_mutex; // Definition of the data structures for the cache. -using lru_queue_t = std::list>; +using lru_queue_t = std::list, unsigned>>; using lru_key_t = lru_queue_t::iterator; +// Implementation of hashing for std::pair, unsigned> and +// its heterogeneous counterpart. +template +auto cache_key_hasher(const T &k) noexcept +{ + assert(!k.first.empty()); + + // Combine the bitcodes. + auto seed = std::hash{}(k.first[0]); + for (decltype(k.first.size()) i = 1; i < k.first.size(); ++i) { + boost::hash_combine(seed, k.first[i]); + } + + // Combine with the compilation flag. 
+ boost::hash_combine(seed, static_cast(k.second)); + + return seed; +} + struct lru_hasher { std::size_t operator()(const lru_key_t &k) const noexcept { - auto seed = std::hash{}(k->first); - boost::hash_combine(seed, k->second); - return seed; + return cache_key_hasher(*k); } }; @@ -96,16 +134,16 @@ HEYOKA_CONSTINIT std::uint64_t mem_cache_limit = 2147483648ull; // Machinery for heterogeneous lookup into the cache. // NOTE: this function MUST be invoked while holding the global lock. -auto llvm_state_mem_cache_hl(const std::string &bc, unsigned opt_level) +auto llvm_state_mem_cache_hl(const std::vector &bc, unsigned comp_flag) { - using compat_key_t = std::pair; + // NOTE: the heterogeneous version of the key replaces std::vector + // with a const reference. + using compat_key_t = std::pair &, unsigned>; struct compat_hasher { std::size_t operator()(const compat_key_t &k) const noexcept { - auto seed = std::hash{}(k.first); - boost::hash_combine(seed, k.second); - return seed; + return cache_key_hasher(k); } }; @@ -120,7 +158,7 @@ auto llvm_state_mem_cache_hl(const std::string &bc, unsigned opt_level) } }; - return lru_map.find(std::make_pair(std::cref(bc), opt_level), compat_hasher{}, compat_cmp{}); + return lru_map.find(std::make_pair(std::cref(bc), comp_flag), compat_hasher{}, compat_cmp{}); } // Debug function to run sanity checks on the cache. @@ -131,15 +169,13 @@ void llvm_state_mem_cache_sanity_checks() // Check that the computed size of the cache is consistent with mem_cache_size. assert(std::accumulate(lru_map.begin(), lru_map.end(), boost::safe_numerics::safe(0), - [](const auto &a, const auto &p) { - return a + p.second.opt_bc.size() + p.second.opt_ir.size() + p.second.obj.size(); - }) + [](const auto &a, const auto &p) { return a + p.second.total_size(); }) == mem_cache_size); } } // namespace -std::optional llvm_state_mem_cache_lookup(const std::string &bc, unsigned opt_level) +std::optional llvm_state_mem_cache_lookup(const std::vector &bc, unsigned comp_flag) { // Lock down. const std::lock_guard lock(mem_cache_mutex); @@ -147,7 +183,7 @@ std::optional llvm_state_mem_cache_lookup(const std::string &bc, // Sanity checks. llvm_state_mem_cache_sanity_checks(); - if (const auto it = llvm_state_mem_cache_hl(bc, opt_level); it == lru_map.end()) { + if (const auto it = llvm_state_mem_cache_hl(bc, comp_flag); it == lru_map.end()) { // Cache miss. return {}; } else { @@ -163,7 +199,7 @@ std::optional llvm_state_mem_cache_lookup(const std::string &bc, } } -void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc_value val) +void llvm_state_mem_cache_try_insert(std::vector bc, unsigned comp_flag, llvm_mc_value val) { // Lock down. const std::lock_guard lock(mem_cache_mutex); @@ -174,7 +210,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc // Do a first lookup to check if bc is already in the cache. // This could happen, e.g., if two threads are compiling the same // code concurrently. - if (const auto it = llvm_state_mem_cache_hl(bc, opt_level); it != lru_map.end()) { + if (const auto it = llvm_state_mem_cache_hl(bc, comp_flag); it != lru_map.end()) { assert(val.opt_bc == it->second.opt_bc); assert(val.opt_ir == it->second.opt_ir); assert(val.obj == it->second.obj); @@ -183,8 +219,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc } // Compute the new cache size. 
- auto new_cache_size = static_cast(boost::safe_numerics::safe(mem_cache_size) - + val.opt_bc.size() + val.opt_ir.size() + val.obj.size()); + auto new_cache_size = boost::safe_numerics::safe(mem_cache_size) + val.total_size(); // Remove items from the cache if we are exceeding // the limit. @@ -195,8 +230,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc const auto &cur_val = cur_it->second; // NOTE: no possibility of overflow here, as cur_size is guaranteed // not to be greater than mem_cache_size. - const auto cur_size - = static_cast(cur_val.opt_bc.size()) + cur_val.opt_ir.size() + cur_val.obj.size(); + const auto cur_size = cur_val.total_size(); // NOTE: the next 4 lines cannot throw, which ensures that the // cache cannot be left in an inconsistent state. @@ -222,7 +256,7 @@ void llvm_state_mem_cache_try_insert(std::string bc, unsigned opt_level, llvm_mc // Add the new item to the front of the queue. // NOTE: if this throws, we have not modified lru_map yet, // no cleanup needed. - lru_queue.emplace_front(std::move(bc), opt_level); + lru_queue.emplace_front(std::move(bc), comp_flag); // Add the new item to the map. try { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 09e360cab..f6c4ccd29 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -159,6 +159,7 @@ ADD_HEYOKA_TESTCASE(cfunc) ADD_HEYOKA_TESTCASE(cfunc_multieval) ADD_HEYOKA_TESTCASE(dfun) ADD_HEYOKA_TESTCASE(var_ode_sys) +ADD_HEYOKA_TESTCASE(llvm_multi_state) if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR) ADD_HEYOKA_TESTCASE(event_detection_mp) diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp new file mode 100644 index 000000000..652ab5b56 --- /dev/null +++ b/test/llvm_multi_state.cpp @@ -0,0 +1,578 @@ +// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "catch.hpp" + +using namespace heyoka; + +TEST_CASE("basic") +{ + using Catch::Matchers::Message; + + // Default construction. + { + REQUIRE_NOTHROW(llvm_multi_state{}); + } + + // No states in input. + REQUIRE_THROWS_MATCHES(llvm_multi_state{{}}, std::invalid_argument, + Message("At least 1 llvm_state object is needed to construct an llvm_multi_state")); + + // Inconsistent settings. 
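// Aside: a minimal, self-contained sketch (not taken from the heyoka sources, and not
// part of this patch) of the caching scheme implemented in llvm_state_mem_cache.cpp
// above: the key is the vector of bitcodes of all the modules plus the integer
// compilation flag, the key hash is obtained by combining the hashes of the individual
// bitcodes and the flag, and entries are evicted in LRU order once a total byte-size
// budget is exceeded. The real cache stores the optimised bitcode, IR and object files
// per entry and supports heterogeneous lookup; the simplified names and the
// single-string payload below are illustrative assumptions.

#include <cstddef>
#include <functional>
#include <list>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace sketch
{

using cache_key = std::pair<std::vector<std::string>, unsigned>;

struct cache_key_hash {
    std::size_t operator()(const cache_key &k) const
    {
        // Combine the hashes of the individual bitcodes, then the compilation flag
        // (boost::hash_combine plays this role in the real implementation).
        std::size_t seed = 0;
        const auto combine = [&seed](std::size_t h) { seed ^= h + 0x9e3779b9u + (seed << 6) + (seed >> 2); };
        for (const auto &bc : k.first) {
            combine(std::hash<std::string>{}(bc));
        }
        combine(std::hash<unsigned>{}(k.second));
        return seed;
    }
};

class lru_bc_cache
{
    // Most-recently-used keys at the front of the queue.
    std::list<cache_key> m_queue;
    // Key -> (position in the queue, cached payload).
    std::unordered_map<cache_key, std::pair<std::list<cache_key>::iterator, std::string>, cache_key_hash> m_map;
    std::size_t m_size = 0, m_limit;

public:
    explicit lru_bc_cache(std::size_t limit) : m_limit(limit) {}

    std::optional<std::string> lookup(const cache_key &k)
    {
        const auto it = m_map.find(k);
        if (it == m_map.end()) {
            return {};
        }
        // Cache hit: move the entry to the front of the queue.
        m_queue.splice(m_queue.begin(), m_queue, it->second.first);
        return it->second.second;
    }

    void try_insert(cache_key k, std::string payload)
    {
        if (m_map.find(k) != m_map.end()) {
            // Already present (e.g., two threads compiled the same code concurrently).
            return;
        }
        const auto payload_size = payload.size();
        // Evict least-recently-used entries until the new payload fits in the budget.
        while (!m_queue.empty() && m_size + payload_size > m_limit) {
            const auto vit = m_map.find(m_queue.back());
            m_size -= vit->second.second.size();
            m_map.erase(vit);
            m_queue.pop_back();
        }
        m_queue.push_front(std::move(k));
        m_map.emplace(m_queue.front(), std::make_pair(m_queue.begin(), std::move(payload)));
        m_size += payload_size;
    }
};

} // namespace sketch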
+ REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{kw::opt_level = 1u}, llvm_state{kw::opt_level = 2u}}}), std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{kw::fast_math = true}, llvm_state{}}}), std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{}, llvm_state{kw::force_avx512 = true}}}), std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{}, llvm_state{}, llvm_state{kw::slp_vectorize = true}}}), std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{}, llvm_state{kw::code_model = code_model::large}, llvm_state{}}}), + std::invalid_argument, + Message("Inconsistent llvm_state settings detected in the constructor of an llvm_multi_state")); + + { + // Construction from compiled modules. + llvm_state s; + s.compile(); + + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{s, llvm_state{}}}), std::invalid_argument, + Message("An llvm_multi_state can be constructed only from uncompiled llvm_state objects")); + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{llvm_state{}, s}}), std::invalid_argument, + Message("An llvm_multi_state can be constructed only from uncompiled llvm_state objects")); + } + + { + // Invalid module name. + llvm_state s{kw::mname = "heyoka.master"}; + REQUIRE_THROWS_MATCHES( + (llvm_multi_state{{s, llvm_state{}}}), std::invalid_argument, + Message("An invalid llvm_state was passed to the constructor of an llvm_multi_state: the module name " + "'heyoka.master' is reserved for internal use by llvm_multi_state")); + } + + // Test the property getters. + { + llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true, + kw::code_model = code_model::large}; + + llvm_multi_state ms{{s, s, s, s}}; + + REQUIRE(ms.get_opt_level() == 1u); + REQUIRE(ms.fast_math()); + REQUIRE(ms.force_avx512()); + REQUIRE(ms.get_slp_vectorize()); + REQUIRE(ms.get_code_model() == code_model::large); + REQUIRE(ms.get_n_modules() == 5u); + REQUIRE(!ms.is_compiled()); + + ms.compile(); + + REQUIRE(ms.is_compiled()); + REQUIRE(ms.get_opt_level() == 1u); + REQUIRE(ms.fast_math()); + REQUIRE(ms.force_avx512()); + REQUIRE(ms.get_slp_vectorize()); + REQUIRE(ms.get_code_model() == code_model::large); + REQUIRE(ms.get_n_modules() == 5u); + + REQUIRE_THROWS_MATCHES( + ms.compile(), std::invalid_argument, + Message("The function 'compile' can be invoked only if the llvm_multi_state has not been compiled yet")); + } + + // Move construction/assignment. 
+ { + llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true, + kw::code_model = code_model::large}; + + llvm_multi_state ms{{s, s, s, s}}; + + auto ms2 = std::move(ms); + + REQUIRE(ms2.get_opt_level() == 1u); + REQUIRE(ms2.fast_math()); + REQUIRE(ms2.force_avx512()); + REQUIRE(ms2.get_slp_vectorize()); + REQUIRE(ms2.get_code_model() == code_model::large); + REQUIRE(ms2.get_n_modules() == 5u); + REQUIRE(!ms2.is_compiled()); + + ms2.compile(); + + llvm_multi_state ms3; + ms3 = std::move(ms2); + + REQUIRE(ms3.is_compiled()); + REQUIRE(ms3.get_opt_level() == 1u); + REQUIRE(ms3.fast_math()); + REQUIRE(ms3.force_avx512()); + REQUIRE(ms3.get_slp_vectorize()); + REQUIRE(ms3.get_code_model() == code_model::large); + REQUIRE(ms3.get_n_modules() == 5u); + } +} + +TEST_CASE("copy semantics") +{ + using Catch::Matchers::Message; + + // NOTE: in order to properly test this, we have to disable the cache. + llvm_state::clear_memcache(); + llvm_state::set_memcache_limit(0); + + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2; + + add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true); + add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true); + + llvm_multi_state ms{{s1, s2}}; + + auto ms_copy = ms; + + REQUIRE(ms_copy.get_bc() == ms.get_bc()); + REQUIRE(ms_copy.get_ir() == ms.get_ir()); + REQUIRE(ms_copy.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy.fast_math() == ms.fast_math()); + REQUIRE(ms_copy.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE_THROWS_MATCHES( + ms_copy.get_object_code(), std::invalid_argument, + Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled")); + REQUIRE_THROWS_MATCHES( + ms_copy.jit_lookup("foo"), std::invalid_argument, + Message("The function 'jit_lookup' can be invoked only after the llvm_multi_state has been compiled")); + + ms.compile(); + ms_copy.compile(); + + REQUIRE(ms_copy.get_bc() == ms.get_bc()); + REQUIRE(ms_copy.get_ir() == ms.get_ir()); + REQUIRE(ms_copy.get_object_code() == ms.get_object_code()); + REQUIRE(ms_copy.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy.fast_math() == ms.fast_math()); + REQUIRE(ms_copy.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE_NOTHROW(ms_copy.jit_lookup("f1")); + REQUIRE_NOTHROW(ms_copy.jit_lookup("f2")); + + { + auto *cf1_ptr = reinterpret_cast( + ms_copy.jit_lookup("f1")); + auto *cf2_ptr = reinterpret_cast( + ms_copy.jit_lookup("f2")); + + REQUIRE_THROWS_MATCHES(ms_copy.jit_lookup("f3"), std::invalid_argument, + Message("Could not find the symbol 'f3' in an llvm_multi_state")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. 
/ 3.); + } + + auto ms_copy2 = ms; + + REQUIRE(ms_copy2.get_bc() == ms.get_bc()); + REQUIRE(ms_copy2.get_ir() == ms.get_ir()); + REQUIRE(ms_copy2.get_object_code() == ms.get_object_code()); + REQUIRE(ms_copy2.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy2.fast_math() == ms.fast_math()); + REQUIRE(ms_copy2.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy2.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy2.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy2.get_code_model() == ms.get_code_model()); + REQUIRE_NOTHROW(ms_copy2.jit_lookup("f1")); + REQUIRE_NOTHROW(ms_copy2.jit_lookup("f2")); + + { + auto *cf1_ptr = reinterpret_cast( + ms_copy2.jit_lookup("f1")); + auto *cf2_ptr = reinterpret_cast( + ms_copy2.jit_lookup("f2")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. / 3.); + } + + // Test also copy assignment. + llvm_multi_state ms_copy3; + ms_copy3 = ms_copy2; + + REQUIRE(ms_copy3.get_bc() == ms.get_bc()); + REQUIRE(ms_copy3.get_ir() == ms.get_ir()); + REQUIRE(ms_copy3.get_object_code() == ms.get_object_code()); + REQUIRE(ms_copy3.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy3.fast_math() == ms.fast_math()); + REQUIRE(ms_copy3.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy3.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy3.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy3.get_code_model() == ms.get_code_model()); + REQUIRE_NOTHROW(ms_copy3.jit_lookup("f1")); + REQUIRE_NOTHROW(ms_copy3.jit_lookup("f2")); + + { + auto *cf1_ptr = reinterpret_cast( + ms_copy3.jit_lookup("f1")); + auto *cf2_ptr = reinterpret_cast( + ms_copy3.jit_lookup("f2")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. / 3.); + } + + // Restore the cache. + llvm_state::set_memcache_limit(100'000'000ull); +} + +TEST_CASE("s11n") +{ + using Catch::Matchers::Message; + + // NOTE: in order to properly test this, we have to disable the cache. + llvm_state::clear_memcache(); + llvm_state::set_memcache_limit(0); + + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2; + + add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true); + add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true); + + // Uncompiled. 
+ llvm_multi_state ms{{s1, s2}}; + + std::stringstream ss; + + { + boost::archive::binary_oarchive oa(ss); + oa << ms; + } + + llvm_multi_state ms_copy{{llvm_state{}}}; + + { + boost::archive::binary_iarchive ia(ss); + ia >> ms_copy; + } + + REQUIRE(ms_copy.get_bc() == ms.get_bc()); + REQUIRE(ms_copy.get_ir() == ms.get_ir()); + REQUIRE(ms_copy.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy.fast_math() == ms.fast_math()); + REQUIRE(ms_copy.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE_THROWS_MATCHES( + ms_copy.get_object_code(), std::invalid_argument, + Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled")); + REQUIRE_THROWS_MATCHES( + ms_copy.jit_lookup("foo"), std::invalid_argument, + Message("The function 'jit_lookup' can be invoked only after the llvm_multi_state has been compiled")); + + // Compiled. + ms.compile(); + + ss.str(""); + + { + boost::archive::binary_oarchive oa(ss); + oa << ms; + } + + { + boost::archive::binary_iarchive ia(ss); + ia >> ms_copy; + } + + REQUIRE(ms_copy.get_bc() == ms.get_bc()); + REQUIRE(ms_copy.get_ir() == ms.get_ir()); + REQUIRE(ms_copy.get_object_code() == ms.get_object_code()); + REQUIRE(ms_copy.is_compiled() == ms.is_compiled()); + REQUIRE(ms_copy.fast_math() == ms.fast_math()); + REQUIRE(ms_copy.force_avx512() == ms.force_avx512()); + REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); + REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); + REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE_NOTHROW(ms_copy.jit_lookup("f1")); + REQUIRE_NOTHROW(ms_copy.jit_lookup("f2")); + + { + auto *cf1_ptr = reinterpret_cast( + ms_copy.jit_lookup("f1")); + auto *cf2_ptr = reinterpret_cast( + ms_copy.jit_lookup("f2")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. / 3.); + } + + // Restore the cache. + llvm_state::set_memcache_limit(100'000'000ull); +} + +TEST_CASE("cfunc") +{ + using Catch::Matchers::Message; + + // Basic test. + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2; + + add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true); + add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true); + + const auto orig_ir1 = s1.get_ir(); + const auto orig_ir2 = s2.get_ir(); + + const auto orig_bc1 = s1.get_bc(); + const auto orig_bc2 = s2.get_bc(); + + llvm_multi_state ms{{s1, s2}}; + + REQUIRE(ms.get_ir().size() == 3u); + REQUIRE(ms.get_bc().size() == 3u); + REQUIRE_THROWS_MATCHES( + ms.get_object_code(), std::invalid_argument, + Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled")); + + REQUIRE(orig_ir1 == ms.get_ir()[0]); + REQUIRE(orig_ir2 == ms.get_ir()[1]); + + REQUIRE(orig_bc1 == ms.get_bc()[0]); + REQUIRE(orig_bc2 == ms.get_bc()[1]); + + ms.compile(); + s1.compile(); + s2.compile(); + + REQUIRE(ms.get_ir().size() == 3u); + REQUIRE(ms.get_bc().size() == 3u); + + auto *cf1_ptr + = reinterpret_cast(ms.jit_lookup("f1")); + auto *cf2_ptr + = reinterpret_cast(ms.jit_lookup("f2")); + + const double ins[] = {2., 3.}; + double outs[2] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. 
/ 3.); +} + +TEST_CASE("stream op") +{ + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2; + + add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true); + add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true); + + const auto orig_ir1 = s1.get_ir(); + const auto orig_ir2 = s2.get_ir(); + + const auto orig_bc1 = s1.get_bc(); + const auto orig_bc2 = s2.get_bc(); + + llvm_multi_state ms{{s1, s2}}; + + std::ostringstream oss; + oss << ms; + + REQUIRE(!oss.str().empty()); +} + +// A test to check that, post compilation, snapshots and object files +// are ordered deterministically. +TEST_CASE("post compile ordering") +{ + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2, s3, s4; + + add_cfunc(s1, "f1", {x * y}, {x, y}); + add_cfunc(s2, "f2", {x / y}, {x, y}); + add_cfunc(s3, "f3", {x + y}, {x, y}); + add_cfunc(s4, "f4", {x - y}, {x, y}); + + llvm_state::clear_memcache(); + + llvm_multi_state ms{{s1, s2, s3, s4}}; + ms.compile(); + + const auto orig_obj = ms.get_object_code(); + const auto orig_ir = ms.get_ir(); + const auto orig_bc = ms.get_bc(); + + for (auto i = 0; i < 20; ++i) { + llvm_state::clear_memcache(); + + llvm_multi_state ms2{{s1, s2, s3, s4}}; + ms2.compile(); + + REQUIRE(ms2.get_object_code() == orig_obj); + REQUIRE(ms2.get_ir() == orig_ir); + REQUIRE(ms2.get_bc() == orig_bc); + } +} + +TEST_CASE("memcache testing") +{ + auto [x, y] = make_vars("x", "y"); + + llvm_state s1, s2, s3, s4; + + add_cfunc(s1, "f1", {x * y}, {x, y}); + add_cfunc(s2, "f2", {x / y}, {x, y}); + add_cfunc(s3, "f3", {x + y}, {x, y}); + add_cfunc(s4, "f4", {x - y}, {x, y}); + + llvm_state::clear_memcache(); + + llvm_multi_state ms{{s1, s2, s3, s4}}; + ms.compile(); + + const auto cur_cache_size = llvm_state::get_memcache_size(); + + llvm_multi_state ms2{{s1, s2, s3, s4}}; + ms2.compile(); + + REQUIRE(cur_cache_size == llvm_state::get_memcache_size()); + + auto *cf1_ptr + = reinterpret_cast(ms.jit_lookup("f1")); + auto *cf2_ptr + = reinterpret_cast(ms.jit_lookup("f2")); + auto *cf3_ptr + = reinterpret_cast(ms.jit_lookup("f3")); + auto *cf4_ptr + = reinterpret_cast(ms.jit_lookup("f4")); + + const double ins[] = {2., 3.}; + double outs[4] = {}; + + cf1_ptr(outs, ins, nullptr, nullptr); + cf2_ptr(outs + 1, ins, nullptr, nullptr); + cf3_ptr(outs + 2, ins, nullptr, nullptr); + cf4_ptr(outs + 3, ins, nullptr, nullptr); + + REQUIRE(outs[0] == 6); + REQUIRE(outs[1] == 2. / 3.); + REQUIRE(outs[2] == 5); + REQUIRE(outs[3] == -1); +} + +// Tests to check vectorisation via the vector-function-abi-variant machinery. +TEST_CASE("vfabi double") +{ + for (auto fast_math : {false, true}) { + llvm_state s1{kw::slp_vectorize = true, kw::fast_math = fast_math}; + llvm_state s2{kw::slp_vectorize = true, kw::fast_math = fast_math}; + + auto [a, b] = make_vars("a", "b"); + + add_cfunc(s1, "cfunc", {erf(a), erf(b)}, {a, b}); + add_cfunc(s2, "cfuncs", {erf(a), erf(b)}, {a, b}, kw::strided = true); + + llvm_multi_state ms{{s1, s2}}; + + ms.compile(); + + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
+#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 + + for (auto ir : ms.get_ir()) { + using string_find_iterator = boost::find_iterator; + + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } + + // NOTE: in the master module or in the "cfunc" module, we don't + // expect any @erf: the master module contains only the trigger, + // the "cfunc" module should have vectorised everything and + // there should be no more references to the scalar @erf. + if (count == 0u) { + continue; + } + + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } + +#endif + } +} diff --git a/tools/gha_llvm13_conda_asan.sh b/tools/gha_llvm13_conda_asan.sh deleted file mode 100644 index cba8e3692..000000000 --- a/tools/gha_llvm13_conda_asan.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -# Echo each command -set -x - -# Exit on error. -set -e - -# Core deps. -sudo apt-get install wget - -# Install conda+deps. -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniconda.sh -export deps_dir=$HOME/local -export PATH="$HOME/miniconda/bin:$PATH" -bash miniconda.sh -b -p $HOME/miniconda -mamba create -y -p $deps_dir c-compiler cxx-compiler cmake ninja 'llvmdev=13.*' \ - tbb-devel tbb libboost-devel 'mppp=1.*' sleef xtensor xtensor-blas blas \ - blas-devel fmt spdlog -source activate $deps_dir - -# Create the build dir and cd into it. -mkdir build -cd build - -# Clear the compilation flags set up by conda. -unset CXXFLAGS -unset CFLAGS - -# Configure. -cmake ../ -G Ninja \ - -DCMAKE_PREFIX_PATH=$deps_dir \ - -DCMAKE_BUILD_TYPE=Debug \ - -DHEYOKA_BUILD_TESTS=yes \ - -DHEYOKA_BUILD_TUTORIALS=ON \ - -DHEYOKA_WITH_MPPP=yes \ - -DHEYOKA_WITH_SLEEF=yes \ - -DCMAKE_CXX_FLAGS="-fsanitize=address" \ - -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" - -# Build. -ninja -v - -# Run the tests. -ctest -VV -j4 - -set +e -set +x diff --git a/tools/gha_llvm14_conda_asan.sh b/tools/gha_llvm14_conda_asan.sh deleted file mode 100644 index cc54de5ae..000000000 --- a/tools/gha_llvm14_conda_asan.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -# Echo each command -set -x - -# Exit on error. -set -e - -# Core deps. -sudo apt-get install wget - -# Install conda+deps. -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O miniconda.sh -export deps_dir=$HOME/local -export PATH="$HOME/miniconda/bin:$PATH" -bash miniconda.sh -b -p $HOME/miniconda -mamba create -y -p $deps_dir c-compiler cxx-compiler cmake ninja 'llvmdev=14.*' \ - tbb-devel tbb libboost-devel 'mppp=1.*' sleef xtensor xtensor-blas blas \ - blas-devel fmt spdlog -source activate $deps_dir - -# Create the build dir and cd into it. -mkdir build -cd build - -# Clear the compilation flags set up by conda. -unset CXXFLAGS -unset CFLAGS - -# Configure. -cmake ../ -G Ninja \ - -DCMAKE_PREFIX_PATH=$deps_dir \ - -DCMAKE_BUILD_TYPE=Debug \ - -DHEYOKA_BUILD_TESTS=yes \ - -DHEYOKA_BUILD_TUTORIALS=ON \ - -DHEYOKA_WITH_MPPP=yes \ - -DHEYOKA_WITH_SLEEF=yes \ - -DCMAKE_CXX_FLAGS="-fsanitize=address" \ - -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" - -# Build. -ninja -v - -# Run the tests. -ctest -VV -j4 - -set +e -set +x
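// Aside: a minimal usage sketch of the new llvm_multi_state class, distilled from the
// tests added in test/llvm_multi_state.cpp above. It is not part of the patch; the
// header names and the <double> template argument of add_cfunc are assumptions (the
// template arguments are elided in this diff), and the cfunc pointer signature
// (outputs, inputs, pars, time) mirrors the casts used in the tests.

#include <iostream>

#include <heyoka/expression.hpp>
#include <heyoka/llvm_state.hpp>

int main()
{
    using namespace heyoka;

    auto [x, y] = make_vars("x", "y");

    // One compiled function per llvm_state.
    llvm_state s1, s2;
    add_cfunc<double>(s1, "f1", {x * y}, {x, y});
    add_cfunc<double>(s2, "f2", {x / y}, {x, y});

    // Merge the (still uncompiled) states. All settings (opt level, fast math,
    // code model, ...) must be consistent across the states, otherwise the
    // constructor throws.
    llvm_multi_state ms{{s1, s2}};

    // Two input modules plus the internal master module:
    // ms.get_n_modules() == 3u.

    // Compile everything in one go: the whole set of modules shares a single
    // entry in the in-memory cache, keyed on the bitcodes of all the modules.
    ms.compile();

    // Look up and invoke the compiled functions.
    using cfunc_ptr_t = void (*)(double *, const double *, const double *, const double *);
    auto *f1 = reinterpret_cast<cfunc_ptr_t>(ms.jit_lookup("f1"));
    auto *f2 = reinterpret_cast<cfunc_ptr_t>(ms.jit_lookup("f2"));

    const double ins[] = {2., 3.};
    double outs[2] = {};
    f1(outs, ins, nullptr, nullptr);
    f2(outs + 1, ins, nullptr, nullptr);

    std::cout << outs[0] << ' ' << outs[1] << '\n'; // prints "6 0.666667"
}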