From 1b7a02c2f1585e9aa4404079d03b508f3626796a Mon Sep 17 00:00:00 2001
From: John Biddiscombe
Date: Wed, 24 Apr 2024 08:53:32 +0200
Subject: [PATCH 01/20] Use pika's transform_mpi, polling modes

This is a squashed commit containing multiple changes.

completion_modes: pika supports different completion modes that may be used as an alternative to the dlaf::transformMPI mechanism that uses yield_while to wait on an MPI request. The completion modes may be set via the environment variable PIKA_MPI_COMPLETION_MODE=, which by default selects the mode chosen by the pika/dlaf developers as known to give good results across a broad range of use cases.

polling: The pika polling loop may test for one or multiple request completions on each iteration through the scheduling loop. The environment variable PIKA_MPI_POLLING_SIZE= (default 8) may be used to vary the polling size (typically the default value can be used without any need to tune it).

mpi pool: pika will create the mpi pool if the completion mode has the pool flag set; the user only needs to call the pool create function during the pika::init setup phase. Cleanup of the pool on shutdown is also handled automatically. The user should use pika::mpi::pool_name instead of the raw "mpi" string, as mpi pool management has been deferred to pika::mpi.

Change: the transform mpi code does not need to return an MPI_SUCCESS value; the return value from mpi_transform has been removed to simplify the code, and an error is reported via the sender's set_error if any mpi call fails. Should mpi_transform calls that return another value be required, this code can be reinstated.
---
 include/dlaf/init.h | 1 - include/dlaf/schedulers.h | 3 +- include/dlaf/sender/transform_mpi.h | 84 +++++++++++++++---- src/init.cpp | 67 +++++++-------- test/unit/communication/test_comm_sender.cpp | 8 +- .../unit/communication/test_transform_mpi.cpp | 6 +- 6 files changed, 104 insertions(+), 65 deletions(-)
diff --git a/include/dlaf/init.h b/include/dlaf/init.h index fe21d84ac5..ff779217fd 100644 --- a/include/dlaf/init.h +++ b/include/dlaf/init.h @@ -50,7 +50,6 @@ struct configuration { double umpire_device_memory_pool_coalescing_reallocation_ratio = 1.0; std::size_t num_gpu_blas_handles = 16; std::size_t num_gpu_lapack_handles = 16; - std::string mpi_pool = "mpi"; }; std::ostream& operator<<(std::ostream& os, const configuration& cfg);
diff --git a/include/dlaf/schedulers.h b/include/dlaf/schedulers.h index 1b6aae04fa..34cf6850fb 100644 --- a/include/dlaf/schedulers.h +++ b/include/dlaf/schedulers.h @@ -12,6 +12,7 @@ /// @file #include +#include #include #include @@ -50,6 +51,6 @@ auto getBackendScheduler( inline auto getMPIScheduler() { return pika::execution::experimental::thread_pool_scheduler{ - &pika::resource::get_thread_pool(getConfiguration().mpi_pool)}; + &pika::resource::get_thread_pool(pika::mpi::experimental::get_pool_name())}; } } // namespace dlaf::internal
diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index f5c2ac728a..e5028d0474 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -12,6 +12,10 @@ #include #include +#ifdef EXTRA_MPI_TYPES_DEBUGGING +#include +#endif +#include #include #include @@ -20,9 +24,14 @@ #include #include #include +// +#include namespace dlaf::comm::internal { +template +static pika::debug::detail::print_threshold dla_debug("DLA_MPI"); + /// This helper "consumes" a CommunicatorPipelineExclusiveWrapper ensuring that after this call /// the one passed as argument gets destroyed.
All other types left as they are /// by the second overload. @@ -45,17 +54,12 @@ void consumeCommunicatorWrapper(T&) {} /// least until version 12 fails with an internal compiler error with a trailing /// decltype for SFINAE. GCC has no problems with a lambda. template -struct MPICallHelper { +struct MPIYieldWhileCallHelper { std::decay_t f; template - auto operator()(Ts&&... ts) -> decltype(std::move(f)(dlaf::common::internal::unwrap(ts)..., - std::declval())) { + auto operator()(Ts&&... ts) { + namespace mpid = pika::mpi::experimental::detail; MPI_Request req; - auto is_request_completed = [&req] { - int flag; - MPI_Test(&req, &flag, MPI_STATUS_IGNORE); - return flag == 0; - }; // Note: // Callables passed to transformMPI have their arguments passed by reference, but doing so @@ -71,17 +75,41 @@ struct MPICallHelper { if constexpr (std::is_void_v) { std::move(f)(dlaf::common::internal::unwrap(ts)..., &req); (internal::consumeCommunicatorWrapper(ts), ...); - pika::util::yield_while(is_request_completed); + pika::util::yield_while([req]() { return !mpid::poll_request(req); }); + } + else { + /*auto r = */ std::move(f)(dlaf::common::internal::unwrap(ts)..., &req); + (internal::consumeCommunicatorWrapper(ts), ...); + pika::util::yield_while([req]() { return !mpid::poll_request(req); }); + } + } +}; + +/// Helper type for wrapping MPI calls. +template +struct MPICallHelper { + std::decay_t f; + + template + auto operator()(Ts&&... ts) -> decltype(std::move(f)(dlaf::common::internal::unwrap(ts)...)) { + using namespace pika::debug::detail; + PIKA_DETAIL_DP(dla_debug<5>, debug(str<>("MPICallHelper"), pika::debug::print_type(", "))); + using result_type = decltype(std::move(f)(dlaf::common::internal::unwrap(ts)...)); + if constexpr (std::is_void_v) { + std::move(f)(dlaf::common::internal::unwrap(ts)...); + (internal::consumeCommunicatorWrapper(ts), ...); } else { - auto r = std::move(f)(dlaf::common::internal::unwrap(ts)..., &req); + auto r = std::move(f)(dlaf::common::internal::unwrap(ts)...); (internal::consumeCommunicatorWrapper(ts), ...); - pika::util::yield_while(is_request_completed); return r; } } }; +template +MPIYieldWhileCallHelper(F&&) -> MPIYieldWhileCallHelper>; + template MPICallHelper(F&&) -> MPICallHelper>; @@ -91,12 +119,38 @@ template (sender), dlaf::internal::getMPIScheduler()) | - ex::then(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::forward(f)}}) | - ex::drop_operation_state(); + namespace mpi = pika::mpi::experimental; + namespace mpid = pika::mpi::experimental::detail; + + if (mpi::get_completion_mode() >= static_cast(mpid::handler_mode::unspecified)) { + auto snd1 = + ex::continues_on(std::forward(sender), dlaf::internal::getMPIScheduler()) | + ex::then(dlaf::common::internal::ConsumeRvalues{MPIYieldWhileCallHelper{std::forward(f)}}); + return ex::make_unique_any_sender(std::move(snd1)); + } + else { +#ifdef EXTRA_MPI_TYPES_DEBUGGING + auto snd1 = + std::forward(sender) | + ex::let_value([=, f = std::move(f)](LArgs&&... largs) { + PIKA_DETAIL_DP(dla_debug<2>, debug(str<>("Args to MPI fn\n"), + pika::debug::print_type(", "), "\nValues\n")); + return ex::just(std::move(largs)...) 
| + mpi::transform_mpi(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::move(f)}}); + }); + return ex::make_unique_any_sender(std::move(snd1)); +#else + PIKA_DETAIL_DP(dla_debug<5>, debug(str<>("MPI fn\n"))); + auto snd1 = + std::forward(sender) | + mpi::transform_mpi(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::forward(f)}}); + return ex::make_unique_any_sender(std::move(snd1)); +#endif + } } + std::forward(ts)...)); + template struct PartialTransformMPIBase { std::decay_t f_; diff --git a/src/init.cpp b/src/init.cpp index 0d4366e15e..ae9f88142c 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -44,9 +45,8 @@ std::ostream& operator<<(std::ostream& os, const configuration& cfg) { os << " umpire_device_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_device_memory_pool_coalescing_reallocation_ratio << std::endl; os << " num_gpu_blas_handles = " << cfg.num_gpu_blas_handles << std::endl; os << " num_gpu_lapack_handles = " << cfg.num_gpu_lapack_handles << std::endl; - os << " mpi_pool = " << cfg.mpi_pool << std::endl; // clang-format on - return os; + return os;0 } namespace internal { @@ -70,6 +70,9 @@ struct Init { cfg.umpire_host_memory_pool_initial_block_bytes, cfg.umpire_host_memory_pool_next_block_bytes, cfg.umpire_host_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio, cfg.umpire_host_memory_pool_coalescing_reallocation_ratio); + // install mpi polling loop + pika::mpi::experimental::init(false, true); + pika::mpi::experimental::register_polling(); } static void finalize() { @@ -125,6 +128,9 @@ struct Init { initializeGpuPool(device, cfg.num_np_gpu_streams_per_thread, cfg.num_hp_gpu_streams_per_thread, cfg.num_gpu_blas_handles, cfg.num_gpu_lapack_handles); pika::cuda::experimental::detail::register_polling(pika::resource::get_thread_pool("default")); + // setup polling on default pool, enable exceptions and init mpi internals + pika::mpi::experimental::init(false, true); + pika::mpi::experimental::register_polling(); } static void finalize() { @@ -262,18 +268,18 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu cfg.mpi_pool = (pika::resource::pool_exists("mpi")) ? "mpi" : "default"; // Warn if not using MPI pool without --dlaf:no-mpi-pool - int mpi_initialized; - DLAF_MPI_CHECK_ERROR(MPI_Initialized(&mpi_initialized)); - if (mpi_initialized) { - int ntasks; - DLAF_MPI_CHECK_ERROR(MPI_Comm_size(MPI_COMM_WORLD, &ntasks)); - if (ntasks != 1 && cfg.mpi_pool == "default" && !vm["dlaf:no-mpi-pool"].as()) { - std::cerr << "Warning! DLA-Future is not using the \"mpi\" pika thread pool for " - "MPI communication but --dlaf:no-mpi-pool is not set. This may " - "indicate a bug in DLA-Future or pika. Performance may be degraded." - << std::endl; - } - } + // int mpi_initialized; + // DLAF_MPI_CHECK_ERROR(MPI_Initialized(&mpi_initialized)); + // if (mpi_initialized) { + // int ntasks; + // DLAF_MPI_CHECK_ERROR(MPI_Comm_size(MPI_COMM_WORLD, &ntasks)); + // if (ntasks != 1 && cfg.mpi_pool == "default" && !vm["dlaf:no-mpi-pool"].as()) { + // std::cerr << "Warning! DLA-Future is not using the \"mpi\" pika thread pool for " + // "MPI communication but --dlaf:no-mpi-pool is not set. This may " + // "indicate a bug in DLA-Future or pika. Performance may be degraded." 
+ // << std::endl; + // } + // } // update tune parameters // @@ -421,30 +427,17 @@ ScopedInitializer::~ScopedInitializer() { finalize(); } -void initResourcePartitionerHandler(pika::resource::partitioner& rp, +void initResourcePartitionerHandler(pika::resource::partitioner&, const pika::program_options::variables_map& vm) { - // Don't create the MPI pool if the user disabled it + namespace mpi = pika::mpi::experimental; + // Create the MPI pool if needed and unless the user disabled it + mpi::pool_create_mode pool_mode = mpi::pool_create_mode::pika_decides; + namespace mpi = pika::mpi::experimental; if (vm["dlaf:no-mpi-pool"].as()) - return; - - // Don't create the MPI pool if there is a single process - int ntasks; - DLAF_MPI_CHECK_ERROR(MPI_Comm_size(MPI_COMM_WORLD, &ntasks)); - if (ntasks == 1) - return; - - // Disable idle backoff on the MPI pool - using pika::threads::scheduler_mode; - auto mode = scheduler_mode::default_mode; - mode = scheduler_mode(mode & ~scheduler_mode::enable_idle_backoff); - - // Create a thread pool with a single core that we will use for all - // communication related tasks - rp.create_thread_pool("mpi", pika::resource::scheduling_policy::static_priority, mode); -#if PIKA_VERSION_FULL >= 0x001C00 // >= 0.28.0 - rp.add_resource(rp.sockets()[0].cores()[0].pus()[0], "mpi"); -#else - rp.add_resource(rp.numa_domains()[0].cores()[0].pus()[0], "mpi"); -#endif + pool_mode = mpi::pool_create_mode::force_no_create; + + namespace mpix = pika::mpi::experimental; + // create a pool for mpi if necessary + mpix::create_pool(mpix::get_pool_name(), pool_mode); } } diff --git a/test/unit/communication/test_comm_sender.cpp b/test/unit/communication/test_comm_sender.cpp index 3083301660..d03867e696 100644 --- a/test/unit/communication/test_comm_sender.cpp +++ b/test/unit/communication/test_comm_sender.cpp @@ -43,10 +43,7 @@ void test_transform_mpi() { auto send = just(send_buf.data(), size, dtype, send_rank, tag, comm) | transformMPI(MPI_Isend); auto recv = just(recv_buf.data(), size, dtype, recv_rank, tag, comm) | transformMPI(MPI_Irecv); - sync_wait(when_all(std::move(send), std::move(recv)) | then([](int e1, int e2) { - DLAF_MPI_CHECK_ERROR(e1); - DLAF_MPI_CHECK_ERROR(e2); - })); + sync_wait(when_all(std::move(send), std::move(recv)) | then([]() {})); std::vector expected_recv_buf(static_cast(size), recv_rank); @@ -65,8 +62,7 @@ TEST(Bcast, Polling) { double val = (comm.rank() == root_rank) ? 
4.2 : 1.2; std::vector buf(static_cast(size), val); - sync_wait(just(buf.data(), size, dtype, root_rank, comm) | transformMPI(MPI_Ibcast) | - then([](int e) { DLAF_MPI_CHECK_ERROR(e); })); + sync_wait(just(buf.data(), size, dtype, root_rank, comm) | transformMPI(MPI_Ibcast) | then([]() {})); std::vector expected_buf(static_cast(size), 4.2); ASSERT_TRUE(expected_buf == buf); diff --git a/test/unit/communication/test_transform_mpi.cpp b/test/unit/communication/test_transform_mpi.cpp index bf9f857e0d..28f67fa28e 100644 --- a/test/unit/communication/test_transform_mpi.cpp +++ b/test/unit/communication/test_transform_mpi.cpp @@ -65,11 +65,7 @@ TEST_F(TransformMPITest, PromiseGuardManagement) { int message; whenAllLift(&message, 1, MPI_INT, 1, 0, chain.exclusive()) | transformMPI(MPI_Irecv) | - ex::then([&sent_guard](auto mpi_err_code) { - EXPECT_EQ(MPI_SUCCESS, mpi_err_code); - sent_guard = true; - }) | - ex::ensure_started(); + ex::then([&sent_guard](/*auto mpi_err_code*/) { sent_guard = true; }) | ex::ensure_started(); // Note: // At this point IRecv is (getting) posted but it won't complete until this Rank 0 will trigger From d1beb5bcaea539112a96c018be8cf2e11cfe2aad Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Wed, 15 May 2024 14:20:02 +0200 Subject: [PATCH 02/20] Move MPI debug ifdef --- include/dlaf/sender/transform_mpi.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index e5028d0474..3f1cb24e4b 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -12,9 +12,6 @@ #include #include -#ifdef EXTRA_MPI_TYPES_DEBUGGING -#include -#endif #include #include @@ -26,6 +23,10 @@ #include // #include +// +#ifdef EXTRA_MPI_TYPES_DEBUGGING +#include +#endif namespace dlaf::comm::internal { From 88f2f1389fe8957a6b07a25987b44b6e4fefb721 Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Sun, 28 Jul 2024 22:35:41 +0200 Subject: [PATCH 03/20] Update to use latest pika mpi completion mode types --- include/dlaf/sender/transform_mpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index 3f1cb24e4b..3f8545893e 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -123,7 +123,7 @@ template = static_cast(mpid::handler_mode::unspecified)) { + if (mpi::get_completion_mode() >= static_cast(mpid::handler_method::unspecified)) { auto snd1 = ex::continues_on(std::forward(sender), dlaf::internal::getMPIScheduler()) | ex::then(dlaf::common::internal::ConsumeRvalues{MPIYieldWhileCallHelper{std::forward(f)}}); From eb87dce2cc7326992ebfb8df5c25bc4b89e6547c Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Mon, 14 Oct 2024 11:03:49 +0200 Subject: [PATCH 04/20] Update dla mpi to use latest pika::mpi::experimental API A series of changes to pika::mpi have changed both the API and the internal pool creation mechanism to simplify end user access to the transform_mpi features and setup of mpi polling itself. 
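
For illustration only (not part of this patch): a minimal sketch of how an application is expected to set things up after this change. The pool_creation_mode line and the internal polling behaviour mirror the diffs below; the pika_main body, headers and options description are assumed boilerplate rather than verified API.

  #include <cstdlib>
  #include <mpi.h>
  #include <pika/init.hpp>
  #include <dlaf/init.h>

  int pika_main(pika::program_options::variables_map& vm) {
    // dlaf::initialize() starts pika's MPI polling internally after this change
    // (see the src/init.cpp hunks below), so no explicit polling setup is needed here.
    dlaf::initialize(vm);
    // ... run DLA-Future algorithms that use transformMPI ...
    dlaf::finalize();
    pika::finalize();
    return EXIT_SUCCESS;
  }

  int main(int argc, char** argv) {
    // DLA-Future requires MPI to be initialized with MPI_THREAD_MULTIPLE
    int provided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);

    pika::program_options::options_description desc_commandline("usage");
    pika::init_params p;
    p.desc_cmdline = desc_commandline;
    // let pika decide whether the "mpi" pool is created (replaces the old rp_callback)
    p.pool_creation_mode = ::pika::resource::mode_pika_decides;

    int ret = pika::init(pika_main, argc, argv, p);
    MPI_Finalize();
    return ret;
  }

Note that a later patch in this series drops the explicit pool_creation_mode line again, once pika creates the pool from its own command-line/environment settings.
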
--- include/dlaf/init.h | 6 ---- miniapp/miniapp_band_to_tridiag.cpp | 2 +- miniapp/miniapp_bt_band_to_tridiag.cpp | 2 +- miniapp/miniapp_bt_reduction_to_band.cpp | 2 +- miniapp/miniapp_cholesky.cpp | 2 +- miniapp/miniapp_communication.cpp | 2 +- miniapp/miniapp_eigensolver.cpp | 2 +- miniapp/miniapp_gen_eigensolver.cpp | 2 +- miniapp/miniapp_gen_to_std.cpp | 2 +- miniapp/miniapp_reduction_to_band.cpp | 2 +- miniapp/miniapp_triangular_multiplication.cpp | 2 +- miniapp/miniapp_triangular_solver.cpp | 2 +- miniapp/miniapp_tridiag_solver.cpp | 2 +- src/c_api/init.cpp | 2 +- src/init.cpp | 29 +++++++------------ test/src/gtest_mpipika_main.cpp | 2 +- 16 files changed, 24 insertions(+), 39 deletions(-) diff --git a/include/dlaf/init.h b/include/dlaf/init.h index ff779217fd..31d78e4331 100644 --- a/include/dlaf/init.h +++ b/include/dlaf/init.h @@ -106,10 +106,4 @@ struct [[nodiscard]] ScopedInitializer { ScopedInitializer& operator=(ScopedInitializer&&) = delete; ScopedInitializer& operator=(const ScopedInitializer&) = delete; }; - -/// Initialize the MPI pool. -/// -/// -void initResourcePartitionerHandler(pika::resource::partitioner& rp, - const pika::program_options::variables_map& vm); } diff --git a/miniapp/miniapp_band_to_tridiag.cpp b/miniapp/miniapp_band_to_tridiag.cpp index bace372c48..780bc3f558 100644 --- a/miniapp/miniapp_band_to_tridiag.cpp +++ b/miniapp/miniapp_band_to_tridiag.cpp @@ -209,6 +209,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_bt_band_to_tridiag.cpp b/miniapp/miniapp_bt_band_to_tridiag.cpp index 0b89769198..9ab31a21c2 100644 --- a/miniapp/miniapp_bt_band_to_tridiag.cpp +++ b/miniapp/miniapp_bt_band_to_tridiag.cpp @@ -219,6 +219,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_bt_reduction_to_band.cpp b/miniapp/miniapp_bt_reduction_to_band.cpp index 8761bcd91c..5a713d5bc4 100644 --- a/miniapp/miniapp_bt_reduction_to_band.cpp +++ b/miniapp/miniapp_bt_reduction_to_band.cpp @@ -238,6 +238,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_cholesky.cpp b/miniapp/miniapp_cholesky.cpp index 947835f955..f58fa6729c 100644 --- a/miniapp/miniapp_cholesky.cpp +++ b/miniapp/miniapp_cholesky.cpp @@ -228,7 +228,7 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_communication.cpp b/miniapp/miniapp_communication.cpp index dd147d6d26..3a8e0048ca 100644 --- a/miniapp/miniapp_communication.cpp +++ b/miniapp/miniapp_communication.cpp @@ -606,6 +606,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = 
::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_eigensolver.cpp b/miniapp/miniapp_eigensolver.cpp index 11abddf5d5..92c47f493b 100644 --- a/miniapp/miniapp_eigensolver.cpp +++ b/miniapp/miniapp_eigensolver.cpp @@ -257,7 +257,7 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_gen_eigensolver.cpp b/miniapp/miniapp_gen_eigensolver.cpp index fe27104fbf..7e30b7056d 100644 --- a/miniapp/miniapp_gen_eigensolver.cpp +++ b/miniapp/miniapp_gen_eigensolver.cpp @@ -286,7 +286,7 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_gen_to_std.cpp b/miniapp/miniapp_gen_to_std.cpp index 8fa3a8c70d..3b845aa5d7 100644 --- a/miniapp/miniapp_gen_to_std.cpp +++ b/miniapp/miniapp_gen_to_std.cpp @@ -216,6 +216,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_reduction_to_band.cpp b/miniapp/miniapp_reduction_to_band.cpp index 299af230b0..72dc8cb4da 100644 --- a/miniapp/miniapp_reduction_to_band.cpp +++ b/miniapp/miniapp_reduction_to_band.cpp @@ -257,6 +257,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_triangular_multiplication.cpp b/miniapp/miniapp_triangular_multiplication.cpp index 93e07fc7b4..74081de853 100644 --- a/miniapp/miniapp_triangular_multiplication.cpp +++ b/miniapp/miniapp_triangular_multiplication.cpp @@ -225,6 +225,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_triangular_solver.cpp b/miniapp/miniapp_triangular_solver.cpp index 3ebdb9440e..2b7896ee5f 100644 --- a/miniapp/miniapp_triangular_solver.cpp +++ b/miniapp/miniapp_triangular_solver.cpp @@ -244,6 +244,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_tridiag_solver.cpp b/miniapp/miniapp_tridiag_solver.cpp index 3457748c62..32b68407db 100644 --- a/miniapp/miniapp_tridiag_solver.cpp +++ b/miniapp/miniapp_tridiag_solver.cpp @@ -225,6 +225,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/src/c_api/init.cpp b/src/c_api/init.cpp index 
0a36a70226..47218f841a 100644 --- a/src/c_api/init.cpp +++ b/src/c_api/init.cpp @@ -26,7 +26,7 @@ void dlaf_initialize(int argc_pika, const char** argv_pika, int argc_dlaf, // pika initialization pika::init_params params; - params.rp_callback = dlaf::initResourcePartitionerHandler; + params.pool_creation_mode = ::pika::resource::mode_pika_decides; params.desc_cmdline = desc; // After pika 0.21.0 pika::start reports errors only by exception and returns void #if PIKA_VERSION_FULL >= 0x001500 diff --git a/src/init.cpp b/src/init.cpp index ae9f88142c..f90fa7ff4d 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -71,11 +71,15 @@ struct Init { cfg.umpire_host_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio, cfg.umpire_host_memory_pool_coalescing_reallocation_ratio); // install mpi polling loop - pika::mpi::experimental::init(false, true); - pika::mpi::experimental::register_polling(); + if (pika::mpi::detail::environment::is_mpi_initialized()) { + pika::mpi::experimental::start_polling(pika::mpi::experimental::exception_mode::no_handler); + } } static void finalize() { + if (pika::mpi::detail::environment::is_mpi_initialized()) { + pika::mpi::experimental::stop_polling(); + } memory::internal::finalizeUmpireHostAllocator(); } }; @@ -125,12 +129,13 @@ struct Init { cfg.umpire_device_memory_pool_initial_block_bytes, cfg.umpire_device_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio, cfg.umpire_host_memory_pool_coalescing_reallocation_ratio); + memory::internal::initializeUmpireDeviceAllocator(cfg.umpire_device_memory_pool_initial_bytes); initializeGpuPool(device, cfg.num_np_gpu_streams_per_thread, cfg.num_hp_gpu_streams_per_thread, cfg.num_gpu_blas_handles, cfg.num_gpu_lapack_handles); + if (pika::mpi::detail::environment::is_mpi_initialized()) { + pika::mpi::experimental::start_polling(pika::mpi::experimental::exception_mode::no_handler); + } pika::cuda::experimental::detail::register_polling(pika::resource::get_thread_pool("default")); - // setup polling on default pool, enable exceptions and init mpi internals - pika::mpi::experimental::init(false, true); - pika::mpi::experimental::register_polling(); } static void finalize() { @@ -426,18 +431,4 @@ ScopedInitializer::ScopedInitializer(int argc, const char* const argv[], const c ScopedInitializer::~ScopedInitializer() { finalize(); } - -void initResourcePartitionerHandler(pika::resource::partitioner&, - const pika::program_options::variables_map& vm) { - namespace mpi = pika::mpi::experimental; - // Create the MPI pool if needed and unless the user disabled it - mpi::pool_create_mode pool_mode = mpi::pool_create_mode::pika_decides; - namespace mpi = pika::mpi::experimental; - if (vm["dlaf:no-mpi-pool"].as()) - pool_mode = mpi::pool_create_mode::force_no_create; - - namespace mpix = pika::mpi::experimental; - // create a pool for mpi if necessary - mpix::create_pool(mpix::get_pool_name(), pool_mode); -} } diff --git a/test/src/gtest_mpipika_main.cpp b/test/src/gtest_mpipika_main.cpp index a7f9b23e09..ef7bfbb55e 100644 --- a/test/src/gtest_mpipika_main.cpp +++ b/test/src/gtest_mpipika_main.cpp @@ -103,7 +103,7 @@ GTEST_API_ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.rp_callback = dlaf::initResourcePartitionerHandler; + p.pool_creation_mode = ::pika::resource::mode_pika_decides; // Initialize pika auto ret = pika::init(test_main, argc, argv, p); From 929abd29cb97650a957186519ce97169dc2888b6 Mon Sep 17 00:00:00 2001 From: John Biddiscombe 
Date: Thu, 17 Oct 2024 10:37:14 +0200 Subject: [PATCH 05/20] Simplify init procedure as pika now uses command-line pool creation latest pika:: transform_mpi is enabled/disabled via the command line and does not require explicit enabling via init_params --- include/dlaf/sender/transform_mpi.h | 36 ++++++++----------- miniapp/miniapp_band_to_tridiag.cpp | 1 - miniapp/miniapp_bt_band_to_tridiag.cpp | 1 - miniapp/miniapp_bt_reduction_to_band.cpp | 1 - miniapp/miniapp_cholesky.cpp | 1 - miniapp/miniapp_communication.cpp | 1 - miniapp/miniapp_eigensolver.cpp | 1 - miniapp/miniapp_gen_eigensolver.cpp | 1 - miniapp/miniapp_gen_to_std.cpp | 1 - miniapp/miniapp_reduction_to_band.cpp | 1 - miniapp/miniapp_triangular_multiplication.cpp | 1 - miniapp/miniapp_triangular_solver.cpp | 1 - miniapp/miniapp_tridiag_solver.cpp | 1 - src/c_api/init.cpp | 1 - test/src/gtest_mpipika_main.cpp | 1 - 15 files changed, 14 insertions(+), 36 deletions(-) diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index 3f8545893e..0e9ff7942d 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -123,31 +123,23 @@ template = static_cast(mpid::handler_method::unspecified)) { - auto snd1 = - ex::continues_on(std::forward(sender), dlaf::internal::getMPIScheduler()) | - ex::then(dlaf::common::internal::ConsumeRvalues{MPIYieldWhileCallHelper{std::forward(f)}}); - return ex::make_unique_any_sender(std::move(snd1)); - } - else { #ifdef EXTRA_MPI_TYPES_DEBUGGING - auto snd1 = - std::forward(sender) | - ex::let_value([=, f = std::move(f)](LArgs&&... largs) { - PIKA_DETAIL_DP(dla_debug<2>, debug(str<>("Args to MPI fn\n"), - pika::debug::print_type(", "), "\nValues\n")); - return ex::just(std::move(largs)...) | - mpi::transform_mpi(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::move(f)}}); - }); - return ex::make_unique_any_sender(std::move(snd1)); + auto snd1 = + std::forward(sender) | + ex::let_value([=, f = std::move(f)](LArgs&&... largs) { + PIKA_DETAIL_DP(dla_debug<2>, debug(str<>("Args to MPI fn\n"), + pika::debug::print_type(", "), "\nValues\n")); + return ex::just(std::move(largs)...) 
| + mpi::transform_mpi(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::move(f)}}); + }); + return ex::make_unique_any_sender(std::move(snd1)); #else - PIKA_DETAIL_DP(dla_debug<5>, debug(str<>("MPI fn\n"))); - auto snd1 = - std::forward(sender) | - mpi::transform_mpi(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::forward(f)}}); - return ex::make_unique_any_sender(std::move(snd1)); + PIKA_DETAIL_DP(dla_debug<5>, debug(str<>("MPI fn\n"))); + auto snd1 = + std::forward(sender) | + mpi::transform_mpi(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::forward(f)}}); + return ex::make_unique_any_sender(std::move(snd1)); #endif - } } std::forward(ts)...)); diff --git a/miniapp/miniapp_band_to_tridiag.cpp b/miniapp/miniapp_band_to_tridiag.cpp index 780bc3f558..b1ced27259 100644 --- a/miniapp/miniapp_band_to_tridiag.cpp +++ b/miniapp/miniapp_band_to_tridiag.cpp @@ -209,6 +209,5 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_bt_band_to_tridiag.cpp b/miniapp/miniapp_bt_band_to_tridiag.cpp index 9ab31a21c2..5a6f14b8c1 100644 --- a/miniapp/miniapp_bt_band_to_tridiag.cpp +++ b/miniapp/miniapp_bt_band_to_tridiag.cpp @@ -219,6 +219,5 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_bt_reduction_to_band.cpp b/miniapp/miniapp_bt_reduction_to_band.cpp index 5a713d5bc4..6d10992788 100644 --- a/miniapp/miniapp_bt_reduction_to_band.cpp +++ b/miniapp/miniapp_bt_reduction_to_band.cpp @@ -238,6 +238,5 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_cholesky.cpp b/miniapp/miniapp_cholesky.cpp index f58fa6729c..23e7420f5e 100644 --- a/miniapp/miniapp_cholesky.cpp +++ b/miniapp/miniapp_cholesky.cpp @@ -228,7 +228,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_communication.cpp b/miniapp/miniapp_communication.cpp index 3a8e0048ca..21bfdb6570 100644 --- a/miniapp/miniapp_communication.cpp +++ b/miniapp/miniapp_communication.cpp @@ -606,6 +606,5 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_eigensolver.cpp b/miniapp/miniapp_eigensolver.cpp index 92c47f493b..86fe40447e 100644 --- a/miniapp/miniapp_eigensolver.cpp +++ b/miniapp/miniapp_eigensolver.cpp @@ -257,7 +257,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_gen_eigensolver.cpp b/miniapp/miniapp_gen_eigensolver.cpp index 7e30b7056d..2771b5d6d6 100644 --- a/miniapp/miniapp_gen_eigensolver.cpp +++ b/miniapp/miniapp_gen_eigensolver.cpp @@ -286,7 +286,6 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - 
p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_gen_to_std.cpp b/miniapp/miniapp_gen_to_std.cpp index 3b845aa5d7..de136168bb 100644 --- a/miniapp/miniapp_gen_to_std.cpp +++ b/miniapp/miniapp_gen_to_std.cpp @@ -216,6 +216,5 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_reduction_to_band.cpp b/miniapp/miniapp_reduction_to_band.cpp index 72dc8cb4da..3b6e92e088 100644 --- a/miniapp/miniapp_reduction_to_band.cpp +++ b/miniapp/miniapp_reduction_to_band.cpp @@ -257,6 +257,5 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_triangular_multiplication.cpp b/miniapp/miniapp_triangular_multiplication.cpp index 74081de853..923c0edf82 100644 --- a/miniapp/miniapp_triangular_multiplication.cpp +++ b/miniapp/miniapp_triangular_multiplication.cpp @@ -225,6 +225,5 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_triangular_solver.cpp b/miniapp/miniapp_triangular_solver.cpp index 2b7896ee5f..d96fe73d91 100644 --- a/miniapp/miniapp_triangular_solver.cpp +++ b/miniapp/miniapp_triangular_solver.cpp @@ -244,6 +244,5 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/miniapp/miniapp_tridiag_solver.cpp b/miniapp/miniapp_tridiag_solver.cpp index 32b68407db..e61a1e5f4a 100644 --- a/miniapp/miniapp_tridiag_solver.cpp +++ b/miniapp/miniapp_tridiag_solver.cpp @@ -225,6 +225,5 @@ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; return pika::init(pika_main, argc, argv, p); } diff --git a/src/c_api/init.cpp b/src/c_api/init.cpp index 47218f841a..bd34b57ccf 100644 --- a/src/c_api/init.cpp +++ b/src/c_api/init.cpp @@ -26,7 +26,6 @@ void dlaf_initialize(int argc_pika, const char** argv_pika, int argc_dlaf, // pika initialization pika::init_params params; - params.pool_creation_mode = ::pika::resource::mode_pika_decides; params.desc_cmdline = desc; // After pika 0.21.0 pika::start reports errors only by exception and returns void #if PIKA_VERSION_FULL >= 0x001500 diff --git a/test/src/gtest_mpipika_main.cpp b/test/src/gtest_mpipika_main.cpp index ef7bfbb55e..2c40f6e14b 100644 --- a/test/src/gtest_mpipika_main.cpp +++ b/test/src/gtest_mpipika_main.cpp @@ -103,7 +103,6 @@ GTEST_API_ int main(int argc, char** argv) { pika::init_params p; p.desc_cmdline = desc_commandline; - p.pool_creation_mode = ::pika::resource::mode_pika_decides; // Initialize pika auto ret = pika::init(test_main, argc, argv, p); From 76a4588f073f8306698773a703a5b003280c6320 Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Mon, 4 Nov 2024 11:03:00 +0100 Subject: [PATCH 06/20] Remove obsolete code and cleanup PR review suggestions --- include/dlaf/sender/transform_mpi.h | 60 +------------------- src/init.cpp | 16 ------ test/unit/communication/test_comm_sender.cpp | 4 
+- 3 files changed, 3 insertions(+), 77 deletions(-) diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index 0e9ff7942d..fc7a5c2725 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -21,12 +22,6 @@ #include #include #include -// -#include -// -#ifdef EXTRA_MPI_TYPES_DEBUGGING -#include -#endif namespace dlaf::comm::internal { @@ -51,42 +46,6 @@ void consumeCommunicatorWrapper(T&) {} /// callable. The wrapper then waits for the request to complete with /// yield_while. /// -/// This could in theory be a lambda inside transformMPI. However, clang at -/// least until version 12 fails with an internal compiler error with a trailing -/// decltype for SFINAE. GCC has no problems with a lambda. -template -struct MPIYieldWhileCallHelper { - std::decay_t f; - template - auto operator()(Ts&&... ts) { - namespace mpid = pika::mpi::experimental::detail; - MPI_Request req; - - // Note: - // Callables passed to transformMPI have their arguments passed by reference, but doing so - // with PromiseGuard would keep the guard alive until the completion of the MPI operation, - // whereas we are only looking to guard the submission of the MPI operation. We therefore - // explicitly release CommunicatorPipelineExclusiveWrapper after submitting the MPI operation - // with consumeCommunicatorWrapper. - // - // We also use unwrap various types passed to the MPI operation, including PromiseGuards of - // any type, to allow the MPI operation not to care whether a Communicator was wrapped in a - // PromiseGuard or not. - using result_type = decltype(std::move(f)(dlaf::common::internal::unwrap(ts)..., &req)); - if constexpr (std::is_void_v) { - std::move(f)(dlaf::common::internal::unwrap(ts)..., &req); - (internal::consumeCommunicatorWrapper(ts), ...); - pika::util::yield_while([req]() { return !mpid::poll_request(req); }); - } - else { - /*auto r = */ std::move(f)(dlaf::common::internal::unwrap(ts)..., &req); - (internal::consumeCommunicatorWrapper(ts), ...); - pika::util::yield_while([req]() { return !mpid::poll_request(req); }); - } - } -}; - -/// Helper type for wrapping MPI calls. template struct MPICallHelper { std::decay_t f; @@ -108,9 +67,6 @@ struct MPICallHelper { } }; -template -MPIYieldWhileCallHelper(F&&) -> MPIYieldWhileCallHelper>; - template MPICallHelper(F&&) -> MPICallHelper>; @@ -123,27 +79,13 @@ template (sender) | - ex::let_value([=, f = std::move(f)](LArgs&&... largs) { - PIKA_DETAIL_DP(dla_debug<2>, debug(str<>("Args to MPI fn\n"), - pika::debug::print_type(", "), "\nValues\n")); - return ex::just(std::move(largs)...) 
| - mpi::transform_mpi(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::move(f)}}); - }); - return ex::make_unique_any_sender(std::move(snd1)); -#else PIKA_DETAIL_DP(dla_debug<5>, debug(str<>("MPI fn\n"))); auto snd1 = std::forward(sender) | mpi::transform_mpi(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::forward(f)}}); return ex::make_unique_any_sender(std::move(snd1)); -#endif } - std::forward(ts)...)); - template struct PartialTransformMPIBase { std::decay_t f_; diff --git a/src/init.cpp b/src/init.cpp index f90fa7ff4d..2ad9750925 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -269,22 +269,6 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu warnUnusedConfigurationOption(vm, "NUM_GPU_BLAS_HANDLES", "num-gpu-blas-handles", "only supported with pika 0.29.0 or newer"); warnUnusedConfigurationOption(vm, "NUM_GPU_LAPACK_HANDLES", "num-gpu-lapack-handles", "only supported with pika 0.29.0 or newer"); #endif - // clang-format on - cfg.mpi_pool = (pika::resource::pool_exists("mpi")) ? "mpi" : "default"; - - // Warn if not using MPI pool without --dlaf:no-mpi-pool - // int mpi_initialized; - // DLAF_MPI_CHECK_ERROR(MPI_Initialized(&mpi_initialized)); - // if (mpi_initialized) { - // int ntasks; - // DLAF_MPI_CHECK_ERROR(MPI_Comm_size(MPI_COMM_WORLD, &ntasks)); - // if (ntasks != 1 && cfg.mpi_pool == "default" && !vm["dlaf:no-mpi-pool"].as()) { - // std::cerr << "Warning! DLA-Future is not using the \"mpi\" pika thread pool for " - // "MPI communication but --dlaf:no-mpi-pool is not set. This may " - // "indicate a bug in DLA-Future or pika. Performance may be degraded." - // << std::endl; - // } - // } // update tune parameters // diff --git a/test/unit/communication/test_comm_sender.cpp b/test/unit/communication/test_comm_sender.cpp index d03867e696..df8d12622a 100644 --- a/test/unit/communication/test_comm_sender.cpp +++ b/test/unit/communication/test_comm_sender.cpp @@ -43,7 +43,7 @@ void test_transform_mpi() { auto send = just(send_buf.data(), size, dtype, send_rank, tag, comm) | transformMPI(MPI_Isend); auto recv = just(recv_buf.data(), size, dtype, recv_rank, tag, comm) | transformMPI(MPI_Irecv); - sync_wait(when_all(std::move(send), std::move(recv)) | then([]() {})); + sync_wait(when_all(std::move(send), std::move(recv))); std::vector expected_recv_buf(static_cast(size), recv_rank); @@ -62,7 +62,7 @@ TEST(Bcast, Polling) { double val = (comm.rank() == root_rank) ? 
4.2 : 1.2; std::vector buf(static_cast(size), val); - sync_wait(just(buf.data(), size, dtype, root_rank, comm) | transformMPI(MPI_Ibcast) | then([]() {})); + sync_wait(just(buf.data(), size, dtype, root_rank, comm) | transformMPI(MPI_Ibcast)); std::vector expected_buf(static_cast(size), 4.2); ASSERT_TRUE(expected_buf == buf); From fb80d4b870df0c6acf323e523ac1f81f7d25f642 Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Mon, 4 Nov 2024 13:16:11 +0100 Subject: [PATCH 07/20] Remove unnecessary polling start from gpu backend init --- src/init.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/init.cpp b/src/init.cpp index 2ad9750925..f12101320f 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& os, const configuration& cfg) { os << " num_gpu_blas_handles = " << cfg.num_gpu_blas_handles << std::endl; os << " num_gpu_lapack_handles = " << cfg.num_gpu_lapack_handles << std::endl; // clang-format on - return os;0 + return os; } namespace internal { @@ -129,12 +129,8 @@ struct Init { cfg.umpire_device_memory_pool_initial_block_bytes, cfg.umpire_device_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio, cfg.umpire_host_memory_pool_coalescing_reallocation_ratio); - memory::internal::initializeUmpireDeviceAllocator(cfg.umpire_device_memory_pool_initial_bytes); initializeGpuPool(device, cfg.num_np_gpu_streams_per_thread, cfg.num_hp_gpu_streams_per_thread, cfg.num_gpu_blas_handles, cfg.num_gpu_lapack_handles); - if (pika::mpi::detail::environment::is_mpi_initialized()) { - pika::mpi::experimental::start_polling(pika::mpi::experimental::exception_mode::no_handler); - } pika::cuda::experimental::detail::register_polling(pika::resource::get_thread_pool("default")); } From 572fd55256bb4454d3b0c1c001ac82fa94b99e0f Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Mon, 4 Nov 2024 16:43:19 +0100 Subject: [PATCH 08/20] Cleanup mpi branch after merge/rebase --- include/dlaf/init.h | 8 ++++---- include/dlaf/sender/transform_mpi.h | 17 +++-------------- src/init.cpp | 1 + 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/include/dlaf/init.h b/include/dlaf/init.h index 31d78e4331..071c175303 100644 --- a/include/dlaf/init.h +++ b/include/dlaf/init.h @@ -38,13 +38,13 @@ struct configuration { bool print_config = false; std::size_t num_np_gpu_streams_per_thread = 3; std::size_t num_hp_gpu_streams_per_thread = 3; - std::size_t umpire_host_memory_pool_initial_block_bytes = 1 << 30; - std::size_t umpire_host_memory_pool_next_block_bytes = 1 << 30; + std::size_t umpire_host_memory_pool_initial_block_bytes = 1 << 16; + std::size_t umpire_host_memory_pool_next_block_bytes = 1 << 16; std::size_t umpire_host_memory_pool_alignment_bytes = 16; double umpire_host_memory_pool_coalescing_free_ratio = 1.0; double umpire_host_memory_pool_coalescing_reallocation_ratio = 1.0; - std::size_t umpire_device_memory_pool_initial_block_bytes = 1 << 30; - std::size_t umpire_device_memory_pool_next_block_bytes = 1 << 30; + std::size_t umpire_device_memory_pool_initial_block_bytes = 1 << 16; + std::size_t umpire_device_memory_pool_next_block_bytes = 1 << 16; std::size_t umpire_device_memory_pool_alignment_bytes = 16; double umpire_device_memory_pool_coalescing_free_ratio = 1.0; double umpire_device_memory_pool_coalescing_reallocation_ratio = 1.0; diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index fc7a5c2725..9a137195a3 100644 --- a/include/dlaf/sender/transform_mpi.h +++ 
b/include/dlaf/sender/transform_mpi.h @@ -12,7 +12,6 @@ #include #include -#include #include #include @@ -25,9 +24,6 @@ namespace dlaf::comm::internal { -template -static pika::debug::detail::print_threshold dla_debug("DLA_MPI"); - /// This helper "consumes" a CommunicatorPipelineExclusiveWrapper ensuring that after this call /// the one passed as argument gets destroyed. All other types left as they are /// by the second overload. @@ -52,8 +48,6 @@ struct MPICallHelper { template auto operator()(Ts&&... ts) -> decltype(std::move(f)(dlaf::common::internal::unwrap(ts)...)) { - using namespace pika::debug::detail; - PIKA_DETAIL_DP(dla_debug<5>, debug(str<>("MPICallHelper"), pika::debug::print_type(", "))); using result_type = decltype(std::move(f)(dlaf::common::internal::unwrap(ts)...)); if constexpr (std::is_void_v) { std::move(f)(dlaf::common::internal::unwrap(ts)...); @@ -74,16 +68,11 @@ MPICallHelper(F&&) -> MPICallHelper>; template >> [[nodiscard]] decltype(auto) transformMPI(F&& f, Sender&& sender) { - using dlaf::internal::continues_on; namespace ex = pika::execution::experimental; namespace mpi = pika::mpi::experimental; - namespace mpid = pika::mpi::experimental::detail; - - PIKA_DETAIL_DP(dla_debug<5>, debug(str<>("MPI fn\n"))); - auto snd1 = - std::forward(sender) | - mpi::transform_mpi(dlaf::common::internal::ConsumeRvalues{MPICallHelper{std::forward(f)}}); - return ex::make_unique_any_sender(std::move(snd1)); + namespace dci = dlaf::common::internal; + return std::forward(sender) // + | mpi::transform_mpi(dci::ConsumeRvalues{MPICallHelper{std::forward(f)}}); // } template diff --git a/src/init.cpp b/src/init.cpp index f12101320f..8635d58ed2 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -45,6 +45,7 @@ std::ostream& operator<<(std::ostream& os, const configuration& cfg) { os << " umpire_device_memory_pool_coalescing_reallocation_ratio = " << cfg.umpire_device_memory_pool_coalescing_reallocation_ratio << std::endl; os << " num_gpu_blas_handles = " << cfg.num_gpu_blas_handles << std::endl; os << " num_gpu_lapack_handles = " << cfg.num_gpu_lapack_handles << std::endl; + os << " mpi_pool = " << pika::mpi::experimental::get_pool_name() << std::endl; // clang-format on return os; } From 99ed7a146c3e786a5a1807258bf07bd3e63709ca Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Mon, 11 Nov 2024 22:08:58 +0100 Subject: [PATCH 09/20] put back drop operation state in mpi transform --- include/dlaf/sender/transform_mpi.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index 9a137195a3..877175cf56 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -71,8 +71,9 @@ template (sender) // - | mpi::transform_mpi(dci::ConsumeRvalues{MPICallHelper{std::forward(f)}}); // + return std::forward(sender) // + | mpi::transform_mpi(dci::ConsumeRvalues{MPICallHelper{std::forward(f)}}) // + | ex::drop_operation_state(); } template From 6f353e6f359c07c975ae23ca2c12731f5954d5ae Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Wed, 13 Nov 2024 09:35:46 +0100 Subject: [PATCH 10/20] Clean up unused code and fix review comments --- ci/common-ci.yml | 2 +- include/dlaf/init.h | 4 ++-- include/dlaf/schedulers.h | 6 ----- include/dlaf/sender/transform_mpi.h | 23 ++++++++++--------- src/init.cpp | 8 +++++-- .../unit/communication/test_transform_mpi.cpp | 2 +- 6 files changed, 22 insertions(+), 23 deletions(-) diff --git a/ci/common-ci.yml b/ci/common-ci.yml index 
f070765240..5ecf679332 100644 --- a/ci/common-ci.yml +++ b/ci/common-ci.yml @@ -34,7 +34,7 @@ stages: reports: dotenv: build.env variables: - SPACK_SHA: develop-2024-10-06 + SPACK_SHA: develop-2024-11-10 SPACK_DLAF_REPO: ./spack DOCKER_BUILD_ARGS: '[ "BASE_IMAGE", diff --git a/include/dlaf/init.h b/include/dlaf/init.h index 071c175303..8f2dba6f85 100644 --- a/include/dlaf/init.h +++ b/include/dlaf/init.h @@ -38,8 +38,8 @@ struct configuration { bool print_config = false; std::size_t num_np_gpu_streams_per_thread = 3; std::size_t num_hp_gpu_streams_per_thread = 3; - std::size_t umpire_host_memory_pool_initial_block_bytes = 1 << 16; - std::size_t umpire_host_memory_pool_next_block_bytes = 1 << 16; + std::size_t umpire_host_memory_pool_initial_block_bytes = 1 << 30; + std::size_t umpire_host_memory_pool_next_block_bytes = 1 << 30; std::size_t umpire_host_memory_pool_alignment_bytes = 16; double umpire_host_memory_pool_coalescing_free_ratio = 1.0; double umpire_host_memory_pool_coalescing_reallocation_ratio = 1.0; diff --git a/include/dlaf/schedulers.h b/include/dlaf/schedulers.h index 34cf6850fb..4e1e7db0c1 100644 --- a/include/dlaf/schedulers.h +++ b/include/dlaf/schedulers.h @@ -12,7 +12,6 @@ /// @file #include -#include #include #include @@ -48,9 +47,4 @@ auto getBackendScheduler( } #endif } - -inline auto getMPIScheduler() { - return pika::execution::experimental::thread_pool_scheduler{ - &pika::resource::get_thread_pool(pika::mpi::experimental::get_pool_name())}; -} } // namespace dlaf::internal diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index 877175cf56..e6f4cddd57 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -31,16 +31,17 @@ inline void consumeCommunicatorWrapper(CommunicatorPipelineExclusiveWrapper& com [[maybe_unused]] auto comm_wrapper_local = std::move(comm_wrapper); } -/// \overload consumeCommunicatorWrapper +/// \overload consumeCommunicatorWrapper (for non communicator types) template void consumeCommunicatorWrapper(T&) {} /// Helper type for wrapping MPI calls. /// -/// Wrapper type around calls to MPI functions. Provides a call operator that -/// creates an MPI request and passes it as the last argument to the provided -/// callable. The wrapper then waits for the request to complete with -/// yield_while. +/// The wrapper explicitly releases any dla communicator objects when the pika::transform_mpi +/// function returns (e.g. a message has been sent/posted) to prevent blocking access to many +/// queued mpi operations. 
+/// The mpi operations can complete asynchronously later, but the commmunicator is +/// released/made available once the mpi task has been safely initiated /// template struct MPICallHelper { @@ -68,12 +69,12 @@ MPICallHelper(F&&) -> MPICallHelper>; template >> [[nodiscard]] decltype(auto) transformMPI(F&& f, Sender&& sender) { - namespace ex = pika::execution::experimental; - namespace mpi = pika::mpi::experimental; - namespace dci = dlaf::common::internal; - return std::forward(sender) // - | mpi::transform_mpi(dci::ConsumeRvalues{MPICallHelper{std::forward(f)}}) // - | ex::drop_operation_state(); + using pika::execution::experimental::drop_operation_state; + using pika::mpi::experimental::transform_mpi; + using dlaf::common::internal::ConsumeRvalues; + return std::forward(sender) // + | transform_mpi(ConsumeRvalues{MPICallHelper{std::forward(f)}}) // + | drop_operation_state(); } template diff --git a/src/init.cpp b/src/init.cpp index 8635d58ed2..005044e638 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -72,13 +72,17 @@ struct Init { cfg.umpire_host_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio, cfg.umpire_host_memory_pool_coalescing_reallocation_ratio); // install mpi polling loop - if (pika::mpi::detail::environment::is_mpi_initialized()) { + int mpi_initialized; + DLAF_MPI_CHECK_ERROR(MPI_Initialized(&mpi_initialized)); + if (mpi_initialized) { pika::mpi::experimental::start_polling(pika::mpi::experimental::exception_mode::no_handler); } } static void finalize() { - if (pika::mpi::detail::environment::is_mpi_initialized()) { + int mpi_initialized; + DLAF_MPI_CHECK_ERROR(MPI_Initialized(&mpi_initialized)); + if (mpi_initialized) { pika::mpi::experimental::stop_polling(); } memory::internal::finalizeUmpireHostAllocator(); diff --git a/test/unit/communication/test_transform_mpi.cpp b/test/unit/communication/test_transform_mpi.cpp index 28f67fa28e..86fe68ed35 100644 --- a/test/unit/communication/test_transform_mpi.cpp +++ b/test/unit/communication/test_transform_mpi.cpp @@ -65,7 +65,7 @@ TEST_F(TransformMPITest, PromiseGuardManagement) { int message; whenAllLift(&message, 1, MPI_INT, 1, 0, chain.exclusive()) | transformMPI(MPI_Irecv) | - ex::then([&sent_guard](/*auto mpi_err_code*/) { sent_guard = true; }) | ex::ensure_started(); + ex::then([&sent_guard]() { sent_guard = true; }) | ex::ensure_started(); // Note: // At this point IRecv is (getting) posted but it won't complete until this Rank 0 will trigger From 43763dc2fe86ff12a730ad7d86186128e79e70ae Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 14 Nov 2024 11:15:37 +0100 Subject: [PATCH 11/20] Require pika 0.30.0 or newer --- spack/packages/dla-future/package.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spack/packages/dla-future/package.py b/spack/packages/dla-future/package.py index 927279f9ab..b88cc16488 100644 --- a/spack/packages/dla-future/package.py +++ b/spack/packages/dla-future/package.py @@ -93,6 +93,7 @@ class DlaFuture(CMakePackage, CudaPackage, ROCmPackage): depends_on("pika@0.18:", when="@0.3") depends_on("pika@0.19.1:", when="@0.4.0:") conflicts("^pika@0.28:", when="@:0.6") + depends_on("pika@0.30:", when="@0.7.0:") depends_on("pika-algorithms@0.1:", when="@:0.2") depends_on("pika +mpi") depends_on("pika +cuda", when="+cuda") From 9921466211ac99f05e7f7322e207925ac29105ae Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 14 Nov 2024 11:23:48 +0100 Subject: [PATCH 12/20] Bump minimum pika version tested in CUDA scalapack CI configuration --- 
ci/docker/debug-cuda-scalapack.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/debug-cuda-scalapack.yaml b/ci/docker/debug-cuda-scalapack.yaml index 24d0785896..d2a5c5ef85 100644 --- a/ci/docker/debug-cuda-scalapack.yaml +++ b/ci/docker/debug-cuda-scalapack.yaml @@ -35,6 +35,6 @@ spack: - '+fortran' pika: require: - - '@0.19.1' + - '@0.30.0' - 'build_type=Debug' - 'malloc=system' From 069c7645494d6dcb34f9ade34a952e54c88113cd Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 18 Nov 2024 16:35:18 +0100 Subject: [PATCH 13/20] Test with pika 0.30.1 release branch --- ci/docker/common.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/docker/common.yaml b/ci/docker/common.yaml index 5ef667ae3c..01a71cf648 100644 --- a/ci/docker/common.yaml +++ b/ci/docker/common.yaml @@ -46,3 +46,6 @@ packages: # Force git as non-buildable to allow deprecated versions in environments # https://github.com/spack/spack/pull/30040 buildable: false + pika: + require: + - '@git.release-0.30.X=0.30.1' From 608c4b920afcf30b0f388609a12ab5726f5ad01f Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Mon, 18 Nov 2024 16:37:30 +0100 Subject: [PATCH 14/20] Use MPI pool on CUDA CI configurations --- ci/cuda/gcc11_codecov.yml | 2 ++ ci/cuda/gcc11_debug_scalapack.yml | 2 ++ ci/cuda/gcc11_release.yml | 2 ++ ci/cuda/gcc11_release_scalapack.yml | 2 ++ 4 files changed, 8 insertions(+) diff --git a/ci/cuda/gcc11_codecov.yml b/ci/cuda/gcc11_codecov.yml index 31fcea1150..2011db1756 100644 --- a/ci/cuda/gcc11_codecov.yml +++ b/ci/cuda/gcc11_codecov.yml @@ -22,6 +22,8 @@ cuda gcc11 codecov build: cuda gcc11 codecov test: extends: .run_common + variables: + PIKA_MPI_ENABLE_POOL: 1 needs: - cuda gcc11 codecov build trigger: diff --git a/ci/cuda/gcc11_debug_scalapack.yml b/ci/cuda/gcc11_debug_scalapack.yml index 98b07a1d03..87443ef852 100644 --- a/ci/cuda/gcc11_debug_scalapack.yml +++ b/ci/cuda/gcc11_debug_scalapack.yml @@ -20,6 +20,8 @@ cuda gcc11 debug scalapack build: cuda gcc11 debug scalapack test: extends: .run_common + variables: + PIKA_MPI_ENABLE_POOL: 1 needs: - cuda gcc11 debug scalapack build trigger: diff --git a/ci/cuda/gcc11_release.yml b/ci/cuda/gcc11_release.yml index 40d2b20bf2..93799b4b66 100644 --- a/ci/cuda/gcc11_release.yml +++ b/ci/cuda/gcc11_release.yml @@ -21,6 +21,8 @@ cuda gcc11 release build: cuda gcc11 release test: extends: .run_common + variables: + PIKA_MPI_ENABLE_POOL: 1 needs: - cuda gcc11 release build trigger: diff --git a/ci/cuda/gcc11_release_scalapack.yml b/ci/cuda/gcc11_release_scalapack.yml index 5a668e4439..f222db5849 100644 --- a/ci/cuda/gcc11_release_scalapack.yml +++ b/ci/cuda/gcc11_release_scalapack.yml @@ -21,6 +21,8 @@ cuda gcc11 release scalapack build: cuda gcc11 release scalapack test: extends: .run_common + variables: + PIKA_MPI_ENABLE_POOL: 1 needs: - cuda gcc11 release scalapack build trigger: From 238c1fb56c582a5a94c4d72f5159c24ebdbc0aa0 Mon Sep 17 00:00:00 2001 From: John Biddiscombe Date: Wed, 13 Nov 2024 16:02:00 +0100 Subject: [PATCH 15/20] Remove the umpire mem default override --- include/dlaf/init.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/dlaf/init.h b/include/dlaf/init.h index 8f2dba6f85..31d78e4331 100644 --- a/include/dlaf/init.h +++ b/include/dlaf/init.h @@ -43,8 +43,8 @@ struct configuration { std::size_t umpire_host_memory_pool_alignment_bytes = 16; double umpire_host_memory_pool_coalescing_free_ratio = 1.0; double umpire_host_memory_pool_coalescing_reallocation_ratio = 1.0; - 
From 238c1fb56c582a5a94c4d72f5159c24ebdbc0aa0 Mon Sep 17 00:00:00 2001
From: John Biddiscombe
Date: Wed, 13 Nov 2024 16:02:00 +0100
Subject: [PATCH 15/20] Remove the umpire mem default override

---
 include/dlaf/init.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/dlaf/init.h b/include/dlaf/init.h
index 8f2dba6f85..31d78e4331 100644
--- a/include/dlaf/init.h
+++ b/include/dlaf/init.h
@@ -43,8 +43,8 @@ struct configuration {
   std::size_t umpire_host_memory_pool_alignment_bytes = 16;
   double umpire_host_memory_pool_coalescing_free_ratio = 1.0;
   double umpire_host_memory_pool_coalescing_reallocation_ratio = 1.0;
-  std::size_t umpire_device_memory_pool_initial_block_bytes = 1 << 16;
-  std::size_t umpire_device_memory_pool_next_block_bytes = 1 << 16;
+  std::size_t umpire_device_memory_pool_initial_block_bytes = 1 << 30;
+  std::size_t umpire_device_memory_pool_next_block_bytes = 1 << 30;
   std::size_t umpire_device_memory_pool_alignment_bytes = 16;
   double umpire_device_memory_pool_coalescing_free_ratio = 1.0;
   double umpire_device_memory_pool_coalescing_reallocation_ratio = 1.0;

From 226ebaeffe2da143d3866c0d903361edf38ec86e Mon Sep 17 00:00:00 2001
From: John Biddiscombe
Date: Tue, 19 Nov 2024 10:14:34 +0100
Subject: [PATCH 16/20] Clean up use of mpi_initialized and start/stop polling

---
 src/init.cpp | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/init.cpp b/src/init.cpp
index 005044e638..02434eb6cc 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -29,6 +29,7 @@
 #include
 
 namespace dlaf {
+
 std::ostream& operator<<(std::ostream& os, const configuration& cfg) {
   // clang-format off
   os << " num_np_gpu_streams_per_thread = " << cfg.num_np_gpu_streams_per_thread << std::endl;
@@ -56,6 +57,11 @@ bool& initialized() {
   return i;
 }
 
+int& mpi_initialized() {
+  static int i = 0;
+  return i;
+}
+
 template <Backend D>
 struct Init {
   // Initialization and finalization does nothing by default. Behaviour can be
@@ -71,20 +77,9 @@ struct Init {
         cfg.umpire_host_memory_pool_initial_block_bytes, cfg.umpire_host_memory_pool_next_block_bytes,
         cfg.umpire_host_memory_pool_alignment_bytes, cfg.umpire_host_memory_pool_coalescing_free_ratio,
         cfg.umpire_host_memory_pool_coalescing_reallocation_ratio);
-    // install mpi polling loop
-    int mpi_initialized;
-    DLAF_MPI_CHECK_ERROR(MPI_Initialized(&mpi_initialized));
-    if (mpi_initialized) {
-      pika::mpi::experimental::start_polling(pika::mpi::experimental::exception_mode::no_handler);
-    }
   }
 
   static void finalize() {
-    int mpi_initialized;
-    DLAF_MPI_CHECK_ERROR(MPI_Initialized(&mpi_initialized));
-    if (mpi_initialized) {
-      pika::mpi::experimental::stop_polling();
-    }
     memory::internal::finalizeUmpireHostAllocator();
   }
 };
@@ -365,15 +360,15 @@ void initialize(const pika::program_options::variables_map& vm, const configurat
     std::exit(0);
   }
 
-  int mpi_initialized;
-  DLAF_MPI_CHECK_ERROR(MPI_Initialized(&mpi_initialized));
-  if (mpi_initialized) {
+  DLAF_MPI_CHECK_ERROR(MPI_Initialized(&dlaf::internal::mpi_initialized()));
+  if (dlaf::internal::mpi_initialized()) {
     int provided;
     DLAF_MPI_CHECK_ERROR(MPI_Query_thread(&provided));
     if (provided < MPI_THREAD_MULTIPLE) {
       std::cerr << "MPI must be initialized to `MPI_THREAD_MULTIPLE` for DLA-Future!\n";
       MPI_Abort(MPI_COMM_WORLD, 1);
     }
+    pika::mpi::experimental::start_polling(pika::mpi::experimental::exception_mode::no_handler);
   }
 
   DLAF_ASSERT(!internal::initialized(), "");
@@ -400,6 +395,9 @@ void finalize() {
 #ifdef DLAF_WITH_GPU
   internal::Init<Backend::GPU>::finalize();
 #endif
+  if (dlaf::internal::mpi_initialized()) {
+      pika::mpi::experimental::stop_polling();
+  }
   internal::getConfiguration() = {};
   internal::initialized() = false;
 }
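Since start_polling is now only installed from dlaf::initialize when MPI is already initialized, the ordering requirements fall on the application. The following is a minimal sketch of that ordering using plain MPI calls only; the pika and DLA-Future start-up in the middle is deliberately elided rather than spelled out here.

#include <mpi.h>

int main(int argc, char** argv) {
  int provided = 0;
  // DLA-Future requires MPI_THREAD_MULTIPLE, as checked in initialize() above.
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  if (provided < MPI_THREAD_MULTIPLE)
    MPI_Abort(MPI_COMM_WORLD, 1);

  // ... start pika, call dlaf::initialize(...), run work, call dlaf::finalize() ...
  // dlaf::finalize() stops the pika MPI polling loop before MPI itself is finalized.

  MPI_Finalize();
  return 0;
}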
From be6ba176a6fc0b1137782ba4203f6908618d2847 Mon Sep 17 00:00:00 2001
From: John Biddiscombe
Date: Tue, 19 Nov 2024 11:31:24 +0100
Subject: [PATCH 17/20] Cleanup Formatting

---
 include/dlaf/sender/transform_mpi.h | 4 ++--
 src/init.cpp                        | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h
index e6f4cddd57..b11042c444 100644
--- a/include/dlaf/sender/transform_mpi.h
+++ b/include/dlaf/sender/transform_mpi.h
@@ -65,13 +65,13 @@ struct MPICallHelper {
 template <typename F>
 MPICallHelper(F&&) -> MPICallHelper<std::decay_t<F>>;
 
-/// Lazy transformMPI. This does not submit the work and returns a sender.
+/// Lazy transformMPI. Returns a sender that will submit the work passed in
 template <typename F, typename Sender,
           typename = std::enable_if_t<pika::execution::experimental::is_sender_v<Sender>>>
 [[nodiscard]] decltype(auto) transformMPI(F&& f, Sender&& sender) {
+  using dlaf::common::internal::ConsumeRvalues;
   using pika::execution::experimental::drop_operation_state;
   using pika::mpi::experimental::transform_mpi;
-  using dlaf::common::internal::ConsumeRvalues;
   return std::forward<Sender>(sender)  //
         | transform_mpi(ConsumeRvalues{MPICallHelper{std::forward<F>(f)}})  //
         | drop_operation_state();
diff --git a/src/init.cpp b/src/init.cpp
index 02434eb6cc..ff5d79824c 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -58,8 +58,8 @@ bool& initialized() {
 }
 
 int& mpi_initialized() {
-    static int i = 0;
-    return i;
+  static int i = 0;
+  return i;
 }
 
 template <Backend D>
@@ -396,7 +396,7 @@ void finalize() {
   internal::Init<Backend::GPU>::finalize();
 #endif
   if (dlaf::internal::mpi_initialized()) {
-      pika::mpi::experimental::stop_polling();
+    pika::mpi::experimental::stop_polling();
   }
   internal::getConfiguration() = {};
   internal::initialized() = false;

From ca2c626a807f8ecd3169217e928433e1fe72c8ae Mon Sep 17 00:00:00 2001
From: John Biddiscombe
Date: Thu, 21 Nov 2024 16:40:35 +0100
Subject: [PATCH 18/20] put back comment about lambdas

---
 include/dlaf/sender/transform_mpi.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h
index b11042c444..4a07488d25 100644
--- a/include/dlaf/sender/transform_mpi.h
+++ b/include/dlaf/sender/transform_mpi.h
@@ -43,6 +43,9 @@ void consumeCommunicatorWrapper(T&) {}
 
 /// The mpi operations can complete asynchronously later, but the communicator is
 /// released/made available once the mpi task has been safely initiated
 ///
+/// This could in theory be a lambda inside transformMPI. However, clang at
+/// least until version 12 fails with an internal compiler error with a trailing
+/// decltype for SFINAE. GCC has no problems with a lambda.
 template <typename F>
 struct MPICallHelper {

From 3e0e45d0747522e7aefd33ef202556dd8dda9da7 Mon Sep 17 00:00:00 2001
From: Mikael Simberg
Date: Fri, 22 Nov 2024 10:06:18 +0100
Subject: [PATCH 19/20] Update ci/docker/common.yaml

---
 ci/docker/common.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ci/docker/common.yaml b/ci/docker/common.yaml
index 01a71cf648..5ef667ae3c 100644
--- a/ci/docker/common.yaml
+++ b/ci/docker/common.yaml
@@ -46,6 +46,3 @@ packages:
     # Force git as non-buildable to allow deprecated versions in environments
     # https://github.com/spack/spack/pull/30040
     buildable: false
-  pika:
-    require:
-      - '@git.release-0.30.X=0.30.1'

From f6af9bde6f07e9ec364728c9be326921a7367176 Mon Sep 17 00:00:00 2001
From: John Biddiscombe
Date: Fri, 22 Nov 2024 10:38:05 +0100
Subject: [PATCH 20/20] Update pika find_package version

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5788a1359c..b9ef9b7922 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -163,7 +163,7 @@ if(DLAF_WITH_SCALAPACK)
 endif()
 
 # ----- pika
-find_package(pika 0.19.1 REQUIRED)
+find_package(pika 0.30.0 REQUIRED)
 
 # ----- BLASPP/LAPACKPP
 find_package(blaspp REQUIRED)