From a9abb819957c2ddc3801080d34cf4c3177725a86 Mon Sep 17 00:00:00 2001 From: Alberto Invernizzi <9337627+albestro@users.noreply.github.com> Date: Tue, 18 Feb 2025 19:59:37 +0100 Subject: [PATCH] Rename parameters from "nworkers" to "num-threads" (#1270) --- .../internal/get_red2band_panel_nworkers.h | 4 ++-- .../internal/get_tridiag_rank1_nworkers.h | 4 ++-- .../dlaf/eigensolver/reduction_to_band/impl.h | 4 ++-- .../dlaf/eigensolver/tridiag_solver/merge.h | 4 ++-- include/dlaf/tune.h | 20 +++++++++---------- src/init.cpp | 8 ++++---- src/tune.cpp | 4 ++-- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/dlaf/eigensolver/internal/get_red2band_panel_nworkers.h b/include/dlaf/eigensolver/internal/get_red2band_panel_nworkers.h index 05099a7c7d..ce6cf86727 100644 --- a/include/dlaf/eigensolver/internal/get_red2band_panel_nworkers.h +++ b/include/dlaf/eigensolver/internal/get_red2band_panel_nworkers.h @@ -23,13 +23,13 @@ inline size_t get_red2band_panel_worker_minwork() noexcept { return 1; } -inline size_t get_red2band_panel_nworkers() noexcept { +inline size_t get_red2band_panel_num_workers() noexcept { // Note: precautionarily we leave at least 1 thread "free" to do other stuff (if possible) const std::size_t available_workers = pika::resource::get_thread_pool("default").get_os_thread_count(); const std::size_t min_workers = 1; const auto max_workers = std::max(min_workers, available_workers - 1); - const std::size_t nworkers = getTuneParameters().red2band_panel_nworkers; + const std::size_t nworkers = getTuneParameters().red2band_panel_num_threads; return std::clamp(nworkers, min_workers, max_workers); } diff --git a/include/dlaf/eigensolver/internal/get_tridiag_rank1_nworkers.h b/include/dlaf/eigensolver/internal/get_tridiag_rank1_nworkers.h index 8ab4269bbe..6b749c9ce7 100644 --- a/include/dlaf/eigensolver/internal/get_tridiag_rank1_nworkers.h +++ b/include/dlaf/eigensolver/internal/get_tridiag_rank1_nworkers.h @@ -19,13 +19,13 @@ namespace dlaf::eigensolver::internal { -inline std::size_t getTridiagRank1NWorkers() noexcept { +inline std::size_t get_tridiag_rank1_num_workers() noexcept { // Note: precautionarily we leave at least 1 thread "free" to do other stuff (if possible) const std::size_t available_workers = pika::resource::get_thread_pool("default").get_os_thread_count(); const std::size_t min_workers = 1; const auto max_workers = std::max(min_workers, available_workers); - const std::size_t nworkers = getTuneParameters().tridiag_rank1_nworkers; + const std::size_t nworkers = getTuneParameters().tridiag_rank1_num_threads; return std::clamp(nworkers, min_workers, max_workers); } diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h index 87b40f837e..ad6a6950a2 100644 --- a/include/dlaf/eigensolver/reduction_to_band/impl.h +++ b/include/dlaf/eigensolver/reduction_to_band/impl.h @@ -313,7 +313,7 @@ void computePanelReflectors(MatrixLikeA& mat_a, MatrixLikeTaus& mat_taus, const const std::size_t nworkers = [nrtiles = panel_tiles.size()]() { const std::size_t min_workers = 1; - const std::size_t available_workers = get_red2band_panel_nworkers(); + const std::size_t available_workers = get_red2band_panel_num_workers(); const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), get_red2band_panel_worker_minwork()); return std::clamp(ideal_workers, min_workers, available_workers); @@ -639,7 +639,7 @@ void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, const std::size_t nworkers = [nrtiles = panel_tiles.size()]() { const std::size_t min_workers = 1; - const std::size_t available_workers = get_red2band_panel_nworkers(); + const std::size_t available_workers = get_red2band_panel_num_workers(); const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), get_red2band_panel_worker_minwork()); return std::clamp(ideal_workers, min_workers, available_workers); diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index e71ebd654a..3b4bbe874c 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -812,7 +812,7 @@ void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k // Note: at least two column of tiles per-worker, in the range [1, getTridiagRank1NWorkers()] const std::size_t nthreads = [nrtiles = (i_end - i_begin)]() { const std::size_t min_workers = 1; - const std::size_t available_workers = getTridiagRank1NWorkers(); + const std::size_t available_workers = get_tridiag_rank1_num_workers(); const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2)); return std::clamp(ideal_workers, min_workers, available_workers); }(); @@ -1346,7 +1346,7 @@ void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const S const std::size_t workload_unit = 2 * to_sizet(dist_sub.tile_size().linear_size()); const std::size_t min_workers = 1; - const std::size_t available_workers = getTridiagRank1NWorkers(); + const std::size_t available_workers = get_tridiag_rank1_num_workers(); const std::size_t ideal_workers = util::ceilDiv(to_sizet(workload), workload_unit); return std::clamp(ideal_workers, min_workers, available_workers); diff --git a/include/dlaf/tune.h b/include/dlaf/tune.h index 1b1195acc5..50b8aa9b18 100644 --- a/include/dlaf/tune.h +++ b/include/dlaf/tune.h @@ -58,23 +58,23 @@ namespace dlaf { /// - tfactor_num_threads: /// The maximum number of threads to use for computing tfactor (e.g. which is used for /// instance in red2band and its backtransformation). Set with --dlaf:tfactor-num-threads or env -/// variable DLAF_TFACTOR_NTHREADS. +/// variable DLAF_TFACTOR_NUM_THREADS. /// - tfactor_num_streams: /// The maximum number of streams to use for computing tfactor (e.g. which is used for /// instance in red2band and its backtransformation). Set with --dlaf:tfactor-num-streams or env -/// variable DLAF_TFACTOR_NSTREAMS. +/// variable DLAF_TFACTOR_NUM_STREAMS. /// - tfactor_barrier_busy_wait_us: /// The duration in microseconds to busy-wait in barriers in the tfactor algorithm. /// Set with --dlaf:tfactor-barrier-busy-wait-us or env variable DLAF_TFACTOR_BARRIER_BUSY_WAIT_US. -/// - red2band_panel_nworkers: +/// - red2band_panel_num_threads: /// The maximum number of threads to use for computing the panel in the reduction to band algorithm. -/// Set with --dlaf:red2band-panel-nworkers or env variable DLAF_RED2BAND_PANEL_NWORKERS. +/// Set with --dlaf:red2band-panel-num-threads or env variable DLAF_RED2BAND_PANEL_NUM_THREADS. /// - red2band_barrier_busy_wait_us: /// The duration in microseconds to busy-wait in barriers in the reduction to band algorithm. /// Set with --dlaf:red2band-barrier-busy-wait-us or env variable DLAF_RED2BAND_BARRIER_BUSY_WAIT_US. -/// - tridiag_rank1_nworkers: +/// - tridiag_rank1_num_threads: /// The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver -/// algorithm. Set with --dlaf:tridiag-rank1-nworkers or env variable DLAF_TRIDIAG_RANK1_NWORKERS. +/// algorithm. Set with --dlaf:tridiag-rank1-num-threads or env variable DLAF_TRIDIAG_RANK1_NUM_THREADS. /// - tridiag_rank1_barrier_busy_wait_us: /// The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in /// the tridiagonal solver algorithm. Set with --dlaf:tridiag-rank1-barrier-busy-wait-us or env @@ -120,8 +120,8 @@ struct TuneParameters { const auto default_pool_thread_count = pika::resource::get_thread_pool("default").get_os_thread_count(); tfactor_num_threads = std::max(1, default_pool_thread_count / 2); - red2band_panel_nworkers = std::max(1, default_pool_thread_count / 2); - tridiag_rank1_nworkers = default_pool_thread_count; + red2band_panel_num_threads = std::max(1, default_pool_thread_count / 2); + tridiag_rank1_num_threads = default_pool_thread_count; } bool debug_dump_cholesky_factorization_data = false; bool debug_dump_generalized_to_standard_data = false; @@ -134,9 +134,9 @@ struct TuneParameters { std::size_t tfactor_num_threads = 1; std::size_t tfactor_num_streams = 4; std::size_t tfactor_barrier_busy_wait_us = 0; - std::size_t red2band_panel_nworkers = 1; + std::size_t red2band_panel_num_threads = 1; std::size_t red2band_barrier_busy_wait_us = 1000; - std::size_t tridiag_rank1_nworkers = 1; + std::size_t tridiag_rank1_num_threads = 1; std::size_t tridiag_rank1_barrier_busy_wait_us = 0; SizeType eigensolver_min_band = 100; diff --git a/src/init.cpp b/src/init.cpp index 7dd3fa9fb5..2f87c96dfe 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -284,7 +284,7 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu updateConfigurationValue(vm, param.tfactor_num_threads, "TFACTOR_NUM_THREADS", "tfactor-num-threads"); updateConfigurationValue(vm, param.tfactor_num_streams, "TFACTOR_NUM_STREAMS", "tfactor-num-streams"); updateConfigurationValue(vm, param.tfactor_barrier_busy_wait_us, "TFACTOR_BARRIER_BUSY_WAIT_US", "tfactor-barrier-busy-wait-us"); - updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS", "red2band-panel-nworkers"); + updateConfigurationValue(vm, param.red2band_panel_num_threads, "RED2BAND_PANEL_NUM_THREADS", "red2band-panel-num-threads"); updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US", "red2band-barrier-busy-wait-us"); updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND", "eigensolver-min-band"); updateConfigurationValue(vm, param.band_to_tridiag_1d_block_size_base, "BAND_TO_TRIDIAG_1D_BLOCK_SIZE_BASE", "band-to-tridiag-1d-block-size-base"); @@ -297,7 +297,7 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu updateConfigurationValue(vm, param.debug_dump_band_to_tridiagonal_data, "DEBUG_DUMP_BAND_TO_TRIDIAGONAL_DATA", ""); updateConfigurationValue(vm, param.debug_dump_tridiag_solver_data, "DEBUG_DUMP_TRIDIAG_SOLVER_DATA", ""); - updateConfigurationValue(vm, param.tridiag_rank1_nworkers, "TRIDIAG_RANK1_NWORKERS", "tridiag-rank1-nworkers"); + updateConfigurationValue(vm, param.tridiag_rank1_num_threads, "TRIDIAG_RANK1_NUM_THREADS", "tridiag-rank1-num-threads"); updateConfigurationValue(vm, param.tridiag_rank1_barrier_busy_wait_us, "TRIDIAG_RANK1_BARRIER_BUSY_WAIT_US", "tridiag-rank1-barrier-busy-wait-us"); @@ -341,11 +341,11 @@ pika::program_options::options_description getOptionsDescription() { desc.add_options()("dlaf:tfactor-num-threads", pika::program_options::value(), "The maximum number of threads to use for computing the tfactor."); desc.add_options()("dlaf:tfactor-num-streams", pika::program_options::value(), "The maximum number of GPU streams to use for computing the tfactor."); desc.add_options()("dlaf:tfactor-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the tfactor algorithm."); - desc.add_options()("dlaf:red2band-panel-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); + desc.add_options()("dlaf:red2band-panel-num-threads", pika::program_options::value(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); desc.add_options()("dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); desc.add_options()("dlaf:eigensolver-min-band", pika::program_options::value(), "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); desc.add_options()("dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. (The input matrix is distributed with a {nb x nb} block size.)"); - desc.add_options()("dlaf:tridiag-rank1-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); + desc.add_options()("dlaf:tridiag-rank1-num-threads", pika::program_options::value(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); desc.add_options()("dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm."); desc.add_options()("dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), "The application of the HH reflector is splitted in smaller applications of group size reflectors."); desc.add_options()("dlaf:communicator-grid-num-pipelines", pika::program_options::value(), "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid."); diff --git a/src/tune.cpp b/src/tune.cpp index 03da20c830..7ba2df664f 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -22,9 +22,9 @@ std::ostream& operator<<(std::ostream& os, const TuneParameters& params) { os << " tfactor_num_threads = " << params.tfactor_num_threads << std::endl; os << " tfactor_num_streams = " << params.tfactor_num_streams << std::endl; os << " tfactor_barrier_busy_wait_us = " << params.tfactor_barrier_busy_wait_us << std::endl; - os << " red2band_panel_nworkers = " << params.red2band_panel_nworkers << std::endl; + os << " red2band_panel_num_threads = " << params.red2band_panel_num_threads << std::endl; os << " red2band_barrier_busy_wait_us = " << params.red2band_barrier_busy_wait_us << std::endl; - os << " tridiag_rank1_nworkers = " << params.tridiag_rank1_nworkers << std::endl; + os << " tridiag_rank1_num_threads = " << params.tridiag_rank1_num_threads << std::endl; os << " tridiag_rank1_barrier_busy_wait_us = " << params.tridiag_rank1_barrier_busy_wait_us << std::endl; os << " eigensolver_min_band = " << params.eigensolver_min_band << std::endl;