From e2b1ad8273509a30cd9921b2b03688bdf12a3c11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Ber=C3=A1nek?=
Date: Sat, 17 Aug 2024 15:17:35 +0200
Subject: [PATCH] Make worker spawning in Slurm allocations more robust

---
 .../hyperqueue/src/server/autoalloc/queue/slurm.rs | 12 +++++++++++-
 tests/autoalloc/test_autoalloc.py                  |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/crates/hyperqueue/src/server/autoalloc/queue/slurm.rs b/crates/hyperqueue/src/server/autoalloc/queue/slurm.rs
index d03bddc33..93a38b2d2 100644
--- a/crates/hyperqueue/src/server/autoalloc/queue/slurm.rs
+++ b/crates/hyperqueue/src/server/autoalloc/queue/slurm.rs
@@ -217,7 +217,17 @@ fn build_slurm_submit_script(
         writeln!(script, "#SBATCH {sbatch_args}").unwrap();
     }
 
-    let prefix = if nodes > 1 { "srun --overlap " } else { "" };
+    // Some Slurm clusters have a default that does not play well with simply running
+    // `srun`. For example, they can configure `--ntasks-per-node X` as a default option.
+    // We should make sure that we execute exactly the number of workers that we want, on exactly
+    // the number of nodes that we want. Therefore, we use `--ntasks` and `--nodes`.
+    // The `--overlap` parameter is then used to make sure that nested invocations within the HQ
+    // worker will still be able to consume Slurm resources.
+    let prefix = if nodes > 1 {
+        format!("srun --overlap --ntasks={nodes} --nodes={nodes} ")
+    } else {
+        "".to_string()
+    };
     write!(script, "\n{prefix}{worker_cmd}").unwrap();
     script
 }
diff --git a/tests/autoalloc/test_autoalloc.py b/tests/autoalloc/test_autoalloc.py
index f3df5ed89..0da6fe811 100644
--- a/tests/autoalloc/test_autoalloc.py
+++ b/tests/autoalloc/test_autoalloc.py
@@ -346,7 +346,7 @@ def test_slurm_multinode_allocation(hq_env: HqEnv):
     with open(sbatch_script_path) as f:
         commands = normalize_output(hq_env, "slurm", extract_script_commands(f.read()))
     assert commands == snapshot(
-        'srun --overlap RUST_LOG=tako=trace,hyperqueue=trace worker start --idle-timeout "5m"'
+        'srun --overlap --ntasks=2 --nodes=2 RUST_LOG=tako=trace,hyperqueue=trace worker start --idle-timeout "5m"'
         ' --manager "" --server-dir "/001" --on-server-lost "finish-running" --time-limit'
        ' "1h"'
     )
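
Note (not part of the patch): a minimal, standalone Rust sketch of the submit-script logic after this change, for clarity. `build_submit_script_sketch` is a hypothetical, simplified stand-in for HyperQueue's `build_slurm_submit_script`; the `#SBATCH` directives and the worker command shown here are illustrative, and the real function writes its own job-specific `#SBATCH {sbatch_args}` lines as seen in the hunk above.

use std::fmt::Write;

// Hypothetical, simplified stand-in for `build_slurm_submit_script`, used only to
// illustrate the new srun prefix. The real function writes job-specific `#SBATCH`
// directives before appending the worker command.
fn build_submit_script_sketch(nodes: u64, worker_cmd: &str) -> String {
    let mut script = String::from("#!/bin/bash\n");
    // Illustrative directive; the actual directives come from the queue configuration.
    writeln!(script, "#SBATCH --nodes={nodes}").unwrap();

    // Pin both the task count and the node count so that cluster-wide defaults
    // (such as a configured `--ntasks-per-node`) cannot change how many workers
    // are spawned or where they run; `--overlap` keeps nested srun invocations
    // inside the HQ worker able to consume the allocation's resources.
    let prefix = if nodes > 1 {
        format!("srun --overlap --ntasks={nodes} --nodes={nodes} ")
    } else {
        "".to_string()
    };
    write!(script, "\n{prefix}{worker_cmd}").unwrap();
    script
}

fn main() {
    // For a 2-node allocation the worker command ends up wrapped in srun, matching
    // the snapshot in test_slurm_multinode_allocation:
    //   srun --overlap --ntasks=2 --nodes=2 <worker command>
    println!("{}", build_submit_script_sketch(2, "hq worker start"));
}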