From e2b1ad8273509a30cd9921b2b03688bdf12a3c11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Ber=C3=A1nek?=
Date: Sat, 17 Aug 2024 15:17:35 +0200
Subject: [PATCH] Make worker spawning in Slurm allocations more robust

---
 .../hyperqueue/src/server/autoalloc/queue/slurm.rs | 12 +++++++++++-
 tests/autoalloc/test_autoalloc.py                  |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/crates/hyperqueue/src/server/autoalloc/queue/slurm.rs b/crates/hyperqueue/src/server/autoalloc/queue/slurm.rs
index d03bddc33..93a38b2d2 100644
--- a/crates/hyperqueue/src/server/autoalloc/queue/slurm.rs
+++ b/crates/hyperqueue/src/server/autoalloc/queue/slurm.rs
@@ -217,7 +217,17 @@ fn build_slurm_submit_script(
         writeln!(script, "#SBATCH {sbatch_args}").unwrap();
     }
 
-    let prefix = if nodes > 1 { "srun --overlap " } else { "" };
+    // Some Slurm clusters have a default that does not play well with simply running
+    // `srun`. For example, they can configure `--ntasks-per-node X` as a default option.
+    // We should make sure that we execute exactly the number of workers that we want, on exactly
+    // the number of nodes that we want. Therefore, we use `--ntasks` and `--nodes`.
+    // The `--overlap` parameter is then used to make sure that nested invocations within the HQ
+    // worker will still be able to consume Slurm resources.
+    let prefix = if nodes > 1 {
+        format!("srun --overlap --ntasks={nodes} --nodes={nodes} ")
+    } else {
+        "".to_string()
+    };
     write!(script, "\n{prefix}{worker_cmd}").unwrap();
     script
 }
diff --git a/tests/autoalloc/test_autoalloc.py b/tests/autoalloc/test_autoalloc.py
index f3df5ed89..0da6fe811 100644
--- a/tests/autoalloc/test_autoalloc.py
+++ b/tests/autoalloc/test_autoalloc.py
@@ -346,7 +346,7 @@ def test_slurm_multinode_allocation(hq_env: HqEnv):
     with open(sbatch_script_path) as f:
         commands = normalize_output(hq_env, "slurm", extract_script_commands(f.read()))
     assert commands == snapshot(
-        'srun --overlap RUST_LOG=tako=trace,hyperqueue=trace worker start --idle-timeout "5m"'
+        'srun --overlap --ntasks=2 --nodes=2 RUST_LOG=tako=trace,hyperqueue=trace worker start --idle-timeout "5m"'
         ' --manager "" --server-dir "/001" --on-server-lost "finish-running" --time-limit'
        ' "1h"'
     )
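
Note (not part of the patch): a minimal, standalone Rust sketch of the submit-script logic after this change, for clarity. `build_submit_script_sketch` is a hypothetical, simplified stand-in for HyperQueue's `build_slurm_submit_script`; the `#SBATCH` directives and the worker command shown here are illustrative, and the real function writes its own job-specific `#SBATCH {sbatch_args}` lines as seen in the hunk above.

use std::fmt::Write;

// Hypothetical, simplified stand-in for `build_slurm_submit_script`, used only to
// illustrate the new srun prefix. The real function writes job-specific `#SBATCH`
// directives before appending the worker command.
fn build_submit_script_sketch(nodes: u64, worker_cmd: &str) -> String {
    let mut script = String::from("#!/bin/bash\n");
    // Illustrative directive; the actual directives come from the queue configuration.
    writeln!(script, "#SBATCH --nodes={nodes}").unwrap();

    // Pin both the task count and the node count so that cluster-wide defaults
    // (such as a configured `--ntasks-per-node`) cannot change how many workers
    // are spawned or where they run; `--overlap` keeps nested srun invocations
    // inside the HQ worker able to consume the allocation's resources.
    let prefix = if nodes > 1 {
        format!("srun --overlap --ntasks={nodes} --nodes={nodes} ")
    } else {
        "".to_string()
    };
    write!(script, "\n{prefix}{worker_cmd}").unwrap();
    script
}

fn main() {
    // For a 2-node allocation the worker command ends up wrapped in srun, matching
    // the snapshot in test_slurm_multinode_allocation:
    //   srun --overlap --ntasks=2 --nodes=2 <worker command>
    println!("{}", build_submit_script_sketch(2, "hq worker start"));
}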