diff --git a/CHANGELOG.md b/CHANGELOG.md index b1a02d27c..0566c76b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,17 @@ +# Dev + +## New features + +* `HQ_NUM_NODES` for multi-node tasks introduced. It contains the number of nodes assigned to task. + You do not need to manually count lines in `HQ_NODE_FILE` anymore. + +## Changes + +* Node file generated for multi-node tasks now contains only short hostnames + (e.g. if hostname is "cn690.karolina.it4i.cz", only "cn690" is written into node list) + You can read ``HQ_HOST_FILE`` if you need to get full hostnames without stripping. + + # v0.18.0 ## Breaking change diff --git a/crates/hyperqueue/src/common/env.rs b/crates/hyperqueue/src/common/env.rs index 317115f02..f18af33f8 100644 --- a/crates/hyperqueue/src/common/env.rs +++ b/crates/hyperqueue/src/common/env.rs @@ -23,3 +23,5 @@ pub const HQ_TASK_DIR: &str = create_hq_env!("TASK_DIR"); pub const HQ_ERROR_FILENAME: &str = create_hq_env!("ERROR_FILENAME"); pub const HQ_CPUS: &str = create_hq_env!("CPUS"); pub const HQ_NODE_FILE: &str = create_hq_env!("NODE_FILE"); +pub const HQ_HOST_FILE: &str = create_hq_env!("HOST_FILE"); +pub const HQ_NUM_NODES: &str = create_hq_env!("NUM_NODES"); diff --git a/crates/hyperqueue/src/worker/start/program.rs b/crates/hyperqueue/src/worker/start/program.rs index b7e9a4861..31da0bb29 100644 --- a/crates/hyperqueue/src/worker/start/program.rs +++ b/crates/hyperqueue/src/worker/start/program.rs @@ -1,6 +1,5 @@ use std::fs::File; use std::future::Future; -use std::io; use std::io::{BufWriter, ErrorKind, Read, Write}; use std::path::{Path, PathBuf}; use std::process::ExitStatus; @@ -29,8 +28,8 @@ use tako::resources::{ use tako::{format_comma_delimited, InstanceId}; use crate::common::env::{ - HQ_CPUS, HQ_ENTRY, HQ_ERROR_FILENAME, HQ_INSTANCE_ID, HQ_JOB_ID, HQ_NODE_FILE, HQ_PIN, - HQ_SUBMIT_DIR, HQ_TASK_DIR, HQ_TASK_ID, + HQ_CPUS, HQ_ENTRY, HQ_ERROR_FILENAME, HQ_HOST_FILE, HQ_INSTANCE_ID, HQ_JOB_ID, HQ_NODE_FILE, + HQ_NUM_NODES, 
HQ_PIN, HQ_SUBMIT_DIR, HQ_TASK_DIR, HQ_TASK_ID, }; use crate::common::placeholders::{ fill_placeholders_in_paths, CompletePlaceholderCtx, ResolvablePaths, @@ -109,8 +108,32 @@ pub(super) fn build_program_task( .into(), ); if !build_ctx.node_list().is_empty() { - let filename = task_dir.path().join("hq-nodelist"); - write_node_file(&build_ctx, &filename).map_err(|error| { + program.env.insert( + HQ_NUM_NODES.into(), + build_ctx.node_list().len().to_string().into(), + ); + + /* + We write a hostnames in two forms "full" and "short" names. + Short hostname is a hostname where a part after a first "." is removed + (e.g. a host name for cn710.karolina.it4i.cz is cn710) + + We are providing both because some systems (e.g. SLURM or OpenMPI) use just short names + */ + + let filename = task_dir.path().join("hq-hostfile"); + write_node_file(&build_ctx, &filename, false).map_err(|error| { + format!( + "Cannot write node file at {}: {error:?}", + filename.display() + ) + })?; + program.env.insert( + HQ_HOST_FILE.into(), + filename.to_string_lossy().to_string().into(), + ); + let filename = task_dir.path().join("hq-nodefile"); + write_node_file(&build_ctx, &filename, true).map_err(|error| { format!( "Cannot write node file at {}: {error:?}", filename.display() @@ -204,7 +227,7 @@ async fn resend_stdio( Ok(()) } -fn create_directory_if_needed(file: &StdioDef) -> io::Result<()> { +fn create_directory_if_needed(file: &StdioDef) -> std::io::Result<()> { if let StdioDef::File { path, .. 
/// Returns `true` when `hostname` is a literal IP address rather than a DNS name.
///
/// Used to decide whether a worker hostname may be shortened by cutting it at the
/// first `.`: an IP address must never be shortened, since the dot is part of the
/// address and not a domain separator.
///
/// The check accepts:
/// * anything `std::net::IpAddr` can parse (IPv4 and IPv6, including IPv6 forms with
///   an embedded dotted IPv4 part such as `::ffff:192.168.1.1`),
/// * any non-empty string made only of ASCII digits and dots (kept as a conservative
///   fallback so that inputs such as `10.0.0.1.` are still left untouched).
///
/// An empty string is not an IP address.
fn is_ip_address(hostname: &str) -> bool {
    if hostname.parse::<std::net::IpAddr>().is_ok() {
        return true;
    }
    // Fallback: purely numeric dotted strings are never treated as a DNS name.
    !hostname.is_empty() && hostname.chars().all(|c| c.is_ascii_digit() || c == '.')
}
+ +If you need full hostnames, there is a file whose name is stored in ``HQ_HOST_FILE``; it has the same meaning +as ``HQ_NODE_FILE`` but contains the full node hostnames without stripping. + +Note: Both files are placed in the task directory; therefore, multi-node tasks always enable the task directory (``--task-dir``). + +If a multi-node task is started, HQ also creates the variable `HQ_NUM_NODES` that +holds the number of nodes assigned to the task (i.e. the number of lines of the node file). -Note: Multi-node tasks always enables task directory (``--task-dir``). ## Groups @@ -50,4 +60,14 @@ A script that starts an MPI program in multi-node task may look like as follows: #!/bin/sh mpirun --node-list=$HQ_NODE_FILE ./a-program -``` \ No newline at end of file +``` + +If you are running SLURM, you should start the MPI program as follows: + +``` +#!/bin/sh + +srun --nodefile $HQ_NODE_FILE --nodes=$HQ_NUM_NODES mpirun ... +``` + +Note: It is important to set `--nodes`; otherwise the node file will not be respected. 
def test_submit_mn_complex_hostname(hq_env: HqEnv):
    """The node file lists short host names while the host file keeps the full ones."""
    full_names = ["cn690.karolina.it4i.cz", "cn710.karolina.it4i.cz"]
    short_names = ["cn690", "cn710"]

    hq_env.start_server()
    for name in full_names:
        hq_env.start_worker(hostname=name)

    # The task prints the node file first and the host file second.
    script = "sleep 1; cat ${HQ_NODE_FILE}; cat ${HQ_HOST_FILE}"
    hq_env.command(["submit", "--nodes=2", "--", "bash", "-c", script])
    wait_for_job_state(hq_env, 1, "FINISHED", timeout_s=1.2)

    with open(default_task_output(1)) as output:
        lines = output.read().rstrip().split("\n")

    # Line order within each file is compared after sorting.
    node_lines, host_lines = lines[:2], lines[2:4]
    assert sorted(node_lines) == short_names
    assert sorted(host_lines) == full_names