
HQ_HOST_FILE and HQ_NUM_NODES introduced
spirali committed Feb 26, 2024
1 parent 3f19eb8 commit 6ec7912
Showing 6 changed files with 99 additions and 14 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,17 @@
# Dev

## New features

* `HQ_NUM_NODES` introduced for multi-node tasks. It contains the number of nodes assigned to the task,
  so you no longer need to count the lines in `HQ_NODE_FILE` manually.

## Changes

* The node file generated for multi-node tasks now contains only short hostnames
  (e.g. if the hostname is "cn690.karolina.it4i.cz", only "cn690" is written into the node list).
  You can read the file referenced by ``HQ_HOST_FILE`` if you need the full hostnames without stripping.


# v0.18.0

## Breaking change
2 changes: 2 additions & 0 deletions crates/hyperqueue/src/common/env.rs
@@ -23,3 +23,5 @@ pub const HQ_TASK_DIR: &str = create_hq_env!("TASK_DIR");
pub const HQ_ERROR_FILENAME: &str = create_hq_env!("ERROR_FILENAME");
pub const HQ_CPUS: &str = create_hq_env!("CPUS");
pub const HQ_NODE_FILE: &str = create_hq_env!("NODE_FILE");
pub const HQ_HOST_FILE: &str = create_hq_env!("HOST_FILE");
pub const HQ_NUM_NODES: &str = create_hq_env!("NUM_NODES");
49 changes: 41 additions & 8 deletions crates/hyperqueue/src/worker/start/program.rs
@@ -1,6 +1,5 @@
use std::fs::File;
use std::future::Future;
use std::io;
use std::io::{BufWriter, ErrorKind, Read, Write};
use std::path::{Path, PathBuf};
use std::process::ExitStatus;
@@ -29,8 +28,8 @@ use tako::resources::{
use tako::{format_comma_delimited, InstanceId};

use crate::common::env::{
HQ_CPUS, HQ_ENTRY, HQ_ERROR_FILENAME, HQ_INSTANCE_ID, HQ_JOB_ID, HQ_NODE_FILE, HQ_PIN,
HQ_SUBMIT_DIR, HQ_TASK_DIR, HQ_TASK_ID,
HQ_CPUS, HQ_ENTRY, HQ_ERROR_FILENAME, HQ_HOST_FILE, HQ_INSTANCE_ID, HQ_JOB_ID, HQ_NODE_FILE,
HQ_NUM_NODES, HQ_PIN, HQ_SUBMIT_DIR, HQ_TASK_DIR, HQ_TASK_ID,
};
use crate::common::placeholders::{
fill_placeholders_in_paths, CompletePlaceholderCtx, ResolvablePaths,
@@ -109,8 +108,32 @@ pub(super) fn build_program_task(
.into(),
);
if !build_ctx.node_list().is_empty() {
let filename = task_dir.path().join("hq-nodelist");
write_node_file(&build_ctx, &filename).map_err(|error| {
program.env.insert(
HQ_NUM_NODES.into(),
build_ctx.node_list().len().to_string().into(),
);

/*
We write the hostnames in two forms: "full" and "short" names.
A short hostname is the hostname with the part after the first "." removed
(e.g. the short name for cn710.karolina.it4i.cz is cn710).
We provide both because some systems (e.g. SLURM or OpenMPI) use just the short names.
*/

let filename = task_dir.path().join("hq-hostfile");
write_node_file(&build_ctx, &filename, false).map_err(|error| {
format!(
"Cannot write node file at {}: {error:?}",
filename.display()
)
})?;
program.env.insert(
HQ_HOST_FILE.into(),
filename.to_string_lossy().to_string().into(),
);
let filename = task_dir.path().join("hq-nodefile");
write_node_file(&build_ctx, &filename, true).map_err(|error| {
format!(
"Cannot write node file at {}: {error:?}",
filename.display()
@@ -204,7 +227,7 @@ async fn resend_stdio(
Ok(())
}

fn create_directory_if_needed(file: &StdioDef) -> io::Result<()> {
fn create_directory_if_needed(file: &StdioDef) -> std::io::Result<()> {
if let StdioDef::File { path, .. } = file {
if let Some(path) = path.parent() {
std::fs::create_dir_all(path)?;
@@ -217,11 +240,21 @@ fn get_custom_error_filename(task_dir: &TempDir) -> PathBuf {
task_dir.path().join("hq-error")
}

fn write_node_file(ctx: &TaskBuildContext, path: &Path) -> std::io::Result<()> {
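// Heuristic: a hostname consisting solely of ASCII digits and dots is treated as an
// IP address and must never be shortened.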
fn is_ip_address(hostname: &str) -> bool {
hostname.chars().all(|c| c.is_ascii_digit() || c == '.')
}

fn write_node_file(ctx: &TaskBuildContext, path: &Path, short_names: bool) -> std::io::Result<()> {
let file = File::create(path)?;
let mut file = BufWriter::new(file);
for worker_id in ctx.node_list() {
file.write_all(ctx.worker_hostname(*worker_id).unwrap().as_bytes())?;
let hostname = ctx.worker_hostname(*worker_id).unwrap();
let node_name = if short_names && !is_ip_address(hostname) {
hostname.split_once('.').map(|x| x.0).unwrap_or(hostname)
} else {
hostname
};
file.write_all(node_name.as_bytes())?;
file.write_all(b"\n")?;
}
file.flush()?;
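
For readers who do not follow Rust, the short-name rule applied by `write_node_file` above can be sketched in Python roughly as follows (the helper names are illustrative, not part of HyperQueue):

```python
def looks_like_ip(hostname: str) -> bool:
    # Mirror of the digits-and-dots heuristic: dotted addresses such as
    # "10.0.0.5" must never be shortened.
    return all(c.isdigit() or c == "." for c in hostname)


def short_name(hostname: str) -> str:
    # Strip everything after the first "." unless the name looks like an IP address.
    return hostname if looks_like_ip(hostname) else hostname.split(".", 1)[0]


assert short_name("cn710.karolina.it4i.cz") == "cn710"
assert short_name("10.0.0.5") == "10.0.0.5"
assert short_name("localhost") == "localhost"
```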
28 changes: 24 additions & 4 deletions docs/jobs/multinode.md
@@ -24,11 +24,21 @@ $ hq submit --nodes 4 test.sh
When the task is started, four nodes are assigned to it.
One of them is chosen as the "root" node, where ``test.sh`` is started.

Hostnames of all assigned nodes can be found in a file whose path is stored in the
environment variable ``HQ_NODE_FILE``. Each line is one hostname.
Node names of all assigned nodes can be found in a file whose path is stored in the
environment variable ``HQ_NODE_FILE``. Each line is one node name.
The first line is always the root node.
A node name is a short hostname, i.e. the hostname stripped of the suffix after the first "."
(e.g. if a worker's hostname is "cn690.karolina.it4i.cz", then its node name is "cn690").
Many HPC applications use only short hostnames, hence we provide them as the default.

If you need the full hostnames, there is a file whose path is stored in ``HQ_HOST_FILE``; it has the same format
as ``HQ_NODE_FILE`` but contains the full hostnames without stripping.

Note: Both files are placed in the task directory; therefore, a multi-node task always enables the task directory (``--task-dir``).

If a multi-node task is started, HQ also creates the environment variable `HQ_NUM_NODES`, which
holds the number of nodes assigned to the task (i.e. the number of lines in the node file).

Note: Multi-node tasks always enable the task directory (``--task-dir``).
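
Putting this together, a task submitted with `--nodes` could consume these variables as in the following minimal Python sketch (illustrative only; it simply reads the files described above):

```python
import os

num_nodes = int(os.environ["HQ_NUM_NODES"])

# Short node names, one per line; the first line is always the root node.
with open(os.environ["HQ_NODE_FILE"]) as f:
    nodes = f.read().splitlines()

# Full hostnames, in the same order, without stripping.
with open(os.environ["HQ_HOST_FILE"]) as f:
    hosts = f.read().splitlines()

assert len(nodes) == num_nodes
print(f"root node: {nodes[0]}, all hosts: {hosts}")
```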

## Groups

@@ -50,4 +60,14 @@ A script that starts an MPI program in a multi-node task may look as follows:
#!/bin/sh

mpirun --node-list=$HQ_NODE_FILE ./a-program
```

If you are running under SLURM, you should start the MPI program as follows:

```
#!/bin/sh
srun --nodefile $HQ_NODE_FILE --nodes=$HQ_NUM_NODES mpirun ...
```

Note: It is important to set `--nodes`; otherwise, the node file will not be respected.
4 changes: 3 additions & 1 deletion tests/conftest.py
@@ -178,6 +178,7 @@ def start_worker(
server_dir=None,
work_dir: Optional[str] = None,
final_check: bool = False,
hostname=None,
) -> subprocess.Popen:
self.id_counter += 1
worker_id = self.id_counter
@@ -197,7 +198,8 @@
f"--on-server-lost={on_server_lost}",
"--no-detect-resources", # Ignore resources on testing machine
]
hostname = f"worker{worker_id}"
if hostname is None:
hostname = f"worker{worker_id}"
if set_hostname:
worker_args += ["--hostname", hostname]
if cpus is not None:
16 changes: 15 additions & 1 deletion tests/test_job_mn.py
@@ -11,7 +11,7 @@ def test_submit_mn(hq_env: HqEnv):
hq_env.start_server()
hq_env.start_workers(2)

hq_env.command(["submit", "--nodes=3", "--", "bash", "-c", "sleep 1; cat ${HQ_NODE_FILE}"])
hq_env.command(["submit", "--nodes=3", "--", "bash", "-c", "sleep 1; echo ${HQ_NUM_NODES}; cat ${HQ_NODE_FILE}"])
time.sleep(0.5)
table = hq_env.command(["job", "info", "1"], as_table=True)
table.check_row_value("Resources", "nodes: 3")
@@ -28,6 +28,7 @@

wait_for_job_state(hq_env, 1, "FINISHED", timeout_s=1.2)
with open(default_task_output(1)) as f:
assert f.readline() == "3\n"
hosts = f.read().rstrip().split("\n")
assert hosts == ws
# assert len(nodes) == 3
@@ -115,3 +116,16 @@ def test_submit_mn_time_request(hq_env: HqEnv):
table.check_row_value("State", "WAITING")
hq_env.start_workers(1, args=["--time-limit=3s"])
wait_for_job_state(hq_env, 1, "FINISHED")


def test_submit_mn_complex_hostname(hq_env: HqEnv):
hq_env.start_server()
hq_env.start_worker(hostname="cn690.karolina.it4i.cz")
hq_env.start_worker(hostname="cn710.karolina.it4i.cz")

hq_env.command(["submit", "--nodes=2", "--", "bash", "-c", "sleep 1; cat ${HQ_NODE_FILE}; cat ${HQ_HOST_FILE}"])
wait_for_job_state(hq_env, 1, "FINISHED", timeout_s=1.2)
with open(default_task_output(1)) as f:
hosts = f.read().rstrip().split("\n")
assert sorted(hosts[:2]) == ["cn690", "cn710"]
assert sorted(hosts[2:4]) == ["cn690.karolina.it4i.cz", "cn710.karolina.it4i.cz"]
