From c13756a7c3d53a65e5a3878c7a033e57fb5924d1 Mon Sep 17 00:00:00 2001 From: Pavlina Smolkova Date: Fri, 29 Mar 2024 13:28:42 +0100 Subject: [PATCH] UPDATE: ruth-distributed parameters --- ruth/zeromq/README.md | 5 ++++- ruth/zeromq/bench.py | 5 ----- ruth/zeromq/distributed.py | 12 +++++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ruth/zeromq/README.md b/ruth/zeromq/README.md index 0f6695f..39e6c98 100644 --- a/ruth/zeromq/README.md +++ b/ruth/zeromq/README.md @@ -40,7 +40,6 @@ distributed spawning of workers accross nodes. In order to run the simulation in distributed fashion we can use ```bench.py```, specifically function ```run```. For correct incorporation of nodes spawned and configuration this function has to be used inside ```ruth```, otherwise we may simply use ```bench.py``` and edit it's parameters since it spawns the ```run``` with following parameters: - ``` WORK_DIR = Path(os.getcwd()).absolute() WORKER_DIR = WORK_DIR / str(sys.argv[1]) @@ -61,6 +60,10 @@ try_to_kill = False run(...) 
``` +Or run the command: +``` +ruth-distributed EXPERIMENT_NAME EVKIT_DIR_PATH --config-file="config.json" --workers=32 +``` ## Submitting a job In order to submit a job to a cluster, for example SLURM, we may use: diff --git a/ruth/zeromq/bench.py b/ruth/zeromq/bench.py index 77b0d89..2d1cf9f 100644 --- a/ruth/zeromq/bench.py +++ b/ruth/zeromq/bench.py @@ -1,4 +1,3 @@ -import itertools import logging import pandas as pd import time @@ -7,16 +6,12 @@ import subprocess import os import sys -import signal from typing import List from pathlib import Path -from collections import defaultdict from contextlib import closing from dataclasses import dataclass from cluster.cluster import Cluster, start_process -from cluster import cluster -from src.client import Client def get_pbs_nodes() -> List[str]: diff --git a/ruth/zeromq/distributed.py b/ruth/zeromq/distributed.py index 24e9050..f32d60b 100644 --- a/ruth/zeromq/distributed.py +++ b/ruth/zeromq/distributed.py @@ -10,14 +10,15 @@ @click.command() -@click.argument("worker-dir", type=str) -@click.argument("evkit-path", type=click.Path(exists=True)) +@click.argument("experiment-name", type=str) +@click.argument("evkit-path", metavar="EVKIT_DIR_PATH", type=click.Path(exists=True)) @click.option("--config-file", type=click.Path(exists=True), help="Path to simulation config.", default="config.json") @click.option("--workers", type=int, default=32, help="Number of workers. 
Default 32.") +@click.option("--spawn-workers-at-main-node", is_flag=True, help="Spawn workers at main node.") @click.option("--try-to-kill", is_flag=True, help="Try to kill workers after simulation is computed.") -def distributed(worker_dir, evkit_path, config_file, workers, try_to_kill): +def distributed(experiment_name, evkit_path, config_file, workers, spawn_workers_at_main_node, try_to_kill): work_dir = Path(os.getcwd()).absolute() - worker_dir = work_dir / worker_dir + worker_dir = work_dir / experiment_name env_path = os.environ["VIRTUAL_ENV"] modules = [ "Python/3.10.8-GCCcore-12.2.0", @@ -39,7 +40,8 @@ def distributed(worker_dir, evkit_path, config_file, workers, try_to_kill): EVKIT_PATH=evkit_path, MODULES=modules, ENV_PATH=env_path, - try_to_kill=try_to_kill + try_to_kill=try_to_kill, + spawn_workers_at_main_node=spawn_workers_at_main_node ) # result = bench(nodes, WORKER_DIR)